Originally Posted by
awsdert
Have you thought about using the asm keyword and writing a block of assembly? Or how about putting that one function in an asm file and directly controlling what is done within the function? It means you need to be more careful with the code, but it should be doable.
Writing a routine in assembly will not necessarily improve its performance. Here's a simple example:
Code:
# Makefile
# Builds the `test` benchmark from one C driver (test.c), one C routine (f.c)
# and one NASM routine (g.asm).
# NOTE: recipe lines must begin with a TAB character.

# Optimization level used by make's built-in .c -> .o rule.
CFLAGS=-O2

# Link step: $@ is the target (test), $^ is the full prerequisite list.
test: test.o f.o g.o
	$(CC) -o $@ $^

# No recipes here: make's implicit rule compiles these with $(CC) $(CFLAGS) -c.
test.o: test.c
f.o: f.c

# Assemble the hand-written routine as a 64-bit ELF object; $< is g.asm.
g.o: g.asm
	nasm -felf64 -o $@ $<
Code:
/* test.c */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <inttypes.h>
#include "cycle_counting.h"
extern long f( int *, size_t );
extern long g( int *, size_t );
#define ARRAY_SIZE 128
/*
 * Benchmark driver: fills a static array with pseudo-random ints, sums it
 * once with f() and once with g(), and prints both sums together with the
 * counter value recorded around each call.
 */
int main ( void )
{
  static int array[ARRAY_SIZE];
  long sum1, sum2;
  counter_T c1, c2;
  size_t i;

  srand ( time ( NULL ) );

  // Populate every slot, first to last, with a pseudo-random value.
  for ( i = 0; i < ARRAY_SIZE; i++ )
    array[i] = rand();

  // Measure f(). BEGIN_TSC/END_TSC come from cycle_counting.h (not shown
  // here); presumably they start/stop a timestamp-counter measurement.
  c1 = BEGIN_TSC();
  sum1 = f( array, ARRAY_SIZE );
  END_TSC( &c1 );

  // Measure g() over the very same data.
  c2 = BEGIN_TSC();
  sum2 = g( array, ARRAY_SIZE );
  END_TSC( &c2 );

  printf( "sum = %ld, from f(): %" PRIu64 " cycles\n"
          "sum = %ld, from g(): %" PRIu64 " cycles\n",
          sum1, c1, sum2, c2 );
}
Code:
/* f.c */
#include <stddef.h>
/*
 * Returns the sum of the first `size` ints starting at `p`, accumulated
 * in a long. Returns 0 when `size` is 0.
 */
long f( int *p, size_t size )
{
  long total = 0L;
  size_t i;

  for ( i = 0; i < size; i++ )
    total += p[i];

  return total;
}
Code:
; g.asm
bits 64
default rel
section .text
; long g( int *, size_t );
; Sums `size` signed 32-bit ints, walking from the last element down to
; the first, and accumulates the result in 64 bits.
; Entry: RDI = ptr, RSI = size
; Returns RAX = sum.
  global g
g:
  xor     rax,rax                  ; sum = 0;
.loop:
  test    rsi,rsi                  ; nothing left to add?
  jz      .exit
  sub     rsi,1                    ; index of the next element (counts down)
  ; Sign-extend the 32-bit element to 64 bits, as the C version does when
  ; it promotes int to long. A plain `mov ecx,[...]` would zero-extend and
  ; give a wrong sum for negative elements.
  movsxd  rcx,dword [rdi+rsi*4]
  add     rax,rcx
  jmp     .loop
.exit:
  ret
Compiling, linking and testing I get:
Code:
$ make
cc -O2 -c -o test.o test.c
cc -O2 -c -o f.o f.c
nasm -felf64 -o g.o g.asm
cc -o test test.o f.o g.o
$ ./test
sum = 140386635490, from f(): 680 cycles
sum = 140386635490, from g(): 892 cycles
The assembly routine is 31% slower than the C routine (tested on an i5-3570 @ 3.4 GHz)!
Here's the routine created by GCC:
Code:
$ objdump -dM intel f.o
0000000000000000 <f>:
0: 48 85 f6 test rsi,rsi
3: 74 23 je 28 <f+0x28>
5: 31 d2 xor edx,edx
7: 31 c0 xor eax,eax
9: 0f 1f 80 00 00 00 00 nop DWORD PTR [rax+0x0]
10: 48 63 0c 97 movsxd rcx,DWORD PTR [rdi+rdx*4]
14: 48 83 c2 01 add rdx,0x1
18: 48 01 c8 add rax,rcx
1b: 48 39 f2 cmp rdx,rsi
1e: 75 f0 jne 10 <f+0x10>
20: f3 c3 repz ret
22: 66 0f 1f 44 00 00 nop WORD PTR [rax+rax*1+0x0]
28: 31 c0 xor eax,eax
2a: c3 ret
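Notice that the compiler-generated code is more elaborate than the hand-written routine: it tests the count once before entering the loop, so the loop itself needs only a single conditional backward jump per iteration (which suits static branch prediction, where backward branches are assumed taken), and it pads with NOPs so that jump targets are aligned (the loop entry at 0x10 is 16-byte aligned, the early exit at 0x28 is qword aligned). In general, C compilers do a better job than hand-written assembly routines.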