Code:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <sys/time.h>
#include <string.h>
// Number of bytes moved by each timed copy: 100 MiB
#define SIZE (1024*1024*100)
// Size of every buffer allocated in main: SIZE plus 50 spare bytes.
// NOTE(review): the spare bytes look like the room that lets the copies start
// at the small command-line offsets - confirm before changing either value.
#define SIZE2 (SIZE+50)
typedef unsigned int uint32;   // the word-copy code below assumes this is exactly 32 bits wide
typedef unsigned char uint8;

// Copy len bytes from pSrc to pDest one byte at a time, in ascending address order.
// Used for short copies and for the unaligned head and tail of larger copies.
void manualCopy( uint8 *pDest, const uint8 *pSrc, size_t len )
{
    // The index has the same width as len; a 32-bit counter would wrap before
    // reaching a len above 32 bits and the loop would never terminate.
    for ( size_t i = 0; i < len; i++ )
    {
        pDest[i] = pSrc[i];
    }
}

// Copy len bytes from s to d, using aligned 32-bit word accesses for the bulk.
//
//   1. Copies of 32 bytes or fewer are done byte-by-byte.
//   2. Otherwise a short head is copied byte-by-byte so that the destination
//      becomes 4-byte aligned.
//   3. The bulk is moved in groups of 16 bytes (four words per iteration).
//      If source and destination share the same alignment the words are copied
//      one-to-one; otherwise each destination word is assembled from two
//      adjacent aligned source words with a pair of shifts.
//   4. Whatever is left over is copied byte-by-byte.
//
// Returns d, like memcpy.
//
// Limitations:
//   - Everything is copied in ascending address order; overlapping regions are
//     not handled.
//   - The shift directions in the misaligned path are only correct on a
//     little-endian machine.
//   - Words are accessed through uint32 pointers cast from byte pointers.
void *sysFastMemCopy( void *d, const void *s, size_t len )
{
    uint8 *pDest = d;
    const uint8 *pSrc = s;

    if ( len <= 32 )
    {
        // For small copy lengths, copy byte-by-byte to the destination
        manualCopy( pDest, pSrc, len );
        return d;
    }

    // Number of bytes from each pointer up to its next 4-byte boundary
    // (1..4; a pointer that is already aligned yields 4)
    const uint32 srcCnt = 4 - (uint32) ( (uintptr_t) pSrc & 0x03 );
    const uint32 destCnt = 4 - (uint32) ( (uintptr_t) pDest & 0x03 );

    // There are 4 different longWord copy methods, selected by the relative
    // misalignment of source and destination (0 = both align together)
    const uint32 methodSelect = ( srcCnt - destCnt ) & 0x03;

    // Copy the head bytes so the destination is word aligned. When the source
    // reaches its boundary later than the destination, one extra word is copied
    // by hand so that the first aligned source word read below starts at or
    // after s instead of in front of it.
    size_t headLen = destCnt;
    if ( srcCnt > destCnt )
    {
        headLen += 4;
    }
    manualCopy( pDest, pSrc, headLen );

    // All length bookkeeping is done in size_t so that large copies are not truncated
    const size_t newLen = len - headLen;   // bytes remaining after the head (at least 25)
    size_t longLen = newLen / 16;          // number of 16-byte groups (at least 1)
    size_t endLen = newLen & (16-1);       // lingering bytes to copy at the end

    // Both pointers are 4-byte aligned from here on
    uint32 *pLongDest = (uint32*) ( pDest + headLen );
    const uint32 *pLongSrc = (const uint32*) ( pSrc + srcCnt );

    if ( methodSelect == 0 )
    {
        // Source and destination are aligned together: copy one-to-one,
        // four words per group
        for ( size_t group = longLen; group != 0; group-- )
        {
            *pLongDest++ = *pLongSrc++;
            *pLongDest++ = *pLongSrc++;
            *pLongDest++ = *pLongSrc++;
            *pLongDest++ = *pLongSrc++;
        }
    }
    else
    {
        uint32 longWord1;
        uint32 longWord2;
        int left = 0, right = 0;

        // Shift amounts that splice two adjacent source words into one destination word
        switch ( methodSelect ) {
            case 1: left = 8; right = 24; break;
            case 2: left = 16; right = 16; break;
            case 3: left = 24; right = 8; break;
            default: exit(1); // it's all gone wrong
        }

        // This path always reads one source word ahead of what it has written.
        // With fewer than 4 trailing bytes that look-ahead could run past
        // s + len, so hand the last group to the byte-wise tail copy instead.
        // (newLen >= 25 guarantees at least two groups whenever endLen < 4.)
        if ( endLen < 4 )
        {
            longLen--;
            endLen += 16;
        }

        // Get the first long word
        longWord1 = *pLongSrc++;

        // Copy words created by combining 2 adjacent long words
        for ( size_t group = longLen; group != 0; group-- )
        {
            longWord2 = *pLongSrc++;
            *pLongDest++ = ( longWord1 >> right ) | ( longWord2 << left );
            longWord1 = longWord2;
            longWord2 = *pLongSrc++;
            *pLongDest++ = ( longWord1 >> right ) | ( longWord2 << left );
            longWord1 = longWord2;
            longWord2 = *pLongSrc++;
            *pLongDest++ = ( longWord1 >> right ) | ( longWord2 << left );
            longWord1 = longWord2;
            longWord2 = *pLongSrc++;
            *pLongDest++ = ( longWord1 >> right ) | ( longWord2 << left );
            longWord1 = longWord2;
        }
    }

    // Copy any remaining bytes; head plus whole groups account for len - endLen bytes
    if ( endLen != 0 )
    {
        manualCopy( (uint8*) pLongDest, pSrc + ( len - endLen ), endLen );
    }

    // Always hand back the caller's destination pointer
    return d;
}
// Store the byte value f into each of the first s bytes starting at p.
void fill ( char *p, size_t s, char f ) {
    for ( size_t idx = 0; idx < s; ++idx ) {
        p[idx] = f;
    }
}
// Parse text as a decimal buffer offset in the range 0..maxOffset.
// Returns 0 and stores the value in *out on success; returns -1 (leaving *out
// untouched) when text is not a plain number or lies outside the range.
static int parseOffset( const char *text, int maxOffset, int *out )
{
    char *end;
    long value = strtol( text, &end, 10 );

    // An overflowing input saturates to LONG_MIN/LONG_MAX and fails the range check
    if ( end == text || *end != '\0' || value < 0 || value > maxOffset ) {
        return -1;
    }
    *out = (int) value;
    return 0;
}

// Benchmark driver: times memcpy, memmove and sysFastMemCopy copying SIZE bytes
// between two heap buffers and checks every result against the first one.
//
// Usage: prog [fromOffset [toOffset]]
//   fromOffset / toOffset - byte offsets added to the source / destination
//   buffer start (default 0) so that different alignments can be exercised.
//   They must fit in the SIZE2 - SIZE spare bytes of the buffers.
//
// Returns EXIT_SUCCESS after running all tests, EXIT_FAILURE on a bad argument
// or when the buffers cannot be allocated.
int main ( int argc, char *argv[] ) {
    struct {
        const char *fname;   // name printed in the report (points at a string literal)
        void *(*func)(void*,const void*,size_t);
    } tests[] = {
#define TEST(x) #x, x
        { TEST(memcpy) },
        { TEST(memmove) },
        { TEST(sysFastMemCopy) },
    };

    // The buffers only have SIZE2 - SIZE bytes of slack; any offset outside
    // that range would make the copies run off the allocations.
    const int maxOffset = SIZE2 - SIZE;
    int fromOffset = 0;
    int toOffset = 0;
    if ( ( argc > 1 && parseOffset( argv[1], maxOffset, &fromOffset ) != 0 ) ||
         ( argc > 2 && parseOffset( argv[2], maxOffset, &toOffset ) != 0 ) ) {
        fprintf( stderr, "Usage: %s [fromOffset [toOffset]] (offsets must be 0..%d)\n",
                 argv[0], maxOffset );
        return EXIT_FAILURE;
    }

    char *from_block = malloc(SIZE2);
    char *to_block = malloc(SIZE2);
    char *dummy_block = malloc(SIZE2);
    char *refblock = malloc(SIZE2);   // holds the first test's output as the reference
    if ( from_block == NULL || to_block == NULL || dummy_block == NULL || refblock == NULL ) {
        fprintf( stderr, "Unable to allocate four %d-byte test buffers\n", SIZE2 );
        free(from_block);
        free(to_block);
        free(dummy_block);
        free(refblock);
        return EXIT_FAILURE;
    }

    int haveReference = 0;

    printf("Copy offsets are %d and %d\n", fromOffset, toOffset );
    for ( size_t i = 0 ; i < sizeof(tests)/sizeof(tests[0]) ; i++ ) {
        // Reset both buffers so every test starts from the same contents
        fill(from_block,SIZE2,'A');
        fill(to_block,SIZE2,'@');
        fill(dummy_block,SIZE2,'X'); // make sure from and to are out of cache

        struct timeval start,end;
        gettimeofday(&start,NULL);
        tests[i].func(to_block+toOffset,from_block+fromOffset,SIZE);
        gettimeofday(&end,NULL);

        // The first test's result becomes the reference for the later ones
        if ( !haveReference ) {
            memcpy(refblock,to_block,SIZE2);
            haveReference = 1;
        }

        // Compare the whole buffer, so stray writes outside the copied range show up too
        int diff = memcmp(to_block,refblock,SIZE2);
        unsigned long elapsed = (end.tv_sec-start.tv_sec)*1000000+(end.tv_usec-start.tv_usec);
        printf("Copying %d bytes using %s took %lu uSec, result=%d\n", SIZE, tests[i].fname, elapsed, diff );
    }

    free(from_block);
    free(to_block);
    free(dummy_block);
    free(refblock);
    return EXIT_SUCCESS;
}
$ gcc -std=c99 -Wall -Wextra -g -O2 foo.c
$ ./a.out ; ./a.out 1 2 ; ./a.out 2 1 ; ./a.out 2 2
Copy offsets are 0 and 0
Copying 104857600 bytes using memcpy took 12326 uSec, result=0
Copying 104857600 bytes using memmove took 12338 uSec, result=0
Copying 104857600 bytes using sysFastMemCopy took 16871 uSec, result=0
Copy offsets are 1 and 2
Copying 104857600 bytes using memcpy took 12316 uSec, result=0
Copying 104857600 bytes using memmove took 12654 uSec, result=0
Copying 104857600 bytes using sysFastMemCopy took 21705 uSec, result=0
Copy offsets are 2 and 1
Copying 104857600 bytes using memcpy took 12746 uSec, result=0
Copying 104857600 bytes using memmove took 12643 uSec, result=0
Copying 104857600 bytes using sysFastMemCopy took 21832 uSec, result=0
Copy offsets are 2 and 2
Copying 104857600 bytes using memcpy took 12162 uSec, result=0
Copying 104857600 bytes using memmove took 12057 uSec, result=0
Copying 104857600 bytes using sysFastMemCopy took 16843 uSec, result=0
1. The non-zero methodSelect cases have been collapsed into a single else branch, and the shift amounts turned into variables initialised within a switch.