Here's a rewrite of your program using some ideas from here about the difficulties of timing code with rdtsc. Those difficulties mainly come from data and code caching, and also from out-of-order execution (which I didn't compensate for).
To alleviate the caching effects when USE_CACHE_WARMING is non-zero, each routine is called 3 times and the timing is taken only from the last call.
I usually get "r172 n168" using cache warming and "r236 n232" without, although occasionally something like "n664" crops up.
Code:
#include <stdio.h>
#include <stdlib.h>
/* Warm-up switch: 0 times a single call, non-zero times the last of several. */
#define USE_CACHE_WARMING 1

/* Short alias for the widest standard unsigned type; used for cycle counts. */
typedef unsigned long long ull;

/*
 * Most recent counter readings, stored as separate 32-bit halves.
 * "before_*" holds the reading taken ahead of the timed code and
 * "after_*" the reading taken once it has finished.
 */
unsigned before_low;
unsigned before_high;
unsigned after_low;
unsigned after_high;
char *rstr(char *dest, const char *src) {
int d0, d1, d2;
asm("rdtsc" : "=a" (before_low), "=d" (before_high));
asm(
"1: \n"
"lodsb \n"
"stosb \n"
"testb %%al, %%al \n"
"jne 1b"
: "=&S" (d0), "=&D" (d1), "=&a" (d2)
: "0" (src), "1" (dest)
: "memory"
);
asm("rdtsc" : "=a" (after_low), "=d" (after_high));
return dest;
}
char *nstr(char *dest, const char *src) {
asm("rdtsc" : "=a" (before_low), "=d" (before_high));
asm(
"1: \n"
"lodsb \n"
"stosb \n"
"testb %%al, %%al \n"
"jne 1b"
:
: "S" (src), "D" (dest)
: "memory"
);
asm("rdtsc" : "=a" (after_low), "=d" (after_high));
return dest;
}
ull cycles(void) {
return (ull)(after_low - before_low)
+ ((ull)(after_high - before_high) << 32);
}
/*
 * Time nstr() and then rstr() copying a short string, and print the two
 * cycle counts on one line as "r<count>", a tab, then "n<count>".
 *
 * Each routine runs `passes` times in a row; cycles() is read only after
 * the final run, so any earlier runs act purely as warm-up.
 */
int main(void) {
#if USE_CACHE_WARMING
    const int passes = 3;
#else
    const int passes = 1;
#endif
    char *text = "abcdefg";
    char buffer[40];
    ull n_ticks, r_ticks;
    int i;

    for (i = 0; i < passes; i++) {
        nstr(buffer, text);
    }
    n_ticks = cycles();

    for (i = 0; i < passes; i++) {
        rstr(buffer, text);
    }
    r_ticks = cycles();

    printf("r%llu\t", r_ticks);
    printf("n%llu\n", n_ticks);
    return 0;
}