OK, I thought I was correct in my original assertion, so I looked back at some notes: Visual Studio's compiler does NOT support inline assembly when targeting 64-bit (x64) mode.
And to the OP: this demonstrates how much faster SIMD can be than plain C++, at least on Intel processors; AMD's SIMD performance was less than spectacular the last time I checked.
Code:
#include <windows.h>
#include <malloc.h>
#include <stdio.h>
int main(){
double* pData = (double*)_aligned_malloc(sizeof(double) * 2 * 67108864, 16);
double* pOutput = (double*)_aligned_malloc(sizeof(double) * 2, 16);
for(int x = 0;x<67108864 * 2;x++) pData[x] = 1.0;
DWORD SIMF_Start = GetTickCount();
_asm {
pushad;
mov eax , pData;
prefetcht0 [eax];
mov edx , pOutput;
mov edi , edx;
mov edx , 0x00000010;
mov ecx , 0x04000000;
xorpd xmm0 , xmm0;
the_loop:
prefetcht0 [eax+16];
addpd xmm0 , [eax];
add eax , edx;
loop the_loop;
mov edx , edi;
movapd [edx] , xmm0;
popad;
}
DWORD SIMD_Stop = GetTickCount();
DWORD CPP_Start = GetTickCount();
for(int x = 0;x<2 * 67108864;x++) pOutput[0] += pData[x];
DWORD CPP_Stop = GetTickCount();
printf("SIMD took %d\nCPP took %d\n" , SIMD_Stop-SIMF_Start , CPP_Stop-CPP_Start);
return 0;
}