Problem with unaligned intrinsics
I have a problem where my code behaves quite strangely in release mode. I'm practicing how to use intrinsics by writing a simple hash function and I'm getting a crash when I load in data to a xmm register. The data is not 16 byte aligned, but I'm using the __mm_loadu_si128 intrinsic so that shouldn't be the problem, but it looks to me like something gets messed up when the code is generated. The compiler I'm using is visual studio 2008.
Code:
unsigned long getHashed(int* data)
{
static const int mulConst[4] = {0x0F98A402, 0x50A89EC7, 0x8362A7DE, 0xEF83C901};
static __m128i hashConstant = _mm_loadu_si128((__m128i*)mulConst);
static __m128i dataReg;
static __m128i res;
static unsigned int result[4] = {0};
unsigned long hash = 0;
dataReg = _mm_loadu_si128((__m128i*)data);
res = _mm_mullo_epi16(dataReg, hashConstant);
_mm_storeu_si128((__m128i*)result, res);
hash += result[0];
hash += result[1];
hash += result[2];
hash += result[3];
return hash;
}
unsigned long ToHash(const char* cStr, int len)
{
unsigned long hash = 2508234;
unsigned long num = 0;
static int shortBuf[4] = {0};
while (len >= 16)
{
hash += getHashed((int*)&cStr[len - 17]);
len -= 16;
}
if (len < 16)
{
memset(shortBuf, 0, sizeof(int) * 4);
memcpy(shortBuf, cStr, len);
hash += getHashed(shortBuf);
}
return hash;
}
The disassembly for this is:
Code:
--- c:\users\daniel\documents\visual studio 2008\projects\hashtest\hashtest\main.cpp
unsigned long getHashed(int* data)
{
00F11070 push ebp
00F11071 mov ebp,esp
00F11073 and esp,0FFFFFFF0h
static const int mulConst[4] = {0x0F98A402, 0x50A89EC7, 0x8362A7DE, 0xEF83C901};
static __m128i hashConstant = _mm_loadu_si128((__m128i*)mulConst);
00F11076 mov eax,1
00F1107B test byte ptr [$S1 (0F14420h)],al
00F11081 jne getHashed+29h (0F11099h)
00F11083 movdqu xmm0,xmmword ptr [mulConst (0F13248h)]
00F1108B or dword ptr [$S1 (0F14420h)],eax
00F11091 movdqu xmmword ptr [hashConstant (0F14410h)],xmm0
static __m128i dataReg;
static __m128i res;
static unsigned int result[4] = {0};
unsigned long hash = 0;
dataReg = _mm_loadu_si128((__m128i*)data);
00F11099 mov eax,dword ptr [data]
00F1109C movdqu xmm0,xmmword ptr [eax]
00F110A0 movdqa xmmword ptr [dataReg (0F14400h)],xmm0
res = _mm_mullo_epi16(dataReg, hashConstant);
00F110A8 pmullw xmm0,xmmword ptr [hashConstant (0F14410h)]
_mm_storeu_si128((__m128i*)result, res);
00F110B0 mov ecx,offset result (0F143CCh)
00F110B5 movdqu xmmword ptr [ecx],xmm0
00F110B9 mov edx,dword ptr [result+8 (0F143D4h)]
00F110BF mov eax,dword ptr [result+0Ch (0F143D8h)]
00F110C4 add eax,edx
hash += result[0];
hash += result[1];
hash += result[2];
hash += result[3];
return hash;
00F110C6 add eax,dword ptr [result+4 (0F143D0h)]
00F110CC movdqa xmmword ptr [res (0F143F0h)],xmm0
00F110D4 add eax,dword ptr [result (0F143CCh)]
}
00F110DA mov esp,ebp
00F110DC pop ebp
00F110DD ret
--- No source file -------------------------------------------------------------
00F110DE int 3
00F110DF int 3
--- c:\users\daniel\documents\visual studio 2008\projects\hashtest\hashtest\main.cpp
unsigned long ToHash(const char* cStr, int len)
{
unsigned long hash = 2508234;
unsigned long num = 0;
static int shortBuf[4] = {0};
while (len >= 16)
00F110E0 cmp eax,10h
00F110E3 push ebx
00F110E4 push esi
00F110E5 push edi
00F110E6 mov esi,2645CAh
00F110EB jl ToHash+84h (0F11164h)
00F110ED movdqa xmm2,xmmword ptr [mulConst (0F13248h)]
00F110F5 mov edx,dword ptr [$S1 (0F14420h)]
00F110FB movdqa xmm1,xmmword ptr [hashConstant (0F14410h)]
{
hash += getHashed((int*)&cStr[len - 17]);
00F11103 test dl,1
00F11106 jne ToHash+2Fh (0F1110Fh)
00F11108 or edx,1
00F1110B movdqu xmm1,xmm2
00F1110F movdqu xmm0,xmmword ptr [ecx+eax-11h]
00F11115 mov edi,offset result (0F143CCh)
00F1111A movdqa xmmword ptr [dataReg (0F14400h)],xmm0
00F11122 pmullw xmm0,xmm1
00F11126 movdqu xmmword ptr [edi],xmm0
00F1112A mov edi,dword ptr [result+8 (0F143D4h)]
00F11130 mov ebx,dword ptr [result+0Ch (0F143D8h)]
00F11136 add ebx,edi
00F11138 add ebx,dword ptr [result+4 (0F143D0h)]
len -= 16;
00F1113E sub eax,10h
00F11141 add ebx,dword ptr [result (0F143CCh)]
00F11147 movdqa xmmword ptr [res (0F143F0h)],xmm0
00F1114F add esi,ebx
00F11151 cmp eax,10h
00F11154 jge ToHash+23h (0F11103h)
00F11156 movdqa xmmword ptr [hashConstant (0F14410h)],xmm1
00F1115E mov dword ptr [$S1 (0F14420h)],edx
}
if (len < 16)
{
memset(shortBuf, 0, sizeof(int) * 4);
memcpy(shortBuf, cStr, len);
00F11164 push eax
00F11165 push ecx
00F11166 pxor xmm0,xmm0
00F1116A push offset shortBuf (0F143DCh)
00F1116F movq mmword ptr [shortBuf (0F143DCh)],xmm0
00F11177 movq mmword ptr [shortBuf+8 (0F143E4h)],xmm0
00F1117F call memcpy (0F12086h)
00F11184 add esp,0Ch
hash += getHashed(shortBuf);
00F11187 push offset shortBuf (0F143DCh)
00F1118C call getHashed (0F11070h)
00F11191 add esp,4
00F11194 add esi,eax
}
return hash;
}
It crashed on the blue line, and I assume it's because the data isn't aligned.
I'm not too familiar with assembler, but as far as I can tell the getHash function gets inlined - but then the aligned SSE commands are used instead of the unaligned?
How do you think I should solve this? Maybe I should be more strict on what to inline, maybe I should just "inline" it myself by pasting the code there or maybe I should write the getHash function in assembler - which would be a quite nice lesson in a way. However, I would really be interested in what's causing this change of assembler command - maybe I've missed something that's right before my eyes.
While I'm still at it I have two more questions; is it really a good idea to store the hashConstant variable as static if this code should be in a bigger project? Are there any better ways to get the calculated data from the _m128i result into the hash than adding each number at the time? I'm kinda hooked on adding 128 bits at the time now. ;)
Would appreciate any input on this.
Daniel