I have a problem where my code behaves quite strangely in release mode. I'm practicing how to use intrinsics by writing a simple hash function and I'm getting a crash when I load in data to a xmm register. The data is not 16 byte aligned, but I'm using the __mm_loadu_si128 intrinsic so that shouldn't be the problem, but it looks to me like something gets messed up when the code is generated. The compiler I'm using is visual studio 2008.

Code:
unsigned long getHashed(int* data)
{
	static const int mulConst[4] = {0x0F98A402, 0x50A89EC7, 0x8362A7DE, 0xEF83C901};

	static __m128i hashConstant = _mm_loadu_si128((__m128i*)mulConst);
	static __m128i dataReg;
	static __m128i res;
	static unsigned int result[4] = {0};

	unsigned long hash = 0;

	dataReg = _mm_loadu_si128((__m128i*)data);
	res = _mm_mullo_epi16(dataReg, hashConstant);
	_mm_storeu_si128((__m128i*)result, res);
	hash += result[0];
	hash += result[1];
	hash += result[2];
	hash += result[3];

	return hash;
}


unsigned long ToHash(const char* cStr, int len)
{
	unsigned long hash = 2508234;
	unsigned long num = 0;

	static int shortBuf[4] = {0};

	while (len >= 16)
	{
		hash += getHashed((int*)&cStr[len - 17]);

		len -= 16;
	}

	if (len < 16)
	{
		memset(shortBuf, 0, sizeof(int) * 4);
		memcpy(shortBuf, cStr, len);
		hash += getHashed(shortBuf);
	}
	return hash;
}
The disassembly for this is:
Code:
--- c:\users\daniel\documents\visual studio 2008\projects\hashtest\hashtest\main.cpp 

unsigned long getHashed(int* data)
{
00F11070  push        ebp  
00F11071  mov         ebp,esp 
00F11073  and         esp,0FFFFFFF0h 
	static const int mulConst[4] = {0x0F98A402, 0x50A89EC7, 0x8362A7DE, 0xEF83C901};
	static __m128i hashConstant = _mm_loadu_si128((__m128i*)mulConst);
00F11076  mov         eax,1 
00F1107B  test        byte ptr [$S1 (0F14420h)],al 
00F11081  jne         getHashed+29h (0F11099h) 
00F11083  movdqu      xmm0,xmmword ptr [mulConst (0F13248h)] 
00F1108B  or          dword ptr [$S1 (0F14420h)],eax 
00F11091  movdqu      xmmword ptr [hashConstant (0F14410h)],xmm0 
	static __m128i dataReg;
	static __m128i res;
	static unsigned int result[4] = {0};

	unsigned long hash = 0;

	dataReg = _mm_loadu_si128((__m128i*)data);
00F11099  mov         eax,dword ptr [data] 
00F1109C  movdqu      xmm0,xmmword ptr [eax] 
00F110A0  movdqa      xmmword ptr [dataReg (0F14400h)],xmm0 
	res = _mm_mullo_epi16(dataReg, hashConstant);
00F110A8  pmullw      xmm0,xmmword ptr [hashConstant (0F14410h)] 
	_mm_storeu_si128((__m128i*)result, res);
00F110B0  mov         ecx,offset result (0F143CCh) 
00F110B5  movdqu      xmmword ptr [ecx],xmm0 
00F110B9  mov         edx,dword ptr [result+8 (0F143D4h)] 
00F110BF  mov         eax,dword ptr [result+0Ch (0F143D8h)] 
00F110C4  add         eax,edx 
	hash += result[0];
	hash += result[1];
	hash += result[2];
	hash += result[3];

	return hash;
00F110C6  add         eax,dword ptr [result+4 (0F143D0h)] 
00F110CC  movdqa      xmmword ptr [res (0F143F0h)],xmm0 
00F110D4  add         eax,dword ptr [result (0F143CCh)] 
}
00F110DA  mov         esp,ebp 
00F110DC  pop         ebp  
00F110DD  ret              
--- No source file -------------------------------------------------------------
00F110DE  int         3    
00F110DF  int         3    
--- c:\users\daniel\documents\visual studio 2008\projects\hashtest\hashtest\main.cpp 


unsigned long ToHash(const char* cStr, int len)
{
	unsigned long hash = 2508234;
	unsigned long num = 0;

	static int shortBuf[4] = {0};

	while (len >= 16)
00F110E0  cmp         eax,10h 
00F110E3  push        ebx  
00F110E4  push        esi  
00F110E5  push        edi  
00F110E6  mov         esi,2645CAh 
00F110EB  jl          ToHash+84h (0F11164h) 
00F110ED  movdqa      xmm2,xmmword ptr [mulConst (0F13248h)] 
00F110F5  mov         edx,dword ptr [$S1 (0F14420h)] 
00F110FB  movdqa      xmm1,xmmword ptr [hashConstant (0F14410h)] 
	{
		hash += getHashed((int*)&cStr[len - 17]);
00F11103  test        dl,1 
00F11106  jne         ToHash+2Fh (0F1110Fh) 
00F11108  or          edx,1 
00F1110B  movdqu      xmm1,xmm2 
00F1110F  movdqu      xmm0,xmmword ptr [ecx+eax-11h] 
00F11115  mov         edi,offset result (0F143CCh) 
00F1111A  movdqa      xmmword ptr [dataReg (0F14400h)],xmm0 
00F11122  pmullw      xmm0,xmm1 
00F11126  movdqu      xmmword ptr [edi],xmm0 
00F1112A  mov         edi,dword ptr [result+8 (0F143D4h)] 
00F11130  mov         ebx,dword ptr [result+0Ch (0F143D8h)] 
00F11136  add         ebx,edi 
00F11138  add         ebx,dword ptr [result+4 (0F143D0h)] 

		len -= 16;
00F1113E  sub         eax,10h 
00F11141  add         ebx,dword ptr [result (0F143CCh)] 
00F11147  movdqa      xmmword ptr [res (0F143F0h)],xmm0 
00F1114F  add         esi,ebx 
00F11151  cmp         eax,10h 
00F11154  jge         ToHash+23h (0F11103h) 
00F11156  movdqa      xmmword ptr [hashConstant (0F14410h)],xmm1 
00F1115E  mov         dword ptr [$S1 (0F14420h)],edx 
	}

	if (len < 16)
	{
		memset(shortBuf, 0, sizeof(int) * 4);
		memcpy(shortBuf, cStr, len);
00F11164  push        eax  
00F11165  push        ecx  
00F11166  pxor        xmm0,xmm0 
00F1116A  push        offset shortBuf (0F143DCh) 
00F1116F  movq        mmword ptr [shortBuf (0F143DCh)],xmm0 
00F11177  movq        mmword ptr [shortBuf+8 (0F143E4h)],xmm0 
00F1117F  call        memcpy (0F12086h) 
00F11184  add         esp,0Ch 
		hash += getHashed(shortBuf);
00F11187  push        offset shortBuf (0F143DCh) 
00F1118C  call        getHashed (0F11070h) 
00F11191  add         esp,4 
00F11194  add         esi,eax 
	}
	return hash;
}
It crashed on the blue line, and I assume it's because the data isn't aligned.
I'm not too familiar with assembler, but as far as I can tell the getHash function gets inlined - but then the aligned SSE commands are used instead of the unaligned?

How do you think I should solve this? Maybe I should be more strict on what to inline, maybe I should just "inline" it myself by pasting the code there or maybe I should write the getHash function in assembler - which would be a quite nice lesson in a way. However, I would really be interested in what's causing this change of assembler command - maybe I've missed something that's right before my eyes.

While I'm still at it I have two more questions; is it really a good idea to store the hashConstant variable as static if this code should be in a bigger project? Are there any better ways to get the calculated data from the _m128i result into the hash than adding each number at the time? I'm kinda hooked on adding 128 bits at the time now.

Would appreciate any input on this.

Daniel