Thread: Problem with unaligned intrinsics

  1. #1
    Registered User
    Join Date
    Jan 2006
    Location
    Sweden
    Posts
    92

    Problem with unaligned intrinsics

    I have a problem where my code behaves quite strangely in release mode. I'm practicing how to use intrinsics by writing a simple hash function and I'm getting a crash when I load in data to a xmm register. The data is not 16 byte aligned, but I'm using the __mm_loadu_si128 intrinsic so that shouldn't be the problem, but it looks to me like something gets messed up when the code is generated. The compiler I'm using is visual studio 2008.

    Code:
    unsigned long getHashed(int* data)
    {
    	static const int mulConst[4] = {0x0F98A402, 0x50A89EC7, 0x8362A7DE, 0xEF83C901};
    
    	static __m128i hashConstant = _mm_loadu_si128((__m128i*)mulConst);
    	static __m128i dataReg;
    	static __m128i res;
    	static unsigned int result[4] = {0};
    
    	unsigned long hash = 0;
    
    	dataReg = _mm_loadu_si128((__m128i*)data);
    	res = _mm_mullo_epi16(dataReg, hashConstant);
    	_mm_storeu_si128((__m128i*)result, res);
    	hash += result[0];
    	hash += result[1];
    	hash += result[2];
    	hash += result[3];
    
    	return hash;
    }
    
    
    unsigned long ToHash(const char* cStr, int len)
    {
    	unsigned long hash = 2508234;
    	unsigned long num = 0;
    
    	static int shortBuf[4] = {0};
    
    	while (len >= 16)
    	{
    		hash += getHashed((int*)&cStr[len - 17]);
    
    		len -= 16;
    	}
    
    	if (len < 16)
    	{
    		memset(shortBuf, 0, sizeof(int) * 4);
    		memcpy(shortBuf, cStr, len);
    		hash += getHashed(shortBuf);
    	}
    	return hash;
    }
    The disassembly for this is:
    Code:
    --- c:\users\daniel\documents\visual studio 2008\projects\hashtest\hashtest\main.cpp 
    
    unsigned long getHashed(int* data)
    {
    00F11070  push        ebp  
    00F11071  mov         ebp,esp 
    00F11073  and         esp,0FFFFFFF0h 
    	static const int mulConst[4] = {0x0F98A402, 0x50A89EC7, 0x8362A7DE, 0xEF83C901};
    	static __m128i hashConstant = _mm_loadu_si128((__m128i*)mulConst);
    00F11076  mov         eax,1 
    00F1107B  test        byte ptr [$S1 (0F14420h)],al 
    00F11081  jne         getHashed+29h (0F11099h) 
    00F11083  movdqu      xmm0,xmmword ptr [mulConst (0F13248h)] 
    00F1108B  or          dword ptr [$S1 (0F14420h)],eax 
    00F11091  movdqu      xmmword ptr [hashConstant (0F14410h)],xmm0 
    	static __m128i dataReg;
    	static __m128i res;
    	static unsigned int result[4] = {0};
    
    	unsigned long hash = 0;
    
    	dataReg = _mm_loadu_si128((__m128i*)data);
    00F11099  mov         eax,dword ptr [data] 
    00F1109C  movdqu      xmm0,xmmword ptr [eax] 
    00F110A0  movdqa      xmmword ptr [dataReg (0F14400h)],xmm0 
    	res = _mm_mullo_epi16(dataReg, hashConstant);
    00F110A8  pmullw      xmm0,xmmword ptr [hashConstant (0F14410h)] 
    	_mm_storeu_si128((__m128i*)result, res);
    00F110B0  mov         ecx,offset result (0F143CCh) 
    00F110B5  movdqu      xmmword ptr [ecx],xmm0 
    00F110B9  mov         edx,dword ptr [result+8 (0F143D4h)] 
    00F110BF  mov         eax,dword ptr [result+0Ch (0F143D8h)] 
    00F110C4  add         eax,edx 
    	hash += result[0];
    	hash += result[1];
    	hash += result[2];
    	hash += result[3];
    
    	return hash;
    00F110C6  add         eax,dword ptr [result+4 (0F143D0h)] 
    00F110CC  movdqa      xmmword ptr [res (0F143F0h)],xmm0 
    00F110D4  add         eax,dword ptr [result (0F143CCh)] 
    }
    00F110DA  mov         esp,ebp 
    00F110DC  pop         ebp  
    00F110DD  ret              
    --- No source file -------------------------------------------------------------
    00F110DE  int         3    
    00F110DF  int         3    
    --- c:\users\daniel\documents\visual studio 2008\projects\hashtest\hashtest\main.cpp 
    
    
    unsigned long ToHash(const char* cStr, int len)
    {
    	unsigned long hash = 2508234;
    	unsigned long num = 0;
    
    	static int shortBuf[4] = {0};
    
    	while (len >= 16)
    00F110E0  cmp         eax,10h 
    00F110E3  push        ebx  
    00F110E4  push        esi  
    00F110E5  push        edi  
    00F110E6  mov         esi,2645CAh 
    00F110EB  jl          ToHash+84h (0F11164h) 
    00F110ED  movdqa      xmm2,xmmword ptr [mulConst (0F13248h)] 
    00F110F5  mov         edx,dword ptr [$S1 (0F14420h)] 
    00F110FB  movdqa      xmm1,xmmword ptr [hashConstant (0F14410h)] 
    	{
    		hash += getHashed((int*)&cStr[len - 17]);
    00F11103  test        dl,1 
    00F11106  jne         ToHash+2Fh (0F1110Fh) 
    00F11108  or          edx,1 
    00F1110B  movdqu      xmm1,xmm2 
    00F1110F  movdqu      xmm0,xmmword ptr [ecx+eax-11h] 
    00F11115  mov         edi,offset result (0F143CCh) 
    00F1111A  movdqa      xmmword ptr [dataReg (0F14400h)],xmm0 
    00F11122  pmullw      xmm0,xmm1 
    00F11126  movdqu      xmmword ptr [edi],xmm0 
    00F1112A  mov         edi,dword ptr [result+8 (0F143D4h)] 
    00F11130  mov         ebx,dword ptr [result+0Ch (0F143D8h)] 
    00F11136  add         ebx,edi 
    00F11138  add         ebx,dword ptr [result+4 (0F143D0h)] 
    
    		len -= 16;
    00F1113E  sub         eax,10h 
    00F11141  add         ebx,dword ptr [result (0F143CCh)] 
    00F11147  movdqa      xmmword ptr [res (0F143F0h)],xmm0 
    00F1114F  add         esi,ebx 
    00F11151  cmp         eax,10h 
    00F11154  jge         ToHash+23h (0F11103h) 
    00F11156  movdqa      xmmword ptr [hashConstant (0F14410h)],xmm1 
    00F1115E  mov         dword ptr [$S1 (0F14420h)],edx 
    	}
    
    	if (len < 16)
    	{
    		memset(shortBuf, 0, sizeof(int) * 4);
    		memcpy(shortBuf, cStr, len);
    00F11164  push        eax  
    00F11165  push        ecx  
    00F11166  pxor        xmm0,xmm0 
    00F1116A  push        offset shortBuf (0F143DCh) 
    00F1116F  movq        mmword ptr [shortBuf (0F143DCh)],xmm0 
    00F11177  movq        mmword ptr [shortBuf+8 (0F143E4h)],xmm0 
    00F1117F  call        memcpy (0F12086h) 
    00F11184  add         esp,0Ch 
    		hash += getHashed(shortBuf);
    00F11187  push        offset shortBuf (0F143DCh) 
    00F1118C  call        getHashed (0F11070h) 
    00F11191  add         esp,4 
    00F11194  add         esi,eax 
    	}
    	return hash;
    }
    It crashed on the blue line, and I assume it's because the data isn't aligned.
    I'm not too familiar with assembler, but as far as I can tell the getHash function gets inlined - but then the aligned SSE commands are used instead of the unaligned?

    How do you think I should solve this? Maybe I should be more strict on what to inline, maybe I should just "inline" it myself by pasting the code there or maybe I should write the getHash function in assembler - which would be a quite nice lesson in a way. However, I would really be interested in what's causing this change of assembler command - maybe I've missed something that's right before my eyes.

    While I'm still at it I have two more questions; is it really a good idea to store the hashConstant variable as static if this code should be in a bigger project? Are there any better ways to get the calculated data from the _m128i result into the hash than adding each number at the time? I'm kinda hooked on adding 128 bits at the time now.

    Would appreciate any input on this.

    Daniel

  2. #2
    and the hat of int overfl Salem's Avatar
    Join Date
    Aug 2001
    Location
    The edge of the known universe
    Posts
    39,656
    > hash += getHashed((int*)&cStr[len - 17]);
    So why is this -17 and not -16?

    Worst possible alignment.
    If you dance barefoot on the broken glass of undefined behaviour, you've got to expect the occasional cut.
    If at first you don't succeed, try writing your phone number on the exam paper.

  3. #3
    Registered User
    Join Date
    Jan 2006
    Location
    Sweden
    Posts
    92
    Quote Originally Posted by Salem View Post
    > hash += getHashed((int*)&cStr[len - 17]);
    So why is this -17 and not -16?

    Worst possible alignment.
    Thanks for pointing it out, I don't really know what I was thinking about when I wrote -17.
    It also seemed to fix the problem.

    I also changed the code a bit to use aligned data instead, I assumed I can use #pragma pack(16) like this, am I correct? It's working very nice, but I just want to make sure I'm not just lucky with the alignments.

    Code:
    unsigned long ToHash(const char* cStr, int len)
    {
    	unsigned long hash = 2508234;
    
    #pragma pack(16)
    	int mulConst[4] = {0x0F98A402, 0x50A89EC7, 0x8362A7DE, 0xEF83C901};
    	int result[4] = {0};
    	int shortBuf[4] = {0};
    #pragma pack()
    
    	__m128i dataReg;
    	__m128i res;
    	__m128i hashConstant = _mm_load_si128((__m128i*)mulConst);
    
    	
    	while (len >= 16)
    	{
    		dataReg = _mm_loadu_si128((__m128i*)&cStr[len - 16]);
    		res = _mm_madd_epi16(dataReg, hashConstant);
    		_mm_store_si128((__m128i*)result, res);
    		hash += result[0];
    		hash += result[1];
    		hash += result[2];
    		hash += result[3];
    
    		len -= 16;
    	}
    
    	if (len < 16 && len > 1)
    	{
    		memcpy(shortBuf, cStr, len);
    		
    		dataReg = _mm_load_si128((__m128i*)shortBuf);
    		res = _mm_madd_epi16(dataReg, hashConstant);
    		_mm_store_si128((__m128i*)result, res);
    		hash += result[0];
    		hash += result[1];
    		hash += result[2];
    		hash += result[3];
    
    	} else if (len == 1)
    	{
    		hash += cStr[0] * 0x390482F;
    	}
    
    	return hash;
    }
    Thanks.

    Daniel

  4. #4
    Kernel hacker
    Join Date
    Jul 2007
    Location
    Farncombe, Surrey, England
    Posts
    15,677
    pragma pack, as far as I know, is only for padding within structures, not stack-based alignment of arrays.

    You probably want this: http://msdn.microsoft.com/en-us/library/83ythb65.aspx

    --
    Mats
    Compilers can produce warnings - make the compiler programmers happy: Use them!
    Please don't PM me for help - and no, I don't do help over instant messengers.

  5. #5
    Registered User
    Join Date
    Jan 2006
    Location
    Sweden
    Posts
    92
    Quote Originally Posted by matsp View Post
    pragma pack, as far as I know, is only for padding within structures, not stack-based alignment of arrays.

    You probably want this: http://msdn.microsoft.com/en-us/library/83ythb65.aspx

    --
    Mats
    Ah, that was exactly was I was looking for - thanks.

    I changed the declaration to this:
    Code:
    	__declspec(align(16)) int mulConst[4] = {0x0F98A402, 0x50A89EC7, 0x8362A7DE, 0xEF83C901};
    	__declspec(align(16)) int result[4] = {0};
    	__declspec(align(16)) int shortBuf[4] = {0};
    And it works perfectly, thanks for your help.

    Daniel

Popular pages Recent additions subscribe to a feed

Similar Threads

  1. Memory problem with Borland C 3.1
    By AZ1699 in forum C Programming
    Replies: 16
    Last Post: 11-16-2007, 11:22 AM
  2. Someone having same problem with Code Block?
    By ofayto in forum C++ Programming
    Replies: 1
    Last Post: 07-12-2007, 08:38 AM
  3. A question related to strcmp
    By meili100 in forum C++ Programming
    Replies: 6
    Last Post: 07-07-2007, 02:51 PM
  4. WS_POPUP, continuation of old problem
    By blurrymadness in forum Windows Programming
    Replies: 1
    Last Post: 04-20-2007, 06:54 PM
  5. beginner problem
    By The_Nymph in forum C Programming
    Replies: 4
    Last Post: 03-05-2002, 05:46 PM