Thread: Compiling c++ intrinsics commands

  1. #31
    Kernel hacker
    Join Date
    Jul 2007
    Location
    Farncombe, Surrey, England
    Posts
    15,677
    I took the "blue" pixel conversion and re-wrote it to inline assembler:
    Code:
    _asm 
    {
    // Blends the blue channel of four pixels at once:
    //   blue = (textBlue - screenBlue) * (textAlpha / m4_255) + screenBlue
    // This should be done by declaring a static const variable of __m128 which is loaded up 
    // with 255.0f, 255.0f, 255.0f, 255.0f once and for all. I have called that variable m4_255. 
    // NOTE(review): the original intrinsics line 140 below used 256, not 255 - confirm the intended divisor.
    
    ; 140  : 			__m128 temp = _mm_set_ps(256,256,256,256);
    	movaps	xmm0, XMMWORD PTR m4_255
    ; 141  : 
    ; 142  : 			// Make alpha in the range 0 to 1
    ; 143  : 			textAlpha = _mm_div_ps(textAlpha, temp);
    
    	// xmm2 = textAlpha / m4_255; the result stays in xmm2 and is NOT written back to textAlpha
    	movaps	xmm2, XMMWORD PTR textAlpha
    	divps	xmm2, xmm0
    
    ; 144  : 
    ; 145  : 			// Blue
    ; 146  : 			temp = _mm_sub_ps(textBlue, screenBlue);
    
    	// xmm0 is reused for screenBlue from here on; xmm1 = textBlue - screenBlue
    	movaps	xmm0, XMMWORD PTR screenBlue
    	movaps	xmm1, XMMWORD PTR textBlue
    	subps	xmm1, xmm0
    
    ; 147  : 			temp = _mm_mul_ps(temp,textAlpha);
    
    	// Multiply by the scaled alpha held in xmm2. Reloading textAlpha from memory
    	// here would use the unscaled value, because the divps result was never stored.
    	mulps	xmm1, xmm2
    
    ; 148  : 			temp = _mm_add_ps(temp, screenBlue);
    	// xmm0 is still screenBlue
    	addps	xmm1, xmm0
    
    ; 149  : 
    ; 150  : 			_mm_storeu_ps(blue, temp);
    
    	movups	XMMWORD PTR blue, xmm1
    }
    --
    Mats
    Compilers can produce warnings - make the compiler programmers happy: Use them!
    Please don't PM me for help - and no, I don't do help over instant messengers.

  2. #32
    Registered User
    Join Date
    Oct 2006
    Location
    UK/Norway
    Posts
    485
    Thanks matsp

    I did as you did in your code (copied the code so it worked for green and red also), but it is still very slow.

    In the normal c++ code there is no conversion from BYTE to float, is that maybe the big difference?
    The fps now for the same scene is about 85, so 10fps better

    Here is my code:
    Code:
    //-------------------------------------------------------
    	// SEE BLITTING CODE!
    	//-------------------------------------------------------
    
    	// Alpha-blends a texture onto the screen four pixels at a time. Per channel:
    	//   screen = (texture - screen) * (alpha / m4_255) + screen
    	// Texture channels are read through the float pointers b, g, r and t.
    	// NOTE(review): the _asm block relies on m4_255 (a __m128 constant) being
    	// declared elsewhere - confirm it is in scope and 16-byte aligned.
    
    	// Create a pointer to the colours we are using
    	float *cBlue = b;
    	float *cGreen = g;
    	float *cRed = r;
    	float *cAlpha = t;
    
    	// Array for storing the result of each blitting
    	__declspec(align(16)) float blue[4];
    	__declspec(align(16)) float green[4];
    	__declspec(align(16)) float red[4];
    
    	for (int i = 0; i < height; i++)
    	{
    		//dividedWidth is width of the texture / 4
    		for (int j = 0; j < dividedWidth; j++)
    		{
    			// Four consecutive texture values per channel
    			// (presumably the source arrays are 16-byte aligned, as _mm_load_ps expects - verify)
    			__m128 textBlue =  _mm_load_ps( cBlue);
    			__m128 textGreen = _mm_load_ps( cGreen);
    			__m128 textRed =   _mm_load_ps( cRed);
    			__m128 textAlpha = _mm_load_ps( cAlpha);
    
    			// Gather one channel from four screen pixels; the screen data is interleaved
    			// with a stride of 4 elements per pixel (offsets 0/1/2 = blue/green/red)
    			__m128 screenBlue =  _mm_setr_ps( *(screenDataPnt) ,     *(screenDataPnt + 4) ,*(screenDataPnt + 8) ,*(screenDataPnt + 12) );
    			__m128 screenGreen = _mm_setr_ps( *(screenDataPnt + 1) , *(screenDataPnt + 5) ,*(screenDataPnt + 9) ,*(screenDataPnt + 13) );
    			__m128 screenRed =   _mm_setr_ps( *(screenDataPnt + 2) , *(screenDataPnt + 6) ,*(screenDataPnt + 10) ,*(screenDataPnt + 14));
    
    			_asm 
    			{
    				// This should be done by declaring a static const variable of __m128 which is loaded up 
    				// with 255.0f, 255.0f, 255.0f, 255.0f once and for all. I have called that variable m4_255. 
    				movaps	xmm0, XMMWORD PTR m4_255
    	
    				// Make alpha in the range 0 to 1: xmm3 = textAlpha / m4_255.
    				// The scaled alpha is kept in xmm3 and shared by all three channels below
    				// (xmm0 is free to be reused once the divide is done).
    				movaps	xmm3, XMMWORD PTR textAlpha
    				divps	xmm3, xmm0
    
    				// Blue
    				//------------------------------------
    					movaps	xmm0, XMMWORD PTR screenBlue
    					movaps	xmm1, XMMWORD PTR textBlue
    					subps	xmm1, xmm0
    	
    					mulps	xmm1, xmm3
    					addps	xmm1, xmm0
    
    					// Save the result in blue
    					movups	XMMWORD PTR blue, xmm1
    
    				// Green
    				//------------------------------------
    					movaps  xmm4, XMMWORD PTR screenGreen
    					movaps  xmm5, XMMWORD PTR textGreen
    					subps   xmm5, xmm4
    
    					mulps xmm5, xmm3
    					addps xmm5, xmm4
    
    					// Save the result in green
    					movups	XMMWORD PTR green, xmm5
    
    				// Red
    				//------------------------------------
    				movaps  xmm6, XMMWORD PTR screenRed
    				movaps  xmm7, XMMWORD PTR textRed
    				subps   xmm7, xmm6
    
    				mulps xmm7, xmm3
    				addps xmm7, xmm6
    
    				// Save the result in red
    				movups	XMMWORD PTR red, xmm7
    			}	
    
    			// Increment colour pointer
    			cBlue  +=4;
    			cGreen +=4;
    			cRed   +=4;
    			cAlpha +=4;
    
    			// Copy the result into the screenData pointer.
    			// Only offsets 0..2 of each pixel are written; offset 3 is left untouched.
    			for(int p = 0; p < 4; p++)
    			{
    				*(screenDataPnt + 0) = blue[p];
    				*(screenDataPnt + 1) = green[p];
    				*(screenDataPnt + 2) = red[p];
    
    				screenDataPnt += 4;
    			}
    
    		}
    
    		// Skip the rest of the screen row:
    		// (ScreenWidth - textureWidth) * elements per pixel
    		// (640         -      64)      *     4             = 2304
    		screenDataPnt += 2304;
    	}
    Last edited by h3ro; 07-11-2008 at 10:41 AM.

  3. #33
    Cat without Hat CornedBee's Avatar
    Join Date
    Apr 2003
    Posts
    8,895
    I was also told the correctly written intrinsics should be almost as fast as inlined assembly
    The only intrinsics code I've ever worked with was actually faster than the inline assembly due to instruction reordering.
    All the buzzt!
    CornedBee

    "There is not now, nor has there ever been, nor will there ever be, any programming language in which it is the least bit difficult to write bad code."
    - Flon's Law

  4. #34
    Kernel hacker
    Join Date
    Jul 2007
    Location
    Farncombe, Surrey, England
    Posts
    15,677
    Quote Originally Posted by CornedBee View Post
    The only intrinsics code I've ever worked with was actually faster than the inline assembly due to instruction reordering.
    I'd expect that's true if you use gcc - in my experience, gcc's support for SSE is much better than Visual Studio.

    --
    Mats
    Compilers can produce warnings - make the compiler programmers happy: Use them!
    Please don't PM me for help - and no, I don't do help over instant messengers.

  5. #35
    Kernel hacker
    Join Date
    Jul 2007
    Location
    Farncombe, Surrey, England
    Posts
    15,677
    Quote Originally Posted by h3ro View Post
    Thanks matsp

    I did as you did in your code (copied the code so it worked for green and red also), but it is still very slow.

    In the normal c++ code there is no conversion from BYTE to float, is that maybe the big difference?
    The fps now for the same scene is about 85, so 10fps better

    Here is my code:
    <snip>
    I suppose you may want to look at optimizing the loading of "textBlue" (in fact, you don't even need a separate variable for that, just load it straight off the cBlue variable). And of course, green and red as well. Writing your own code to load screen colours as well would also help - I'm sure the MS code is as messy for this, as it was for the central part (if not worse).

    Code:
    			// Load four consecutive floats per texture channel from the current channel pointers
    			// (presumably the pointers are 16-byte aligned, as _mm_load_ps expects - verify)
    			__m128 textBlue =  _mm_load_ps( cBlue);
    			__m128 textGreen = _mm_load_ps( cGreen);
    			__m128 textRed =   _mm_load_ps( cRed);
    			__m128 textAlpha = _mm_load_ps( cAlpha);
    
    			// Gather one screen channel from four pixels: every 4th element,
    			// starting at offset 0 (blue), 1 (green) and 2 (red)
    			__m128 screenBlue =  _mm_setr_ps( *(screenDataPnt) ,     *(screenDataPnt + 4) ,*(screenDataPnt + 8) ,*(screenDataPnt + 12) );
    			__m128 screenGreen = _mm_setr_ps( *(screenDataPnt + 1) , *(screenDataPnt + 5) ,*(screenDataPnt + 9) ,*(screenDataPnt + 13) );
    			__m128 screenRed =   _mm_setr_ps( *(screenDataPnt + 2) , *(screenDataPnt + 6) ,*(screenDataPnt + 10) ,*(screenDataPnt + 14));
    This code should be inline assembler'd too.


    Converting to float will take a couple of clock-cycles, but the math later on is simpler, so it should make no great difference.

    --
    Mats
    Compilers can produce warnings - make the compiler programmers happy: Use them!
    Please don't PM me for help - and no, I don't do help over instant messengers.

  6. #36
    Registered User
    Join Date
    Oct 2006
    Location
    UK/Norway
    Posts
    485
    Is there a way to avoid having to use this code?

    Code:
    			// Copy the result into the screenData pointer
    			for(int p = 0; p < 4; p++)
    			{
    				*(screenDataPnt + 0) = blue[p];
    				*(screenDataPnt + 1) = green[p];
    				*(screenDataPnt + 2) = red[p];
    
    				screenDataPnt += 4;
    			}
    In c++ I would do
    screenDataPnt = alpha * (textGreen - screenDataPnt) + screenDataPnt

    But in assembly it is like this:
    Load the screenData and textData into variables
    Do the calculation in a temp variable
    Copy the temp variable to screenData

    Not sure if it is possible to change, but that part of the code takes a while to execute

    EDIT:
    Thanks for your reply matsp, I'll try it out as soon as I get home

    Writing your own code to load screen colours as well would also help
    I am not sure I can do something to that array, as it is an array that points to the screen memory.
    Would it work if I create 3 new pointers and they point to different places in the screenData pointer?

    for(..)
    screenRed = screenData
    screenGreen = screenData+1
    screenBlue = screenData +2
    Last edited by h3ro; 07-12-2008 at 05:32 AM.

  7. #37
    Kernel hacker
    Join Date
    Jul 2007
    Location
    Farncombe, Surrey, England
    Posts
    15,677
    Quote Originally Posted by h3ro View Post
    Is there a way to avoid having to use this code?

    Code:
    			// Copy the result into the screenData pointer
    			for(int p = 0; p < 4; p++)
    			{
    				*(screenDataPnt + 0) = blue[p];
    				*(screenDataPnt + 1) = green[p];
    				*(screenDataPnt + 2) = red[p];
    
    				screenDataPnt += 4;
    			}
    In c++ I would do
    screenDataPnt = alpha * (textGreen - screenDataPnt) + screenDataPnt

    But in assembly it is like this:
    Load the screenData and textData into variables
    Do the calculation in a temp variable
    Copy the temp variable to screenData

    Not sure if it is possible to change, but that part of the code takes a while to execute

    EDIT:
    Thanks for your reply matsp, I'll try it out as soon as I get home


    I am not sure I can do something to that array, as it is an array that points to the screen memory.
    Would it work if I create 3 new pointers and they point to different places in the screenData pointer?

    for(..)
    screenRed = screenData
    screenGreen = screenData+1
    screenBlue = screenData +2
    Well, you still need to load, unpack and pack together the right data - I've not quite figured out what instructions you need, so I can't say for sure. You can just as well have screenData+1 as your green - no need for a separate pointer, really.

    --
    Mats
    Compilers can produce warnings - make the compiler programmers happy: Use them!
    Please don't PM me for help - and no, I don't do help over instant messengers.

  8. #38
    Registered User
    Join Date
    Oct 2006
    Location
    UK/Norway
    Posts
    485
    I suppose you may want to look at optimizing the loading of "textBlue" (in fact, you don't even need a separate variable for that, just load it straight off the cBlue variable). And of course, green and red as well. Writing your own code to load screen colours as well would also help - I'm sure the MS code is as messy for this, as it was for the central part (if not worse).
    I am starting to feel a bit bad here, as I'm asking all these questions, but I'm lost.
    I tried doing this, but it did not work
    movaps xmm1, XMMWORD PTR cBlue

    I assume it has something to do with pointers, but I'm not sure. What would be the correct command to load cBlue?

    I have ordered an assembly book now. Decided to take several steps back and try to get a basic understanding of how to write assembly code.

Popular pages Recent additions subscribe to a feed

Similar Threads

  1. Get user commands from text file.
    By Ironic in forum C Programming
    Replies: 4
    Last Post: 12-08-2008, 11:38 PM
  2. Replies: 2
    Last Post: 07-27-2007, 12:48 PM
  3. Screwy Linker Error - VC2005
    By Tonto in forum C++ Programming
    Replies: 5
    Last Post: 06-19-2007, 02:39 PM
  4. Disable ALT key commands
    By Lionmane in forum Windows Programming
    Replies: 9
    Last Post: 09-23-2005, 10:41 AM
  5. Dos commands hehe
    By Carp in forum A Brief History of Cprogramming.com
    Replies: 2
    Last Post: 01-17-2003, 02:51 PM