Thanks matsp
I did as you did in your code (I copied the code so that it works for green and red as well), but it is still very slow.
In the normal C++ code there is no conversion from BYTE to float. Is that maybe the big difference?
The fps now for the same scene is about 85, so 10fps better
Here is my code:
Code:
//-------------------------------------------------------
// SEE BLITTING CODE!
//-------------------------------------------------------
// Create a pointer to the colours we are using
float *cBlue = b;
float *cGreen = g;
float *cRed = r;
float *cAlpha = t;
// Array for storing the result of each blitting
__declspec(align(16)) float blue[4];
__declspec(align(16)) float green[4];
__declspec(align(16)) float red[4];
for (int i = 0; i < height; i++)
{
//dividedWidth is width of the texture / 4
for (int j = 0; j < dividedWidth; j++)
{
__m128 textBlue = _mm_load_ps( cBlue);
__m128 textGreen = _mm_load_ps( cGreen);
__m128 textRed = _mm_load_ps( cRed);
__m128 textAlpha = _mm_load_ps( cAlpha);
__m128 screenBlue = _mm_setr_ps( *(screenDataPnt) , *(screenDataPnt + 4) ,*(screenDataPnt + 8) ,*(screenDataPnt + 12) );
__m128 screenGreen = _mm_setr_ps( *(screenDataPnt + 1) , *(screenDataPnt + 5) ,*(screenDataPnt + 9) ,*(screenDataPnt + 13) );
__m128 screenRed = _mm_setr_ps( *(screenDataPnt + 2) , *(screenDataPnt + 6) ,*(screenDataPnt + 10) ,*(screenDataPnt + 14));
_asm
{
// THis should be done by declaring a static const variable of __m128 which is loaded up
// with 255.0f, 255.0f, 255.0f, 255.0f once and for all. I have called that variable m4_255.
movaps xmm0, XMMWORD PTR m4_255
// Make alpha in the range 0 to 1
movaps xmm2, XMMWORD PTR textAlpha
mulps xmm2, xmm0
// Load the new alpha into xmm3
movaps xmm3, XMMWORD PTR textAlpha
// Blue
//------------------------------------
movaps xmm0, XMMWORD PTR screenBlue
movaps xmm1, XMMWORD PTR textBlue
subps xmm1, xmm0
mulps xmm1, xmm3
addps xmm1, xmm0
// Save the result in blue
movups XMMWORD PTR blue, xmm1
// Green
//------------------------------------
movaps xmm4, XMMWORD PTR screenGreen
movaps xmm5, XMMWORD PTR textGreen
subps xmm5, xmm4
mulps xmm5, xmm3
addps xmm5, xmm4
// Save the result in green
movups XMMWORD PTR green, xmm5
// Red
//------------------------------------
movaps xmm6, XMMWORD PTR screenRed
movaps xmm7, XMMWORD PTR textRed
subps xmm7, xmm6
mulps xmm7, xmm3
addps xmm7, xmm6
// Save the result in red
movups XMMWORD PTR red, xmm7
}
// Increment colour pointer
cBlue +=4;
cGreen +=4;
cRed +=4;
cAlpha +=4;
// Copy the result into the screenData pointer
for(int p = 0; p < 4; p++)
{
*(screenDataPnt + 0) = blue[p];
*(screenDataPnt + 1) = green[p];
*(screenDataPnt + 2) = red[p];
screenDataPnt += 4;
}
}
// (ScreenWidth - textureWidth) * number of pixels
// 640 - 64 * 4
screenDataPnt += 2304;
}