I took the "blue" pixel conversion and rewrote it in inline assembler:
--Code:_asm
{
    ; Alpha-blend the blue channel (four packed single-precision floats):
    ;
    ;     blue = (textBlue - screenBlue) * (textAlpha / m4_255) + screenBlue
    ;
    ; Reads:    m4_255, textAlpha, textBlue, screenBlue -- all loaded with
    ;           movaps, so each must be 16-byte aligned.
    ; Writes:   blue, stored with movups (no alignment requirement).
    ; Clobbers: xmm0, xmm1, xmm2.
    ;
    ; m4_255 should be declared as a static const __m128 that is loaded up
    ; with 255.0f, 255.0f, 255.0f, 255.0f once and for all, instead of
    ; rebuilding the constant on every call as the C version does.
    ; NOTE(review): the C source below uses 256, not 255 -- confirm which
    ; divisor is actually intended.

    ; 140 : __m128 temp = _mm_set_ps(256,256,256,256);
    movaps  xmm0, XMMWORD PTR m4_255

    ; 142 : // Make alpha in the range 0 to 1
    ; 143 : textAlpha = _mm_div_ps(textAlpha, temp);
    movaps  xmm2, XMMWORD PTR textAlpha
    divps   xmm2, xmm0                      ; xmm2 = normalised alpha
    ; The normalised alpha lives only in xmm2 from here on; the textAlpha
    ; variable in memory is NOT updated and still holds the raw value.

    ; 145 : // Blue
    ; 146 : temp = _mm_sub_ps(textBlue, screenBlue);
    movaps  xmm0, XMMWORD PTR screenBlue    ; xmm0 = screenBlue (reused below)
    movaps  xmm1, XMMWORD PTR textBlue
    subps   xmm1, xmm0                      ; xmm1 = textBlue - screenBlue

    ; 147 : temp = _mm_mul_ps(temp,textAlpha);
    ; Multiply by the normalised alpha in xmm2. Reloading textAlpha from
    ; memory here would pick up the un-normalised value, because the divps
    ; result above was never stored back.
    mulps   xmm1, xmm2

    ; 148 : temp = _mm_add_ps(temp, screenBlue);
    addps   xmm1, xmm0                      ; xmm0 is still screenBlue

    ; 150 : _mm_storeu_ps(blue, temp);
    ; NOTE(review): assumes blue names the float storage itself (e.g. a
    ; local array). If blue is a pointer variable, load it into a register
    ; and store through that register instead -- confirm against the
    ; declaration.
    movups  XMMWORD PTR blue, xmm1
}
Mats