I wrote a short _asm block, but it runs very slowly:
//this is C++
Code:
// Cursor into pMatrix starting ic elements in; the loop below updates the
// element it points at.
double *pdb0 = pMatrix + ic;
// Cursor into pMatrix starting i elements in; the loop below only reads
// through it.
double *pdb1 = pMatrix + i;
// Scale factor multiplied with *pdb1 in the loop (initializer elided in the post).
double dbT = ...;
// This piece of C code is replaced by the _asm block below.
Code:
for(j=0; j<iOrder; j++)
{
*pdb0 = *pdb0 + dbT * *pdb1;
pdb0 += iOrder;
pdb1 += iOrder;
}
Code:
// x87 inline-assembly version of:
//     for (j = 0; j < iOrder; j++) { *pdb0 += dbT * *pdb1; pdb0 += iOrder; pdb1 += iOrder; }
//
// Register roles inside the block:
//     edi    = cursor loaded from pdb0 (element being updated)
//     esi    = cursor loaded from pdb1 (element being read)
//     eax    = stride in bytes (iOrder * 8, one double is 8 bytes)
//     ecx    = iterations remaining
//     st(0)  = dbT between iterations
//
// Notes:
//   * No finit: it resets the FPU control word (precision, exception masks),
//     which can make results differ from the compiler-generated loop.
//   * dbT is popped on exit so the x87 register stack is left balanced.
//   * The C variables pdb0/pdb1/j are not written back by this block.
_asm
{
    mov     ecx, iOrder             ; ecx = iteration count
    fld     dbT                     ; st(0) = dbT, kept across the whole loop
    test    ecx, ecx
    jle     Loop_Done               ; no iterations when iOrder <= 0 (assumes iOrder is a signed int - confirm)
    mov     edi, pdb0               ; edi = pdb0
    mov     esi, pdb1               ; esi = pdb1
    mov     eax, ecx
    shl     eax, 3                  ; eax = iOrder * 8 = stride in bytes
Loop_Start:
    fld     qword ptr [esi]         ; st(0) = *pdb1, st(1) = dbT
    fmul    st(0), st(1)            ; st(0) = dbT * *pdb1
    fadd    qword ptr [edi]         ; st(0) = *pdb0 + dbT * *pdb1
    fstp    qword ptr [edi]         ; *pdb0 = st(0); pop, so st(0) = dbT again
    add     edi, eax                ; advance pdb0 cursor by one stride
    add     esi, eax                ; advance pdb1 cursor by one stride
    dec     ecx
    jnz     Loop_Start
Loop_Done:
    fstp    st(0)                   ; pop dbT: leave the x87 stack as we found it
}
The _asm block above works correctly, but it is slower than the C code.
Why is that, and how can I improve it?
Thanks.