How about using SIMD?
Here is a quick hack written with Visual Studio (VS).
Code:
#include "stdafx.h"
#include <xmmintrin.h>
#include <iostream>
#include <vector>
// 0.0965613 -- author's timing note; presumably the seconds printed for sqrmmv (SSE version)
// Squares every element of vin in place, processing four floats per step
// with SSE and finishing any leftover elements with plain scalar code.
//
// vin: values to square; modified in place. May be empty and may have any
//      length (it does not need to be a multiple of four).
void sqrmmv(std::vector<float> &vin)
{
    const size_t count = vin.size();
    if (count == 0)
    {
        // &vin[0] is not valid on an empty vector, so bail out first.
        return;
    }

    float* const data = &vin[0];

    // Largest multiple of four that fits: this part goes through SSE.
    const size_t simdCount = count - (count % 4);
    for (size_t i = 0; i < simdCount; i += 4)
    {
        // Unaligned load/store: std::vector<float> does not promise the
        // 16-byte alignment that dereferencing a __m128* would require.
        const __m128 lanes = _mm_loadu_ps(data + i);
        _mm_storeu_ps(data + i, _mm_mul_ps(lanes, lanes));
    }

    // Remaining 0..3 elements that do not fill a whole SSE register.
    for (size_t i = simdCount; i < count; ++i)
    {
        data[i] = data[i] * data[i];
    }
}
// 0.242307 -- author's timing note; presumably the seconds printed for sqrnv (scalar version)
// Scalar reference version: replaces each element of vin with its square,
// one element at a time, in place.
//
// vin: values to square; modified in place.
void sqrnv(std::vector<float> &vin)
{
    for (std::vector<float>::iterator it = vin.begin(); it != vin.end(); ++it)
    {
        const float value = *it;
        *it = value * value;
    }
}
void test1(void)
{
const size_t kCnt = 160000000;
std::vector<float> vin(kCnt);
for(size_t idx = 0 ; idx < kCnt;idx++)
{
vin[idx]= (float)idx;
}
LARGE_INTEGER freq,start,end;
QueryPerformanceFrequency(&freq);
QueryPerformanceCounter(&start);
sqrmmv(vin);
QueryPerformanceCounter(&end);
std::wcout << (double)(end.QuadPart-start.QuadPart)/freq.QuadPart << std::endl;
if(kCnt == 16)
{
for(size_t idx = 0 ; idx < kCnt;idx++)
{
std::wcout << " " << vin[idx];
}
}
std::wcout << std::endl;
}
void test2(void)
{
const size_t kCnt = 160000000;
std::vector<float> vin(kCnt);
for(size_t idx = 0 ; idx < kCnt;idx++)
{
vin[idx]= (float)idx;
}
LARGE_INTEGER freq,start,end;
QueryPerformanceFrequency(&freq);
QueryPerformanceCounter(&start);
sqrnv(vin);
QueryPerformanceCounter(&end);
std::wcout << (double)(end.QuadPart-start.QuadPart)/freq.QuadPart << std::endl;
if(kCnt == 16)
{
for(size_t idx = 0 ; idx < kCnt;idx++)
{
std::wcout << " " << vin[idx];
}
}
std::wcout << std::endl;
}
// Program entry point: runs the two timing tests back to back, then waits
// on the shell's "pause" command before exiting.
int _tmain(int argc, _TCHAR* argv[])
{
    // The command line is not used by this program.
    (void)argc;
    (void)argv;

    test1();    // times sqrmmv
    test2();    // times sqrnv

    system("pause");
    return 0;
}