Hi,

I have implemented both of your suggestions.

I have changed the names of the __m128 variables from x and y to m128x and m128y (per iMalc's suggestion).

I am also now using the gcc equivalent of _aligned_malloc (which is posix_memalign), per tabstop's input. This time, the program progresses past the first segmentation fault. It is now segfaulting at the 8886th iteration of the first loop (the loop that initializes the array elements to rand() / 100000).

I wonder whether this is because the program has run out of memory, or whether there is a limit to the amount of memory that can be assigned to an array in C. In either case (or if the cause is something else), what is the workaround/fix?

Here is the revised code.

Thanks.

Code:

#include <stdlib.h>
#include "xmmintrin.h"
#include <stdio.h>
/* Number of float elements in each of the two input arrays. */
#define NUM_ELEMS (32 * 1024)

/* Number of times the full pass over both arrays is repeated. */
#define NUM_ITERS 10000

/*
 * Note: xmmintrin.h (included above) is a header file available under a
 * GNU GPL license. It declares the C intrinsic functions that wrap the
 * SSE instruction set.
 */

/* Scalar named after the accumulator in a += (x[i] + y[i]) * (x[i] - y[i]). */
float a;

/* Input arrays; their storage is allocated inside main(). */
float *x;
float *y;
int main(int argc, char **argv) {
int err = 0 ;
srand(1);
//x = (float*) (NUM_ELEMS * sizeof(float));
//y = (float*) malloc(NUM_ELEMS * sizeof(float));
if (posix_memalign ((void) &x, 16, NUM_ELEMS * sizeof(float))) {
fprintf(stderr, "Error in aligned memory allocation");
exit(-1) ;
}
if (posix_memalign ((void) &y, 16, NUM_ELEMS * sizeof(float))) {
fprintf(stderr, "Error in aligned memory allocation");
exit(-1) ;
}
__m128 m1, m2, m3, m4;
// type cast x, y and a to efficient intrinsic __m128 data type
__m128* m128x = (__m128*) x ;
__m128* m128y = (__m128*) y ;
__m128 a = (__m128) a ;
printf("Here2") ;
for (int i = 0; i < NUM_ELEMS; i++) {
*m128x = _mm_set_ps1((float)rand()/100000) ;
*m128y = _mm_set_ps1((float)rand()/100000) ;
m128x++ ;
m128y++ ;
}
printf("Here3") ;
for (int k = 0; k < NUM_ITERS; k++) {
a = _mm_set_ps1(0.0) ;
for ( int i = 0; i < NUM_ELEMS; i++ )
{
//a += (x[i] + y[i]) * (x[i] - y[i]);
m1 = _mm_add_ps(*m128x,*m128y); // m1 = x[i] + y[i]
m2 = _mm_sub_ps(*m128x,*m128y); // m2 = x[i] - y[i]
m3 = _mm_add_ps(m1, m2); // m3 = (x[i] + y[i]) * (x[i] - y[i])
a = _mm_add_ps(a, m3) ; // a+= x[i] + y[i] * (x[i] - y[i])
m128x++ ;
m128y++ ;
}
}
/*
fprintf(stderr, "a = %f\n", a.m128_f32[0]);
fprintf(stderr, "a = %f\n", a.m128_f32[1]);
fprintf(stderr, "a = %f\n", a.m128_f32[2]);
fprintf(stderr, "a = %f\n", a.m128_f32[3]);
*/
return 0;
}