I'm trying to get the hang of using SSE instructions through gcc, and I can't get multiplication to work.
Here is my code and my Makefile
simd.c
Code:
#include <stdio.h>
#include <stdlib.h>
/* Vector of ints occupying 16 bytes, declared with the GCC vector_size
 * attribute (four elements when int is 4 bytes wide). */
typedef int v4si __attribute__ ((vector_size (16)));
/* Overlays the vector with a 4-element int array so that individual
 * elements can be read and written by index through `s`, while `v` is
 * used for whole-vector arithmetic. */
typedef union { int s[4]; v4si v; } v4si_u;
/*
 * Print the command-line synopsis to standard output and terminate the
 * process with exit status 1. Does not return.
 *
 * argv: the program's argument vector; only argv[0] (the program name)
 *       is read.
 */
void usage(char **argv)
{
    const char *program = argv[0];

    fprintf(stdout, "Usage: %s [1|2]\n1: SIMD mode\n2: SISD mode\n", program);
    exit(1);
}
/*
 * Entry point: runs one of two multiplication loops selected by the first
 * character of argv[1].
 *
 *   '1'  multiply the two vectors a.v and b.v as whole vectors
 *   '2'  multiply the plain int arrays aa and bb element by element
 *
 * Any other selector, or a missing argument, prints the usage text via
 * usage(), which exits the process. Returns 0 otherwise.
 *
 * NOTE(review): the products are never read afterwards, and repeated
 * multiplication soon exceeds the range of int; the loops exist only to
 * exercise the multiply instruction sequence.
 */
int main(int argc, char **argv)
{
    enum { LANES = 4, ITERATIONS = 0xFFFFFF };

    int aa[LANES] = { 1, 2, 3, 4 };
    int bb[LANES] = { 4, 3, 2, 1 };
    v4si_u a, b;
    int iter, lane;

    if (argc < 2) {
        usage(argv);
    }

    /* Copy the scalar operands into the vector unions element by element. */
    for (lane = 0; lane < LANES; ++lane) {
        a.s[lane] = aa[lane];
        b.s[lane] = bb[lane];
    }

    switch (argv[1][0]) {
    case '1':
        /* Whole-vector multiply using the GCC vector extension. */
        for (iter = 0; iter < ITERATIONS; ++iter) {
            a.v = a.v * b.v;
        }
        break;

    case '2':
        /* Scalar reference: one multiply per element. */
        for (iter = 0; iter < ITERATIONS; ++iter) {
            for (lane = 0; lane < LANES; ++lane) {
                aa[lane] *= bb[lane];
            }
        }
        break;

    default:
        usage(argv);
        break;
    }

    return 0;
}
Makefile
Code:
# Build settings for the simd test program.
CC = gcc
FLAGS = -O0 -g -msse -m64
OUTPUT = simd

# These targets name actions, not files; declaring them phony keeps a stray
# file called "all", "exec" or "obj" from suppressing the recipe.
.PHONY: all exec obj

# Default target: build the executable.
all: exec

# Compile and link simd.c into $(OUTPUT).
# Recipe lines must begin with a tab character.
exec:
	$(CC) $(FLAGS) -o $(OUTPUT) simd.c

# Compile simd.c to an object file without linking.
obj:
	$(CC) $(FLAGS) -c simd.c
OS X's otool -vt gives me this for the multiplication of the vectors
Code:
00000000000000e1 movdqa 0xc0(%rbp),%xmm1
00000000000000e6 movdqa 0xb0(%rbp),%xmm2
00000000000000eb movd %xmm1,%edx
00000000000000ef movd %xmm2,%eax
00000000000000f3 movl %edx,%ecx
00000000000000f5 imull %eax,%ecx
00000000000000f8 movl %ecx,0x90(%rbp)
00000000000000fb pshufd $0x55,%xmm1,%xmm0
0000000000000100 movd %xmm0,%edx
0000000000000104 pshufd $0x55,%xmm2,%xmm0
0000000000000109 movd %xmm0,%eax
000000000000010d movl %edx,%ecx
000000000000010f imull %eax,%ecx
0000000000000112 movl %ecx,0x94(%rbp)
0000000000000115 movdqa %xmm1,%xmm0
0000000000000119 punpckhdq %xmm1,%xmm0
000000000000011d movd %xmm0,%edx
0000000000000121 movdqa %xmm2,%xmm0
0000000000000125 punpckhdq %xmm2,%xmm0
0000000000000129 movd %xmm0,%eax
000000000000012d movl %edx,%ecx
000000000000012f imull %eax,%ecx
0000000000000132 movl %ecx,0x98(%rbp)
0000000000000135 pshufd $0xff,%xmm1,%xmm0
000000000000013a movd %xmm0,%edx
000000000000013e pshufd $0xff,%xmm2,%xmm0
0000000000000143 movd %xmm0,%eax
0000000000000147 movl %edx,%ecx
0000000000000149 imull %eax,%ecx
000000000000014c movl %ecx,0x9c(%rbp)
000000000000014f movd 0x90(%rbp),%xmm1
0000000000000154 movd 0x94(%rbp),%xmm0
0000000000000159 punpckldq %xmm0,%xmm1
000000000000015d movd 0x98(%rbp),%xmm0
0000000000000162 movd 0x9c(%rbp),%xmm2
0000000000000167 punpckldq %xmm2,%xmm0
000000000000016b movq %xmm1,%xmm2
000000000000016f punpcklqdq %xmm0,%xmm2
0000000000000173 movdqa %xmm2,%xmm0
0000000000000177 movdqa %xmm0,0xc0(%rbp)
Basically, it loads the values from the stack into the xmm registers, moves each element into a normal (general-purpose) register, multiplies it there, moves the results back into xmm registers, and then stores them back onto the stack, completely defeating the purpose of SSE and in fact harming performance. If I change the operation to addition, it works correctly:
Code:
00000000000000da movdqa 0xc0(%rbp),%xmm1
00000000000000df movdqa 0xb0(%rbp),%xmm0
00000000000000e4 paddd %xmm1,%xmm0
00000000000000e8 movdqa %xmm0,0xc0(%rbp)
I thought SSE supported integer multiplication. Am I wrong, or am I doing something wrong?
**EDIT**
A little bit of system info
Code:
$ gcc -v
Using built-in specs.
Target: i686-apple-darwin9
[huge string removed]
Thread model: posix
gcc version 4.0.1 (Apple Inc. build 5465)
$ uname -a
Darwin 9.4.0 Darwin Kernel Version 9.4.0: Mon Jun 9 19:30:53 PDT 2008; root:xnu-1228.5.20~1/RELEASE_I386 i386