I'm using -Wall -arch i386 -O2 for my compiler switches.
I think the problem is that the gcc that ships with Snow Leopard generates horrendous code. I looked at the assembly output from an Ubuntu box and it was MUCH more efficient. For some reason, the gcc I have on this machine really likes memory accesses: it keeps reloading loop variables from the stack instead of holding them in registers.
Here's the assembly that my gcc produced (I'm on Snow Leopard):
Code:
decl %edx
movl %edx, -400028(%ebp)
movl $0, -400032(%ebp)
movl -400028(%ebp), %eax
cmpl %eax, -400032(%ebp)
jge L23
L6:
movl -400032(%ebp), %edi
incl %edi
cmpl -400036(%ebp), %edi
jge L7
movl -400032(%ebp), %eax
movl -400040(%ebp), %edx
leal (%edx,%eax,4), %ecx
movl %edi, %edx
movl %eax, %esi
.align 4,0x90
L9:
movl 4(%ecx), %eax
cmpl -400024(%ebp,%esi,4), %eax
cmovl %edx, %esi
incl %edx
addl $4, %ecx
cmpl -400036(%ebp), %edx
jl L9
cmpl %esi, -400032(%ebp)
je L7
movl -400040(%ebp), %ecx
movl -4(%ecx,%edi,4), %edx
movl -400024(%ebp,%esi,4), %eax
movl %eax, -4(%ecx,%edi,4)
movl %edx, -400024(%ebp,%esi,4)
L7:
movl %edi, -400032(%ebp)
movl -400028(%ebp), %eax
cmpl %eax, -400032(%ebp)
jl L6
So I think my problem is that my toolchain sucks. Thanks for the help!