Mostly, you just let the compiler optimiser worry about all the things Quzah mentioned.
Common sub-expression elimination, and the replacement of certain arithmetic operations with cheaper bit operations, have been standard compiler optimisations for a long time.
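As a trivial illustration of the latter (my own made-up example, not from the code in question): for unsigned operands you can expect a decent compiler to turn a power-of-two division and modulus into a shift and a mask all by itself, so there's no need to write the "clever" version by hand.
Code:
/* Hypothetical example: both functions should compile to essentially the
   same code at -O2 (a shift and an AND), so the "tricky" one buys nothing. */
unsigned sum_plain(unsigned x)  { return x / 8 + x % 8; }
unsigned sum_tricky(unsigned x) { return (x >> 3) + (x & 7); }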
Example
Code:
void foo ( int num_c_symbols, unsigned char *LT_c_symbols, int SymSize, unsigned char block_Id ) {
    int j;
    for (j = 0; j < num_c_symbols; j++)
    {
        LT_c_symbols[j*(SymSize+3)]=block_Id;
        LT_c_symbols[j*(SymSize+3)+1]=j/255;
        LT_c_symbols[j*(SymSize+3)+2]=j%255; // I'm guessing 5 was a typo?
    }
}
Taken "as is", you can see everything being calculated for each array subscript
Code:
$ gcc -S -c baz.c
$ more baz.s
.file "baz.c"
.text
.globl foo
.type foo, @function
foo:
pushl %ebp
movl %esp, %ebp
pushl %esi
pushl %ebx
subl $20, %esp
movl 20(%ebp), %eax
movb %al, -28(%ebp)
movl $0, -12(%ebp)
jmp .L2
.L3:
movl 16(%ebp), %eax
addl $3, %eax
imull -12(%ebp), %eax
addl 12(%ebp), %eax
movzbl -28(%ebp), %edx
movb %dl, (%eax)
movl 16(%ebp), %eax
addl $3, %eax
imull -12(%ebp), %eax
addl $1, %eax
movl %eax, %ebx
addl 12(%ebp), %ebx
movl -12(%ebp), %ecx
movl $-2139062143, %edx
movl %ecx, %eax
imull %edx
leal (%edx,%ecx), %eax
movl %eax, %edx
sarl $7, %edx
movl %ecx, %eax
sarl $31, %eax
movl %edx, %ecx
subl %eax, %ecx
movl %ecx, %eax
movb %al, (%ebx)
movl 16(%ebp), %eax
addl $3, %eax
imull -12(%ebp), %eax
addl $2, %eax
movl %eax, %ebx
addl 12(%ebp), %ebx
movl -12(%ebp), %ecx
movl $-2139062143, %edx
movl %ecx, %eax
imull %edx
leal (%edx,%ecx), %eax
movl %eax, %edx
sarl $7, %edx
movl %ecx, %eax
sarl $31, %eax
movl %edx, %esi
subl %eax, %esi
movl %esi, %eax
movl %eax, %edx
sall $8, %edx
subl %eax, %edx
movl %ecx, %eax
subl %edx, %eax
movb %al, (%ebx)
addl $1, -12(%ebp)
.L2:
movl -12(%ebp), %eax
cmpl 8(%ebp), %eax
jl .L3
addl $20, %esp
popl %ebx
popl %esi
popl %ebp
ret
But turn on optimisation, and most of that vanishes.
Code:
$ gcc -S -c -O2 baz.c
$ more baz.s
.file "baz.c"
.text
.p2align 4,,15
.globl foo
.type foo, @function
foo:
pushl %ebp
movl %esp, %ebp
pushl %edi
pushl %esi
pushl %ebx
subl $4, %esp
movl 8(%ebp), %edi
movzbl 20(%ebp), %esi
testl %edi, %edi
jle .L4
movl 16(%ebp), %eax
xorl %ecx, %ecx
movl 12(%ebp), %ebx
addl $3, %eax
movl %eax, -16(%ebp)
.p2align 4,,7
.p2align 3
.L3:
movl %esi, %eax
movb %al, (%ebx)
movl $-2139062143, %eax
imull %ecx
movl %ecx, %eax
sarl $31, %eax
addl %ecx, %edx
sarl $7, %edx
subl %eax, %edx
movl %edx, %eax
sall $8, %eax
subl %edx, %eax
movb %dl, 1(%ebx)
movl %ecx, %edx
addl $1, %ecx
subl %eax, %edx
movb %dl, 2(%ebx)
addl -16(%ebp), %ebx
cmpl %edi, %ecx
jne .L3
.L4:
addl $4, %esp
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
That's a loop of 54 instructions reduced to a loop of 20 instructions.
And only one 'mul' instruction (instead of 5), and it's no longer the subscript calculation: the j*(SymSize+3) multiply has been strength-reduced to a pointer that simply advances by SymSize+3 each time round the loop (the addl -16(%ebp), %ebx near the bottom), while the remaining imull is the magic-constant reciprocal the compiler uses to compute j/255, whose result is then reused for j%255.
The +1 and +2 offsets come for free by using the built-in offset feature of certain x86 addressing modes (the 1(%ebx) and 2(%ebx) operands).
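Translated back into C, the optimised loop is roughly equivalent to this hand sketch (my reconstruction of what the compiler did, not literal decompiled output):
Code:
void foo(int num_c_symbols, unsigned char *LT_c_symbols, int SymSize, unsigned char block_Id)
{
    unsigned char *p = LT_c_symbols;
    int step = SymSize + 3;       /* common sub-expression hoisted out of the loop */
    int j;
    for (j = 0; j < num_c_symbols; j++)
    {
        p[0] = block_Id;
        p[1] = j / 255;           /* one reciprocal multiply serves both the...    */
        p[2] = j % 255;           /* ...division and the remainder                 */
        p += step;                /* subscript multiply strength-reduced to this   */
    }
}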
Unless you can come up with a genuinely better algorithm (one without nested loops, for example, which is what is really hurting you here), chances are that any "hand tweaking" of the code will have little impact. If you're too clever, you'll just make the compiler's optimiser more conservative and end up with worse code as a result.
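For what it's worth, the one source-level change to this particular loop that counts as an algorithmic change rather than a tweak would be to stop recomputing j/255 and j%255 every iteration and carry them as running counters instead. This is just a sketch (and worth measuring before trusting), but it removes the division from the loop entirely:
Code:
void foo(int num_c_symbols, unsigned char *LT_c_symbols, int SymSize, unsigned char block_Id)
{
    unsigned char *p = LT_c_symbols;
    unsigned char hi = 0, lo = 0;   /* hi tracks j/255, lo tracks j%255 */
    int j;
    for (j = 0; j < num_c_symbols; j++)
    {
        p[0] = block_Id;
        p[1] = hi;
        p[2] = lo;
        p += SymSize + 3;
        if (++lo == 255)            /* wrap instead of dividing */
        {
            lo = 0;
            hi++;
        }
    }
}
Whether that actually beats the reciprocal multiply the compiler already generates is exactly the sort of thing you have to profile rather than guess.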