Generally, when you write loops that profiling shows to be bottlenecks, try to see if you can write them in a simpler form, accessing fewer variables.
Consider the following example code:
Code:
#include <stdlib.h>
#include <errno.h>
typedef struct {
size_t size;
size_t used;
char data[];
} mystr;
int mystr_cat1(mystr *const s, const char *z)
{
if (s == NULL)
return EINVAL;
if (z != NULL) {
size_t n = s->size - s->used;
while (*z)
if (n-->0)
s->data[s->used++] = *(z++);
else
return ENOMEM;
}
return 0;
}
int mystr_cat2(mystr *const s, const char *z)
{
if (s == NULL)
return EINVAL;
if (z == NULL)
return 0;
while (*z)
if (s->size >= s->used)
return ENOMEM;
else
s->data[s->used++] = *(z++);
return 0;
}
int mystr_cat3(mystr *const s, const char *z)
{
if (s == NULL)
return EINVAL;
if (z != NULL) {
char *p = s->data + s->used;
char *const q = s->data + s->size;
while (*z)
if (p >= q)
return ENOMEM;
else
*(p++) = *(z++);
}
return 0;
}
It is a simplified version of a C99 mutable string/data type I sometimes use.
Using GCC-4.8.2 and heavy optimization for generic x86 (-O3 -fomit-frame-pointer -ftree-vectorize -mtune=i686 -m32), the three functions compile to the following assembly code:
Code:
mystr_cat1: | mystr_cat2: | mystr_cat3:
pushl %esi | pushl %esi | pushl %ebx
movl $22, %eax | movl $22, %eax | movl 8(%esp), %ebx
pushl %ebx | pushl %ebx | movl $22, %eax
movl 12(%esp), %ebx | movl 12(%esp), %ecx | movl 12(%esp), %edx
movl 16(%esp), %edx | movl 16(%esp), %edx | testl %ebx, %ebx
testl %ebx, %ebx | testl %ecx, %ecx | je .L28
je .L2 | je .L18 | testl %edx, %edx
testl %edx, %edx | testl %edx, %edx | je .L30
je .L4 | je .L20 | movl 4(%ebx), %eax
movl 4(%ebx), %eax | cmpb $0, (%edx) | movzbl (%edx), %ecx
movl (%ebx), %ecx | je .L20 | addl %ebx, %eax
subl %eax, %ecx | movl (%ecx), %esi | addl (%ebx), %ebx
cmpb $0, (%edx) | movl 4(%ecx), %eax | addl $8, %eax
je .L4 | cmpl %eax, %esi | addl $8, %ebx
testl %ecx, %ecx | jb .L22 | testb %cl, %cl
movl %ecx, %esi | jmp .L21 | je .L30
jne .L12 | .L23: | cmpl %ebx, %eax
jmp .L7 | cmpl %esi, %eax | jb .L32
.L8: | jbe .L21 | jmp .L31
subl $1, %esi | .L22: | .L33:
je .L7 | addl $1, %eax | cmpl %ebx, %eax
.L12: | addl $1, %edx | je .L31
addl $1, %eax | movl %eax, 4(%ecx) | .L32:
addl $1, %edx | movzbl -1(%edx), %ebx | addl $1, %eax
movl %eax, 4(%ebx) | movb %bl, 7(%ecx,%eax) | addl $1, %edx
movzbl -1(%edx), %ecx | cmpb $0, (%edx) | movb %cl, -1(%eax)
movb %cl, 7(%ebx,%eax) | jne .L23 | movzbl (%edx), %ecx
cmpb $0, (%edx) | .L20: | testb %cl, %cl
jne .L8 | xorl %eax, %eax | jne .L33
.L4: | .L18: | .L30:
xorl %eax, %eax | popl %ebx | xorl %eax, %eax
.L2: | popl %esi | .L28:
popl %ebx | ret | popl %ebx
popl %esi | .L21: | ret
ret | movl $12, %eax | .L31:
.L7: | popl %ebx | movl $12, %eax
movl $12, %eax | popl %esi | popl %ebx
popl %ebx | ret | ret
popl %esi | |
ret | |
On generic x86-64 (-O3 -fomit-frame-pointer -ftree-vectorize -mtune=generic -m64), the assembly is
Code:
mystr_cat1: | mystr_cat2: | mystr_cat3:
testq %rdi, %rdi | testq %rdi, %rdi | testq %rdi, %rdi
movl $22, %eax | movl $22, %eax | movl $22, %eax
je .L2 | je .L17 | je .L26
testq %rsi, %rsi | testq %rsi, %rsi | testq %rsi, %rsi
je .L4 | je .L19 | je .L28
movq 8(%rdi), %rdx | cmpb $0, (%rsi) | movq (%rdi), %rdx
movq (%rdi), %rcx | je .L19 | movq 8(%rdi), %rax
subq %rdx, %rcx | movq (%rdi), %rcx | leaq 16(%rdi,%rdx), %rcx
cmpb $0, (%rsi) | movq 8(%rdi), %rax | movzbl (%rsi), %edx
je .L4 | cmpq %rax, %rcx | leaq 16(%rdi,%rax), %rax
subq %rdx, %rsi | jae .L20 | testb %dl, %dl
testq %rcx, %rcx | subq %rax, %rsi | je .L28
jne .L14 | jmp .L21 | cmpq %rcx, %rax
jmp .L7 | .L22: | jb .L30
.L8: | cmpq %rcx, %rax | jmp .L29
subq $1, %rcx | jbe .L20 | .L31:
je .L7 | .L21: | cmpq %rcx, %rax
movq %rax, %rdx | addq $1, %rax | je .L29
.L14: | movq %rax, 8(%rdi) | .L30:
leaq 1(%rdx), %rax | movzbl -1(%rsi,%rax), %edx | addq $1, %rax
movq %rax, 8(%rdi) | movb %dl, 15(%rdi,%rax) | addq $1, %rsi
movzbl -1(%rsi,%rax), %r8d | cmpb $0, (%rsi,%rax) | movb %dl, -1(%rax)
movb %r8b, 15(%rdi,%rax) | jne .L22 | movzbl (%rsi), %edx
cmpb $0, 1(%rsi,%rdx) | .L19: | testb %dl, %dl
jne .L8 | xorl %eax, %eax | jne .L31
.L4: | ret | .L28:
xorl %eax, %eax | .L17: | xorl %eax, %eax
ret | rep ret | ret
.L2: | .L20: | .L26:
rep ret | movl $12, %eax | rep ret
.L7: | ret | .L29:
movl $12, %eax | | movl $12, %eax
ret | | ret
We cannot tell much from just the length of the assembly -- especially x86-64 instruction timing is so complicated that a difference of a couple of instructions does not tell us anything at all.
We can tell that on i686, mystr_cat3() used only one word of stack, whereas the others used two. You'd need to run some microbenchmarks to say anything about the speed. The object code lengths for the functions are 96, 80, and 79 bytes, respectively.
On x86-64, no stack is used, because there are more registers available. mystr_cat1() and mystr_cat2() use funky addressing modes (%rsi+%rax+constant and %rdi+%rax+constant),whereas mystr_cat3() uses straight pointers. The object code lengths of the functions are 128, 112, and 97 bytes.
Although I have not benchmarked the exact functions above, prior experience with GCC (and also ICC, Portland Group, and Pathscale compilers, although it's been a few years since my last extensive checks) indicates the latter form of these is most likely to yield good compiled code: it has the simplest form, using the fewest variables within the loop (including the loop condition).
All this is to say: strive for simplicity. Do clever tricks only when profiling shows they really help with the run time; they tend to be difficult to maintain.
Don't worry about whether code is fast when you write it; worry about whether it is as simple (and robust) as you can make it. Error checking is good, it makes code more robust (especially if you write sensible error messages), but otherwise, if you can make some code simpler, do so. For loops, that means thinking about whether you can avoid using one of the variables altogether.
Most speedups in real life are achieved by switching to a faster algorithm. It is rarely useful to tweak such details as here (how best to iterate over some loop), in real-life code. Whenever I find my programs spending most of their time in some sort of copy loop, it seems more than likely that there is some way to avoid that copy altogether.