Mostly, you just let the compiler optimiser worry about all the things Quzah mentioned.
Common sub-expression elimination, and the replacement of certain arithmetic operations with cheaper bit operations, have been standard compiler optimisations for a long time.
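As a trivial illustration of the latter (my own made-up example, not from the code in question): for unsigned operands you can expect a decent compiler to turn a power-of-two division and modulus into a shift and a mask all by itself, so there's no need to write the "clever" version by hand.
Code:
/* Hypothetical example: both functions should compile to essentially the
   same code at -O2 (a shift and an AND), so the "tricky" one buys nothing. */
unsigned sum_plain(unsigned x)  { return x / 8 + x % 8; }
unsigned sum_tricky(unsigned x) { return (x >> 3) + (x & 7); }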
Example
Code:
void foo ( int num_c_symbols, unsigned char *LT_c_symbols, int SymSize, unsigned char block_Id ) {
    int j;
    for (j = 0; j < num_c_symbols; j++)
    {
        LT_c_symbols[j*(SymSize+3)]=block_Id;
        LT_c_symbols[j*(SymSize+3)+1]=j/255;
        LT_c_symbols[j*(SymSize+3)+2]=j%255; // I'm guessing 5 was a typo?
    }
}
Taken "as is", you can see everything being calculated for each array subscript
Code:
$ gcc -S -c baz.c
$ more baz.s
.file "baz.c"
.text
.globl foo
.type foo, @function
foo:
pushl %ebp
movl %esp, %ebp
pushl %esi
pushl %ebx
subl $20, %esp
movl 20(%ebp), %eax
movb %al, -28(%ebp)
movl $0, -12(%ebp)
jmp .L2
.L3:
movl 16(%ebp), %eax
addl $3, %eax
imull -12(%ebp), %eax
addl 12(%ebp), %eax
movzbl -28(%ebp), %edx
movb %dl, (%eax)
movl 16(%ebp), %eax
addl $3, %eax
imull -12(%ebp), %eax
addl $1, %eax
movl %eax, %ebx
addl 12(%ebp), %ebx
movl -12(%ebp), %ecx
movl $-2139062143, %edx
movl %ecx, %eax
imull %edx
leal (%edx,%ecx), %eax
movl %eax, %edx
sarl $7, %edx
movl %ecx, %eax
sarl $31, %eax
movl %edx, %ecx
subl %eax, %ecx
movl %ecx, %eax
movb %al, (%ebx)
movl 16(%ebp), %eax
addl $3, %eax
imull -12(%ebp), %eax
addl $2, %eax
movl %eax, %ebx
addl 12(%ebp), %ebx
movl -12(%ebp), %ecx
movl $-2139062143, %edx
movl %ecx, %eax
imull %edx
leal (%edx,%ecx), %eax
movl %eax, %edx
sarl $7, %edx
movl %ecx, %eax
sarl $31, %eax
movl %edx, %esi
subl %eax, %esi
movl %esi, %eax
movl %eax, %edx
sall $8, %edx
subl %eax, %edx
movl %ecx, %eax
subl %edx, %eax
movb %al, (%ebx)
addl $1, -12(%ebp)
.L2:
movl -12(%ebp), %eax
cmpl 8(%ebp), %eax
jl .L3
addl $20, %esp
popl %ebx
popl %esi
popl %ebp
ret
But turn on optimisation, and most of that vanishes.
Code:
$ gcc -S -c -O2 baz.c
$ more baz.s
.file "baz.c"
.text
.p2align 4,,15
.globl foo
.type foo, @function
foo:
pushl %ebp
movl %esp, %ebp
pushl %edi
pushl %esi
pushl %ebx
subl $4, %esp
movl 8(%ebp), %edi
movzbl 20(%ebp), %esi
testl %edi, %edi
jle .L4
movl 16(%ebp), %eax
xorl %ecx, %ecx
movl 12(%ebp), %ebx
addl $3, %eax
movl %eax, -16(%ebp)
.p2align 4,,7
.p2align 3
.L3:
movl %esi, %eax
movb %al, (%ebx)
movl $-2139062143, %eax
imull %ecx
movl %ecx, %eax
sarl $31, %eax
addl %ecx, %edx
sarl $7, %edx
subl %eax, %edx
movl %edx, %eax
sall $8, %eax
subl %edx, %eax
movb %dl, 1(%ebx)
movl %ecx, %edx
addl $1, %ecx
subl %eax, %edx
movb %dl, 2(%ebx)
addl -16(%ebp), %ebx
cmpl %edi, %ecx
jne .L3
.L4:
addl $4, %esp
popl %ebx
popl %esi
popl %edi
popl %ebp
ret
That's a loop of 54 instructions reduced to a loop of 20 instructions.
And only one 'mul' instruction (instead of 5), and it's no longer the subscript calculation: the j*(SymSize+3) multiply has been strength-reduced to a pointer that simply advances by SymSize+3 each time round the loop (the addl -16(%ebp), %ebx near the bottom), while the remaining imull is the magic-constant reciprocal the compiler uses to compute j/255, whose result is then reused for j%255.
The +1 and +2 offsets come for free by using the built-in offset feature of certain x86 addressing modes (the 1(%ebx) and 2(%ebx) operands).
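Translated back into C, the optimised loop is roughly equivalent to this hand sketch (my reconstruction of what the compiler did, not literal decompiled output):
Code:
void foo(int num_c_symbols, unsigned char *LT_c_symbols, int SymSize, unsigned char block_Id)
{
    unsigned char *p = LT_c_symbols;
    int step = SymSize + 3;       /* common sub-expression hoisted out of the loop */
    int j;
    for (j = 0; j < num_c_symbols; j++)
    {
        p[0] = block_Id;
        p[1] = j / 255;           /* one reciprocal multiply serves both the...    */
        p[2] = j % 255;           /* ...division and the remainder                 */
        p += step;                /* subscript multiply strength-reduced to this   */
    }
}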
Unless you can come up with a genuinely better algorithm (one without nested loops, for example, which is what is really hurting you here), chances are that any "hand tweaking" of the code will have little impact. If you're too clever, you'll just make the compiler's optimiser more conservative and end up with worse code as a result.
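For what it's worth, the one source-level change to this particular loop that counts as an algorithmic change rather than a tweak would be to stop recomputing j/255 and j%255 every iteration and carry them as running counters instead. This is just a sketch (and worth measuring before trusting), but it removes the division from the loop entirely:
Code:
void foo(int num_c_symbols, unsigned char *LT_c_symbols, int SymSize, unsigned char block_Id)
{
    unsigned char *p = LT_c_symbols;
    unsigned char hi = 0, lo = 0;   /* hi tracks j/255, lo tracks j%255 */
    int j;
    for (j = 0; j < num_c_symbols; j++)
    {
        p[0] = block_Id;
        p[1] = hi;
        p[2] = lo;
        p += SymSize + 3;
        if (++lo == 255)            /* wrap instead of dividing */
        {
            lo = 0;
            hi++;
        }
    }
}
Whether that actually beats the reciprocal multiply the compiler already generates is exactly the sort of thing you have to profile rather than guess.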