Thread: What Do You think About This?

  1. #1
    Registered User
    Join Date
    Apr 2008
    Posts
    4

    What Do You think About This?

    Proggie Takes a file input
    and calculates number of
    byte values that are same...

    Code is messy and unpolished
    but main idea is the calculation
    algo that is fast.

    Example 5mb file takes 15 seconds.

    Im interested what you guys think and what
    can be done better!
    Thanks alot!

    Code:
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    
    #define CHAR 256
    
    /*
    This version uses malloc() for dinamic array location
    Uses improved same byte suming algorithm
    Uses impreved weight byte sorting algoritm
    Argc and argv not implemented yet
    */
    
    int data_harvest();
    void repetition_sorting(int data_counter);
    void weight_sorting(int data_counter);
    void missing_bytes_sorting();
    
    struct data
    {
     int offset;
     char value;
     int repetitions;
    };
    
    struct data *byte_table;
    struct data word_table[CHAR];
    struct data weight_table[CHAR];
    
    main()
    {
        int data_counter,loop_counter;
        int time1,time2;
    
         data_counter=data_harvest();
     
    //    printf("MEASURING_WEIGHT_SORTING!\n");
    //    getchar();
    /*
        for(loop_counter=0; loop_counter<data_counter; loop_counter++)
          {
           printf("BYTE_TABLE[%d][%d][%x]\n",loop_counter,byte_table[loop_counter].offset,byte_table[loop_counter].value);
           getchar();
          }
    getchar();
    */
        time1=clock();
        repetition_sorting(data_counter);
        time2=clock();
    
        printf("REPETITION_SORTING() ## %d ms\n",time2-time1);
    
        time1=clock();
        weight_sorting(data_counter);
        time2=clock();
    
        printf("WEIGHT_SORTING() ## %d ms\n",time2-time1);
        getchar();
    
         printf("WEIGHT_BYTE_TABLE\n");
    
         for(loop_counter=0; loop_counter<=CHAR; loop_counter++)
          {
           
           printf("[%c] [0x%d] X%d\n",weight_table[loop_counter].value,weight_table[loop_counter].offset,weight_table[loop_counter].repetitions);
    //       getchar();
    
          }
    /*
         for(loop_counter=0; loop_counter<=data_counter; loop_counter++)
          {
           printf("[%d] %2x # %d \n",weight_table[loop_counter].repetitions,weight_table[loop_counter].value,weight_table[loop_counter].offset);
    //       getchar();
          }
    */
    
          puts("MISSING_BYTES");
    /*
          for(loop_counter=0; loop_counter<=256; loop_counter++)
          {
           printf("[%d][0x%x][%d]\n",loop_counter,word_table[loop_counter].value,word_table[loop_counter].offset);
    //       getchar();
    
          }
    */
    
    }
    
    int data_harvest()
    {
      int data_num,loop_counter=0,filesize_num;
      int data_counter=0,latest_word_offset=0;
      
      FILE *fp_r; fp_r=fopen("file.exe","rb");
      
    //Ucitavanje bajtova fajla u array (memoriju)
    
          for(loop_counter=0; loop_counter<CHAR; loop_counter++)
          {
           word_table[loop_counter].value=(char)loop_counter;
          }
    
          for(loop_counter=0; loop_counter<CHAR; loop_counter++)
          {
    //       printf("WORD_TABLE[%d][%x]\n",loop_counter,word_table[loop_counter].value=loop_counter);
    //       getchar();
          }
    
          while (filesize_num != EOF)
          {
            filesize_num=fgetc(fp_r);
            loop_counter++;
          }
          rewind(fp_r);
          
    //      printf("LOOP_COUNTER # %d\n",loop_counter);
    //      getchar();
    
          byte_table=malloc(loop_counter*(sizeof(struct data)));
       
          while (!feof(fp_r))
          {
    
            fread(&byte_table[data_counter].value,sizeof(char),1,fp_r);
            byte_table[data_counter].offset=data_counter;
    //        printf("BYTE_TABLE[%d][%d]\n",data_counter,byte_table[data_counter].offset);
    //        getchar();
            
    //        byte_table[data_counter].value=data_num;
    //        printf("BYTE_TABLE[%d][%c]\n",data_counter,byte_table[data_counter].value);
    //        getchar();
    
            for(loop_counter=0; loop_counter<CHAR; loop_counter++)
            {
             if((byte_table[data_counter].value == word_table[loop_counter].value)&&(word_table[loop_counter].offset==0))
              word_table[loop_counter].offset=data_counter;
            }
    
    //        printf("BYTE_TABLE[%d][%d][%x]\n",data_counter,byte_table[data_counter].offset,byte_table[data_counter].value);
    
            data_counter++;
            
          }
          fclose(fp_r);
          return(data_counter);
    }
    
    
    
    /*
    Sorting File Repetition Bytes
    Calculating number of same byte in file
    */
    
    
    void repetition_sorting(int data_counter)
    {
         int loop_counter,real_bit_counter=0,repetition_counter=0,break_flag=0;
         int cmp_byte,tmp_byte,repetition_byte,repetition_switch=0;
         int weight_table_bit,*p_cmp_bit;
    
         for(real_bit_counter=0; real_bit_counter<CHAR; real_bit_counter++)
         {
           cmp_byte=byte_table[word_table[real_bit_counter].offset].value;
    //       printf("CMP_BYTE # %x\n",byte_table[word_table[real_bit_counter].offset].value);
    //       getchar();
    
    
           for(loop_counter=0; loop_counter<=data_counter; loop_counter++)
          {
           tmp_byte=byte_table[loop_counter].value;
    
    
           if(cmp_byte == tmp_byte)
           {
            byte_table[word_table[real_bit_counter].offset].repetitions++;
           }
    
          }
    
    //      printf("BYTE_TABLE[%d][%x][%d]\n",word_table[real_bit_counter].offset,byte_table[word_table[real_bit_counter].offset].value,byte_table[word_table[real_bit_counter].offset].repetitions);
    //      getchar();
         }
    
    
         for(real_bit_counter=0; real_bit_counter<= data_counter; real_bit_counter++)
         {
          cmp_byte=byte_table[real_bit_counter].value;
          repetition_switch=0;
    
           for(repetition_counter=0; repetition_counter<CHAR; repetition_counter++)
           {
    
            repetition_byte=word_table[repetition_counter].value;
    //        printf("REPETITION_BYTE %x \n",repetition_byte);
    //        getchar();
    
    //      printf("WEIGHT_OFFSET_TABLE[%d][%c] \n",loop_counter,weight_offset_table
    
            if((cmp_byte == repetition_byte)&&(real_bit_counter>0))
              {
               byte_table[real_bit_counter].repetitions=byte_table[word_table[repetition_counter].offset].repetitions;
    //            printf("BYTE_TABLE[%d][%x][%d]\n",real_bit_counter,byte_table[real_bit_counter].value,byte_table[real_bit_counter].repetitions);
               repetition_switch=1;
    //            getchar();
               }
           }
    
         if(repetition_switch==0)
         {
          for(loop_counter=0; loop_counter<=data_counter; loop_counter++)
          {
           tmp_byte=byte_table[loop_counter].value;
    
    //     printf("CMP_BIT [%c] == REPETITION_BIT [%c]\n",cmp_bit,repetition_bit);
    //     printf("WEIGHT_OFFSET_TABLE[%d][%c][%d]\n",real_bit_counter,cmp_bit,weight_offset_table[repetition_counter][0][0]);
    
           if((cmp_byte == tmp_byte)&&(repetition_switch==0))
           {
            byte_table[real_bit_counter].repetitions++;
            repetition_switch=0;
    //         printf("WEIGHT_BIT_TABLE[%d][%x]# %d \n",real_bit_counter,weight_offset_table[loop_counter][0],weight_offset_table[loop_counter][0][0]);
    //printf("G_DATA_BIT_TABLE[%x] \n",g_data_bit_table[real_bit_counter][0]);
           }
    
          }
         }
        }
    
    }
    
    /*
    Sorting Bytes According To
    Number of repetitions in file
    */
    
    void weight_sorting(int data_counter)
    {
     int loop_counter,weight_counter,loop_weight_pointer=0;
     int tmp_byte,max_value_byte=0,prev_max_byte,repetition_counter;
     int repetition_switch=0;
     
     for(weight_counter=0; weight_counter<256; weight_counter++)
     {
    
      for(loop_counter=0; loop_counter<data_counter; loop_counter++)
      {
       tmp_byte=byte_table[loop_counter].repetitions;
    /*
       for(repetition_counter=weight_counter; repetition_counter>0; repetition_counter--)
       {
        if(tmp_byte >=weight_table[repetition_counter].value)
           loop_counter++;
        }
    */
    //   printf("TMP_BYTE # %d\n",tmp_byte);
    //   printf("BYTE_TABLE[%d].repetitions # %d\n",loop_counter,byte_table[loop_counter].repetitions);
    
           if((weight_counter==0)&&(tmp_byte>max_value_byte))
           {
            max_value_byte=tmp_byte;
            loop_weight_pointer=loop_counter;
           }
    
           if((tmp_byte>max_value_byte)&&(tmp_byte<prev_max_byte))
           {
            max_value_byte=tmp_byte;
            loop_weight_pointer=loop_counter;
           }
    
    }
    
    //  printf("MAX_VALUE_BYTE # %d\n",max_value_byte);
    //  printf("LOOP_WEIGHT_POINTER # %d\n\n",loop_weight_pointer);
    //  getchar();
      weight_table[weight_counter].repetitions=byte_table[loop_weight_pointer].repetitions;
      weight_table[weight_counter].value=byte_table[loop_weight_pointer].value;
      weight_table[weight_counter].offset=byte_table[loop_weight_pointer].offset;
      
    //  printf("[%d] %2x # %d \n",weight_table[weight_counter].repetitions,weight_table[weight_counter].value,weight_table[weight_counter].offset);
    //  getchar();
      prev_max_byte=max_value_byte;
      max_value_byte=0;
    }
    //getchar();
    }
    
    void missing_bytes_sorting()
    {
     }
    Last edited by sphaaz; 04-01-2009 at 11:45 PM.

  2. #2
    ATH0 quzah's Avatar
    Join Date
    Oct 2001
    Posts
    14,826
    Never add line numbers to a program you want people to help with. That aside, we can do your program in about fifteen lines:
    Code:
    #include<stdio.h>
    int main( void )
    {
        unsigned char bytes[256] = {0};
        FILE *fp = fopen( "myfile.txt", "rb" );
        if( fp )
        {
            int x;
            while( (x = fgetc( fp )) != EOF )
                bytes[x]++;
            fclose( fp );
        }
        /* do something for output */
        return 0;
    }
    You might get some performance improvement by doing an fread, reading a large block at a time, and running through that buffer instead of single calls to the file. But for pure simplicity, the above code beats everything else hands down.


    Quzah.
    Hope is the first step on the road to disappointment.

  3. #3
    spurious conceit MK27's Avatar
    Join Date
    Jul 2008
    Location
    segmentation fault
    Posts
    8,300
    Yah, I couldn't be bothered reading the code here because of the absurd formatting with the line numbers.

    Quote Originally Posted by quzah View Post
    You might get some performance improvement by doing an fread, reading a large block at a time
    I did read somewhere (maybe from Adak? probably not or I would have asked then) that there is an "ideal block size" to use with fread for this purpose (optimization) depending on the system.

    However, the POSIX standard reads:
    Quote Originally Posted by The Open Group
    For each object, "size" calls shall be made to the fgetc() function...
    which says to me that the best/fastest way is just to use fgetc yourself, as in quzah's (clever) post. (??)
    Last edited by MK27; 04-01-2009 at 03:04 PM.
    C programming resources:
    GNU C Function and Macro Index -- glibc reference manual
    The C Book -- nice online learner guide
    Current ISO draft standard
    CCAN -- new CPAN like open source library repository
    3 (different) GNU debugger tutorials: #1 -- #2 -- #3
    cpwiki -- our wiki on sourceforge

  4. #4
    Algorithm Dissector iMalc's Avatar
    Join Date
    Dec 2005
    Location
    New Zealand
    Posts
    6,318
    Having to use the scroll wheel any more than just a little bit will put most people off bothering to read the code (including myself in this case). Remember that for next time.
    My homepage
    Advice: Take only as directed - If symptoms persist, please see your debugger

    Linus Torvalds: "But it clearly is the only right way. The fact that everybody else does it some other way only means that they are wrong"

  5. #5
    Registered User
    Join Date
    Sep 2006
    Posts
    8,868
    It's a "piano"!!

    You'll win the Rube Goldberg prize for the week, I'm sure!

  6. #6
    Registered User
    Join Date
    Apr 2008
    Posts
    4
    hehe
    as i said code is messy its not
    polished yet,its a working print...

    point is it takes char by char because it makes
    a table of frequent chars when it fills 0-255 value table
    with values and offsts in file it then calculates them first
    all other instances of same value is just copied
    from previous calculation,otherwise it wolud have to loop the whole file for every char

    I suppose rube goldberg prize is not that good...
    btw what does it mean its a "piano"?
    Also not good probably

    Point of this project is that it takev different size
    variables and calculates weight of repetitions in
    files as fast as it can...

    I will implement integere sizes long sizes and double
    whole point is in ne approach to data packing and of course learning
    int the process

    changed, the post no more stupid number lines ahahah

  7. #7
    Registered User
    Join Date
    Sep 2006
    Posts
    8,868
    Instead of a simple doghouse that you needed, you have a "piano". It has a contrived shape, dozens of keys - black, white, sharps, flats, and hundreds of levers and wires, and standoff's and felt pads and hammers - even foot pedals.

    I'd advise you to look again at the short and concise program you were shown. What you want to do here is a trivial job. Re-think it and get on with it. Your dog still wants that doghouse.

  8. #8
    Registered User
    Join Date
    Apr 2008
    Posts
    4
    I agree there was a lot robust and simpler way to
    do what i did... But i was intterested into performance gains
    thats why its messy... first variant was simple get chars
    count each char through file and make table according to the "heaviest" char...
    it was smaller more neat and easier to read... but
    it was slow as hell... so i began tweaking and thats why code looks like this...

    i probably need to rewrite like this...

    get_chars()
    ....
    fill char_value_table()
    (0-255 counting each char later just copy those results onto the table places for each char)
    compare and make "heaviest" list print it out!

    thats what prog does its just messy

    i was wondering what you think about performance tweaks, not if code is pretty or not
    because its not heheh ima a pig i know!

  9. #9
    Registered User slingerland3g's Avatar
    Join Date
    Jan 2008
    Location
    Seattle
    Posts
    603
    Quote Originally Posted by Adak View Post
    It's a "piano"!!

    You'll win the Rube Goldberg prize for the week, I'm sure!

    Lol, a somewhat cryptic way to say your code, the OP, is too complicated for what it is to accomplish.

  10. #10
    Registered User
    Join Date
    Apr 2008
    Posts
    4
    okay for example i use simple code for input like
    the code above, than i need to cound each byte
    in a 500 kb file its 500.000*500.000 repetitions right?
    or is there a better algo for that?
    thats my main concern!
    Its Sloooooooooow!

    It has to be extremly fast if i make together
    1 byte 2 byte 4 byte and 8 byte counter and comparer
    it cant take more than 20-30 seconds on a 1mb file foe analisis
    it will display number of same byte offsets so you can see the
    data that can be packed and has a lot of uses and implementation

    sorry for my bad english haha
    and if someone can help with that i would be grateful
    Can someone help with that?
    Last edited by sphaaz; 04-02-2009 at 01:40 PM.

Popular pages Recent additions subscribe to a feed