<regex.h> help needed! C slower than Java

This is a discussion on <regex.h> help needed! C slower than Java within the C Programming forums, part of the General Programming Boards category; Hello! For some reason, this C program using REGEX is slower than an equivalent Java program which is also using ...

  1. #1
    Registered User
    Join Date
    Feb 2005
    Posts
    12

    <regex.h> help needed! C slower than Java

    Hello!

    For some reason, this C program using REGEX is slower than an equivalent Java program which is also using REGEX!

    I'm only compiling the PATTERN once and storing it in an array and then reusing that array PATTERN, but the run time is almost 4 times slower than Java and 10 times slower than when i create my own pattern matcher!

    Can anyone see any glaring inefficiencies in the way i'm handling the REGEX package please?

    Thanks very much from BobK!

    Also, at the very bottom i've included my own pattern matching routine, very simple prototype for now, and this runs in 3 minutes for 75 million records, versus the 30 minutes for the program below!

    Code:
    #include <stdio.h>
    #include <stdlib.h>
    #include <regex.h> 
    #include <strings.h> 
    #include <errno.h>
    #include <time.h>
    #define SIZE 256
    
    /*
     * Program entry point.  Prints a timestamp, parses the -f (input file) and
     * -p options, loads three category tables from HSTATUS.in, HFATYPCD.in and
     * DQIND.in, compiles one regex per 'p'-type column, then checks every
     * '|'-delimited field of each input line and prints a second timestamp.
     * NOTE(review): buffer is never used, and pattern is assigned from -p but
     * never read afterwards in this function.
     */
    int main(int argc, char **argv) {
    
    char buffer[SIZE]; time_t curtime; struct tm *loctime;curtime=time(NULL);loctime=localtime(&curtime);fputs (asctime (loctime), stdout);
    int c; char *filename=NULL;extern char *optarg; extern int optind, opterr, optopt; int err_no;char *pattern;
    if(argc==1) { return(EXIT_SUCCESS); }
    
    /* Option parsing: -f <input file>, -p <pattern>; any other option ends the program. */
    while((c=getopt(argc, argv, "f:p:"))!= EOF) { switch(c) { 
    case 'f': filename=optarg; break; case 'p': pattern=optarg; break;
    case '?': fprintf (stderr, "Unknown option `-%c'.\n", optopt); return EXIT_FAILURE; default: return EXIT_SUCCESS; } }
    
    /* Load HSTATUS.in into aHS, one table row per input line.
       NOTE(review): the fopen() result is not checked, and a1/a2 are not
       bounded by the 30x20 array. */
    int c2;int a1=0;int a2=0;char aHS[30][20]={"\0"};FILE *HS; HS=fopen("HSTATUS.in","r");
    while((c2=fgetc(HS))!=EOF){if (c2=='\n'){;a1++;a2=0;}else{aHS[a1][a2]=c2;a2++;}}fclose(HS);
    char (*pHS)[20]=aHS; 
    
    /* Same loading loop for HFATYPCD.in into aHF. */
        c2=0;  a1=0;    a2=0;char aHF[30][20]={"\0"};FILE *HF; HF=fopen("HFATYPCD.in","r");
    while((c2=fgetc(HF))!=EOF){if (c2=='\n'){;a1++;a2=0;}else{aHF[a1][a2]=c2;a2++;}}fclose(HF);
    char (*pHF)[20]=aHF;
    
    /* Same loading loop for DQIND.in into aDQ. */
        c2=0;  a1=0;    a2=0;char aDQ[30][20]={"\0"};FILE *DQ; DQ=fopen("DQIND.in","r");
    while((c2=fgetc(DQ))!=EOF){if (c2=='\n'){;a1++;a2=0;}else{aDQ[a1][a2]=c2;a2++;}}fclose(DQ);
    char(*pDQ)[20]=aDQ;
    
    int NUM_COLS=10;
    
    /* Per-column rules.  type 'p' = regex match then length check, 'c' =
       category lookup then length check, 'b' = length check only; l and m are
       passed to chklen() as its min and max arguments. */
    struct parms_grp { int type; char *nam; char *pat; int l; int m; } par[10];
    par[0].type='p';par[0].nam="ACT_DTE" ;par[0].pat="([0-9]|(/|)){1,14}"; par[0].l=0;par[0].m=14; 
    par[1].type='b';par[1].nam="FNMA_LN" ;par[1].pat=" ";                  par[1].l=1;par[1].m=10; 
    par[2].type='c';par[2].nam="DQIND"   ;par[2].pat="cat";                par[2].l=0;par[2].m=0;
    par[3].type='c';par[3].nam="HFATYPCD";par[3].pat="cat";                par[3].l=0;par[3].m=0;
    par[4].type='c';par[4].nam="HSTATUS" ;par[4].pat="cat";                par[4].l=0;par[4].m=0;
    par[5].type='p';par[5].nam="LPI_DTE" ;par[5].pat="([0-9]|(/|)){1,14}"; par[5].l=0;par[5].m=14;
    par[6].type='p';par[6].nam="ACT_UPB" ;par[6].pat="([0-9]|(/|)){1,09}"; par[6].l=1;par[6].m=9;
    par[7].type='p';par[7].nam="HFR_UPB" ;par[7].pat="([0-9]|(/|)){1,10}"; par[7].l=0;par[7].m=10;
    par[8].type='p';par[8].nam="REMLIFE" ;par[8].pat="([0-9]|(/|)){1,04}"; par[8].l=0;par[8].m=4;
    par[9].type='b';par[9].nam="HPOOL_NO";par[9].pat=" ";                  par[9].l=0;par[9].m=6;
    
    regex_t *regx[10];
    
    /* Compile each 'p' column's pattern once, before any input is read. */
    int i;for(i=0;i<NUM_COLS;i++) {
       if  (par[i].type=='p') {  
           regx[i]=(regex_t *) malloc(sizeof(regex_t)); memset(regx[i],0,sizeof(regex_t)); 
           if((err_no=do_regex(regx[i], par[i].pat))!=EXIT_SUCCESS) { return EXIT_FAILURE; } 
       }
    }
    FILE *FH=NULL;if((FH=fopen(filename, "r"))==NULL){fprintf(stderr,"x open file %s;%s\n",filename,strerror(errno));exit(EXIT_FAILURE);}
    char line[1024]; int line_no=1;char *cp; 
    
    /* Main pass: one iteration per input line, then one per column.
       The size_t i declared here shadows the int i declared above. */
    while((cp=fgets(line, 1023, FH))!=NULL) {
    size_t i; char field [10][20]; char *ptr = line; int n = 0;
     for (i=0; i<sizeof field/sizeof *field; ++i )
     {
       /* %19[^|] copies up to 19 characters that are not '|', %n stores how many
          characters were consumed so far, %*c reads and discards one more
          character (the delimiter). */
       if ( sscanf(ptr, "%19[^|]%n%*c", field[i], &n) == 1 )
       {
         if (par[i].type=='p') 
         { 
            /* The <<<<<<<<<<<< on the next line is the poster's marker for the
               call in question; it is not C source. */
            if (regexec(regx[i],field[i],0,NULL,0)==0) <<<<<<<<<<<<
            {
                chklen(par[i].l,par[i].m,field[i]);
            } 
            else { errp(i,field[i]); }
         }
         else 
         if (par[i].type=='c') 
         {   
            /* NOTE(review): == compares the pointer stored in nam with the
               address of each literal, not the string contents. */
            if (par[i].nam=="HSTATUS")  {chkcat(pHS,field[i]);}else
            if (par[i].nam=="HFATYPCD") {chkcat(pHF,field[i]);}else
            if (par[i].nam=="DQIND")    {chkcat(pDQ,field[i]);} 
            chklen (par[i].l,par[i].m,field[i]);
         }    
         else
         if (par[i].type=='b') 
         {
            chklen(par[i].l,par[i].m,field[i]);
         }
         else { /*printf("ERROR1: type not found");*/ }
         
         /* NOTE(review): line_no is incremented once per field, not once per line. */
         line_no++; ptr += n + 1;
       } 
       else 
       { 
         /* Nothing was converted (empty field): record an empty string and step over one character. */
         field[i][0] = '\0'; ++ptr; 
       } 
     }
    }
    
    /* Release the compiled patterns.
       NOTE(review): the test indexes par[] with i rather than the loop variable
       i2; i was left equal to NUM_COLS by the compile loop above, so this reads
       par[10], one element past the end of the array. */
    int i2;for(i2=0;i2<NUM_COLS;i2++)
    {
      if (par[i].type=='p') { regfree(regx[i2]); free(regx[i2]); }
    }
    
    curtime=time(NULL);loctime=localtime(&curtime);fputs (asctime (loctime), stdout);
    fclose(FH);return EXIT_SUCCESS;
    
    } /* end of MAIN */
    
    /*
     * Look up a field in a category table.
     *
     * ar  - table of 30 rows, each a NUL-terminated string of at most 19
     *       characters
     * fld - NUL-terminated field text to look for
     *
     * Returns 1 if some row equals fld exactly, 0 otherwise.  An empty fld
     * matches any empty row.  Earlier versions declared no return type and
     * returned nothing; callers that ignore the result are unaffected.
     */
    int chkcat(char (*ar)[20],char *fld)
    {
      int row;

      for (row=0;row<30;row++) {
        /* Stop at the first hit: further rows cannot change the answer. */
        if (strcmp(fld,ar[row])==0) { return 1; }
      }
      return 0;
    }
    /*
     * Compile the POSIX extended regular expression p into r.
     *
     * r - caller-owned regex_t to fill in; on success the caller must
     *     eventually release it with regfree()
     * p - NUL-terminated pattern text
     *
     * Returns EXIT_SUCCESS when the pattern compiled.  On failure the
     * regerror() message is written to stderr and EXIT_FAILURE is returned;
     * r must not be used or passed to regfree() in that case.
     */
    int do_regex(regex_t *r, char *p)
    {
      int err_no=regcomp(r, p, REG_EXTENDED);

      if (err_no!=0) {
          /* A NULL buffer with size 0 asks regerror() how many bytes the message needs. */
          size_t length=regerror(err_no, r, NULL, 0);
          char *buffer=malloc(length);

          if (buffer!=NULL) {
              regerror(err_no, r, buffer, length);
              fprintf(stderr, "%s\n", buffer);
          } else {
              /* Out of memory for the message text: still report the failure. */
              fprintf(stderr, "regcomp failed (error %d)\n", err_no);
          }
          free(buffer);
          /* No regfree() here: the content of *r is undefined after a failed regcomp(). */
          return EXIT_FAILURE;
      }
      return EXIT_SUCCESS;
    }
    /*
     * Check the length of s against inclusive bounds.
     *
     *   min > 0  and max > 0 : accepted when min <= strlen(s) <= max
     *   min == 0 and max > 0 : accepted when strlen(s) <= max
     *   anything else        : no bounds apply, always accepted
     *
     * Returns 1 when the length is accepted, 0 otherwise.  Earlier versions
     * declared no return type and returned nothing; callers that ignore the
     * result are unaffected.
     */
    int chklen(int min,int max,char *s)
    {
      size_t len;

      if (max<=0 || min<0) { return 1; }   /* no bounds configured */

      len=strlen(s);                       /* measured once, not once per comparison */
      /* The casts are safe: min and max are known to be non-negative here. */
      if (min>0 && len<(size_t)min) { return 0; }
      return len<=(size_t)max;
    }
    /*
     * Hook called for a field that did not match its column's pattern.
     *
     * i - column index of the field
     * s - the field text
     *
     * Reporting is switched off (the printf is commented out), so this does
     * nothing and always returns 0.
     */
    int errp(int i,char *s)
    {
     (void)i; (void)s;   /* unused while reporting is disabled */
     /*printf("ERROR5 with PATTERN: %d %s",i,s);*/
     return 0;
    }

  2. #2
    Gawking at stupidity
    Join Date
    Jul 2004
    Location
    Oregon, USA
    Posts
    3,163
    Aside from that being some of the worst formatted code I've ever seen, let's take a look at this function:
    Code:
    chklen(int min,int max,char *s)
    {
      if (min>0 && max>0) 
      {
          if ((strlen(s)==min|strlen(s)>min) && (strlen(s)<max|strlen(s)==max)) {}
          else {/*printf("ERROR3 MinMax: %s %d",s,strlen(s));*/}
      }
      else
      if (min==0 && max>0) 
      {
          if ( (strlen(s)<max | strlen(s)==max) || strlen(s)==0) {}
          else {/*printf("ERROR4 MinMax: %s %d",s,strlen(s));*/}
      }
    }
    This could be made a whole lot faster by defining a size_t len = strlen(s); at the beginning of the function. You're also using | instead of || in your if() statements. I didn't even really bother reading the rest of the code since it hurt my eyes.

    EDIT: Looking back over that function, it's horrible. Change:
    Code:
    if ((strlen(s)==min|strlen(s)>min) && (strlen(s)<max|strlen(s)==max))
    To:
    Code:
    {
      size_t len = strlen(s);
    
      if(len >= min && len <= max) // It's in bounds
    }
    ...if that's what you're trying to do. It's hard to tell.
    Last edited by itsme86; 02-04-2005 at 10:07 AM.
    If you understand what you're doing, you're not learning anything.

  3. #3
    Registered User
    Join Date
    Feb 2005
    Posts
    12
    Thanks for reviewing the code!

    I made those changes you mentioned:

    Code:
    /*
     * Check a precomputed field length against inclusive bounds.
     *
     *   min > 0  and max > 0 : accepted when min <= len <= max
     *   min == 0 and max > 0 : accepted when len <= max
     *   anything else        : no bounds apply, always accepted
     *
     * Returns 1 when the length is accepted, 0 otherwise.  The commented-out
     * printf calls were dropped because they referred to s, which is no
     * longer a parameter of this version.
     */
    int chklen(int min,int max,int len)
    {
      if (min>0 && max>0)  { return len>=min && len<=max; }
      if (min==0 && max>0) { return len<=max; }
      return 1;   /* no bounds configured for this column */
    }
    and it's still running about 30 minutes for 75 million records, here are the time stamps:

    Fri Feb 4 12:49:12 2005
    Fri Feb 4 13:19:48 2005

  4. #4
    Code Goddess Prelude's Avatar
    Join Date
    Sep 2001
    Posts
    9,796
    Processing regular expressions on 75 million records, not to mention file input and extensive string handling...30 minutes isn't really unreasonable with how you're going about it.
    My best code is written with the delete key.

  5. #5
    Registered User
    Join Date
    Feb 2005
    Posts
    12
    Well when i pull out that REGEX package usage and replace it with one of the typical types of pattern checks that i'll be doing, it goes from 30mins to 3mins!

    See the <<<<<<<<<, code below

    Here is what i ran:
    Code:
    /* Replacement for the read loop of main: the regexec() call for 'p'
       columns is swapped for a hand-written per-character check. */
    while((cp=fgets(line, 1023, FH))!=NULL) {
    size_t i; char field [10][20]; char *ptr = line; int n = 0;
     for (i=0; i<sizeof field/sizeof *field; ++i )
     {
       if ( sscanf(ptr, "%19[^|]%n%*c", field[i], &n) == 1 )
       {
         if (par[i].type=='p')
         {
             int sl=strlen(field[i]);
             int f;for(f=0;f<sl;f++)
     
    <<<<<<<<<below this
            {
               /* NOTE(review): the comparisons use the integers 0 and 9, not the
                  characters '0' and '9'.  A digit such as '5' (53 in ASCII) is
                  not <= 9, so it takes the else branch, whose printf is
                  commented out. */
               if ( (field[i][f]==0 || field[i][f]>0 ) && (field[i][f]<9 || field[i][f]==9 )) {} else { /*printf("error [0-9]" );*/ }
               /* NOTE(review): this length test does not depend on f, so it is
                  repeated for every character; its first half tests
                  sl>par[i].m where sl>par[i].l looks intended - confirm. */
               if ( (sl==par[i].l   || sl>par[i].m   ) && (sl<par[i].m   || sl==par[i].m   )) {} else { /*printf("error minmax");*/ }
             }
         }
         else
         if (par[i].type=='c')
         {
            /* Category table chosen by column index instead of by name.
               NOTE(review): in the table from the first post column 2 is DQIND
               and column 4 is HSTATUS, so pHS and pDQ look swapped - confirm. */
            if (i==2)  {chkcat(pHS,field[i]);}else
            if (i==3)  {chkcat(pHF,field[i]);}else
            if (i==4)  {chkcat(pDQ,field[i]);}
            chklen (par[i].l,par[i].m,field[i]);
         }
         else
         if (par[i].type=='b')
         {
            chklen(par[i].l,par[i].m,field[i]);
         }
         else { /*printf("ERROR1: type not found");*/ }
    
         line_no++; ptr += n + 1;
       }
       else
       {
         /* Nothing was converted (empty field): record an empty string and step over one character. */
         field[i][0] = '\0'; ++ptr;
       }
     }
    }
    So there must be something i'm doing wrong in using the REGEX, but i've checked quite a bit and that seems to be the standard usage of the regexec() method!

    Thanks for helping!

    bk

  6. #6
    and the hat of wrongness Salem's Avatar
    Join Date
    Aug 2001
    Location
    The edge of the known universe
    Posts
    32,485
    1. That is simply horrible code

    Your code, care of
    indent --no-tabs --indent-level4 --k-and-r-style hello.c
    Code:
    #include <stdio.h>
    #include <stdlib.h>
    #include <regex.h>
    #include <strings.h>
    #include <errno.h>
    #include <time.h>
    #define SIZE 256
    
    int main(int argc, char **argv)
    {
        /* Validates every '|'-delimited field of the -f input file; prints a timestamp before and after. */
        char buffer[SIZE];
        time_t curtime;
        struct tm *loctime;
        curtime = time(NULL);
        loctime = localtime(&curtime);
        fputs(asctime(loctime), stdout);    /* start-of-run timestamp */
        int c;
        char *filename = NULL;
        extern char *optarg;
        extern int optind, opterr, optopt;
        int err_no;
        char *pattern;                      /* set by -p; not read again in this function */
        if (argc == 1) {
            return (EXIT_SUCCESS);
        }
        /* Option parsing: -f <input file>, -p <pattern>; any other option ends the program. */
        while ((c = getopt(argc, argv, "f:p:")) != EOF) {
            switch (c) {
            case 'f':
                filename = optarg;
                break;
            case 'p':
                pattern = optarg;
                break;
            case '?':
                fprintf(stderr, "Unknown option `-%c'.\n", optopt);
                return EXIT_FAILURE;
            default:
                return EXIT_SUCCESS;
            }
        }
        /* Load HSTATUS.in into aHS, one table row per input line. */
        int c2;
        int a1 = 0;
        int a2 = 0;
        char aHS[30][20] = { "\0" };
        FILE *HS;
        HS = fopen("HSTATUS.in", "r");      /* NOTE(review): result not checked */
        while ((c2 = fgetc(HS)) != EOF) {
            if (c2 == '\n') {;
                a1++;
                a2 = 0;
            } else {
                aHS[a1][a2] = c2;           /* NOTE(review): a1/a2 not bounded by 30x20 */
                a2++;
            }
        }
        fclose(HS);
        char (*pHS)[20] = aHS;
        /* Same loading loop for HFATYPCD.in into aHF. */
        c2 = 0;
        a1 = 0;
        a2 = 0;
        char aHF[30][20] = { "\0" };
        FILE *HF;
        HF = fopen("HFATYPCD.in", "r");
        while ((c2 = fgetc(HF)) != EOF) {
            if (c2 == '\n') {;
                a1++;
                a2 = 0;
            } else {
                aHF[a1][a2] = c2;
                a2++;
            }
        }
        fclose(HF);
        char (*pHF)[20] = aHF;
        /* Same loading loop for DQIND.in into aDQ. */
        c2 = 0;
        a1 = 0;
        a2 = 0;
        char aDQ[30][20] = { "\0" };
        FILE *DQ;
        DQ = fopen("DQIND.in", "r");
        while ((c2 = fgetc(DQ)) != EOF) {
            if (c2 == '\n') {;
                a1++;
                a2 = 0;
            } else {
                aDQ[a1][a2] = c2;
                a2++;
            }
        }
        fclose(DQ);
        char (*pDQ)[20] = aDQ;

        int NUM_COLS = 10;
        /* Per-column rules: type 'p' = regex, 'c' = category lookup, 'b' = length only; l/m go to chklen() as min/max. */
        struct parms_grp {
            int type;
            char *nam;
            char *pat;
            int l;
            int m;
        } par[10];
        par[0].type = 'p';
        par[0].nam = "ACT_DTE";
        par[0].pat = "([0-9]|(/|)){1,14}";
        par[0].l = 0;
        par[0].m = 14;
        par[1].type = 'b';
        par[1].nam = "FNMA_LN";
        par[1].pat = " ";
        par[1].l = 1;
        par[1].m = 10;
        par[2].type = 'c';
        par[2].nam = "DQIND";
        par[2].pat = "cat";
        par[2].l = 0;
        par[2].m = 0;
        par[3].type = 'c';
        par[3].nam = "HFATYPCD";
        par[3].pat = "cat";
        par[3].l = 0;
        par[3].m = 0;
        par[4].type = 'c';
        par[4].nam = "HSTATUS";
        par[4].pat = "cat";
        par[4].l = 0;
        par[4].m = 0;
        par[5].type = 'p';
        par[5].nam = "LPI_DTE";
        par[5].pat = "([0-9]|(/|)){1,14}";
        par[5].l = 0;
        par[5].m = 14;
        par[6].type = 'p';
        par[6].nam = "ACT_UPB";
        par[6].pat = "([0-9]|(/|)){1,09}";
        par[6].l = 1;
        par[6].m = 9;
        par[7].type = 'p';
        par[7].nam = "HFR_UPB";
        par[7].pat = "([0-9]|(/|)){1,10}";
        par[7].l = 0;
        par[7].m = 10;
        par[8].type = 'p';
        par[8].nam = "REMLIFE";
        par[8].pat = "([0-9]|(/|)){1,04}";
        par[8].l = 0;
        par[8].m = 4;
        par[9].type = 'b';
        par[9].nam = "HPOOL_NO";
        par[9].pat = " ";
        par[9].l = 0;
        par[9].m = 6;
        /* Compile each 'p' column's pattern once, before any input is read. */
        regex_t *regx[10];

        int i;
        for (i = 0; i < NUM_COLS; i++) {
            if (par[i].type == 'p') {
                regx[i] = (regex_t *) malloc(sizeof(regex_t));
                memset(regx[i], 0, sizeof(regex_t));
                if ((err_no = do_regex(regx[i], par[i].pat)) != EXIT_SUCCESS) {
                    return EXIT_FAILURE;
                }
            }
        }
        FILE *FH = NULL;
        if ((FH = fopen(filename, "r")) == NULL) {
            fprintf(stderr, "x open file %s;%s\n", filename, strerror(errno));
            exit(EXIT_FAILURE);
        }
        char line[1024];
        int line_no = 1;
        char *cp;
        /* Main pass: one iteration per input line, then one per column. */
        while ((cp = fgets(line, 1023, FH)) != NULL) {
            size_t i;                       /* shadows the int i declared above */
            char field[10][20];
            char *ptr = line;
            int n = 0;
            for (i = 0; i < sizeof field / sizeof *field; ++i) {
                if (sscanf(ptr, "%19[^|]%n%*c", field[i], &n) == 1) {   /* up to 19 non-'|' chars; %n = chars consumed */
    #ifndef TEST_WITHOUT_REGEX              /* was #ifdef: -DTEST_WITHOUT_REGEX must remove the checks, not enable them */
                    if (par[i].type == 'p') {
                        if (regexec(regx[i], field[i], 0, NULL, 0) == 0) {
                            chklen(par[i].l, par[i].m, field[i]);
                        } else {
                            errp(i, field[i]);
                        }
                    } else if (par[i].type == 'c') {
                        if (par[i].nam == "HSTATUS") {  /* NOTE(review): pointer comparison, not strcmp() */
                            chkcat(pHS, field[i]);
                        } else if (par[i].nam == "HFATYPCD") {
                            chkcat(pHF, field[i]);
                        } else if (par[i].nam == "DQIND") {
                            chkcat(pDQ, field[i]);
                        }
                        chklen(par[i].l, par[i].m, field[i]);
                    } else if (par[i].type == 'b') {
                        chklen(par[i].l, par[i].m, field[i]);
                    } else {        /*printf("ERROR1: type not found"); */
                    }
    #endif
                    line_no++;              /* NOTE(review): counts fields, not lines */
                    ptr += n + 1;
                } else {
                    field[i][0] = '\0';     /* empty field: nothing converted, step over one character */
                    ++ptr;
                }
            }
        }
        /* Release the compiled patterns; index par[] with i2 (i is NUM_COLS here, so par[i] read past the array). */
        int i2;
        for (i2 = 0; i2 < NUM_COLS; i2++) {
            if (par[i2].type == 'p') {
                regfree(regx[i2]);
                free(regx[i2]);
            }
        }

        curtime = time(NULL);
        loctime = localtime(&curtime);
        fputs(asctime(loctime), stdout);    /* end-of-run timestamp */
        fclose(FH);
        return EXIT_SUCCESS;

    }                               /* end of MAIN */
    
    chkcat(char (*ar)[20], char *fld)
    {
        int i = 0;
        for (i = 0; i < 30; i++) {
            if (strcmp(fld, ar[i]) == 0) {  /*printf("ERROR2 match yes %s",fld); */
            }
        }
    }
    int do_regex(regex_t * r, char *p)  /* compile extended regex p into r; EXIT_SUCCESS or EXIT_FAILURE */
    {
        int err_no = 0;
        if ((err_no = regcomp(r, p, REG_EXTENDED)) != 0) {
            size_t length = regerror(err_no, r, NULL, 0);   /* size query: bytes needed for the message */
            char *buffer = malloc(length);
            if (buffer != NULL) {   /* skip the message rather than write through NULL */
                regerror(err_no, r, buffer, length);
                fprintf(stderr, "%s\n", buffer);
            }
            free(buffer);           /* free(NULL) is a no-op */
            /* No regfree(): the content of *r is undefined after a failed regcomp(). */
            return EXIT_FAILURE;
        }
        return EXIT_SUCCESS;
    }
    
    chklen(int min, int max, char *s)
    {
        if (min > 0 && max > 0) {
            if ((strlen(s) == min | strlen(s) > min)
                && (strlen(s) < max | strlen(s) == max)) {
            } else {                /*printf("ERROR3 MinMax: %s %d",s,strlen(s)); */
            }
        } else if (min == 0 && max > 0) {
            if ((strlen(s) < max | strlen(s) == max) || strlen(s) == 0) {
            } else {                /*printf("ERROR4 MinMax: %s %d",s,strlen(s)); */
            }
        }
    }
    errp(int i, char *s)            /* hook for a field that failed its pattern: i = column index, s = field text */
    {
        /*printf("ERROR5 with PATTERN: %d %s",i,s); */
    }                               /* no-op as written: the only statement is commented out */
    2. It's also a strange mix of K&R C, C, C99 and C++ (who knows what else)
    There's implicit declaration of functions
    There's mix of declarations and statements.

    Did you compile it with any compiler warnings at all?
    Code:
    gcc -Wall hello.c
    hello.c: In function `main':
    hello.c:28: warning: implicit declaration of function `getopt'
    hello.c:164: warning: implicit declaration of function `memset'
    hello.c:165: warning: implicit declaration of function `do_regex'
    hello.c:172: warning: implicit declaration of function `strerror'
    hello.c:172: warning: format argument is not a pointer (arg 4)
    hello.c:188: warning: implicit declaration of function `chklen'
    hello.c:190: warning: implicit declaration of function `errp'
    hello.c:194: warning: implicit declaration of function `chkcat'
    hello.c:12: warning: unused variable `buffer'
    hello.c:21: warning: unused variable `optind'
    hello.c:21: warning: unused variable `opterr'
    hello.c: At top level:
    hello.c:232: warning: return type defaults to `int'
    hello.c: In function `chkcat':
    hello.c:235: warning: implicit declaration of function `strcmp'
    hello.c:238: warning: control reaches end of non-void function
    hello.c: At top level:
    hello.c:257: warning: return type defaults to `int'
    hello.c: In function `chklen':
    hello.c:259: warning: implicit declaration of function `strlen'
    hello.c:259: warning: suggest parentheses around comparison in operand of |
    hello.c:260: warning: suggest parentheses around comparison in operand of |
    hello.c:264: warning: suggest parentheses around comparison in operand of |
    hello.c:268: warning: control reaches end of non-void function
    hello.c: At top level:
    hello.c:270: warning: return type defaults to `int'
    hello.c: In function `errp':
    hello.c:272: warning: control reaches end of non-void function
    I'm surprised it runs at all.

    > Can anyone see an glaring inefficiencies in the way i'm handling the REGEX package please?
    No, but I see one in the way you're reading the file.
    if (sscanf(ptr, "%19[^|]%n%*c", field[i], &n) == 1) {
    A length-limited complemented character class is just about the worst thing you can pass to sscanf
    I suggest you use something like strtok() if you want to break up the string into "|" delimited strings, but watch out for strtok() modifying the original string if this is important to you.

    I wonder how long your code takes to read 75m records if you just comment out the regex itself (gcc -DTEST_WITHOUT_REGEX prog.c)
    Knowing that will tell you how much is down to file reading, and how much is down to regex'ing.

    Finally, use more functions - your main (reformatted) is well over 200 lines long, and it totally dominates the code.
    If you dance barefoot on the broken glass of undefined behaviour, you've got to expect the occasional cut.
    If at first you don't succeed, try writing your phone number on the exam paper.
    I support http://www.ukip.org/ as the first necessary step to a free Europe.

  7. #7
    Cat without Hat CornedBee's Avatar
    Join Date
    Apr 2003
    Posts
    8,893
    Admittedly, though, Java has an advantage in Regex handling, because it can compile the regex to bytecode. I'm not aware of any C/C++ regex library that does a similar thing.
    All the buzzt!
    CornedBee

    "There is not now, nor has there ever been, nor will there ever be, any programming language in which it is the least bit difficult to write bad code."
    - Flon's Law

  8. #8
    and the hat of wrongness Salem's Avatar
    Join Date
    Aug 2001
    Location
    The edge of the known universe
    Posts
    32,485
    > "([0-9]|(/|)){1,14}"
    Given that you've pre-chopped the line at pipe delimiters, your regexes will never see a pipe character. So how does this differ from say

    Code:
    "[0-9/]{1,14}"
    If you dance barefoot on the broken glass of undefined behaviour, you've got to expect the occasional cut.
    If at first you don't succeed, try writing your phone number on the exam paper.
    I support http://www.ukip.org/ as the first necessary step to a free Europe.

  9. #9
    Registered User
    Join Date
    Feb 2005
    Posts
    12

    Thanks Salem

    What is interesting about the C Regex is that it doesn't really seem to use those patterns correctly to begin with, when i test it, ie for instance, the {1,14}, it never seemed to work, so that's why i decided to do my own pattern matching and the [0-9] didn't work correctly either!

    And as far as that / between patterns, what i was basically trying to do, is load the same pattern we are using in a Java program, just to put a load on the compiler/matcher so we could do time comparisons between C & Java and the Java seems to be comparable to the C, which was a big surprise to me and in addition, i'm using a good number of objects in the Java code as well, ie HashSet, ArrayList and maybe that makes up for diff when figuring in the straight array search in C!

    Anyway, when i pulled out the REGEX from both, the C ran in 3 seconds, for 8 million recs and the Java ran in 3.2 seconds!

    Also Salem, i like that Wall feature, going to give that a try, and forgot to mention, this is my first C program in 5 years, been using Java primarily, but they're both about same, but i really love programming in Java, how do you feel about Java Salem?

    Thanks!

  10. #10
    and the hat of wrongness Salem's Avatar
    Join Date
    Aug 2001
    Location
    The edge of the known universe
    Posts
    32,485
    > how do you feel about Java Salem?
    I got as far as buying a book which so far remains unread.
    Nearly all my 'C' is ultimately for embedded systems, so there's no room for luxury higher level languages which make heavy demands on processors and memory.

    > Anway, when i pulled out the REGEX from both, the C ran in 3 seconds, for 8 million recs and the Java ran in 3.2 seconds!
    OK, so it looks like the regex is really something to do with it then.

    Can you post some example lines from your data set (or attach a file with say 10 records if line folding is going to be a problem). regex is a really funny animal at times, and it seems more art than science when it comes to using it.
    If you dance barefoot on the broken glass of undefined behaviour, you've got to expect the occasional cut.
    If at first you don't succeed, try writing your phone number on the exam paper.
    I support http://www.ukip.org/ as the first necessary step to a free Europe.

Popular pages Recent additions subscribe to a feed

Similar Threads

  1. Mats, the java answers
    By Jaqui in forum A Brief History of Cprogramming.com
    Replies: 1
    Last Post: 04-22-2008, 02:12 AM
  2. C#, Java, C++
    By incognito in forum A Brief History of Cprogramming.com
    Replies: 10
    Last Post: 10-05-2004, 02:06 PM
  3. The Java language is being expanded
    By DavidP in forum A Brief History of Cprogramming.com
    Replies: 26
    Last Post: 06-11-2004, 09:07 PM
  4. Java woes
    By DavidP in forum A Brief History of Cprogramming.com
    Replies: 15
    Last Post: 07-06-2003, 12:37 AM
  5. C or Java as a first language
    By CorJava in forum A Brief History of Cprogramming.com
    Replies: 34
    Last Post: 10-23-2002, 05:12 PM

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21