<regex.h> help needed! C slower than Java
Hello!
For some reason, this C program using REGEX is slower than an equivalent Java program which is also using REGEX!
I'm only compiling the PATTERN once and storing it in an array and then reusing that array PATTERN, but the run time is almost 4 times slower than Java and 10 times slower than when i create my own pattern matcher!
Can anyone see an glaring inefficiencies in the way i'm handling the REGEX package please?
Thanks very much from BobK!
Also, at the very bottom i've included my own pattern mathing routine, very simple prototype for now, and this runs in 3 minutes for 75 million records, versus the 30 minutes for the program below!
Code:
#include <stdio.h>
#include <stdlib.h>
#include <regex.h>
#include <strings.h>
#include <errno.h>
#include <time.h>
#define SIZE 256
int main(int argc, char **argv) {
char buffer[SIZE]; time_t curtime; struct tm *loctime;curtime=time(NULL);loctime=localtime(&curtime);fputs (asctime (loctime), stdout);
int c; char *filename=NULL;extern char *optarg; extern int optind, opterr, optopt; int err_no;char *pattern;
if(argc==1) { return(EXIT_SUCCESS); }
while((c=getopt(argc, argv, "f:p:"))!= EOF) { switch(c) {
case 'f': filename=optarg; break; case 'p': pattern=optarg; break;
case '?': fprintf (stderr, "Unknown option `-%c'.\n", optopt); return EXIT_FAILURE; default: return EXIT_SUCCESS; } }
int c2;int a1=0;int a2=0;char aHS[30][20]={"\0"};FILE *HS; HS=fopen("HSTATUS.in","r");
while((c2=fgetc(HS))!=EOF){if (c2=='\n'){;a1++;a2=0;}else{aHS[a1][a2]=c2;a2++;}}fclose(HS);
char (*pHS)[20]=aHS;
c2=0; a1=0; a2=0;char aHF[30][20]={"\0"};FILE *HF; HF=fopen("HFATYPCD.in","r");
while((c2=fgetc(HF))!=EOF){if (c2=='\n'){;a1++;a2=0;}else{aHF[a1][a2]=c2;a2++;}}fclose(HF);
char (*pHF)[20]=aHF;
c2=0; a1=0; a2=0;char aDQ[30][20]={"\0"};FILE *DQ; DQ=fopen("DQIND.in","r");
while((c2=fgetc(DQ))!=EOF){if (c2=='\n'){;a1++;a2=0;}else{aDQ[a1][a2]=c2;a2++;}}fclose(DQ);
char(*pDQ)[20]=aDQ;
int NUM_COLS=10;
struct parms_grp { int type; char *nam; char *pat; int l; int m; } par[10];
par[0].type='p';par[0].nam="ACT_DTE" ;par[0].pat="([0-9]|(/|)){1,14}"; par[0].l=0;par[0].m=14;
par[1].type='b';par[1].nam="FNMA_LN" ;par[1].pat=" "; par[1].l=1;par[1].m=10;
par[2].type='c';par[2].nam="DQIND" ;par[2].pat="cat"; par[2].l=0;par[2].m=0;
par[3].type='c';par[3].nam="HFATYPCD";par[3].pat="cat"; par[3].l=0;par[3].m=0;
par[4].type='c';par[4].nam="HSTATUS" ;par[4].pat="cat"; par[4].l=0;par[4].m=0;
par[5].type='p';par[5].nam="LPI_DTE" ;par[5].pat="([0-9]|(/|)){1,14}"; par[5].l=0;par[5].m=14;
par[6].type='p';par[6].nam="ACT_UPB" ;par[6].pat="([0-9]|(/|)){1,09}"; par[6].l=1;par[6].m=9;
par[7].type='p';par[7].nam="HFR_UPB" ;par[7].pat="([0-9]|(/|)){1,10}"; par[7].l=0;par[7].m=10;
par[8].type='p';par[8].nam="REMLIFE" ;par[8].pat="([0-9]|(/|)){1,04}"; par[8].l=0;par[8].m=4;
par[9].type='b';par[9].nam="HPOOL_NO";par[9].pat=" "; par[9].l=0;par[9].m=6;
regex_t *regx[10];
int i;for(i=0;i<NUM_COLS;i++) {
if (par[i].type=='p') {
regx[i]=(regex_t *) malloc(sizeof(regex_t)); memset(regx[i],0,sizeof(regex_t));
if((err_no=do_regex(regx[i], par[i].pat))!=EXIT_SUCCESS) { return EXIT_FAILURE; }
}
}
FILE *FH=NULL;if((FH=fopen(filename, "r"))==NULL){fprintf(stderr,"x open file %s;%s\n",filename,strerror(errno));exit(EXIT_FAILURE);}
char line[1024]; int line_no=1;char *cp;
while((cp=fgets(line, 1023, FH))!=NULL) {
size_t i; char field [10][20]; char *ptr = line; int n = 0;
for (i=0; i<sizeof field/sizeof *field; ++i )
{
if ( sscanf(ptr, "%19[^|]%n%*c", field[i], &n) == 1 )
{
if (par[i].type=='p')
{
if (regexec(regx[i],field[i],0,NULL,0)==0) <<<<<<<<<<<<
{
chklen(par[i].l,par[i].m,field[i]);
}
else { errp(i,field[i]); }
}
else
if (par[i].type=='c')
{
if (par[i].nam=="HSTATUS") {chkcat(pHS,field[i]);}else
if (par[i].nam=="HFATYPCD") {chkcat(pHF,field[i]);}else
if (par[i].nam=="DQIND") {chkcat(pDQ,field[i]);}
chklen (par[i].l,par[i].m,field[i]);
}
else
if (par[i].type=='b')
{
chklen(par[i].l,par[i].m,field[i]);
}
else { /*printf("ERROR1: type not found");*/ }
line_no++; ptr += n + 1;
}
else
{
field[i][0] = '\0'; ++ptr;
}
}
}
int i2;for(i2=0;i2<NUM_COLS;i2++)
{
if (par[i].type=='p') { regfree(regx[i2]); free(regx[i2]); }
}
curtime=time(NULL);loctime=localtime(&curtime);fputs (asctime (loctime), stdout);
fclose(FH);return EXIT_SUCCESS;
} /* end of MAIN */
chkcat(char (*ar)[20],char *fld)
{
int i=0;for (i=0;i<30;i++) {
if (strcmp(fld,ar[i])==0) { /*printf("ERROR2 match yes %s",fld);*/}
}
}
int do_regex(regex_t *r, char *p)
{
int err_no=0;
if((err_no=regcomp(r, p,REG_EXTENDED))!=0) {
size_t length; char *buffer; length=regerror(err_no, r, NULL, 0);buffer = malloc(length);regerror(err_no, r, buffer, length);
fprintf(stderr, "%s\n", buffer); free(buffer); regfree(r); return EXIT_FAILURE; }
return EXIT_SUCCESS;
}
chklen(int min,int max,char *s)
{
if (min>0 && max>0)
{
if ((strlen(s)==min|strlen(s)>min) && (strlen(s)<max|strlen(s)==max)) {}
else {/*printf("ERROR3 MinMax: %s %d",s,strlen(s));*/}
}
else
if (min==0 && max>0)
{
if ( (strlen(s)<max | strlen(s)==max) || strlen(s)==0) {}
else {/*printf("ERROR4 MinMax: %s %d",s,strlen(s));*/}
}
}
errp(int i,char *s)
{
/*printf("ERROR5 with PATTERN: %d %s",i,s);*/
}