Hello there,
I've got the following programming challenge:
A C program that performs the following text-processing on ascii files: it compares two files and prints out lines of the second file that are not identical to any line in the first file. Assume that the files are large -- about 1 million lines each. Do not use library functions other than those in the standard C library.
I have the following draft which does the job, however, for one million lines, as it turns out, it takes many hours!
I was hoping to get some ideas on this list on how to speed up the code.
Thanks.
Code:
#include <stdio.h>
#include <stdlib.h>
#define LINE_LENGTH 256
int main(int argc, char **argv)
{
int j, n1, i1;
char line[LINE_LENGTH];
FILE * fp;
char * file_name;
int rvalue;
/* read into memory file 1 */
fp = fopen("text1.txt","r");
char ** file1 = NULL;
int * lengths = NULL;
j = 0;
while ( fgets(line,LINE_LENGTH,fp) != NULL) {
file1 = realloc(file1,(j+1) * sizeof(char *));
file1[j] = calloc(sizeof(char), strlen(line)+1);
strcpy(file1[j],line);
lengths = (int *)realloc(lengths, (j+1)*sizeof(int));
lengths[j] = strlen(line);
j++;
}
n1 = j;
fclose(fp);
/* open file 2 and compare lines */
fp = fopen("text2.txt","r");
int has_match;
int i = 0 ;
while ( fgets(line,LINE_LENGTH,fp) != NULL) {
has_match = 0;
for ( i1 = 0 ; (i1 < n1) && (has_match == 0); i1++) {
if ( strcmp(file1[i1], line) == 0 ) {
has_match = 1;
}
}
if ( has_match == 0 ) {
printf("%s",line);
}
}
fclose(fp);
return EXIT_SUCCESS;
}