-
large reference file
hi,
i am working on a code it has two parts:
for the first part i have
1500-4000 text files which have strings in them.some files are same and some
have different strings and i have to find amongst those a subset of files which
are 10-20 n number and they are covering all the strings in 1500 files. since
the string are repeated alot in files, only some files can cover the variation
of strings.
to implement this code i used the following logic:
1. Read
all the coverage information from each test folder for Statement Coverage and
put it into a String array, the array should contain only distinct element. Let
this array be A
2. Now create a similar array B of boolean values of
lenght A with all values False. This will be our array to check eveything is
covered or not
3. Read coverage info one by one from test0 to test1590
and start including into the set. As you include one in the set, mark the
corresponding elements in array B to be true.
4. Continue 3 till all the
values in B is true.
my code has two parts,one of it which reads all
the files and collects unique strings is as follows:
also the reference file
is at first chosen as the file among 1500 files with maximum no of statements.
Code:
int comparefiles(const char *filename1,const char *filename2, int
x);
int countlines(const char *filename);
#include
<stdio.h>
#include <stdlib.h>
#include <string.h>
int main()
{
int i=0,j=0,cmpx,nl1=0,nl2=0,dt;
char
buffer[25000]={},buffer1[25000]={}, buffero[25000]={};
FILE *output;
char
buf[20000]={};
char buf1[20000]={};
char bufo[20000]={};
char
string2[]="/COND.txt";
char string[]="rctcas.txt";
//char
string[]="TEST";
for(i=0;i<1;i++)
{
sprintf(buf,"/home/csgrads/akhan015/desktop/programs/benchmarks/tcas/coverage/test1260%s",string2);
printf("reading
reference file %s
\n",buf);
for(j=0;j<=1589;j++)
{
sprintf(buf1,"/home/csgrads/akhan015/desktop/programs/benchmarks/tcas/coverage/test%d%s",
j,string2);
printf("reading inner loop %s \n",buf1); //sending reference file
and another file to compare
dt=comparefiles(buf,buf1,j);
}
}
return
0;
}
int countlines(const char *filename) //count no of
lines
{
FILE *fm;
char line[1024];
int NumberOfLines =
0;
fm=fopen(filename, "r");
while( fgets(line,sizeof(line),fm) !=
NULL)
NumberOfLines++;
return(NumberOfLines);
fclose(fm);
}
int
comparefiles(const char *filename1,const char *filename2, int x)
{
FILE *
fref;
FILE *output;
FILE * myfile1;
char bufo[20000]={};
char
cx1[10000]={} ,cx2[10000]={},cx3[10000]={};
int cmpx,cmpx1;
signed int
s=-1;
int nl1,nl2,nl3;
//fflush(fref);
fref= fopen(filename1,
"r");
nl1=countlines(filename1);
myfile1=
fopen(filename2,"r");
nl2=countlines(filename2);
if((fref== NULL) ||
(myfile1== NULL))
printf("Error occurs in the file
\n");
else
{
int j = 0,
k=0;
rewind(myfile1);
first:
while((fgets(cx2 ,30 ,myfile1)!= NULL))
//choose strings and compare and stop when all the strings from //a file match
reference file, i.e. no unique string to add
{j++;
int i =
0;
rewind(fref);
while((fgets(cx1 ,30, fref)!=
NULL))
{
i++;
if((cmpx=strcmp(cx2 ,cx1))== 0)
{
k++;
if(k==(nl2))
{printf("%d=%d FILES ARE SAME\n",k,nl2
);
return;
}
goto
first;
}
}
if((j!=k)&&((cmpx=strcmp(cx2
,cx1))!= 0)) //here a different statement is found //and checked whether it has
been saved in rctcas during comparison with some other file
{
printf("
STATEMENT DOESNOT EXIST\n");
output=fopen("rctcas.txt","a+");
int
l=0;
nl3=countlines("rctcas.txt");
second:
while((fgets(cx3 ,30,
output)!= NULL))
{puts(cx2);
puts(cx3);
l++;
if((cmpx1=strcmp(cx2
,cx3))== 0)
{
return;}
else
if(l==nl3)
{fputs(cx2,output);
fclose(output);}
}
}
}
return;
}
i
am having two problems in my code:
first after executing 255 times the code
gives segmentation fault. this could be because of buffer overflow. maybe using
malloc can work but i am not sure how to use malloc as i am a
beginner.
second the code creates a reference file which compares a large
number of files, e.g. 1500 or 4000 etc and it extracts all the unique strings
from the files and store them.whenever the loop runs,reference file creates a
checklist to check that all strings are covered. any file being compared is
added on the condition that it has atleast one string not covered by previous
files.according to the observation the refernce file might contain about 100
unique strings but my reference file is giving 17000 strings because the code
has some error. please help me i have to submit the code tomorrow and i cant
find the fault. while using fopen i gave 'a+' but that after running two times
gives a weird memory table and says aborted.
-
Are you going out of your way to post deliberately hard to read code?
SourceForge.net: Indentation - cpwiki
-
As I mentioned in that OTHER forum, you open two files in the comparefiles() function (which is apparently malformed), and close neither of them before the function returns. That will eventually crash your program.
You may have other errors as well.
-
I will definitily not try to read this....
-
.i tried closing the files every time but it still
gives segmentation fault.secondly please tell me how to modify the code to add
malloc.i havent used it before. i read abt it but i cant understand how to
change this code to add malloc.please help me.
-
Code:
void comparefiles(const char *filename1,const char *filename2, int x);
int countlines(const char *filename);//these functions are to compare files passed as an argument and compare files.the countline function is to count the number of lines in the code.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int main()
{
int i=0,j=0,cmpx,nl1=0,nl2=0,dt;
char buf[20000]={};//this buffer has the path copied in it for the reference file
char buf1[20000]={};//this buffer has the path copied in it for the files to be compared
char string2[]="/COND.txt";//the text file to be accessed
char string[]="rctcas.txt";//name of reference file
for(i=0;i<1;i++) //this loop opens reference file
{
sprint(buf,"/home/csgrads/akhan015/desktop/programs/benchmarks/tcas/coverage/test1260%s",string2);
printf("reading reference file %s \n",buf);
for(j=0;j<=1589;j++)//this loop opens the 1500 files to be compared
{
sprintf(buf1,"/home/csgrads/akhan015/desktop/programs/benchmarks/tcas/coverage/test%d%s", j,string2);//variable paths
printf("reading inner loop %s \n",buf1);
comparefiles(buf,buf1,j);
}
}
return 0;
}
int countlines(const char *filename)
{
FILE *fm;
char line[1024];
int NumberOfLines = 0;
fm=fopen(filename, "r");
while( fgets(line,sizeof(line),fm) != NULL)
NumberOfLines++;
return(NumberOfLines);
fclose(fm);
}
void comparefiles(const char *filename1,const char *filename2, int x){
FILE * fref;
FILE *output;
FILE * myfile1;
char cx1[10000]={} ,cx2[10000]={},cx3[10000]={};
int cmpx,cmpx1;
signed int s=-1;
int nl1,nl2,nl3;
fref= fopen(filename1, "r");//opening reference file
nl1=countlines(filename1);//counting no of lines in reference file
myfile1= fopen(filename2,"r");//opening file one of 1500 files
nl2=countlines(filename2);//counting lines
if((fref== NULL) || (myfile1== NULL))
printf("Error occurs in the file \n");
else
{
int j = 0, k=0;
rewind(myfile1);
first:
while((fgets(cx2 ,30 ,myfile1)!= NULL)) //pick 1 string from file to compare it with the refernce file
{j++; //j tells the no. of times this loop executes
int i = 0;
rewind(fref);
while((fgets(cx1 ,30, fref)!= NULL))//this loop compares a string with all the strings in reference file
{
i++;// no of times 2nd loop execute
if((cmpx=strcmp(cx2 ,cx1))== 0)
{
if(k==(nl2))// if all strins of a file are in the reference file then we go to another file and repeat the procedure
{
printf("%d=%d FILES ARE SAME\n",k,nl2 );
return;
}
k++,//no of matches in both files
else
goto first;//continue comparing strings
}
}
if((j!=k)&&((cmpx=strcmp(cx2 ,cx1))!= 0))// if all the strings are picked and all match then j==k but here some string doesnt match
{
printf(" STATEMENT DOESNOT EXIST\n");
output=fopen("rctcas.txt","a+");
int l=0;
nl3=countlines("rctcas.txt");//count lines in the file
second:
while((fgets(cx3 ,30, output)!= NULL))// here before writing the string to the file i want to check it is not appended again i.e if the same string was there in some other file but not in reference that it would written again and again
{
puts(cx2);
puts(cx3);
l++;// l shows how many times loop is executed
if((cmpx1=strcmp(cx2 ,cx3))== 0)
{
fclose(myfile1);
fclose(fref);
return;}
else if(l==nl3)//if string does not exist and the whole file is checked
{
fputs(cx2,output);
fclose(output);
}
}
}
}
}
fclose(myfile1);
fclose(fref);
return;
}
-
Ok... that's a freaking mess.
Indent that code.
Clean up the comments.
Then, maybe I'll take a look at it.
-
Here's a good start at a cleaned up version:
Code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
void comparefiles(const char *filename1,const char *filename2, int x);
int countlines(const char *filename);
/*
these functions are to compare files passed as an argument and compare files.the countline function is to count the number of lines in the code.
*/
int main()
{
int i=0,j=0,cmpx,nl1=0,nl2=0,dt;
char buf[20000]= {}; //buf has the path copied in it for the reference file
char buf1[20000]= {}; //buf1 has the path copied in it for the files to be compared
char string2[]="/COND.txt";//the text file to be accessed
char string[]="rctcas.txt";//name of reference file
for(i=0; i<1; i++) //this loop opens reference file
{
sprint(buf,"/home/csgrads/akhan015/desktop/programs/benchmarks/tcas/coverage/test1260%s",string2);
printf("reading reference file %s \n",buf);
//loop opens the 1500 files to be compared
for(j=0; j<=1589; j++)
{
sprintf(buf1,"/home/csgrads/akhan015/desktop/programs/benchmarks/tcas/coverage/test%d%s", j,string2);//variable paths
printf("reading inner loop %s \n",buf1);
comparefiles(buf,buf1,j);
}
}
return 0;
}
int countlines(const char *filename)
{
FILE *fm;
char line[1024];
int NumberOfLines = 0;
fm=fopen(filename, "r");
while( fgets(line,sizeof(line),fm) != NULL)
NumberOfLines++;
return(NumberOfLines);
fclose(fm);
}
void comparefiles(const char *filename1,const char *filename2, int x) {
FILE * fref;
FILE *output;
FILE * myfile1;
char cx1[10000]= {} ,cx2[10000]= {},cx3[10000]= {};
int cmpx,cmpx1;
signed int s=-1;
int nl1,nl2,nl3;
fref= fopen(filename1, "r");//opening reference file
nl1=countlines(filename1);//counting no of lines in reference file
myfile1= fopen(filename2,"r");//opening file one of 1500 files
nl2=countlines(filename2);//counting lines
if((fref== NULL) || (myfile1== NULL))
printf("Error occurs in the file \n");
else
{
int j = 0, k=0;
rewind(myfile1);
first:
while((fgets(cx2 ,30 ,myfile1)!= NULL)) //pick 1 string from file to compare it with the refernce file
{ j++; //j tells the no. of times this loop executes
int i = 0;
rewind(fref);
while((fgets(cx1 ,30, fref)!= NULL))//this loop compares a string with all the strings in reference file
{
i++;// no of times 2nd loop execute
if((cmpx=strcmp(cx2 ,cx1))== 0)
{
if(k==(nl2))// if all strins of a file are in the reference file then we go to another file and repeat the procedure
{
printf("%d=%d FILES ARE SAME\n",k,nl2 );
return;
}
k++,//no of matches in both files
else
goto first;//continue comparing strings
}
}
if((j!=k)&&((cmpx=strcmp(cx2 ,cx1))!= 0))// if all the strings are picked and all match then j==k but here some string doesnt match
{
printf(" STATEMENT DOESNOT EXIST\n");
output=fopen("rctcas.txt","a+");
int l=0;
nl3=countlines("rctcas.txt");//count lines in the file
second:
while((fgets(cx3 ,30, output)!= NULL))// here before writing the string to the file i want to check it is not appended again i.e if the same string was there in some other file but not in reference that it would written again and again
{
puts(cx2);
puts(cx3);
l++;// l shows how many times loop is executed
if((cmpx1=strcmp(cx2 ,cx3))== 0)
{
fclose(myfile1);
fclose(fref);
return;
}
else if(l==nl3)//if string does not exist and the whole file is checked
{
fputs(cx2,output);
fclose(output);
}
}
}
}
}
fclose(myfile1);
fclose(fref);
return;
}
-
A couple of comments (line numbers relate to previous unindented version, sorry):
line 34: you have an fclose after a return, meaning the file won't ever be closed. In general I think you need to organise your file opening and closing -- I think your loop in main could open and close 1 file each time through the loop, and pass a FILE pointer around for the likes of countlines. I might be wrong but I think it'd improve the code a fair bit.
Code:
return(NumberOfLines);
fclose(fm);
line 46:
Code:
fref= fopen(filename1, "r");//opening reference file
nl1=countlines(filename1);//counting no of lines in reference file
myfile1= fopen(filename2,"r");//opening file one of 1500 files
nl2=countlines(filename2);//counting lines
if((fref== NULL) || (myfile1== NULL))
printf("Error occurs in the file \n");
Ordering problem. If the file doesn't exist or can't be opened, fopen() will return NULL, but then the code calls countlines() which does another fopen and unprotected fread. This could well be causing a segfault.
line 83 and others
Code:
while((fgets(cx2 ,30 ,myfile1)!= NULL))
Why are you declaring such enormous arrays, and then barely using them? This reads a maximum of 30 characters into a 20000 char buffer. Can the lines in the files be longer than 30 characters? I suspect they can, 30 char is really short. This would probably result in your program thinking it'd identifies more strings: it's split some of them.
Probably best to decide what the longest line is, and read that much into an array of an appropriate size. Your filename arrays are ginormous too, no need for it (though if the large arrays were going to cause a problem I'd expect it to happen earlier than the 255th iteration).
Eh... good luck!
-
hi all,
thanks for your suggestions.
1)
i added fclose but it gives segmentation fault again. i posted the modified code on forum under the same thread.i also read abt malloc but i cant understand how to use malloc in this code. if i only declare my pointer to data and use malloc function will my code remain the same.where in my code i have to make changes to use malloc.i never used it before.
2)
Please can u check the logic of the last part of the function compare files which is trying to eliminate identical strings to be added. because i am having a large reference file which contains unique strings from all the files. the reference file should contain only 100 strings because in all of 1500 files the unique strings are 100 - 200 but i am getting 17000 strings.
for choosing the reference file i take the file with maximum no. of strings among the 1500 files and compare it with all other files.
3)
Another question i am confused about is that for the last part of logic in compare files the program gives a weird error. it shows a table with memory addresses talks abt glib.c and says aborted at the end. this error comes when i use 'a+' in fopen instead of 'a'. but if i use 'a' i cant read the refence file because i have to compare the existing reference file so that i can eliminate identical strings. is my logic fine or i should try to make the code for comparing the referece file with string arrays.
i have a sample of data to give an idea how things go in my code.
my reference files initially is:
Code:
tcas.c:63:0x8048447:0
tcas.c:73:0x8048470:1
tcas.c:79:0x80484B7:0
tcas.c:79:0x80484C3:0
tcas.c:79:0x80484D0:0
tcas.c:91:0x8048502:1
tcas.c:97:0x804854A:0
tcas.c:97:0x8048553:0
tcas.c:97:0x8048560:1
tcas.c:118:0x80485B6:0
tcas.c:118:0x80485C2:0
tcas.c:118:0x80485CE:0
tcas.c:120:0x80485F5:0
tcas.c:120:0x80485FE:0
tcas.c:124:0x8048612:0
tcas.c:124:0x804861C:0
tcas.c:124:0x8048622:1
tcas.c:126:0x804863C:0
tcas.c:126:0x8048645:1
tcas.c:127:0x8048662:0
tcas.c:127:0x804866B:0
tcas.c:128:0x804867E:1
tcas.c:133:0x8048693:1
tcas.c:135:0x80486A2:0
tcas.c:148:0x80486D9:1
with this reference file i am comparing 1500 files.
sample of two files out of 1500 are:
first file:
Code:
tcas.c:63:0x8048447:0
tcas.c:73:0x8048470:0
tcas.c:75:0x8048480:0
tcas.c:75:0x8048489:0
tcas.c:75:0x8048496:0
tcas.c:91:0x8048502:0
tcas.c:93:0x8048512:0
tcas.c:93:0x804851E:0
tcas.c:93:0x804852B:0
tcas.c:118:0x80485B6:0
tcas.c:118:0x80485C2:0
tcas.c:118:0x80485CE:0
tcas.c:120:0x80485F5:1
tcas.c:124:0x8048612:0
tcas.c:124:0x804861C:1
tcas.c:124:0x8048628:0
tcas.c:126:0x804863C:1
tcas.c:127:0x8048662:0
tcas.c:127:0x804866B:1
tcas.c:128:0x804867E:1
tcas.c:133:0x8048693:1
tcas.c:135:0x80486A2:1
tcas.c:148:0x80486D9:1
2nd file:
Code:
tcas.c:63:0x8048447:0 //programname.c:line no.:address:input
tcas.c:73:0x8048470:0
tcas.c:75:0x8048480:0
tcas.c:75:0x8048489:0
tcas.c:75:0x8048496:1
tcas.c:91:0x8048502:0
tcas.c:93:0x8048512:0
tcas.c:93:0x804851E:0
tcas.c:93:0x804852B:1
tcas.c:118:0x80485B6:0
tcas.c:118:0x80485C2:0
tcas.c:118:0x80485CE:0
tcas.c:120:0x80485F5:1
tcas.c:124:0x8048612:0
tcas.c:124:0x804861C:1
tcas.c:124:0x8048628:0
tcas.c:126:0x804863C:0
tcas.c:126:0x8048645:0
tcas.c:127:0x8048662:1
tcas.c:128:0x804867E:0
tcas.c:128:0x8048684:1
tcas.c:133:0x8048693:0
tcas.c:148:0x80486D9:1
for the output the strings in the reference file are not repeated, but the strings that are different than the reference file are appended again and again. there is probably some error in the last part of the function compare files, after fopen("rctcas.txt","a+")
please suggest what i should do, i will be really obliged.
-
Please post your current code...
I rather suspect your problem is not one of code, but one of process...
I would begin by loading the reference file as an array of pointers to strings. An example of how to create and load such a file is ... HERE
I would then sort the array into alphabetical order using quicksort or some other optimized sorting algorythm. You don't actually have to copy/move the strings, just swap pointers, so it can be very fast.
I would then load each of the other files, one at a time, line by line, using a binary search to check each line against the reference file. An example of a binary search can be found HERE Any strings not found can be inserted into the array in order so there's no need to resort. Any strings that are found can simply be discarded.
Once all the secondary files are parsed, I would rewrite the reference file with the data from memory. Since at this point the file is sorted, you can simply load and search on future uses of the program.
There should be no need to do things like pre-counting the number of lines in the files...
-
I was wondering where you were earlier today, Amna.
Your logic does not cover all the possible contingencies, and is being overwhelmed by the mass of data and the number of files.
IMO, you don't want to keep this file based, you need to do the work in memory, at the much faster RAM speed.
I agree with Tater's suggestions above, 100%. Time is short, so let's get a quick start on this.
For our quick start, let's say you will have a maximum of 400 strings, and each string will be less than 80 chars in length. (I know your actual char's are much longer than the examples you have here.) There should be a define for the length, so it can be easily changed.
Here's a binary search function for strings: A return of -1 indicates that no match was found for the string. It's essential for a quick and easy search. Your array of strings is called str in this example:
Code:
int binarySearch(char goal[], int left, int right) {
int lo, mid, hi,count=0;
lo=left;
hi=right;
while (lo <= hi) {
++count;
mid = (lo + hi) / 2;
if (strcmp(goal, str[mid]) < 0) //goal < str[mid]:
hi = mid-1;
else if(strcmp(goal, str[mid]) > 0) //goal > str[mid]:
lo = mid+1;
else
return mid;
}
return -1;
}
More later. Hang in there, Amna! :cool:
-
Tidying up some details still.
-
This is the kind of skeleton program that I had in mind. Note that the binary search does not work with the index[] array, yet. I'll fix it in just a bit.
Code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define STRNUM 400
#define STRLEN 80
#define MAXFILES 3
char str[STRNUM][STRLEN]; //a large array for strings
int binarySearch(const char goal[], int left, int right);
void printIt(int);
void sort(int idx[STRNUM], int lo, int hi);
int main(void) {
FILE *fp = fopen("AmnaRef.txt","r"); //reference file
int i=0, count = 0, found=0;
int idx[STRNUM];
char fname[30]={"data"};
char fext[5]={".txt"};
char test[STRLEN]; //test string array
char charN[7];
if(!fp) {
printf("Error: reference file did not open!\n");
return 1;
}
fgets(str[0], STRLEN, fp); //first string
count++;
for(i=0;i<2;i++) { //MAXFILES will replace the 2, when ready
if(i) {
_itoa(i,charN,10); //get next data filename
strcat(fname,charN);
strcat(fname,fext);
printf("Next filename: %s\n",fname);
getchar();
break;
}
while((fgets(test,STRLEN,fp)) != NULL) {
if(strlen(test)< 2) //ignore any blank rows of strings
continue;
//found=binarySearch(test, 0, count);
found=-1;
if(found == -1) {
//it's unique, copy into str[]
strcpy(str[++count],test);
}
sort(idx,0, count); //resort the str array
}
}
fclose(fp);
printIt(count);
getchar();
printf("\n\n");
return 0;
}
//insertion sort of strings, through an index[] array
void sort(int idx[STRNUM], int lo, int hi) {
int i,j;
char *pval;
for(i=0;i<=hi;i++) //initialize index array
idx[i]=i;
for(i=lo+1;i<hi;i++) { //insertion sort - very fast on almost sorted arrays
pval = str[idx[i]];
j = i-1;
while(strcmp(str[idx[j]], pval) >0) {
idx[j+1] = idx[j];
--j;
if(j<0) break;
}
idx[j+1] = i;
}
puts("\nAfter Insertion Sort:\n");
for(i=lo;i<hi;i++)
printf("%s",str[idx[i]]);
getchar();
}
int binarySearch(const char test[], int left, int right) {
int lo, mid, hi,count=0;
lo=left;
hi=right;
while (lo <= hi) {
++count;
mid = (lo + hi) / 2;
if (strcmp(test, str[mid]) < 0)
hi = mid-1;
else if(strcmp(test, str[mid]) > 0)
lo = mid+1;
else
return mid;
}
return -1;
}
void printIt(int count) {
int i;
for(i=0;i<count;i++)
printf("%s",str[i]);
}
An array of the size you need, perhaps 400 strings, 1024 char long, is fine in global space. For this skeleton, I just used a shorter 80 char's, but I recall your actual data is longer than that. Changing the define makes it easy to adjust.
I'll fix up the binary search function, but I'd start here and adapt this to your needs. Do you have the string function itoa() or _itoa()?
Note that this is not the fastest program possible. That would require more time to create and test. This will be much faster than what you had previously however, and accurate, when it's finished.
Nothing has been tested here, yet, so watch out for bugs! :p
-
The above program skeleton, but with all the parts working. Not necessarily all working correctly, but they seem to be, in very limited testing (almost none).
It may not be the fastest way to do this job, but it is surprisingly fast.
Code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define STRNUM 400
#define STRLEN 80
#define MAXFILES 3
char str[STRNUM][STRLEN]; //a large array for strings
int idx[STRNUM]; //the index array
int binarySearch(const char goal[], int left, int right);
void printIt(int);
void sort(int lo, int hi);
int toFile(int);
int main(void) {
FILE *fp = fopen("AmnaRef.txt","r"); //reference file
int i=0, n, count = 0, found=0;
char fname[30]={"data"};
char fext[5]={".txt"};
char test[STRLEN]; //test string array
char charN[8],yesNo;
if(!fp) {
printf("Error: reference file did not open!\n");
return 1;
}
fgets(str[0], STRLEN, fp); //first string
count++;
for(i=0;i<MAXFILES;i++) { //MAXFILES;
if(i) {
for(n=4;n<sizeof(fname);n++)
fname[n]='\0';
_itoa(i,charN,10); //get next data filename
strcat(fname,charN);
strcat(fname,fext);
printf("Now processing file: %s\n",fname);
if((fp=fopen(fname, "r"))== NULL) {
printf("Error: opening file %s\n",fname);
return 1;
}
}
while((fgets(test,STRLEN,fp)) != NULL) {
if(strlen(test)< 2) //ignore any blank rows of strings
continue;
found=binarySearch(test, 0, count);
if(found == -1) {
//it's unique, copy test into str[]
strcpy(str[++count],test);
}
sort(0, count); //resort the str array
}
fclose(fp);
}
fclose(fp);
//printIt(count);
i=toFile(count);
if(i) {
printf("\nThe output file wasn't written - retry [y/n]?");
scanf("%c",&yesNo);
if(yesNo=='y' || yesNo=='Y')
i=toFile(count);
if(i)
printf("\n Not writing to the file!\n");
}
printf("\n");
return 0;
}
//insertion sort of strings, through an index[] array
void sort(int lo, int hi) {
int i,j;
char *pval;
for(i=0;i<=hi;i++) //initialize index array
idx[i]=i;
for(i=lo+1;i<hi;i++) { //insertion sort - very fast on almost sorted arrays
pval = str[idx[i]];
j = i-1;
while(strcmp(str[idx[j]], pval) >0) {
idx[j+1] = idx[j];
--j;
if(j<0) break;
}
idx[j+1] = i;
}
/*
puts("\nAfter Insertion Sort:\n");
for(i=lo;i<hi;i++)
printf("%s",str[idx[i]]);
getchar();
*/
}
int binarySearch(const char test[], int left, int right) {
int lo, mid, hi;
lo=left;
hi=right;
while (lo < hi) {
mid = (lo + hi) / 2;
if (strcmp(test, str[idx[mid]]) < 0)
hi = mid-1;
else if(strcmp(test, str[idx[mid]]) > 0)
lo = mid+1;
else
return mid;
}
return -1;
}
void printIt(int count) {
int i;
for(i=0;i<count;i++)
printf("%s",str[idx[i]]);
}
int toFile(int count) {
int i;
FILE *fp=fopen("UniqueStrings.txt", "w");
if(!fp) {
printf("Error: unable to open UniqueStrings.txt file\n");
return 1;
}
for(i=0;i<count;i++)
fprintf(fp, "%s",str[idx[i]]);
fclose(fp);
return 0;
}
You no longer need a "reference" file for this. Any file will do. If the rest of the data files are numbered consecutively, that's all that matters to the program.