Code:
/*
This is a text based (not binary), word searching program, It's purpose is educational, rather than utilitarian. It shows how to
automatically build an index array, and how that index array can then be used to cut down the search space, causing 30% fewer
comparisons, in this case. Every letter of indexing you add, would increase that percentage.
The indices are passed on to the binary search, to narrow the range of words in the binary search's search space. It is designed
to work with a large wordlist.txt file, with each word on it's own row, e.g.:
a
aardvark
aback
abacus
...
zygote
The list must be alphabetized and, for this example program, the words must have less than MAX-1 letters. The total wordList.txt
file must have less than 39,300 words presently.
The words array wastes a lot of memory, but saving memory isn't what this program is trying to show. I did not want to distract
from it's purpose of showing an indexed search.
This version finds every word in a text file, and searches for it, giving just a summary of the words found and not found, and
displaying the elapsed time of the program's run.
Note: This is version 0.1
The changes include:
1) All newlines have been removed from the words
2) Input is from a file, rather than from the user. The file is the Gettysburg
Address, The Art Of War, Kidnapped, or A Tale of Two Cities, right now. All the books were ebooks free from Project Guttenberg.
3) The run is timed by the program.
4) Educational message output is commented out from the active code.
5) A larger wordlist.txt file is used, and longer words can be searched for. The wordlist file
has been uploaded to Swoopshare at:
6) The words being searched for is changed from uppercase to lowercase, if needed.
7) The display is slightly more organized.
Created on the Pelles C (free) Windows C compiler.
Extensive testing has shown that indexing is of little use on this fast hardware (overclocked i7 cpu). More word comparisons are
needed, but they are very cheap, time-wise. When A Tale of Two Cities can have every word checked in less than 1/10th of a
second, it makes no sense to work with a better algorithm.
The advantage of indexing has sharply diminished with today's powerful PC's. Definitely not worth doing unless for some reason,
comparisons again become more costly.
*/
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <time.h>
#define MAX 17
#define MaxWords 39300
//longest word is 15 chars
char words[MaxWords][MAX];
int binarySearch(const char *, int, int);
int getInput(char *, int);
int main(void) {
int c, i, idx, n, lo, hi, len, inWord=0, found, notFound, inList=0;
FILE *fp, *fpget;
int index[27]={0};
char goal[MAX];
char testchar;
clock_t start, stop;
if((fp=fopen("wordlist.txt", "r")) == NULL) {
perror("Error: unable to open the wordlist.txt file");
return 0;
}
printf("\nLoading words\n");
//fill the index array
testchar='a'-1;
n=0; i=idx=0;
while((fgets(words[n], MAX, fp)) != NULL) {
len = strlen(words[n]);
if(words[n][len-1]=='\n') //remove the newline
words[n][len-1]='\0';
//words[n][strspn(words[n], "\n")]='\0';
if((words[n][0]) > testchar) {
testchar = words[n][0];
index[testchar-'a'] = n; //char - 'a' makes a=0, b=1, c=2, etc.
}
++n; //line counter
}
inList = index[26] = n-1;
fclose(fp);
printf("\nThe Index Array Table:\nLetter\tStarts At Line #\n"); //show the index array
printf("======================\n");
for(i=0;i<26;i++) {
printf(" %c: %10d\n", (i+'a'), index[i]);
}
start = clock();
putchar('\n');
//get user input
if((fpget=fopen("ArtOfWar.txt", "r")) == NULL) { //was ATaleOfTwoCities.txt "Kidnapped.txt", "ArtOfWar.txt" "Gettysburg.txt"
perror("Error: Unable to open the Gettysburg.txt file\n");
return 0;
}
//get a word from the text file
i=found=notFound=0;
while((c=fgetc(fpget)) != EOF) {
if(isalpha(c)) {
inWord=1;
goal[i]=c;
goal[i]=tolower(goal[i]);
//printf("%d: %c ", i, goal[i]); getchar();
++i;
}
else {
goal[i]='\0';
if(inWord)
inWord=0;
//printf(" goal: %s \n", goal); getchar();
//set the lo and hi indices
i = goal[0]-'a'; //<--
//printf("\n low word in the search: %s\nhigh word in the search: %s\n\n",words[index[i]], words[index[i+1]-1]); getchar();
lo = index[i]; hi = index[i+1]+1; //NOT[i+1]-1;
idx=binarySearch(goal, lo, hi);
if(idx > -1) {
// printf(" The word: %s was found: %s at index#: %d \n\n", goal, words[idx], idx);
++found;
}else {
////printf("%s\n", goal);
++notFound;
}
goal[0]='\0';
i=0;
//getchar();
}
}
fclose(fpget);
stop=clock();
printf("\n\nThe word list has %d entries in it, out of a total size of %d\n\n", inList, MaxWords);
printf("%d words were found and %d words were not found in the wordlist\n\n", found, notFound);
printf("Elaspsed time for this run: %.3lf seconds.\n", (double)(stop-start)/CLOCKS_PER_SEC);
getchar();
return 0;
}
/*
//reset for the comparison run << Reset >>
start=clock();
putchar('\n');
//get input
if((fpget=fopen("ArtOfWar.txt", "r")) == NULL) { //was "Gettysburg.txt" ATaleOfTwoCities.txt
perror("Error: Unable to open the Gettysburg.txt file\n");
return 0;
}
//get a word from the text file
i=found=notFound=idx=0;
while((c=fgetc(fpget)) != EOF) {
if(isalpha(c)) {
inWord=1;
goal[i]=c;
goal[i]=tolower(goal[i]);
//printf("%d: %c ", i, goal[i]); getchar();
++i;
}
else {
goal[i]='\0';
//putchar('\n');
if(inWord)
inWord=0;
lo=0;hi=n-1;
idx+=binarySearch(goal, lo, hi);
if(idx > -1) {
//printf(" The word: %s was found: %s at index#: %d \n\n", goal, words[idx], idx);
++found;
}else {
// //printf("%s zzz\n", goal);
++notFound;
}
goal[0]='\0';
i=0;
//getchar();
}
}
}
fclose(fpget);
stop=clock();
printf("\n\n*The word list has %d entries in it, out of a total size of %d\n\n", inList, MaxWords);
printf("%d words were found and %d words were not found in the wordlist\n\n", found, notFound);
printf("Elaspsed time for this run: %.3lf seconds.\n", (double)(stop-start)/CLOCKS_PER_SEC);
printf("\nNumber of comparisons: %d\n", idx);
getchar();
//goto Cut2;
return 0;
} */
int binarySearch(const char goal[], int left, int right) {
int lo, mid, hi,count=0;
lo=left;
hi=right;
while (lo <= hi) {
++count;
mid = (lo + hi) / 2;
/* watch how the binary search zerooes in on the goal word */
//printf("goal: %s mid: %d words[mid]: %s ", goal, mid, words[mid]);
//printf("stringcmp returns: %d\n", strcmp(goal, words[mid]));
//getchar();
if (strcmp(goal, words[mid]) < 0) //goal < words[mid]: -1
hi = mid-1;
else if(strcmp(goal, words[mid]) > 0) //goal > words[mid]: 1
lo = mid+1;
else
return mid;
}
return -1;
}
/* needs guard code to stop empty input from crashing it, etc.
In this version, this function is never called.
*/
int getInput(char goal[], int i) {
int len;
if(i) {
printf("\nEnter a word to be searched for, up to %d letters: ", MAX-2);
fgets(goal, MAX, stdin);
len = strlen(goal);
if(goal[len-1]=='\n')
goal[len-1]='\0';
printf("String Length is %d\n", strlen(goal));
}
return 0;
}
/*
// makes wordlistx.txt file
int main(void) {
int i, j, n, len;
FILE *in, *out;
char goal[30];
//char testchar;
/* if((in=fopen("TEMPDICX", "r")) == NULL) {
perror("Error: unable to open the wordlist.txt file");
return 0;
}
*/
/*
if((out=fopen("wordlistx.txt", "r")) == NULL) {
perror("Error: unable to open the wordlistx.txt file");
return 0;
}
printf("\n\n");
i=n=0;
while((fgets(goal, 30, out)) != NULL) {
//sscanf("%s", goal);
//for(j=0;j<30;j++) {
// if(isalpha(goal[j]) || goal[j]=='-')
// ;
// else
// goal[j]='\0';
//}
//fprintf(out,"%s\n",goal);
len = strlen(goal);
if(len > i) {
i = len;
printf("maxlength: %d\n", i);
}
++n;
//if(n<20) {
// printf("%s\n",goal);
// getchar();
//}
}
//fclose(in);
fclose(out);
return 0;
}
*/
Note the declaration of the words[][] array, above main(). That's a global array, and all arrays can access it without parameter passing. Generally, a bad thing, but here, it shines.