Code:
/*
This is a word searching program, but it's text based, instead
of binary. It's purpose is educational, rather than utilitarian.
It shows how to automatically build an index array, and how that
index array can then be used to cut down the search space.
The indices are passed on to the binary search, to narrow the range
of words that the binary search must handle.
It is designed to work with a large wordlist.txt file, with each
word on it's own row, e.g.:
a
aardvark
aback
abacus
etc.
The list must be alphabetized and, for this example program, the
words must have less than 16 letters, and have lowercase letters only.
The total wordList.txt file must have less than 39,200 words.
The words array wastes a lot of memory, but saving memory isn't what
this program is trying to show. I did not want to distract from it's
purpose of showing an indexed search.
This version finds every word in a text file, and searches for it,
giving just a summary of the words found and not found, and
displaying the elapsed time of the program's run.
Note: This is version 0.1
The changes include:
1) All newlines have been removed from the words
2) Input is from a file, rather than from the user. The file is the Gettysburg
Address, right now. It is available all over the net, but you can also get it
from Swoopshare here:
http://www.swoopshare.com/file/430710f78c16fee9096e92dbefd80c6f/Gettysburg.txt.html
3) The run is timed by the program.
4) Educational message output is commented out from the active code.
5) A larger wordlist.txt file is used, and longer words can be searched for. The wordlist file
has been uploaded to Swoopshare at:
http://www.swoopshare.com/file/c1d80e4e64ad601a30b3e2fdbcf09ed0/wordList.txt.html
6) The first letter of the word being searched for, is changed from uppercase to lowercase, as needed.
7) The display is slightly more organized.
--Adak
Created on the Pelles C (free) Windows C compiler.
*/
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <time.h>
#define MAX 17
//longest word is 15 chars
char words[39200][MAX];
int binarySearch(const char *, int, int);
int getInput(char *, int);
int main(void) {
int c, i, idx, n, lo, hi, len, inWord=0, found, notFound;
FILE *fp, *fpget;
int index[27]={0};
char goal[MAX];
char testchar;
clock_t start, stop;
if((fp=fopen("wordlist.txt", "r")) == NULL) {
perror("Error: unable to open the wordlist.txt file");
return 0;
}
printf("\nLoading words\n");
start = clock();
//fill the index array
testchar='a'-1;
n = 0; i = 0;
while((fgets(words[n], MAX, fp)) != NULL) {
len = strlen(words[n]);
if(words[n][len-1]=='\n') //remove the newline
words[n][len-1]='\0';
if((words[n][0]) > testchar) {
testchar = words[n][0];
index[testchar-'a'] = n; //char - 'a' makes a=0, b=1, c=2, etc.
}
++n; //line counter
}
index[26] = n-1;
fclose(fp);
printf("\nThe Index Array Table:\nLetter\tStarts At Line #\n"); //show the index array
printf("======================\n");
for(i=0;i<26;i++) {
printf(" %c: %10d\n", (i+'a'), index[i]);
}
putchar('\n');
//get user input
if((fpget=fopen("Gettysburg.txt", "r")) == NULL) {
perror("Error: Unable to open the Gettysburg.txt file\n");
return 0;
}
//get a word from the text file
i=found=notFound=0;
while((c=fgetc(fpget)) != EOF) {
if(isalpha(c)) {
inWord=1;
goal[i]=c;
//printf("%d: %c ", i, goal[i]); getchar();
++i;
}
else {
goal[i]='\0';
//putchar('\n');
if(inWord) {
inWord=0;
//printf(" goal: %s \n", goal); getchar();
/* This is the benefit of using an index. Only one index level is used here, but you can make as
many levels as you need, using the second, third, and etc., letter of the goal word.
*/
//set the first letter to lowercase
goal[0]=tolower(goal[0]);
//set the lo and hi indices
i = goal[0]-'a'; //<--
//printf("\n lowest word in the search: %s\nhighest word in the search: %s\n\n", words[index[i]], words[index[i+1]-1]);
lo = index[i]; hi = index[i+1]-1;
idx=binarySearch(goal, lo, hi);
if(idx > -1) {
//printf(" The word: %s was found: %s at index#: %d \n\n", goal, words[idx], idx);
++found;
}else {
printf("The word: %s was not found\n", goal);
++notFound;
}
goal[0]='\0';
i=0;
//getchar();
}
}
}
fclose(fpget);
stop=clock();
printf("\n\n %d words were found and %d words were not found in the wordlist\n\n", found, notFound);
printf("Elaspsed time for this run: %.3lf seconds.\n", (double)(stop-start)/CLOCKS_PER_SEC);
return 0;
}
int binarySearch(const char goal[], int lo, int hi) {
int mid;
while (lo <= hi) {
mid = (lo + hi) / 2;
/* watch how the binary search zerooes in on the goal word */
//printf("goal: %s mid: %d words[mid]: %s ", goal, mid, words[mid]);
//printf("stringcmp returns: %d\n", strcmp(goal, words[mid]));
//getchar();
if (strcmp(goal, words[mid]) < 0) //goal < words[mid]: -1
hi = mid-1;
else if(strcmp(goal, words[mid]) > 0) //goal > words[mid]: 1
lo = mid+1;
else
return mid;
}
return -1;
}
/* needs guard code to stop empty input from crashing it, etc.
In this version, this function is never called.
*/
int getInput(char goal[], int i) {
int len;
if(i) {
printf("\nEnter a word to be searched for, up to %d letters: ", MAX-2);
fgets(goal, MAX, stdin);
len = strlen(goal);
if(goal[len-1]=='\n')
goal[len-1]='\0';
printf("String Length is %d\n", strlen(goal));
}
return 0;
}