/* rss-news-search.c -- CS107 assignment 4: RSS feed indexer and search. */
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <ctype.h>
#include "url.h"
#include "bool.h"
#include "hashset.h"
#include "string.h"
#include "urlconnection.h"
#include "streamtokenizer.h"
#include "html-utils.h"
static const int STOP_BUCKET_NO = 10007;   // bucket count for the stop-word hashset (prime)
static const int WORD_BUCKET_NO = 10007;   // bucket count for the word-index hashset (prime)
static const int INIT_VECT_ALLOCATION= 4;  // initial capacity for freshly created vectors
static const signed long kHashMultiplier = -1664117991L; // linear-congruence multiplier used by both hash functions
// One indexed news article; a copy of this struct lives in the seenurls vector.
typedef struct article{
const char * title;      // headline (strdup'd in ParseArticle)
const char * server;     // server the article came from (strdup'd in ParseArticle)
const char * urlstring;  // full article URL (strdup'd in ParseArticle)
int articleindex;        // this article's position within the seenurls vector
} article;
// Element of the words hashset: maps one word to its per-article frequencies.
typedef struct wordstruct{
char* word;              // the indexed word
vector* wordfreq;        // vector of wordfreqstruct, one entry per article containing the word
} wordstruct;
// One (article, occurrence-count) pair inside a wordstruct's wordfreq vector.
typedef struct wordfreqstruct{
int articleindex;        // index into the seenurls vector
int count;               // occurrences of the word within that article
} wordfreqstruct;
static void Welcome(const char *welcomeTextFileName);
static void LoadStopWords (hashset* stopwords, const char *stopFileName);
static void BuildIndices(const char *feedsFileName, hashset* stopwords,
vector* seenurls, hashset* words);
static void ProcessFeed(const char *remoteDocumentName, hashset
*stopwords, vector* seenurls, hashset *words);
static void PullAllNewsItems(urlconnection *urlconnm, hashset *stopwords,
vector* seenurls, hashset *words);
static bool GetNextItemTag(streamtokenizer *st);
static void ProcessSingleNewsItem(streamtokenizer *st, hashset *stopwords,
vector* seenurls, hashset *words );
static void ExtractElement(streamtokenizer *st, const char *htmlTag, char dataBuffer[], int bufferLength);
static void ParseArticle(const char *articleTitle, const char
*articleDescription, const char *articleURL,
hashset *stopwords, vector* seenurls, hashset
*words );
static void ScanArticle(streamtokenizer *st, const char *articleTitle,
const char *unused, const char *articleURL,
hashset *stopwords, vector* seenurls, hashset
*words, article* thisarticle);
static void UpdateIndices(char* word, hashset *words, hashset
*stopwords, vector *seenurls, article*
thisarticle );
static void QueryIndices(hashset *words, vector*seenurls, hashset*stopwords);
static void ProcessResponse(char *word, hashset*words, vector*
seenurls, hashset *stopwords);
static bool WordIsWellFormed(const char *word);
static int StopWordHash(const void *s, int numBuckets);
static void StringFree(void * wordpoint);
static int WordStructCompare(const void *one, const void *two);
static void WordStructFree(void *point);
static int ArticleCompare(const void *one, const void *two);
static int StringComparison (const void* one, const void* two);
static void ArticleFree(void *thiselem);
static void UpdateArticleVector(article* thisarticle, vector*
seenurls );
static void UpdateWordsHashset(hashset *words, void * lowerword);
static void AddWordArticleVect (vector *seenurls, void * lowerword,
article* thisarticle);
static int WordFreqCompare(const void * one, const void * two);
static void GetArticlesContainingWord( char *word, hashset *words,
vector* seenurls, void* wordarr);
static void MapOffsetToWordFreq(void *offsetaddr, void* word);
static int WordStructHash(const void *s, int numBuckets) ;
static const char *const kWelcomeTextFile = "/usr/class/cs107/assignments/assn-4-rss-news-search-data/welcome.txt";
static const char *const kDefaultFeedsFile =
"/usr/class/cs107/assignments/assn-4-rss-news-search-data/rss-feeds-tiny.txt";
static const char *const kStopFile =
"/usr/class/cs107/assignments/assn-4-rss-news-search-data/stop-words.txt";
/**
* Function: main
* --------------
* Serves as the entry point of the full application.
* You'll want to update main to declare several hashsets--
* one for stop words, another for previously seen urls, etc--
* and pass them (by address) to BuildIndices and QueryIndices.
* In fact, you'll need to extend many of the prototypes of the
* supplied helpers functions to take one or more hashset *s.
*
* Think very carefully about how you're going to keep track of
* all of the stop words, how you're going to keep track of
* all the previously seen articles, and how you're going to
* map words to the collection of news articles where that
* word appears.
*/
int main(int argc, char **argv)
{
// NOTE(review): words and seenurls are handed to BuildIndices without being
// initialized here -- they are constructed inside PullAllNewsItems, so if no
// feed ever yields a 200 response, VectorLength(&seenurls) below reads an
// uninitialized vector. TODO confirm and hoist the HashSetNew/VectorNew
// calls up to main.
hashset stopwords;
hashset words;
vector seenurls;
Welcome(kWelcomeTextFile);
//printf("before stop \n");
LoadStopWords(&stopwords, kStopFile);
//char *word = "the";
//char* dummy = word;
//void *addr = HashSetLookup(&stopwords, &dummy);
//assert(addr != NULL);
//printf("ADDR POINTER: %p\n", addr);
//printf("calling build indices \n");
// First CLI argument overrides the default feeds file.
BuildIndices((argc == 1) ? kDefaultFeedsFile : argv[1],
&stopwords,&seenurls, &words);
printf("\n");
printf("here is the # of seenurls %d \n",VectorLength(&seenurls));
// NOTE(review): the query loop and cleanup are disabled; the commented
// dispose calls also pass structs by value where the APIs appear to take
// pointers (&stopwords etc.) -- verify before re-enabling.
//QueryIndices(&words,&seenurls, &stopwords);
//HashSetDispose(stopwords);
//HashSetDispose(words);
//VectorDispose(seenurls);
return 0;
}
/**
* Function: Welcome
* -----------------
* Displays the contents of the specified file, which
* holds the introductory remarks to be printed every time
* the application launches. This type of overhead may
* seem silly, but by placing the text in an external file,
* we can change the welcome text without forcing a recompilation and
* build of the application. It's as if welcomeTextFileName
* is a configuration file that travels with the application.
*/
static const char *const kNewLineDelimiters = "\r\n";
static void Welcome(const char *welcomeTextFileName)
{
  char line[1024];
  streamtokenizer tokenizer;
  FILE *fp = fopen(welcomeTextFileName, "r");
  assert(fp != NULL);
  // Tokenize on newlines so each token is one line of the greeting.
  STNew(&tokenizer, fp, kNewLineDelimiters, true);
  while (STNextToken(&tokenizer, line, sizeof(line)))
    printf("%s\n", line);
  STDispose(&tokenizer); // STDispose never closes the stream; we opened it, so we close it
  fclose(fp);
}
static int WordStructCompare(const void *one, const void *two){
char* oneword = ((wordstruct*)one)-> word;
char* twoword = ((wordstruct*)two)->word;
return strcasecmp( oneword, twoword);
}
static int WordFreqCompare (const void*one, const void*two){
// printf("freq compare \n");
return (((wordfreqstruct*)one)->articleindex- ((wordfreqstruct*)two)->articleindex);
}
// Hashset free function for wordstruct elements.
// NOTE(review): this releases only the wordfreq pointer itself. The vector's
// internal storage is never VectorDispose'd and the word string is not freed,
// so both appear to leak -- TODO confirm the intended ownership scheme before
// changing this (UpdateIndices is the producer of these fields).
static void WordStructFree(void*point){
free((*(wordstruct*)point).wordfreq); //free the thing pointed to (the actual word)
}
/*
 * Case-insensitive comparison for hashset/vector elements of type char*,
 * which are therefore handed to us by address (i.e. as char**).
 */
static int StringComparison(const void *one, const void *two)
{
  const char *first = *(const char **)one;
  const char *second = *(const char **)two;
  return strcasecmp(first, second);
}
//Returns zero if they are equal and returns -1 if they are not.
//Question: Should this return 1 in any case?
static int ArticleCompare(const void *one, const void *two){
if (strcmp( ( (article*) one)->title , ( (article*) two)->title) == 0 &&
(strcmp(((article*)one)->server, ((article*)two)->server) ==0)) return 0;
else if (strcmp(((article*)one)->urlstring, ((article*)two)->urlstring) ==0) return 0;
else return -1;
}
static void ArticleFree(void * thiselem){
// free( ( *(article*) thiselem).wordfreqpoint);
//Question: Why does it seg fault when we call this?
//free(( *(article*) thiselem).server);
//free((*(article*)thiselem).urlstring);
// free((*(article*)thiselem).title);
// free((*(article*)thiselem).articleindex);
}
/**
* StopWordHash
* ----------
* This function adapted from Eric Roberts' "The Art and Science of C"
* It takes a string and uses it to derive a hash code, which
* is an integer in the range [0, numBuckets). The hash code is computed
* using a method called "linear congruence." A similar function using this
* method is described on page 144 of Kernighan and Ritchie. The choice of
* the value for the kHashMultiplier can have a significant effect on the
* performance of the algorithm, but not on its correctness.
* This hash function has the additional feature of being case-insensitive,
* hashing "Peter Pawlowski" and "PETER PAWLOWSKI" to the same code.
*/
/*
 * Hashes a char* element (passed by address, hence the char** cast) with the
 * linear-congruence scheme described above, case-insensitively.
 * Fixes: strlen was re-evaluated on every loop iteration, and tolower was fed
 * a plain char, which is undefined for negative values -- cast through
 * unsigned char per the <ctype.h> contract.
 */
static int StopWordHash(const void *s, int numBuckets)
{
  const char *word = *(const char **)s; // elements are char*, handed to us by address
  size_t length = strlen(word);         // hoisted out of the loop
  unsigned long hashcode = 0;
  for (size_t i = 0; i < length; i++)
    hashcode = hashcode * kHashMultiplier + tolower((unsigned char)word[i]);
  return hashcode % numBuckets;
}
/**
* WordStructHash
* ----------
* This function adapted from Eric Roberts' "The Art and Science of C"
* It takes a string and uses it to derive a hash code, which
* is an integer in the range [0, numBuckets). The hash code is computed
* using a method called "linear congruence." A similar function using this
* method is described on page 144 of Kernighan and Ritchie. The choice of
* the value for the kHashMultiplier can have a significant effect on the
* performance of the algorithm, but not on its correctness.
* This hash function has the additional feature of being case-insensitive,
* hashing "Peter Pawlowski" and "PETER PAWLOWSKI" to the same code.
*/
//doesn't match prototype for void*
//cast pointer to char** and then
static int WordStructHash(const void *s, int numBuckets)
{
// printf("calling wordstruct hash \n");
int i;
unsigned long hashcode = 0;
wordstruct* x = (wordstruct*)s;
char* h = x-> word;
for (i = 0; i < strlen(h); i++)
hashcode = hashcode * kHashMultiplier + tolower(h[i]);
return hashcode % numBuckets;
}
/* Releases the heap string held by a char* element (passed by address). */
static void StringFree(void *wordpoint)
{
  char **slot = wordpoint;
  free(*slot);
}
/*
//Compares wordstruct's by seeing if the words match up.
static int FreqCompare(const void * one, const void * two){
//printf("calls word struct compare \n");
return strcmp((*(freq*)one).articleindex, (*(freq*)two).articleindex);
}*/
/*
 * LoadStopWords
 * -------------
 * Constructs the stopwords hashset and fills it with one heap-allocated copy
 * of every word in stopFileName (one word per line).
 */
static void LoadStopWords(hashset *stopwords, const char *stopFileName)
{
  char token[1024];
  streamtokenizer tokenizer;
  HashSetNew(stopwords, sizeof(char *), STOP_BUCKET_NO, StopWordHash,
             StringComparison, StringFree);
  FILE *fp = fopen(stopFileName, "r");
  assert(fp != NULL);
  STNew(&tokenizer, fp, kNewLineDelimiters, true);
  while (STNextToken(&tokenizer, token, sizeof(token))) {
    char *copy = strdup(token); // the set owns its copy; freed by StringFree
    HashSetEnter(stopwords, &copy);
  }
  STDispose(&tokenizer);
  fclose(fp);
  printf("\n");
}
/**
* Function: BuildIndices
* ----------------------
* As far as the user is concerned, BuildIndices needs to read each and every
* one of the feeds listed in the specied feedsFileName, and for each feed parse
* content of all referenced articles and store the content in the hashset of indices.
* Each line of the specified feeds file looks like this:
*
* <feed name>: <URL of remore xml document>
*
* Each iteration of the supplied while loop parses and discards the feed name (it's
* in the file for humans to read, but our aggregator doesn't care what the name is)
* and then extracts the URL. It then relies on ProcessFeed to pull the remote
* document and index its content.
*/
/*
 * Walks the feeds file line by line.  Each line reads
 * "<feed name>: <url>"; the name before the colon is for humans only, so it
 * is skipped and the URL is handed to ProcessFeed, once per feed.
 */
static void BuildIndices(const char *feedsFileName, hashset *stopwords,
                         vector *seenurls, hashset *words)
{
  printf("start build indices \n");
  char feedURL[1024];
  streamtokenizer tokenizer;
  FILE *fp = fopen(feedsFileName, "r");
  assert(fp != NULL);
  STNew(&tokenizer, fp, kNewLineDelimiters, true);
  while (STSkipUntil(&tokenizer, ":") != EOF) { // discard the human-readable feed name
    STSkipOver(&tokenizer, ": ");               // then the colon and trailing whitespace
    STNextToken(&tokenizer, feedURL, sizeof(feedURL));
    ProcessFeed(feedURL, stopwords, seenurls, words);
  }
  STDispose(&tokenizer);
  fclose(fp);
  printf("here is the size of the seenurls vector, %d \n",
         VectorLength(seenurls));
  printf("here is the size of the words hashset, %d \n", HashSetCount(words));
  printf("gets to end of build indices");
}
/**
* Function: ProcessFeed
* ---------------------
* ProcessFeed locates the specified RSS document, and if a (possibly redirected) connection to that remote
* document can be established, then PullAllNewsItems is tapped to actually read the feed. Check out the
* documentation of the PullAllNewsItems function for more information, and inspect the documentation
* for ParseArticle or information about what the different response codes mean.
*/
/*
 * ProcessFeed
 * -----------
 * Resolves the RSS document's URL and dispatches on the HTTP response code:
 * 200 hands the stream to PullAllNewsItems, 301/302 follow the redirect by
 * recursing with the new URL, and anything else is reported and skipped.
 */
static void ProcessFeed(const char *remoteDocumentName, hashset *stopwords,
                        vector *seenurls, hashset *words)
{
  url u;
  urlconnection urlconn;
  URLNewAbsolute(&u, remoteDocumentName);
  URLConnectionNew(&urlconn, &u);
  switch (urlconn.responseCode) {
    case 0:
      printf("Unable to connect to \"%s\". Ignoring...", u.serverName);
      break;
    case 200:
      PullAllNewsItems(&urlconn, stopwords, seenurls, words);
      break;
    case 301: /* fallthrough: both redirect codes are handled identically */
    case 302:
      ProcessFeed(urlconn.newUrl, stopwords, seenurls, words);
      break;
    default:
      printf("Connection to \"%s\" was established, but unable to retrieve \"%s\". [response code: %d, response message:\"%s\"]\n",
             u.serverName, u.fileName, urlconn.responseCode, urlconn.responseMessage);
      break;
  }
  // BUG FIX: these two calls used to sit inside the switch body after the
  // last break, where no control path could reach them, so every connection
  // and url leaked.  They now run on all paths.
  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
/**
* Function: PullAllNewsItems
* --------------------------
* Steps though the data of what is assumed to be an RSS feed identifying the names and
* URLs of online news articles. Check out "datafiles/sample-rss-feed.txt" for an idea of what an
* RSS feed from the www.nytimes.com (or anything other server that syndicates is stories).
*
* PullAllNewsItems views a typical RSS feed as a sequence of "items", where each item is detailed
* using a generalization of HTML called XML. A typical XML fragment for a single news item will certainly
* adhere to the format of the following example:
*
* <item>
* <title>At Installation Mass, New Pope Strikes a Tone of Openness</title>
* <link>http://www.nytimes.com/2005/04/24/international/worldspecial2/24cnd-pope.html</link>
* <description>The Mass, which drew 350,000 spectators, marked an important moment in the transformation of Benedict XVI.</description>
* <author>By IAN FISHER and LAURIE GOODSTEIN</author>
* <pubDate>Sun, 24 Apr 2005 00:00:00 EDT</pubDate>
* <guid isPermaLink="false">http://www.nytimes.com/2005/04/24/international/worldspecial2/24cnd-pope.html</guid>
* </item>
*
* PullAllNewsItems reads and discards all characters up through the opening <item> tag (discarding the <item> tag
* as well, because once it's read and indentified, it's been pulled,) and then hands the state of the stream to
* ProcessSingleNewsItem, which handles the job of pulling and analyzing everything up through and including the </item>
* tag. PullAllNewsItems processes the entire RSS feed and repeatedly advancing to the next <item> tag and then allowing
* ProcessSingleNewsItem do process everything up until </item>.
*/
static const char *const kTextDelimiters = " \t\n\r\b!@$%^*()_+={[}]|\\'\":;/?.>,<~`";
/*
 * PullAllNewsItems
 * ----------------
 * Scans an open RSS feed connection, advancing to each <item> tag and
 * delegating the item body to ProcessSingleNewsItem.
 *
 * The words hashset and seenurls vector are lazily constructed here.
 * BUG FIX: they used to be re-created unconditionally on every feed, which
 * clobbered (and leaked) the entire index built from earlier feeds.  A
 * one-time guard keeps the lazy-initialization scheme while constructing
 * them exactly once per run.
 */
static void PullAllNewsItems(urlconnection *urlconn, hashset *stopwords,
                             vector *seenurls, hashset *words)
{
  static bool initialized = false;
  if (!initialized) {
    HashSetNew(words, sizeof(wordstruct), WORD_BUCKET_NO, WordStructHash,
               WordStructCompare, WordStructFree);
    VectorNew(seenurls, sizeof(article), ArticleFree, INIT_VECT_ALLOCATION);
    initialized = true;
  }
  streamtokenizer st;
  STNew(&st, urlconn->dataStream, kTextDelimiters, false);
  while (GetNextItemTag(&st)) { // true => an <item ...> tag was just consumed
    ProcessSingleNewsItem(&st, stopwords, seenurls, words);
  }
  STDispose(&st);
}
/**
* Function: GetNextItemTag
* ------------------------
* Works more or less like GetNextTag below, but this time
* we're searching for an <item> tag, since that marks the
* beginning of a block of HTML that's relevant to us.
*
* Note that each tag is compared to "<item" and not "<item>".
* That's because the item tag, though unlikely, could include
* attributes and perhaps look like any one of these:
*
* <item>
* <item rdf:about="Latin America reacts to the Vatican">
* <item requiresPassword=true>
*
* We're just trying to be as general as possible without
* going overboard. (Note that we use strncasecmp so that
* string comparisons are case-insensitive. That's the case
* throughout the entire code base.)
*/
static const char *const kItemTagPrefix = "<item";
/*
 * GetNextItemTag
 * --------------
 * Advances the tokenizer past HTML/XML tags until one beginning with
 * "<item" is consumed (attributes are tolerated, which is why only the
 * prefix is compared, case-insensitively).  Returns true on success and
 * false once the stream is exhausted.
 */
static bool GetNextItemTag(streamtokenizer *st)
{
  char tag[1024];
  size_t prefixLength = strlen(kItemTagPrefix);
  while (GetNextTag(st, tag, sizeof(tag))) {
    if (strncasecmp(tag, kItemTagPrefix, prefixLength) == 0)
      return true;
  }
  return false;
}
/**
* Function: ProcessSingleNewsItem
* -------------------------------
* Code which parses the contents of a single <item> node within an RSS/XML feed.
* At the moment this function is called, we're to assume that the <item> tag was just
* read and that the streamtokenizer is currently pointing to everything else, as with:
*
* <title>Carrie Underwood takes American Idol Crown</title>
* <description>Oklahoma farm girl beats out Alabama rocker Bo Bice and 100,000 other contestants to win competition.</description>
* <link>http://www.nytimes.com/frontpagenews/2841028302.html</link>
* </item>
*
* ProcessSingleNewsItem parses everything up through and including the </item>, storing the title, link, and article
* description in local buffers long enough so that the online new article identified by the link can itself be parsed
* and indexed. We don't rely on <title>, <link>, and <description> coming in any particular order. We do asssume that
* the link field exists (although we can certainly proceed if the title and article descrption are missing.) There
* are often other tags inside an item, but we ignore them.
*/
static const char *const kItemEndTag = "</item>";
static const char *const kTitleTagPrefix = "<title";
static const char *const kDescriptionTagPrefix = "<description";
static const char *const kLinkTagPrefix = "<link";
/*
 * ProcessSingleNewsItem
 * ---------------------
 * Consumes everything up through the closing </item> tag, capturing the
 * title, description, and link in any order.  If a link was found, the
 * article it names is fetched and indexed via ParseArticle; otherwise the
 * item is dropped, since there is nothing to follow.
 */
static void ProcessSingleNewsItem(streamtokenizer *st, hashset *stopwords,
                                  vector *seenurls, hashset *words)
{
  char tag[1024];
  char title[1024];
  char description[1024];
  char link[1024];
  title[0] = description[0] = link[0] = '\0';
  while (GetNextTag(st, tag, sizeof(tag)) && strcasecmp(tag, kItemEndTag) != 0) {
    // The three prefixes are mutually exclusive, so else-if chaining is safe.
    if (strncasecmp(tag, kTitleTagPrefix, strlen(kTitleTagPrefix)) == 0)
      ExtractElement(st, tag, title, sizeof(title));
    else if (strncasecmp(tag, kDescriptionTagPrefix, strlen(kDescriptionTagPrefix)) == 0)
      ExtractElement(st, tag, description, sizeof(description));
    else if (strncasecmp(tag, kLinkTagPrefix, strlen(kLinkTagPrefix)) == 0)
      ExtractElement(st, tag, link, sizeof(link));
  }
  if (link[0] == '\0') return; // no URL, so the item cannot be indexed
  ParseArticle(title, description, link, stopwords, seenurls, words);
}
/*
* Function: ExtractElement
* ------------------------
* Potentially pulls text from the stream up through and including the matching end tag. It assumes that
* the most recently extracted HTML tag resides in the buffer addressed by htmlTag. The implementation
* populates the specified data buffer with all of the text up to but not including the opening '<' of the
* closing tag, and then skips over all of the closing tag as irrelevant. Assuming for illustration purposes
* that htmlTag addresses a buffer containing "<description" followed by other text, these three scenarios are
* handled:
*
* Normal Situation: <description>http://some.server.com/someRelativePath.html</description>
* Uncommon Situation: <description></description>
* Uncommon Situation: <description/>
*
* In each of the second and third scenarios, the document has omitted the data. This is not uncommon
* for the description data to be missing, so we need to cover all three scenarious (I've actually seen all three.)
* It would be quite unusual for the title and/or link fields to be empty, but this handles those possibilities too.
*/
static void ExtractElement(streamtokenizer *st, const char *htmlTag, char dataBuffer[], int bufferLength)
{
  size_t tagLength = strlen(htmlTag);
  assert(htmlTag[tagLength - 1] == '>');
  if (htmlTag[tagLength - 2] == '/') return; // self-closing tag, e.g. <description/> -- no data follows
  STNextTokenUsingDifferentDelimiters(st, dataBuffer, bufferLength, "<");
  RemoveEscapeCharacters(dataBuffer);
  if (dataBuffer[0] == '<') dataBuffer[0] = '\0'; // empty element, e.g. <description></description>
  STSkipUntil(st, ">"); // discard the closing tag entirely
  STSkipOver(st, ">");
}
/*
static void WordFreqFree (void * thiselem){
free ((wordfreqstruct*)thiselem);
}*/
/**
* Function: ParseArticle
* ----------------------
* Attempts to establish a network connect to the news article identified by the three
* parameters. The network connection is either established of not. The implementation
* is prepared to handle a subset of possible (but by far the most common) scenarios,
* and those scenarios are categorized by response code:
*
*/
/*
 * ParseArticle
 * ------------
 * Connects to the article named by articleURL and dispatches on the
 * response code: 200 indexes the content (unless the article has been seen
 * before), 301/302 retry against the redirected URL, and everything else is
 * reported and skipped.
 *
 * Fixes over the previous version: the server string was strdup'd twice
 * (leaking one copy); the already-seen path returned early without
 * disposing the connection/url; and on every non-indexed path the three
 * strdup'd strings leaked.  Ownership of the strings passes to the seenurls
 * vector only when the article is appended (ArticleFree releases them);
 * otherwise they are freed here.
 */
static void ParseArticle(const char *articleTitle, const char
                         *articleDescription, const char *articleURL,
                         hashset *stopwords, vector *seenurls, hashset *words)
{
  url u;
  urlconnection urlconn;
  streamtokenizer st;
  article thisarticle;
  int apos;
  bool stored = false; // set true once the vector owns thisarticle's strings
  URLNewAbsolute(&u, articleURL);
  URLConnectionNew(&urlconn, &u);
  thisarticle.title = strdup(articleTitle);
  thisarticle.urlstring = strdup(u.fullName);
  thisarticle.server = strdup(u.serverName); // BUG FIX: was duplicated twice, leaking one copy
  switch (urlconn.responseCode) {
    case 0:
      printf("Unable to connect to \"%s\". Domain name or IP address is nonexistent.\n", articleURL);
      break;
    case 200:
      apos = VectorSearch(seenurls, &thisarticle, ArticleCompare, 0, false);
      if (apos != -1) {
        // BUG FIX: this path used to return without disposing the connection.
        printf("[Ignoring \"%s\": we've seen it before.] ", thisarticle.title);
        break;
      }
      printf("[%s] Indexing \"%s\" \n", thisarticle.server, thisarticle.title);
      thisarticle.articleindex = VectorLength(seenurls);
      VectorAppend(seenurls, &thisarticle);
      stored = true; // the vector's copy now owns the three strings
      STNew(&st, urlconn.dataStream, kTextDelimiters, false);
      ScanArticle(&st, articleTitle, articleDescription, articleURL,
                  stopwords, seenurls, words, &thisarticle);
      STDispose(&st);
      break;
    case 301:
    case 302:
      // Redirected: index under the new URL; the recursive call makes its
      // own string copies, so this attempt's copies are freed below.
      ParseArticle(articleTitle, articleDescription, urlconn.newUrl,
                   stopwords, seenurls, words);
      break;
    default:
      printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n", articleTitle, u.serverName, urlconn.responseCode);
      break;
  }
  if (!stored) {
    free((char *)thisarticle.title);
    free((char *)thisarticle.server);
    free((char *)thisarticle.urlstring);
  }
  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}
/**
* Function: ScanArticle
* ---------------------
* Parses the specified article, skipping over all HTML tags, and counts the numbers
* of well-formed words that could potentially serve as keys in the set of indices.
* Once the full article has been scanned, the number of well-formed words is
* printed, and the longest well-formed word we encountered along the way
* is printed as well.
*
* This is really a placeholder implementation for what will ultimately be
* code that indexes the specified content.
*/
/*
 * ScanArticle
 * -----------
 * Tokenizes the article body, skipping HTML markup, and feeds every
 * well-formed word to UpdateIndices so it is counted against thisarticle.
 */
static void ScanArticle(streamtokenizer *st, const char *articleTitle,
                        const char *unused, const char *articleURL,
                        hashset *stopwords, vector* seenurls, hashset
                        *words, article* thisarticle )
{
  char token[1024];
  while (STNextToken(st, token, sizeof(token))) {
    if (strcasecmp(token, "<") == 0) {
      SkipIrrelevantContent(st); // declared in html-utils.h; swallows the tag
      continue;
    }
    RemoveEscapeCharacters(token);
    if (WordIsWellFormed(token)) {
      printf("_________________this line indicates a new word______________________\n");
      UpdateIndices(token, words, stopwords, seenurls, thisarticle);
    }
  }
}
/*
 * UpdateIndices
 * -------------
 * Records one occurrence of word against thisarticle.  Stop words are
 * ignored.  A brand-new word is entered into the words hashset as a
 * wordstruct owning a heap copy of the word and a freshly allocated
 * frequency vector; for a known word the per-article counter is bumped, or
 * a new per-article entry is appended if this article hasn't used the word
 * before.  The seenurls parameter is unused but retained so the call site's
 * signature is unchanged.
 *
 * Fixes over the previous version: it dereferenced the NULL pointer
 * returned by the *failed* lookup after HashSetEnter; it never allocated
 * the wordfreq vector; it stored a pointer into the caller's reused stack
 * buffer instead of copying the word; its pointer arithmetic was scaled by
 * sizeof twice; the pos == -1 case indexed out of bounds; and a stray
 * close-comment token followed the function.
 */
static void UpdateIndices(char *word, hashset *words, hashset *stopwords,
                          vector *seenurls, article *thisarticle)
{
  (void)seenurls;
  if (HashSetLookup(stopwords, &word) != NULL) return; // stop word: not indexed
  wordstruct dummy;
  dummy.word = word;
  wordstruct *found = HashSetLookup(words, &dummy);
  if (found == NULL) {
    wordstruct fresh;
    fresh.word = strdup(word); // own a copy; the caller's buffer is reused per token
    fresh.wordfreq = malloc(sizeof(vector));
    assert(fresh.wordfreq != NULL);
    VectorNew(fresh.wordfreq, sizeof(wordfreqstruct), NULL, INIT_VECT_ALLOCATION);
    wordfreqstruct first;
    first.articleindex = thisarticle->articleindex;
    first.count = 1;
    VectorAppend(fresh.wordfreq, &first);
    HashSetEnter(words, &fresh);
  } else {
    wordfreqstruct probe;
    probe.articleindex = thisarticle->articleindex;
    int pos = VectorSearch(found->wordfreq, &probe, WordFreqCompare, 0, false);
    if (pos == -1) {
      // first occurrence of this word within this particular article
      probe.count = 1;
      VectorAppend(found->wordfreq, &probe);
    } else {
      wordfreqstruct *entry = VectorNth(found->wordfreq, pos);
      entry->count++;
    }
  }
}
/**
* Function: QueryIndices
* ----------------------
* Standard query loop that allows the user to specify a single search term, and
* then proceeds (via ProcessResponse) to list up to 10 articles (sorted by relevance)
* that contain that word.
*/
/*
 * QueryIndices
 * ------------
 * Interactive loop: reads one search term per line from stdin and hands it
 * to ProcessResponse, stopping on a blank line or end-of-input.
 *
 * Fixes: the fgets return value was ignored, so EOF re-processed a stale
 * buffer forever; and response[strlen(response) - 1] indexed position -1
 * whenever the buffer held an empty string.  strcspn trims the newline (if
 * any) without either hazard.
 */
static void QueryIndices(hashset *words, vector *seenurls, hashset *stopwords)
{
  char response[1024];
  while (true) {
    printf("Please enter a single query term that might be in our set of indices [enter to quit]: ");
    if (fgets(response, sizeof(response), stdin) == NULL) break; // EOF or read error
    response[strcspn(response, "\r\n")] = '\0';
    if (strcasecmp(response, "") == 0) break;
    ProcessResponse(response, words, seenurls, stopwords);
  }
}
/*
 * GetArticlesContainingWord
 * -------------------------
 * Prints every article recorded for wordarr (a wordstruct* pulled from the
 * words hashset, or NULL if the word was never indexed) together with its
 * occurrence count, resolving article indices through the seenurls vector.
 *
 * Fixes: the old code cast the vector structs themselves to element
 * pointers and then scaled the pointer arithmetic by sizeof a second time;
 * elements are now fetched through VectorNth, which does both correctly.
 */
static void GetArticlesContainingWord(char *word, hashset *words,
                                      vector *seenurls, void *wordarr)
{
  (void)words;
  if (wordarr == NULL) {
    printf("None of today's news articles contain the word \"%s\".\n",
           word);
    return;
  }
  wordstruct *entry = wordarr;
  int nmatches = VectorLength(entry->wordfreq);
  printf("Nice! we found %d articles that include the word \"%s\".",
         nmatches, word);
  for (int i = 0; i < nmatches; i++) {
    wordfreqstruct *freq = VectorNth(entry->wordfreq, i);
    article *hit = VectorNth(seenurls, freq->articleindex);
    printf("%d.) \"%s\" [search term occurs %d times] \n",
           i, hit->title, freq->count);
    printf("\"%s\" \n", hit->urlstring);
  }
}
/*
* Function: ProcessResponse
* -------------------------
* Placeholder implementation for what will become the search of a set of indices
* for a list of web documents containing the specified word.
*/
/*
 * ProcessResponse
 * ---------------
 * Validates the user's query term, rejects stop words, then looks the word
 * up in the index and hands whatever was found (possibly NULL) to
 * GetArticlesContainingWord, which reports the not-found case itself.
 *
 * Fixes: the old code wrote through an *uninitialized* wordstruct pointer
 * (undefined behavior) and then passed the address of that pointer -- a
 * wordstruct** -- as the hashset lookup key; a stack wordstruct is used as
 * the key now.
 */
static void ProcessResponse(char *word, hashset *words, vector *seenurls,
                            hashset *stopwords)
{
  if (!WordIsWellFormed(word)) {
    printf("We won't be allowing words like \"%s\" into our set of indices.\n", word);
    return;
  }
  if (HashSetLookup(stopwords, &word) != NULL) {
    printf("Too common a word to be taken seriously. Try something more specific. \n");
    return;
  }
  wordstruct probe;
  probe.word = word;
  void *wordarr = HashSetLookup(words, &probe);
  GetArticlesContainingWord(word, words, seenurls, wordarr);
}
/**
* Predicate Function: WordIsWellFormed
* ------------------------------------
* Before we allow a word to be inserted into our map
* of indices, we'd like to confirm that it's a good search term.
* One could generalize this function to allow different criteria, but
* this version hard codes the requirement that a word begin with
* a letter of the alphabet and that all letters are either letters, numbers,
* or the '-' character.
*/
/*
 * WordIsWellFormed
 * ----------------
 * Returns true when word is an acceptable search term: first character a
 * letter, every remaining character a letter, digit, or '-'.  The empty
 * string deliberately passes, matching the original behavior relied on by
 * the callers.
 *
 * Fixes: strlen was recomputed on every loop iteration, and the <ctype.h>
 * calls were fed plain char, which is undefined for negative values --
 * arguments are now cast through unsigned char.
 */
static bool WordIsWellFormed(const char *word)
{
  size_t length = strlen(word);
  if (length == 0) return true;
  if (!isalpha((unsigned char)word[0])) return false;
  for (size_t i = 1; i < length; i++)
    if (!isalnum((unsigned char)word[i]) && word[i] != '-') return false;
  return true;
}