I am trying to initialize a vector with VectorNew through a pointer, and I am segfaulting at the VectorNew call with no clue why. (The goal of this program is to go through a website, parse out the words, and populate a hashset with them so that you can retrieve how many times a certain word appears in an article.) The function I am having issues with is BuildIndices; the crash itself happens at the VectorNew call inside UpdateIndices, which BuildIndices reaches through ProcessFeed, PullAllNewsItems, ProcessSingleNewsItem, ParseArticle, and ScanArticle.
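
For reference, here is a minimal sketch of the calling pattern I believe VectorNew expects (assuming the CS107 vector interface this assignment uses): the vector * you hand it has to point at storage that already exists, because VectorNew only initializes that struct rather than allocating it.

Code:
vector v;                              /* storage already exists on the stack */
VectorNew(&v, sizeof(int), NULL, 4);   /* fine: &v points at real storage */

vector *vp = malloc(sizeof(vector));   /* or allocate the struct yourself... */
VectorNew(vp, sizeof(int), NULL, 4);   /* ...and then initialize it */

The full program is below; UpdateIndices is near the bottom.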

Code:
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <ctype.h>
#include "url.h"
#include "bool.h"
#include "hashset.h" 
#include "string.h"
#include "urlconnection.h"
#include "streamtokenizer.h"
#include "html-utils.h"

static const int STOP_BUCKET_NO = 10007;
static const int WORD_BUCKET_NO = 10007; 
static const int INIT_VECT_ALLOCATION= 4;
static const signed long kHashMultiplier = -1664117991L;

typedef struct article{
  const char * title;
  const char * server;
  const char * urlstring;
  int articleindex;
} article;

typedef struct  wordstruct{
  char* word;
  vector* wordfreq;
} wordstruct;

typedef struct wordfreqstruct{
  int articleindex;
  int count;
} wordfreqstruct;
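
/*
 * How the pieces fit together (as I understand the design):
 *   seenurls -- a vector of article structs; articleindex is an index into it
 *   words    -- a hashset of wordstructs; each wordstruct owns a vector of
 *               wordfreqstructs, one entry per article the word appears in,
 *               whose articleindex points back into seenurls
 */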
  

static void Welcome(const char *welcomeTextFileName);
static void LoadStopWords (hashset* stopwords, const char *stopFileName);
static void BuildIndices(const char *feedsFileName, hashset* stopwords,
			 vector* seenurls, hashset* words);
static void ProcessFeed(const char *remoteDocumentName, hashset
			*stopwords, vector* seenurls, hashset *words);
static void PullAllNewsItems(urlconnection *urlconnm, hashset *stopwords,
			     vector* seenurls, hashset *words);
static bool GetNextItemTag(streamtokenizer *st);
static void ProcessSingleNewsItem(streamtokenizer *st, hashset *stopwords,
				  vector* seenurls, hashset *words   );
static void ExtractElement(streamtokenizer *st, const char *htmlTag, char dataBuffer[], int bufferLength);
static void ParseArticle(const char *articleTitle, const char
			 *articleDescription, const char *articleURL,
			 hashset *stopwords, vector* seenurls, hashset
			 *words                    );
static void ScanArticle(streamtokenizer *st, const char *articleTitle,
			const char *unused, const char *articleURL,
			hashset *stopwords, vector* seenurls, hashset
			*words, article* thisarticle);
static void UpdateIndices(char* word, hashset *words, hashset
			  *stopwords, vector *seenurls, article*
			  thisarticle                    );
static void QueryIndices(hashset *words, vector*seenurls, hashset*stopwords);
static void ProcessResponse(char *word, hashset*words, vector*
seenurls, hashset *stopwords);
static bool WordIsWellFormed(const char *word);
static int StopWordHash(const void *s, int numBuckets);
static void  StringFree(void * wordpoint);
static int WordStructCompare(const void *one, const void *two);
static void WordStructFree(void *point);
static int ArticleCompare(const void *one, const void *two);
static int StringComparison (const void* one, const void* two);
static void ArticleFree(void *thiselem);
static void UpdateArticleVector(article* thisarticle,  vector*
seenurls                   );
static void UpdateWordsHashset(hashset *words, void * lowerword);
static void AddWordArticleVect (vector *seenurls, void * lowerword,
				article* thisarticle);
static int WordFreqCompare(const void * one, const void * two);
static void GetArticlesContainingWord( char *word, hashset *words,
				      vector* seenurls, void* wordarr);
static void MapOffsetToWordFreq(void *offsetaddr, void* word);
static int WordStructHash(const void *s, int numBuckets) ;



static const char *const kWelcomeTextFile = "/usr/class/cs107/assignments/assn-4-rss-news-search-data/welcome.txt";
static const char *const kDefaultFeedsFile =
"/usr/class/cs107/assignments/assn-4-rss-news-search-data/rss-feeds-tiny.txt";
static const char *const kStopFile =
  "/usr/class/cs107/assignments/assn-4-rss-news-search-data/stop-words.txt";


/**
 * Function: main
 * --------------
 * Serves as the entry point of the full application.
 * You'll want to update main to declare several hashsets--
 * one for stop words, another for previously seen urls, etc--
 * and pass them (by address) to BuildIndices and QueryIndices.
 * In fact, you'll need to extend many of the prototypes of the
 * supplied helper functions to take one or more hashset *s.
 *
 * Think very carefully about how you're going to keep track of
 * all of the stop words, how you're going to keep track of
 * all the previously seen articles, and how you're going to 
 * map words to the collection of news articles where that
 * word appears.
 */
int main(int argc, char **argv)
{
  hashset stopwords;
  hashset words;
  vector seenurls;
  Welcome(kWelcomeTextFile);
  //printf("before stop \n");
  LoadStopWords(&stopwords, kStopFile);
  //char *word = "the";
  //char* dummy = word;
  //void *addr = HashSetLookup(&stopwords, &dummy);
  //assert(addr != NULL);
  //printf("ADDR POINTER: %p\n", addr);

  //printf("calling build indices \n");
  BuildIndices((argc == 1) ? kDefaultFeedsFile : argv[1],
  &stopwords,&seenurls, &words);
  printf("\n");
  printf("here is the # of seenurls %d \n",VectorLength(&seenurls));
  //QueryIndices(&words,&seenurls, &stopwords);
  //HashSetDispose(stopwords);
  //HashSetDispose(words);
  //VectorDispose(seenurls);
  return 0;
}



/** 
 * Function: Welcome
 * -----------------
 * Displays the contents of the specified file, which
 * holds the introductory remarks to be printed every time
 * the application launches.  This type of overhead may
 * seem silly, but by placing the text in an external file,
 * we can change the welcome text without forcing a recompilation and
 * build of the application.  It's as if welcomeTextFileName
 * is a configuration file that travels with the application.
 */
 
static const char *const kNewLineDelimiters = "\r\n";
static void Welcome(const char *welcomeTextFileName)
{
  FILE *infile;
  streamtokenizer st;
  char buffer[1024];
  
  infile = fopen(welcomeTextFileName, "r");
  assert(infile != NULL);    
  
  STNew(&st, infile, kNewLineDelimiters, true);
  while (STNextToken(&st, buffer, sizeof(buffer))) {
    printf("%s\n", buffer);
  }
  STDispose(&st); // remember that STDispose doesn't close the file, since STNew doesn't open one.. 
  fclose(infile);
}

static int WordStructCompare(const void *one, const void *two){
  char* oneword = ((wordstruct*)one)-> word;
  char* twoword = ((wordstruct*)two)->word;
  return strcasecmp( oneword, twoword);
}

static int WordFreqCompare (const void*one, const void*two){
  // printf("freq compare \n");
  return (((wordfreqstruct*)one)->articleindex- ((wordfreqstruct*)two)->articleindex);
}


static void WordStructFree(void*point){
  free((*(wordstruct*)point).wordfreq); //frees the wordfreq vector pointer (the word string itself isn't freed here)
}

//I want to dereference the char** here, but then I seg fault way earlier 
static int StringComparison (const void* one, const void* two){
  return strcasecmp(*(char**)one, *(char**) two); 
}

//Returns zero if they are equal and returns -1 if they are not. 
//Question: Should this return 1 in any case? 
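//(as long as this is only handed to VectorSearch with isSorted set to false,
//which just checks for a 0 return, it shouldn't matter; it would if the
//vector ever had to be sorted)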
static int  ArticleCompare(const void *one, const void *two){
  if (strcmp( ( (article*) one)->title , ( (article*) two)->title) == 0 &&
      (strcmp(((article*)one)->server,  ((article*)two)->server) ==0)) return 0;
  else if (strcmp(((article*)one)->urlstring, ((article*)two)->urlstring) ==0) return 0;
  else return -1;
}

static void ArticleFree(void   * thiselem){
  // free( ( *(article*) thiselem).wordfreqpoint);
  //Question: Why does it seg fault when we call this? 
  //free(( *(article*) thiselem).server);
  //free((*(article*)thiselem).urlstring);
  //   free((*(article*)thiselem).title);
  //   free((*(article*)thiselem).articleindex);
} 

/** 
 * StopWordHash                     
 * ----------  
 * This function adapted from Eric Roberts' "The Art and Science of C"
 * It takes a string and uses it to derive a hash code, which   
 * is an integer in the range [0, numBuckets).  The hash code is computed  
 * using a method called "linear congruence."  A similar function using this     
 * method is described on page 144 of Kernighan and Ritchie.  The choice of                                                     
 * the value for the kHashMultiplier can have a significant effect on the                            
 * performance of the algorithm, but not on its correctness.                                                    
 * This hash function has the additional feature of being case-insensitive,  
 * hashing "Peter Pawlowski" and "PETER PAWLOWSKI" to the same code.  
 */  
//The char* version wouldn't match the void* prototype the hashset expects,
//so cast the incoming pointer to char** and then dereference it.
static int StopWordHash(const void *s, int numBuckets)  
{    
  //printf("calling stop word hash \n");        
  int i;
  unsigned long hashcode = 0;

  char* h = *(char**)s;
  
  for (i = 0; i < strlen(h); i++)  
    hashcode = hashcode * kHashMultiplier + tolower(h[i]); 
  
  return hashcode % numBuckets;                                
}

/** 
 * WordStructHash                     
 * ----------  
 * This function adapted from Eric Roberts' "The Art and Science of C"
 * It takes a string and uses it to derive a hash code, which   
 * is an integer in the range [0, numBuckets).  The hash code is computed  
 * using a method called "linear congruence."  A similar function using this     
 * method is described on page 144 of Kernighan and Ritchie.  The choice of                                                     
 * the value for the kHashMultiplier can have a significant effect on the                            
 * performance of the algorithm, but not on its correctness.                                                    
 * This hash function has the additional feature of being case-insensitive,  
 * hashing "Peter Pawlowski" and "PETER PAWLOWSKI" to the same code.  
 */  
//Same issue as StopWordHash: the prototype takes a void*, but here the
//elements are wordstructs, so cast to wordstruct* and hash its word field.
static int WordStructHash(const void *s, int numBuckets)  
{            

  // printf("calling wordstruct hash \n");
  int i;
  unsigned long hashcode = 0;

  wordstruct* x = (wordstruct*)s;
  char* h = x-> word;
  
  for (i = 0; i < strlen(h); i++)  
    hashcode = hashcode * kHashMultiplier + tolower(h[i]);  
  
  return hashcode % numBuckets;                                
}

static void StringFree(void *wordpoint){
  free(*(char**)wordpoint);
}

/*
//Compares wordstruct's by seeing if the words match up. 
static int FreqCompare(const void * one, const void * two){
  //printf("calls word struct compare \n");
  return strcmp((*(freq*)one).articleindex, (*(freq*)two).articleindex);
  }*/




//LoadStopWords goes through the data file and loads the stop words in.
static void LoadStopWords (hashset *stopwords, const char* stopFileName){
  
  FILE *infile;
  streamtokenizer st;
  char buffer[1024];
  HashSetNew(stopwords, sizeof(char*), STOP_BUCKET_NO,StopWordHash,
	     StringComparison, StringFree);
  infile = fopen(stopFileName, "r");
  assert(infile != NULL);
  STNew(&st, infile, kNewLineDelimiters, true);

  while (STNextToken(&st, buffer, sizeof(buffer))) {
  
    char* temp = strdup(buffer);
    HashSetEnter(stopwords, &temp);
    //printf("here is the number of elements inside our hashset %d \n", HashSetCount(stopwords));
  }
  STDispose(&st);
  fclose(infile);
  printf("\n");
  
}



/**
 * Function: BuildIndices
 * ----------------------
 * As far as the user is concerned, BuildIndices needs to read each and every
 * one of the feeds listed in the specified feedsFileName, and for each feed parse
 * the content of all referenced articles and store the content in the hashset of indices.
 * Each line of the specified feeds file looks like this:
 *
 *   <feed name>: <URL of remote xml document>
 *
 * Each iteration of the supplied while loop parses and discards the feed name (it's
 * in the file for humans to read, but our aggregator doesn't care what the name is)
 * and then extracts the URL.  It then relies on ProcessFeed to pull the remote
 * document and index its content.
 */
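
//A hypothetical example of one such line (made up for illustration):
//
//  World News: http://www.example.com/rss/world.xml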

//BuildIndices extracts each remoteFileName and hands it to ProcessFeed;
//ProcessFeed is called once for every feed listed in the file.
static void BuildIndices(const char *feedsFileName, hashset* stopwords,
			 vector* seenurls, hashset* words )
{
  printf("start build indices \n");
  FILE *infile;
  streamtokenizer st;
  char remoteFileName[1024];
  infile = fopen(feedsFileName, "r");
  assert(infile != NULL);
  STNew(&st, infile, kNewLineDelimiters, true);
  while (STSkipUntil(&st, ":") != EOF) { // ignore everything up to the first colon of the line
    STSkipOver(&st, ": ");		 // now ignore the colon and any whitespace directly after it
    STNextToken(&st, remoteFileName, sizeof(remoteFileName));
    // printf("before calling process feed \n");
    ProcessFeed(remoteFileName, stopwords, seenurls, words);
  }
  STDispose(&st);
  fclose(infile);
  //printf("\n");
  printf("here is the size of the seenurls vector, %d \n",
  VectorLength(seenurls));
  printf("here is the size of the words hashset, %d \n", HashSetCount(words));
  printf("gets to end of build indices");
}



/**
 * Function: ProcessFeed
 * ---------------------
 * ProcessFeed locates the specified RSS document, and if a (possibly redirected) connection to that remote
 * document can be established, then PullAllNewsItems is tapped to actually read the feed.  Check out the
 * documentation of the PullAllNewsItems function for more information, and inspect the documentation
 * for ParseArticle or information about what the different response codes mean.
 */

static void ProcessFeed(const char *remoteDocumentName, hashset *stopwords,
			vector* seenurls, hashset* words){
  //printf("gets into process feed \n");
  url u;
  urlconnection urlconn;
  URLNewAbsolute(&u, remoteDocumentName);
  //printf("remotedoc name, %s \n", remoteDocumentName);
  URLConnectionNew(&urlconn, &u);
  //printf("hello this is an annoying seg fault \n");
  switch (urlconn.responseCode) {
  case 0: printf("Unable to connect to \"%s\".  Ignoring...",u.serverName);
    break;
  case 200:
    PullAllNewsItems(&urlconn, stopwords, seenurls, words);
    break;
  case 301: 
    //recursively calls itself to get the newURL?
  case 302: ProcessFeed(urlconn.newUrl, stopwords, seenurls, words);
    break;
  default: printf("Connection to \"%s\" was established, but unable to retrieve \"%s\". [response code: %d, response message:\"%s\"]\n",
		  u.serverName, u.fileName, urlconn.responseCode, urlconn.responseMessage);
    break;
    URLConnectionDispose(&urlconn);
    URLDispose(&u);
  }
}


/**
 * Function: PullAllNewsItems
 * --------------------------
 * Steps through the data of what is assumed to be an RSS feed, identifying the names and
 * URLs of online news articles.  Check out "datafiles/sample-rss-feed.txt" for an idea of what an
 * RSS feed from www.nytimes.com (or any other server that syndicates its stories) looks like.
 *
 * PullAllNewsItems views a typical RSS feed as a sequence of "items", where each item is detailed
 * using a generalization of HTML called XML.  A typical XML fragment for a single news item will certainly
 * adhere to the format of the following example:
 *
 * <item>
 *   <title>At Installation Mass, New Pope Strikes a Tone of Openness</title>
 *   <link>http://www.nytimes.com/2005/04/24/international/worldspecial2/24cnd-pope.html</link>
 *   <description>The Mass, which drew 350,000 spectators, marked an important moment in the transformation of Benedict XVI.</description>
 *   <author>By IAN FISHER and LAURIE GOODSTEIN</author>
 *   <pubDate>Sun, 24 Apr 2005 00:00:00 EDT</pubDate>
 *   <guid isPermaLink="false">http://www.nytimes.com/2005/04/24/international/worldspecial2/24cnd-pope.html</guid>
 * </item>
 *
 * PullAllNewsItems reads and discards all characters up through the opening <item> tag (discarding the <item> tag
 * as well, because once it's been read and identified, it's been pulled), and then hands the state of the stream to
 * ProcessSingleNewsItem, which handles the job of pulling and analyzing everything up through and including the </item>
 * tag. PullAllNewsItems processes the entire RSS feed by repeatedly advancing to the next <item> tag and then allowing
 * ProcessSingleNewsItem to process everything up until </item>.
 */

static const char *const kTextDelimiters = " \t\n\r\b!@$%^*()_+={[}]|\\'\":;/?.>,<~`";
static void PullAllNewsItems(urlconnection *urlconn, hashset* stopwords,
			     vector* seenurls, hashset* words )
{

  //printf("start pull all news items \n");
  //We initially construct the hashsets here.
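  //(note: this function runs once per feed, so words and seenurls are
  //re-initialized for every feed listed in the feeds file)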
  printf("initializing hashset  \n");
  HashSetNew(words, sizeof(wordstruct), WORD_BUCKET_NO, WordStructHash,
	     WordStructCompare, WordStructFree);
  printf("after hashset new \n");
  VectorNew(seenurls, sizeof(article), ArticleFree, INIT_VECT_ALLOCATION);

  streamtokenizer st;
  STNew(&st, urlconn->dataStream, kTextDelimiters, false);
  while (GetNextItemTag(&st)) { // if true is returned, then assume that
				// <item ...> has just been read and
				// pulled from the data stream
    printf("process single  \n");
    ProcessSingleNewsItem(&st, stopwords, seenurls, words);   
  }
  STDispose(&st);
}

/**
 * Function: GetNextItemTag
 * ------------------------
 * Works more or less like GetNextTag below, but this time
 * we're searching for an <item> tag, since that marks the
 * beginning of a block of HTML that's relevant to us.  
 * 
 * Note that each tag is compared to "<item" and not "<item>".
 * That's because the item tag, though unlikely, could include
 * attributes and perhaps look like any one of these:
 *
 *   <item>
 *   <item rdf:about="Latin America reacts to the Vatican">
 *   <item requiresPassword=true>
 *
 * We're just trying to be as general as possible without
 * going overboard.  (Note that we use strncasecmp so that
 * string comparisons are case-insensitive.  That's the case
 * throughout the entire code base.)
 */

static const char *const kItemTagPrefix = "<item";
static bool GetNextItemTag(streamtokenizer *st)
{
  char htmlTag[1024];
  while (GetNextTag(st, htmlTag, sizeof(htmlTag))) {
    if (strncasecmp(htmlTag, kItemTagPrefix, strlen(kItemTagPrefix)) == 0) {
      return true;
    }
  }	 
  return false;
}

/**
 * Function: ProcessSingleNewsItem
 * -------------------------------
 * Code which parses the contents of a single <item> node within an RSS/XML feed.
 * At the moment this function is called, we're to assume that the <item> tag was just
 * read and that the streamtokenizer is currently pointing to everything else, as with:
 *   
 *      <title>Carrie Underwood takes American Idol Crown</title>
 *      <description>Oklahoma farm girl beats out Alabama rocker Bo Bice and 100,000 other contestants to win competition.</description>
 *      <link>http://www.nytimes.com/frontpagenews/2841028302.html</link>
 *   </item>
 *
 * ProcessSingleNewsItem parses everything up through and including the </item>, storing the title, link, and article
 * description in local buffers long enough so that the online news article identified by the link can itself be parsed
 * and indexed.  We don't rely on <title>, <link>, and <description> coming in any particular order.  We do assume that
 * the link field exists (although we can certainly proceed if the title and article description are missing.)  There
 * are often other tags inside an item, but we ignore them.
 */

static const char *const kItemEndTag = "</item>";
static const char *const kTitleTagPrefix = "<title";
static const char *const kDescriptionTagPrefix = "<description";
static const char *const kLinkTagPrefix = "<link";
static void ProcessSingleNewsItem(streamtokenizer *st, hashset* stopwords,
				  vector* seenurls, hashset* words  )
{

  char htmlTag[1024];
  char articleTitle[1024];
  char articleDescription[1024];
  char articleURL[1024];
  articleTitle[0] = articleDescription[0] = articleURL[0] = '\0';
  
  while (GetNextTag(st, htmlTag, sizeof(htmlTag)) && (strcasecmp(htmlTag, kItemEndTag) != 0)) {
    if (strncasecmp(htmlTag, kTitleTagPrefix, strlen(kTitleTagPrefix)) == 0) ExtractElement(st, htmlTag, articleTitle, sizeof(articleTitle));
    if (strncasecmp(htmlTag, kDescriptionTagPrefix, strlen(kDescriptionTagPrefix)) == 0) ExtractElement(st, htmlTag, articleDescription, sizeof(articleDescription));
    if (strncasecmp(htmlTag, kLinkTagPrefix, strlen(kLinkTagPrefix)) == 0) ExtractElement(st, htmlTag, articleURL, sizeof(articleURL));
  }

  if (strncmp(articleURL, "", sizeof(articleURL)) == 0) return;     // punt, since it's not going to take us anywhere
  //printf("before parse article \n");
  ParseArticle(articleTitle, articleDescription, articleURL, stopwords,
  seenurls,words);
}

/* 
 * Function: ExtractElement
 * ------------------------
 * Potentially pulls text from the stream up through and including the matching end tag.  It assumes that
 * the most recently extracted HTML tag resides in the buffer addressed by htmlTag.  The implementation
 * populates the specified data buffer with all of the text up to but not including the opening '<' of the
 * closing tag, and then skips over all of the closing tag as irrelevant.  Assuming for illustration purposes
 * that htmlTag addresses a buffer containing "<description" followed by other text, these three scenarios are
 * handled:
 *
 *    Normal Situation:     <description>http://some.server.com/someRelativePath.html</description>
 *    Uncommon Situation:   <description></description>
 *    Uncommon Situation:   <description/>
 *
 * In each of the second and third scenarios, the document has omitted the data.  It is not uncommon
 * for the description data to be missing, so we need to cover all three scenarios (I've actually seen all three.)
 * It would be quite unusual for the title and/or link fields to be empty, but this handles those possibilities too.
 */
 
static void ExtractElement(streamtokenizer *st, const char *htmlTag, char dataBuffer[], int bufferLength)
{
  assert(htmlTag[strlen(htmlTag) - 1] == '>');
  if (htmlTag[strlen(htmlTag) - 2] == '/') return;    // e.g. <description/> would state that a description is not being supplied
  STNextTokenUsingDifferentDelimiters(st, dataBuffer, bufferLength, "<");
  RemoveEscapeCharacters(dataBuffer);
  if (dataBuffer[0] == '<') strcpy(dataBuffer, "");  // e.g. <description></description> also means there's no description
  STSkipUntil(st, ">");
  STSkipOver(st, ">");
}

/*
static void WordFreqFree (void * thiselem){
  free ((wordfreqstruct*)thiselem);
  }*/



/** 
 * Function: ParseArticle
 * ----------------------
 * Attempts to establish a network connection to the news article identified by the three
 * parameters.  The network connection is either established or not.  The implementation
 * is prepared to handle a subset of possible (but by far the most common) scenarios,
 * and those scenarios are categorized by response code:
 *
 *    0 -- the connection couldn't be established at all
 *    200 -- the article was pulled and is ready to be scanned
 *    301/302 -- the article has moved, so we retry against the redirected URL
 *    everything else -- the connection succeeded, but the article couldn't be pulled
 */
static void ParseArticle(const char *articleTitle, const char
			   *articleDescription, const char *articleURL,
			 hashset* stopwords, vector* seenurls, hashset
			 *words)
{
  //printf("gets into parse article \n");
  url u;
  urlconnection urlconn;
  streamtokenizer st; 
  article thisarticle;
  int apos;

  URLNewAbsolute(&u, articleURL);
  URLConnectionNew(&urlconn, &u);
  thisarticle.title = strdup(articleTitle);
  // printf("strdup of article title worked \n");
  thisarticle.urlstring = strdup(u.fullName);
  thisarticle.server = strdup(u.serverName); //one strdup is enough; the extra copy leaked
 
  //printf("after urlconnectionnew \n");
  //for the case in which the article has already been populated into our
  //vector of articles


  switch (urlconn.responseCode) {
  case 0: printf("Unable to connect to \"%s\".  Domain name or IP address is nonexistent.\n", articleURL);
    break;
  case 200: 
    
    apos = VectorSearch(seenurls, &thisarticle,ArticleCompare,0,
    false);
    if (apos!=-1){
      thisarticle.articleindex = apos;
      printf("after articleoffset * \n");
      printf("[Ignoring \"%s\": we've seen it before.] ",thisarticle.title);
      break; //break instead of return so the connection still gets disposed below
    }
    //If the article doesn't already exist, append it to the vector of seenurls.
    printf("[%s] Indexing \"%s\" \n", thisarticle.server, thisarticle.title);
    thisarticle.articleindex = VectorLength(seenurls);   
    VectorAppend(seenurls, &thisarticle);


    STNew(&st, urlconn.dataStream, kTextDelimiters, false);
    //printf("gets bo efore call scanarticle %d\n", *(articleoffset));
    ScanArticle (&st, articleTitle, articleDescription,
		articleURL, stopwords, seenurls, words,
		&thisarticle               );
    STDispose(&st);
    
    //printf("before updatearticle veoctr \n");
    
    break;
  case 301:
  case 302: // just pretend we have the redirected URL all along, though
	    // index using the new URL and not the old one...
    ParseArticle(articleTitle, articleDescription, urlconn.newUrl,
		 stopwords, seenurls, words               );
    break;
  default: printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n", articleTitle, u.serverName, urlconn.responseCode);
    break;
  }
  URLConnectionDispose(&urlconn);
  URLDispose(&u);
}


/**
 * Function: ScanArticle
 * ---------------------
 * Parses the specified article, skipping over all HTML tags, and counts the numbers
 * of well-formed words that could potentially serve as keys in the set of indices.
 * Once the full article has been scanned, the number of well-formed words is
 * printed, and the longest well-formed word we encountered along the way
 * is printed as well.
 *
 * This is really a placeholder implementation for what will ultimately be
 * code that indexes the specified content.
 */	       
static void ScanArticle(streamtokenizer *st, const char *articleTitle,
			const char *unused, const char *articleURL,
			hashset *stopwords, vector* seenurls, hashset
			*words, article* thisarticle                    )
{ 

  char word[1024];
  while (STNextToken(st, word, sizeof(word))) {
    if (strcasecmp(word, "<") == 0) {
       SkipIrrelevantContent(st); // in html-utils.h
    } else {
      RemoveEscapeCharacters(word);
      if (WordIsWellFormed(word)) {
	printf("_________________this line indicates a new word______________________\n");
	//printf("----- %s", word);
	UpdateIndices(word, words, stopwords, seenurls, thisarticle);
      }
    }
  }
}



//UpdateIndices enters the words into the words hashset and updates the
//vector with the article information. Due to the sequential nature of all
//the updates, I decided not to decompose this function (I did initially, but
//things got too messy).

static void UpdateIndices(char* word, hashset *words, hashset
			  *stopwords, vector *seenurls, article*
			  thisarticle                    ){  
  wordstruct dummy;
  dummy.word = word;
  const void* addr = HashSetLookup(stopwords, &word);
  if(addr!=NULL)printf (" - Stop word - \n");
  if (addr == NULL){
    printf("not a stop word \n");
    void*bucketadd = HashSetLookup(words,&dummy);
    printf("after bucket add \n");
    if(bucketadd==NULL){
      HashSetEnter(words,&dummy);
      printf("after hash set enter \n");
      wordstruct*newbucket = (wordstruct*)bucketadd;
      printf("seg \n");
      //newbucket->wordfreq = malloc(sizeof(vector));
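      //this next VectorNew call is where the segfault happens: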
      VectorNew(newbucket->wordfreq,sizeof(wordfreqstruct),NULL,INIT_VECT_ALLOCATION);
      printf("seg fault \n");
      wordfreqstruct one;
      printf("after wordfreqstruct one \n");
      one.articleindex = thisarticle->articleindex;
      one.count =1 ;
      VectorAppend(newbucket->wordfreq,&one);
    }else{
      wordstruct *oldbucket = (wordstruct*) bucketadd;
      vector*ptr = oldbucket->wordfreq;
      wordfreqstruct local;
      local.articleindex = thisarticle->articleindex;
      int pos = VectorSearch(ptr,&local,WordFreqCompare,0,false);
      if (pos == -1) { //word seen before, but not yet in this article
        local.count = 1;
        VectorAppend(ptr, &local);
      } else {
        //VectorNth returns the element's address; the old manual arithmetic
        //scaled by sizeof(wordfreqstruct) a second time and walked off the vector
        wordfreqstruct *wfs = (wordfreqstruct*)VectorNth(ptr, pos);
        wfs->count++;
      }
    }
  }
}
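
/*
 * A minimal sketch (untested) of how the miss case above could be
 * restructured so that VectorNew is handed real storage.  This assumes
 * HashSetEnter copies the wordstruct into the table, so the vector can be
 * fully set up on the local copy before it is entered:
 *
 *   wordstruct fresh;
 *   fresh.word = strdup(word);
 *   fresh.wordfreq = malloc(sizeof(vector));            //allocate storage first
 *   VectorNew(fresh.wordfreq, sizeof(wordfreqstruct),
 *             NULL, INIT_VECT_ALLOCATION);              //then initialize it
 *   wordfreqstruct one;
 *   one.articleindex = thisarticle->articleindex;
 *   one.count = 1;
 *   VectorAppend(fresh.wordfreq, &one);
 *   HashSetEnter(words, &fresh);                        //copied into the set
 */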
/** 
 * Function: QueryIndices
 * ----------------------
 * Standard query loop that allows the user to specify a single search term, and
 * then proceeds (via ProcessResponse) to list up to 10 articles (sorted by relevance)
 * that contain that word.
 */

static void QueryIndices(hashset* words, vector* seenurls, hashset* stopwords)
{
  char response[1024];
  while (true) {
    printf("Please enter a single query term that might be in our set of indices [enter to quit]: ");
    fgets(response, sizeof(response), stdin);
    response[strlen(response) - 1] = '\0';
    if (strcasecmp(response, "") == 0) break;
    ProcessResponse(response, words, seenurls, stopwords);
  }
}


static void GetArticlesContainingWord(char *word, hashset *words,
				      vector* seenurls, void* wordarr){
  printf("00000000000000000000000000000 here is hash set size of words %d \n", HashSetCount(words));

  if (wordarr == NULL) {
    printf("None of today's news articles contain the word \"%s\".\n",
	   word);
  }else{
    wordstruct* inherit = (wordstruct*)wordarr;
    printf("Nice! we found %d articles that include the word \"%s\".\n",
	   VectorLength(inherit->wordfreq), word);
    for(int i=0; i < VectorLength(inherit->wordfreq); i++){
      //VectorNth hands back the address of the ith element, so no manual
      //pointer arithmetic (which was scaling by sizeof a second time) is needed
      wordfreqstruct *a = (wordfreqstruct*)VectorNth(inherit->wordfreq, i);
      int index = a->articleindex;
      printf("here is the index %d \n", index);
      article* supt = (article*)VectorNth(seenurls, index);
      printf("%d.) \"%s\" [search term occurs %d times] \n",
	     i, supt->title, a->count);
      printf("\"%s\" \n", supt->urlstring);
    }
  }
}



/*
 * Function: ProcessResponse
 * -------------------------
 * Placeholder implementation for what will become the search of a set of indices
 * for a list of web documents containing the specified word.
 */
static void ProcessResponse(char *word, hashset *words, vector*
seenurls, hashset *stopwords)
{
  if (WordIsWellFormed(word)) {
    printf("word is well formed \n");
    wordstruct thisstruct;   //a real struct, not an uninitialized pointer
    thisstruct.word = word;
    //look for word inside of words hashset 
    if(HashSetLookup(stopwords, &word)!=NULL){
      printf("Too common a word to be taken seriously. Try something more specific. \n");
      return;
    }
    printf("before hash set lookup \n");
    printf("00000000000000000000000000000 here is hash set size of words %d \n", HashSetCount(words));
    void* wordarr = HashSetLookup (words, &thisstruct);
    printf("after hashsetlookup \n");
    if (wordarr == NULL) {
      printf("None of today's news articles contain the word \"%s\".\n",
	     word);
      return;
    }
    GetArticlesContainingWord(word, words, seenurls, wordarr);
  }
  else {
    printf("We won't be allowing words like \"%s\" into our set of indices.\n", word);
  }
}

/**
 * Predicate Function: WordIsWellFormed
 * ------------------------------------
 * Before we allow a word to be inserted into our map
 * of indices, we'd like to confirm that it's a good search term.
 * One could generalize this function to allow different criteria, but
 * this version hard codes the requirement that a word begin with 
 * a letter of the alphabet and that all letters are either letters, numbers,
 * or the '-' character.  
 */

static bool WordIsWellFormed(const char *word)
{
  int i;
  if (strlen(word) == 0) return true;
  if (!isalpha((int) word[0])) return false;
  for (i = 1; i < strlen(word); i++)
    if (!isalnum((int) word[i]) && (word[i] != '-')) return false; 

  return true;
}