seg fault at vectornew

This is a discussion on seg fault at vectornew within the C Programming forums, part of the General Programming Boards category; i am trying to declare a vector new on a pointer ....i am seg faulting at the vector new and ...

  1. #1
    Registered User
    Join Date
    Oct 2008
    Location
    CA
    Posts
    19

    seg fault at vectornew

    i am trying to declare a vector new on a pointer ....i am seg faulting at the vector new and i have no clue why: (the goal of this program is to go through a website adn parse words and populate a hashset with words so taht you can retrieve how mny times a certain word apppears in ana rticle)...the function i am having issue with is buildindices..

    Code:
    #include <stdio.h>
    #include <stdlib.h>
    #include <assert.h>
    #include <string.h>
    #include <ctype.h>
    #include "url.h"
    #include "bool.h"
    #include "hashset.h" 
    #include "string.h"
    #include "urlconnection.h"
    #include "streamtokenizer.h"
    #include "html-utils.h"
    
    static const int STOP_BUCKET_NO = 10007;
    static const int WORD_BUCKET_NO = 10007; 
    static const int INIT_VECT_ALLOCATION= 4;
    static const signed long kHashMultiplier = -1664117991L;
    
    typedef struct article{
      const char * title;
      const char * server;
      const char * urlstring;
      int articleindex;
    } article;
    
    typedef struct  wordstruct{
      char* word;
      vector* wordfreq;
    } wordstruct;
    
    typedef struct wordfreqstruct{
      int articleindex;
      int count;
    } wordfreqstruct;
      
    
    static void Welcome(const char *welcomeTextFileName);
    static void LoadStopWords (hashset* stopwords, const char *stopFileName);
    static void BuildIndices(const char *feedsFileName, hashset* stopwords,
    			 vector* seenurls, hashset* words);
    static void ProcessFeed(const char *remoteDocumentName, hashset
    			*stopwords, vector* seenurls, hashset *words);
    static void PullAllNewsItems(urlconnection *urlconnm, hashset *stopwords,
    			     vector* seenurls, hashset *words);
    static bool GetNextItemTag(streamtokenizer *st);
    static void ProcessSingleNewsItem(streamtokenizer *st, hashset *stopwords,
    				  vector* seenurls, hashset *words   );
    static void ExtractElement(streamtokenizer *st, const char *htmlTag, char dataBuffer[], int bufferLength);
    static void ParseArticle(const char *articleTitle, const char
    			 *articleDescription, const char *articleURL,
    			 hashset *stopwords, vector* seenurls, hashset
    			 *words                    );
    static void ScanArticle(streamtokenizer *st, const char *articleTitle,
    			const char *unused, const char *articleURL,
    			hashset *stopwords, vector* seenurls, hashset
    			*words, article* thisarticle);
    static void UpdateIndices(char* word, hashset *words, hashset
    			  *stopwords, vector *seenurls, article*
    			  thisarticle                    );
    static void QueryIndices(hashset *words, vector*seenurls, hashset*stopwords);
    static void ProcessResponse(char *word, hashset*words, vector*
    seenurls, hashset *stopwords);
    static bool WordIsWellFormed(const char *word);
    static int StopWordHash(const void *s, int numBuckets);
    static void  StringFree(void * wordpoint);
    static int WordStructCompare(const void *one, const void *two);
    static void WordStructFree(void *point);
    static int ArticleCompare(const void *one, const void *two);
    static int StringComparison (const void* one, const void* two);
    static void ArticleFree(void *thiselem);
    static void UpdateArticleVector(article* thisarticle,  vector*
    seenurls                   );
    static void UpdateWordsHashset(hashset *words, void * lowerword);
    static void AddWordArticleVect (vector *seenurls, void * lowerword,
    				article* thisarticle);
    static int WordFreqCompare(const void * one, const void * two);
    static void GetArticlesContainingWord( char *word, hashset *words,
    				      vector* seenurls, void* wordarr);
    static void MapOffsetToWordFreq(void *offsetaddr, void* word);
    static int WordStructHash(const void *s, int numBuckets) ;
    
    
    
    static const char *const kWelcomeTextFile = "/usr/class/cs107/assignments/assn-4-rss-news-search-data/welcome.txt";
    static const char *const kDefaultFeedsFile =
    "/usr/class/cs107/assignments/assn-4-rss-news-search-data/rss-feeds-tiny.txt";
    static const char *const kStopFile =
      "/usr/class/cs107/assignments/assn-4-rss-news-search-data/stop-words.txt";
    
    
    /**
     * Function: main
     * --------------
     * Serves as the entry point of the full application.
     * You'll want to update main to declare several hashsets--
     * one for stop words, another for previously seen urls, etc--
     * and pass them (by address) to BuildIndices and QueryIndices.
     * In fact, you'll need to extend many of the prototypes of the
     * supplied helpers functions to take one or more hashset *s.
     *
     * Think very carefully about how you're going to keep track of
     * all of the stop words, how you're going to keep track of
     * all the previously seen articles, and how you're going to 
     * map words to the collection of news articles where that
     * word appears.
     */
    int main(int argc, char **argv)
    {
      hashset stopwords;
      hashset words;
      vector seenurls;
      Welcome(kWelcomeTextFile);
      //printf("before stop \n");
      LoadStopWords(&stopwords, kStopFile);
      //char *word = "the";
      //char* dummy = word;
      //void *addr = HashSetLookup(&stopwords, &dummy);
      //assert(addr != NULL);
      //printf("ADDR POINTER: %p\n", addr);
    
      //printf("calling build indices \n");
      BuildIndices((argc == 1) ? kDefaultFeedsFile : argv[1],
      &stopwords,&seenurls, &words);
      printf("\n");
      printf("here is the # of seenurls %d \n",VectorLength(&seenurls));
      //QueryIndices(&words,&seenurls, &stopwords);
      //HashSetDispose(stopwords);
      //HashSetDispose(words);
      //VectorDispose(seenurls);
      return 0;
    }
    
    
    
    /** 
     * Function: Welcome
     * -----------------
     * Displays the contents of the specified file, which
     * holds the introductory remarks to be printed every time
     * the application launches.  This type of overhead may
     * seem silly, but by placing the text in an external file,
     * we can change the welcome text without forcing a recompilation and
     * build of the application.  It's as if welcomeTextFileName
     * is a configuration file that travels with the application.
     */
     
    static const char *const kNewLineDelimiters = "\r\n";
    static void Welcome(const char *welcomeTextFileName)
    {
      FILE *infile;
      streamtokenizer st;
      char buffer[1024];
      
      infile = fopen(welcomeTextFileName, "r");
      assert(infile != NULL);    
      
      STNew(&st, infile, kNewLineDelimiters, true);
      while (STNextToken(&st, buffer, sizeof(buffer))) {
        printf("%s\n", buffer);
      }
      STDispose(&st); // remember that STDispose doesn't close the file, since STNew doesn't open one.. 
      fclose(infile);
    }
    
    static int WordStructCompare(const void *one, const void *two){
      char* oneword = ((wordstruct*)one)-> word;
      char* twoword = ((wordstruct*)two)->word;
      return strcasecmp( oneword, twoword);
    }
    
    static int WordFreqCompare (const void*one, const void*two){
      // printf("freq compare \n");
      return (((wordfreqstruct*)one)->articleindex- ((wordfreqstruct*)two)->articleindex);
    }
    
    
    static void WordStructFree(void*point){
      free((*(wordstruct*)point).wordfreq); //free the thing pointed to (the actual word)
    }
    
    //I want to deference the char** but then I seg fault way earlier 
    static int StringComparison (const void* one, const void* two){
      return strcasecmp(*(char**)one, *(char**) two); 
    }
    
    //Returns zero if they are equal and returns -1 if they are not. 
    //Question: Should this return 1 in any case? 
    static int  ArticleCompare(const void *one, const void *two){
      if (strcmp( ( (article*) one)->title , ( (article*) two)->title) == 0 &&
          (strcmp(((article*)one)->server,  ((article*)two)->server) ==0)) return 0;
      else if (strcmp(((article*)one)->urlstring, ((article*)two)->urlstring) ==0) return 0;
      else return -1;
    }
    
    static void ArticleFree(void   * thiselem){
      // free( ( *(article*) thiselem).wordfreqpoint);
      //Question: Why does it seg fault when we call this? 
      //free(( *(article*) thiselem).server);
      //free((*(article*)thiselem).urlstring);
      //   free((*(article*)thiselem).title);
      //   free((*(article*)thiselem).articleindex);
    } 
    
    /** 
     * StopWordHash                     
     * ----------  
     * This function adapted from Eric Roberts' "The Art and Science of C"
     * It takes a string and uses it to derive a hash code, which   
     * is an integer in the range [0, numBuckets).  The hash code is computed  
     * using a method called "linear congruence."  A similar function using this     
     * method is described on page 144 of Kernighan and Ritchie.  The choice of                                                     
     * the value for the kHashMultiplier can have a significant effect on the                            
     * performance of the algorithm, but not on its correctness.                                                    
     * This hash function has the additional feature of being case-insensitive,  
     * hashing "Peter Pawlowski" and "PETER PAWLOWSKI" to the same code.  
     */  
    //doesn't match prototype for void* 
    //cast pointer to char** and then 
    static int StopWordHash(const void *s, int numBuckets)  
    {    
      //printf("calling stop word hash \n");        
      int i;
      unsigned long hashcode = 0;
    
      char* h = *(char**)s;
      
      for (i = 0; i < strlen(h); i++)  
        hashcode = hashcode * kHashMultiplier + tolower(h[i]); 
      
      return hashcode % numBuckets;                                
    }
    
    /** 
     * WordStructHash                     
     * ----------  
     * This function adapted from Eric Roberts' "The Art and Science of C"
     * It takes a string and uses it to derive a hash code, which   
     * is an integer in the range [0, numBuckets).  The hash code is computed  
     * using a method called "linear congruence."  A similar function using this     
     * method is described on page 144 of Kernighan and Ritchie.  The choice of                                                     
     * the value for the kHashMultiplier can have a significant effect on the                            
     * performance of the algorithm, but not on its correctness.                                                    
     * This hash function has the additional feature of being case-insensitive,  
     * hashing "Peter Pawlowski" and "PETER PAWLOWSKI" to the same code.  
     */  
    //doesn't match prototype for void* 
    //cast pointer to char** and then 
    static int WordStructHash(const void *s, int numBuckets)  
    {            
    
      // printf("calling wordstruct hash \n");
      int i;
      unsigned long hashcode = 0;
    
      wordstruct* x = (wordstruct*)s;
      char* h = x-> word;
      
      for (i = 0; i < strlen(h); i++)  
        hashcode = hashcode * kHashMultiplier + tolower(h[i]);  
      
      return hashcode % numBuckets;                                
    }
    
    static void StringFree(void *wordpoint){
      free(*(char**)wordpoint);
    }
    
    /*
    //Compares wordstruct's by seeing if the words match up. 
    static int FreqCompare(const void * one, const void * two){
      //printf("calls word struct compare \n");
      return strcmp((*(freq*)one).articleindex, (*(freq*)two).articleindex);
      }*/
    
    
    
    
    //LoadStopWords goes through the data file and loads the stop words in.
    static void LoadStopWords (hashset *stopwords, const char* stopFileName){
      
      FILE *infile;
      streamtokenizer st;
      char buffer[1024];
      HashSetNew(stopwords, sizeof(char*), STOP_BUCKET_NO,StopWordHash,
    	     StringComparison, StringFree);
      infile = fopen(stopFileName, "r");
      assert(infile != NULL);
      STNew(&st, infile, kNewLineDelimiters, true);
      
      
    
      
      while (STNextToken(&st, buffer, sizeof(buffer) ) ) {
      
        char* temp = strdup(buffer);
        HashSetEnter(stopwords, &temp);
        //printf("here is the number of elements inside our hashset %d \n", HashSetCount(stopwords));
      }
      STDispose(&st);
      fclose(infile);
      printf("\n");
      
    }
    
    
    
    /**
     * Function: BuildIndices
     * ----------------------
     * As far as the user is concerned, BuildIndices needs to read each and every
     * one of the feeds listed in the specied feedsFileName, and for each feed parse
     * content of all referenced articles and store the content in the hashset of indices.
     * Each line of the specified feeds file looks like this:
     *
     *   <feed name>: <URL of remore xml document>
     *
     * Each iteration of the supplied while loop parses and discards the feed name (it's
     * in the file for humans to read, but our aggregator doesn't care what the name is)
     * and then extracts the URL.  It then relies on ProcessFeed to pull the remote
     * document and index its content.
     */
    
    //So this function calls ProcessFeed, which passes the remoteFileName to
    //ProcessFeed is called as many times for as many feeds that there are! 
    static void BuildIndices(const char *feedsFileName, hashset* stopwords,
    			 vector* seenurls, hashset* words )
    {
      printf("start build indices \n");
      FILE *infile;
      streamtokenizer st;
      char remoteFileName[1024];
      infile = fopen(feedsFileName, "r");
      assert(infile != NULL);
      STNew(&st, infile, kNewLineDelimiters, true);
      while (STSkipUntil(&st, ":") != EOF) { // ignore everything up to the first selicolon of the line
        STSkipOver(&st, ": ");		 // now ignore the semicolon and any whitespace directly after it
        STNextToken(&st, remoteFileName, sizeof(remoteFileName));
        // printf("before calling process feed \n");
        ProcessFeed(remoteFileName, stopwords, seenurls, words);
      }
      STDispose(&st);
      fclose(infile);
      //printf("\n");
      printf("here is the size of the seenurls vector, %d \n",
      VectorLength(seenurls));
      printf("here is the size of the words hashset, %d \n", HashSetCount(words));
      printf("gets to end of build indices");
    }
    
    
    
    /**
     * Function: ProcessFeed
     * ---------------------
     * ProcessFeed locates the specified RSS document, and if a (possibly redirected) connection to that remote
     * document can be established, then PullAllNewsItems is tapped to actually read the feed.  Check out the
     * documentation of the PullAllNewsItems function for more information, and inspect the documentation
     * for ParseArticle or information about what the different response codes mean.
     */
    
    static void ProcessFeed(const char *remoteDocumentName, hashset *stopwords,
    			vector* seenurls, hashset* words){
      //printf("gets into process feed \n");
      url u;
      urlconnection urlconn;
      URLNewAbsolute(&u, remoteDocumentName);
      //printf("remotedoc name, %s \n", remoteDocumentName);
      URLConnectionNew(&urlconn, &u);
      //printf("hello this is an annoying seg fault \n");
      switch (urlconn.responseCode) {
      case 0: printf("Unable to connect to \"%s\".  Ignoring...",u.serverName);
        break;
      case 200:
        PullAllNewsItems(&urlconn, stopwords, seenurls, words);
        break;
      case 301: 
        //recursively calls itself to get the newURL?
      case 302: ProcessFeed(urlconn.newUrl, stopwords, seenurls, words);
        break;
      default: printf("Connection to \"%s\" was established, but unable to retrieve \"%s\". [response code: %d, response message:\"%s\"]\n",
    		  u.serverName, u.fileName, urlconn.responseCode, urlconn.responseMessage);
        break;
        URLConnectionDispose(&urlconn);
        URLDispose(&u);
      }
    }
    
    
    /**
     * Function: PullAllNewsItems
     * --------------------------
     * Steps though the data of what is assumed to be an RSS feed identifying the names and
     * URLs of online news articles.  Check out "datafiles/sample-rss-feed.txt" for an idea of what an
     * RSS feed from the www.nytimes.com (or anything other server that syndicates is stories).
     *
     * PullAllNewsItems views a typical RSS feed as a sequence of "items", where each item is detailed
     * using a generalization of HTML called XML.  A typical XML fragment for a single news item will certainly
     * adhere to the format of the following example:
     *
     * <item>
     *   <title>At Installation Mass, New Pope Strikes a Tone of Openness</title>
     *   <link>http://www.nytimes.com/2005/04/24/international/worldspecial2/24cnd-pope.html</link>
     *   <description>The Mass, which drew 350,000 spectators, marked an important moment in the transformation of Benedict XVI.</description>
     *   <author>By IAN FISHER and LAURIE GOODSTEIN</author>
     *   <pubDate>Sun, 24 Apr 2005 00:00:00 EDT</pubDate>
     *   <guid isPermaLink="false">http://www.nytimes.com/2005/04/24/international/worldspecial2/24cnd-pope.html</guid>
     * </item>
     *
     * PullAllNewsItems reads and discards all characters up through the opening <item> tag (discarding the <item> tag
     * as well, because once it's read and indentified, it's been pulled,) and then hands the state of the stream to
     * ProcessSingleNewsItem, which handles the job of pulling and analyzing everything up through and including the </item>
     * tag. PullAllNewsItems processes the entire RSS feed and repeatedly advancing to the next <item> tag and then allowing
     * ProcessSingleNewsItem do process everything up until </item>.
     */
    
    static const char *const kTextDelimiters = " \t\n\r\b!@$%^*()_+={[}]|\\'\":;/?.>,<~`";
    static void PullAllNewsItems(urlconnection *urlconn, hashset* stopwords,
    			     vector* seenurls, hashset* words )
    {
    
      //printf("start pull all news items \n");
      //We initially construct the hashsets here.
      printf("initializing hashset  \n");
      HashSetNew(words, sizeof(wordstruct), WORD_BUCKET_NO, WordStructHash,
    	     WordStructCompare, WordStructFree);
      printf("after hashset new \n");
      VectorNew(seenurls, sizeof(article), ArticleFree, INIT_VECT_ALLOCATION);
    
      streamtokenizer st;
      STNew(&st, urlconn->dataStream, kTextDelimiters, false);
      while (GetNextItemTag(&st)) { // if true is returned, then assume that
    				// <item ...> has just been read and
    				// pulled from the data stream
        printf("process single  \n");
        ProcessSingleNewsItem(&st, stopwords, seenurls, words);   
      };
      STDispose(&st);
    }
    
    /**
     * Function: GetNextItemTag
     * ------------------------
     * Works more or less like GetNextTag below, but this time
     * we're searching for an <item> tag, since that marks the
     * beginning of a block of HTML that's relevant to us.  
     * 
     * Note that each tag is compared to "<item" and not "<item>".
     * That's because the item tag, though unlikely, could include
     * attributes and perhaps look like any one of these:
     *
     *   <item>
     *   <item rdf:about="Latin America reacts to the Vatican">
     *   <item requiresPassword=true>
     *
     * We're just trying to be as general as possible without
     * going overboard.  (Note that we use strncasecmp so that
     * string comparisons are case-insensitive.  That's the case
     * throughout the entire code base.)
     */
    
    static const char *const kItemTagPrefix = "<item";
    static bool GetNextItemTag(streamtokenizer *st)
    {
      char htmlTag[1024];
      while (GetNextTag(st, htmlTag, sizeof(htmlTag))) {
        if (strncasecmp(htmlTag, kItemTagPrefix, strlen(kItemTagPrefix)) == 0) {
          return true;
        }
      }	 
      return false;
    }
    
    /**
     * Function: ProcessSingleNewsItem
     * -------------------------------
     * Code which parses the contents of a single <item> node within an RSS/XML feed.
     * At the moment this function is called, we're to assume that the <item> tag was just
     * read and that the streamtokenizer is currently pointing to everything else, as with:
     *   
     *      <title>Carrie Underwood takes American Idol Crown</title>
     *      <description>Oklahoma farm girl beats out Alabama rocker Bo Bice and 100,000 other contestants to win competition.</description>
     *      <link>http://www.nytimes.com/frontpagenews/2841028302.html</link>
     *   </item>
     *
     * ProcessSingleNewsItem parses everything up through and including the </item>, storing the title, link, and article
     * description in local buffers long enough so that the online new article identified by the link can itself be parsed
     * and indexed.  We don't rely on <title>, <link>, and <description> coming in any particular order.  We do asssume that
     * the link field exists (although we can certainly proceed if the title and article descrption are missing.)  There
     * are often other tags inside an item, but we ignore them.
     */
    
    static const char *const kItemEndTag = "</item>";
    static const char *const kTitleTagPrefix = "<title";
    static const char *const kDescriptionTagPrefix = "<description";
    static const char *const kLinkTagPrefix = "<link";
    static void ProcessSingleNewsItem(streamtokenizer *st, hashset* stopwords,
    				  vector* seenurls, hashset* words  )
    {
    
      char htmlTag[1024];
      char articleTitle[1024];
      char articleDescription[1024];
      char articleURL[1024];
      articleTitle[0] = articleDescription[0] = articleURL[0] = '\0';
      
      while (GetNextTag(st, htmlTag, sizeof(htmlTag)) && (strcasecmp(htmlTag, kItemEndTag) != 0)) {
        if (strncasecmp(htmlTag, kTitleTagPrefix, strlen(kTitleTagPrefix)) == 0) ExtractElement(st, htmlTag, articleTitle, sizeof(articleTitle));
        if (strncasecmp(htmlTag, kDescriptionTagPrefix, strlen(kDescriptionTagPrefix)) == 0) ExtractElement(st, htmlTag, articleDescription, sizeof(articleDescription));
        if (strncasecmp(htmlTag, kLinkTagPrefix, strlen(kLinkTagPrefix)) == 0) ExtractElement(st, htmlTag, articleURL, sizeof(articleURL));
      }
    
      if (strncmp(articleURL, "", sizeof(articleURL)) == 0) return;     // punt, since it's not going to take us anywhere
      //printf("before parse article \n");
      ParseArticle(articleTitle, articleDescription, articleURL, stopwords,
      seenurls,words);
    }
    
    /* 
     * Function: ExtractElement
     * ------------------------
     * Potentially pulls text from the stream up through and including the matching end tag.  It assumes that
     * the most recently extracted HTML tag resides in the buffer addressed by htmlTag.  The implementation
     * populates the specified data buffer with all of the text up to but not including the opening '<' of the
     * closing tag, and then skips over all of the closing tag as irrelevant.  Assuming for illustration purposes
     * that htmlTag addresses a buffer containing "<description" followed by other text, these three scenarios are
     * handled:
     *
     *    Normal Situation:     <description>http://some.server.com/someRelativePath.html</description>
     *    Uncommon Situation:   <description></description>
     *    Uncommon Situation:   <description/>
     *
     * In each of the second and third scenarios, the document has omitted the data.  This is not uncommon
     * for the description data to be missing, so we need to cover all three scenarious (I've actually seen all three.)
     * It would be quite unusual for the title and/or link fields to be empty, but this handles those possibilities too.
     */
     
    static void ExtractElement(streamtokenizer *st, const char *htmlTag, char dataBuffer[], int bufferLength)
    {
      assert(htmlTag[strlen(htmlTag) - 1] == '>');
      if (htmlTag[strlen(htmlTag) - 2] == '/') return;    // e.g. <description/> would state that a description is not being supplied
      STNextTokenUsingDifferentDelimiters(st, dataBuffer, bufferLength, "<");
      RemoveEscapeCharacters(dataBuffer);
      if (dataBuffer[0] == '<') strcpy(dataBuffer, "");  // e.g. <description></description> also means there's no description
      STSkipUntil(st, ">");
      STSkipOver(st, ">");
    }
    
    /*
    static void WordFreqFree (void * thiselem){
      free ((wordfreqstruct*)thiselem);
      }*/
    
    
    
    /** 
     * Function: ParseArticle
     * ----------------------
     * Attempts to establish a network connect to the news article identified by the three
     * parameters.  The network connection is either established of not.  The implementation
     * is prepared to handle a subset of possible (but by far the most common) scenarios,
     * and those scenarios are categorized by response code:
     *
    
     */
    static void ParseArticle(const char *articleTitle, const char
    			   *articleDescription, const char *articleURL,
    			 hashset* stopwords, vector* seenurls, hashset
    			 *words)
    {
      //printf("gets into parse article \n");
      url u;
      urlconnection urlconn;
      streamtokenizer st; 
      article thisarticle;
      int apos;
    
      URLNewAbsolute(&u, articleURL);
      URLConnectionNew(&urlconn, &u);
      thisarticle.title = strdup(articleTitle);
      // printf("strdup of article title worked \n");
      thisarticle.urlstring = strdup(u.fullName);
      char* server = strdup(u.serverName);
      thisarticle.server = strdup(server);
     
      //printf("after urlconnectionnew \n");
      //for the case in which the article has already been populated into our
      //vector of articles
    
    
      switch (urlconn.responseCode) {
      case 0: printf("Unable to connect to \"%s\".  Domain name or IP address is nonexistent.\n", articleURL);
        break;
      case 200: 
        
        apos = VectorSearch(seenurls, &thisarticle,ArticleCompare,0,
        false);
        if (apos!=-1){
          thisarticle.articleindex = apos;
          printf("after articleoffset * \n");
          printf("[Ignoring \"%s\": we've seen it before.] ",thisarticle.title);
          return;
        }
        //If the article doesn't already exist, append it to teh vector of seenurls.
        printf("[%s] Indexing \"%s\" \n", thisarticle.server, thisarticle.title);
        thisarticle.articleindex = VectorLength(seenurls);   
        VectorAppend(seenurls, &thisarticle);
    
    
        STNew(&st, urlconn.dataStream, kTextDelimiters, false);
        //printf("gets bo efore call scanarticle %d\n", *(articleoffset));
        ScanArticle (&st, articleTitle, articleDescription,
    		articleURL, stopwords, seenurls, words,
    		&thisarticle               );
        STDispose(&st);
        
        //printf("before updatearticle veoctr \n");
        
        break;
      case 301:
      case 302: // just pretend we have the redirected URL all along, though
    	    // index using the new URL and not the old one...
        ParseArticle(articleTitle, articleDescription, urlconn.newUrl,
    		 stopwords, seenurls, words               );
        break;
      default: printf("Unable to pull \"%s\" from \"%s\". [Response code: %d] Punting...\n", articleTitle, u.serverName, urlconn.responseCode);
        break;
      }
      ;
      URLConnectionDispose(&urlconn);
      URLDispose(&u);
    }
    
    
    /**
     * Function: ScanArticle
     * ---------------------
     * Parses the specified article, skipping over all HTML tags, and counts the numbers
     * of well-formed words that could potentially serve as keys in the set of indices.
     * Once the full article has been scanned, the number of well-formed words is
     * printed, and the longest well-formed word we encountered along the way
     * is printed as well.
     *
     * This is really a placeholder implementation for what will ultimately be
     * code that indexes the specified content.
     */	       
    static void ScanArticle(streamtokenizer *st, const char *articleTitle,
    			const char *unused, const char *articleURL,
    			hashset *stopwords, vector* seenurls, hashset
    			*words, article* thisarticle                    )
    { 
    
      char word[1024];
      while (STNextToken(st, word, sizeof(word))) {
        if (strcasecmp(word, "<") == 0) {
           SkipIrrelevantContent(st); // in html-utls.h
        } else {
          RemoveEscapeCharacters(word);
          if (WordIsWellFormed(word)) {
    	printf("_________________this line indicates a new word______________________\n");
    	//printf("----- %s", word);
    	UpdateIndices(word, words, stopwords, seenurls, thisarticle);
          }
        }
      }
    }
    
    
    
    //UpdateIndices updates the words into the words hashset and updates the
    //vector with the article information. Due to the sequential nature of all
    //the updates, I decided not to decomp this function ( initially did, but
    //things to messy).
    
    static void UpdateIndices(char* word, hashset *words, hashset
    			  *stopwords, vector *seenurls, article*
    			  thisarticle                    ){  
      wordstruct dummy;
      dummy.word = word;
      const void* addr = HashSetLookup(stopwords, &word);
      if(addr!=NULL)printf (" - Stop word - \n");
      if (addr == NULL){
        printf("we have a stop word \n");
        void*bucketadd = HashSetLookup(words,&dummy);
        printf("after bucket add \n");
        if(bucketadd==NULL){
          HashSetEnter(words,&dummy);
          printf("after hash set enter \n");
          wordstruct*newbucket = (wordstruct*)bucketadd;
          printf("seg \n");
          //newbucket->wordfreq = malloc(sizeof(vector));
          VectorNew(newbucket->wordfreq,sizeof(wordfreqstruct),NULL,INIT_VECT_ALLOCATION);
          printf("seg fault \n");
          wordfreqstruct one;
          printf("after wordfreqstruct one \n");
          one.articleindex = thisarticle->articleindex;
          one.count =1 ;
          VectorAppend(newbucket->wordfreq,&one);
        }else{
          wordstruct *oldbucket = (wordstruct*) bucketadd;
          vector*ptr = oldbucket->wordfreq;
          wordfreqstruct local;
          local.articleindex = thisarticle->articleindex;
          int pos = VectorSearch(ptr,&local,WordFreqCompare,0,false);
          wordfreqstruct*wfs = (wordfreqstruct*)ptr;
          wfs +=sizeof(wordfreqstruct) * pos;
          wfs->count++;
        }
      }
    }
      */
    /** 
     * Function: QueryIndices
     * ----------------------
     * Standard query loop that allows the user to specify a single search term, and
     * then proceeds (via ProcessResponse) to list up to 10 articles (sorted by relevance)
     * that contain that word.
     */
    
    static void QueryIndices(hashset* words, vector* seenurls, hashset* stopwords)
    {
      char response[1024];
      while (true) {
        printf("Please enter a single query term that might be in our set of indices [enter to quit]: ");
        fgets(response, sizeof(response), stdin);
        response[strlen(response) - 1] = '\0';
        if (strcasecmp(response, "") == 0) break;
        ProcessResponse(response, words, seenurls, stopwords);
      }
    }
    
    
    static void GetArticlesContainingWord(char *word, hashset *words,
    				      vector* seenurls, void* wordarr){
      printf("00000000000000000000000000000 here is hash set size of words %d \n", HashSetCount(words));
    
      if (wordarr == NULL) {
        printf("None of  today's news articles contain the word  \"%s\".\n",
    	   word);
      }else{
        wordstruct* inherit = (wordstruct*)wordarr;
        wordfreqstruct *a = (wordfreqstruct*) inherit->wordfreq;
        printf("Nice! we found %d articles that include the word \"%s\".",
    	   VectorLength(inherit->wordfreq), word);
        for(int i=0; i < VectorLength(inherit->wordfreq); i++){
          int index = (*a).articleindex;
          printf("here is the index %d \n", index);
          article* supt = (article*) seenurls;    
          supt+=sizeof(article)*index;
          printf("%d.) \"%s\" [search term occurs %d times] \n",
    	     i,supt->title, (*a).count);
          printf("\"%s\" \n", supt->urlstring);
          a+=sizeof(wordfreqstruct);
        }
      }
    }
    
    
    
    /*
     * Function: ProcessResponse
     * -------------------------
     * Placeholder implementation for what will become the search of a set of indices
     * for a list of web documents containing the specified word.
     */
    static void ProcessResponse(char *word, hashset *words, vector*
    seenurls, hashset *stopwords)
    {
      if (WordIsWellFormed(word)) {
        printf("word is well formed \n");
        wordstruct* thisstruct;
        thisstruct->word = word;
        //look for word inside of words hashset 
        if(HashSetLookup(stopwords, &word)!=NULL){
          printf("Too common a word to be taken seriously. Try something more specific. \n");
          return;
        }
        printf("before hash set loookkup \n");
         printf("00000000000000000000000000000 here is hash set size of words %d \n", HashSetCount(words));
        void* wordarr = HashSetLookup (words, &thisstruct);
        printf("after hashsetlookup \n");
        if (wordarr == NULL) {
          printf("None of  today's news articles contain the word  \"%s\".\n",
    	     word);
          return;
        }
        GetArticlesContainingWord(word, words, seenurls, wordarr);
      }
      else {
        printf("We won't be allowing words like \"%s\" into our set of indices.\n", word);
      }
    }
    
    /**
     * Predicate Function: WordIsWellFormed
     * ------------------------------------
     * Before we allow a word to be inserted into our map
     * of indices, we'd like to confirm that it's a good search term.
     * One could generalize this function to allow different criteria, but
     * this version hard codes the requirement that a word begin with 
     * a letter of the alphabet and that all letters are either letters, numbers,
     * or the '-' character.  
     */
    
    static bool WordIsWellFormed(const char *word)
    {
      int i;
      if (strlen(word) == 0) return true;
      if (!isalpha((int) word[0])) return false;
      for (i = 1; i < strlen(word); i++)
        if (!isalnum((int) word[i]) && (word[i] != '-')) return false; 
    
      return true;
    }

  2. #2
    a_capitalist_story
    Join Date
    Dec 2007
    Posts
    2,652
    Looks to me like newbucket is going to be NULL when you try to access its wordfreq member in the VectorNew call...

    Code:
    if(bucketadd==NULL){
        HashSetEnter(words,&dummy);
        printf("after hash set enter \n");
        wordstruct*newbucket = (wordstruct*)bucketadd;
        printf("seg \n");
        VectorNew(newbucket->wordfreq,sizeof(wordfreqstruct),NULL,INIT_VECT_ALLOCATION);

  3. #3
    Registered User
    Join Date
    Oct 2008
    Location
    CA
    Posts
    19
    i tried doing the malloc and allocating memory on the heap but that didnt work

    i think the issue is instantiating the struct

    basically what i have is a pointer to a struct called wordstruct which contains a vector* called wordfreq which points to a vector of wordfreqstructs

    here is the code i'm iffy on (this is my potential rewrite of updateindices that also doesn;t work)

    <<untagged code deleted>>
    Last edited by Salem; 10-25-2008 at 03:13 PM. Reason: Couldn't be arsed to tag the code (just like the poster), so I deleted it.

Popular pages Recent additions subscribe to a feed

Similar Threads

  1. Getting a seg fault
    By ammochck21 in forum C Programming
    Replies: 11
    Last Post: 01-23-2009, 05:27 AM
  2. Seg Fault in Compare Function
    By tytelizgal in forum C Programming
    Replies: 1
    Last Post: 10-25-2008, 04:06 PM
  3. Replies: 3
    Last Post: 10-15-2008, 10:24 AM
  4. weird seg fault
    By Vermelho in forum C Programming
    Replies: 3
    Last Post: 05-10-2008, 09:27 PM
  5. Seg Fault Problem
    By ChazWest in forum C++ Programming
    Replies: 2
    Last Post: 04-18-2002, 04:24 PM

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21