I have a program that reads sentences from a file. The sentence has a delimiter that separates text and and an integer. Each word in the sentence is saved to a struct along with keeping count of whether the word was used in a positive or negative context.
In the following code, I define the maximum number of rows to be read in size_t row_size. If row_size = 1024, no problem. If it is 4096, a segfault occurs. If it is 10000, no problem. Why is a segfault occurring only for certain sizes?
Please note that the realloc in the following code for row_size does not occur since the file being read currently only has ~150 sentences.
Code:
// load sentiment data from filename
int loadDataset(char *filename, struct sentiment_data *sd)
{
FILE *fp;
fp = fopen(filename, "r");
if(!fp) //open file
return 0;
size_t row_size = 10000;
char **row_str = NULL;
if((row_str = malloc(row_size*sizeof(char*)))==NULL) { //malloc array of row strings (sentences)
fclose(fp);
return 0;
}
int *row_int = NULL;
if((row_int = malloc(row_size*sizeof(int)))==NULL) { //malloc array of ints (each sentence sentiment)
free(row_str);
fclose(fp);
return 0;
}
char buf[4096];
size_t row_count = 0;
unsigned int field_count = 0;
sd->pos_sentence = 0;
sd->neg_sentence = 0;
while(fgets(buf, 4096, fp)) { //get line from file
char *field = strtok(buf, "+"); //delimit line with + symbol
while(field) {
if(field_count == 0) { //handle sentence
if((row_str[row_count] = malloc((strlen(field)+1)*sizeof(char)))==NULL) { //malloc for sentence
int j;
for(j=0; j<row_count; ++j)
free(row_str[j]);
free(row_str);
free(row_int);
fclose(fp);
return 0;
}
strcpy(row_str[row_count], field); //add sentence to row str
} else if(field_count == 1) { //handle sentence sentiment
row_int[row_count] = atoi(field); //add sentiment to row int
if(row_int[row_count] == 1) //add to sum of total pos/neg sentences
sd->pos_sentence +=1;
else
sd->neg_sentence +=1;
}
field = strtok(NULL, "+"); //delimit next line in file
++field_count;
}
++row_count;
field_count = 0;
if(row_count >= row_size) { //check if there is enough space for new sentences, if not, realloc
row_size += 4096;
char **row_str_tmp = NULL;
int *row_int_tmp= NULL;
if((row_str_tmp = realloc(row_str, row_size * sizeof(char*)))==NULL) {
int j;
for(j=0; j<row_count; ++j)
free(row_str[j]);
free(row_str);
free(row_int);
fclose(fp);
return 0;
}
if((row_int_tmp = realloc(row_int, row_size * sizeof(int)))==NULL) {
int j;
for(j=0; j<row_count; ++j)
free(row_str[j]);
free(row_str);
free(row_int);
fclose(fp);
return 0;
}
row_str = row_str_tmp;
row_int = row_int_tmp;
}
}
sd->word_count = 0;
sd->word_max = 4096;
if((sd->words = malloc(sd->word_max * sizeof(struct sentiment_word)))==NULL) { //malloc sd->words
int j;
for(j=0; j<row_count; ++j)
free(row_str[j]);
free(row_str);
free(row_int);
fclose(fp);
return 0;
}
field_count = 0;
unsigned int i;
initdelims(sd->delims); //initialize delimeters
for(i=0; i<row_count; ++i) {
char *field = strtok(row_str[i], sd->delims); //delimit each string in row_str, get each word
while(field) {
if(strlen(field)>2) { //only take words greater than 2 chars in size
char *p = field;
for ( ; *p; ++p) *p = tolower(*p); //make word lowercase
int index;
if((index = wordExist(field, sd)) != -1) { //check if word exists
//if((index = binaryWordExist(field, 0, sd->word_count, sd)) != -1) {
if(row_int[i] == 1)
++sd->words[index].pos; //if word is in positive sentence, add pos sentiment to word and vice versa
else
++sd->words[index].neg;
} else { //if word doesn't exist
unsigned long j = 0, k = 0;
while(sd->word_count > j && strcmp(field, sd->words[j++].word)>=0); //get index j where to insert word
unsigned int times_to_shift = sd->word_count - j;
for(k=0; k<times_to_shift; ++k) //move words for insertion
sd->words[sd->word_count-k] = sd->words[sd->word_count-k-1];
++(sd->word_count);
if((sd->words[j].word = malloc((strlen(field)+1)*sizeof(char)))==NULL) { //malloc for word
int x;
for(x=0; x<row_count; ++x)
free(row_str[j]);
for(x=0; x<sd->word_count; ++x)
if(x!=j)
free(sd->words[j].word);
free(sd->words);
free(row_str);
free(row_int);
fclose(fp);
return 0;
}
strcpy(sd->words[j].word, field); //add the word
if(row_int[i] == 1) { //set word sentiment
sd->words[j].pos = 1;
sd->words[j].neg = 0;
} else {
sd->words[j].pos = 0;
sd->words[j].neg = 1;
}
if(sd->word_count >= sd->word_max) { //check if sd->words is big enough, if not, realloc
sd->word_max += 4096;
struct sentiment_word *words_tmp = NULL;
if((words_tmp = realloc(sd->words, sd->word_max * sizeof(struct sentiment_word)))==NULL) {
int x;
for(x=0; x<row_count; ++x)
free(row_str[j]);
for(x=0; x<sd->word_count; ++x)
free(sd->words[j].word);
free(sd->words);
free(row_str);
free(row_int);
fclose(fp);
return 0;
}
sd->words = words_tmp;
}
}
}
field = strtok(NULL, sd->delims); //get next word
}
}
free(row_int); //free all malloc'd vars no longer needed
for(i=0; i<row_count; ++i)
free(row_str[i]);
free(row_str);
fclose(fp);
return 1;
}