Non-whitespace in-file sequences counter using own hashtable
main.c:
Code:
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <iso646.h>
#include <string.h>
#include <ctype.h>
#include <stdint.h>
#include <time.h>
// exit on failure
#define FAIL return EXIT_FAILURE
#define MAX_WORD_LENGTH_PRIME 251 // some high word length
#define LAST_ELEMENT "<END_OF_STRING>" // do not use in text
#define EMPTY_ELEMENT "<EMPTY_SYMBOL>" // do not use in text
#define MIN_PRIME_NUMBER 2
// remainders for each letter code in hash function computation
size_t remainders[MAX_WORD_LENGTH_PRIME];
// hash table entry
typedef struct
{
char word[MAX_WORD_LENGTH_PRIME]; // 8-bit UTF-8
size_t count;
} element; // to use declaration without struct keyword over all source
typedef struct
{
size_t index;
size_t count;
} found_element;
typedef struct
{
bool result;
bool is_error;
size_t index;
} insert_status;
size_t hashtable_prime_size = MIN_PRIME_NUMBER; // starts from minimal prime size because of empty textfile existence probability (we do not got memory loss in this case)
void hashtable_content_print(element * hashtable)
{
if (hashtable != NULL)
{
element element_i;
for(size_t i = 0; i< hashtable_prime_size; ++i)
{
element_i = hashtable[i];
if (strcmp(element_i.word, EMPTY_ELEMENT) != 0 && strcmp(element_i.word, LAST_ELEMENT) != 0)
printf("%s count is %zu\n", element_i.word, element_i.count);
}
}
else
puts("NULL table pointer passed to print.");
}
void recalculate_remainders()
{
for (size_t i = 0; i < MAX_WORD_LENGTH_PRIME; ++i)
remainders[i] = rand() % MAX_WORD_LENGTH_PRIME;
}
size_t get_next_prime_number(size_t current_prime_number) // required to get new hashtable size on reconstruction (with rounds == 1 we will got minimal memory loss)
{
size_t saved = current_prime_number;
bool is_prime = true;
if (current_prime_number >= MIN_PRIME_NUMBER)
for(current_prime_number = current_prime_number + 1; current_prime_number < SIZE_MAX; ++current_prime_number)
{
for(size_t j = 2; j < current_prime_number; ++j)
if (current_prime_number % j == 0)
{
is_prime = false;
break;
}
else
is_prime = true;
if (is_prime)
break;
}
return (current_prime_number == saved) ? 0 : current_prime_number;
}
// argc, argv checking
bool check_params(int params_count, char *params[])
{
return (params_count == 2) and (params != NULL) and (params[1] != NULL);
}
size_t hash(char value[], size_t offset) // double hashing probe
{
size_t result = 0;
if (value != NULL and strlen(value) > 0)
for (size_t i = 0; value[i] != '\0'; ++i)
result += remainders[i] * value[i];
else
puts("NULL value pointer or empty value passed to hash function.");
result = (result % hashtable_prime_size + offset * (1 + result % (hashtable_prime_size - 1))) % hashtable_prime_size;
printf("word = %s; offset = %zu; hash = %zu\n", value, offset, result);
return result;
}
bool hashtable_element_search(element * hashtable, char key[], found_element * result)
{
if ((hashtable != NULL) and (key != NULL))
{
puts("SEARCHING...");
element found;
size_t offset = 0, index;
do
if (strcmp((found = hashtable[index = hash(key, offset++)]).word, key) == 0)
{
*result = (found_element) { index, found.count };
printf("FOUND ELEMENT WORD = \"%s\" WITH INDEX = %zu AND COUNT = %zu\n", found.word, index, found.count);
return true;
}
while(offset != hashtable_prime_size);
}
else
puts("NULL table pointer or NULL key pointer passed to search.");
return false;
}
bool hashtable_size_increase(element * hashtable, size_t rounds) // rounds is a number of prime hashtable size rises
{
if (hashtable != NULL and rounds > 0)
{
printf("INCREASING HASHTABLE USING %zu ROUNDS...", rounds);
size_t new_size, offset = 0, index = 0, saved_size = 0;
while(rounds-- != 0)
saved_size = new_size = get_next_prime_number(hashtable_prime_size);
printf("NEW PRIME SIZE = %zu\n", saved_size);
element new_hashtable[new_size];
memset(&new_hashtable, 0, sizeof(new_hashtable));
hashtable_prime_size = new_size;
new_hashtable[--new_size] = (element) { LAST_ELEMENT, 0 };
// cleaning array
while(--new_size > 0)
new_hashtable[new_size] = (element) { EMPTY_ELEMENT, 0 };
new_hashtable[new_size] = (element) { EMPTY_ELEMENT, 0 };
element found;
puts("FINDING EMPTY ELEMENT...");
// empty element in new hashtable already exists
while(strcmp((found = hashtable[new_size++]).word, LAST_ELEMENT) != 0)
{
printf("FINDING PLACE FOR WORD = \"%s\" WITH COUNT = %zu\n", found.word, found.count);
if (strcmp(found.word, EMPTY_ELEMENT) == 0) // pass
{
puts("PASSED BECAUSE OF EMPTY");
continue;
}
else
{
while(strcmp(new_hashtable[index = hash(found.word, offset++)].word, EMPTY_ELEMENT) != 0);
new_hashtable[index] = found;
printf("POSTED WITH INDEX = %zu INTO TEMP TABLE\n", index);
}
}
if(realloc(hashtable, saved_size * sizeof(element)))
{
memset(&hashtable, 0, sizeof(hashtable));
if (memcpy(hashtable, new_hashtable, sizeof(new_hashtable)))
{
puts("COPIED INTO NEW MEMORY LOCATION");
return true; // success
}
}
}
else
puts("NULL pointer to hashtable or incorrect rounds count passed to increase function.");
return false;
}
insert_status hashtable_element_insert(element * hashtable, element new_element)
{
insert_status result = { false, true, 0 };
if (new_element.word != NULL and strcmp(new_element.word, LAST_ELEMENT) != 0 and strcmp(new_element.word, EMPTY_ELEMENT) != 0) //validation
{
puts("INSERTING...");
found_element search_output;
result.is_error = false;
if (hashtable_element_search(hashtable, new_element.word, &search_output))
{
result.index = search_output.index;
printf("STEP 1. FOUND AT INDEX = %zu. NOT INSERTED.", result.index);
return result; // element already exists
}
if (!hashtable_element_search(hashtable, EMPTY_ELEMENT, &search_output)) // key not found
{
puts("STEP 2. EMPTY ELEMENT NOT FOUND.");
hashtable_size_increase(hashtable, 1); // reconstruct hashtable to add empty elements
hashtable_element_search(hashtable, EMPTY_ELEMENT, &search_output);
}
hashtable[search_output.index] = new_element; // rewrite empty element
result.result = true;
printf("NEW ELEMENT WORD = \"%s\" FOUND WITH INDEX = %zu\n", new_element.word, search_output.index);
return result;
}
else
puts("Incorrect new element word.");
return result;
}
void hashtable_elements_fill(FILE * textfile, element * hashtable)
{
char word[MAX_WORD_LENGTH_PRIME];
word[0] = '\0';
bool is_word = false;
char input;
while(!feof(textfile) and !ferror(textfile))
{
input = getc(textfile);
puts("FILLING...");
if (!isspace(input)) // append to a whole UTF-8 word
{
is_word = true;
strncat(word, &input, 1);
}
else if (is_word)
{
element new_element = { "", 1 };
strcpy(new_element.word, word);
puts("===WORD===");
puts(word);
puts("===END===");
insert_status output = hashtable_element_insert(hashtable, new_element);
printf("INSERTED INDEX = %zu\n", output.index);
if(output.is_error)
{
puts("OUTPUT.IS_ERROR = TRUE");
continue; // invalid word
}
else if (!output.result)
{
puts("OUTPUT.RESULT = FALSE");
++hashtable[output.index].count; // increasing found word counter
}
word[0] = '\0';
is_word = false;
}
}
if (ferror(textfile))
perror("File reading error occured.");
}
int main(int params_count, char *params[])
{
srand(time(NULL)); // setup randomizer
if (!check_params(params_count, params))
{
puts("Params check failed. Program terminated.");
FAIL;
}
char *filename;
filename = params[1];
FILE *textfile = fopen(filename, "r");
if(textfile == NULL)
{
puts("Cannot read input file. Program terminated.");
FAIL;
}
printf("FILENAME = %s", filename);
recalculate_remainders();
printf("HASHTABLE SIZE = %zu", MIN_PRIME_NUMBER * sizeof(element));
element * hashtable = malloc(MIN_PRIME_NUMBER * sizeof(element));
if (hashtable)
{
size_t counter = MIN_PRIME_NUMBER;
hashtable[--counter] = (element){ LAST_ELEMENT, 0 };
while(counter > 0)
hashtable[--counter] = (element){ EMPTY_ELEMENT, 0 };
// there are may be no words in a file at all (end element with NULL string always stored for barrier search)
puts("===HASHTABLE CONTENT===");
hashtable_content_print(hashtable);
puts("===END===");
hashtable_elements_fill(textfile, hashtable);
puts("===HASHTABLE CONTENT===");
hashtable_content_print(hashtable);
puts("===END===");
free(hashtable);
}
else
perror("Cannot assign memory location.");
fclose(textfile);
return EXIT_SUCCESS;
}
Trace from stdout:
Quote:
$ ./a.out input.txt
FILENAME = input.txtHASHTABLE SIZE = 528===HASHTABLE CONTENT===
<EMPTY_SYMBOL> count is 0
<END_OF_STRING> count is 0
===END===
FILLING...
FILLING...
FILLING...
FILLING...
FILLING...
FILLING...
FILLING...
===WORD===
qwerty
===END===
INSERTING...
SEARCHING...
word = qwerty; offset = 0; hash = 0
word = qwerty; offset = 1; hash = 1
SEARCHING...
word = <EMPTY_SYMBOL>; offset = 0; hash = 1
word = <EMPTY_SYMBOL>; offset = 1; hash = 0
FOUND ELEMENT WORD = "<EMPTY_SYMBOL>" WITH INDEX = 0 AND COUNT = 0
NEW ELEMENT WORD = "qwerty" FOUND WITH INDEX = 0
INSERTED INDEX = 0
FILLING...
FILLING...
FILLING...
FILLING...
FILLING...
FILLING...
FILLING...
===WORD===
qwerty
===END===
INSERTING...
SEARCHING...
word = qwerty; offset = 0; hash = 0
FOUND ELEMENT WORD = "qwerty" WITH INDEX = 0 AND COUNT = 1
STEP 1. FOUND AT INDEX = 0. NOT INSERTED.INSERTED INDEX = 0
OUTPUT.RESULT = FALSE
FILLING...
FILLING...
FILLING...
FILLING...
FILLING...
FILLING...
FILLING...
===WORD===
qwerty
===END===
INSERTING...
SEARCHING...
word = qwerty; offset = 0; hash = 0
FOUND ELEMENT WORD = "qwerty" WITH INDEX = 0 AND COUNT = 2
STEP 1. FOUND AT INDEX = 0. NOT INSERTED.INSERTED INDEX = 0
OUTPUT.RESULT = FALSE
FILLING...
FILLING...
FILLING...
FILLING...
FILLING...
FILLING...
FILLING...
===WORD===
ytrewq
===END===
INSERTING...
SEARCHING...
word = ytrewq; offset = 0; hash = 1
word = ytrewq; offset = 1; hash = 0
SEARCHING...
word = <EMPTY_SYMBOL>; offset = 0; hash = 1
word = <EMPTY_SYMBOL>; offset = 1; hash = 0
STEP 2. EMPTY ELEMENT NOT FOUND.
INCREASING HASHTABLE USING 1 ROUNDS...NEW PRIME SIZE = 3
FINDING EMPTY ELEMENT...
FINDING PLACE FOR WORD = "qwerty" WITH COUNT = 3
word = qwerty; offset = 0; hash = 0
POSTED WITH INDEX = 0 INTO TEMP TABLE
Segmentation fault (core dumped)
I am writing a program that will be able to count words in a textfile input.txt with content:
Quote:
qwerty qwerty qwerty ytrewq qazwsx hi dasf
Compilation:
Quote:
gcc main.c -Wall -Wextra -pedantic -std=c11
I did not understand where is the bug... Now, I've posted all code here because of it. My program crashes after first resizement. Please, tell anything about.