Non-whitespace in-file sequences counter using own hashtable

**Wido** · 11-17-2022

main.c:

Code:

#include <stdio.h>
#include <stdlib.h> 
#include <stdbool.h>
#include <iso646.h>
#include <string.h>
#include <ctype.h>
#include <stdint.h>
#include <time.h>

// exit on failure                           
#define FAIL return EXIT_FAILURE
#define MAX_WORD_LENGTH_PRIME 251 // some high word length
#define LAST_ELEMENT "<END_OF_STRING>" // do not use in text
#define EMPTY_ELEMENT "<EMPTY_SYMBOL>" // do not use in text
#define MIN_PRIME_NUMBER 2

// remainders for each letter code in hash function computation
size_t remainders[MAX_WORD_LENGTH_PRIME];

// hash table entry
typedef struct
{
    char word[MAX_WORD_LENGTH_PRIME]; // 8-bit UTF-8
    size_t count;
} element; // to use declaration without struct keyword over all source

typedef struct
{
    size_t index;
    size_t count;
} found_element;

typedef struct
{
    bool result;
    bool is_error;
    size_t index;
} insert_status;

size_t hashtable_prime_size = MIN_PRIME_NUMBER; // starts from minimal prime size because of empty textfile existence probability (we do not got memory loss in this case)

void hashtable_content_print(element * hashtable)
{
    if (hashtable != NULL)
    {    
        element element_i;
        
        for(size_t i = 0; i< hashtable_prime_size; ++i)
        {
            element_i = hashtable[i];
            
            if (strcmp(element_i.word, EMPTY_ELEMENT) != 0 && strcmp(element_i.word, LAST_ELEMENT) != 0)
                printf("%s count is %zu\n", element_i.word, element_i.count);
        }
    }    
    else
        puts("NULL table pointer passed to print.");
}


void recalculate_remainders()
{
    for (size_t i = 0; i < MAX_WORD_LENGTH_PRIME; ++i)
        remainders[i] = rand() % MAX_WORD_LENGTH_PRIME;
}

size_t get_next_prime_number(size_t current_prime_number) // required to get new hashtable size on reconstruction (with rounds == 1 we will got minimal memory loss)
{
    size_t saved = current_prime_number;
    bool is_prime = true;
    
    if (current_prime_number >= MIN_PRIME_NUMBER)
        for(current_prime_number = current_prime_number + 1; current_prime_number < SIZE_MAX; ++current_prime_number)    
            {
                for(size_t j = 2; j < current_prime_number; ++j)
                    if (current_prime_number % j == 0)
                    {
                        is_prime = false;
                        
                        break;    
                    }
                    else
                        is_prime = true;
    
                if (is_prime)
                    break;        
            }
    
    return (current_prime_number == saved) ? 0 : current_prime_number;
}

// argc, argv checking
bool check_params(int params_count, char *params[])
{
    return (params_count == 2) and (params != NULL) and (params[1] != NULL);
}

size_t hash(char value[], size_t offset) // double hashing probe
{
    size_t result = 0;
    
    if (value != NULL and strlen(value) > 0)
        for (size_t i = 0; value[i] != '\0'; ++i)
            result += remainders[i] * value[i];
    else
        puts("NULL value pointer or empty value passed to hash function.");
    result = (result % hashtable_prime_size + offset * (1 + result % (hashtable_prime_size - 1))) % hashtable_prime_size;
    
    printf("word = %s; offset = %zu; hash = %zu\n", value, offset, result);
    
    return result;
}

bool hashtable_element_search(element * hashtable, char key[], found_element * result)
{    
    if ((hashtable != NULL) and (key != NULL))
    {
        puts("SEARCHING...");
        
        element found;
        size_t offset = 0, index;
        
        do        
            if (strcmp((found = hashtable[index = hash(key, offset++)]).word, key) == 0)    
            {    
                *result = (found_element) { index, found.count };
                
                printf("FOUND ELEMENT WORD = \"%s\" WITH INDEX = %zu AND COUNT = %zu\n", found.word, index, found.count);
                
                return true;            
            }
        while(offset != hashtable_prime_size);
    }
    else
        puts("NULL table pointer or NULL key pointer passed to search.");
    
    return false;
}

bool hashtable_size_increase(element * hashtable, size_t rounds) // rounds is a number of prime hashtable size rises
{    
    if (hashtable != NULL and rounds > 0)
        {    
            printf("INCREASING HASHTABLE USING %zu ROUNDS...", rounds);
            
            size_t new_size, offset = 0, index = 0, saved_size = 0;
        
            while(rounds-- != 0)
                saved_size = new_size = get_next_prime_number(hashtable_prime_size);
                
            printf("NEW PRIME SIZE = %zu\n", saved_size);
            
            element new_hashtable[new_size];
            memset(&new_hashtable, 0, sizeof(new_hashtable));
            hashtable_prime_size = new_size;
            new_hashtable[--new_size] = (element) { LAST_ELEMENT, 0 };
                        
            // cleaning array
            while(--new_size > 0)
                new_hashtable[new_size] = (element) { EMPTY_ELEMENT, 0 };
                
            new_hashtable[new_size] = (element) { EMPTY_ELEMENT, 0 };
            element found;    
            
            puts("FINDING EMPTY ELEMENT...");
            
            // empty element in new hashtable already exists
            while(strcmp((found = hashtable[new_size++]).word, LAST_ELEMENT) != 0)
                {    
                    printf("FINDING PLACE FOR WORD = \"%s\" WITH COUNT = %zu\n", found.word, found.count);
                     
                    if (strcmp(found.word, EMPTY_ELEMENT) == 0) // pass
                    {
                        puts("PASSED BECAUSE OF EMPTY");
                        
                        continue;
                    }
                    else
                    {
                        while(strcmp(new_hashtable[index = hash(found.word, offset++)].word, EMPTY_ELEMENT) != 0);
                        new_hashtable[index] = found;    
                        
                        printf("POSTED WITH INDEX = %zu INTO TEMP TABLE\n", index);
                    }
                }
            
            if(realloc(hashtable, saved_size * sizeof(element)))
            {
                memset(&hashtable, 0, sizeof(hashtable));
                
                if (memcpy(hashtable, new_hashtable, sizeof(new_hashtable)))
                {
                    puts("COPIED INTO NEW MEMORY LOCATION");
                
                    return true; // success
                }
            }
        }
        else
            puts("NULL pointer to hashtable or incorrect rounds count passed to increase function.");
            
    return false;
}

insert_status hashtable_element_insert(element * hashtable, element new_element)
{
    insert_status result = { false, true, 0 };
    
    if (new_element.word != NULL and strcmp(new_element.word, LAST_ELEMENT) != 0 and strcmp(new_element.word, EMPTY_ELEMENT) != 0) //validation
    {
        puts("INSERTING...");
        
        found_element search_output;
        result.is_error = false;
        
        if (hashtable_element_search(hashtable, new_element.word, &search_output))
        {    
            result.index = search_output.index;
            
            printf("STEP 1. FOUND AT INDEX = %zu. NOT INSERTED.", result.index);
            
            return result; // element already exists        
        }
        
        if (!hashtable_element_search(hashtable, EMPTY_ELEMENT, &search_output)) // key not found
        {    
            puts("STEP 2. EMPTY ELEMENT NOT FOUND.");
            
            hashtable_size_increase(hashtable, 1); // reconstruct hashtable to add empty elements            
            hashtable_element_search(hashtable, EMPTY_ELEMENT, &search_output);        
        }
        
        hashtable[search_output.index] = new_element; // rewrite empty element
        result.result = true;        
        
        printf("NEW ELEMENT WORD = \"%s\" FOUND WITH INDEX = %zu\n", new_element.word, search_output.index);
        
        return result;        
    }
    else
        puts("Incorrect new element word.");
    
    return result;
}

void hashtable_elements_fill(FILE * textfile, element * hashtable)
{    
    char word[MAX_WORD_LENGTH_PRIME];
    word[0] = '\0';
    bool is_word = false;
    char input;
    
    while(!feof(textfile) and !ferror(textfile))
    {
        input = getc(textfile);
        
        puts("FILLING...");
        
        if (!isspace(input)) // append to a whole UTF-8 word
        {            
            is_word = true;            
            strncat(word, &input, 1);                                     
        }
        else if (is_word)
        {            
            element new_element = { "", 1 };
            strcpy(new_element.word, word);
            
            puts("===WORD===");
            puts(word);
            puts("===END===");   
            
            insert_status output = hashtable_element_insert(hashtable, new_element);
            
            printf("INSERTED INDEX = %zu\n", output.index);
            
            if(output.is_error)
            {
                puts("OUTPUT.IS_ERROR = TRUE");
                
                continue; // invalid word
            }
            else if (!output.result)        
            {
                puts("OUTPUT.RESULT = FALSE");
                
                ++hashtable[output.index].count; // increasing found word counter    
            }
            
            word[0] = '\0';
            is_word = false;            
        }
    }    
    
    if (ferror(textfile))    
        perror("File reading error occured.");    
}

int main(int params_count, char *params[])
{            
    srand(time(NULL)); // setup randomizer

    if (!check_params(params_count, params))
    {
        puts("Params check failed. Program terminated.");
        
        FAIL;
    }
    
    char *filename;
    
    filename = params[1];                            
    FILE *textfile = fopen(filename, "r");

    if(textfile == NULL)           
    {    
        puts("Cannot read input file. Program terminated.");
        
        FAIL;
    }
    printf("FILENAME = %s", filename);
    recalculate_remainders();
    printf("HASHTABLE SIZE = %zu", MIN_PRIME_NUMBER * sizeof(element));
    element * hashtable = malloc(MIN_PRIME_NUMBER * sizeof(element)); 
    
    if (hashtable)
    {
        size_t counter = MIN_PRIME_NUMBER;
        hashtable[--counter] = (element){ LAST_ELEMENT, 0 };
        
        while(counter > 0)
            hashtable[--counter] = (element){ EMPTY_ELEMENT, 0 };
        // there are may be no words in a file at all (end element with NULL string always stored for barrier search)
        puts("===HASHTABLE CONTENT===");
        hashtable_content_print(hashtable);
        puts("===END===");
        hashtable_elements_fill(textfile, hashtable); 
        puts("===HASHTABLE CONTENT===");
        hashtable_content_print(hashtable);
        puts("===END===");
        free(hashtable);             
     }
     else
         perror("Cannot assign memory location.");
     
       fclose(textfile);       
    
    return EXIT_SUCCESS;
}

Trace from stdout:

$ ./a.out input.txt
FILENAME = input.txtHASHTABLE SIZE = 528===HASHTABLE CONTENT===
<EMPTY_SYMBOL> count is 0
<END_OF_STRING> count is 0
===END===
FILLING...
FILLING...
FILLING...
FILLING...
FILLING...
FILLING...
FILLING...
===WORD===
qwerty
===END===
INSERTING...
SEARCHING...
word = qwerty; offset = 0; hash = 0
word = qwerty; offset = 1; hash = 1
SEARCHING...
word = <EMPTY_SYMBOL>; offset = 0; hash = 1
word = <EMPTY_SYMBOL>; offset = 1; hash = 0
FOUND ELEMENT WORD = "<EMPTY_SYMBOL>" WITH INDEX = 0 AND COUNT = 0
NEW ELEMENT WORD = "qwerty" FOUND WITH INDEX = 0
INSERTED INDEX = 0
FILLING...
FILLING...
FILLING...
FILLING...
FILLING...
FILLING...
FILLING...
===WORD===
qwerty
===END===
INSERTING...
SEARCHING...
word = qwerty; offset = 0; hash = 0
FOUND ELEMENT WORD = "qwerty" WITH INDEX = 0 AND COUNT = 1
STEP 1. FOUND AT INDEX = 0. NOT INSERTED.INSERTED INDEX = 0
OUTPUT.RESULT = FALSE
FILLING...
FILLING...
FILLING...
FILLING...
FILLING...
FILLING...
FILLING...
===WORD===
qwerty
===END===
INSERTING...
SEARCHING...
word = qwerty; offset = 0; hash = 0
FOUND ELEMENT WORD = "qwerty" WITH INDEX = 0 AND COUNT = 2
STEP 1. FOUND AT INDEX = 0. NOT INSERTED.INSERTED INDEX = 0
OUTPUT.RESULT = FALSE
FILLING...
FILLING...
FILLING...
FILLING...
FILLING...
FILLING...
FILLING...
===WORD===
ytrewq
===END===
INSERTING...
SEARCHING...
word = ytrewq; offset = 0; hash = 1
word = ytrewq; offset = 1; hash = 0
SEARCHING...
word = <EMPTY_SYMBOL>; offset = 0; hash = 1
word = <EMPTY_SYMBOL>; offset = 1; hash = 0
STEP 2. EMPTY ELEMENT NOT FOUND.
INCREASING HASHTABLE USING 1 ROUNDS...NEW PRIME SIZE = 3
FINDING EMPTY ELEMENT...
FINDING PLACE FOR WORD = "qwerty" WITH COUNT = 3
word = qwerty; offset = 0; hash = 0
POSTED WITH INDEX = 0 INTO TEMP TABLE
Segmentation fault (core dumped)

I am writing a program that will be able to count words in a textfile input.txt with content:

qwerty qwerty qwerty ytrewq qazwsx hi dasf

Compilation:

gcc main.c -Wall -Wextra -pedantic -std=c11

I did not understand where is the bug... Now, I've posted all code here because of it. My program crashes after first resizement. Please, tell anything about.

Thread: Non-whitespace in-file sequences counter using own hashtable

Thread Tools

Search Thread

Display

Threaded View

Non-whitespace in-file sequences counter using own hashtable

Similar Threads

Identify and count word sequences within a text file

Searching for a string in txt file with whitespace!

Page File counter and Private Bytes Counter

Interpreting literal escape sequences from a file...

parsing a FASTA file (protein sequences)

Tags for this Thread