Segfault in Hash table implementation

**Jon Christan** · 03-25-2017

Hi, I'm new here. I have a problem with my hash table implementation. It's causing a segmentation fault. I know what a segfault is but I'm having a hard time identifying why it is occurring. Below is the code I have.

This is the Header file for the hash table:

Code:

#ifndef __ARCHIVE_H__
#define __ARCHIVE_H__

typedef union entry Entry;
typedef struct archive Archive;


Archive* arcreate (size_t);
void ardelete (Archive*);
void addEntry (Archive*, char*, char*, char*, unsigned int);
void removeEntry (Archive*, char*);
Entry* getEntry (Archive*, char*);

#endif

And here is Archive.c:

Code:

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>


#include "archive.h"


union entry {
    char* key;
    char* type;
    char* value;
    unsigned int line;
    union entry* next;
};


struct archive {
    size_t size;
    size_t curitems;
    union entry** items;
    struct archive* prev;
};


Archive* arcreate (size_t size) {
    Archive* resource;
    if (size < 1) {
        return NULL;
    }
    resource = malloc(sizeof(Archive));
    assert(resource != 0);
    resource->size = size;
    resource->curitems = 0;
    resource->items = calloc(size, sizeof(Entry*));
    assert(resource->items != 0);
    resource->prev = NULL;
    return resource;
}


void ardelete (Archive* resource) {
    Entry* ent;
    Entry* next;
    assert(resource);
    for (size_t i = 0; i < resource->size; i++) {
        for (ent = resource->items[i]; ent != 0; ent = next) {
            next = ent->next;
            free(ent->key);
            free(ent->type);
            free(ent->value);
            free(ent);
        }
    }
    free(resource->items);
    free(resource);
    return;
}


static unsigned long hash (char* key) {
    unsigned char* ukey;
    unsigned long node = 0;
    for (ukey = (unsigned char*) key; * ukey; ukey++) {
        node = (node >> 2) + * ukey;
    }
    return node;
}


static void grow (Archive* rsrc) {
    assert(rsrc);
    Archive* temp;
    Archive swap;
    Entry* ent;
    temp = arcreate (rsrc->size * 2);
    for (size_t i = 0; i < rsrc->size; i++) {
        for (ent = rsrc->items[i]; ent != 0; ent = ent->next) {
            addEntry(temp, ent->key, ent->type, ent->value, ent->line);
        }
    }
    swap = * rsrc;
    * rsrc = * temp;
    * temp = swap;
    ardelete(temp);
    return;
}
void addEntry(Archive* rsrc, char* key, char* type, char* value, unsigned int line) {
    assert(rsrc);
    Entry* ent;
    unsigned long node;
    assert(key); assert(type);
    assert(value); assert(line);
    ent = malloc(sizeof(Entry));
    assert(ent);
    ent->key = strdup(key);
    ent->type = strdup(type);
    ent->value = strdup(value);
    ent->line = line;
    node = hash(key) % rsrc->size;
    ent->next = rsrc->items[node];
    rsrc->items[node] = ent;
    rsrc->curitems++;
    if (rsrc->curitems >= rsrc->size * 1) {
        grow(rsrc);
    }
    return;
}


void removeEntry (Archive* rsrc, char* key) {
    Entry** prev;
    Entry* ent;
    for (prev = &(rsrc->items[hash(key) % rsrc->size]); * prev != 0; prev = &(* prev)->next) {
        if (!strcmp((* prev)->key, key)) {
            ent = * prev;
            * prev = ent->next;
            free(ent->key);
            free(ent->type);
            free(ent->value);
            free(ent);
        }
    }
    return;
}


Entry* getEntry (Archive* rsrc, char* key) {
    Entry* ent;
    unsigned long node;
    node = hash(key) % rsrc->size;
    for (ent = rsrc->items[node]; ent != 0; ent = ent->next) {
        if (!strcmp(ent->key, key)) {
            return ent;
        }
    }
    return NULL;
}


int exists (Archive* rsrc, char* key) {
    printf("checking if exists\n");
    Entry* ent;
    for (ent = rsrc->items[hash(key) % rsrc->size]; ent != 0; ent = ent->next) {
        printf("searching\n");
        if (!strcmp(ent->key, key)) {
            printf("found\n");
            return 1;
        }
    }
    return 0;
}

And finally Main.c:

Code:

#include <stdio.h>
#include <stdlib.h>


#include "archive.h"


int main() {
    Archive* resource;
    resource = arcreate(AR_SIZE);
    addEntry(resource, "OPEN", "KEYWORD", "open", 1);
    if (exists(resource, "OPEN")) {
        printf("confirmed.\n");
    }
    ardelete(resource);
    return 0;
}

According to my debugger, the segfault is occurring the execution of the exist function during hashing for (ent = rsrc->items[hash(key) % rsrc->size]; ent != 0; ent = ent->next). If you can help me identify the source of why the segfault is occurring or even better, help me resolve the issue, I'd be very thankful. So, thank you in advance for your help.

**algorism** · 03-25-2017

entry shouldn't be a union!
Do you know what a union is?

**algorism** · 03-25-2017

So did that help you or what? entry needs to be a struct. If it's a union then every time you write to a different member it overwrites the previous one since union members are essentially all on top of each other.

Some more notes:

You traced the problem to the wrong line. It's the strcmp that's failing since ent->key was equal to 1, an invalid address (1 was the value of line, which was the last value you stored into the union).

Your hash function doesn't seem very good. It would make more sense to shift to the left instead of to the right (which throws away the lowest (most unique) bits). Something like node * 5 + *ukey would be better (5 in binary is 101 so multiplying by 5 is like adding the number to itself shifted left 2 positions). If your input strings are always uppercase then subtracting 'A' from *ukey would be best. If they are always alphabetic but of both cases, perhaps converting to uppercase first then subtracting 'A' would be best. Hash functions are tricky and in the end you need to see how many collisions you get with realistic input to tune the function properly.

AR_SIZE is not defined in the code you posted.

strdup is not a standard function.

"Archive" is a strange name for a "HashTable". An archive is something that is stored (e.g., on disk) and kept for a long time.

Putting a space before the asterisk used for indirection is a very unusual spacing choice.

It is unusual to have a return at the end of a void function.

What is the * 1 for here?

Code:

    if (rsrc->curitems >= rsrc->size * 1)

You are assuming that calloc will set pointers to NULL, but that is not entirely portable (although it is essentially portable). It's considered more correct to loop through the elements setting them to NULL.

It is a hack to use assert to test if a malloc/calloc/realloc/strdup succeeded. assert is for debugging. Allocations can fail during normal execution. And remember that strdup allocates memory and returns NULL if it fails, so it needs to be checked as well. You might want to make xmalloc and xstrdup functions that exit if the allocation fails.

Code:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

void *xmalloc(size_t sz) {
    void *p = malloc(sz);
    if (!p) {
        perror("xmalloc");
        exit(EXIT_FAILURE);
    }
    return p;    
}

char *xstrdup(const char *s) {
    return strcpy(xmalloc(strlen(s) + 1), s);
}

**Jon Christan** · 03-25-2017

Originally Posted by algorism

So did that help you or what? entry needs to be a struct. If it's a union then every time you write to a different member it overwrites the previous one since union members are essentially all on top of each other.

Some more notes:

You traced the problem to the wrong line. It's the strcmp that's failing since ent->key was equal to 1, an invalid address (1 was the value of line, which was the last value you stored into the union).

Your hash function doesn't seem very good. It would make more sense to shift to the left instead of to the right (which throws away the lowest (most unique) bits). Something like node * 5 + *ukey would be better (5 in binary is 101 so multiplying by 5 is like adding the number to itself shifted left 2 positions). If your input strings are always uppercase then subtracting 'A' from *ukey would be best. If they are always alphabetic but of both cases, perhaps converting to uppercase first then subtracting 'A' would be best. Hash functions are tricky and in the end you need to see how many collisions you get with realistic input to tune the function properly.

AR_SIZE is not defined in the code you posted.

strdup is not a standard function.

"Archive" is a strange name for a "HashTable". An archive is something that is stored (e.g., on disk) and kept for a long time.

Putting a space before the asterisk used for indirection is a very unusual spacing choice.

It is unusual to have a return at the end of a void function.

What is the * 1 for here?

Code:

    if (rsrc->curitems >= rsrc->size * 1)

You are assuming that calloc will set pointers to NULL, but that is not entirely portable (although it is essentially portable). It's considered more correct to loop through the elements setting them to NULL.

It is a hack to use assert to test if a malloc/calloc/realloc/strdup succeeded. assert is for debugging. Allocations can fail during normal execution. And remember that strdup allocates memory and returns NULL if it fails, so it needs to be checked as well. You might want to make xmalloc and xstrdup functions that exit if the allocation fails.

Code:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

void *xmalloc(size_t sz) {
    void *p = malloc(sz);
    if (!p) {
        perror("xmalloc");
        exit(EXIT_FAILURE);
    }
    return p;    
}

char *xstrdup(const char *s) {
    return strcpy(xmalloc(strlen(s) + 1), s);
}

Thank you for your gracious reply. I'm sorry I'm late getting back to this. Your reply fixed everything and then some. One problem (because I've never really used unions before) was I thought unions made different instances like structs but it just compacted the data. Obviously by your reply this isn't true. I realize some of my syntax writing isn't exactly common, it's just easier for me to understand how things are related and so on. Do you have any other suggestions to how I can improve my hash table? I've taken everything into account and made due changes, but I'm wonder if there is more I could do to increase it's portability or its runtime efficiency.

**algorism** · 03-26-2017

What exactly do your keys look like?

Your exists function is essentially the same as getEntry. I.e., it could be written:

Code:

int exists(HashTable *ht, const char *key) { return getEntry(ht, key) != NULL; }

Or you could just use the getEntry function instead.

You probably shouldn't grow the table when curitems == size but maybe when it's twice the size or something like that.

You forgot to decrement curitems and return after removing a member.

I'm not sure what you're using prev for (in Archive). Some sort of yet-to-be-implemented hash table chaining?

Here's a quick rewrite. Mostly name changes and some statistics. The hash function currently assumes that the keys are purely alphabetic. The includes of "hashtable.h" and "util.h" are commented out so the code can be copied/pasted/and run in one file. To use the conditional code (#if 0) change the 0 to 1.

Code:

// util.h //////////////////////////////////////////////

#ifndef UTIL_H__
#define UTIL_H__

#include <stddef.h> // size_t

void die(const char *msg);
void *xmalloc(size_t sz);
char *xstrdup(const char *s);

#endif


// util.c //////////////////////////////////////////////

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
//#include "util.h"

void die(const char *msg) {
    if (msg[0] == '!')
        perror(msg + 1);
    else
        fprintf(stderr, "Error: %s\n", msg);
    exit(EXIT_FAILURE);
}

void *xmalloc(size_t sz) {
    void *p = malloc(sz);
    if (!p) die("!xmalloc");
    return p;    
}

char *xstrdup(const char *s) {
    return strcpy(xmalloc(strlen(s) + 1), s);
}


// hashtable.h //////////////////////////////////////////////

#ifndef HASHTABLE_H__
#define HASHTABLE_H__

#include <stddef.h> // size_t

typedef struct HTEntry HTEntry;
typedef struct HashTable HashTable;

HashTable *ht_create (size_t);
void ht_delete(HashTable*);
void ht_add(HashTable*, const char*, const char*,
           const char*, unsigned int);
void ht_remove(HashTable*, const char*);
HTEntry *ht_get(HashTable*, const char*);

#endif


// hashtable.c //////////////////////////////////////////////

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
//#include "hashtable.h"
//#include "util.h"

struct HTEntry {
    char *key, *type, *value;
    unsigned int line;
    struct HTEntry *next;
};

struct HashTable {
    size_t size, num_items;
    struct HTEntry **items;
    struct HashTable *prev;
};

static unsigned long hash (const char* key) {
    unsigned long sum = 0;
    for ( ; *key; key++) {
        if (!isalpha(*key)) die("bad key");
        sum = (sum * 65599) ^ (toupper(*key) - 'A');
    }
    return sum;
}

static void grow(HashTable *ht) {
    HashTable *temp, swap;
    HTEntry *ent;
    temp = ht_create(ht->size * 2);
    for (size_t i = 0; i < ht->size; i++)
        for (ent = ht->items[i]; ent; ent = ent->next)
            ht_add(temp, ent->key, ent->type, ent->value,
                   ent->line);
    swap = *ht;
    *ht = *temp;
    *temp = swap;
    ht_delete(temp);
}

HashTable *ht_create(size_t size) {
    HashTable *ht;
    if (size < 1) return NULL;
    ht = xmalloc(sizeof(HashTable));
    ht->size = size;
    ht->num_items = 0;
    ht->items = xmalloc(size * sizeof(HTEntry*));
    for (size_t i = 0; i < size; i++)
        ht->items[i] = NULL;
    ht->prev = NULL;
    return ht;
}

void ht_delete(HashTable* ht) {
    HTEntry *ent, *next;
    for (size_t i = 0; i < ht->size; i++)
        for (ent = ht->items[i]; ent; ent = next) {
            next = ent->next;
            free(ent->key);
            free(ent->type);
            free(ent->value);
            free(ent);
        }
    free(ht->items);
    free(ht);
}

void ht_add(HashTable   *ht,
            const char  *key,
            const char  *type,
            const char  *value,
            unsigned int line) {
    assert(key);
    assert(type);
    assert(value);

    HTEntry *ent = xmalloc(sizeof(HTEntry));
    ent->key = xstrdup(key);
    ent->type = xstrdup(type);
    ent->value = xstrdup(value);
    ent->line = line;

    unsigned long elem = hash(key) % ht->size;
    ent->next = ht->items[elem];
    ht->items[elem] = ent;
    ht->num_items++;
    
    if (ht->num_items > ht->size * 2)
        grow(ht);
}

void ht_remove(HashTable *ht, const char *key) {
    HTEntry **prev, *ent;
    unsigned long elem = hash(key) % ht->size;
    for (prev = &ht->items[elem]; *prev; prev = &(*prev)->next) {
        if (!strcmp((*prev)->key, key)) {
            ent = *prev;
            *prev = ent->next;
            free(ent->key);
            free(ent->type);
            free(ent->value);
            free(ent);

            ht->num_items--;
            return;
        }
    }
}

HTEntry *ht_get(HashTable *ht, const char *key) {
    unsigned long elem = hash(key) % ht->size;
    for (HTEntry *ent = ht->items[elem]; ent; ent = ent->next)
        if (!strcmp(ent->key, key))
            return ent;
    return NULL;
}

void ht_stats(HashTable *ht, int show_detail) {
    size_t used = 0, maxcnt = 0;
    for (size_t i = 0; i < ht->size; i++) {
        if (show_detail) printf("%4zu:\n", i);
        HTEntry *e = ht->items[i];
        if (!e) {
            if (show_detail) printf("    null\n");
        }
        else {
            size_t cnt = 0;
            used++;
            for ( ; e; e = e->next) {
                cnt++;
                if (show_detail)
                    printf("    key: %s\n", e->key);
            }
            if (cnt > maxcnt) maxcnt = cnt;
        }
    }
    printf("size:      %4zu\n", ht->size);
    printf("num_items: %4zu\n", ht->num_items);
    printf("Used buckets: %zu out of %zu\n", used, ht->size);
    printf("Max bucket size: %zu\n", maxcnt);
    printf("Avg per used bucket: %f\n",
           (double)ht->num_items / used);
}


// main.c ///////////////////////////////////////////////////
#include <stdio.h>
#include <stdlib.h>
//#include "hashtable.h"

#define HT_SIZE 10

int main() {
    const char *s[] = {
        "ALPHA", "BETA", "GAMMA", "DELTA",
        "EPSILON", "ZETA", "ETA", "THETA",
        "IOTA", "KAPPA", "LAMBDA", "MU",
        "NU", "XI", "OMICRON", "PI",
        "RHO", "SIGMA", "TAU", "UPSILON",
        "PHI", "CHI", "PSI", "OMEGA",
#if 0
        "HYDROGEN", "HELIUM", "LITHIUM", "BERYLLIUM",
        "BORON", "CARBON", "NITROGEN", "OXYGEN",
        "FLUORINE", "NEON", "SODIUM", "MAGNESIUM",
        "ALUMINUM", "SILICON", "PHOSPHORUS", "SULFUR",
        "CHLORINE", "ARGON",
#endif
    };
    size_t sz = sizeof s / sizeof s[0];

    HashTable *ht = ht_create(HT_SIZE);

    for (size_t i = 0; i < sz; i++)
        ht_add(ht, s[i], "xxx", "yyy", 1);

// Delete every second string in s
#if 0
    for (size_t i = 0; i < sz; i += 2)
        ht_remove(ht, s[i]);
#endif

    ht_stats(ht, 1);

    ht_delete(ht);

    return 0;
}

Thread: Segfault in Hash table implementation

Thread Tools

Search Thread

Display

Segfault in Hash table implementation

Thanks

Similar Threads

Dynamic Hash Table Implementation?!?

Dynamic Hash Table Implementation

Debugging Hash Table Implementation

Hash Table implementation

hash table data structure implementation

Tags for this Thread