Missing Entries in Hashtable

**wuzzo87** · 05-12-2007

OK guys,
I'm sorry this is such a big chunk of code.
My problem is that some entries seem to be missing after I insert them into my hashtable.

The input file is as follows.....

Code:

AA
AH
OW
AU
AA
AWE
AY
AYE
IY
EE
EH
EH
ER
ER
AY
EYE
OW
OH
OW
OW
OW
OWE
AH
UH
AE N AH M AH L
A N I M A L
AE N D IY AH N
A N D E A N
AE N T IY
A N T I
AE P AH TH IY
A P A TH Y
AE N T
A N T
AE NG K ER
A N CH OR
AE N D
A N D
AE M P AH L
A M P LE _
AE N T IY K
A N T I QUE
AE N
A N
AE M P
A M P
AH P L AY
A PP L Y

The entries come in pairs: phonemes on the first line, graphemes on the second line. Within each line the tokens are separated by a single space (the delimiter).

eg, entries for
AE N AH M AH L
A N I M A L

will be "A|AE,-", "N|N,A", "I|AH,N", and so on,
i.e. in the format "grapheme(i)|phoneme(i),grapheme(i-1)".
Alongside each of these there should also be an entry ".|AE,-", ".|N,A", and so on,
in the format ".|phoneme(i),grapheme(i-1)" (this is the bit that usually goes missing).

So this is my read_records() function, which reads the input file one pair of lines at a time:

Code:

/*
 * Read the input file two lines at a time -- a phoneme line followed by its
 * grapheme line -- and pass each pair to trigram_format() so that its entries
 * are inserted into ht_trigram.
 *
 * fp         : open input stream, read until EOF.
 * ht_bigram  : accepted for interface compatibility; not used by this function.
 * ht_trigram : table that receives the entries.
 *
 * Exits with EXIT_FAILURE when a phoneme line has no matching grapheme line.
 *
 * NOTE(review): fgets() reads at most WORD_LEN-1 characters per call, so an
 * input line longer than that is split across calls and would misalign the
 * phoneme/grapheme pairing -- confirm WORD_LEN covers the longest line.
 */
static void read_records
(FILE *fp, HashTable *ht_bigram, HashTable *ht_trigram)
{
        /* Automatic buffers: nothing to free, so nothing leaks on return. */
        char phoneme[WORD_LEN];
        char grapheme[WORD_LEN];

        (void)ht_bigram;

        while (fgets(phoneme, WORD_LEN, fp) != NULL)
        {
                if (fgets(grapheme, WORD_LEN, fp) == NULL)
                {
                        fprintf(stderr, "Badly formed input file");
                        exit(EXIT_FAILURE);
                }
                trigram_format(ht_trigram, phoneme, grapheme);
        }
}

and trigram_format() splits each pair of lines into the individual entries to be inserted into the hashtable:

Code:

static void trigram_format(HashTable *ht, char *phoneme, char *grapheme)
{
    int i=0, j=0, k=0, n=0, m=0, p=0;
    char *trigram_entry = safe_malloc(sizeof(char)*ENTRY_LEN);
    char *phoneme_curr = safe_malloc(sizeof(char)*ENTRY_LEN);
    char *grapheme_prev = safe_malloc(sizeof(char)*ENTRY_LEN);
    char *grapheme_curr = safe_malloc(sizeof(char)*ENTRY_LEN);
    char *phoneme_entry = safe_malloc(sizeof(char)*ENTRY_LEN);

    trigram_entry = init_string(trigram_entry, ENTRY_LEN);
    phoneme_curr = init_string(phoneme_curr, ENTRY_LEN);
    grapheme_prev = init_string(grapheme_prev, ENTRY_LEN);
    grapheme_curr = init_string(grapheme_curr,ENTRY_LEN);
    phoneme_entry = init_string(phoneme_entry,ENTRY_LEN);

    grapheme_prev[0] = '-';

    while(grapheme[k] != NEW_LINE || phoneme[j] != NEW_LINE)
    {
            while(!isspace(grapheme[k]))
            {
                    grapheme_curr[n] = grapheme[k];
                    n++;
                    k++;
            }


            while(!isspace(phoneme[j]))
            {
                    phoneme_curr[m] = phoneme[j];
                    m++;
                    j++;
            }

            /*copy phoneme and grapheme entries in proper order
             * into trigram_entry*/
            trigram_entry = trigram_strcpy(phoneme_curr, grapheme_prev,               grapheme_curr);
            /*Function to extract phonemes out of entries*/
            phoneme_entry = phoneme_extract(phoneme_entry);

            hashtable_insert(ht, phoneme_entry);
            hashtable_insert(ht, trigram_entry);

            strcpy(grapheme_prev, grapheme_curr);

            /*Initialise all entries to NULL_CHAR to prepare for
             * next batch of entries*/
            phoneme_curr = init_string(phoneme_curr,
                                strlen(phoneme_curr));
            grapheme_curr = init_string(grapheme_curr,
                                strlen(grapheme_curr));
            phoneme_entry = init_string(phoneme_entry,
                                strlen(phoneme_entry));
            if(phoneme[j] != NEW_LINE && grapheme[k] != NEW_LINE)
            {
                    j++;
                    k++;
                    n=0;
                    m=0;
            }

    }
}


/*
 * Build the key "grapheme_curr|phoneme,grapheme_prev" in a newly allocated
 * buffer of WORD_LEN bytes and return it after passing it through chomp().
 *
 * phoneme       : current phoneme token.
 * grapheme_prev : previous grapheme token.
 * grapheme_curr : current grapheme token.
 *
 * The caller owns the returned buffer. A key that does not fit in WORD_LEN-1
 * characters is truncated instead of being written past the end of the
 * allocation.
 */
char *trigram_strcpy(char *phoneme, char *grapheme_prev, char *grapheme_curr)
{
        char *trigram_entry = safe_malloc(sizeof(char)*WORD_LEN);

        /* Keep the buffer zero-padded after the key, as before. */
        trigram_entry = init_string(trigram_entry, WORD_LEN);

        /* snprintf writes at most WORD_LEN bytes and always terminates. */
        snprintf(trigram_entry, WORD_LEN, "%s|%s,%s",
                 grapheme_curr, phoneme, grapheme_prev);

        trigram_entry = chomp(trigram_entry);

        return trigram_entry;
}


/*
 * Overwrite the first `length` bytes of `string` with NULL_CHAR.
 * A non-positive length writes nothing. Returns `string`.
 */
char *init_string(char *string , int length)
{
        char *cursor = string;
        int remaining = length;

        while (remaining > 0)
        {
                *cursor++ = NULL_CHAR;
                remaining--;
        }

        return string;
}
                                                                                                                                      
/*
 * From an entry of the form "grapheme|phoneme,grapheme_prev", build the key
 * ".|phoneme,grapheme_prev": a '.' followed by everything from the '|' onward.
 *
 * entry : NUL-terminated entry string.
 *
 * Returns a newly allocated buffer of ENTRY_LEN bytes (passed through chomp())
 * that the caller owns. If entry has no '|' the result is just ".". Output
 * that would not fit in ENTRY_LEN-1 characters is truncated.
 */
char *phoneme_extract(char *entry)
{
        int i=0, j=0;
        char *phoneme = safe_malloc(sizeof(char)*ENTRY_LEN);

        phoneme[j++] = '.';

        /* Find the separator, but stop at the end of the string. */
        while(entry[i] != '|' && entry[i] != NULL_CHAR) i++;

        /* Leave room for the terminator. */
        while(entry[i] != NULL_CHAR && j < ENTRY_LEN - 1)
        {
                phoneme[j] = entry[i];
                j++;
                i++;
        }

        /* The buffer is not assumed to be zeroed: terminate it explicitly. */
        phoneme[j] = NULL_CHAR;

        phoneme = chomp(phoneme);

        return phoneme;
}

My Hashtable Functions.....

Code:

/*
 * Allocate a hashtable with `size` slots, all empty.
 *
 * size : number of slots; stored in both table_size and num_items
 *        (num_items is the modulus used by the insert/lookup probing).
 *
 * Each slot starts with entry == NULL (the "empty" marker tested by
 * hashtable_insert and hashtable_lookup_freq), freq == 0 and log_prob == 0.
 * Key strings are allocated on insertion, so no per-slot buffer is
 * allocated here.
 */
static HashTable *hashtable_init(size_t size)
{
        size_t i;

        /* One table header and `size` slots. */
        HashTable *ht = safe_malloc(sizeof *ht);
        ht->table = safe_malloc(sizeof *ht->table * size);
        ht->table_size = (int)size;
        ht->num_items = (int)size;

        /* Mark every slot empty. */
        for(i = 0; i < size; i++)
        {
                ht->table[i].entry = NULL;
                ht->table[i].freq = 0;
                ht->table[i].log_prob = 0;
        }

        return ht;
}


/*
 * Insert `entry` into the table, or bump its frequency if it is already there.
 *
 * ht    : table to update; ht->num_items is the number of slots.
 * entry : NUL-terminated key; the table stores its own copy (safe_strdup).
 *
 * Collisions are resolved by linear probing from hash_func(entry, M). Probing
 * visits at most M slots; if no empty or matching slot is found the table is
 * full, and the program reports it and exits instead of looping forever.
 */
static void hashtable_insert(HashTable *ht, char *entry)
{
        int M = ht->num_items;
        /* hash_func needs a positive modulus; M <= 0 falls through to the
         * "full" error below. */
        int hashval = (M > 0) ? hash_func(entry, M) : 0;
        int probes;

        for(probes = 0; probes < M; probes++)
        {
                TableEntry *slot = &ht->table[hashval];

                if(slot->entry == NULL)
                {
                        /* Empty slot: the key is new. */
                        slot->entry = safe_strdup(entry);
                        slot->freq++;
                        return;
                }

                if(eq(entry, slot->entry))
                {
                        /* Existing key: count another occurrence. */
                        slot->freq++;
                        return;
                }

                hashval = (hashval+1) % M;
        }

        fprintf(stderr, "Hashtable is full, cannot insert entry\n");
        exit(EXIT_FAILURE);
}

/*
 * Return the stored frequency of `key`, or 0 when the key is not in the table.
 *
 * ht  : table to search; ht->num_items is the number of slots.
 * key : NUL-terminated key.
 *
 * Probes linearly from hash_func(key, M) until the key or an empty slot is
 * found, visiting at most M slots so that a full table cannot cause an
 * endless loop.
 */
static int hashtable_lookup_freq(HashTable *ht, char *key)
{
        int M = ht->num_items;
        int h, probes;

        /* An empty table holds nothing (and hash_func needs M > 0). */
        if(M <= 0) return 0;

        h = hash_func(key, M);

        for(probes = 0; probes < M && ht->table[h].entry != NULL; probes++)
        {
                if(eq(key, ht->table[h].entry)) return ht->table[h].freq;
                h = (h+1) % M;
        }

        return 0;
}


/*
 * Hash the NUL-terminated string v into the range [0, M).
 *
 * v : string to hash.
 * M : number of slots; for M <= 0 the function returns 0.
 *
 * The multiplier `a` starts at 31415 and is updated to a*27183 mod (M-1)
 * after every character. The arithmetic is done in unsigned 64-bit so that
 * large M cannot overflow, characters are read as unsigned char so that
 * bytes above 0x7F cannot produce a negative result, and M == 1 no longer
 * takes a remainder by zero. For plain ASCII keys and small tables the
 * returned values are the same as before.
 */
int hash_func(char *v, int M)
{
        unsigned long long h = 0, a = 31415, m;
        const unsigned long long b = 27183;

        if (M <= 0)
                return 0;
        m = (unsigned long long)M;

        for (; *v != '\0'; v++)
        {
                h = (a*h + (unsigned char)*v) % m;
                /* With a single slot there is nothing to reduce by. */
                a = (m > 1) ? (a*b) % (m - 1) : 0;
        }

        return (int)h;
}

oh yes, my structure

Code:

/* Size in bytes of the line buffers filled by fgets() in read_records() and
 * of the entry buffer allocated by trigram_strcpy(). */
#define WORD_LEN 20
/* Size in bytes of the per-token buffers in trigram_format() and of the
 * buffer allocated by phoneme_extract(). */
#define ENTRY_LEN 10
/* String terminator; the hashtable code also compares and assigns it to a
 * slot's `entry` pointer to mean "empty slot". */
#define NULL_CHAR '\0'
/* Character that marks the end of an input line in trigram_format(). */
#define NEW_LINE '\n'
/* Non-zero when the C strings A and B compare equal under strcmp(). */
#define eq(A,B) (strcmp(A,B) == 0)

/* One slot of the hashtable. */
typedef struct {
        char *entry;     /* key string; a null pointer marks an empty slot */
        int freq;        /* incremented by hashtable_insert() each time the key is inserted */
        double log_prob; /* set to 0 by hashtable_init(); presumably filled in later -- not shown here */
} TableEntry;

/* Open-addressing hashtable of TableEntry slots. */
typedef struct {
        int table_size;    /* set to the requested size by hashtable_init() */
        int num_items;     /* also set to the requested size; used as the modulus M when probing */
        TableEntry *table; /* array of slots allocated by hashtable_init() */
} HashTable;

Yep, I hope I didn't forget anything important.
safe_malloc is all right; it just malloc()s space for entries.
Sorry for the enormously long post.

Thanks guys

**wuzzo87** · 05-12-2007

like ermm...
is there anything particularly wrong with my hashtable_insert() function and my lookup function?

**no1uno** · 05-12-2007

your readrecords function reads in the same amount of letters for each word, but they are different lengths

AE N AH M AH L

is longer than

A N I M A L

but you read in WORD_LEN characters for both

Edit: and what's with this stuff?

AA
AH
OW
AU
AA
AWE
AY
AYE
...

**wuzzo87** · 05-13-2007

sorry guys..
i've fixed the problem..
it was a silly mistake of mine

Sorry for disturbing..
THX!

Thread: Missing Entries in Hashtable

Thread Tools

Search Thread

Display

Missing Entries in Hashtable

Similar Threads

Errors including <windows.h>

failure to import external C libraries in C++ project

more then 100errors in header

ras.h errors

pointer to array of objects of struct