Ok guys..
i'm so sorry this is a big chunka code....
But this is my problem: some entries seem to go missing when I insert them into my hashtable...
The input file is as follows.....
The entries come in pairs (phonemes on the 1st line, graphemes on the 2nd line), and each entry is separated by a space (the delimiter). Code:AA AH OW AU AA AWE AY AYE IY EE EH EH ER ER AY EYE OW OH OW OW OW OWE AH UH AE N AH M AH L A N I M A L AE N D IY AH N A N D E A N AE N T IY A N T I AE P AH TH IY A P A TH Y AE N T A N T AE NG K ER A N CH OR AE N D A N D AE M P AH L A M P LE _ AE N T IY K A N T I QUE AE N A N AE M P A M P AH P L AY A PP L Y
eg, entries for
AE N AH M AH L
A N I M A L
will be "A|AE,-", "N|N,A", "I|AH,N", and so on...
i.e. in the format "grapheme(i)|phoneme(i),grapheme(i-1)"...
and, along with each of those, ".|AE,-", ".|N,A", and so on...
in the format ".|phoneme(i),grapheme(i-1)"... (this is the bit that usually goes missing)
So this is my read_records() function, which reads the whole file two lines (one phoneme/grapheme pair) at a time...
Code:static void read_records (FILE *fp, HashTable *ht_bigram, HashTable *ht_trigram) { char *phoneme = safe_malloc(sizeof(char)*WORD_LEN); char *grapheme = safe_malloc(sizeof(char)*WORD_LEN); while (fgets(phoneme, WORD_LEN, fp) != NULL) { if (fgets(grapheme, WORD_LEN, fp) == NULL) { fprintf(stderr, "Badly formed input file"); exit(EXIT_FAILURE); } trigram_format(ht_trigram, phoneme, grapheme); } }
and trigram_format() splits them up to proper single entries to be inserted in the hashtable....
My Hashtable Functions.....Code:static void trigram_format(HashTable *ht, char *phoneme, char *grapheme) { int i=0, j=0, k=0, n=0, m=0, p=0; char *trigram_entry = safe_malloc(sizeof(char)*ENTRY_LEN); char *phoneme_curr = safe_malloc(sizeof(char)*ENTRY_LEN); char *grapheme_prev = safe_malloc(sizeof(char)*ENTRY_LEN); char *grapheme_curr = safe_malloc(sizeof(char)*ENTRY_LEN); char *phoneme_entry = safe_malloc(sizeof(char)*ENTRY_LEN); trigram_entry = init_string(trigram_entry, ENTRY_LEN); phoneme_curr = init_string(phoneme_curr, ENTRY_LEN); grapheme_prev = init_string(grapheme_prev, ENTRY_LEN); grapheme_curr = init_string(grapheme_curr,ENTRY_LEN); phoneme_entry = init_string(phoneme_entry,ENTRY_LEN); grapheme_prev[0] = '-'; while(grapheme[k] != NEW_LINE || phoneme[j] != NEW_LINE) { while(!isspace(grapheme[k])) { grapheme_curr[n] = grapheme[k]; n++; k++; } while(!isspace(phoneme[j])) { phoneme_curr[m] = phoneme[j]; m++; j++; } /*copy phoneme and grapheme entries in proper order * into trigram_entry*/ trigram_entry = trigram_strcpy(phoneme_curr, grapheme_prev, grapheme_curr); /*Function to extract phonemes out of entries*/ phoneme_entry = phoneme_extract(phoneme_entry); hashtable_insert(ht, phoneme_entry); hashtable_insert(ht, trigram_entry); strcpy(grapheme_prev, grapheme_curr); /*Initialise all entries to NULL_CHAR to prepare for * next batch of entries*/ phoneme_curr = init_string(phoneme_curr, strlen(phoneme_curr)); grapheme_curr = init_string(grapheme_curr, strlen(grapheme_curr)); phoneme_entry = init_string(phoneme_entry, strlen(phoneme_entry)); if(phoneme[j] != NEW_LINE && grapheme[k] != NEW_LINE) { j++; k++; n=0; m=0; } } } char *trigram_strcpy(char *phoneme, char *grapheme_prev, char *grapheme_curr) { int i=0, j, k; char *trigram_entry = safe_malloc(sizeof(char)*WORD_LEN); trigram_entry = init_string(trigram_entry, WORD_LEN); for(i=0; i < strlen(grapheme_curr); i++) trigram_entry[i] = grapheme_curr[i]; trigram_entry[i++] = '|'; for(j=0; j < strlen(phoneme); j++) 
{ trigram_entry[i] = phoneme[j]; i++; } trigram_entry[i++] = ','; for(k=0; k < strlen(grapheme_prev); k++) { trigram_entry[i] = grapheme_prev[k]; i++; } trigram_entry = chomp(trigram_entry); return trigram_entry; } char *init_string(char *string , int length) { int i; for(i=0; i < length; i++) string[i] = NULL_CHAR; return string; } char *phoneme_extract(char *entry) { int i=0, j=0; char *phoneme = safe_malloc(sizeof(char)*ENTRY_LEN); phoneme[j++] = '.'; while(entry[i] != '|') i++; while(entry[i] != NULL_CHAR) { phoneme[j] = entry[i]; j++; i++; } phoneme = chomp(phoneme); return phoneme; }
oh yes, my structureCode:static HashTable *hashtable_init(size_t size) { int i , N = (int)(2*size); /*Allocate memory for new hashtable*/ HashTable *ht = safe_malloc(sizeof(HashTable)*N); ht->table = safe_malloc(sizeof(TableEntry)*size); ht->table_size = size; ht->num_items = (int)size; /*Allocate memory for contents of table*/ for(i = 0; i < (int)size; i++) { ht->table[i].entry = safe_malloc(sizeof(char)*WORD_LEN*2); } /*Initiallise values for table contents*/ for(i = 0; i < (int)size; i++) { ht->table[i].entry = NULL_CHAR; ht->table[i].freq = 0; ht->table[i].log_prob = 0; } return ht; } static void hashtable_insert(HashTable *ht, char *entry) { int M = ht->num_items; int hashval = hash_func(entry, M); /*if encounter an already existing similar entry while probing, * update its frequency*/ while(ht->table[hashval].entry != NULL_CHAR) { if(eq(entry, ht->table[hashval].entry)) { ht->table[hashval].freq++; break; } else hashval = (hashval+1) % M; } if(ht->table[hashval].entry == NULL_CHAR) { ht->table[hashval].entry = safe_strdup(entry); ht->table[hashval].freq++; } } static int hashtable_lookup_freq(HashTable *ht, char *key) { int M = ht->num_items; int h = hash_func(key, M); while(ht->table[h].entry != NULL_CHAR) { if(eq(key, ht->table[h].entry)) return ht->table[h].freq; else h = (h+1) % M; } return 0; } int hash_func(char *v, int M) { int h = 0, a = 31415, b = 27183; for(h=0; *v != NULL_CHAR; v++, a = a*b % (M-1)) h = (a*h + *v) % M; return h; }
/* Yep, i hope i didn't forget anything important.. */

#define WORD_LEN 20      /* size of the line buffers filled by fgets() */
#define ENTRY_LEN 10     /* size of the per-token scratch buffers */
#define NULL_CHAR '\0'
#define NEW_LINE '\n'

/* String equality.  Arguments are parenthesized so that any expression can
 * be passed safely. */
#define eq(A,B) (strcmp((A), (B)) == 0)

/* One slot of the hash table. */
typedef struct {
    char *entry;      /* stored string; a null pointer marks an empty slot */
    int freq;         /* number of times this string has been inserted */
    double log_prob;  /* initialised to 0 by hashtable_init(); not otherwise
                       * used in the code shown */
} TableEntry;

/* Hash table of string entries. */
typedef struct {
    int table_size;   /* number of slots allocated in `table` */
    int num_items;    /* set to the slot count by hashtable_init() and used
                       * as the probing modulus */
    TableEntry *table;
} HashTable;
safe_malloc is alright; it just malloc()s space for entries..
Sorry for the enormously long post....
Thanks guys!



LinkBack URL
About LinkBacks



