OK guys,
I'm sorry, this is a big chunk of code.
My problem is that some entries seem to go missing when I insert these items into my hash table.
The input file is as follows:
Code:
AA
AH
OW
AU
AA
AWE
AY
AYE
IY
EE
EH
EH
ER
ER
AY
EYE
OW
OH
OW
OW
OW
OWE
AH
UH
AE N AH M AH L
A N I M A L
AE N D IY AH N
A N D E A N
AE N T IY
A N T I
AE P AH TH IY
A P A TH Y
AE N T
A N T
AE NG K ER
A N CH OR
AE N D
A N D
AE M P AH L
A M P LE _
AE N T IY K
A N T I QUE
AE N
A N
AE M P
A M P
AH P L AY
A PP L Y
The entries come in pairs: phonemes on the first line, graphemes on the second line. Within a line, the tokens are separated by a single space (the delimiter).
For example, the pair
AE N AH M AH L
A N I M A L
will produce the entries "A|AE,-", "N|N,A", "I|AH,N", and so on,
in the format "grapheme(i)|phoneme(i),grapheme(i-1)", where "-" stands in for the missing previous grapheme of the first token.
Alongside each of those there is also an entry ".|AE,-", ".|N,A", and so on,
in the format ".|phoneme(i),grapheme(i-1)" (these are the entries that usually go missing).
This is my read_records() function, which reads the file one pair of lines at a time:
Code:
static void read_records
(FILE *fp, HashTable *ht_bigram, HashTable *ht_trigram)
{
char *phoneme = safe_malloc(sizeof(char)*WORD_LEN);
char *grapheme = safe_malloc(sizeof(char)*WORD_LEN);
while (fgets(phoneme, WORD_LEN, fp) != NULL)
{
if (fgets(grapheme, WORD_LEN, fp) == NULL)
{
fprintf(stderr, "Badly formed input file");
exit(EXIT_FAILURE);
}
trigram_format(ht_trigram, phoneme, grapheme);
}
}
trigram_format() then splits each pair of lines into the single entries that are inserted into the hash table:
Code:
static void trigram_format(HashTable *ht, char *phoneme, char *grapheme)
{
int i=0, j=0, k=0, n=0, m=0, p=0;
char *trigram_entry = safe_malloc(sizeof(char)*ENTRY_LEN);
char *phoneme_curr = safe_malloc(sizeof(char)*ENTRY_LEN);
char *grapheme_prev = safe_malloc(sizeof(char)*ENTRY_LEN);
char *grapheme_curr = safe_malloc(sizeof(char)*ENTRY_LEN);
char *phoneme_entry = safe_malloc(sizeof(char)*ENTRY_LEN);
trigram_entry = init_string(trigram_entry, ENTRY_LEN);
phoneme_curr = init_string(phoneme_curr, ENTRY_LEN);
grapheme_prev = init_string(grapheme_prev, ENTRY_LEN);
grapheme_curr = init_string(grapheme_curr,ENTRY_LEN);
phoneme_entry = init_string(phoneme_entry,ENTRY_LEN);
grapheme_prev[0] = '-';
while(grapheme[k] != NEW_LINE || phoneme[j] != NEW_LINE)
{
while(!isspace(grapheme[k]))
{
grapheme_curr[n] = grapheme[k];
n++;
k++;
}
while(!isspace(phoneme[j]))
{
phoneme_curr[m] = phoneme[j];
m++;
j++;
}
/*copy phoneme and grapheme entries in proper order
* into trigram_entry*/
trigram_entry = trigram_strcpy(phoneme_curr, grapheme_prev, grapheme_curr);
/*Function to extract phonemes out of entries*/
phoneme_entry = phoneme_extract(phoneme_entry);
hashtable_insert(ht, phoneme_entry);
hashtable_insert(ht, trigram_entry);
strcpy(grapheme_prev, grapheme_curr);
/*Initialise all entries to NULL_CHAR to prepare for
* next batch of entries*/
phoneme_curr = init_string(phoneme_curr,
strlen(phoneme_curr));
grapheme_curr = init_string(grapheme_curr,
strlen(grapheme_curr));
phoneme_entry = init_string(phoneme_entry,
strlen(phoneme_entry));
if(phoneme[j] != NEW_LINE && grapheme[k] != NEW_LINE)
{
j++;
k++;
n=0;
m=0;
}
}
}
/*
 * Build the entry "grapheme_curr|phoneme,grapheme_prev" in a freshly
 * allocated buffer of WORD_LEN bytes and return it (after chomp()).
 *
 * The result is always NUL-terminated; if the three parts do not fit into
 * WORD_LEN bytes the entry is truncated instead of overrunning the buffer.
 * The caller owns the returned buffer.
 */
char *trigram_strcpy(char *phoneme, char *grapheme_prev, char *grapheme_curr)
{
    char *trigram_entry = safe_malloc(sizeof(char)*WORD_LEN);

    /* Keep the whole buffer zeroed so bytes after the terminator are
     * well defined. */
    trigram_entry = init_string(trigram_entry, WORD_LEN);
    snprintf(trigram_entry, WORD_LEN, "%s|%s,%s",
             grapheme_curr, phoneme, grapheme_prev);
    trigram_entry = chomp(trigram_entry);
    return trigram_entry;
}
/*
 * Overwrite the first `length` bytes of `string` with NULL_CHAR and return
 * `string`.  Nothing is written when `length` is zero or negative.
 */
char *init_string(char *string , int length)
{
    char *cursor = string;
    int remaining = length;

    while (remaining > 0) {
        *cursor++ = NULL_CHAR;
        remaining--;
    }
    return string;
}
/*
 * Derive the "." entry from a full entry: given "G|P,Gprev" return a newly
 * allocated ".|P,Gprev" (after chomp()).
 *
 * The result lives in a buffer of ENTRY_LEN bytes, is always NUL-terminated
 * and is truncated if it does not fit.  If `entry` is NULL or contains no
 * '|', the result is just ".".  The caller owns the returned buffer.
 */
char *phoneme_extract(char *entry)
{
    char *phoneme = safe_malloc(sizeof(char)*ENTRY_LEN);
    /* Everything from the first '|' onwards is kept verbatim. */
    const char *tail = (entry != NULL) ? strchr(entry, '|') : NULL;

    snprintf(phoneme, ENTRY_LEN, ".%s", tail != NULL ? tail : "");
    phoneme = chomp(phoneme);
    return phoneme;
}
My hash table functions:
Code:
/*
 * Allocate a hash table with `size` empty slots.
 *
 * Every slot starts with a null entry pointer (the empty-slot marker), a
 * frequency of 0 and a log_prob of 0.  Both table_size and num_items are set
 * to `size`.  No per-slot string is allocated here: allocating one and then
 * overwriting the pointer with the empty marker only leaked memory.
 *
 * Callers should pass size >= 2, because the slot count is later used as a
 * modulus when hashing and probing.
 */
static HashTable *hashtable_init(size_t size)
{
    size_t i;
    /* One table header is enough. */
    HashTable *ht = safe_malloc(sizeof *ht);

    ht->table = safe_malloc(sizeof *ht->table * size);
    ht->table_size = (int)size;
    ht->num_items = (int)size;

    for (i = 0; i < size; i++) {
        ht->table[i].entry = NULL;
        ht->table[i].freq = 0;
        ht->table[i].log_prob = 0;
    }
    return ht;
}
/*
 * Insert `entry` into the table, or bump its frequency if it is already
 * there.  Collisions are resolved by linear probing over ht->num_items slots.
 * The table stores its own copy of the string (safe_strdup).
 *
 * Probing is limited to one full pass over the table.  If no empty slot and
 * no matching entry is found, the table is full: this is reported on stderr
 * and the program exits, instead of looping forever.
 */
static void hashtable_insert(HashTable *ht, char *entry)
{
    int M = ht->num_items;

    if (M > 0) {
        int probes;
        int hashval = hash_func(entry, M);

        /* Defensive: never index the table with a negative remainder. */
        if (hashval < 0)
            hashval += M;

        for (probes = 0; probes < M; probes++) {
            TableEntry *slot = &ht->table[hashval];

            if (slot->entry == NULL) {
                /* First empty slot on the probe path: new entry. */
                slot->entry = safe_strdup(entry);
                slot->freq++;
                return;
            }
            if (eq(entry, slot->entry)) {
                slot->freq++;
                return;
            }
            hashval = (hashval + 1) % M;
        }
    }

    fprintf(stderr, "hashtable_insert: table is full, cannot insert \"%s\"\n",
            entry);
    exit(EXIT_FAILURE);
}
/*
 * Return the stored frequency of `key`, or 0 if it is not in the table.
 *
 * Follows the same linear-probe path as insertion, starting at
 * hash_func(key, num_items), and stops at the first empty slot.  The search
 * is limited to one pass over the table so that it also terminates when the
 * table is completely full and the key is absent.
 */
static int hashtable_lookup_freq(HashTable *ht, char *key)
{
    int M = ht->num_items;
    int probes;
    int h;

    if (M <= 0)
        return 0;

    h = hash_func(key, M);
    /* Defensive: never index the table with a negative remainder. */
    if (h < 0)
        h += M;

    for (probes = 0; probes < M && ht->table[h].entry != NULL; probes++) {
        if (eq(key, ht->table[h].entry))
            return ht->table[h].freq;
        h = (h + 1) % M;
    }
    return 0;
}
/*
 * Hash the NUL-terminated string `v` into the range [0, M).
 *
 * Uses a multiplier that changes per character: it starts at 31415 and is
 * multiplied by 27183 modulo (M-1) after each character.
 *
 * The arithmetic is done in unsigned 64-bit and each character is read as
 * unsigned char, so the result is never negative and large values of M do
 * not overflow.  Returns 0 when v is NULL or when M <= 1, where the only
 * possible bucket is 0 and the modulus M-1 would be zero.
 */
int hash_func(char *v, int M)
{
    unsigned long long h = 0;
    unsigned long long a = 31415;
    const unsigned long long b = 27183;
    unsigned long long m;

    if (v == NULL || M <= 1)
        return 0;

    m = (unsigned long long)M;
    for (; *v != '\0'; v++) {
        h = (a * h + (unsigned char)*v) % m;
        a = a * b % (m - 1);
    }
    return (int)h;
}
Oh yes, and my structures and macros:
Code:
/* Capacity, in bytes, of the buffers that hold one input line or one
 * assembled "grapheme|phoneme,grapheme" entry. */
#define WORD_LEN 20
/* Capacity, in bytes, of the buffers that hold a single token and of the
 * buffer returned by phoneme_extract(). */
#define ENTRY_LEN 10
/* String terminator character. */
#define NULL_CHAR '\0'
/* Newline character that ends each input line. */
#define NEW_LINE '\n'
/* Non-zero when the strings A and B compare equal with strcmp(). */
#define eq(A,B) (strcmp(A,B) == 0)
/* One slot of the hash table. */
typedef struct {
/* Stored key string; a null pointer marks an empty slot. */
char *entry;
/* Number of times this key has been inserted. */
int freq;
/* Initialised to 0 by hashtable_init(); presumably a log-probability
 * filled in by code not shown here - TODO confirm. */
double log_prob;
} TableEntry;
/* Open-addressing hash table of TableEntry slots. */
typedef struct {
/* Number of slots requested from hashtable_init(). */
int table_size;
/* Also set to the slot count by hashtable_init(); insertion and lookup
 * use it as the modulus for hashing and probing. */
int num_items;
/* Array of slots. */
TableEntry *table;
} HashTable;
I hope I didn't forget anything important.
safe_malloc() is fine; it just malloc()s space for the entries.
Sorry for the enormously long post.
Thanks, guys.