Code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/time.h>
#define BUFSIZE 21
#define NUM_OF_FILES 10000
/* Collision Resolution Chain */
typedef struct OverflowBucket
{
char record1[21];
int temp;
char record2[21];
struct OverflowBucket* next;
}OverflowBucket;
/* Bucket */
typedef struct Bucket
{
OverflowBucket* head;
char record1[21];
int temp;
char record2[21];
struct Bucket* next;
}Bucket;
/* Hash Table */
typedef struct Hashtable
{
Bucket* head;
}HashTable;
int hashPartition(char*,int);
int secondHash(char*,int);
int* partitionOuter(char*,int,int*);
int* partitionInner(char*,int,int*);
int readTwoRecordsAndCompare(int,HashTable*,int,int);
int compareKey(char*,int,HashTable*,int);
int compareString(char*, char*, int, HashTable*,int);
int readLineAndHash(HashTable*, int, int, int);
int readAndMatch(int,int,int,int);
void hashToTable(char*, HashTable*, int);
void removeFiles(int);
void systemFree(HashTable*);
HashTable* generateTable(HashTable*,int);
OverflowBucket* generateNewOverflowBucket(Bucket*);
OverflowBucket* generateOverflowBucket(OverflowBucket*);
int main(int argc, char *argv[])
{
char buffer[BUFSIZE];
int flagExists = 0;
int num = 0,buffer_size = 0,temp_buf;
int fd, fd1, i, n;
int number_of_tuples = 0;
int *outerPartition;
int *innerPartition;
char tempBuffer_size[10];
char *in,*out;
if(argc == 5)
{
if(strcmp(argv[1], "-d") == 0)
flagExists = 1;
else
{
fprintf(stderr,"Error, must be -d\n");
exit(1);
}
in = argv[3];
out = argv[4];
strcpy(tempBuffer_size,argv[2]);
buffer_size = atoi(tempBuffer_size);
temp_buf = buffer_size - 1;
}
else if(argc == 4)
{
flagExists = 0;
in = argv[2];
out = argv[3];
strcpy(tempBuffer_size,argv[1]);
buffer_size = atoi(tempBuffer_size);
temp_buf = buffer_size - 1;
}
else
{
fprintf(stderr,"Error : %d arguments, need 3 or 4 \n",argc);
fprintf(stderr,"./HashJoin [-d] <buffer_size> <path1> <path2>\n");
}
outerPartition = malloc(sizeof(int)*(buffer_size-1));
innerPartition = malloc(sizeof(int)*(buffer_size-1));
for(i=0;i<temp_buf;i++)
{
outerPartition[i] = -1;
}
for(i=0;i<temp_buf;i++)
{
innerPartition[i] = -1;
}
num = (buffer_size/2);
fd = open(in, O_RDONLY);
if(fd == -1)
{
fprintf(stderr,"File not found\n");
exit(1);
}
/* PARTITION PHASE */
while((n=read(fd,buffer,BUFSIZE)) > 0)
{
outerPartition = partitionOuter(buffer,buffer_size,outerPartition);
}
fd1 = open(out, O_RDONLY);
if(fd1 == -1)
{
fprintf(stderr,"File not found\n");
exit(1);
}
/* PARTITION PHASE */
while((n=read(fd1,buffer,BUFSIZE)) > 0)
{
innerPartition = partitionInner(buffer,buffer_size,innerPartition);
}
close(fd);
close(fd1);
/* MATCHING PHASE */
for(i=0;i<=temp_buf;i++)
{
if(outerPartition[i] != -1 && innerPartition[i] != -1)
{
number_of_tuples += readAndMatch(flagExists,outerPartition[i],innerPartition[i],buffer_size);
}
}
if(flagExists == 0)
fprintf(stderr,"Number of tuple : %d\n",number_of_tuples);
for(i=0;i<temp_buf;i++)
{
if(outerPartition[i] != -1)
close(outerPartition[i]);
}
for(i=0;i<temp_buf;i++)
{
if(innerPartition[i] != -1)
close(innerPartition[i]);
}
/*
removeFiles(buffer_size);
*/
return 0;
}
/* MATCHING PHASE */
int readAndMatch(int flag,int outerFD,int innerFD,int buffer_size)
{
int FP = 0,tempFP = 0;
int firstTime = 0;
int endOfFile = 1;
int total = 0;
int num = 0;
HashTable table, *currTable;
lseek(outerFD,0,SEEK_SET);
lseek(innerFD,0,SEEK_SET);
currTable = generateTable(&table,buffer_size);
num = (buffer_size/2);
while(endOfFile != 0)
{
if(firstTime != 0)
{
lseek(outerFD, tempFP, SEEK_SET);
FP = tempFP;
}
endOfFile = readLineAndHash(currTable,outerFD,buffer_size,FP);
tempFP = endOfFile;
firstTime = 1;
total += readTwoRecordsAndCompare(innerFD,currTable,flag,num);
systemFree(currTable);
}
return total;
}
int readTwoRecordsAndCompare(int fd, HashTable* table, int flag, int num)
{
int buffer_size, n = 1, number_of_tuples = 0, temp = 0;
char buffer[BUFSIZE];
char str1[BUFSIZE], str2[BUFSIZE];
lseek(fd,0,SEEK_SET);
/* Read until EOF */
while(n != 0)
{
buffer_size = 2;
/* Read until buffer_size limit and compare */
while((buffer_size != 0) && (n = read(fd,buffer,BUFSIZE)) > 0)
{
if(buffer_size == 2)
strcpy(str1,strtok(buffer,"\n"));
else if(buffer_size == 1)
strcpy(str2,strtok(buffer,"\n"));
temp += n;
buffer_size--;
}
/* Compare and get number of matching tuples */
number_of_tuples += compareString(str1, str2, flag, table, num);
str1[0] = '\0';
str2[0] = '\0';
}
return number_of_tuples;
}
int compareKey(char* str, int key, HashTable* table, int flag)
{
int count = 0;
int numberOfTuples = 0;
Bucket *currBucket;
OverflowBucket *currOverflowBucket;
currBucket = table->head;
/* Traverse to the specified bucket */
while(currBucket != NULL)
{
if(count == key)
break;
count++;
currBucket = currBucket->next;
}
/* Compare with the 2 slots at current bucket, if there is *
* a chain attached to it, traverse the chain as well, increase*
* counter if string matches, if flag is specified, print the *
* string */
if((strcmp(currBucket->record1,str)) == 0)
{
if(flag == 1)
fprintf(stderr,"%s\n",currBucket->record1);
numberOfTuples++;
}
if((strcmp(currBucket->record2,str)) == 0)
{
if(flag == 1)
printf("%s\n",currBucket->record2);
numberOfTuples++;
}
if(currBucket->head != NULL)
{
currOverflowBucket = currBucket->head;
while(currOverflowBucket != NULL)
{
if((strcmp(currOverflowBucket->record1,str)) == 0)
{
if(flag == 1)
printf("%s\n",currOverflowBucket->record1);
numberOfTuples++;
}
if((strcmp(currOverflowBucket->record2,str)) == 0)
{
if(flag == 1)
printf("%s\n",currOverflowBucket->record2);
numberOfTuples++;
}
currOverflowBucket = currOverflowBucket->next;
}
}
return numberOfTuples;
}
/* Compare string */
int compareString(char* str1, char* str2, int flag, HashTable* table, int num)
{
int key1, key2;
int numberOfTuples = 0;
/* Get Hash Key value and compare, if string is null, do nothing */
if(str1[0] != '\0')
{
key1 = hashPartition(str1,num);
numberOfTuples += compareKey(str1,key1,table,flag);
}
if(str2[0] != '\0')
{
key2 = hashPartition(str2,num);
numberOfTuples += compareKey(str2,key2,table,flag);
}
return numberOfTuples;
}
/* Read a record and Hash to table */
int readLineAndHash(HashTable* table, int fd, int buffer_size, int FP)
{
int n,num;
int temp;
char buffer[BUFSIZE];
char *str;
temp = FP;
num = (buffer_size/2);
/* Reserve 2 for input and output and increase buffer by 2 to read 2 records per page*/
buffer_size = buffer_size - 2;
buffer_size = buffer_size * 2;
/* Read one line and hash, if buffer is max, terminate */
while((n = read(fd,buffer,BUFSIZE)) > 0)
{
if(buffer_size == 0)
break;
temp += n;
str = strtok(buffer,"\n");
hashToTable(str,table,num);
buffer_size--;
}
if(n == 0)
return n;
else
return temp;
}
void hashToTable(char* buffer,HashTable* table,int num)
{
int key,count;
Bucket *currBucket;
OverflowBucket *currOverflowBucket, *lastBucket, *overflowBucket;
currBucket = table->head;
lastBucket = NULL;
key = hashPartition(buffer,num);
/* key = secondHash(buffer,num);
*/
if(key == 0)
{
/* Check current bucket for empty slots */
if(strlen(currBucket->record1) == 0)
strcpy(currBucket->record1,buffer);
else if(strlen(currBucket->record2) == 0)
strcpy(currBucket->record2,buffer);
else
{
/* If overflow bucket exists, check for free slot, if not,else generate, create new overflow
bucket if statement is false */
if(currBucket->head != NULL)
{
currOverflowBucket = currBucket->head;
/* If full, go next node */
while(currOverflowBucket != NULL)
{
/* Check current bucket for empty slots, if full, set tag to make new *
* overflow bucket */
if(strlen(currOverflowBucket->record1) == 0)
strcpy(currOverflowBucket->record1,buffer);
else if(strlen(currOverflowBucket->record2) == 0)
strcpy(currOverflowBucket->record2,buffer);
else if(currOverflowBucket->next == NULL){
lastBucket = currOverflowBucket;}
currOverflowBucket = currOverflowBucket->next;
}
/* No free slot, generate new one and slot in */
if(lastBucket != NULL)
{
overflowBucket = generateOverflowBucket(lastBucket);
strcpy(overflowBucket->record1,buffer);
}
}
else
{
/* Get new node and copy to slot */
overflowBucket = generateNewOverflowBucket(currBucket);
strcpy(overflowBucket->record1,buffer);
}
}
}
else
{
count = 0;
/* Traverse to corresponding bucket */
while(currBucket->next != NULL)
{
if(count == key)
{
break;
}
count++;
currBucket = currBucket->next;
}
/* Check bucket for free slots */
if(strlen(currBucket->record1) == 0)
strcpy(currBucket->record1,buffer);
else if(strlen(currBucket->record2) == 0)
strcpy(currBucket->record2,buffer);
else
{
/* If overflow bucket exists, check for free slot, if not,else generate, create new overflow
bucket if statement is false */
if(currBucket->head != NULL)
{
currOverflowBucket = currBucket->head;
/* If full, go next node */
while(currOverflowBucket != NULL)
{
/* If last bucket and no slots, set tag to generate new overflow */
if(strlen(currOverflowBucket->record1) == 0)
strcpy(currOverflowBucket->record1,buffer);
else if(strlen(currOverflowBucket->record2) == 0)
strcpy(currOverflowBucket->record2,buffer);
else if(currOverflowBucket->next == NULL)
lastBucket = currOverflowBucket;
currOverflowBucket = currOverflowBucket->next;
}
/* No free slot, generate new one and slot in */
if(lastBucket != NULL)
{
overflowBucket = generateOverflowBucket(lastBucket);
strcpy(overflowBucket->record1,buffer);
}
}
else
{
/* Get new node and copy to slot */
overflowBucket = generateNewOverflowBucket(currBucket);
strcpy(overflowBucket->record1,buffer);
}
}
}
}
/* Generate Table */
HashTable* generateTable(HashTable* table, int num)
{
int counter = num;
Bucket *bucket, *currBucket;
table->head = NULL;
while(counter != 0)
{
bucket = malloc(sizeof(Bucket));
if(bucket == NULL)
{
fprintf(stderr,"Malloc bucket failed\n");
exit(1);
}
bucket->next = NULL;
if(table->head == NULL)
{
table->head = bucket;
currBucket = bucket;
}
else
{
currBucket->next = bucket;
currBucket = bucket;
}
counter--;
}
return table;
}
/* Generate New Overflow bucket if no overflow occured in the slot */
OverflowBucket* generateNewOverflowBucket(Bucket* currBucket)
{
OverflowBucket *newBucket;
newBucket = malloc(sizeof(OverflowBucket));
if(newBucket == NULL)
{
fprintf(stderr,"Malloc bucket failed\n");
exit(1);
}
newBucket->next = NULL;
currBucket->head = newBucket;
return newBucket;
}
/* Generate Overflow bucket if existing overflow bucket is full */
OverflowBucket* generateOverflowBucket(OverflowBucket* currBucket)
{
OverflowBucket *newBucket;
newBucket = malloc(sizeof(OverflowBucket));
if(newBucket == NULL)
{
fprintf(stderr,"Malloc bucket failed\n");
exit(1);
}
newBucket->next = NULL;
currBucket->next = newBucket;
return newBucket;
}
/* PARTITION PHASE *
* Read outer relation and partition it */
int* partitionOuter(char* buffer,int num,int* outerPartition)
{
int key;
int fd;
mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;
char outPartition[100];
key = hashPartition(buffer,num);
sprintf(outPartition,"outer%d",key);
/* Create a new file if doesn't exist, append file if exists */
fd = open(outPartition, O_RDWR | O_CREAT | O_APPEND, mode);
if(outerPartition[key] == -1)
{
outerPartition[key] = fd;
}
write(fd,buffer,strlen(buffer));
return outerPartition;
}
/* PARTITION PHASE *
* Read inner relation and partition it */
int* partitionInner(char* buffer,int num,int* innerPartition)
{
int key;
int fd;
mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;
char inPartition[100];
key = hashPartition(buffer,num);
sprintf(inPartition,"inner%d",key);
/* Create a new file if doesn't exist, append file if exists */
fd = open(inPartition, O_RDWR | O_CREAT | O_APPEND, mode);
if(innerPartition[key] == -1)
{
innerPartition[key] = fd;
}
write(fd,buffer,strlen(buffer));
return innerPartition;
}
/* Remove files after using */
void removeFiles(int num)
{
char outPartitions[100];
int i;
char inPartitions[100];
for(i=0;i<num;i++)
{
sprintf(outPartitions,"outer%d",i);
unlink(outPartitions);
sprintf(inPartitions,"inner%d",i);
unlink(inPartitions);
}
}
/* First Hash function */
int hashPartition(char* buffer, int num)
{
int h = 0, a = 127;
for(;*buffer != '\0'; buffer++)
h = (a*h + *buffer) % num;
return h;
}
/* Second Hash Function */
int secondHash(char* buffer, int num)
{
return 1;
}
/* Free Data */
void systemFree(HashTable* table)
{
Bucket *currBucket, *nextBucket;
OverflowBucket *currOverflowBucket, *nextOverflowBucket;
currBucket = table->head;
while(currBucket != NULL)
{
if(currBucket->head != NULL)
{
currOverflowBucket = currBucket->head;
while(currOverflowBucket != NULL)
{
currOverflowBucket->record1[0] = '\0';
currOverflowBucket->record2[0] = '\0';
nextOverflowBucket = currOverflowBucket->next;
free(currOverflowBucket);
currOverflowBucket = nextOverflowBucket;
}
}
currBucket->record1[0] = '\0';
currBucket->record2[0] = '\0';
nextBucket = currBucket->next;
currBucket->head = NULL;
free(currBucket);
currBucket = nextBucket;
}
table->head = NULL;
}
I'm implementing a HashJoin for string matching which includes splitting my files into several partitions specified by commandline. Here's my file input :