As for the missing advanced functionality, I'm really only looking for the basics here so that I can simply understand what hashing with chaining is doing before trying to get fancy at all.
btw, if hashing is just on the key of a key-value pair, doesn't searching for a value become O(n) with searching for a key O(1)?
Anyhow, here's my (provisionally) final version of a very simplistic hash table with chaining. Header file:
Code:
/**
@file
hash table class
@author Marshall Farrier
@date 9/20/10
*/
#ifndef HASH_TABLE_H
#define HASH_TABLE_H
#include <list>
#include <utility> // for pair to get key-value pairs
#include <string>
#include <vector>
typedef std::pair<std::string, std::string> KeyValue; // key-value pair where key and value are both strings
typedef std::list<KeyValue> KeyValueList;
const double SIZE_FACTOR = 1.33;
/**
This HashTable class uses hashing with chaining
*/
class HashTable {
private:
KeyValueList * h_tbl;
unsigned tbl_size; // size of the hash_table
/**
The hash function
Note that this is not a static method because it will make use of the prime number
tbl_size associated with a specific HashTable
Simple application of division method, CLRS, p. 263
@param key an arbitrary input string (used as key in a key-value pair)
@return The index in tbl_size where we will append this key-value pair to our list
*/
unsigned hash(const std::string &key) const;
static unsigned prime_generator(const unsigned &min); // will generate the next prime at least as big as min
/**
@param max ceiling on value returned
@return The largest power of 2 that is <= max
If max == 0, 0 is returned (even though not a power of 2)
*/
static unsigned power_of_two(const unsigned &max); // returns the biggest power of 2 that is < max
public:
HashTable(const size_t size);
~HashTable() {delete [] h_tbl; } // constructor will clearly use new []
// basic hash_table operations, should all have average runtime of O(1):
/**
@param item Item to insert into HashTable
*/
void insert(const KeyValue &item);
/**
@param key The key which is to be located
@return If the key is found at some index, returns an index where it is found
Note that if there are multiple instances of key, it will be unpredictable
which one is returned
If the key is not found anywhere, then -1 is returned
*/
int search(const std::string &key) const;
/**
@param key The key to find and delete. All instances of key should be deleted.
@return Returns true iff the key was found and something was deleted.
Returns false if key not found (in which case no action is taken).
*/
bool remove(const std::string &key);
/**
a beefier search method
@param key The key we are searching for
@return returns a vector of all values associated with the given key
The vector will be empty if no match is found
*/
std::vector<std::string> get_values(const std::string &key) const;
// methods to show performance
/**
This function is one measure of how well our hash function
is working. It returns the ratio of filled slots to available slots
in h_tbl--i.e., the percent of slots filled, expressed as a double
in [0, 1]
@return Number of slots with a non-empty list divided by total number of slots
*/
double density() const;
/**
This method returns the length of the longest list to show
how many collisions we have had to deal with
@return length of longest KeyValue list.
*/
size_t longest_chain() const;
/**
@return Returns the number of elements in the container
*/
size_t size() const;
/**
return Average number of elements stored in a chain (cf. CLRS, p. 258)
*/
double load_factor() const;
};
#endif
Implementation file:
Code:
/**
@file
@author Marshall Farrier
@date 9/20/10
*/
#include "hash_table.h"
#include <cmath>
/**
Just a basic deterministic hash function to begin with
*/
unsigned HashTable::hash(const std::string &key) const {
unsigned result = 0;
// unsigned long power_of_base = 1;
const unsigned BASE = 128; // small ASCII set
size_t len = key.length();
for(int i = 0; i < len; ++i) {
// cf. Sedgewick, Algorithms in C, p. 578 (Horner's algorithm)
result = (result * BASE + (unsigned)key.at(i)) % tbl_size;
}
return result;
}
/**
This implementation is horribly inefficient but should
be ok for small values
*/
unsigned HashTable::prime_generator(const unsigned &min) {
unsigned result = min;
unsigned max_factor, i;
while (true) {
max_factor = std::sqrt((double)result);
for (i = 2; i <= max_factor; ++i) {
if (result % i == 0) break;
}
if (i > max_factor) return result; // result is prime
++result;
}
}
unsigned HashTable::power_of_two(const unsigned &max) {
unsigned max_cpy = max;
unsigned result = 1;
while (max_cpy > 0) {
max_cpy >>= 1;
result <<= 1;
}
result >>= 1;
return result;
}
HashTable::HashTable(const size_t size) {
unsigned _size = size * SIZE_FACTOR;
unsigned pow_two = power_of_two(_size);
/*
These operations are possibly overly cautious with regard to proximity
to powers of two
*/
if (_size < pow_two * 1.33) _size = pow_two * 1.4;
else if (pow_two * 1.67 < _size) _size = pow_two * 2.8;
tbl_size = prime_generator(_size);
h_tbl = new KeyValueList[tbl_size];
}
void HashTable::insert(const KeyValue &item) {
unsigned i = hash(item.first);
h_tbl[i].push_front(item);
}
int HashTable::search(const std::string &key) const {
int i = hash(key);
if (h_tbl[i].empty()) return -1;
KeyValueList::iterator it;
for (it = h_tbl[i].begin(); it != h_tbl[i].end(); ++it) {
if (key == it->first) return i;
}
return -1;
}
bool HashTable::remove(const std::string &key) {
int i = hash(key);
bool found = false;
if (h_tbl[i].empty()) return found;
KeyValueList::iterator it;
for (it = h_tbl[i].begin(); it != h_tbl[i].end(); ++it) {
if (key == it->first) {
found = true;
it = h_tbl[i].erase(it);
--it;
}
}
return found;
}
std::vector<std::string> HashTable::get_values(const std::string &key) const {
std::vector<std::string> result;
int i = hash(key);
if (h_tbl[i].empty()) return result;
KeyValueList::iterator it;
for (it = h_tbl[i].begin(); it != h_tbl[i].end(); ++it) {
if (key == it->first) result.push_back(it->second);
}
return result;
}
double HashTable::density() const {
int full = 0;
for (int i = 0; i < tbl_size; ++i) {
if (!h_tbl[i].empty()) ++full;
}
return (double)full / tbl_size;
}
size_t HashTable::longest_chain() const {
size_t result = 0;
for (int i = 0; i < tbl_size; ++i) {
if (result < h_tbl[i].size()) result = h_tbl[i].size();
}
return result;
}
size_t HashTable::size() const {
size_t result = 0;
for (int i = 0; i < tbl_size; ++i) {
result += h_tbl[i].size();
}
return result;
}
double HashTable::load_factor() const {
return (double)size() / tbl_size;
}
quick and simple demo file, which shows my table to have about half the slots occupied at all and a longest chain of 3. is that roughly normal?
Code:
/**
@file
demonstrates functionality of a hash table
@author Marshall Farrier
@date 9/20/10
*/
#include <iostream>
#include <string>
#include <vector>
#include "hash_table.h"
using namespace std;
void create_dictionary(string filename);
int main() {
const size_t SIZE = 25;
HashTable h1(SIZE);
// list from Sedgewick, p. 576
string words[] = {"now", "for", "tip", "ilk", "dim", "tag", "jot",
"sob", "nob", "sky", "hut", "ace", "bet", "men", "egg", "few",
"jay", "owl", "joy", "rap", "gig", "wee", "was", "cab", "wad"};
KeyValue tmp;
for (int i = 0; i < SIZE; ++i) {
tmp.first = words[i];
h1.insert(tmp);
}
cout << "Hash table size is " << h1.size() << endl;
cout << "Hash table density is " << h1.density() << endl;
cout << "Longest chain is " << h1.longest_chain() << endl;
cout << "Load factor is " << h1.load_factor() << endl;
cout << "\"joy\" is in slot " << h1.search("joy") << endl;
cout << "\"jay\" is in slot " << h1.search("jay") << endl;
return 0;
}