oddly enough, phantom, the xor implementation seems slower by almost a factor of 3 than the division method. On my laptop, I get somewhere around 1.8 with division and 4.5 with XOR on the first dataset, and 3.9 with division, 10.9 with XOR on the second dataset.
Here's my implementation of the XOR (mainly copied from your code but changing a couple of things and deleting irrelevant lines):
Code:
size_t rabin_karp::xor_hasher(const std::string &s, const rabin_karp::PrimeArray &arr) {
size_t len = s.size();
assert (len > 0);
unsigned int result = 0U;
unsigned int offset = s.at(0);
for (size_t i = 0; i < len; ++i) {
result ^= arr.at(offset + s.at(i));
offset = s.at(i);
}
return result;
}
size_t rabin_karp::xor_hasher_update(unsigned old_hash, const char &new_origin, const char &new_finale,
const char &old_origin, const char &old_finale, const PrimeArray &arr) {
old_hash ^= arr.at(old_origin + old_origin);
old_hash ^= arr.at(old_origin + new_origin);
old_hash ^= arr.at(new_origin + new_origin);
old_hash ^= arr.at(old_finale + new_finale);
return old_hash;
}
size_t rabin_karp::xor_find(std::string text, const std::string &pat, const rabin_karp::PrimeArray &arr) {
size_t pat_len = pat.size();
if (!pat_len) return 0; // pat is empty string
if (text.size() < pat_len) return std::string::npos; // no need to search if text is smaller than pat
size_t last_text_index = text.size() - pat.size();
int h_val = rabin_karp::xor_hasher(pat, arr); // value we will be searching for
int tmp = rabin_karp::xor_hasher(text.substr(0, pat_len), arr);
size_t i = 0;
// main testing loop
while (true) {
// do an exhaustive comparison only AFTER checking for equality of hash values
if (tmp == h_val && !text.substr(i, pat_len).compare(pat)) return i;
/**
The following is the decisive algorithm--cf. Skiena, ADM, p. 92
It provides a short-cut for calculating the hash.
Otherwise we're saving no time at all vs. exhaustive comparison
*/
if (i == last_text_index) break;
tmp = xor_hasher_update(tmp, text.at(i + 1), text.at(i + pat_len), text.at(i), text.at(i + pat_len - 1), arr);
++i;
}
return std::string::npos; // not found
}
And here's the code i used for testing:
Code:
inline double cpu_time() { return double(clock()) / CLOCKS_PER_SEC; }
int main() {
srand(time(NULL));
rabin_karp::PrimeArray arr;
size_t modulus = rabin_karp::prime_generator(1500);
double start, runtime;
string str1 = random_string(10000000);
string str2 = random_string(100000);
str1.append(str2);
cout << "Searching for pattern using Rabin-Karp with division method\n";
start = cpu_time();
size_t index = rabin_karp::div_find(str1, str2, modulus);
runtime = cpu_time() - start;
if (index != string::npos) cout << "index is " << index;
else cout << "pattern not found";
cout << endl;
cout << "runtime with modulus " << modulus << ": " << runtime << endl;
cout << "Searching for pattern using Rabin-Karp with XOR method\n";
start = cpu_time();
index = rabin_karp::xor_find(str1, str2, arr);
runtime = cpu_time() - start;
if (index != string::npos) cout << "index is " << index;
else cout << "pattern not found";
cout << endl;
cout << "runtime with XOR method: " << runtime << endl;
cout << "Searching for pattern using built-in find()\n";
start = cpu_time();
index = str1.find(str2);
runtime = cpu_time() - start;
if (index != string::npos) cout << "index is " << index;
else cout << "pattern not found";
cout << endl;
cout << "runtime: " << runtime << endl;
cout << "Creating a string on which hash searching should perform well\n";
str1 = "";
for (int i = 0; i < 500000; ++i) {
str1.append(str2.substr(0, 50));
}
str1.append(str2);
cout << "Searching for pattern using Rabin-Karp division method\n";
start = cpu_time();
index = rabin_karp::div_find(str1, str2, modulus);
runtime = cpu_time() - start;
if (index != string::npos) cout << "index is " << index;
else cout << "pattern not found";
cout << endl;
cout << "runtime with modulus " << modulus << ": " << runtime << endl;
cout << "Searching for pattern using Rabin-Karp with XOR method\n";
start = cpu_time();
index = rabin_karp::xor_find(str1, str2, arr);
runtime = cpu_time() - start;
if (index != string::npos) cout << "index is " << index;
else cout << "pattern not found";
cout << endl;
cout << "runtime with XOR method: " << runtime << endl;
cout << "Searching for pattern using built-in find()\n";
start = cpu_time();
index = str1.find(str2);
runtime = cpu_time() - start;
if (index != string::npos) cout << "index is " << index;
else cout << "pattern not found";
cout << endl;
cout << "runtime: " << runtime << endl;
//*/
return 0;
}