Hello everyone.
I found this block of code on the web. It reads text from standard input and returns the total number of distinct words found in the text. It performs much faster than if I were to use the SLT set<std::string>.
Code:
#include <unistd.h>
#include <vector>
#include <cctype>
#include <stdint.h>
#include <iostream>
const size_t pagesize = 4096;
struct knot_t {
knot_t() : exists(false), childcount(0), children(0) { }
knot_t* child(unsigned c) {
// Look for a matching child
if (children) {
unsigned i = childcount / 2;
unsigned leftpos = 0;
unsigned rightpos = childcount;
while (children[i].letter != uint8_t(c)) {
if (uint8_t(c) < children[i].letter) {
rightpos = i;
i = (leftpos + i) / 2;
} else {
leftpos = i+1;
i = (rightpos + i) / 2;
}
if (rightpos == leftpos)
goto nomatch;
}
return children + i;
}
nomatch:
// Create matching child - insert in order
knot_t *nary = new knot_t[childcount + 1];
knot_t *destp = nary;
knot_t *sourcep = children;
knot_t *endsource = children + childcount;
while (sourcep != endsource && sourcep->letter < c)
*destp++ = *sourcep++;
knot_t *outp = destp++;
while (sourcep != endsource)
*destp++ = *sourcep++;
delete[] children;
children = nary;
outp->letter = c;
childcount++;
return outp;
}
unsigned distinctcount() const {
unsigned total = exists ? 1 : 0;
for (unsigned i = 0; i != childcount; ++i)
total += children[i].distinctcount();
return total;
}
bool exists;
private:
uint8_t letter;
unsigned short childcount;
knot_t *children;
};
int main(int argc, char **argv)
{
knot_t base;
char buffer[pagesize];
knot_t *curnot = 0;
while (ssize_t r = read(0, buffer, pagesize)) {
if (r <= 0)
break;
for (unsigned bufpos = 0; bufpos != unsigned(r); ++bufpos) {
uint8_t c = buffer[bufpos];
if (isspace(c)) {
if (!curnot)
continue;
curnot->exists = true;
curnot = 0;
continue;
}
if (!curnot) {
curnot = base.child(c);
continue;
}
curnot = curnot->child(c);
}
}
if (curnot)
curnot->exists = true;
std::cout << "Distinct: " << base.distinctcount() << std::endl;
return 0;
}
Now, what I need is to read real numbers (doubles) and return the total number of distinct numbers in the input. I modified the above code and I'm getting the correct answer as I should. Can someone tell me what else my code needs? I'm not sure what algorithm is used in the above, but does anyone know? Is there a faster approach to my problem?
Here is my modified version of the above code so that it works on doubles instead of strings.
Code:
#include <unistd.h>
#include <vector>
#include <cctype>
#include <stdint.h>
#include <iostream>
using namespace std;
//const size_t pagesize = 4096;
struct knot_t {
knot_t() : exists(false), childcount(0), children(0) { }
knot_t* child(double c) {
// Look for a matching child
if (children) {
unsigned i = childcount / 2;
unsigned leftpos = 0;
unsigned rightpos = childcount;
while( children[i].letter != c ) {
if (c < children[i].letter) {
rightpos = i;
i = (leftpos + i) / 2;
} else {
leftpos = i+1;
i = (rightpos + i) / 2;
}
if (rightpos == leftpos)
goto nomatch;
}
return children + i;
}
nomatch:
// Create matching child - insert in order
knot_t *nary = new knot_t[childcount + 1];
knot_t *destp = nary;
knot_t *sourcep = children;
knot_t *endsource = children + childcount;
while (sourcep != endsource && sourcep->letter < c)
*destp++ = *sourcep++;
knot_t *outp = destp++;
while (sourcep != endsource)
*destp++ = *sourcep++;
delete[] children;
children = nary;
outp->letter = c;
childcount++;
return outp;
}
unsigned distinctcount() const {
unsigned total = exists ? 1 : 0;
for (unsigned i = 0; i != childcount; ++i)
total += children[i].distinctcount();
return total;
}
bool exists;
private:
double letter;
unsigned short childcount;
knot_t *children;
};
int main(int argc, char **argv)
{
knot_t base;
//char buffer[pagesize];
knot_t *curnot = 0;
//generate large number of random doubles for input
for(int i = 0; i < 100000000; i++){
double c = (random() & 0xf) + ((random() & 0xfff) / (double)0xff);
if( !curnot ){
curnot = base.child(c);
continue;
}
curnot->exists = true;
curnot = 0;
continue;
}
cout << base.distinctcount() << endl;
return 0;
}