Hi guys!
I am writing code, which reads an input file (280 MB) containing a list of words. I would like to compute
1) total # of words
2) total # of unique/distinct words
3) mean/median/mode of word count (Link)
I managed to get done with 1) and 2), but my program crashes for 3). I am not quite sure whether there are memory issues or some bug in the code.
Code:
#include <iostream>
#include <fstream>
#include <string>
#include <map>
using namespace std;
double GetMedian(int daArray[], int iSize) {
// Allocate an array of the same size and sort it.
double* dpSorted = new double[iSize];
for (int i = 0; i < iSize; ++i) {
dpSorted[i] = daArray[i];
}
for (int i = iSize - 1; i > 0; --i) {
for (int j = 0; j < i; ++j) {
if (dpSorted[j] > dpSorted[j+1]) {
double dTemp = dpSorted[j];
dpSorted[j] = dpSorted[j+1];
dpSorted[j+1] = dTemp;
}
}
}
// Middle or average of middle values in the sorted array.
double dMedian = 0.0;
if ((iSize % 2) == 0) {
dMedian = (dpSorted[iSize/2] + dpSorted[(iSize/2) - 1])/2.0;
} else {
dMedian = dpSorted[iSize/2];
}
delete [] dpSorted;
return dMedian;
}
double GetMode(int daArray[], int iSize) {
// Allocate an int array of the same size to hold the
// repetition count
int* ipRepetition = new int[iSize];
for (int i = 0; i < iSize; ++i) {
ipRepetition[i] = 0;
int j = 0;
while ((j < i) && (daArray[i] != daArray[j])) {
if (daArray[i] != daArray[j]) {
++j;
}
}
++(ipRepetition[j]);
}
int iMaxRepeat = 0;
for (int i = 1; i < iSize; ++i) {
if (ipRepetition[i] > ipRepetition[iMaxRepeat]) {
iMaxRepeat = i;
}
}
delete [] ipRepetition;
return daArray[iMaxRepeat];
}
double GetMean(int nums[], int total) {
int i;
double sum = 0.0;
for(i = 0; i < total; i++) {
sum += nums[i];
}
return (sum/total);
}
int main( int argc, char * args[] )
{
ifstream data_queries("inputfile.txt");
string data_querystats = "outputfile.txt";
ofstream fcoclickout(data_querystats.c_str());
// Define parameters
map<string, int> querySet; // map of queries and their frequencies
string query; // hold input query
// Read query-by-query
int queryCount = 0;
while (getline(data_queries, query, '\n'))
{
querySet[query]++; // increase that queries' count
queryCount++;
}
// Print simple statistics
fcoclickout << "Total number of queries: " << queryCount << endl;
fcoclickout << "Total number of unique queries: " << querySet.size() << endl;
// Iterate over map to save counts into array
int currentMax = 0;
int idx = 0;
string maxSugg;
map<string, int>::iterator map_iterator;
int Counts[querySet.size()];
for (map_iterator = querySet.begin(); map_iterator != querySet.end(); map_iterator++)
{
if (map_iterator->second > currentMax)
{
currentMax = map_iterator->second;
maxSugg = map_iterator->first;
}
// Store counts externally
Counts[idx] = map_iterator->second;
idx++;
}
// Print advanced statistics
fcoclickout << "Maximum suggestion count: " << currentMax << endl;
fcoclickout << "Query with maximum suggestions: " << maxSugg << endl;
fcoclickout << "Average suggestions/query: " << GetMean(Counts, querySet.size()) << endl;
fcoclickout << "Median suggestions/query: " << GetMedian(Counts, querySet.size()) << endl;
fcoclickout << "Mode suggestions/query: " << GetMode(Counts, querySet.size()) << endl;
cout << "done";
return 0;
}