Code:
#include <stdlib.h>
#include <locale.h>
#include <string.h>
#include <wchar.h>
#include <wctype.h>
#include <errno.h>
#include <stdio.h>
/* Hash type.
* The hashing function is the XOR variant of Bernstein hash.
* Note: the line read function contains an open-coded copy.
*/
typedef unsigned long hash_t;
static inline hash_t wcshash(const wchar_t *string)
{
hash_t h = 0UL;
if (string)
while (*string != L'\0')
h = (33UL * h) ^ (unsigned long)(*(string++));
return h;
}
/* Hash table entry type.
*/
struct hash_entry {
/* Words form a linked list in ascending order of hash. */
struct hash_entry *next;
hash_t hash;
/* Number of occurrences. */
unsigned long hits;
/* Length in characters, and data. */
size_t size;
wchar_t data[];
};
/* Hash table.
*/
struct hash_entry **hash_table = NULL;
size_t hash_table_size = 0;
size_t hash_table_used = 0;
/* Resize hash table.
*/
int hash_table_resize(const size_t size)
{
struct hash_entry **table, **pptr, *next;
size_t used = 0, i;
/* Invalid size? */
if (size < 2)
return errno = EINVAL;
/* Allocate new table. */
table = malloc(size * sizeof *table);
if (!table)
return errno = ENOMEM;
/* Initialize table to all NULL pointers. */
i = size; while (i-- > 0)
table[i] = NULL;
/* Scan existing hash table. */
for (i = 0; i < hash_table_size; i++) {
struct hash_entry *list = hash_table[i];
while (list) {
struct hash_entry *const curr = list;
list = list->next;
/* Locate correct point to insert into. */
pptr = &(table[curr->hash % size]);
next = table[curr->hash % size];
while (next && next->hash < curr->hash) {
pptr = &(next->next);
next = next->next;
}
/* Insert. */
curr->next = next;
*pptr = curr;
used++;
}
}
/* Discard current hash table. */
free(hash_table);
/* Update. */
hash_table = table;
hash_table_size = size;
hash_table_used = used;
/* Done. */
return 0;
}
/* Add entry to hash table.
* If it already exists, return a pointer to the existing entry.
*/
struct hash_entry *hash_table_add(const hash_t hash, const wchar_t *const data, const size_t size)
{
struct hash_entry **pptr, *next, *curr;
/* Nothing to add? */
if (!data || size < 1) {
errno = EINVAL;
return NULL;
}
/* Resize hash table if necessary. */
if (hash_table_used >= hash_table_size) {
if (hash_table_size < 48) {
if (hash_table_resize((hash_table_used | 15) + 1))
return NULL;
} else
if (hash_table_resize((3 * hash_table_used) / 2))
return NULL;
}
/* Locate hash entry. */
pptr = &(hash_table[hash % hash_table_size]);
next = hash_table[hash % hash_table_size];
while (next && next->hash < hash) {
pptr = &(next->next);
next = next->next;
}
/* Match? */
if (next && next->hash == hash && next->size == size && !wmemcmp(data, next->data, size)) {
next->hits++;
return next;
}
/* Create new entry. */
curr = malloc(sizeof (struct hash_entry) + (size + 1) * sizeof (wchar_t));
if (!curr) {
errno = ENOMEM;
return NULL;
}
/* Initialize fields. */
curr->hash = hash;
curr->hits = 1UL;
curr->size = size;
wmemcpy(curr->data, data, size);
curr->data[size] = L'\0';
/* Insert. */
curr->next = next;
*pptr = curr;
/* Update used count. */
hash_table_used++;
/* Success. */
return curr;
}
int process(const char *const filename, FILE *const in)
{
wchar_t *data;
hash_t hash_used; /* Hash using the first 'used' characters */
hash_t hash_last; /* Hash using the first 'last' characters */
size_t last; /* Last alphanumeric seen */
size_t size; /* Allocated */
size_t used; /* Used */
wint_t c;
if (fwide(in, 1) < 0) {
fprintf(stderr, "%s: Cannot process as wide-character input.\n", filename);
return 1;
}
/* Allocate an initial buffer. */
size = 1022;
data = malloc((size + 2) * sizeof *data);
if (!data) {
fprintf(stderr, "%s: %s.\n", filename, strerror(ENOMEM));
return 1;
}
/* Input loop. */
c = fgetwc(in);
while (1) {
/* Skip until first alphanumeric character. */
while (c != WEOF && !iswalpha(c))
c = fgetwc(in);
/* End of input? */
if (c == WEOF)
break;
/* Clear buffer. */
hash_used = hash_last = 5381UL;
last = used = 0;
/* Word loop. */
while (c != WEOF && !iswspace(c)) {
/* Need to grow buffer? */
if (used >= size) {
/* Any heuristic that will be >used will work. */
const size_t temp_size = (((used * 3) / 2) | 1023) + 993;
wchar_t *temp_data;
temp_data = realloc(data, (2 + temp_size) * sizeof *data);
if (!temp_data) {
fprintf(stderr, "%s: %s.\n", filename, strerror(ENOMEM));
free(data);
return 1;
}
data = temp_data;
size = temp_size;
}
/* Add to buffer and hash. */
if (iswalnum(c)) {
c = towlower(c);
data[used++] = c;
last = used;
hash_last = hash_used = (hash_used * (hash_t)33) ^ (hash_t)c;
} else {
data[used++] = c;
hash_used = (hash_used * (hash_t)33) ^ (hash_t)c;
}
c = fgetwc(in);
}
/* Terminate, just to be certain. */
data[last] = L'\0';
/* We only include the word from the first alphanumeric to last alphanumeric,
* including all punctuation etc. only if no spaces. */
if (last > 0)
if (!hash_table_add(hash_last, data, last)) {
const char *const errmsg = strerror(errno);
fprintf(stderr, "%s: Cannot add word to hash table: %s.\n", filename, errmsg);
free(data);
return 1;
}
/* Fall through to skipping until next alphanumeric. */
}
/* Error in input? */
if (c == WEOF && errno == EILSEQ) {
fprintf(stderr, "%s: %s.\n", filename, strerror(EILSEQ));
free(data);
return 1;
} else
if (ferror(in) || !feof(in)) {
fprintf(stderr, "%s: %s.\n", filename, strerror(EIO));
free(data);
return 1;
}
/* Discard data buffer. */
free(data);
/* Success. */
return 0;
}
int main(int argc, char *argv[])
{
unsigned long n;
char dummy;
int arg;
FILE *in;
setlocale(LC_CTYPE, "");
if (argc < 2 || !strcmp(argv[1], "-h") || !strcmp(argv[1], "--help")) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: %s [ -h | --help ]\n", argv[0]);
fprintf(stderr, " %s N [ FILENAME ... ]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "This program will read the specified files,\n");
fprintf(stderr, "or standard input if none (or -) specified,\n");
fprintf(stderr, "and output all words occurring at least N times.\n");
fprintf(stderr, "\n");
return 1;
}
if (sscanf(argv[1], " %lu %c", &n, &dummy) != 1) {
fprintf(stderr, "%s: Invalid number of occurrences.\n", argv[1]);
return 1;
}
if (argc > 2) {
for (arg = 2; arg < argc; arg++)
if (!strcmp(argv[arg], "-")) {
if (process("(standard input)", stdin))
return 1;
} else {
in = fopen(argv[arg], "rb");
if (!in) {
const char *const errmsg = strerror(errno);
fprintf(stderr, "%s: %s.\n", argv[arg], errmsg);
return 1;
}
if (process(argv[arg], in)) {
fclose(in);
return 1;
}
if (fclose(in)) {
fprintf(stderr, "%s: Read error.\n", argv[arg]);
return 1;
}
}
} else {
if (process("(standard input)", stdin))
return 1;
}
/* Output all hashed strings with at least n hits. */
{
size_t i;
for (i = 0; i < hash_table_size; i++) {
struct hash_entry *list = hash_table[i];
while (list) {
struct hash_entry *const curr = list;
list = list->next;
if (curr->hits >= n)
printf("%ls: %lu occurrences\n", curr->data, curr->hits);
}
}
fflush(stdout);
}
return 0;
}
If you wish me to expand on any specific detail in above, I'd be happy to, but I won't bother trying to explain everything from top to bottom.