Note: My code in post #27 has a bug in the output: it prints the output parts in reverse order. Lines 272-276 should be
Code:
int d;
*(--p) = '\0';
*(--p) = '\n';
for (d = 0; d < PARTS; d++) {
Well, scalability is an issue. The example data set is just 21019 records, whereas the practical data sets are approximately two million records, almost a hundred times larger. Testing with the example data set is not a reliable indicator of how long the programs take with the larger data sets.
So, I generated a uniformly random two-million-row data set using awk,
Code:
awk 'BEGIN { n = 2000000; while (n-->0) printf "%d %d %d %d %d\n", int(1 + 35*rand()), int(1 + 35*rand()), int(1 + 35*rand()), int(1 + 35*rand()), int(1 + 35*rand()) }' > two-million.txt
and generated a packed version (using the format I described in post #27: five characters per record (row in the original file), using the characters
123456789:;<=>?@ABCDEFGHIJKLMNOPQRS to encode the 35 possible values of each field, with no record separators) using
Code:
awk 'NF>=5 { printf "%c%c%c%c%c", 48+$1, 48+$2, 48+$3, 48+$4, 48+$5 }' two-million.txt > two-million.packed
The text format file was 27,428,568 bytes long. The packed format file was 10,000,000 bytes long, only about 36% of the size of the text format file.
If you used e.g. printf("%d %d %d %d %d\n", $n1, $n2, $n3, $n4, $n5) in PHP to output the records, just use printf("%c%c%c%c%c", 48+$n1, 48+$n2, 48+$n3, 48+$n4, 48+$n5) instead to output them in the packed format.
On my Core i5-4200U laptop, with the input files in cache,
- the mawk script took 24 seconds
- the gawk script took 10 seconds (surprise!!)
- plain sort -g two-million.txt takes 9 seconds (but uses all four threads (two cores) of this CPU; about 31 seconds user CPU time)
- my mmap code took 0.26 seconds
- my packed-input format code, below, took 0.12 seconds
It seems that gawk (GNU awk) has a much better hashing implementation than mawk, given how much faster it is on this large data set. I wasn't aware of this before, as I normally use awk/mawk/gawk only for conversions and test-data generation, nothing time-sensitive.
Using sort is not worth it; doing it all in gawk (which uses only one CPU core, under 10 seconds of user CPU time) is the better option.
My memory-mapping code is faster than the gawk script by a factor of about 38. The packed-input format cuts the processing time to less than half of that, a factor of about 83 over the gawk script.
In Linux, you can drop all caches using sudo sh -c 'sync; echo 3 >/proc/sys/vm/drop_caches; sync'. Unfortunately (with respect to this test!), this laptop has a sizzling-fast SSD, and the results are basically the same even when the files are not cached, because reading them simply does not take long enough to show up:
- the mawk script took 24 seconds
- the gawk script took 10 seconds
- my mmap code took 0.26 seconds
- my packed-input format code, below, took 0.12 seconds
Note that when using an SSD, the initial chunk of the file is available almost immediately, whereas spinning disks add all kinds of latencies, on the order of 10 ms.
3000 files with two million records each would take about fourteen minutes to process if in readable text format, or about six minutes if in the packed format.
The version of my mmap program that uses this packed input format, but produces exactly the same (human-readable decimal number) output as my mmap program, is here:
Code:
#define _POSIX_C_SOURCE 200809L
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <string.h>
#include <errno.h>
#include <stdio.h>
#define MIN_PART 1
#define MIN_CHAR 49
#define MAX_CHAR 85
#define RADIX (MAX_CHAR - MIN_CHAR + 1)
#define LIMIT (RADIX*RADIX*RADIX*RADIX*RADIX)
#define PARTS 5
#define PART_DIGITS 2
#if defined(MAP_NORESERVE)
#define ROMAP_FLAGS MAP_SHARED | MAP_NORESERVE
#else
#define ROMAP_FLAGS MAP_SHARED
#endif
#if __GNUC__ >= 4
#define unlikely(x) __builtin_expect((x), 0)
#else
#define unlikely(x) (x)
#endif
typedef struct {
const unsigned char *data;
size_t size;
size_t len;
} romap;
static void map_release(romap *const map)
{
if (map) {
if (map->size)
munmap((void *)map->data, map->size);
map->data = NULL;
map->size = 0;
map->len = 0;
}
}
static int map_readonly(const char *const filename, romap *const map)
{
struct stat info;
void *data;
long page;
size_t size, len;
int fd, result;
if (!map)
return errno = EINVAL;
map->data = NULL;
map->len = 0;
map->size = 0;
if (!filename || !*filename)
return errno = ENOENT;
errno = 0;
page = sysconf(_SC_PAGESIZE);
if (page < 1L || errno != 0)
return errno = ENOSYS;
do {
fd = open(filename, O_RDONLY | O_NOCTTY);
} while (fd == -1 && errno == EINTR);
if (fd == -1)
return errno;
do {
result = fstat(fd, &info);
} while (result == -1 && errno == EINTR);
if (result == -1) {
const int saved_errno = errno;
close(fd);
return errno = saved_errno;
}
if (info.st_size < (off_t)1) {
close(fd);
return errno = ENODATA;
}
len = (size_t)info.st_size;
if ((off_t)len != info.st_size) {
close(fd);
return errno = ENOMEM;
}
if (len % (size_t)page)
size = len + (size_t)page - (len % (size_t)page);
else
size = len;
if (size < len) {
close(fd);
return errno = ENOMEM;
}
do {
data = mmap(NULL, size, PROT_READ, ROMAP_FLAGS, fd, (off_t)0);
} while (data == MAP_FAILED && errno == EINTR);
if (data == MAP_FAILED) {
const int saved_errno = errno;
close(fd);
return errno = saved_errno;
}
if (close(fd)) {
munmap(data, size);
return errno = EIO;
}
map->data = (const unsigned char *)data;
map->size = size;
map->len = len;
return errno = 0;
}
static int parse_ulong(const char *s, unsigned long *const val)
{
const char *end = NULL;
if (!val || !s || !*s)
return EINVAL;
errno = 0;
*val = strtoul(s, (char **)&end, 0);
if (errno || !end || end <= s)
return EINVAL;
while (*end == '\t' || *end == '\n' || *end == '\v' ||
*end == '\f' || *end == '\r' || *end == ' ')
end++;
if (*end)
return EINVAL;
return 0;
}
int main(int argc, char *argv[])
{
unsigned char *count;
romap input;
unsigned long exact;
unsigned char match;
size_t i, n;
if (argc != 3 || !strcmp(argv[1], "-h") || !strcmp(argv[1], "--help")) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: %s [ -h | --help ]\n", argv[0]);
fprintf(stderr, " %s FILENAME COUNT\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "This program reads FILENAME, which consists of\n");
fprintf(stderr, "five-character records with no record separators.\n");
fprintf(stderr, "The characters are ASCII codes 49 '1' to 83 'S':\n");
fprintf(stderr, "  1 2 3 4 5 6 7 8 9 : ; < = > ? @ A B\n");
fprintf(stderr, "  C D E F G H I J K L M N O P Q R S\n");
fprintf(stderr, "Character '1' corresponds to value 1, and\n");
fprintf(stderr, "character 'S' to value 35.\n");
fprintf(stderr, "\n");
fprintf(stderr, "Records that occur exactly COUNT times are output as\n");
fprintf(stderr, " V1 V2 V3 V4 V5\n");
fprintf(stderr, "where V1..V5 are the decimal values corresponding to\n");
fprintf(stderr, "the five-character string.\n");
fprintf(stderr, "\n");
return EXIT_FAILURE;
}
if (parse_ulong(argv[2], &exact)) {
fprintf(stderr, "%s: Invalid COUNT.\n", argv[2]);
return EXIT_FAILURE;
}
match = (unsigned char)exact;
if ((unsigned long)match != exact || !(unsigned char)(match + 1U)) {
fprintf(stderr, "%s: COUNT is too large.\n", argv[2]);
return EXIT_FAILURE;
}
if (map_readonly(argv[1], &input)) {
fprintf(stderr, "%s: %s.\n", argv[1], strerror(errno));
return EXIT_FAILURE;
}
if (input.len % (size_t)PARTS) {
fprintf(stderr, "%s: File length is not a multiple of %d!\n", argv[1], PARTS);
return EXIT_FAILURE;
}
count = malloc((size_t)LIMIT * sizeof count[0]);
if (!count) {
fprintf(stderr, "Not enough memory for the buckets.\n");
return EXIT_FAILURE;
}
memset(count, 0, (size_t)LIMIT * sizeof count[0]);
fprintf(stderr, "Reading %s .. ", argv[1]);
fflush(stderr);
n = input.len / PARTS;
for (i = 0; i < n; i++) {
const unsigned char *p = input.data + i * (size_t)PARTS;
const unsigned char *const q = input.data + i * (size_t)PARTS + (size_t)PARTS;
size_t k = 0;
while (p < q)
if (*p >= MIN_CHAR && *p <= MAX_CHAR)
k = (size_t)RADIX * k + (size_t)(*(p++) - MIN_CHAR);
else {
fprintf(stderr, "%s: Invalid data in record %lu of %lu.\n", argv[1], 1UL + (unsigned long)i, (unsigned long)n);
return EXIT_FAILURE;
}
if (unlikely(!++count[k]))
--count[k];
}
fprintf(stderr, "%lu records.\n", (unsigned long)n);
fflush(stderr);
map_release(&input);
for (i = 0; i < (size_t)LIMIT; i++)
if (count[i] == match) {
char buffer[8 + (PART_DIGITS + 1) * PARTS];
char *p = buffer + sizeof buffer;
unsigned long u = (unsigned int)i;
int d;
*(--p) = '\0';
*(--p) = '\n';
for (d = 0; d < PARTS; d++) {
unsigned long v = (unsigned long)MIN_PART + (u % (unsigned long)RADIX);
u /= (unsigned long)RADIX;
do {
*(--p) = '0' + (v % 10UL);
v /= 10UL;
} while (v);
*(--p) = ' ';
}
fputs(p + 1, stdout);
}
return EXIT_SUCCESS;
}
Next, a threaded version of the above. Input processing is done by one or more separate threads, so this one will be slower (due to unnecessary overheads) if only one thread is used. Theoretically, it could be faster when multiple threads are used:
Code:
#define _POSIX_C_SOURCE 200809L
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <pthread.h>
#include <string.h>
#include <errno.h>
#include <stdio.h>
#define MIN_PART 1
#define MIN_CHAR 49
#define MAX_CHAR 85
#define RADIX (MAX_CHAR - MIN_CHAR + 1)
#define LIMIT (RADIX*RADIX*RADIX*RADIX*RADIX)
#define PARTS 5
#define PART_DIGITS 2
#define MAX_THREADS 16
#if defined(MAP_NORESERVE)
#define ROMAP_FLAGS MAP_SHARED | MAP_NORESERVE
#else
#define ROMAP_FLAGS MAP_SHARED
#endif
#if __GNUC__ >= 4
#define unlikely(x) __builtin_expect((x), 0)
#else
#define unlikely(x) (x)
#endif
typedef struct {
const unsigned char *data;
size_t size;
size_t len;
} romap;
static void map_release(romap *const map)
{
if (map) {
if (map->size)
munmap((void *)map->data, map->size);
map->data = NULL;
map->size = 0;
map->len = 0;
}
}
static int map_readonly(const char *const filename, romap *const map)
{
struct stat info;
void *data;
long page;
size_t size, len;
int fd, result;
if (!map)
return errno = EINVAL;
map->data = NULL;
map->len = 0;
map->size = 0;
if (!filename || !*filename)
return errno = ENOENT;
errno = 0;
page = sysconf(_SC_PAGESIZE);
if (page < 1L || errno != 0)
return errno = ENOSYS;
do {
fd = open(filename, O_RDONLY | O_NOCTTY);
} while (fd == -1 && errno == EINTR);
if (fd == -1)
return errno;
do {
result = fstat(fd, &info);
} while (result == -1 && errno == EINTR);
if (result == -1) {
const int saved_errno = errno;
close(fd);
return errno = saved_errno;
}
if (info.st_size < (off_t)1) {
close(fd);
return errno = ENODATA;
}
len = (size_t)info.st_size;
if ((off_t)len != info.st_size) {
close(fd);
return errno = ENOMEM;
}
if (len % (size_t)page)
size = len + (size_t)page - (len % (size_t)page);
else
size = len;
if (size < len) {
close(fd);
return errno = ENOMEM;
}
do {
data = mmap(NULL, size, PROT_READ, ROMAP_FLAGS, fd, (off_t)0);
} while (data == MAP_FAILED && errno == EINTR);
if (data == MAP_FAILED) {
const int saved_errno = errno;
close(fd);
return errno = saved_errno;
}
if (close(fd)) {
munmap(data, size);
return errno = EIO;
}
map->data = (const unsigned char *)data;
map->size = size;
map->len = len;
return errno = 0;
}
static int parse_ulong(const char *s, unsigned long *const val)
{
const char *end = NULL;
if (!val || !s || !*s)
return EINVAL;
errno = 0;
*val = strtoul(s, (char **)&end, 0);
if (errno || !end || end <= s)
return EINVAL;
while (*end == '\t' || *end == '\n' || *end == '\v' ||
*end == '\f' || *end == '\r' || *end == ' ')
end++;
if (*end)
return EINVAL;
return 0;
}
struct work {
const char *fname;
unsigned char *count;
const unsigned char *data;
size_t first;
size_t limit;
};
pthread_mutex_t error_mutex = PTHREAD_MUTEX_INITIALIZER;
void *worker(void *workref)
{
const char *const fname = ((struct work *)workref)->fname;
unsigned char *const count = ((struct work *)workref)->count;
const unsigned char *const data = ((struct work *)workref)->data;
const size_t limit = ((struct work *)workref)->limit;
size_t index = ((struct work *)workref)->first;
while (index < limit) {
const unsigned char *p = data + index * PARTS;
const unsigned char *const q = data + (index + 1) * PARTS;
size_t value = 0;
while (p < q)
if (*p >= MIN_CHAR && *p <= MAX_CHAR)
value = (size_t)RADIX * value + (size_t)(*(p++) - MIN_CHAR);
else {
pthread_mutex_lock(&error_mutex);
fprintf(stderr, "%s: Record %zu at file offset %zu is invalid.\n",
fname, index, (size_t)PARTS * index);
fflush(stderr);
pthread_mutex_unlock(&error_mutex);
return (void *)(long)EINVAL;
}
index++;
if (unlikely(!__atomic_add_fetch(count + value, (unsigned char)1U, __ATOMIC_SEQ_CST)))
__atomic_store_n(count + value, ~(unsigned char)0U, __ATOMIC_SEQ_CST);
}
return NULL;
}
int main(int argc, char *argv[])
{
pthread_t thread_id[MAX_THREADS];
struct work thread_work[MAX_THREADS];
size_t thread_count;
pthread_attr_t attrs;
unsigned char *count;
romap input;
unsigned long value;
unsigned char match;
size_t i, n;
if (argc < 3 || argc > 4 || !strcmp(argv[1], "-h") || !strcmp(argv[1], "--help")) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: %s [ -h | --help ]\n", argv[0]);
fprintf(stderr, " %s FILENAME COUNT [ THREADS ]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "This program reads FILENAME, which consists of\n");
fprintf(stderr, "five-character records with no record separators.\n");
fprintf(stderr, "The characters are ASCII codes 49 '1' to 83 'S':\n");
fprintf(stderr, "  1 2 3 4 5 6 7 8 9 : ; < = > ? @ A B\n");
fprintf(stderr, "  C D E F G H I J K L M N O P Q R S\n");
fprintf(stderr, "Character '1' corresponds to value 1, and\n");
fprintf(stderr, "character 'S' to value 35.\n");
fprintf(stderr, "\n");
fprintf(stderr, "Records that occur exactly COUNT times are output as\n");
fprintf(stderr, " V1 V2 V3 V4 V5\n");
fprintf(stderr, "where V1..V5 are the decimal values corresponding to\n");
fprintf(stderr, "the five-character string.\n");
fprintf(stderr, "\n");
return EXIT_FAILURE;
}
if (parse_ulong(argv[2], &value)) {
fprintf(stderr, "%s: Invalid COUNT.\n", argv[2]);
return EXIT_FAILURE;
}
match = (unsigned char)value;
if ((unsigned long)match != value || !(unsigned char)(match + 1U)) {
fprintf(stderr, "%s: COUNT is too large.\n", argv[2]);
return EXIT_FAILURE;
}
if (argc > 3) {
if (parse_ulong(argv[3], &value) || value < 1UL || value > (unsigned long)MAX_THREADS) {
fprintf(stderr, "%s: Invalid number of threads.\n", argv[3]);
return EXIT_FAILURE;
}
thread_count = (size_t)value;
} else
thread_count = 1;
if (map_readonly(argv[1], &input)) {
fprintf(stderr, "%s: %s.\n", argv[1], strerror(errno));
return EXIT_FAILURE;
}
if (input.len % (size_t)PARTS) {
fprintf(stderr, "%s: File length is not a multiple of %d!\n", argv[1], PARTS);
return EXIT_FAILURE;
}
n = input.len / (size_t)PARTS;
count = malloc((size_t)LIMIT * sizeof count[0]);
if (!count) {
fprintf(stderr, "Not enough memory for the buckets.\n");
return EXIT_FAILURE;
}
memset(count, 0, (size_t)LIMIT * sizeof count[0]);
for (i = 0; i < thread_count; i++) {
thread_work[i].fname = argv[1];
thread_work[i].data = input.data;
thread_work[i].count = count;
thread_work[i].first = (n * i) / thread_count;
thread_work[i].limit = (n * (i + 1)) / thread_count;
}
pthread_attr_init(&attrs);
if (pthread_attr_setstacksize(&attrs, 32768U))
pthread_attr_setstacksize(&attrs, 65536U);
fprintf(stderr, "Reading %s, %lu records .. ", argv[1], (unsigned long)n);
fflush(stderr);
for (i = 0; i < thread_count; i++) {
int result;
result = pthread_create(&(thread_id[i]), &attrs, worker, &thread_work[i]);
if (result) {
fprintf(stderr, "Cannot create worker thread: %s.\n", strerror(result));
while (i-->0)
pthread_cancel(thread_id[i]);
return EXIT_FAILURE;
}
}
pthread_attr_destroy(&attrs);
{ size_t errorcount = 0;
for (i = 0; i < thread_count; i++) {
void *retval = NULL;
pthread_join(thread_id[i], &retval);
if (retval)
++errorcount;
}
if (errorcount)
return EXIT_FAILURE;
}
fprintf(stderr, "Done.\n");
fflush(stderr);
map_release(&input);
for (i = 0; i < (size_t)LIMIT; i++)
if (count[i] == match) {
char buffer[8 + (PART_DIGITS + 1) * PARTS];
char *p = buffer + sizeof buffer;
unsigned long u = (unsigned int)i;
int d;
*(--p) = '\0';
*(--p) = '\n';
for (d = 0; d < PARTS; d++) {
unsigned long v = (unsigned long)MIN_PART + (u % (unsigned long)RADIX);
u /= (unsigned long)RADIX;
do {
*(--p) = '0' + (v % 10UL);
v /= 10UL;
} while (v);
*(--p) = ' ';
}
fputs(p + 1, stdout);
}
return EXIT_SUCCESS;
}
However, as I suspected, we are RAM or cache bandwidth bound, and adding CPU cores does not help -- on my machine, at least. Using only one worker thread, the two million records are processed in 0.15 seconds; a bit more than the single-threaded process takes, due to the overheads.
With two or more threads, the real-world time taken is 0.11 seconds, essentially the same as the single-threaded process, so the extra cores do not help. However, that might be because the i5-4200U has only two CPU cores (and four threads). In any case, this indicates that multithreading might not help here; it would be useful to verify on other hardware, too.
Finally, here is my initial suggestion (which treats each packed record as a large number; a kind of trivial hashing). This one uses qsort() from the standard C library, and due to the large number of comparison-function calls that entails, it is not the fastest possible approach. On my machine it takes 0.38 seconds (even though it uses the packed input format), about three times slower than the single-threaded packed-input program above:
Code:
#define _POSIX_C_SOURCE 200809L
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <string.h>
#include <errno.h>
#include <stdio.h>
#define MIN_PART 1
#define MIN_CHAR 49
#define MAX_CHAR 85
#define RADIX (MAX_CHAR - MIN_CHAR + 1)
#define LIMIT (RADIX*RADIX*RADIX*RADIX*RADIX)
#define PARTS 5
#define PART_DIGITS 2
#if defined(MAP_NORESERVE)
#define ROMAP_FLAGS MAP_SHARED | MAP_NORESERVE
#else
#define ROMAP_FLAGS MAP_SHARED
#endif
#if __GNUC__ >= 4
#define unlikely(x) __builtin_expect((x), 0)
#else
#define unlikely(x) (x)
#endif
typedef struct {
const unsigned char *data;
size_t size;
size_t len;
} romap;
static void map_release(romap *const map)
{
if (map) {
if (map->size)
munmap((void *)map->data, map->size);
map->data = NULL;
map->size = 0;
map->len = 0;
}
}
static int map_readonly(const char *const filename, romap *const map)
{
struct stat info;
void *data;
long page;
size_t size, len;
int fd, result;
if (!map)
return errno = EINVAL;
map->data = NULL;
map->len = 0;
map->size = 0;
if (!filename || !*filename)
return errno = ENOENT;
errno = 0;
page = sysconf(_SC_PAGESIZE);
if (page < 1L || errno != 0)
return errno = ENOSYS;
do {
fd = open(filename, O_RDONLY | O_NOCTTY);
} while (fd == -1 && errno == EINTR);
if (fd == -1)
return errno;
do {
result = fstat(fd, &info);
} while (result == -1 && errno == EINTR);
if (result == -1) {
const int saved_errno = errno;
close(fd);
return errno = saved_errno;
}
if (info.st_size < (off_t)1) {
close(fd);
return errno = ENODATA;
}
len = (size_t)info.st_size;
if ((off_t)len != info.st_size) {
close(fd);
return errno = ENOMEM;
}
if (len % (size_t)page)
size = len + (size_t)page - (len % (size_t)page);
else
size = len;
if (size < len) {
close(fd);
return errno = ENOMEM;
}
do {
data = mmap(NULL, size, PROT_READ, ROMAP_FLAGS, fd, (off_t)0);
} while (data == MAP_FAILED && errno == EINTR);
if (data == MAP_FAILED) {
const int saved_errno = errno;
close(fd);
return errno = saved_errno;
}
if (close(fd)) {
munmap(data, size);
return errno = EIO;
}
map->data = (const unsigned char *)data;
map->size = size;
map->len = len;
return errno = 0;
}
static int parse_ulong(const char *s, unsigned long *const val)
{
const char *end = NULL;
if (!val || !s || !*s)
return EINVAL;
errno = 0;
*val = strtoul(s, (char **)&end, 0);
if (errno || !end || end <= s)
return EINVAL;
while (*end == '\t' || *end == '\n' || *end == '\v' ||
*end == '\f' || *end == '\r' || *end == ' ')
end++;
if (*end)
return EINVAL;
return 0;
}
static int compare_unsigned_int(const void *ptr1, const void *ptr2)
{
if (*(const unsigned int *)ptr1 < *(const unsigned int *)ptr2)
return -1;
else
if (*(const unsigned int *)ptr1 > *(const unsigned int *)ptr2)
return +1;
else
return 0;
}
int main(int argc, char *argv[])
{
unsigned int *array;
romap input;
unsigned long value;
size_t match;
size_t i, n;
if (argc != 3 || !strcmp(argv[1], "-h") || !strcmp(argv[1], "--help")) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: %s [ -h | --help ]\n", argv[0]);
fprintf(stderr, " %s FILENAME COUNT\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "This program reads FILENAME, which consists of\n");
fprintf(stderr, "five-character records with no record separators.\n");
fprintf(stderr, "The characters are ASCII codes 49 '1' to 83 'S':\n");
fprintf(stderr, "  1 2 3 4 5 6 7 8 9 : ; < = > ? @ A B\n");
fprintf(stderr, "  C D E F G H I J K L M N O P Q R S\n");
fprintf(stderr, "Character '1' corresponds to value 1, and\n");
fprintf(stderr, "character 'S' to value 35.\n");
fprintf(stderr, "\n");
fprintf(stderr, "Records that occur exactly COUNT times are output as\n");
fprintf(stderr, " V1 V2 V3 V4 V5\n");
fprintf(stderr, "where V1..V5 are the decimal values corresponding to\n");
fprintf(stderr, "the five-character string.\n");
fprintf(stderr, "\n");
return EXIT_FAILURE;
}
if (parse_ulong(argv[2], &value)) {
fprintf(stderr, "%s: Invalid COUNT.\n", argv[2]);
return EXIT_FAILURE;
}
match = (size_t)value;
if ((unsigned long)match != value) {
fprintf(stderr, "%s: COUNT is too large.\n", argv[2]);
return EXIT_FAILURE;
}
if (map_readonly(argv[1], &input)) {
fprintf(stderr, "%s: %s.\n", argv[1], strerror(errno));
return EXIT_FAILURE;
}
if (input.len % (size_t)PARTS) {
fprintf(stderr, "%s: File length is not a multiple of %d!\n", argv[1], PARTS);
return EXIT_FAILURE;
}
n = input.len / PARTS;
array = malloc((size_t)n * sizeof array[0]);
if (!array) {
fprintf(stderr, "Not enough memory for the array.\n");
return EXIT_FAILURE;
}
fprintf(stderr, "Reading %s, %lu records .. ", argv[1], (unsigned long)n);
fflush(stderr);
for (i = 0; i < n; i++) {
const unsigned char *p = input.data + i * (size_t)PARTS;
const unsigned char *const q = input.data + i * (size_t)PARTS + (size_t)PARTS;
size_t k = 0;
while (p < q)
if (*p >= MIN_CHAR && *p <= MAX_CHAR)
k = (size_t)RADIX * k + (size_t)(*(p++) - MIN_CHAR);
else {
fprintf(stderr, "%s: Invalid data in record %lu of %lu.\n", argv[1], 1UL + (unsigned long)i, (unsigned long)n);
return EXIT_FAILURE;
}
array[i] = k;
}
map_release(&input);
fprintf(stderr, "done. Sorting ..");
fflush(stderr);
qsort(array, n, sizeof array[0], compare_unsigned_int);
fprintf(stderr, "done.\n");
fflush(stderr);
i = 0; while (i < n) {
size_t j = i + 1;
while (j < n && array[i] == array[j])
j++;
if (j - i == match) {
char buffer[8 + (PART_DIGITS + 1) * PARTS];
char *p = buffer + sizeof buffer;
unsigned long u = (unsigned long)array[i];
int d;
*(--p) = '\0';
*(--p) = '\n';
for (d = 0; d < PARTS; d++) {
unsigned long v = (unsigned long)MIN_PART + (u % (unsigned long)RADIX);
u /= (unsigned long)RADIX;
do {
*(--p) = '0' + (v % 10UL);
v /= 10UL;
} while (v);
*(--p) = ' ';
}
fputs(p + 1, stdout);
}
i = j;
}
return EXIT_SUCCESS;
}
Replacing the qsort() call with an optimized quicksort that switches to insertion sort for short segments might improve on that. Such a quicksort can often be parallelized across multiple threads, so if the 0.12-second real-world time is still too large, a multithreaded optimized quicksort+insertion sort for unsigned integers with no extra payload would be my next thing to check.
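For reference, such a direct sort might look like the following sketch: a Hoare-partition quicksort on plain unsigned ints with a median-of-three pivot, falling back to insertion sort below a cutoff, recursing only into the smaller partition so the stack depth stays logarithmic. The cutoff of 16 is a guess, not a tuned constant:

```c
#include <stddef.h>

/* Direct quicksort + insertion sort for plain unsigned ints with no
 * payload: no comparator function calls, unlike qsort(). */
#define SORT_CUTOFF 16

static void insertion_sort_uint(unsigned int *a, size_t n)
{
    size_t i, j;
    for (i = 1; i < n; i++) {
        const unsigned int v = a[i];
        for (j = i; j > 0 && a[j - 1] > v; j--)
            a[j] = a[j - 1];
        a[j] = v;
    }
}

void quicksort_uint(unsigned int *a, size_t n)
{
    while (n > SORT_CUTOFF) {
        size_t mid = n / 2, i = 0, j = n - 1;
        unsigned int pivot, t;
        /* Median-of-three: order a[0], a[mid], a[n-1]. This also
         * guards the partition scans against running off the ends. */
        if (a[mid] < a[0])   { t = a[0];   a[0] = a[mid];   a[mid] = t; }
        if (a[n-1] < a[0])   { t = a[0];   a[0] = a[n-1];   a[n-1] = t; }
        if (a[n-1] < a[mid]) { t = a[mid]; a[mid] = a[n-1]; a[n-1] = t; }
        pivot = a[mid];
        /* Hoare partition around the pivot value. */
        for (;;) {
            while (a[i] < pivot) i++;
            while (a[j] > pivot) j--;
            if (i >= j)
                break;
            t = a[i]; a[i] = a[j]; a[j] = t;
            i++; j--;
        }
        /* Recurse into the smaller part, iterate on the larger. */
        if (j + 1 <= n - (j + 1)) {
            quicksort_uint(a, j + 1);
            a += j + 1;
            n -= j + 1;
        } else {
            quicksort_uint(a + j + 1, n - (j + 1));
            n = j + 1;
        }
    }
    insertion_sort_uint(a, n);
}
```

A parallel version would hand one of the two partitions to another thread while the current thread sorts the other, down to some minimum segment size.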
The reason I posted this message, and the above sources, is to show the odd, low-level directions you might end up taking when chasing performance in C. The details in the source code do not matter that much; it is the overall approach -- the algorithm and the implementation design -- that determines how the code performs. And because C is such a low-level language, the performance is often tightly coupled to the architecture used.
And we haven't talked about distributed processing for this at all, yet. We could have several processes, possibly running on separate machines, preprocessing or partially processing these files. Each one could process its own chunk, generating a packed array of records with the number of occurrences per record, in sorted order; other processes would take these chunks and combine them into one, also selecting only those entries that have the desired number of occurrences. While this sounds silly if you only consider a single file, it becomes much more feasible if you have a large set of files to process; then, distributing the processing in this fashion, especially if the input files are spread over the nodes of a computing cluster, might yield the results in the shortest possible elapsed real time.
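The combining step could be an ordinary sorted-array merge that sums the per-record counts. A sketch (the entry type and the names here are mine, just to illustrate the idea, not part of the programs above):

```c
#include <stddef.h>

/* Each node would produce a sorted array of (packed record value,
 * occurrence count) pairs for its chunk; a combiner merges two such
 * arrays, summing the counts of records that appear in both. */
typedef struct {
    unsigned int value;  /* packed record, as in the programs above */
    unsigned long count; /* occurrences within the chunk(s) */
} entry;

/* Merge sorted arrays a[0..na-1] and b[0..nb-1] into out, summing
 * counts of equal values; out must have room for na + nb entries.
 * Returns the number of entries written. */
size_t merge_counts(const entry *a, size_t na,
                    const entry *b, size_t nb,
                    entry *out)
{
    size_t i = 0, j = 0, k = 0;

    while (i < na && j < nb) {
        if (a[i].value < b[j].value)
            out[k++] = a[i++];
        else if (b[j].value < a[i].value)
            out[k++] = b[j++];
        else {
            out[k].value = a[i].value;
            out[k].count = a[i].count + b[j].count;
            k++; i++; j++;
        }
    }
    while (i < na)
        out[k++] = a[i++];
    while (j < nb)
        out[k++] = b[j++];
    return k;
}
```

The final combiner would then simply scan the merged array and emit the entries whose count equals the desired COUNT.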
If you have any questions on the logic or implementation of my code -- or if you notice any bugs in it -- do ask: I'd be happy to try to answer and/or fix them.