Since reading CSV in C seems to be such a frequent question, I thought I'd post a simple and robust implementation:
Code:
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
/* RFC4180-compatible CSV field reader. Does not consume the separator.
 *
 * Reads one field from 'input' into the dynamically allocated buffer
 * *dataptr, whose capacity is tracked in *sizeptr. The buffer convention
 * is the same as for POSIX.1-2008 getline(): start with *dataptr == NULL,
 * reuse the same buffer across calls, and free() it when done. The buffer
 * is grown with realloc() as needed and is always NUL-terminated on
 * successful return.
 *
 * Leading whitespace (tab, vertical tab, form feed, carriage return, space)
 * before the field is skipped. A field starting with '"' is read as a
 * quoted field: commas and newlines are kept as data, and "" stands for a
 * single '"'. A lone '"' inside a quoted field that is not followed by a
 * delimiter is kept as a literal quote.
 * NOTE: only EOF, '\n' or ',' directly after the closing quote end a quoted
 * field; a '\r' in that position is treated as field data.
 *
 * Hash function is a DJB2 XOR-variant:
 *     hash(0) = 5318
 *     hash(i) = (hash(i-1) * 33U) ^ character(i)
 * (Canonical DJB2 is seeded with 5381; the seed 5318 is kept here so that
 * existing hash values do not change.)
 * If you are not interested in the hash, just supply a NULL pointer.
 *
 * Returns the length of the field read, with errno set to 0.
 * If the function returns zero, also check errno for errors
 * (0 is OK; empty field): EINVAL for invalid parameters, ENOMEM if the
 * buffer could not be grown. On ENOMEM, *dataptr and *sizeptr still
 * describe the last valid buffer.
 */
size_t csv_field(char **const dataptr, size_t *const sizeptr, unsigned int *const hashptr, FILE *const input)
{
    char *data, *temp;
    size_t size;
    size_t used = 0;
    unsigned int hash = 5318U;
    int quoted = 0;
    int c;

    /* Invalid parameters? */
    if (!dataptr || !sizeptr || !input) {
        errno = EINVAL;
        return 0;
    }

    /* Initialize field content buffer. Same logic as POSIX.1-2008 getline(). */
    if (*dataptr) {
        data = *dataptr;
        size = *sizeptr;
    } else {
        data = NULL;
        size = 0;
    }

    c = getc(input);

    /* Skip leading whitespace. This is not strictly RFC4180-compliant,
     * but it allows the use of both \n and \r\n newline convention.
     * Quoted values will retain their leading whitespace, of course. */
    while (c == '\t' || c == '\v' || c == '\f' || c == '\r' || c == ' ')
        c = getc(input);

    /* End of input/record/field? Then the field is empty. */
    if (c == EOF || c == '\n' || c == ',')
        goto done;

    /* Quoted value? */
    if (c == '"') {
        quoted = 1;
        c = getc(input);
    }

    while (c != EOF) {

        /* If the field is not quoted, newline or comma ends the field. */
        if (!quoted && (c == '\n' || c == ','))
            break;

        if (quoted && c == '"') {
            /* " in a quoted value is special; look at what follows it. */
            c = getc(input);

            /* Did the " end the quoted field? */
            if (c == EOF || c == '\n' || c == ',')
                break;

            /* It really should be ", then. */
            if (c != '"') {
                /* Un-escaped " within field text; this is really an error.
                 * However, we're robust, and treat as if it was escaped:
                 * push the lookahead back and store the quote itself. */
                ungetc(c, input);
                c = '"';
            }
        }

        /* Enough room for the new character? */
        if (used >= size) {
            if (used < 4096)
                size = 4096;                     /* Minimum 4096 */
            else if (used < 1048576)
                size = (used * 5) / 4;           /* Add 25%, up to one megabyte */
            else
                size = (used | 131071) + 130944; /* Pad to next (128k-128). */

            temp = realloc(data, size);
            if (!temp) {
                errno = ENOMEM;
                return 0;
            }
            data = temp;
            *dataptr = temp;
            *sizeptr = size;
        }

        hash = (33U * hash) ^ (unsigned int)c;
        data[used++] = c;

        c = getc(input);
    }

done:
    /* Do not consume the delimiter. */
    if (c != EOF)
        ungetc(c, input);

    /* Enough room for the end-of-string mark?
     * This must be realloc(), not malloc(): when the field exactly fills
     * the buffer, the characters already stored have to be preserved.
     * realloc(NULL, size) also covers the not-yet-allocated case. */
    if (used >= size) {
        size = (used | 7) + 1; /* Next multiple of 8. */
        temp = realloc(data, size);
        if (!temp) {
            errno = ENOMEM;
            return 0;
        }
        data = temp;
        *dataptr = temp;
        *sizeptr = size;
    }

    /* Terminate field value, */
    data[used] = '\0';

    /* save hash, if asked, */
    if (hashptr)
        *hashptr = hash;

    /* and return the length of the field. */
    errno = 0;
    return used;
}
The function reads one (RFC4180-like) CSV field into the dynamically allocated buffer. It does not consume the separator (comma or newline).
It's also one of the better examples of how to sanely use goto in C. Sure, you can easily replace it with an if clause, but I do believe this is simpler and easier to understand. (I mostly used the goto because I often use goto for the error/abnormal code paths. Here, the three cases (end-of-input, newline, or comma read as the first non-whitespace character) just happened to fold nicely into one if clause.)
Because csv_field() reads the input stream character by character, it is not very fast. However, it's very robust, and handles even stuff like embedded newlines correctly.
The pointer to the hash is optional: you can supply NULL if you are not interested in the DJB2 hash of the field value.
Here's an example of how to use the function to read a CSV file:
Code:
/* Example driver: reads CSV from standard input one field at a time with
 * csv_field(), prints every non-empty field together with its record and
 * field number, length and hash, and finally prints the record count.
 *
 * Empty fields are counted but not printed. An empty first field followed
 * directly by end of input (empty input, or the position after the final
 * newline) is not counted as a record.
 *
 * Returns 0 on success, 1 on a read error or an unexpected delimiter.
 */
int main(void)
{
    char *data = NULL;
    size_t size = 0;
    size_t length;
    long record = 0L, field = 0L;
    unsigned int hash;
    int status = 0;
    int c;

    while (1) {
        length = csv_field(&data, &size, &hash, stdin);
        if (!length && errno) {
            const char *const errmsg = strerror(errno);
            fprintf(stderr, "Error reading standard input: %s.\n", errmsg);
            status = 1;
            break;
        }

        /* csv_field() leaves the delimiter unread; fetch it now so that an
         * empty field at end of input can be told apart from an empty field
         * inside a record. */
        c = getc(stdin);

        /* Nothing at all before end of input: no further record. */
        if (!length && !field && c == EOF)
            break;

        /* Advance field count, and record count if this is first field */
        if (!field++)
            record++;

        /* Don't bother printing empty fields. */
        if (length)
            printf("Record %ld, field %ld: '%s' (%lu chars, hash 0x%x)\n", record, field, data, (unsigned long)length, hash);

        if (c == EOF) {
            /* EOF: end of input */
            break;
        }
        if (c == '\n') {
            /* Newline: new record */
            field = 0L;
            continue;
        }
        if (c == ',') {
            /* Comma: new field */
            continue;
        }

        /* Invalid separator */
        if (c != '\'' && c > 32 && c < 127)
            fprintf(stderr, "Invalid delimiter ('%c', ASCII %d) in standard input!\n", c, c);
        else
            fprintf(stderr, "Invalid delimiter (ASCII %d) in standard input!\n", c);
        status = 1;
        break;
    }

    if (!status)
        printf("Read %ld records.\n", record);

    /* Discard dynamically allocated field buffer (on every exit path). */
    free(data);
    data = NULL;
    size = 0;

    return status;
}
If you need faster CSV handling, write your own functions. You can use the above code as a benchmark and unit test: you can verify your code parses some difficult test CSV (containing quoted values, escaped quotes, and embedded newlines et cetera) the same way.