Code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
Pattern matching functions; only length of matches returned
*/
/*
Counts how many leading characters of `current` lie in the inclusive
range [lower, upper].

The scan always stops at the string's terminating NUL, even when the
requested range happens to contain '\0', so it never reads past the end
of the string.

Returns the length of the matched run (0 if the first character is not
in the range).
*/
int match_range(const char* current, char lower, char upper)
{
    const char* next = current;
    while(*next != '\0' && *next >= lower && *next <= upper)
    {
        ++next;
    }
    return (int)(next - current);
}
/*
Returns the length of the leading run of `current` made up solely of
characters that appear in `group` (0 if the first character is not in
the group or the string is empty).
*/
int match_group(const char* current, const char* group)
{
    /* strspn measures exactly this prefix and stops at the terminator. */
    return (int)strspn(current, group);
}
/*
Returns 1 when the first character of `current` equals `character`,
otherwise 0.
*/
int match_character(const char* current, char character)
{
    if(*current != character)
    {
        return 0;
    }
    return 1;
}
int match_identifier(const char* current); /* defined later in this file */

/*
Matches an optionally signed decimal integer at `current`: an optional
'+' or '-' followed by one or more digits.

A sign with no digits after it is not an integer, so it yields 0 rather
than a one-character match.

Returns the length of the match, or 0 if there is none.
*/
int match_integer(const char* current)
{
    int sign = 0;
    if(match_character(current, '+') || match_character(current, '-'))
    {
        sign = 1;
    }

    int digits = match_range(current + sign, '0', '9');
    if(digits == 0)
    {
        return 0;
    }

    int count = sign + digits;
    /*
    Assume the convention that an integer cannot adjoin an identifier...
    */
    if(match_identifier(current + count) != 0)
    {
        return 0;
    }
    return count;
}
/*
Matches an identifier at `current`: a letter or underscore followed by
any mix of letters, digits and underscores.

Returns the length of the match, or 0 if `current` does not start an
identifier.
*/
int match_identifier(const char* current)
{
    const char* cursor = current;

    /*
    Assume the convention that an identifier cannot begin with an integer,
    so digits are not accepted for the leading run.
    */
    int step = match_range(cursor, 'a', 'z')
             + match_range(cursor, 'A', 'Z')
             + match_character(cursor, '_');

    /* Keep absorbing runs of identifier characters until none is left. */
    while(step != 0)
    {
        cursor += step;
        step = match_range(cursor, '0', '9')
             + match_range(cursor, 'a', 'z')
             + match_range(cursor, 'A', 'Z')
             + match_character(cursor, '_');
    }
    return cursor - current;
}
/*
Returns the length of the leading run of whitespace (space, tab,
carriage return, line feed) at `current`, or 0 if there is none.
*/
int match_space(const char* current)
{
    static const char whitespace[] = "\x20\t\r\n";
    return match_group(current, whitespace);
}
/*
Token lexer stuff
*/
/*
Token categories produced by the lexer.
*/
enum
{
    type_integer,
    type_identifier,
    type_space
};

/*
Maps a token type constant to a human-readable name.

Returns a pointer to a string literal; "???" is returned for any value
that is not one of the type_* constants.
*/
const char* token_type_to_text(int type)
{
    switch(type)
    {
    case type_integer:
        return "integer";
    case type_identifier:
        return "identifier";
    case type_space:
        return "space";
    default:
        return "???";
    }
}
/*
This would normally be the function that actually moves the token
information into a data structure. We'll just print them for now...
*/
/*
Prints one token to stdout.

type:    token type constant, rendered through token_type_to_text()
current: start of the token text (not NUL-terminated at the token end)
length:  number of characters belonging to the token
*/
void process_token(int type, const char* current, int length)
{
    const char* type_name = token_type_to_text(type);

    printf("token: '");
    /* Emit exactly `length` characters; the token is a slice of a longer string. */
    fwrite(current, 1, (size_t)length, stdout);
    printf("' [type: %s, length: %d]\n", type_name, length);
}
/*
Pairs a token type with the function that recognises it.
*/
struct token_matcher
{
    /* Token type reported when `match` succeeds. */
    int type;
    /* Returns the length of the match at `current`, or 0 when there is none. */
    int (*match)(const char* current);
};
/*
Note: these should be ordered by decreasing "greed"
(eg: a matcher for '++' should come before one for '+')
*/
struct token_matcher token_matchers[] =
{
    { .type = type_integer,    .match = match_integer    },
    { .type = type_identifier, .match = match_identifier },
    { .type = type_space,      .match = match_space      },
};
/*
Splits `input` into tokens and hands each one to process_token().

At every position the entries of token_matchers are tried in table order
and the first one that reports a match wins. Scanning then restarts from
the first entry at the new position, so the table's priority order is
honoured for every token.

Stops at the first position no matcher accepts. If that position is not
the end of the string, a warning with the unconsumed text is written to
stderr.
*/
void tokenize(const char* input)
{
    const size_t matcher_count = sizeof(token_matchers) / sizeof(token_matchers[0]);
    const char* current = input;

    for(;;)
    {
        int length = 0;
        size_t index;

        /* Find the highest-priority matcher that accepts the current position. */
        for(index = 0; index < matcher_count; ++index)
        {
            length = token_matchers[index].match(current);
            if(length > 0)
            {
                break;
            }
        }
        if(index == matcher_count)
        {
            break; /* nothing matched here */
        }

        process_token(token_matchers[index].type, current, length);
        current += length;
    }

    if(*current != 0)
    {
        fprintf(stderr, "Warning: end of input not reached!\nData remaining: %s\n", current);
    }
}
/*
Interactive driver: tokenizes a built-in sample line, then keeps reading
lines from stdin and tokenizing them until an empty line, end-of-file or
a read error is encountered.
*/
int main(void)
{
    char buffer[1024] = "select mycol from mytable";

    puts(buffer);
    for(;;)
    {
        tokenize(buffer);
        puts("Enter some more text to test:");

        /*
        fgets returns NULL on end-of-file or error and then leaves the old
        contents in place; stop instead of re-tokenizing stale data forever.
        On success the buffer is always NUL-terminated.
        */
        if(fgets(buffer, (int)sizeof buffer, stdin) == NULL)
        {
            break;
        }
        if(buffer[0] == '\n')
        {
            break;
        }
    }
    return 0;
}