Hi all -
I've made huge progress with my lexer in the last day or so! The code is quite clean now and easy to follow now that I have functions for each lexeme type.
One small problem - the code is tacking on a bit of junk onto the "from" token which means that it isn't recognised as a keyword. So, if someone can track down why this is, that'd be *great* and the lexer will work perfectly!
Here's the code -
Code:
/* This code is released to the public domain. */
/* "Share and enjoy......" :) */
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
#define NUMBER_OF_KEYWORDS 9
/* Array of our keywords in string form. */
char *kw_strings[] = {
"select", "from", "where", "and", "or", "not", "in", "is", "null"
} ;
/* Search function to search the array of keywords. */
int search(char *arr[], int dim, char *str) {
int i;
int found_match;
for (i=0; i<dim; i++) {
if ( !strcmp(arr[i] , str ) ) {
found_match = 1;
break;
} else found_match = 0;
} /* For */
return found_match;
} /* search */
/* Forward declarations. */
void lex(char *str) ;
void parse(char token[], char *toktype);
void lex_kwident(char *str) {
char token[20];
char *toktype;
int i=0;
while (isalnum(*str) && *str != '\0' && i<20) {
token[i] = *str;
i++;
str++;
}
if (search(kw_strings, NUMBER_OF_KEYWORDS, token) == 1 )
toktype = "Keyword";
else
toktype = "Identifier" ;
parse(token, toktype);
memset(&token[0], 0, sizeof(token));
lex(str);
}
void lex_string(char *str) {
char token[20];
char *toktype;
int i=0;
while ( (*str != '"') && *str != '\0' && i<20) {
token[i] = *str;
i++;
str++;
}
/* Add the end double-quote. */
token[i] = '"' ;
toktype = "String" ;
parse(token, toktype);
memset(&token[0], 0, sizeof(token));
lex(str);
}
void lex_number(char *str) {
char token[20];
char *toktype;
int i=0;
while (isdigit(*str) && *str != '\0' && i<20) {
token[i] = *str;
i++;
str++;
}
toktype = "Number" ;
parse(token, toktype);
memset(&token[0], 0, sizeof(token));
lex(str);
}
void lex_punct(char *str) {
char token[20];
char *toktype;
int i=0;
while (ispunct(*str) && *str != '\0' && i<20) {
token[i] = *str;
i++;
str++;
}
toktype = "Punct" ;
parse(token, toktype);
memset(&token[0], 0, sizeof(token));
lex(str);
}
void lex_space(char *str) {
char token[80] = " ";
char *toktype = "Space";
while ( isspace(*str) && *str != '\0') {
str++;
}
toktype = "Space" ;
parse(token, toktype);
memset(&token[0], 0, sizeof(token));
lex(str);
}
void lex(char *str) {
if (isalpha(*str) || *str == '_') lex_kwident(str) ;
else if ( (*str == '"') ) lex_string(str);
else if (isspace(*str)) lex_space(str);
else if (isdigit(*str)) lex_number(str);
else if (ispunct(*str) && *str != '_') lex_punct(str);
}
/* Not a parser (yet) - just prints the tokens. */
void parse(char token[], char *toktype) {
printf("Token: %s Tokentype: %s\n", token, toktype);
}
int main() {
char *mystr = "select mycol8 from mytable" ;
lex(mystr);
return 0;
}
Many thanks in advance.....
- Andy