SQL lexer - almost there, just have a printf problem

**latte123** · 12-30-2017

Hi all -

I've been doing a "toy SQL lexer" and almost have it working. It compiles fine but I get this one warning -

sql_lexer.c:281:2: warning: format ‘%s’ expects argument of type ‘char *’, but argument 3 has type ‘union <anonymous>’ [-Wformat=]

It is caused by this line -

Code:

printf("%s %s", curr_token.toktype, curr_token.value);

I don't know what format to use to print struct members.
To make matters worse, the struct has a union as part of it (as you can see below).

The entire code follows (sorry for its length) -

Code:

/* sql_lexer.c */ 

/* A toy lexer for a small part of SQL. */ 
/* This code is released to the public domain. */ 
/* "Share and enjoy......"  :)     */ 


#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <stdlib.h>


/* Declare the current character. */ 
int current_char;


/* Next_char function */ 
int next_char() {
    return current_char = getchar();
}



/* Struct to store TOKENS.  */   
typedef struct {
    char *toktype;
    union {
        char *string_value;
        char char_value;        
        int int_value;
        float float_value;
    } value;
} token;



/* Allocate memory for tokens. */ 
void *allocate_memory(size_t n) {
    return malloc(n);
}



/* Lex a keyword or identifier. */ 
token lex_keyword_or_identifier() {
    token mytok; 
    char *mystr = "";
    
   while( isalnum(current_char) || current_char == '_' ) { 
       mystr[strlen(mystr)] = current_char ;             
   } 
      
      
   if ( strcmp(mystr, "select") )  { 
       mytok.toktype = "TOKEN_SELECT" ;  
       mytok.value.string_value = "select" ; } 
   else if (strcmp(mystr, "from") )  { 
       mytok.toktype = "TOKEN_FROM" ; 
       mytok.value.string_value = "from" ;  } 
   else if (strcmp(mystr, "where")) {          
       mytok.toktype = "TOKEN_WHERE" ; 
       mytok.value.string_value = "where" ;  }        
   else if (strcmp(mystr, "and")) { 
       mytok.toktype = "TOKEN_AND" ; 
       mytok.value.string_value = "and" ;  }  
   else if (strcmp(mystr, "or")) { 
       mytok.toktype = "TOKEN_OR" ; 
       mytok.value.string_value = "or" ;  }  
   else if (strcmp(mystr, "and")) { 
       mytok.toktype = "TOKEN_AND" ; 
       mytok.value.string_value = "and" ;  }  
   else if (strcmp(mystr, "is")) { 
       mytok.toktype = "TOKEN_IS" ; 
       mytok.value.string_value = "is" ;  }  
   else if (strcmp(mystr, "in")) { 
       mytok.toktype = "TOKEN_IN" ; 
       mytok.value.string_value = "in" ;  }  
   else if (strcmp(mystr, "not")) { 
       mytok.toktype = "TOKEN_NOT" ; 
       mytok.value.string_value = "not" ;  }  
   else if (strcmp(mystr, "null")) { 
       mytok.toktype = "TOKEN_NULL" ; 
       mytok.value.string_value = "null" ;  }  
   else mytok.toktype = "TOKEN_IDENT" ; 
       mytok.value.string_value = mystr ;                         
                         
   return mytok;  
} 


/* Lex a string constant. */ 
token lex_string() { 
   token mytok;    
   char *mystr = "";
   int i=0;    
    
   while( (current_char != '"' && i<80)) { 
       mystr[strlen(mystr)] = current_char ;        
       i++;
   }     
      
   mytok.toktype = "TOKEN_STR_CONST" ; 
   mytok.value.string_value = mystr ;   
   return mytok;
}


/* Lex an integer constant. */ 
token lex_number() { 
   token mytok;    
   
   char *mystrint = "";
   char *myptr; 
   long myint;    
   
      
   while( isalnum(current_char) || current_char == '_' ) { 
       mystrint[strlen(mystrint)] = current_char ; 
   }    
    
   /* Convert the text "number" to a long integer. */ 
   myint = strtol(mystrint, &myptr, 10);
   
   mytok.toktype = "int_constant" ; 
   mytok.value.int_value = myint;
        
  return mytok;     
} 


/* Lex the "star" operator. */  
token lex_star() { 
   token mytok;
   
   mytok.toktype = "TOKEN_STAR" ; 
   mytok.value.char_value = '*' ;          
   return mytok; 
}    


/* Lex various operators. */ 
token lex_operator() {
    token mytok;     
        
    if (current_char == ',') { 
       mytok.toktype = "TOKEN_COMMA" ; 
       mytok.value.char_value = ',' ;  } 
    else if (current_char == '.') { 
       mytok.toktype = "TOKEN_DOT" ; 
       mytok.value.char_value = '.' ;  }                                              
    else if (current_char == ';') { 
       mytok.toktype = "TOKEN_SEMICOLON" ; 
       mytok.value.char_value = ';' ;  }                      
    else if (current_char == '(') { 
       mytok.toktype = "TOKEN_LPAREN" ; 
       mytok.value.char_value = '(' ;  }                              
    else if (current_char == ')') { 
       mytok.toktype = "TOKEN_RPAREN" ; 
       mytok.value.char_value = ')' ;  }                             
    else if (current_char == '+') { 
       mytok.toktype = "TOKEN_PLUS" ; 
       mytok.value.char_value = '+' ;  }                      
    else if (current_char == '-') { 
       mytok.toktype = "TOKEN_MINUS" ; 
       mytok.value.char_value = '-' ;  }                      
    else if (current_char == '*') { 
       mytok.toktype = "TOKEN_TIMES" ; 
       mytok.value.char_value = '*' ;  }    
    else if (current_char == '/') { 
       mytok.toktype = "TOKEN_DIVIDE" ; 
       mytok.value.char_value = '/' ;  }                                                  
    else if (current_char == ';') { 
       mytok.toktype = "TOKEN_SEMICOLON" ; 
       mytok.value.char_value = ';' ;  }    
    
    else if (current_char=='<') {
         if (next_char()=='=') {
           next_char();
       mytok.toktype = "TOKEN_LE" ; 
       mytok.value.string_value = "<=" ; 
       }
       mytok.toktype = "TOKEN_LT" ; 
       mytok.value.char_value = '<' ;               
    }
    
    else if (current_char=='>') {
         if (next_char()=='=') {
           next_char();
       mytok.toktype = "TOKEN_GE" ; 
       mytok.value.string_value = ">=" ; 
       }
       mytok.toktype = "TOKEN_GT" ; 
       mytok.value.char_value = '>' ;               
    }
    
    else if (current_char=='!') {
         if (next_char()=='=') {
           next_char();
       mytok.toktype = "TOKEN_NE" ; 
       mytok.value.string_value = "!=" ; 
       }
       mytok.toktype = "TOKEN_INVALID" ; 
       mytok.value.char_value = current_char ;               
    }
    
    else if (current_char == '=') { 
       mytok.toktype = "TOKEN_EQUAL" ; 
       mytok.value.char_value = '=' ;  }       
                                
    else { 
       mytok.toktype = "TOKEN_INVALID" ; 
       mytok.value.char_value = current_char ;  }                          
    return mytok; 
    
}    
    


/* Lex white space. */ 
void lex_white_space() {
    while (isspace(current_char)) {
    next_char();
    }
}


token lex_eof() { 
   token mytok;
   
   mytok.toktype = "TOKEN_EOF" ; 
   mytok.value.string_value = "EOF" ;        
   return mytok; 
}       


/*  Error in lexing. */ 
token lex_error() { 
   token mytok; 
   
   mytok.toktype = "TOKEN_ERROR" ; 
   mytok.value.char_value = current_char ;        
   return mytok; 
}       
    
    


/*  Main lexer */ 
token lexer() {
    lex_white_space();
    if (isalpha(current_char) || current_char == '_') {
    return lex_keyword_or_identifier();
    } else if (isdigit(current_char)) {
    return lex_number();
    } else if ( current_char=='"' ) { 
    return lex_string(); 
    }  else if (ispunct(current_char)) { 
    return lex_operator();               
    } else if (current_char==EOF) {
    return lex_eof();
    } else  { 
    return lex_error(); 
    }                
}



int main()  { 
    
token curr_token;

char *mystr = "select var1 from mytable where city = \"Sydney\" ; " ; 

while (mystr != '\0') { 
    
    next_char();
    curr_token = lexer();      
    printf("%s %s", curr_token.toktype, curr_token.value);
}

    return 0; 
    
}

I hope someone may be able to help. Oh, and any improvements to the code are very welcome....

Many thanks in advance - bye for now -
- Andy ( latte123)

**john.c** · 12-30-2017

Your toktype is too detailed. You don't need a separate type for everything. You just need to enumerate a few categories:

Code:

typedef enum TokenType {
    KEYWORD, PUNCTUATION, IDENTIFIER, STRING, INTEGER, FLOAT
} TokenType;

You may want an OPERATOR category, too.

To print the value, you need to do it in the appropriate manner for each type. Keywords, punctuation, identifiers, and arbitrary strings are all strings. Numbers are integers or floats (may as well be doubles). Also, the identifiers and arbitrary strings will need to be dynamically allocated. You can't just point them all to the same local input buffer.

Code:

switch (token.type) {
case KEYWORD:
case PUNCTUATION:
case IDENTIFIER:
case STRING:
    printf("%s", token.val.string);
    break;
case INTEGER:
    printf("%d", token.val.integer);
    break;
case FLOAT:
    printf("%f", token.val.float);
    break;
...
default:
    printf("Unknown token.type: %d\n", token.type);
}

**john.c** · 12-30-2017

I just noticed this kind of thing:

Code:

   token mytok;    
   char *mystr = "";
   int i=0;    
     
   while( (current_char != '"' && i<80)) { 
       mystr[strlen(mystr)] = current_char ;        
       i++;
   }

mystr has no storage space allocated to it.
It's a pointer that points to a byte of memory containing a '\0' character.
You are trying to write into and beyond that memory, which is a bad idea.

It should be more like:

Code:

char mystr[81];
int i = 0;
while (current_char != '"' && i < 80)
    mystr[i++] = current_char;
mystr[i] = '\0';  // must zero-terminate strings in C

**latte123** · 12-30-2017

Originally Posted by john.c

I just noticed this kind of thing:

Code:

   token mytok;    
   char *mystr = "";
   int i=0;    
     
   while( (current_char != '"' && i<80)) { 
       mystr[strlen(mystr)] = current_char ;        
       i++;
   }

mystr has no storage space allocated to it.
It's a pointer that points to a byte of memory containing a '\0' character.
You are trying to write into and beyond that memory, which is a bad idea.

It should be more like:

Code:

char mystr[81];
int i = 0;
while (current_char != '"' && i < 80)
    mystr[i++] = current_char;
mystr[i] = '\0';  // must zero-terminate strings in C

Hi john.c - thanks for both of your tips there, that's great!
I'll fix the code accordingly and I should be up and running.....

I'll do a parser next. That should be a bit easier (hopefully!).

Thanks again - bye for now -
- Andy (latte123)

**latte123** · 12-31-2017

Hi again all -

Sorry to say - I'm still having problems with the SQL lexer.
It compiles but when the executable is run, it doesn't do anything - it just hangs.

The latest code is here - hoping someone may be able to help.......

Code:

  

/* sql_lexer.c */ 

/* A toy lexer for a small part of SQL. */ 
/* This code is released to the public domain. */ 
/* "Share and enjoy......"  :)     */ 


#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <stdlib.h>


/* Declare the current character. */ 
int current_char;


/* Next_char function */ 
int next_char() {
    return current_char = getchar();
}


/* Enum for the token types. */ 
typedef enum tokentype {
    KEYWORD, PUNCTUATION, IDENTIFIER, STRING, 
    INTEGER, FLOAT, OTHER 
} tokentype;



/* Struct to store TOKENS.  */   
typedef struct {
    tokentype toktype;
    union {        
        char *string_value;      
        int int_value;
        float float_value;
    } value;
} token;



/* Allocate memory for tokens. */ 
void *allocate_memory(size_t n) {
    return malloc(n);
}



/* Lex a keyword or identifier. */ 
token lex_keyword_or_identifier() {
    token mytok; 
    char mystr[80] ;
    int i=0;
    
   while ( i<80 && (isalnum(current_char) 
       || current_char == '_' ) ) { 
       mystr[i] = current_char ;     
       i++;
       //next_char();        
   } 
      
      
   if ( strcmp(mystr, "select") )  { 
       mytok.toktype = KEYWORD ;       
       mytok.value.string_value = "select" ;  } 
   else if (strcmp(mystr, "from") )  { 
       mytok.toktype = KEYWORD ; 
       mytok.value.string_value = "from" ;  } 
   else if (strcmp(mystr, "where")) {          
       mytok.toktype = KEYWORD ; 
       mytok.value.string_value = "where" ;  }        
   else if (strcmp(mystr, "and")) { 
       mytok.toktype = KEYWORD ; 
       mytok.value.string_value = "and" ;  }  
   else if (strcmp(mystr, "or")) { 
       mytok.toktype = KEYWORD ; 
       mytok.value.string_value = "or" ;  }  
   else if (strcmp(mystr, "and")) { 
       mytok.toktype = KEYWORD ; 
       mytok.value.string_value = "and" ;  }  
   else if (strcmp(mystr, "is")) { 
       mytok.toktype = KEYWORD ; 
       mytok.value.string_value = "is" ;  }  
   else if (strcmp(mystr, "in")) { 
       mytok.toktype = KEYWORD ; 
       mytok.value.string_value = "in" ;  }  
   else if (strcmp(mystr, "not")) { 
       mytok.toktype = KEYWORD ; 
       mytok.value.string_value = "not" ;  }  
   else if (strcmp(mystr, "null")) { 
       mytok.toktype = KEYWORD ; 
       mytok.value.string_value = "null" ;  }  
   else mytok.toktype = IDENTIFIER ; 
       mytok.value.string_value = mystr ;                         
                         
   return mytok;  
} 


/* Lex a string constant. */ 
token lex_string() { 
   token mytok;    
   char mystr[80];
   int i=0;    
    
   while ( (i<80 || current_char != '"' )) { 
       mystr[i] = current_char ;        
     //  next_char();
       i++;
   }     
   
   mystr[i+1] = '"' ;    
      
   mytok.toktype = STRING ; 
   mytok.value.string_value = mystr ;   
   return mytok;
}


/* Lex an integer constant. */ 
token lex_number() { 
   token mytok;    
   
   char mystr[80];  
   char *myptr; 
   long myint;    
   int i=0;
         
   while ( i<80 && isdigit(current_char) ) { 
       mystr[i] = current_char ; 
       i++;
       //next_char();
   }    
    
   /* Convert the text "number" to a long integer. */ 
   myint = strtol(mystr, &myptr, 10);
   
   mytok.toktype = INTEGER ;  
   mytok.value.int_value = myint;
        
  return mytok;     
} 


/* Lex the "star" operator. */  
token lex_star() { 
   token mytok;
   
   mytok.toktype = PUNCTUATION ; 
   mytok.value.string_value = "*" ;      
   next_char();    
   return mytok; 
}    


/* Lex various operators. */ 
token lex_operator() {
    token mytok;         
            
    if (current_char == ',') { 
       mytok.toktype = PUNCTUATION ; 
       mytok.value.string_value = "," ;  } 
    else if (current_char == '.') { 
       mytok.toktype = PUNCTUATION ; 
       mytok.value.string_value = "." ;  }                                              
    else if (current_char == ';') { 
       mytok.toktype = PUNCTUATION ; 
       mytok.value.string_value = ";" ;  }                      
    else if (current_char == '(') { 
       mytok.toktype = PUNCTUATION ; 
       mytok.value.string_value = "(" ;  }                              
    else if (current_char == ')') { 
       mytok.toktype = PUNCTUATION ; 
       mytok.value.string_value = ")" ;  }                             
    else if (current_char == '+') { 
       mytok.toktype = PUNCTUATION ; 
       mytok.value.string_value = "+" ;  }                      
    else if (current_char == '-') { 
       mytok.toktype = PUNCTUATION ; 
       mytok.value.string_value = "-" ;  }                      
    else if (current_char == '*') { 
       mytok.toktype = PUNCTUATION ; 
       mytok.value.string_value = "*" ;  }    
    else if (current_char == '/') { 
       mytok.toktype = PUNCTUATION ; 
       mytok.value.string_value = "/" ;  }                                                  
    else if (current_char == ';') { 
       mytok.toktype = PUNCTUATION ; 
       mytok.value.string_value = ";" ;  }    
    
    else if (current_char=='<') {
         if (next_char()=='=') {
           next_char();
       mytok.toktype = PUNCTUATION ; 
       mytok.value.string_value = "<=" ; 
       }
       mytok.toktype = PUNCTUATION ; 
       mytok.value.string_value = "<" ;               
    }
    
    else if (current_char=='>') {
         if (next_char()=='=') {
           next_char();
       mytok.toktype = PUNCTUATION ; 
       mytok.value.string_value = ">=" ; 
       }
       mytok.toktype = PUNCTUATION ; 
       mytok.value.string_value = ">" ;               
    }
    
    else if (current_char=='!') {
         if (next_char()=='=') {
           next_char();
       mytok.toktype = PUNCTUATION ; 
       mytok.value.string_value = "!=" ; 
       }
       mytok.toktype = OTHER ;       
       mytok.value.string_value = "!" ;               
    }
    
    else if (current_char == '=') { 
       mytok.toktype = PUNCTUATION ; 
       mytok.value.string_value = "=" ;  }       
                                
    else { 
       mytok.toktype = OTHER ;       
       mytok.value.string_value = " " ;  }                       
    
    next_char();
    return mytok; 
    
}    
    


/* Lex white space. */ 
void lex_white_space() {
    while (isspace(current_char)) {
    next_char();
    }
}


token lex_eof() { 
   token mytok;
   
   mytok.toktype = OTHER ; 
   mytok.value.string_value = "EOF" ;        
   return mytok; 
}       


/*  Error in lexing. */ 
token lex_error() { 
   token mytok; 
   
   mytok.toktype = OTHER ;    
   mytok.value.string_value = "ERROR" ;  
   return mytok; 
}       
    
    

/*  Main lexer */ 
token lexer(char *str) {
    char current_char = str[0] ; 
        
    lex_white_space();
    if (isalpha(current_char) || current_char == '_') {
    return lex_keyword_or_identifier();
    } else if (isdigit(current_char)) {
    return lex_number();
    } else if ( current_char=='"' ) { 
    return lex_string(); 
    }  else if (ispunct(current_char)) { 
    return lex_operator();               
    } else if (current_char==EOF) {
    return lex_eof();
    } else  { 
    return lex_error(); 
    }                
}



int main()  { 
    
    token curr_token;

    char *mystr = "select var1 from mytable where city = \"Sydney\" ; " ; 

    curr_token = lexer(mystr);  
  
  while (curr_token.toktype != OTHER)  { 
         
    switch (curr_token.toktype) {
      case KEYWORD:
      case PUNCTUATION:
      case IDENTIFIER:
      case STRING:
              printf("%s", curr_token.value.string_value);
              break;
      case INTEGER:
              printf("%d", curr_token.value.int_value);
              break;
      case FLOAT:
              printf("%f", curr_token.value.float_value);
              break;
      case OTHER: 
              printf("%s", curr_token.value.string_value);
            break;      
      default:
              printf("Unknown token.type: %d\n", curr_token.toktype);
              
}  /* switch */ 
     
     
}  /* while */ 
    
    return 0; 
    
}

Many thanks in advance (and happy New Year....

)
Andy (latte123)

Thread: SQL lexer - almost there, just have a printf problem

Thread Tools

Search Thread

Display

SQL lexer - almost there, just have a printf problem

Similar Threads

Lexer / Parser

need help with a lexer

Review my simple Lexer please..

problem with printf!!!!

printf problem

Tags for this Thread