Thread: File lexer works perfectly except for numbers

  1. #1
    Registered User
    Join Date
    Dec 2012
    Posts
    34

    File lexer works perfectly except for numbers

    Hi all -

    I'm sorry for dropping in again. (You'll be pleased to know I'm back to work tomorrow so I won't be here anywhere near as often..... )

    Anyway - my SQL file lexer now works perfectly except that it prints a weird character instead of a number (when there is a number in the SQL input).

    Here's the code -

    Code:
     
    /* toysql_parser.c  */ 
    /* A lexer and parser for a very small subset of SQL.  */
    /* This version reads the code from a file. */ 
    
    /* This code is released to the public domain. */ 
    /* "Share and enjoy......"  :)     */ 
    
    
    #include <stdio.h>
    #include <ctype.h>
    #include <string.h>
    #include <stdlib.h>
    
    
    #define NUMBER_OF_KEYWORDS 10
    #define MAXLEN 80    // Maximum length of a token. 
    
    
    /* Array of our keywords in string form. */ 
    char *kw_strings[] = { 
       "select", "as", "from", "where", "and", "or", "not", "in", "is", "null" 
        } ; 
       
        
    /*  Search function to search the array of keywords. */ 
    int search(char *arr[], int dim, char *str) { 
        
        int i;      
        int found_match;
        
        for (i=0; i<dim; i++) { 
            if ( !strcmp(arr[i] , str ) )  {   
                found_match = 1;        
                break; 
        }   else found_match = 0;    
     }  /* For */     
    
        return found_match; 
    }  /* search */ 
    
    
    /* Enum of token types. */ 
    typedef enum { KEYWORD, IDENT, INTEGER, STRING, 
        PUNCT, _EOF_ } tokentype ; 
         
    
    /* A token. */
    typedef struct { 
       tokentype toktype;
       union { 
               char string_value[MAXLEN]; 
               int int_value; 
             }  value; 
     } token ;                
    
    
    
    /* Forward declaration. */ 
    void parse(token *mytok); 
    char *ptr ;   // For strtol use.   
    
    
    void lex(FILE *fp) {    
       
       token *mytok = malloc(sizeof(token)); 
       tokentype mytype;
        
       int i=0;
       
       char myresult[MAXLEN];
       
       int curr_char = fgetc(fp);    
                   
      while (curr_char != '\0') {         
        
      if ( isspace(curr_char) ) { 
            curr_char = fgetc(fp);
           }
                    
      else if (isalpha(curr_char) || curr_char=='_')  {      
         while ( (isalnum(curr_char) || curr_char=='_') 
             && i<MAXLEN ) {           
                myresult[i] = curr_char;
                curr_char = fgetc(fp);    
                i++;      
            }       
            
        if (search(kw_strings, NUMBER_OF_KEYWORDS, myresult) == 1 )
               mytype = KEYWORD; 
        else
               mytype = IDENT; 
                    
       myresult[i] = '\0' ;   
       
       mytok->toktype = mytype;
       strcpy( mytok->value.string_value, myresult);
       
       parse(mytok);                                        
       memset(&myresult[0], 0, sizeof(myresult));  // Clear token          
       }  // Keyword or identifier 
    
          
     else if ( curr_char == '"' ) { 
           myresult[i] = curr_char;
           curr_char = fgetc(fp);       
           i++;     
            while ( ( curr_char != '"') && curr_char != '\0' 
              && i<MAXLEN-1) {               
                myresult[i] = curr_char;
                curr_char = fgetc(fp);             
                i++;     
              } 
         myresult[i] = curr_char;  // Append the last quote. 
         myresult[i+1] = '\0' ;    // Append null char. 
         curr_char = fgetc(fp);     // Move on from last quote.           
         mytype = STRING; 
         
         mytok->toktype = mytype; 
         strcpy( mytok->value.string_value, myresult);
               
         parse(mytok); 
         memset(&myresult[0], 0, sizeof(myresult));  // Clear token          
       }  // String              
                 
                 
       else if ( isdigit(curr_char) )  { 
              while (isdigit(curr_char) && curr_char != '\0' 
                 && i<MAXLEN) {     
                 myresult[i] = curr_char;                            
                curr_char = fgetc(fp);                         
                i++;     
            }  
       myresult[i] = '\0' ;    // Append null char.        
       mytype = INTEGER ;   
       mytok->toktype = mytype; 
       int j=strtol(myresult, &ptr, 10);
       mytok->value.int_value = j;
          
       parse(mytok); 
       memset(&myresult[0], 0, sizeof(myresult));  // Clear token          
       }  // Number                       
                   
                   
       else if ( ispunct(curr_char) && curr_char != '_' 
            && curr_char != '"' ) { 
                 myresult[i] = curr_char;
                 curr_char = fgetc(fp);         
                 i++;         
          while (ispunct(curr_char) && curr_char != '\0' 
            && i<MAXLEN) {   
                   myresult[i] = curr_char;                        
                   curr_char = fgetc(fp);           
                   i++;                              
                }         
       mytype = PUNCT;   
       
       mytok->toktype = mytype; 
       strcpy(mytok->value.string_value, myresult);
       
       parse(mytok); 
       memset(&myresult[0], 0, sizeof(myresult));  // Clear token           
       }  // Punct 
       
       else {          
               exit(0); 
            } 
                      
       memset(&myresult[0], 0, sizeof(myresult));  // Clear token   
       i = 0;  // Reset i.                     
       }  // while c != '\0'   
        
       exit(0) ;  
        
    }  // lex()  
    
       
       
    /* Not a parser (yet) - just prints the tokens. */ 
    void parse(token *mytok) { 
           
      printf("%d  ", mytok->toktype); 
         
      if (mytok->value.string_value) 
           printf("%s\n", mytok->value.string_value);
      else if (mytok->value.int_value) 
           printf("%d\n", mytok->value.int_value);       
      
    }        
      
              
              
    int main(int argc, char **argv) { 
    
    FILE *myfile; 
    
    myfile = fopen(argv[1], "r"); 
    
    if (myfile == NULL) 
       return -1;  
       
    lex(myfile);   
    
    fclose(myfile);
    
    return 0; 
    
    }
    The file I've used to test it -
    Code:
     
    select col1 as country, col2 as city from mytable;
    select var23 from mytable where city = "Sydney";
    select foo,bar,baz from mytable where foo = 20;
    A weird character is printed instead of that "20".

    Many thanks in advance for any help!
    Cheers -
    Andy (latte123)

  2. #2
    and the hat of int overfl Salem's Avatar
    Join Date
    Aug 2001
    Location
    The edge of the known universe
    Posts
    39,666
    1. Your indentation needs work.
    2. Your lexer needs some more sub-functions to clarify the overall function.
    3. Have you discovered debuggers yet? For example, a breakpoint on line 133 then single step and examine variables will reveal much.
    If you dance barefoot on the broken glass of undefined behaviour, you've got to expect the occasional cut.
    If at first you don't succeed, try writing your phone number on the exam paper.

  3. #3
    Registered User
    Join Date
    Dec 2012
    Posts
    34
    Quote Originally Posted by Salem View Post
    1. Your indentation needs work.
    2. Your lexer needs some more sub-functions to clarify the overall function.
    3. Have you discovered debuggers yet? For example, a breakpoint on line 133 then single step and examine variables will reveal much.
    Hi Salem - thanks for your reply -

    Points 1 and 2 - understood. I'll work on those.
    Debuggers - I've hardly used them (which probably shows). At least in compiling, I always compile with -Wall and -Wpedantic. I did that with this code - no errors or warnings.
    I set the breakpoint as you said and got [ Inferior 1 [ process 3178 ] exited with code 0377. ]
    I searched the net for code 0377. Found a few pages but none that said what that meant.
    If that's the kind of thing that gdb comes back with, it's not hugely helpful.......

    Anyway - thanks again for your reply! I'm aware that you didn't have to reply to my post so I appreciate it.

    Cheers -
    Andy (latte123)

  4. #4
    Registered User
    Join Date
    Dec 2017
    Posts
    1,644
    Your print routine ("parse") is not selecting the value type properly. It should be more like:
    Code:
    if (mytok->toktype == INTEGER)
        print as integer
    else // STRING, IDENT, KEYWORD, PUNCT
        print as string
    And you should null-terminate myresult BEFORE you do the keyword search.

    Since you're not using the value of ptr in strtol, you could just pass NULL and get rid of the ptr variable.
    Code:
    //Shouldn't this
        while (curr_char != '\0') {
    //be
        while (curr_char != EOF) {
    The exit(0)'s in lex should probably be returns.
    A little inaccuracy saves tons of explanation. - H.H. Munro

  5. #5
    Registered User
    Join Date
    Dec 2012
    Posts
    34
    Quote Originally Posted by john.c View Post
    Your print routine ("parse") is not selecting the value type properly. It should be more like:
    Code:
    if (mytok->toktype == INTEGER)
        print as integer
    else // STRING, IDENT, KEYWORD, PUNCT
        print as string
    And you should null-terminate myresult BEFORE you do the keyword search.

    Since you're not using the value of ptr in strtol, you could just pass NULL and get rid of the ptr variable.
    Code:
    //Shouldn't this
        while (curr_char != '\0') {
    //be
        while (curr_char != EOF) {
    The exit(0)'s in lex should probably be returns.
    Thanks very much for that, john.c - that's really useful and much appreciated!
    Update - it works! Numbers print correctly now - many thanks!

    Cheers - bye for now -
    Andy
    Last edited by latte123; 01-15-2018 at 01:33 AM.

Popular pages Recent additions subscribe to a feed

Similar Threads

  1. Lexer works almost perfectly - one small problem
    By latte123 in forum C Programming
    Replies: 2
    Last Post: 01-08-2018, 11:33 PM
  2. Replies: 4
    Last Post: 03-26-2013, 06:42 PM
  3. Replies: 15
    Last Post: 09-23-2010, 02:19 PM
  4. Replies: 4
    Last Post: 08-18-2009, 03:32 PM
  5. Random numbers works in one program, not in another
    By Shadow12345 in forum C++ Programming
    Replies: 27
    Last Post: 09-30-2002, 04:06 PM

Tags for this Thread