I've written a full compiler before. For the lexical analysis portion, good pattern-detection methods are essential. You could:
a) Write your own C or C++ code.
b) Use a tool, which speeds up development, improves readability, makes later alteration easier, and reduces potential conflicts and extensive debugging time.
I was taught it was safer to go with B. Lex and Flex (a free, faster reimplementation of Lex; both generate C by default, and Flex can optionally generate C++) are nice tools that will automatically generate the necessary code for your lexical analyzer, and they work well in combination with Yacc.
Basically, Lex patterns are regular expressions, much like Perl's pattern matching. When Lex finds a pattern, you tell it what to do by writing C or C++ code. Below is an example of Lex (lexical analyzer) code plus Yacc code.
Example of a Lex file:
Code:
%{
/*
 * Scanner for a Pascal-like language.
 *
 * Every lexeme is echoed to listingFile. Comments and whitespace are echoed
 * only; everything else is returned to the Yacc parser as a token, with
 * literal values and symbol-table entries passed through yylval.
 *
 * The lexeme is written with fputs() and never used as a printf format
 * string: source text may contain '%' (inside a comment, or as a stray
 * character caught by the last rule), which would be undefined behaviour.
 *
 * NOTE(review): the token codes, yylval, yycopy, type and the helper
 * functions (outputNextLine, checkReserved, addSymbol, reportError) are not
 * declared in this excerpt; they are assumed to be provided elsewhere.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

extern FILE *listingFile;
%}
%%
\{[^}]*\} { /* Brace-delimited comment: echo only, no token. */
    fputs(yytext, listingFile); }
[ \t]+ { /* Blanks and tabs: echo only, no token. */
    fputs(yytext, listingFile); }
[\n] { /* End of line: echo it, then let the listing start its next line. */
    fputs(yytext, listingFile);
    outputNextLine(); }
"+" { /* Additive operator. */
    fputs(yytext, listingFile);
    return(ADDOP); }
"-" { /* Additive operator. */
    fputs(yytext, listingFile);
    return(ADDOP); }
"<" { /* Relational operator. */
    fputs(yytext, listingFile);
    return(RELOP); }
"<=" { /* Relational operator. */
    fputs(yytext, listingFile);
    return(RELOP); }
">" { /* Relational operator. */
    fputs(yytext, listingFile);
    return(RELOP); }
">=" { /* Relational operator. */
    fputs(yytext, listingFile);
    return(RELOP); }
"=" { /* Relational operator. */
    fputs(yytext, listingFile);
    return(RELOP); }
"<>" { /* Relational operator. */
    fputs(yytext, listingFile);
    return(RELOP); }
"*" { /* Multiplicative operator. */
    fputs(yytext, listingFile);
    return(MULOP); }
"/" { /* Multiplicative operator. */
    fputs(yytext, listingFile);
    return(MULOP); }
":=" { /* Assignment operator. */
    fputs(yytext, listingFile);
    return(ASSIGNOP); }
".." { /* Two consecutive dots. */
    fputs(yytext, listingFile);
    return(DOTDOT); }
[0-9]+ { /* Integer literal: the converted value travels with the token.
            Listed before the real-literal rule so that a plain digit
            string, which both patterns match, is an INT_LITERAL. */
    fputs(yytext, listingFile);
    yylval.intValue = atoi(yytext);
    return(INT_LITERAL); }
[0-9]+(\.[0-9]+)?(E[+-]?[0-9]+)? { /* Real literal with optional fraction and exponent. */
    fputs(yytext, listingFile);
    yylval.floatValue = atof(yytext);
    return(REAL_LITERAL); }
[A-Za-z]([A-Za-z]|[0-9])* { /* Identifier or reserved word. */
    fputs(yytext, listingFile);
    /* yytext is overwritten by the next match, so work on a private copy.
       NOTE(review): the copy is never freed here; whether addSymbol takes
       ownership, and what happens to it for reserved words, is not
       visible in this excerpt. */
    yycopy = strdup(yytext);
    type = checkReserved(yycopy);
    if(type == IDENTIFIER)
    {
        /* Not a reserved word: enter it into the symbol table. */
        yylval.tableEntry = addSymbol(yycopy);
    }
    return type; }
[\[\]\(\)\.,:;] { /* Punctuation is returned as its own character code. */
    fputs(yytext, listingFile);
    return(yytext[0]);}
. { /* Any other character is reported as an error; no token is returned.
       NOTE(review): ownership of the copy passed to reportError is not
       visible in this excerpt. */
    fputs(yytext, listingFile);
    yycopy = strdup(yytext);
    reportError(yycopy);}
%%
Here is an example of a token file (Yacc will automatically create one for you, e.g. y.tab.h when it is run with the -d option):
Code:
/*
 * Token codes shared by the scanner and the parser.
 * Numbering starts at 257, above the range of single-character codes.
 */
#ifndef TOKENS_H
#define TOKENS_H

/* Operators */
#define ADDOP        257  /* additive operator */
#define RELOP        258  /* relational operator */
#define MULOP        259  /* multiplicative operator */
#define ASSIGNOP     260  /* assignment operator */
#define DOTDOT       261  /* the ".." token */

/* Literals and names */
#define INT_LITERAL  262  /* integer literal */
#define REAL_LITERAL 263  /* real-number literal */
#define IDENTIFIER   264  /* identifier that is not a reserved word */

/*
 * Reserved words.
 * NOTE(review): BEGIN_ presumably carries the trailing underscore to avoid
 * clashing with lex's own BEGIN macro -- confirm.
 */
#define ARRAY        265
#define BEGIN_       266
#define DO           267
#define ELSE         268
#define END          269
#define FUNCTION     270
#define IF           271
#define INTEGER      272
#define OF           273
#define PROCEDURE    274
#define PROGRAM      275
#define REAL         276
#define THEN         277
#define WHILE        278
#define VAR          279
#define OR           280
#define MOD          281
#define DIV          282
#define AND          283

#endif
And of course the Yacc portion directly receives the tokens that Lex/Flex finds, and uses them to perform its job:
Example of Yacc code:
Code:
/* Grammar entry point. */
%start program

/* Tokens that carry a semantic value. The <...> tags name members of the
   %union, which is not part of this excerpt. */
%token <floatValue> REAL_LITERAL
%token <intValue> INT_LITERAL
%token <tableEntry> IDENTIFIER

/* Operator tokens. */
%token ADDOP RELOP MULOP ASSIGNOP DOTDOT

/* Reserved words. */
%token ARRAY BEGIN_ DO ELSE END FUNCTION IF INTEGER OF PROCEDURE PROGRAM REAL
%token THEN WHILE VAR OR MOD DIV AND

/* Semantic value types of nonterminals. */
%type <tableEntry> identifier
%type <tableEntry> variable
%type <intValue> type
%type <intValue> expression
%type <intValue> literals
%type <intValue> standard_type

/* ELSE is given higher precedence than NO_ELSE.
   NOTE(review): NO_ELSE is presumably attached with %prec to the else-less
   "if" rule to resolve the dangling else; that rule is not in this excerpt. */
%nonassoc NO_ELSE
%nonassoc ELSE

/* Operator precedence, lowest first: Yacc gives each later line a HIGHER
   precedence than the lines before it. Relational operators must therefore
   come first so that they bind loosest, then additive, then multiplicative;
   with RELOP declared last, "a + b < c" would group as "a + (b < c)". */
%left RELOP
%left ADDOP
%left MULOP
%%
/* program: a program head and semicolon, the variable declarations, the
   subprogram declarations, a compound statement, and a closing period.
   The mid-rule actions call setDeclarationState() just before the variable
   declarations are parsed and setLookupState() just after them.
   NOTE(review): the two states presumably decide whether identifiers are
   being declared or merely looked up -- confirm against those helpers. */
program:
program_head semicolon {setDeclarationState();} variable_declarations {setLookupState();} subprogram_declarations compound_statement period
;
/* program_head: the PROGRAM keyword followed by the program's identifier.
   The first mid-rule action occupies position $2, so the identifier is $3.
   After the identifier the action calls trackIdentifier($3),
   setClassification(5), trackIdentifier(NULL) and setLookupState().
   NOTE(review): 5 is presumably the classification code for a program
   name, and trackIdentifier(NULL) presumably clears the tracked
   identifiers -- confirm both.
   Second alternative: on a syntax error, report the missing PROGRAM keyword
   and resume once a ';' token is found. */
program_head:
PROGRAM {setDeclarationState();} identifier {trackIdentifier($3); setClassification(5); trackIdentifier(NULL); setLookupState();} |
error {reportError("Missing 'PROGRAM'");} ';'
;
/* variable_declarations: zero or more "VAR identifier_list colon type
   semicolon" groups; the rule is left-recursive and its second alternative
   is empty. After each group the action passes the value of the type
   nonterminal ($5) to setType() and then calls trackIdentifier(NULL).
   NOTE(review): setType presumably applies the type to the identifiers
   tracked while parsing identifier_list -- confirm. */
variable_declarations:
variable_declarations VAR identifier_list colon type semicolon {setType($5); trackIdentifier(NULL);} |
;
/* identifier_list: one or more identifiers separated by commas.
   Each identifier whose semantic value is non-NULL is handed to
   trackIdentifier(); a NULL value is skipped.
   The error alternative reports a list that has no identifier. */
identifier_list:
    identifier
        {
            if ($1 != NULL) {
                trackIdentifier($1);
            }
        }
  | identifier_list ',' identifier
        {
            if ($3 != NULL) {
                trackIdentifier($3);
            }
        }
  | error
        {
            reportError("Need atleast one identifier");
        }
  ;
/* parameters: one or more identifiers separated by commas (left-recursive).
   No semantic action is attached. */
parameters:
    identifier
  | parameters ',' identifier
  ;
etc....
Your main function, which drives the Lex + Yacc pair, is then simple. For example:
Code:
/* Parser entry point generated by Yacc; returns 0 when parsing succeeds. */
int yyparse(void);

/*
 * Program entry point.
 *
 * Runs the parser, then dumps and destroys the symbol table whether or not
 * parsing succeeded.
 *
 * Returns the status reported by yyparse(), so a failed parse is visible
 * to the caller as a non-zero exit status.
 */
int main(void)
{
    /* yyparse() drives the Lex analyzer itself, pulling tokens on demand. */
    int parseStatus = yyparse();

    outputSymbolTable();  /* Dump symbol table. */
    destroySymbolTable(); /* Destroy symbol table. */

    return parseStatus;
}