Hello (...)
I wrote this a while ago, while making a calculator.
This time I'm trying to write a simplistic interpreter.
So, any suggestions to improve the following code (please read the comments carefully) for reuse?
Assume that the constructor for the token class works correctly; the token class itself will obviously need to be reworked for this crazy idea!
Code:
#include "lexer.h"
#include "token.h"
#include <iterator>
#include <string>
//#include <iostream> //activate if needed..
//using std::cout;
using std::list;
using std::string;
namespace
{
// Character categories the lexer distinguishes between.
enum CharClass
{
    cc_space,  // separator; never part of a token
    cc_digit,  // '0'..'9'
    cc_alpha,  // 'a'..'z' and 'A'..'Z'
    cc_symbol  // anything else; always emitted as a single-character token
};

// Maps one character to its category.
CharClass classify(char c)
{
    // Tabs, newlines and friends separate tokens just like a blank does;
    // treating them as symbols would leak whitespace tokens into the output.
    if (c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\v' || c == '\f')
        return cc_space;
    if (c >= '0' && c <= '9')
        return cc_digit;
    if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
        return cc_alpha;
    return cc_symbol;
}
} // namespace

// Splits `s` into a list of tokens.
//
// The input is scanned once, left to right. Consecutive digits form one
// number token, consecutive letters form one identifier token, and every
// other non-whitespace character becomes a token of its own (multi-character
// operators can be joined at a higher level if required). Whitespace only
// separates tokens and is never emitted.
//
// Note that a digit directly following letters starts a new token, so "a1"
// yields "a" and "1".
//
// Returns the tokens in input order; an empty or all-whitespace string
// yields an empty list.
//
// If the return type needs to change, only the two push_back calls below
// have to be adapted.
list<token> tokenize(string s)
{
    list<token> lt;
    string::size_type start = 0;  // index of the first char of the open token
    CharClass open = cc_space;    // category of the open token; cc_space == none

    for (string::size_type pos = 0; pos < s.size(); ++pos)
    {
        const CharClass cls = classify(s[pos]);

        // The open token ends when the category changes. A symbol token
        // always ends after one character, even if another symbol follows.
        if (open != cc_space && (cls != open || open == cc_symbol))
        {
            // Named, non-const local: the sample token constructor takes a
            // non-const std::string&, which cannot bind to a temporary.
            string section = s.substr(start, pos - start);
            lt.push_back(token(section));
            open = cc_space;
        }

        // Any non-whitespace character seen while no token is open starts one.
        if (open == cc_space && cls != cc_space)
        {
            open = cls;
            start = pos;
        }
    }

    // Flush the token that runs up to the end of the input, if any.
    if (open != cc_space)
    {
        string section = s.substr(start);
        lt.push_back(token(section));
    }
    return lt;
}
The idea is that it goes through the characters one at a time, determines whether each one is a letter, a digit or a symbol, and dumps a substring into the output list whenever a character of a different type appears (for example, a digit when the previous character was a letter or a symbol). The exception is symbols: they are assumed to be single characters, so each one is dumped on its own even when another symbol follows.
If you find this interesting enough to compile, use the following threadbare token class to test,
Code:
// Minimal token type for exercising the lexer: it just stores the raw text
// of the lexeme it was built from.
class token
{
public:
    std::string raw;  // the token's text, copied from the constructor argument

    // Takes the text by const reference so that const strings and
    // temporaries (e.g. the result of substr) are accepted as well as
    // ordinary lvalues; the text is copied into `raw`.
    token(const std::string& input) : raw(input) {}
};