Guidance in finding and printing Multi line comments

**AustinA456!** · 10-09-2023

So I have this code that is supposed to go through a file and seperate Char (' '), String(" "), and Comments then print them and anything that isn't that is a Token. Single-line and multi line are found with /* and */. I have the single line, String, Char and Token part. But I can't seem to figure out the multi line comment.
This is what i have so far:

Code:

void lex(char *line, int start, int end, int length, char *typeN){
   char token[MAXTOKEN];
   while (start <length && isspace(line[start])) {
      start++;
   }
   if (start >=end){
      return;
   }
   strncpy(token, &line[start], end-start);
   token[end-start] = '\0';
   printf("%s: %s\n", typeN, token);
}
void  tokenize(char *line, int length)
{
   int start= 0;
   int end=0;
   static  int inComment=0;
   for (end=0; end <length; end++){
      if(line[end]=='/' && line[end+1]=='*' && line[end+2]=='\n'){
         lex(line, start, end, length, "Token");
         start=end;
         end++;
         inComment=1;
         while(inComment){
            end++;
            if(line[end]=='*' && line[end+1]=='/'){
               inComment=0;
               end+=2;
               lex(line, start, end, length, "Comment");
            }
         }
      }

      else if (line[end]=='/' && line[end+1]=='*') {
         lex(line, start, end, length, "Token");
         start=end;
         end++;
         while(end<length && (line[end]!='*' || line[end+1]!='/')){
            end++;
         }
         end+=2;
         lex(line, start, end,  length, "Comment");
         start=end;
         end++;
         //Finds start of comment. Does Token then goes through comment.
      }
      else if (line[end] == '"') //If double qotes then
      {
         lex(line, start, end, length,"Token");
         start=end;
         end++;
         while(end<length && line[end]!='"')
         {
            end++;
         }
         if (end<length){
            end++;
            lex(line, start, end, length, "String");
            start=end;
            end--;
         }

      }

      //If sees ' then is a char. end++ to enter char until ' then end++ and lex.
      else if (line[end] =='\''){
         end++;
         while(end<length && line[end]!='\''){
            end++;
         }
         end++;
         lex(line, start, end, length, "Char");
         start=end;
         end--;

      }
      //while+lex After anything at start/zero has been tokened. It adds to the index, through the characters. Once it finds the double quote it lexes from start=wherever the last token ended to end+zero to get the ending quotes.

      else if(isspace(line[end])) {
         lex(line, start, end, length, "Token");
         start = end;
         // printf("%i: ", end);
      }
   }
}

A multi-line comment is supposed to look like this

Comment: /*
Comment: multi
Comment: line
Comment: comment */

But this is what it looks like for me

Comment: /*

Token: multi
Token: line
Token: comment
Token: */
Comment: /*

I know i'm supposed to have it see if its in a comment, if it reaches the end of the string and it hasn't found a ending */ then it keeps going. But I'm a little lost and some guidance would be appreciated.

**Salem** · 10-09-2023

Calling tokenize with inComment already true would seem to be the case you're missing.

**aghast** · 10-10-2023

Your code is long and seems like it's getting longer.

I'll suggest the first thing you should do is replace your inComment variable with a state variable. Make a list of different states that must be handled differently, assign them to a C enum or a set of #defines, and store that data in your variable. Then use a switch statement to control the outer layer of your code:

Code:

for (int end = start; end < length; ++end) 
    {
    int ch = line[end];
    switch (state) 
        {
    case IN_BLOCK_COMMENT:
        // ...
        break;
    case IN_LINE_COMMENT:
        // ...
        break;
    case IN_SQ_LITERAL:
        // ...
        break;
    case IN_DQ_LITERAL:
        // ...
        break;
    case IN_NORMAL_TEXT:
        // ...
        break;
    default:
        die("Unknown value for 'state' variable");
    }
}

Next, I suggest you bundle up all your tokenizer data into a struct that you can pass (either by value or by reference -- most modern compilers are smart enough to handle it) around to various helper functions:

Code:

typedef struct Tokenizer {
    Str tok_text;
    int tok_start;
    int tok_end;
    int tok_state;
    // ... other stuff ...
} Tokenizer;

typedef struct Token Token;    // Whatever you need a Token to be.
typedef /* ??? */ TokenType;   // string literal, int, whatever

Token tok_next(Tokenizer *, TokenType type);
// Other tokenizer functions here: match, advance, etc.

**AustinA456!** · 10-11-2023

Sorry for the late reply, I forgot my password.
The thing is I don't think I can use struct for this. I have to keep it with this format sort of.
I did figure some things out. I got it to recognize the multi line comments, it just adds a bunch of new lines and doesn't indent it correctly

Code:

void  tokenize(char *line, int length)
{
   int start= 0;
   int end=0;
   static  int inComment=0;
   for (end=0; end <length; end++){
      if ((line[end]=='/' && (line[end+1]=='*'))||inComment) {
         //Lex token to start.
         if(!inComment){
            lex(line, start, end, length, "Token");
         }
         start=end;
         //end++;


         while(line[end] != '*' || line[end+1]!= '/') {
            end++;
         }
         if(end>length){
            while((line[end] != '*' || line[end+1]!= '/')) {
               end++;
            }
            lex(line, start, end, length, "CommentD");
            inComment=1;
            return;
         }
         else if(end+1<length) {
            end+=2;
            lex(line, start, end, length, "CommentS");
            start=end;
            end++;
            inComment=0;
         }


         inComment=0;
      }
else if (line[end] == '"') //If double qotes then
      {
         lex(line, start, end, length,"Token");
         start=end;
         end++;
         while(end<length && line[end]!='"')
         {
            end++;
         }
         if (end<length){
            end++;
            lex(line, start, end, length, "String");
            start=end;
            end--;
         }


      }


      //If sees ' then is a char. end++ to enter char until ' then end++ and lex.
      else if (line[end] =='\''){
         end++;
         while(end<length && line[end]!='\''){
            end++;
         }
         end++;
         lex(line, start, end, length, "Char");
         start=end;
         end--;


      }
      //while+lex After anything at start/zero has been tokened. It adds to the index, through the characters. Once it finds the double quote it lexes from start=wherever the last token ended to end+zero to get the ending quotes.


      else if(isspace(line[end])) {
         lex(line, start, end, length, "Token");
         start = end;
         // printf("%i: ", end);
      }
   }
}

This is the output I get
CommentS: /* Comment 4 */
CommentD: /*

CommentD: multi

CommentD: line

CommentS: comment */
CommentD: /*

CommentD: multi

CommentD: line

CommentD: comment

CommentS: */
Char: 'c'
CommentD: /*

CommentD: multi

CommentD: line

CommentD: comment

CommentS: */
Token: "string

The gaps between the multi line comment
And the final string is cut off and supposed to be string not token
And I still need to deal with not removing leading spaces when I am incomment

Thread: Guidance in finding and printing Multi line comments

Thread Tools

Search Thread

Display

Guidance in finding and printing Multi line comments

Similar Threads

Multi tasking - Printing problem!

Help printing a multi array

finding number of cores in a multi-core processor

Multi - File Program not finding files

Printing multi-lines?

Tags for this Thread