Thread: removing comments from c text file

  1. #1
    Registered User
    Join Date
    May 2015
    Posts
    228

    removing comments from c text file

    So Currently the exercise that I'm doing involves removing comments from a c text file. This exercise says that I have take into account single line comments(//...), multi line comments(/*...*/) and indented multi line comments. I also have to take into consideration if any of these are in double quotation marks and adjust accordingly.

    For the purpose of this exercise, I will assume that the programs works and that there are no syntax errors. So I will not be doing any error checking for this c text file.

    This is my attempt:

    Please give me feedback if you think there is something wrong with my logic.

    Code:
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <errno.h>
    
    
    void readSourceFile(FILE *, char[], const int);
    void writeToFile(FILE *, char[]);
    
    
    int main()
    {
        FILE *sourceFile, *outputFile;
        const int MAX_FILE_LENGTH = 20;
        char srcFileName[MAX_FILE_LENGTH], outFileName[MAX_FILE_LENGTH];
    
    
        printf("What is the name or path of the source file that you're trying to access?\n");
        fgets(srcFileName, MAX_FILE_LENGTH, stdin);
        printf("\nWhat is the name or path of this output file?\n");
        fgets(outFileName, MAX_FILE_LENGTH, stdin);
    
    
        // Remove the newline as they will cause the FILE pointers to remain NULL when trying to open the file.
        srcFileName[strcspn(srcFileName, "\n")] = '\0';
        outFileName[strcspn(outFileName, "\n")] = '\0';
    
    
        sourceFile = fopen(srcFileName, "r");
        outputFile = fopen(outFileName, "w");
    
    
        if(sourceFile == NULL)
        {
            fprintf(stderr, "\n%s: %s.\n", srcFileName, strerror(errno));
            return EXIT_FAILURE;
        }
        else if(outputFile == NULL) // This case is if source file opens but output file does not.
        {
            const int errnum = errno; // I'm using a local variable to represent errno because many functions set it when an error occurs.
            fclose(sourceFile); // If the source files is open, but the source file is not open then close the source file prematurely.
            fprintf(stderr, "\n%s: %s.\n", outFileName, strerror(errnum));
            return EXIT_FAILURE;
        }
    
    
        fseek(sourceFile, 0, SEEK_END);
        const int SIZE_OF_FILE = ftell(sourceFile); // retrieve the size of the file.
        char srcData[SIZE_OF_FILE];
        fseek(sourceFile,0,SEEK_SET); // seek back to the start of the file.
    
    
        readSourceFile(sourceFile, srcData, SIZE_OF_FILE);
        writeToFile(outputFile, srcData);
    
    
        // If both of them are opened and every thing went smoothly, then close them.
        fclose(sourceFile);
        fclose(outputFile);
    
    
        return 0;
    }
    
    
    void writeToFile(FILE *filePtr, char srcData[])
    {
        fputs(srcData, filePtr);
    }
    
    
    void readSourceFile(FILE *filePtr, char srcData[], const int SIZE_OF_FILE)
    {
        int i = 0; // This is an index counter for srcData
        int j = 0; // this is an index counter for tempArray
        int ch;    // The character that will compared.
        int state = 0; // To determine if we are in a comment and still in a comment.
        int charNotInComments = 0; // The characters that we want to keep.
        char tempArray[SIZE_OF_FILE];
    
    
        // Once we reach the end of the file, we stop reading data.
        while(fgets(tempArray, SIZE_OF_FILE, filePtr) != NULL)
        {
            ch = tempArray[j];
    
    
            while(ch != '\n') // until we reach the end of the line.
            {
                // the / is the start of a single or multi comment line. the state variable will help us keep in check multi level comments that are indented
                // like this:
                // /*
                //
                // */
                if(ch == '/' || state == 1)
                {
                    j++;
                    ch = tempArray[j];
    
    
                    if(ch == '*' || state == 1) // this is if we find /*
                    {
                        state = 1;
    
    
                        j++;
                        ch = tempArray[j];
    
    
                        while(ch != '\n' && state == 1)
                        {
                            j++;
                            ch = tempArray[j];
    
    
                            if(ch == '*')
                            {
                                j++;
                                ch = tempArray[j];
    
    
                                if(ch == '/')
                                {
                                    j++;
                                    ch = tempArray[j];
                                    state = 0; // The end of a multi line comment is when we find */ and only */
                                }
                            }
                        }
                    }
                    else if(ch == '/') // This is if we find //
                    {
                        state = 1;
    
    
                        j++;
                        ch = tempArray[j];
    
    
                        while(ch != '\n') // single line comments make the entire line a comment. so keep going until you find a new line.
                        {
                            j++;
                            ch = tempArray[j];
                        }
    
    
                        state = 0; // a new line signifies the end of a single line comment. At least for the line we are in.
                    }
                }
                else if(ch == '"') // if we find an opening double quotation mark.
                {
                    srcData[i] = ch;
                    i++;
                    j++;
                    charNotInComments++;
                    ch = tempArray[j];
    
    
                    while(ch != '"')
                    {
                        srcData[i] = ch;
                        i++;
                        j++;
                        charNotInComments++;
                        ch = tempArray[j];
                    }
    
    
                    srcData[i] = ch;
                    i++;
                    j++;
                    charNotInComments++;
                    ch = tempArray[j];
                }
                else if(state == 0) // if it's letter, whitespace, etc. Anything but a comment.
                {
                    srcData[i] = ch;
                    i++;
                    j++;
                    charNotInComments++;
                    ch = tempArray[j];
                }
            }
    
    
            if(charNotInComments > 0) // if there weren't any characters worth keeping, skip this
            {
                srcData[i] = ch;
                i++;
                srcData[i] = '\0';
                charNotInComments = 0;
            }
    
    
            if(j > 0)
            {
                j = 0;
            }
        }
    }
    Last edited by deathslice; 10-24-2015 at 08:01 PM.

  2. #2
    Programming Wraith GReaper's Avatar
    Join Date
    Apr 2009
    Location
    Greece
    Posts
    2,738
    There are memory leaks all over the place. That happens because you don't check for the null character in the inner loop at lines #100, #140 and #159. Fix those first and we'll continue.

    Did you compile and run the code? I'm asking because it should crash like it did to me, I simply used the same source as input.
    Devoted my life to programming...

  3. #3
    Registered User
    Join Date
    May 2015
    Posts
    228
    It ran and complied for me but you do have a good point. I didn't think of null terminating the temp array because I wasn't really concern with it.

    Edit: But tell, why do you say that there are memory leaks because of the null character. All the string inside tempArray are already null terminated and the characters that are being put in srcdata are being null terminated(so you can see that I'm doing that at the end of this program) so then what's the problem?
    Last edited by deathslice; 10-25-2015 at 07:51 AM.

  4. #4
    Registered User
    Join Date
    May 2010
    Posts
    4,632
    Since you're not using dynamic memory there are no memory leaks, but there may be buffer overrun errors if you're not properly terminating your strings.

    However you probably don't realize that you're using Variable Length Arrays, unless you happen to be using a C++ compiler instead of a C compiler. You should use #define instead of const to create your compile time constants in C programs.

    main.c||In function ‘main’:|
    main.c|15|warning: variable length array ‘srcFileName’ is used [-Wvla]|
    main.c|15|warning: variable length array ‘outFileName’ is used [-Wvla]|
    main.c|49|warning: variable length array ‘srcData’ is used [-Wvla]|
    main.c||In function ‘readSourceFile’:|
    main.c|79|warning: variable length array ‘tempArray’ is used [-Wvla]|

    Jim

  5. #5
    Registered User
    Join Date
    May 2015
    Posts
    228
    I have almost everything but I just can't figure out a way to keep a track of indented multi line comment.

  6. #6
    Registered User
    Join Date
    May 2015
    Posts
    228
    for both temparray and srcdata, the value of the variable inside their brackets is the current size of the file during runtime. I did it so I didn't have to create an arbitrarily large buffer size. If the compiler warns me about variable size arrays, what can I do to get around that? The book that I'm currently reading hasn't talked about dynamic memory yet so that is out the window.

    Edit: For the buffer oveload, that is why I used fgets because it at least takes the size of the buffer into consideration. Sure, data will get lost if the string is greater than what the buffer can handle, but I'm not worried about that right now.
    Last edited by deathslice; 10-25-2015 at 09:51 AM.

  7. #7
    C++ Witch laserlight's Avatar
    Join Date
    Oct 2003
    Location
    Singapore
    Posts
    28,413
    Quote Originally Posted by deathslice
    for both temparray and srcdata, the value of the variable inside their brackets is the current size of the file during runtime. I did it so I didn't have to create an arbitrarily large buffer size. If the compiler warns me about variable size arrays, what can I do to get around that? The book that I'm currently reading hasn't talked about dynamic memory yet so that is out the window.
    Has the book that you are currently reading talked about variable length arrays? If so, then you just need to compile with the feature enabled, e.g., compile with respect to C99.

    If not, then just as you want to avoid dynamic memory allocation, you should avoid variable length arrays, hence the solution of "an arbitrarily large buffer size" is appropriate. If you do not want to do that... then the solution is to read more of your book until you do learn about dynamic memory allocation (or variable length arrays).
    Quote Originally Posted by Bjarne Stroustrup (2000-10-14)
    I get maybe two dozen requests for help with some sort of programming or design problem every day. Most have more sense than to send me hundreds of lines of code. If they do, I ask them to find the smallest example that exhibits the problem and send me that. Mostly, they then find the error themselves. "Finding the smallest program that demonstrates the error" is a powerful debugging tool.
    Look up a C++ Reference and learn How To Ask Questions The Smart Way

  8. #8
    Registered User
    Join Date
    May 2015
    Posts
    228
    Alright I think I have something half way decent that works with //, /* */ and indented multi lines. This program assume that there are no syntax errors in the text file.

    Here is a test file.main.txt

    Code:
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <errno.h>
    
    
    #define MAX_FILE_NAME 20
    #define MAX_FILE_SIZE 1024
    
    
    void readSourceFile(FILE *, char[], const int);
    void writeToFile(FILE *, char[]);
    
    
    int main()
    {
        FILE *sourceFile, *outputFile;
        char srcFileName[MAX_FILE_NAME], outFileName[MAX_FILE_NAME], srcData[MAX_FILE_SIZE];
    
    
        printf("What is the name or path of the source file that you're trying to access?\n");
        fgets(srcFileName, MAX_FILE_NAME, stdin);
        printf("\nWhat is the name or path of this output file?\n");
        fgets(outFileName, MAX_FILE_NAME, stdin);
    
    
        // Remove the newline as they will cause the FILE pointers to remain NULL when trying to open the file.
        srcFileName[strcspn(srcFileName, "\n")] = '\0';
        outFileName[strcspn(outFileName, "\n")] = '\0';
    
    
        sourceFile = fopen(srcFileName, "r");
        outputFile = fopen(outFileName, "w");
    
    
        if(sourceFile == NULL)
        {
            fprintf(stderr, "\n%s: %s.\n", srcFileName, strerror(errno));
            return EXIT_FAILURE;
        }
        else if(outputFile == NULL) // This case is if source file opens but output file does not.
        {
            const int errnum = errno; // I'm using a local variable to represent errno because many functions set it when an error occurs.
            fclose(sourceFile); // If the source files is open, but the source file is not open then close the source file prematurely.
            fprintf(stderr, "\n%s: %s.\n", outFileName, strerror(errnum));
            return EXIT_FAILURE;
        }
    
    
        readSourceFile(sourceFile, srcData, MAX_FILE_SIZE);
        writeToFile(outputFile, srcData);
    
    
        // If both of them are opened and every thing went smoothly, then close them.
        fclose(sourceFile);
        fclose(outputFile);
    
    
        return 0;
    }
    
    
    void writeToFile(FILE *filePtr, char srcData[])
    {
        fputs(srcData, filePtr);
    }
    
    
    void readSourceFile(FILE *filePtr, char srcData[], const int SIZE_OF_FILE)
    {
        int i = 0; // This is an index counter for srcData
        int j = 0; // this is an index counter for tempArray
        int ch = 0;    // The character that will compared.
        int state = 0; // To determine if we are in a comment and still in a comment.
        int charNotInComments = 0; // The characters that we want to keep.
        char tempArray[SIZE_OF_FILE];
    
    
        // Once we reach the end of the file, we stop reading data.
        while(fgets(tempArray, SIZE_OF_FILE, filePtr) != NULL)
        {
            ch = tempArray[j];
            j++;
    
    
            while(ch != '\n') // until we reach the end of the line.
            {
                if(ch == '/') // The start of a comment.
                {
                    ch = tempArray[j];
                    j++;
    
    
                    if(ch == '*')
                    {
                        ch = tempArray[j];
                        j++;
                        state = 1;
    
    
                        while(ch != '\n' && state == 1)
                        {
                            ch = tempArray[j];
                            j++;
    
    
                            if(ch == '*')
                            {
                                ch = tempArray[j];
                                j++;
    
    
                                if(ch == '/')
                                {
                                    state = 0;
                                }
                            }
                        }
                    }
                    else if(ch == '/')
                    {
                        while(ch != '\n')
                        {
                            ch = tempArray[j];
                            j++;
                        }
                    }
                }
                else if(state == 1) // If we are still in a comment.
                {
                    while(ch != '\n' && state == 1)
                    {
                        if(ch == '*')
                        {
                            ch = tempArray[j];
                            j++;
    
    
                            if(ch == '/')
                            {
                                state = 0;
                            }
                        }
    
    
                        ch = tempArray[j];
                        j++;
                    }
                }
                else if(ch == '"') // if we find an opening double quotation mark.
                {
                    srcData[i] = ch;
                    i++;
                    ch = tempArray[j];
                    j++;
                    charNotInComments++;
    
    
                    while(ch != '"')
                    {
                        srcData[i] = ch;
                        i++;
                        ch = tempArray[j];
                        j++;
                        charNotInComments++;
                    }
    
    
                    srcData[i] = ch;
                    i++;
                    ch = tempArray[j];
                    j++;
                    charNotInComments++;
                }
                else
                {
                    srcData[i] = ch;
                    i++;
                    ch = tempArray[j];
                    j++;
                    charNotInComments++;
                }
            }
    
    
            if(charNotInComments > 0) // if there were characters worth keeping, skip this
            {
                srcData[i] = ch;
                i++;
                srcData[i] = '\0';
                charNotInComments = 0;
            }
    
    
            ch = 0;
            j = 0;
        }
    }
    Last edited by deathslice; 10-25-2015 at 10:56 AM.

  9. #9
    Lurking whiteflags's Avatar
    Join Date
    Apr 2006
    Location
    United States
    Posts
    9,612
    I would restructure the code so that it works like a finite state machine. Everything is easier to read that way. (You were close to doing it by yourself anyway)

  10. #10
    Registered User
    Join Date
    May 2009
    Posts
    4,183
    Am I the only one who wondered why the exercise is always to remove the comments and leave the code?
    I thought it would be just as hard of a exercise to keep the comments and remove the code instead.

    Tim S.
    "...a computer is a stupid machine with the ability to do incredibly smart things, while computer programmers are smart people with the ability to do incredibly stupid things. They are,in short, a perfect match.." Bill Bryson

  11. #11
    Lurking whiteflags's Avatar
    Join Date
    Apr 2006
    Location
    United States
    Posts
    9,612
    Quote Originally Posted by stahta01 View Post
    Am I the only one who wondered why the exercise is always to remove the comments and leave the code?
    I thought it would be just as hard of a exercise to keep the comments and remove the code instead.

    Tim S.
    Definitely not! I could see a use for a program that extracts comments from code instead. Say you used doxygen to generate some documentation from comments and later you decide to use a separate file. Working from the output of such a program is better than starting over, whereas doing this seems silly. It does show how the preprocessor might implement removing comments from code though.

  12. #12
    Registered User
    Join Date
    May 2015
    Posts
    228
    Well I able to remove a couple of unnecessary lines of code(while also adding a couple of new variables) but it still retains its general shape and logic. Though I used a much bigger test file and it worked perfectly.

    Code:
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <errno.h>
    
    
    #define MAX_FILE_NAME 20
    #define MAX_FILE_SIZE 1024
    
    
    void readSourceFile(FILE *, char[], const int);
    void writeToFile(FILE *, char[]);
    
    
    int main()
    {
        FILE *sourceFile, *outputFile;
        char srcFileName[MAX_FILE_NAME], outFileName[MAX_FILE_NAME], srcData[MAX_FILE_SIZE];
    
    
        printf("What is the name or path of the source file that you're trying to access?\n");
        fgets(srcFileName, MAX_FILE_NAME, stdin);
        printf("\nWhat is the name or path of this output file?\n");
        fgets(outFileName, MAX_FILE_NAME, stdin);
    
    
        // Remove the newline as they will cause the FILE pointers to remain NULL when trying to open the file.
        srcFileName[strcspn(srcFileName, "\n")] = '\0';
        outFileName[strcspn(outFileName, "\n")] = '\0';
    
    
        sourceFile = fopen(srcFileName, "r");
        outputFile = fopen(outFileName, "w");
    
    
        if(sourceFile == NULL)
        {
            fprintf(stderr, "\n%s: %s.\n", srcFileName, strerror(errno));
            return EXIT_FAILURE;
        }
        else if(outputFile == NULL) // This case is if source file opens but output file does not.
        {
            const int errnum = errno; // I'm using a local variable to represent errno because many functions set it when an error occurs.
            fclose(sourceFile); // If the source files is open, but the source file is not open then close the source file prematurely.
            fprintf(stderr, "\n%s: %s.\n", outFileName, strerror(errnum));
            return EXIT_FAILURE;
        }
    
    
        readSourceFile(sourceFile, srcData, MAX_FILE_SIZE);
        writeToFile(outputFile, srcData);
    
    
        // If both of them are opened and every thing went smoothly, then close them.
        fclose(sourceFile);
        fclose(outputFile);
    
    
        return 0;
    }
    
    
    void writeToFile(FILE *filePtr, char srcData[])
    {
        fputs(srcData, filePtr);
    }
    
    
    void readSourceFile(FILE *filePtr, char srcData[], const int SIZE_OF_FILE)
    {
        int i = 0; // This is an index counter for srcData
        int j = 0; // this is an index counter for tempArray
        int ch = 0;    // The character that will compared.
        int singleLine = 0; // If We are in a single line comment,
        int multiLine = 0; // If we are in a multi ling comment.
        int charNotInComments = 0; // The characters that we want to keep.
        char tempArray[SIZE_OF_FILE];
    
    
        // Once we reach the end of the file, we stop reading data.
        while(fgets(tempArray, SIZE_OF_FILE, filePtr) != NULL)
        {
            do
            {
                ch = tempArray[j];
                j++;
    
    
                if(singleLine == 1)
                {
                    while(ch != '\n')
                    {
                        ch = tempArray[j];
                        j++;
                    }
    
    
                    singleLine = 0;
                }
                else if(multiLine == 1)
                {
                    while(ch != '\n' && multiLine == 1) // The reason is because the mult line could be either /* */ or indented.
                    {
                        if(ch == '*')
                        {
                            ch = tempArray[j];
                            j++;
    
    
                            if(ch == '/')
                            {
                                multiLine = 0;
                            }
                        }
    
    
                        ch = tempArray[j];
                        j++;
                    }
                }
                else if(ch == '/') // The start of a comment.
                {
                    ch = tempArray[j];
                    j++;
    
    
                    if(ch == '*')
                    {
                        multiLine = 1;
                    }
                    else if(ch == '/')
                    {
                        singleLine = 1;
                    }
                }
                else if(ch == '"') // if we find an opening double quotation mark.
                {
                    srcData[i] = ch;
                    charNotInComments++;
                    i++;
                    ch = tempArray[j];
                    j++;
    
    
                    while(ch != '"')
                    {
                        srcData[i] = ch;
                        charNotInComments++;
                        i++;
                        ch = tempArray[j];
                        j++;
                    }
    
    
                    srcData[i] = ch;
                    charNotInComments++;
                    i++;
                }
                else // anything else really
                {
                    srcData[i] = ch;
                    charNotInComments++;
                    i++;
                }
            }
            while(ch != '\n'); // until we reach the end of the line.
    
    
            if(charNotInComments > 0)
            {
                srcData[i] = '\0';
                charNotInComments = 0;
            }
    
    
            ch = 0;
            j = 0;
        }
    }
    Edit: Though yes I think the purpose of this exercise is to show how the preprocessor does one of its job and that is to remove comments. Other than that, when I first read the problem I thought it was a bit silly lol.
    Last edited by deathslice; 10-25-2015 at 12:59 PM.

  13. #13
    Ticked and off
    Join Date
    Oct 2011
    Location
    La-la land
    Posts
    1,728
    deathslice, like whiteflags wrote, this is the domain of state machines, really. I recommend you take a look.

    In more than one way, you've already using the approach on your own; what you lack, is the organization, or the rigorous approach.

    Since your code now works (I assume), I'll show how I'd do the same using a state machine with seven states: CODE, DOUBLEQUOTED, SINGLEQUOTED, SLASH, COMMENTLINE, COMMENT, and ASTERISK.

    When in CODE state:
    " causes a transition to the DOUBLEQUOTED state
    ' causes a transition to the SINGLEQUOTED state
    / causes a transition to the SLASH state

    When in DOUBLEQUOTED state:
    " causes a transition to the CODE state
    and technically, a newline is illegal, but we'll ignore that.

    When in SINGLEQUOTED state:
    ' causes a transition to the CODE state
    and technically, a newline is illegal, but we'll ignore that.

    When in SLASH state:
    / causes a transition to the COMMENTLINE state
    * causes a transition to the COMMENT state
    and all others cause a transition to the CODE state.

    When in COMMENTLINE state:
    a newline causes a transition to the CODE state

    When in COMMENT state:
    * causes a transition to the ASTERISK state

    When in ASTERISK state:
    / causes a transition to the CODE state
    Everything else except for * causes a transition to the COMMENT state.

    If we write those rules as
    Code:
    digraph {
        node [ shape=record ];
        rankdir = "TB";
    
        CODE         [ label="{ CODE         |{ <DQ> \" | <SQ> ' | <S> / | <any> }}" ];
        DOUBLEQUOTED [ label="{ DOUBLEQUOTED |{ <DQ> \" | <any> }}" ];
        SINGLEQUOTED [ label="{ SINGLEQUOTED |{ <SQ> \' | <any> }}" ];
        SLASH        [ label="{ SLASH        |{ <S>  /  | <AST> *| <any> }}" ];
        COMMENTLINE  [ label="{ COMMENTLINE  |{ <NL> newline | <any> }}" ];
        COMMENT      [ label="{ COMMENT      |{ <AST>*  | <any> }}" ];
        ASTERISK     [ label="{ ASTERISK     |{ <S>  /  | <AST> *| <any> }}" ];
    
        CODE:DQ  -> DOUBLEQUOTED;
        CODE:SQ  -> SINGLEQUOTED;
        CODE:S   -> SLASH;
        CODE:any -> CODE;
    
        DOUBLEQUOTED:DQ  -> CODE;
        DOUBLEQUOTED:any -> DOUBLEQUOTED;
    
        SINGLEQUOTED:SQ  -> CODE;
        SINGLEQUOTED:any -> SINGLEQUOTED;
    
        SLASH:S   -> COMMENTLINE;
        SLASH:AST -> COMMENT;
        SLASH:any -> CODE;
    
        COMMENTLINE:NL  -> CODE;
        COMMENTLINE:any -> COMMENTLINE;
    
        COMMENT:AST -> ASTERISK;
        COMMENT:any -> COMMENT;
    
        ASTERISK:S   -> CODE;
        ASTERISK:AST -> ASTERISK;
        ASTERISK:any -> CODE;
    }
    then Graphviz's dot gives us this nice state diagram:
    removing comments from c text file-graph-png

    I'll also add optional output streams for the code part, strings and character constants (one per line), and comments (one per line). For good measure, this also supports all newline encodings ("universal newlines", i.e. CR, LF, CRLF, and LFCR).

    If you start in the CODE state, and you switch to the state that corresponds to the next character (if not listed in the boxes below the state, then use the empty box), you should be able to parse any correct C code. I haven't checked whether my state machine is correct, and whether my code implements it correctly, though!

    (I did throw some code at it, and it seems to work; the important part is the process and the structure/approach here.)

    Here's the implementation:
    Code:
    #include <stdlib.h>
    #include <stdio.h>
    
    typedef enum {
        CODE = 0,
        DOUBLEQUOTED = 1,
        SINGLEQUOTED = 2,
        SLASH = 3,
        COMMENTLINE = 4,
        COMMENT = 5,
        ASTERISK = 6
    } input_state;
    
    void process(FILE *const in, FILE *const code, FILE *const strings, FILE *const comments)
    {
        input_state  state = CODE;
        int          c;
    
        while (1) {
    
            c = getc(in);
            if (c == EOF)
                break;
    
            /* Universal newline support */
            if (c == '\n') {
                c = getc(in);
                if (c != '\r')
                    ungetc(c, in);
                c = '\n';
            } else
            if (c == '\r') {
                c = getc(in);
                if (c != '\n')
                    ungetc(c, in);
                c = '\n';
            }
    
            switch (state) {
            case CODE:
                if (c == '/')
                    state = SLASH;
                else {
                    if (code)
                        fputc(c, code);
                    if (c == '"')
                        state = DOUBLEQUOTED;
                    else
                    if (c == '\'')
                        state = SINGLEQUOTED;
                }
                break;
    
            case DOUBLEQUOTED:
                if (code)
                    fputc(c, code);
                if (c == '"') {
                    state = CODE;
                    if (strings)
                        fputc('\n', strings);
                } else {
                    if (strings)
                        fputc(c, strings);
                }
                break;
    
            case SINGLEQUOTED:
                if (code)
                    fputc(c, code);
                if (c == '\'') {
                    state = CODE;
                    if (strings)
                        fputc('\n', strings);
                } else {
                    if (strings)
                        fputc(c, strings);
                }
                break;
    
            case SLASH:
                if (c == '/')
                    state = COMMENTLINE;
                else
                if (c == '*')
                    state = COMMENT;
                else
                if (code) {
                    fputc('/', code);
                    fputc(c, code);
                    state = CODE;
                }
                break;
    
            case COMMENTLINE:
                if (comments)
                    fputc(c, comments);
                if (c == '\n')
                    state = CODE;
                break;
    
            case COMMENT:
                if (comments)
                    fputc(c, comments);
                if (c == '*')
                    state = ASTERISK;
                break;
    
            case ASTERISK:
                if (c == '/')
                    state = CODE;
                else {
                    if (comments)
                        fputc('*', comments);
                    if (c != '*')
                        state = COMMENT;
                }
                break;
            }
        }
    
        /* State should be CODE here. */
    }
    Although pen and paper are good tools, I write so much code snippets that I prefer to keep my notes in digital form, with each code snippet and relevant notes in the same directory. Graphviz's graphs are very useful (and they're fast to write after you get familiar with the simple syntax); for math I use either plain text or LibreOffice Writer (I like its easy-to-use formula editor). I could use LaTeX (or LyX for example) for either or both, and might switch back in the future. There certainly are a lot of tools to choose from.

  14. #14
    Registered User
    Join Date
    May 2015
    Posts
    228
    Very nice example. Maybe later I'll go back to this exercise and redo this way but for I'm just going to keep pushing forward. Cheers

  15. #15
    misoturbutc Hodor's Avatar
    Join Date
    Nov 2013
    Posts
    1,787
    Quote Originally Posted by Nominal Animal View Post
    deathslice, like whiteflags wrote, this is the domain of state machines, really. I recommend you take a look.
    You went to all that time and effort writing a fantastic explanation and used a switch?

    I haven't tested the code below much. There are known issues (escaped characters for example) but handling them is just another state so easy to add. I wouldn't normally write it like this (I doubt I'd use arrays... more likely some other more appropriate data structure, but *shrug* it saved time for something that's only meant as an example AND "translating" your description to an array was easier. Ok, I was lazy).

    Code:
    /*
     * Known issues:
     * a) Escaped characters in quotes or double quotes
     * b) Multiline comments end up being replaced by a newline (not a huge issue)
     * c) probably more
     *
     * I apologise for the lack of comments. I removed them.
     */
    #include <stdlib.h>
    #include <stdio.h>
    
    enum StateId {
        UNDEFINED_STATE,
        CODE,                /* */
        DOUBLEQUOTED,
        SINGLEQUOTED,
        SLASH,
        COMMENTLINE,
        COMMENT,
        ASTERISK
    };
    
    #define MAX_TRANSITIONS 4
    #define ANY_CHAR '\0'
    
    #define MATCH 0
    #define MATCH_NOT 1
    
    /* Pointless comment
     */
    struct Transition {
        int match_not;
        char match;
        enum StateId next;
        int print_prev;
        int suppress_out;
    };
    
    struct State {
        enum StateId state;
        struct Transition transitions[MAX_TRANSITIONS];
    };
    
    static const struct State StateMachine[] = {
        { 0 },    // Undefined
        { CODE, {{MATCH, '"', DOUBLEQUOTED}, {MATCH, '\'', SINGLEQUOTED}, {MATCH, '/', SLASH}} },
        { DOUBLEQUOTED, {{MATCH, '"', CODE}} },
        { SINGLEQUOTED, {{MATCH, '\'', CODE}} },
        { SLASH, {{MATCH, '/', COMMENTLINE}, {MATCH, '*', COMMENT}, {MATCH, ANY_CHAR, CODE, 1}} },
        { COMMENTLINE, {{MATCH, '\n', CODE}} },
        { COMMENT, {{MATCH, '*', ASTERISK}} },
        { ASTERISK, {{MATCH, '/', CODE, 0, 1}, {MATCH_NOT, '*', COMMENT}} }
    };
    
    enum StateId get_state_idx(int c, enum StateId current_state, int *print_prev, int *suppress)
    {
        int token_match;
        int candidate_tr;
    
        const struct State *st = &StateMachine[current_state];
    
        for (candidate_tr = 0; st->transitions[candidate_tr].next != UNDEFINED_STATE; candidate_tr++) {
    
            if (st->transitions[candidate_tr].match == ANY_CHAR)
                token_match = 1;
            else
                token_match = (c == st->transitions[candidate_tr].match);
    
            if (st->transitions[candidate_tr].match_not)
                token_match = !token_match;
    
            if (token_match) {
                current_state = st->transitions[candidate_tr].next;
                *print_prev = st->transitions[candidate_tr].print_prev;
                *suppress = st->transitions[candidate_tr].suppress_out;
                break;
            }
        }
    
        return current_state;
    }
    
    void process(FILE *in, FILE *out)
    {
        int c;
        int prev_c = '\0';
        int print_prev = 0;    /* bool */
        int suppress_out = 0; /* bool */
    
        enum StateId st = CODE;
    
        while ((c = getc(in)) != EOF) {
            st = get_state_idx(c, st, &print_prev, &suppress_out);
            if (print_prev && prev_c != '\0') {
                putc(prev_c, out);
                print_prev = 0;
            }
            switch (st) {
                case CODE: case DOUBLEQUOTED: case SINGLEQUOTED:
                    if (!suppress_out)
                        putc(c, out);
                    else
                        suppress_out = 0;
                    break;
                default:
                    break;
            }
            prev_c = c;
        }
    }
    
    int main(void)
    {
    #if 0
        int a = 2 / 3;
        printf("This removes comments like /* comment */ and // comment from a file\n");
        printf("Since it's a test input is from stdin and output is stdout\n");
    #endif
        process(stdin, stdout);
        return 0;
    }

Popular pages Recent additions subscribe to a feed

Similar Threads

  1. Removing Comments
    By thames in forum C Programming
    Replies: 9
    Last Post: 10-30-2012, 07:19 PM
  2. removing comments of type '//' and '/*'
    By rohit83.ken in forum C++ Programming
    Replies: 3
    Last Post: 10-20-2007, 02:24 AM
  3. removing text from a file
    By finkus in forum C++ Programming
    Replies: 5
    Last Post: 11-26-2005, 01:36 PM
  4. Problem parsing comments and such in text file
    By zaxxon in forum C Programming
    Replies: 3
    Last Post: 08-09-2004, 12:14 AM
  5. Removing text between /* */ in a file
    By 0rion in forum C Programming
    Replies: 2
    Last Post: 04-05-2004, 08:54 AM