removing comments from c text file

**deathslice** · 10-24-2015

So Currently the exercise that I'm doing involves removing comments from a c text file. This exercise says that I have take into account single line comments(//...), multi line comments(/*...*/) and indented multi line comments. I also have to take into consideration if any of these are in double quotation marks and adjust accordingly.

For the purpose of this exercise, I will assume that the programs works and that there are no syntax errors. So I will not be doing any error checking for this c text file.

This is my attempt:

Please give me feedback if you think there is something wrong with my logic.

Code:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>


void readSourceFile(FILE *, char[], const int);
void writeToFile(FILE *, char[]);


int main()
{
    FILE *sourceFile, *outputFile;
    const int MAX_FILE_LENGTH = 20;
    char srcFileName[MAX_FILE_LENGTH], outFileName[MAX_FILE_LENGTH];


    printf("What is the name or path of the source file that you're trying to access?\n");
    fgets(srcFileName, MAX_FILE_LENGTH, stdin);
    printf("\nWhat is the name or path of this output file?\n");
    fgets(outFileName, MAX_FILE_LENGTH, stdin);


    // Remove the newline as they will cause the FILE pointers to remain NULL when trying to open the file.
    srcFileName[strcspn(srcFileName, "\n")] = '\0';
    outFileName[strcspn(outFileName, "\n")] = '\0';


    sourceFile = fopen(srcFileName, "r");
    outputFile = fopen(outFileName, "w");


    if(sourceFile == NULL)
    {
        fprintf(stderr, "\n%s: %s.\n", srcFileName, strerror(errno));
        return EXIT_FAILURE;
    }
    else if(outputFile == NULL) // This case is if source file opens but output file does not.
    {
        const int errnum = errno; // I'm using a local variable to represent errno because many functions set it when an error occurs.
        fclose(sourceFile); // If the source files is open, but the source file is not open then close the source file prematurely.
        fprintf(stderr, "\n%s: %s.\n", outFileName, strerror(errnum));
        return EXIT_FAILURE;
    }


    fseek(sourceFile, 0, SEEK_END);
    const int SIZE_OF_FILE = ftell(sourceFile); // retrieve the size of the file.
    char srcData[SIZE_OF_FILE];
    fseek(sourceFile,0,SEEK_SET); // seek back to the start of the file.


    readSourceFile(sourceFile, srcData, SIZE_OF_FILE);
    writeToFile(outputFile, srcData);


    // If both of them are opened and every thing went smoothly, then close them.
    fclose(sourceFile);
    fclose(outputFile);


    return 0;
}


void writeToFile(FILE *filePtr, char srcData[])
{
    fputs(srcData, filePtr);
}


void readSourceFile(FILE *filePtr, char srcData[], const int SIZE_OF_FILE)
{
    int i = 0; // This is an index counter for srcData
    int j = 0; // this is an index counter for tempArray
    int ch;    // The character that will compared.
    int state = 0; // To determine if we are in a comment and still in a comment.
    int charNotInComments = 0; // The characters that we want to keep.
    char tempArray[SIZE_OF_FILE];


    // Once we reach the end of the file, we stop reading data.
    while(fgets(tempArray, SIZE_OF_FILE, filePtr) != NULL)
    {
        ch = tempArray[j];


        while(ch != '\n') // until we reach the end of the line.
        {
            // the / is the start of a single or multi comment line. the state variable will help us keep in check multi level comments that are indented
            // like this:
            // /*
            //
            // */
            if(ch == '/' || state == 1)
            {
                j++;
                ch = tempArray[j];


                if(ch == '*' || state == 1) // this is if we find /*
                {
                    state = 1;


                    j++;
                    ch = tempArray[j];


                    while(ch != '\n' && state == 1)
                    {
                        j++;
                        ch = tempArray[j];


                        if(ch == '*')
                        {
                            j++;
                            ch = tempArray[j];


                            if(ch == '/')
                            {
                                j++;
                                ch = tempArray[j];
                                state = 0; // The end of a multi line comment is when we find */ and only */
                            }
                        }
                    }
                }
                else if(ch == '/') // This is if we find //
                {
                    state = 1;


                    j++;
                    ch = tempArray[j];


                    while(ch != '\n') // single line comments make the entire line a comment. so keep going until you find a new line.
                    {
                        j++;
                        ch = tempArray[j];
                    }


                    state = 0; // a new line signifies the end of a single line comment. At least for the line we are in.
                }
            }
            else if(ch == '"') // if we find an opening double quotation mark.
            {
                srcData[i] = ch;
                i++;
                j++;
                charNotInComments++;
                ch = tempArray[j];


                while(ch != '"')
                {
                    srcData[i] = ch;
                    i++;
                    j++;
                    charNotInComments++;
                    ch = tempArray[j];
                }


                srcData[i] = ch;
                i++;
                j++;
                charNotInComments++;
                ch = tempArray[j];
            }
            else if(state == 0) // if it's letter, whitespace, etc. Anything but a comment.
            {
                srcData[i] = ch;
                i++;
                j++;
                charNotInComments++;
                ch = tempArray[j];
            }
        }


        if(charNotInComments > 0) // if there weren't any characters worth keeping, skip this
        {
            srcData[i] = ch;
            i++;
            srcData[i] = '\0';
            charNotInComments = 0;
        }


        if(j > 0)
        {
            j = 0;
        }
    }
}

**GReaper** · 10-24-2015

There are memory leaks all over the place. That happens because you don't check for the null character in the inner loop at lines #100, #140 and #159. Fix those first and we'll continue.

Did you compile and run the code? I'm asking because it should crash like it did to me, I simply used the same source as input.

**deathslice** · 10-25-2015

It ran and complied for me but you do have a good point. I didn't think of null terminating the temp array because I wasn't really concern with it.

Edit: But tell, why do you say that there are memory leaks because of the null character. All the string inside tempArray are already null terminated and the characters that are being put in srcdata are being null terminated(so you can see that I'm doing that at the end of this program) so then what's the problem?

**jimblumberg** · 10-25-2015

Since you're not using dynamic memory there are no memory leaks, but there may be buffer overrun errors if you're not properly terminating your strings.

However you probably don't realize that you're using Variable Length Arrays, unless you happen to be using a C++ compiler instead of a C compiler. You should use #define instead of const to create your compile time constants in C programs.

main.c||In function ‘main’:|
main.c|15|warning: variable length array ‘srcFileName’ is used [-Wvla]|
main.c|15|warning: variable length array ‘outFileName’ is used [-Wvla]|
main.c|49|warning: variable length array ‘srcData’ is used [-Wvla]|
main.c||In function ‘readSourceFile’:|
main.c|79|warning: variable length array ‘tempArray’ is used [-Wvla]|

Jim

**deathslice** · 10-25-2015

I have almost everything but I just can't figure out a way to keep a track of indented multi line comment.

**deathslice** · 10-25-2015

for both temparray and srcdata, the value of the variable inside their brackets is the current size of the file during runtime. I did it so I didn't have to create an arbitrarily large buffer size. If the compiler warns me about variable size arrays, what can I do to get around that? The book that I'm currently reading hasn't talked about dynamic memory yet so that is out the window.

Edit: For the buffer oveload, that is why I used fgets because it at least takes the size of the buffer into consideration. Sure, data will get lost if the string is greater than what the buffer can handle, but I'm not worried about that right now.

**laserlight** · 10-25-2015

Originally Posted by deathslice

for both temparray and srcdata, the value of the variable inside their brackets is the current size of the file during runtime. I did it so I didn't have to create an arbitrarily large buffer size. If the compiler warns me about variable size arrays, what can I do to get around that? The book that I'm currently reading hasn't talked about dynamic memory yet so that is out the window.

Has the book that you are currently reading talked about variable length arrays? If so, then you just need to compile with the feature enabled, e.g., compile with respect to C99.

If not, then just as you want to avoid dynamic memory allocation, you should avoid variable length arrays, hence the solution of "an arbitrarily large buffer size" is appropriate. If you do not want to do that... then the solution is to read more of your book until you do learn about dynamic memory allocation (or variable length arrays).

**deathslice** · 10-25-2015

Alright I think I have something half way decent that works with //, /* */ and indented multi lines. This program assume that there are no syntax errors in the text file.

Here is a test file.main.txt

Code:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>


#define MAX_FILE_NAME 20
#define MAX_FILE_SIZE 1024


void readSourceFile(FILE *, char[], const int);
void writeToFile(FILE *, char[]);


int main()
{
    FILE *sourceFile, *outputFile;
    char srcFileName[MAX_FILE_NAME], outFileName[MAX_FILE_NAME], srcData[MAX_FILE_SIZE];


    printf("What is the name or path of the source file that you're trying to access?\n");
    fgets(srcFileName, MAX_FILE_NAME, stdin);
    printf("\nWhat is the name or path of this output file?\n");
    fgets(outFileName, MAX_FILE_NAME, stdin);


    // Remove the newline as they will cause the FILE pointers to remain NULL when trying to open the file.
    srcFileName[strcspn(srcFileName, "\n")] = '\0';
    outFileName[strcspn(outFileName, "\n")] = '\0';


    sourceFile = fopen(srcFileName, "r");
    outputFile = fopen(outFileName, "w");


    if(sourceFile == NULL)
    {
        fprintf(stderr, "\n%s: %s.\n", srcFileName, strerror(errno));
        return EXIT_FAILURE;
    }
    else if(outputFile == NULL) // This case is if source file opens but output file does not.
    {
        const int errnum = errno; // I'm using a local variable to represent errno because many functions set it when an error occurs.
        fclose(sourceFile); // If the source files is open, but the source file is not open then close the source file prematurely.
        fprintf(stderr, "\n%s: %s.\n", outFileName, strerror(errnum));
        return EXIT_FAILURE;
    }


    readSourceFile(sourceFile, srcData, MAX_FILE_SIZE);
    writeToFile(outputFile, srcData);


    // If both of them are opened and every thing went smoothly, then close them.
    fclose(sourceFile);
    fclose(outputFile);


    return 0;
}


void writeToFile(FILE *filePtr, char srcData[])
{
    fputs(srcData, filePtr);
}


void readSourceFile(FILE *filePtr, char srcData[], const int SIZE_OF_FILE)
{
    int i = 0; // This is an index counter for srcData
    int j = 0; // this is an index counter for tempArray
    int ch = 0;    // The character that will compared.
    int state = 0; // To determine if we are in a comment and still in a comment.
    int charNotInComments = 0; // The characters that we want to keep.
    char tempArray[SIZE_OF_FILE];


    // Once we reach the end of the file, we stop reading data.
    while(fgets(tempArray, SIZE_OF_FILE, filePtr) != NULL)
    {
        ch = tempArray[j];
        j++;


        while(ch != '\n') // until we reach the end of the line.
        {
            if(ch == '/') // The start of a comment.
            {
                ch = tempArray[j];
                j++;


                if(ch == '*')
                {
                    ch = tempArray[j];
                    j++;
                    state = 1;


                    while(ch != '\n' && state == 1)
                    {
                        ch = tempArray[j];
                        j++;


                        if(ch == '*')
                        {
                            ch = tempArray[j];
                            j++;


                            if(ch == '/')
                            {
                                state = 0;
                            }
                        }
                    }
                }
                else if(ch == '/')
                {
                    while(ch != '\n')
                    {
                        ch = tempArray[j];
                        j++;
                    }
                }
            }
            else if(state == 1) // If we are still in a comment.
            {
                while(ch != '\n' && state == 1)
                {
                    if(ch == '*')
                    {
                        ch = tempArray[j];
                        j++;


                        if(ch == '/')
                        {
                            state = 0;
                        }
                    }


                    ch = tempArray[j];
                    j++;
                }
            }
            else if(ch == '"') // if we find an opening double quotation mark.
            {
                srcData[i] = ch;
                i++;
                ch = tempArray[j];
                j++;
                charNotInComments++;


                while(ch != '"')
                {
                    srcData[i] = ch;
                    i++;
                    ch = tempArray[j];
                    j++;
                    charNotInComments++;
                }


                srcData[i] = ch;
                i++;
                ch = tempArray[j];
                j++;
                charNotInComments++;
            }
            else
            {
                srcData[i] = ch;
                i++;
                ch = tempArray[j];
                j++;
                charNotInComments++;
            }
        }


        if(charNotInComments > 0) // if there were characters worth keeping, skip this
        {
            srcData[i] = ch;
            i++;
            srcData[i] = '\0';
            charNotInComments = 0;
        }


        ch = 0;
        j = 0;
    }
}

**whiteflags** · 10-25-2015

I would restructure the code so that it works like a finite state machine. Everything is easier to read that way. (You were close to doing it by yourself anyway)

**stahta01** · 10-25-2015

Am I the only one who wondered why the exercise is always to remove the comments and leave the code?
I thought it would be just as hard of a exercise to keep the comments and remove the code instead.

Tim S.

**whiteflags** · 10-25-2015

Originally Posted by stahta01

Am I the only one who wondered why the exercise is always to remove the comments and leave the code?
I thought it would be just as hard of a exercise to keep the comments and remove the code instead.

Tim S.

Definitely not! I could see a use for a program that extracts comments from code instead. Say you used doxygen to generate some documentation from comments and later you decide to use a separate file. Working from the output of such a program is better than starting over, whereas doing this seems silly. It does show how the preprocessor might implement removing comments from code though.

**deathslice** · 10-25-2015

Well I able to remove a couple of unnecessary lines of code(while also adding a couple of new variables) but it still retains its general shape and logic. Though I used a much bigger test file and it worked perfectly.

Code:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>


#define MAX_FILE_NAME 20
#define MAX_FILE_SIZE 1024


void readSourceFile(FILE *, char[], const int);
void writeToFile(FILE *, char[]);


int main()
{
    FILE *sourceFile, *outputFile;
    char srcFileName[MAX_FILE_NAME], outFileName[MAX_FILE_NAME], srcData[MAX_FILE_SIZE];


    printf("What is the name or path of the source file that you're trying to access?\n");
    fgets(srcFileName, MAX_FILE_NAME, stdin);
    printf("\nWhat is the name or path of this output file?\n");
    fgets(outFileName, MAX_FILE_NAME, stdin);


    // Remove the newline as they will cause the FILE pointers to remain NULL when trying to open the file.
    srcFileName[strcspn(srcFileName, "\n")] = '\0';
    outFileName[strcspn(outFileName, "\n")] = '\0';


    sourceFile = fopen(srcFileName, "r");
    outputFile = fopen(outFileName, "w");


    if(sourceFile == NULL)
    {
        fprintf(stderr, "\n%s: %s.\n", srcFileName, strerror(errno));
        return EXIT_FAILURE;
    }
    else if(outputFile == NULL) // This case is if source file opens but output file does not.
    {
        const int errnum = errno; // I'm using a local variable to represent errno because many functions set it when an error occurs.
        fclose(sourceFile); // If the source files is open, but the source file is not open then close the source file prematurely.
        fprintf(stderr, "\n%s: %s.\n", outFileName, strerror(errnum));
        return EXIT_FAILURE;
    }


    readSourceFile(sourceFile, srcData, MAX_FILE_SIZE);
    writeToFile(outputFile, srcData);


    // If both of them are opened and every thing went smoothly, then close them.
    fclose(sourceFile);
    fclose(outputFile);


    return 0;
}


void writeToFile(FILE *filePtr, char srcData[])
{
    fputs(srcData, filePtr);
}


void readSourceFile(FILE *filePtr, char srcData[], const int SIZE_OF_FILE)
{
    int i = 0; // This is an index counter for srcData
    int j = 0; // this is an index counter for tempArray
    int ch = 0;    // The character that will compared.
    int singleLine = 0; // If We are in a single line comment,
    int multiLine = 0; // If we are in a multi ling comment.
    int charNotInComments = 0; // The characters that we want to keep.
    char tempArray[SIZE_OF_FILE];


    // Once we reach the end of the file, we stop reading data.
    while(fgets(tempArray, SIZE_OF_FILE, filePtr) != NULL)
    {
        do
        {
            ch = tempArray[j];
            j++;


            if(singleLine == 1)
            {
                while(ch != '\n')
                {
                    ch = tempArray[j];
                    j++;
                }


                singleLine = 0;
            }
            else if(multiLine == 1)
            {
                while(ch != '\n' && multiLine == 1) // The reason is because the mult line could be either /* */ or indented.
                {
                    if(ch == '*')
                    {
                        ch = tempArray[j];
                        j++;


                        if(ch == '/')
                        {
                            multiLine = 0;
                        }
                    }


                    ch = tempArray[j];
                    j++;
                }
            }
            else if(ch == '/') // The start of a comment.
            {
                ch = tempArray[j];
                j++;


                if(ch == '*')
                {
                    multiLine = 1;
                }
                else if(ch == '/')
                {
                    singleLine = 1;
                }
            }
            else if(ch == '"') // if we find an opening double quotation mark.
            {
                srcData[i] = ch;
                charNotInComments++;
                i++;
                ch = tempArray[j];
                j++;


                while(ch != '"')
                {
                    srcData[i] = ch;
                    charNotInComments++;
                    i++;
                    ch = tempArray[j];
                    j++;
                }


                srcData[i] = ch;
                charNotInComments++;
                i++;
            }
            else // anything else really
            {
                srcData[i] = ch;
                charNotInComments++;
                i++;
            }
        }
        while(ch != '\n'); // until we reach the end of the line.


        if(charNotInComments > 0)
        {
            srcData[i] = '\0';
            charNotInComments = 0;
        }


        ch = 0;
        j = 0;
    }
}

Edit: Though yes I think the purpose of this exercise is to show how the preprocessor does one of its job and that is to remove comments. Other than that, when I first read the problem I thought it was a bit silly lol.

**Nominal Animal** · 10-25-2015

deathslice, like whiteflags wrote, this is the domain of state machines, really. I recommend you take a look.

In more than one way, you've already using the approach on your own; what you lack, is the organization, or the rigorous approach.

Since your code now works (I assume), I'll show how I'd do the same using a state machine with seven states: CODE, DOUBLEQUOTED, SINGLEQUOTED, SLASH, COMMENTLINE, COMMENT, and ASTERISK.

When in CODE state:

" causes a transition to the DOUBLEQUOTED state
' causes a transition to the SINGLEQUOTED state
/ causes a transition to the SLASH state

When in DOUBLEQUOTED state:

" causes a transition to the CODE state
and technically, a newline is illegal, but we'll ignore that.

When in SINGLEQUOTED state:

' causes a transition to the CODE state
and technically, a newline is illegal, but we'll ignore that.

When in SLASH state:

/ causes a transition to the COMMENTLINE state
* causes a transition to the COMMENT state
and all others cause a transition to the CODE state.

When in COMMENTLINE state:

a newline causes a transition to the CODE state

When in COMMENT state:

* causes a transition to the ASTERISK state

When in ASTERISK state:

/ causes a transition to the CODE state
Everything else except for * causes a transition to the COMMENT state.

If we write those rules as

Code:

digraph {
    node [ shape=record ];
    rankdir = "TB";

    CODE         [ label="{ CODE         |{ <DQ> \" | <SQ> ' | <S> / | <any> }}" ];
    DOUBLEQUOTED [ label="{ DOUBLEQUOTED |{ <DQ> \" | <any> }}" ];
    SINGLEQUOTED [ label="{ SINGLEQUOTED |{ <SQ> \' | <any> }}" ];
    SLASH        [ label="{ SLASH        |{ <S>  /  | <AST> *| <any> }}" ];
    COMMENTLINE  [ label="{ COMMENTLINE  |{ <NL> newline | <any> }}" ];
    COMMENT      [ label="{ COMMENT      |{ <AST>*  | <any> }}" ];
    ASTERISK     [ label="{ ASTERISK     |{ <S>  /  | <AST> *| <any> }}" ];

    CODE:DQ  -> DOUBLEQUOTED;
    CODE:SQ  -> SINGLEQUOTED;
    CODE:S   -> SLASH;
    CODE:any -> CODE;

    DOUBLEQUOTED:DQ  -> CODE;
    DOUBLEQUOTED:any -> DOUBLEQUOTED;

    SINGLEQUOTED:SQ  -> CODE;
    SINGLEQUOTED:any -> SINGLEQUOTED;

    SLASH:S   -> COMMENTLINE;
    SLASH:AST -> COMMENT;
    SLASH:any -> CODE;

    COMMENTLINE:NL  -> CODE;
    COMMENTLINE:any -> COMMENTLINE;

    COMMENT:AST -> ASTERISK;
    COMMENT:any -> COMMENT;

    ASTERISK:S   -> CODE;
    ASTERISK:AST -> ASTERISK;
    ASTERISK:any -> CODE;
}

then Graphviz's dot gives us this nice state diagram:
removing comments from c text file-graph-png

I'll also add optional output streams for the code part, strings and character constants (one per line), and comments (one per line). For good measure, this also supports all newline encodings ("universal newlines", i.e. CR, LF, CRLF, and LFCR).

If you start in the CODE state, and you switch to the state that corresponds to the next character (if not listed in the boxes below the state, then use the empty box), you should be able to parse any correct C code. I haven't checked whether my state machine is correct, and whether my code implements it correctly, though!

(I did throw some code at it, and it seems to work; the important part is the process and the structure/approach here.)

Here's the implementation:

Code:

#include <stdlib.h>
#include <stdio.h>

typedef enum {
    CODE = 0,
    DOUBLEQUOTED = 1,
    SINGLEQUOTED = 2,
    SLASH = 3,
    COMMENTLINE = 4,
    COMMENT = 5,
    ASTERISK = 6
} input_state;

void process(FILE *const in, FILE *const code, FILE *const strings, FILE *const comments)
{
    input_state  state = CODE;
    int          c;

    while (1) {

        c = getc(in);
        if (c == EOF)
            break;

        /* Universal newline support */
        if (c == '\n') {
            c = getc(in);
            if (c != '\r')
                ungetc(c, in);
            c = '\n';
        } else
        if (c == '\r') {
            c = getc(in);
            if (c != '\n')
                ungetc(c, in);
            c = '\n';
        }

        switch (state) {
        case CODE:
            if (c == '/')
                state = SLASH;
            else {
                if (code)
                    fputc(c, code);
                if (c == '"')
                    state = DOUBLEQUOTED;
                else
                if (c == '\'')
                    state = SINGLEQUOTED;
            }
            break;

        case DOUBLEQUOTED:
            if (code)
                fputc(c, code);
            if (c == '"') {
                state = CODE;
                if (strings)
                    fputc('\n', strings);
            } else {
                if (strings)
                    fputc(c, strings);
            }
            break;

        case SINGLEQUOTED:
            if (code)
                fputc(c, code);
            if (c == '\'') {
                state = CODE;
                if (strings)
                    fputc('\n', strings);
            } else {
                if (strings)
                    fputc(c, strings);
            }
            break;

        case SLASH:
            if (c == '/')
                state = COMMENTLINE;
            else
            if (c == '*')
                state = COMMENT;
            else
            if (code) {
                fputc('/', code);
                fputc(c, code);
                state = CODE;
            }
            break;

        case COMMENTLINE:
            if (comments)
                fputc(c, comments);
            if (c == '\n')
                state = CODE;
            break;

        case COMMENT:
            if (comments)
                fputc(c, comments);
            if (c == '*')
                state = ASTERISK;
            break;

        case ASTERISK:
            if (c == '/')
                state = CODE;
            else {
                if (comments)
                    fputc('*', comments);
                if (c != '*')
                    state = COMMENT;
            }
            break;
        }
    }

    /* State should be CODE here. */
}

Although pen and paper are good tools, I write so much code snippets that I prefer to keep my notes in digital form, with each code snippet and relevant notes in the same directory. Graphviz's graphs are very useful (and they're fast to write after you get familiar with the simple syntax); for math I use either plain text or LibreOffice Writer (I like its easy-to-use formula editor). I could use LaTeX (or LyX for example) for either or both, and might switch back in the future. There certainly are a lot of tools to choose from.

**deathslice** · 10-25-2015

Very nice example. Maybe later I'll go back to this exercise and redo this way but for I'm just going to keep pushing forward. Cheers

**Hodor** · 10-26-2015

Originally Posted by Nominal Animal

deathslice, like whiteflags wrote, this is the domain of state machines, really. I recommend you take a look.

You went to all that time and effort writing a fantastic explanation and used a switch?

I haven't tested the code below much. There are known issues (escaped characters for example) but handling them is just another state so easy to add. I wouldn't normally write it like this (I doubt I'd use arrays... more likely some other more appropriate data structure, but *shrug* it saved time for something that's only meant as an example AND "translating" your description to an array was easier. Ok, I was lazy).

Code:

/*
 * Known issues:
 * a) Escaped characters in quotes or double quotes
 * b) Multiline comments end up being replaced by a newline (not a huge issue)
 * c) probably more
 *
 * I apologise for the lack of comments. I removed them.
 */
#include <stdlib.h>
#include <stdio.h>

enum StateId {
    UNDEFINED_STATE,
    CODE,                /* */
    DOUBLEQUOTED,
    SINGLEQUOTED,
    SLASH,
    COMMENTLINE,
    COMMENT,
    ASTERISK
};

#define MAX_TRANSITIONS 4
#define ANY_CHAR '\0'

#define MATCH 0
#define MATCH_NOT 1

/* Pointless comment
 */
struct Transition {
    int match_not;
    char match;
    enum StateId next;
    int print_prev;
    int suppress_out;
};

struct State {
    enum StateId state;
    struct Transition transitions[MAX_TRANSITIONS];
};

static const struct State StateMachine[] = {
    { 0 },    // Undefined
    { CODE, {{MATCH, '"', DOUBLEQUOTED}, {MATCH, '\'', SINGLEQUOTED}, {MATCH, '/', SLASH}} },
    { DOUBLEQUOTED, {{MATCH, '"', CODE}} },
    { SINGLEQUOTED, {{MATCH, '\'', CODE}} },
    { SLASH, {{MATCH, '/', COMMENTLINE}, {MATCH, '*', COMMENT}, {MATCH, ANY_CHAR, CODE, 1}} },
    { COMMENTLINE, {{MATCH, '\n', CODE}} },
    { COMMENT, {{MATCH, '*', ASTERISK}} },
    { ASTERISK, {{MATCH, '/', CODE, 0, 1}, {MATCH_NOT, '*', COMMENT}} }
};

enum StateId get_state_idx(int c, enum StateId current_state, int *print_prev, int *suppress)
{
    int token_match;
    int candidate_tr;

    const struct State *st = &StateMachine[current_state];

    for (candidate_tr = 0; st->transitions[candidate_tr].next != UNDEFINED_STATE; candidate_tr++) {

        if (st->transitions[candidate_tr].match == ANY_CHAR)
            token_match = 1;
        else
            token_match = (c == st->transitions[candidate_tr].match);

        if (st->transitions[candidate_tr].match_not)
            token_match = !token_match;

        if (token_match) {
            current_state = st->transitions[candidate_tr].next;
            *print_prev = st->transitions[candidate_tr].print_prev;
            *suppress = st->transitions[candidate_tr].suppress_out;
            break;
        }
    }

    return current_state;
}

void process(FILE *in, FILE *out)
{
    int c;
    int prev_c = '\0';
    int print_prev = 0;    /* bool */
    int suppress_out = 0; /* bool */

    enum StateId st = CODE;

    while ((c = getc(in)) != EOF) {
        st = get_state_idx(c, st, &print_prev, &suppress_out);
        if (print_prev && prev_c != '\0') {
            putc(prev_c, out);
            print_prev = 0;
        }
        switch (st) {
            case CODE: case DOUBLEQUOTED: case SINGLEQUOTED:
                if (!suppress_out)
                    putc(c, out);
                else
                    suppress_out = 0;
                break;
            default:
                break;
        }
        prev_c = c;
    }
}

int main(void)
{
#if 0
    int a = 2 / 3;
    printf("This removes comments like /* comment */ and // comment from a file\n");
    printf("Since it's a test input is from stdin and output is stdout\n");
#endif
    process(stdin, stdout);
    return 0;
}

Thread: removing comments from c text file

Thread Tools

Search Thread

Display

removing comments from c text file

Similar Threads

Removing Comments

removing comments of type '//' and '/*'

removing text from a file

Problem parsing comments and such in text file

Removing text between /* */ in a file