Thread: HTML parser that crashes with longer strings

  1. #1
    Registered User
    Join Date
    Jan 2010
    Posts
    3

    HTML parser that crashes with longer strings

    Hello,

    The post is probably going to draw a lot of criticism, given that
    a) it's a hobby project,
    b) it contains a lot of poorly written code.

    Anyway...

    I tried to write an HTML parser that puts all the DOM data into a tree-like structure. The parser works well, but only if none of the child nodes contains larger string. Strangely, the problem seems not to occur with higher nodes (the first level of depth). Here's the header for the parser:

    Code:
    #ifndef _PARSER_H_
    #define _PARSER_H_
    
    #ifdef __cplusplus
    extern "C" {
    #endif
    
    typedef struct par_pair{
        const char* name;
        char* value;
    } t_par_pair;
    
    typedef t_par_pair* PParPair;
    
    typedef struct tag_data {
        char* src_string;
        char* tag_name;
        char* tag_value;
        char* start_offset;
        char* end_offset;
        t_par_pair* params;
        struct tag_dat.......... children;
        unsigned int par_nr;
        unsigned int child_count;
    } t_tag_data;
    
    typedef t_tag_data* PTagData;
    
    int is_comment(char*);
    static void tag_children_src(PTagData tag);
    static void strip_chars(char*, char*);
    static int compare_params(const struct par_pair* p1, const struct par_pair* p2);
    int compare_tags(const PTagData*, const PTagData*);
    //char* tolower(const char* string);
    static size_t get_param_count(PTagData tag);
    static PTagData tag_init(const char* name);
    static void destroy_param(t_par_pair param);
    void tag_destroy(PTagData tag);
    static t_par_pair init_param(const char* name, char* value);
    static void add_parameter(PTagData tdata, const char* name, char* value);
    static char* get_tag_opening(char* offset);
    static void set_tag_value_src(PTagData tag);
    char* get_param_by_name(PTagData tag, const char* name);
    static PTagData parse_tag_opening(char* offset);
    static char* get_tag_ending(PTagData tag, char* offset);
    PTagData get_tag_data(char* offset);
    static void tag_append_child(PTagData parent, PTagData child);
    void strip_chars(char* string, char* chars);
    static void tag_children_src(PTagData tag);
    PTagData get_root_node(char* html);
    void tag_traverse(PTagData root, void(*callback)(PTagData));
    
    #ifdef __cplusplus
    }
    #endif
    
    #endif
    And here's the implementation:

    Code:
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <conio.h>
    #include "parser.h"
    
    
    /*
     * Comparing parameters by key value
     */
    
    int compare_params(const struct par_pair* p1, const struct par_pair* p2) {
        return strcmp(p1->name, p2->name);
    }
    
    /*
     * Comparing tags by name
     */
    
    int compare_tags(const PTagData* t1, const PTagData* t2) {
        return strcmp((*(*t1)).tag_name, (*(*t2)).tag_name);
    }
    
    /*
     * String to lowercase
     * ASCII only, should suffice for the tag names
     */
    
    /*char* tolower(const char* string) {
        int i;
        char* newstring = (char*) malloc(strlen(string));
        for(i=0; i<strlen(string); i++) {
            if(((int)string[i]>64)&&((int)string[i]<91))
            newstring[i] = (char) string[i]+32;
            else newstring[i] = string[i];
        }
        newstring[++i] = '\0';
        return newstring;
    }*/
    
    size_t get_param_count(PTagData tag) {
        return sizeof(tag->params)/sizeof(PParPair);
    }
    
    static PTagData tag_init(const char* name) {
        PTagData tag = (PTagData) malloc(sizeof(t_tag_data));
        tag->tag_name = name;
        tag->params = (t_par_pair*) calloc(10, sizeof(t_par_pair));
        tag->children = (PTagData) calloc(10, sizeof(PTagData));
        tag->par_nr = 0;
        tag->child_count = 0;
        tag->tag_value = NULL;
        return tag;
    }
    
    static void destroy_param(t_par_pair param) {
        free(param.value);
    }
    
    void tag_destroy(PTagData tag) {
        int i;
    
        /*
         * Deallocate child nodes
         */
    
        if(tag->child_count) {
            for(i=0; i<tag->child_count; i++) {
                tag_destroy(tag->children[i]);
            }
        }
    
        /*
         * Deallocate parameters
         */
    
        for(i = 0; i<tag->par_nr; i++) {
            destroy_param(tag->params[i]);
        }
        free(tag->params);
    
    
        free(tag->tag_value);
        free(tag->src_string);
        free(tag);
    }
    
    static t_par_pair init_param(const char* name, char* value) {
        t_par_pair pair;
        pair.name  = name;
        pair.value = (char*) malloc(strlen(value));
        memset(pair.value, '\0', strlen(value));
        strcpy(pair.value, value);
        return pair;
    }
    
    
    static void add_parameter(PTagData tdata, const char* name, char* value) {
        int last_cell  = tdata->par_nr-1;
        size_t old_size, new_size;
    
        t_par_pair param = init_param(name, value);
    
        //add parameter to tag data structure
        old_size = sizeof(tdata->params);
        new_size = old_size + sizeof(param);
        tdata->params = (t_par_pair*) realloc(tdata->params, sizeof(t_par_pair)*(++tdata->par_nr));
        tdata->params[last_cell+1] = param;
        qsort(tdata->params, tdata->par_nr, sizeof(t_par_pair), compare_params);
    }
    
    static char* get_tag_opening(char* offset) {
        size_t tag_length;
        char* tag_end;
        char* tag_text;
    
        tag_end = strpbrk(offset, ">");
        tag_length = tag_end - offset;
    
        tag_text = (char*) malloc(tag_length);
        tag_text[tag_length] = '\0';
        memcpy(tag_text, offset, tag_length);
        return tag_text;
    }
    
    /*
     * Get tag value, ignore children
     */
    
    static void set_tag_value_src(PTagData tag) {
        char* src = tag->src_string;
        char* offset = strpbrk(src, ">");
        char* tag_ending;
        char  tag_end_clause[256];
        char* tag_value;
        size_t val_length;
    
        sprintf(tag_end_clause, "</%s>", tag->tag_name);
        tag_ending = strstr(tag->src_string, tag_end_clause);
        val_length = tag_ending-offset;
        tag_value = (char*) malloc(val_length);
        memcpy(tag_value, ++offset, val_length);
        tag->tag_value = tag_value;
        tag->tag_value[val_length-1] = '\0';
    }
    
    
    char* get_param_by_name(PTagData tag, const char* name) {
        t_par_pair temp_pair;
        PParPair ret_pair;
        temp_pair.name = name;
    
        ret_pair = bsearch(&temp_pair, tag->params, tag->par_nr, sizeof(t_par_pair), compare_params);
        return ret_pair->value;
    }
    
    /*
     * Get all the relevant data from the opening
     * of the tag
     */
    
    static PTagData parse_tag_opening(char* offset) {
        char* tag_opening = get_tag_opening(offset);
        char* params_offset = strpbrk(tag_opening, " ");
        char* tag_name;
        char* pptr;
        short int flag = 0;
        unsigned char  names[256][256];
        unsigned char  values[256][1024];
        int n_count = -1;
        int v_count = -1;
        size_t name_length;// = params_offset - tag_opening;
        PTagData tag;
    
        if(params_offset==NULL)
            name_length = strlen(tag_opening);
        else name_length = params_offset - tag_opening;
    
    
    
        tag_name = (char*) malloc(name_length);
        tag_name[name_length] = '\0';
        memcpy(tag_name, offset, name_length);
        tag = tag_init(tag_name);
    
    
        if(offset[name_length]!='>') {
        pptr = strtok(params_offset, "\"=");
        while(pptr!=NULL) {
            if(pptr[0]==' ') ++pptr;
            if(!flag) {
                strcpy(names[++n_count], pptr);
                flag = 1;
            }
            else {
                strcpy(values[++v_count], pptr);
                flag = 0;
            }
            pptr = strtok(NULL, "\"=");
        }
    
        for(n_count = 0; n_count<=v_count; n_count++) {
            char* name = names[n_count];
            char* value = values[n_count];
    
    
            char* param_name = (char*) malloc(strlen(name)+1);
            char* param_value = (char*) malloc(strlen(value)+1);
    
            strcpy(param_name, name);
            strcpy(param_value, value);
    
    
            param_value[strlen(param_value)] = '\0';
            add_parameter(tag, param_name, param_value);
         }
        }
        free(tag_opening);
        return tag;
    }
    
    static char* get_tag_ending(PTagData tag, char* offset) {
        char s_string[256];
        char* name = tag->tag_name;
        char* tag_ending;
        memset(s_string, '\0', strlen(s_string));
        sprintf(s_string, "</%s>", name);
        tag_ending = strstr(offset, s_string);
        if(tag_ending==NULL) return NULL;
        tag_ending = strpbrk(tag_ending, ">");
        return ++tag_ending;
    }
    
    
    PTagData get_tag_data(char* offset) {
        char* tag_start = strpbrk(offset, "<");
        char* tag_end;
        char* tag_string;
        size_t tag_size;
    
        PTagData tag = parse_tag_opening(++tag_start);
    
        tag_end = get_tag_ending(tag, offset);
        if(tag_end!=NULL) {
        tag_size = tag_end - offset;
        tag_string = (char*) malloc(tag_size);
    
        tag->src_string = tag_string;
        tag->start_offset = tag_start;
        tag->end_offset = tag_end;
    
        memcpy(tag_string, offset, tag_size);
        tag_string[tag_size] = '\0';
    
        set_tag_value_src(tag);
        tag_children_src(tag);
        }
        return tag;
    }
    
    
    static void tag_append_child(PTagData parent, PTagData child) {
        size_t pos = parent->child_count;
        parent->children = (PTagData*) realloc(parent->children, ++parent->child_count*sizeof(PTagData));
        parent->children[pos] = child;
    }
    
    /*
     * Strip newlines prior to parsing
     * the string
     */
    
    void strip_chars(char* string, char* chars) {
        char new_string[strlen(string)];
        char* offset;
        offset = strtok(string, chars);
        while(offset!=NULL) {
            strcat(new_string, offset);
            offset = strtok(NULL, chars);
        }
        memset(string, '\0', strlen(string));
        strcpy(string, new_string);
    }
    
    
    /*
     * Iterate recursively through the inner tags
     * and append them to the structure
     */
    
    static void tag_children_src(PTagData tag) {
        char* cptr = tag->tag_value;
        while((cptr=strpbrk(cptr, "<"))!=NULL) {
            PTagData child = get_tag_data(cptr);
            tag_append_child(tag, child);
            cptr = child->end_offset;
        }
    }
    
    char* load_html(const char* path) {
        size_t file_size;
    
        FILE* html = fopen(path, "r");
        if(html==NULL) {
            fprintf(stderr, "File not found.\n");
            return NULL;
        }
        fseek(html, 0, SEEK_END);
        file_size = ftell(html);
        rewind(html);
        unsigned char* content = (unsigned char*) malloc(file_size);
        memset(content, '\0', file_size);
        fread(content, file_size, 1, html);
        fclose(html);
        return content;
    }
    
    /*
     * Recursively traverse the DOM structure and
     * perform the callback
     */
    
    void tag_traverse(PTagData root, void(*callback)(PTagData tag)) {
    	int i;
    	if(root->child_count>0) {
    		for(i=0; i<root->child_count; i++) {
    			PTagData tag = root->children[i];
    			if(tag!=NULL) {
    				callback(tag);
    				if(tag->child_count) tag_traverse(tag, callback);
    				}
    		}
    	}
    }
    
    PTagData get_root_node(char* html) {
        return get_tag_data(html);
    }
    The get_tag_data function is the entry point for all the further procedures. The parse_tag_opening function acquires the tag name and the parameters of the tag. The tag_children_src function gets all the data for the child nodes of a particular tag and, oddly, it seems to crash with the following HTML (a modified version of the example.com):

    Code:
    <HTML>
    <HEAD>
      <TITLE>Example Web Page</TITLE>
    </HEAD> 
    <BODY>
    aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
    <p style="">You have somehow reached this web page by typing &quot;example.com&quot;,
    &quot;example.net&quot;,
      or &quot;example.org&quot; into your web browser.</p>
    <p>These domain names are reserved for use in documentation and are not available 
      for registration. See <a href="http://www.rfc-editor.org/rfc/rfc2606.txt">RFC 
      2606</a>, Section 3.</p>
    </BODY>
    </HTML>
    If the HTML string is too long in one of the child tags, the program stops responding and crashes after approximately 10 seconds. Debugging gave me no clues as to why it happens. Without the 'aaaa' thing the parser works perfectly.

    Any help will be greatly appreciated.

  2. #2
    Registered User UltraKing227's Avatar
    Join Date
    Jan 2010
    Location
    USA, New york
    Posts
    123
    try increasing the elements of the array which is used in the parser.

  3. #3
    Registered User
    Join Date
    Jan 2010
    Posts
    3
    Thanks, but I've already tried to increase the memory allocation for the strings and it didn't help. The memory for the child nodes is reallocated each time a node is appended.

    EDIT: It seems that the lower depth of the document, the shorter the strings needed to crash the parser.
    Last edited by almos; 03-05-2010 at 07:16 AM. Reason: Providing additional data

  4. #4
    and the hat of int overfl Salem's Avatar
    Join Date
    Aug 2001
    Location
    The edge of the known universe
    Posts
    39,659
    Code:
    /*char* tolower(const char* string) {
        int i;
        char* newstring = (char*) malloc(strlen(string));
        for(i=0; i<strlen(string); i++) {
            if(((int)string[i]>64)&&((int)string[i]<91))
            newstring[i] = (char) string[i]+32;
            else newstring[i] = string[i];
        }
        newstring[++i] = '\0';
        return newstring;
    }*/
    Try this (yours is broke on nearly every line)
    Code:
    char* tolower(const char* string) {
        int i;
        char* newstring = malloc(strlen(string)+1); /* forgot to count the \0, and casting */
        for(i=0; i string[i] != '\0'; i++) {        /* don't use strlen to find the \0 */
            if ( isupper(string[i]) )               /* goodies in ctype.h */
              newstring[i] = tolower( string[i] );  /* more goodies */
            else  
              newstring[i] = string[i];
        }
        newstring[i] = '\0';                        /* erroneous ++i */
        return newstring;
    }

    > In init_param()
    Yet more off-by-1

    > pair.value = (char*) malloc(strlen(value));
    You forgot to add 1 for the \0

    > memset(pair.value, '\0', strlen(value));
    This is a waste of effort, you're going to write every byte with the strcpy anyway.



    Then later on,
    > memset(s_string, '\0', strlen(s_string));
    > sprintf(s_string, "</%s>", name);
    Now this is just bone-headed stupid.
    If s_string LACKS a \0 within it's bounds, then strlen will just keep roaming though the rest of your stack (and beyond) until it either finds a \0, or it segfaults.
    Then you ERASE ALL THAT DATA (stack and all).
    a) it's a pointless exercise - the sprintf will add a \0 anyway
    b) if (and I do mean IF) you really feel the need to do this, then use sizeof(s_string) not strlen.


    > tag_name = (char*) malloc(name_length);
    > tag_name[name_length] = '\0';
    Yet more off by 1
    Got the idea yet?
    If you dance barefoot on the broken glass of undefined behaviour, you've got to expect the occasional cut.
    If at first you don't succeed, try writing your phone number on the exam paper.

Popular pages Recent additions subscribe to a feed

Similar Threads

  1. html web translator/interpreter
    By Aisthesis in forum C++ Programming
    Replies: 6
    Last Post: 08-02-2009, 02:17 PM
  2. Please Help - C code creates dynamic HTML
    By Christie2008 in forum C Programming
    Replies: 19
    Last Post: 04-02-2008, 07:36 PM
  3. Easy C# way to unescape HTML strings?
    By Cat in forum C# Programming
    Replies: 3
    Last Post: 11-15-2006, 11:59 PM
  4. very simple html parser
    By chad101 in forum C++ Programming
    Replies: 1
    Last Post: 07-26-2006, 07:18 PM
  5. menus and strings
    By garycastillo in forum C Programming
    Replies: 3
    Last Post: 04-29-2002, 11:23 AM