HTML parser that crashes with longer strings
Hello,
The post is probably going to draw a lot of criticism, given that
a) it's a hobby project,
b) it contains a lot of poorly written code.
Anyway...
I tried to write an HTML parser that puts all the DOM data into a tree-like structure. The parser works well, but only if none of the child nodes contains larger string. Strangely, the problem seems not to occur with higher nodes (the first level of depth). Here's the header for the parser:
Code:
#ifndef _PARSER_H_
#define _PARSER_H_
#ifdef __cplusplus
extern "C" {
#endif
typedef struct par_pair{
const char* name;
char* value;
} t_par_pair;
typedef t_par_pair* PParPair;
typedef struct tag_data {
char* src_string;
char* tag_name;
char* tag_value;
char* start_offset;
char* end_offset;
t_par_pair* params;
struct tag_dat.......... children;
unsigned int par_nr;
unsigned int child_count;
} t_tag_data;
typedef t_tag_data* PTagData;
int is_comment(char*);
static void tag_children_src(PTagData tag);
static void strip_chars(char*, char*);
static int compare_params(const struct par_pair* p1, const struct par_pair* p2);
int compare_tags(const PTagData*, const PTagData*);
//char* tolower(const char* string);
static size_t get_param_count(PTagData tag);
static PTagData tag_init(const char* name);
static void destroy_param(t_par_pair param);
void tag_destroy(PTagData tag);
static t_par_pair init_param(const char* name, char* value);
static void add_parameter(PTagData tdata, const char* name, char* value);
static char* get_tag_opening(char* offset);
static void set_tag_value_src(PTagData tag);
char* get_param_by_name(PTagData tag, const char* name);
static PTagData parse_tag_opening(char* offset);
static char* get_tag_ending(PTagData tag, char* offset);
PTagData get_tag_data(char* offset);
static void tag_append_child(PTagData parent, PTagData child);
void strip_chars(char* string, char* chars);
static void tag_children_src(PTagData tag);
PTagData get_root_node(char* html);
void tag_traverse(PTagData root, void(*callback)(PTagData));
#ifdef __cplusplus
}
#endif
#endif
And here's the implementation:
Code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <conio.h>
#include "parser.h"
/*
* Comparing parameters by key value
*/
int compare_params(const struct par_pair* p1, const struct par_pair* p2) {
return strcmp(p1->name, p2->name);
}
/*
* Comparing tags by name
*/
int compare_tags(const PTagData* t1, const PTagData* t2) {
return strcmp((*(*t1)).tag_name, (*(*t2)).tag_name);
}
/*
* String to lowercase
* ASCII only, should suffice for the tag names
*/
/*char* tolower(const char* string) {
int i;
char* newstring = (char*) malloc(strlen(string));
for(i=0; i<strlen(string); i++) {
if(((int)string[i]>64)&&((int)string[i]<91))
newstring[i] = (char) string[i]+32;
else newstring[i] = string[i];
}
newstring[++i] = '\0';
return newstring;
}*/
size_t get_param_count(PTagData tag) {
return sizeof(tag->params)/sizeof(PParPair);
}
static PTagData tag_init(const char* name) {
PTagData tag = (PTagData) malloc(sizeof(t_tag_data));
tag->tag_name = name;
tag->params = (t_par_pair*) calloc(10, sizeof(t_par_pair));
tag->children = (PTagData) calloc(10, sizeof(PTagData));
tag->par_nr = 0;
tag->child_count = 0;
tag->tag_value = NULL;
return tag;
}
static void destroy_param(t_par_pair param) {
free(param.value);
}
void tag_destroy(PTagData tag) {
int i;
/*
* Deallocate child nodes
*/
if(tag->child_count) {
for(i=0; i<tag->child_count; i++) {
tag_destroy(tag->children[i]);
}
}
/*
* Deallocate parameters
*/
for(i = 0; i<tag->par_nr; i++) {
destroy_param(tag->params[i]);
}
free(tag->params);
free(tag->tag_value);
free(tag->src_string);
free(tag);
}
static t_par_pair init_param(const char* name, char* value) {
t_par_pair pair;
pair.name = name;
pair.value = (char*) malloc(strlen(value));
memset(pair.value, '\0', strlen(value));
strcpy(pair.value, value);
return pair;
}
static void add_parameter(PTagData tdata, const char* name, char* value) {
int last_cell = tdata->par_nr-1;
size_t old_size, new_size;
t_par_pair param = init_param(name, value);
//add parameter to tag data structure
old_size = sizeof(tdata->params);
new_size = old_size + sizeof(param);
tdata->params = (t_par_pair*) realloc(tdata->params, sizeof(t_par_pair)*(++tdata->par_nr));
tdata->params[last_cell+1] = param;
qsort(tdata->params, tdata->par_nr, sizeof(t_par_pair), compare_params);
}
static char* get_tag_opening(char* offset) {
size_t tag_length;
char* tag_end;
char* tag_text;
tag_end = strpbrk(offset, ">");
tag_length = tag_end - offset;
tag_text = (char*) malloc(tag_length);
tag_text[tag_length] = '\0';
memcpy(tag_text, offset, tag_length);
return tag_text;
}
/*
* Get tag value, ignore children
*/
static void set_tag_value_src(PTagData tag) {
char* src = tag->src_string;
char* offset = strpbrk(src, ">");
char* tag_ending;
char tag_end_clause[256];
char* tag_value;
size_t val_length;
sprintf(tag_end_clause, "</%s>", tag->tag_name);
tag_ending = strstr(tag->src_string, tag_end_clause);
val_length = tag_ending-offset;
tag_value = (char*) malloc(val_length);
memcpy(tag_value, ++offset, val_length);
tag->tag_value = tag_value;
tag->tag_value[val_length-1] = '\0';
}
char* get_param_by_name(PTagData tag, const char* name) {
t_par_pair temp_pair;
PParPair ret_pair;
temp_pair.name = name;
ret_pair = bsearch(&temp_pair, tag->params, tag->par_nr, sizeof(t_par_pair), compare_params);
return ret_pair->value;
}
/*
* Get all the relevant data from the opening
* of the tag
*/
static PTagData parse_tag_opening(char* offset) {
char* tag_opening = get_tag_opening(offset);
char* params_offset = strpbrk(tag_opening, " ");
char* tag_name;
char* pptr;
short int flag = 0;
unsigned char names[256][256];
unsigned char values[256][1024];
int n_count = -1;
int v_count = -1;
size_t name_length;// = params_offset - tag_opening;
PTagData tag;
if(params_offset==NULL)
name_length = strlen(tag_opening);
else name_length = params_offset - tag_opening;
tag_name = (char*) malloc(name_length);
tag_name[name_length] = '\0';
memcpy(tag_name, offset, name_length);
tag = tag_init(tag_name);
if(offset[name_length]!='>') {
pptr = strtok(params_offset, "\"=");
while(pptr!=NULL) {
if(pptr[0]==' ') ++pptr;
if(!flag) {
strcpy(names[++n_count], pptr);
flag = 1;
}
else {
strcpy(values[++v_count], pptr);
flag = 0;
}
pptr = strtok(NULL, "\"=");
}
for(n_count = 0; n_count<=v_count; n_count++) {
char* name = names[n_count];
char* value = values[n_count];
char* param_name = (char*) malloc(strlen(name)+1);
char* param_value = (char*) malloc(strlen(value)+1);
strcpy(param_name, name);
strcpy(param_value, value);
param_value[strlen(param_value)] = '\0';
add_parameter(tag, param_name, param_value);
}
}
free(tag_opening);
return tag;
}
static char* get_tag_ending(PTagData tag, char* offset) {
char s_string[256];
char* name = tag->tag_name;
char* tag_ending;
memset(s_string, '\0', strlen(s_string));
sprintf(s_string, "</%s>", name);
tag_ending = strstr(offset, s_string);
if(tag_ending==NULL) return NULL;
tag_ending = strpbrk(tag_ending, ">");
return ++tag_ending;
}
PTagData get_tag_data(char* offset) {
char* tag_start = strpbrk(offset, "<");
char* tag_end;
char* tag_string;
size_t tag_size;
PTagData tag = parse_tag_opening(++tag_start);
tag_end = get_tag_ending(tag, offset);
if(tag_end!=NULL) {
tag_size = tag_end - offset;
tag_string = (char*) malloc(tag_size);
tag->src_string = tag_string;
tag->start_offset = tag_start;
tag->end_offset = tag_end;
memcpy(tag_string, offset, tag_size);
tag_string[tag_size] = '\0';
set_tag_value_src(tag);
tag_children_src(tag);
}
return tag;
}
static void tag_append_child(PTagData parent, PTagData child) {
size_t pos = parent->child_count;
parent->children = (PTagData*) realloc(parent->children, ++parent->child_count*sizeof(PTagData));
parent->children[pos] = child;
}
/*
* Strip newlines prior to parsing
* the string
*/
void strip_chars(char* string, char* chars) {
char new_string[strlen(string)];
char* offset;
offset = strtok(string, chars);
while(offset!=NULL) {
strcat(new_string, offset);
offset = strtok(NULL, chars);
}
memset(string, '\0', strlen(string));
strcpy(string, new_string);
}
/*
* Iterate recursively through the inner tags
* and append them to the structure
*/
static void tag_children_src(PTagData tag) {
char* cptr = tag->tag_value;
while((cptr=strpbrk(cptr, "<"))!=NULL) {
PTagData child = get_tag_data(cptr);
tag_append_child(tag, child);
cptr = child->end_offset;
}
}
char* load_html(const char* path) {
size_t file_size;
FILE* html = fopen(path, "r");
if(html==NULL) {
fprintf(stderr, "File not found.\n");
return NULL;
}
fseek(html, 0, SEEK_END);
file_size = ftell(html);
rewind(html);
unsigned char* content = (unsigned char*) malloc(file_size);
memset(content, '\0', file_size);
fread(content, file_size, 1, html);
fclose(html);
return content;
}
/*
* Recursively traverse the DOM structure and
* perform the callback
*/
void tag_traverse(PTagData root, void(*callback)(PTagData tag)) {
int i;
if(root->child_count>0) {
for(i=0; i<root->child_count; i++) {
PTagData tag = root->children[i];
if(tag!=NULL) {
callback(tag);
if(tag->child_count) tag_traverse(tag, callback);
}
}
}
}
PTagData get_root_node(char* html) {
return get_tag_data(html);
}
The get_tag_data function is the entry point for all the further procedures. The parse_tag_opening function acquires the tag name and the parameters of the tag. The tag_children_src function gets all the data for the child nodes of a particular tag and, oddly, it seems to crash with the following HTML (a modified version of the example.com):
Code:
<HTML>
<HEAD>
<TITLE>Example Web Page</TITLE>
</HEAD>
<BODY>
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
<p style="">You have somehow reached this web page by typing "example.com",
"example.net",
or "example.org" into your web browser.</p>
<p>These domain names are reserved for use in documentation and are not available
for registration. See <a href="http://www.rfc-editor.org/rfc/rfc2606.txt">RFC
2606</a>, Section 3.</p>
</BODY>
</HTML>
If the HTML string is too long in one of the child tags, the program stops responding and crashes after approximately 10 seconds. Debugging gave me no clues as to why it happens. Without the 'aaaa' thing the parser works perfectly.
Any help will be greatly appreciated.