Code:
#ifndef _PARSER_H_
#define _PARSER_H_
#ifdef __cplusplus
extern "C" {
#endif
typedef struct par_pair{
const char* name;
char* value;
} t_par_pair;
typedef t_par_pair* PParPair;
typedef struct tag_data {
char* src_string;
char* tag_name;
char* tag_value;
char* start_offset;
char* end_offset;
t_par_pair* params;
struct tag_dat.......... children;
unsigned int par_nr;
unsigned int child_count;
} t_tag_data;
typedef t_tag_data* PTagData;
int is_comment(char*);
static void tag_children_src(PTagData tag);
static void strip_chars(char*, char*);
static int compare_params(const struct par_pair* p1, const struct par_pair* p2);
int compare_tags(const PTagData*, const PTagData*);
//char* tolower(const char* string);
static size_t get_param_count(PTagData tag);
static PTagData tag_init(const char* name);
static void destroy_param(t_par_pair param);
void tag_destroy(PTagData tag);
static t_par_pair init_param(const char* name, char* value);
static void add_parameter(PTagData tdata, const char* name, char* value);
static char* get_tag_opening(char* offset);
static void set_tag_value_src(PTagData tag);
char* get_param_by_name(PTagData tag, const char* name);
static PTagData parse_tag_opening(char* offset);
static char* get_tag_ending(PTagData tag, char* offset);
PTagData get_tag_data(char* offset);
static void tag_append_child(PTagData parent, PTagData child);
void strip_chars(char* string, char* chars);
static void tag_children_src(PTagData tag);
PTagData get_root_node(char* html);
void tag_traverse(PTagData root, void(*callback)(PTagData));
#ifdef __cplusplus
}
#endif
#endif
And here's the implementation:
Code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <conio.h>
#include "parser.h"
/*
* Comparing parameters by key value
*/
int compare_params(const struct par_pair* p1, const struct par_pair* p2) {
return strcmp(p1->name, p2->name);
}
/*
* Comparing tags by name
*/
int compare_tags(const PTagData* t1, const PTagData* t2) {
return strcmp((*(*t1)).tag_name, (*(*t2)).tag_name);
}
/*
* String to lowercase
* ASCII only, should suffice for the tag names
*/
/*char* tolower(const char* string) {
int i;
char* newstring = (char*) malloc(strlen(string));
for(i=0; i<strlen(string); i++) {
if(((int)string[i]>64)&&((int)string[i]<91))
newstring[i] = (char) string[i]+32;
else newstring[i] = string[i];
}
newstring[++i] = '\0';
return newstring;
}*/
size_t get_param_count(PTagData tag) {
return sizeof(tag->params)/sizeof(PParPair);
}
static PTagData tag_init(const char* name) {
PTagData tag = (PTagData) malloc(sizeof(t_tag_data));
tag->tag_name = name;
tag->params = (t_par_pair*) calloc(10, sizeof(t_par_pair));
tag->children = (PTagData) calloc(10, sizeof(PTagData));
tag->par_nr = 0;
tag->child_count = 0;
tag->tag_value = NULL;
return tag;
}
static void destroy_param(t_par_pair param) {
free(param.value);
}
void tag_destroy(PTagData tag) {
int i;
/*
* Deallocate child nodes
*/
if(tag->child_count) {
for(i=0; i<tag->child_count; i++) {
tag_destroy(tag->children[i]);
}
}
/*
* Deallocate parameters
*/
for(i = 0; i<tag->par_nr; i++) {
destroy_param(tag->params[i]);
}
free(tag->params);
free(tag->tag_value);
free(tag->src_string);
free(tag);
}
static t_par_pair init_param(const char* name, char* value) {
t_par_pair pair;
pair.name = name;
pair.value = (char*) malloc(strlen(value));
memset(pair.value, '\0', strlen(value));
strcpy(pair.value, value);
return pair;
}
static void add_parameter(PTagData tdata, const char* name, char* value) {
int last_cell = tdata->par_nr-1;
size_t old_size, new_size;
t_par_pair param = init_param(name, value);
//add parameter to tag data structure
old_size = sizeof(tdata->params);
new_size = old_size + sizeof(param);
tdata->params = (t_par_pair*) realloc(tdata->params, sizeof(t_par_pair)*(++tdata->par_nr));
tdata->params[last_cell+1] = param;
qsort(tdata->params, tdata->par_nr, sizeof(t_par_pair), compare_params);
}
static char* get_tag_opening(char* offset) {
size_t tag_length;
char* tag_end;
char* tag_text;
tag_end = strpbrk(offset, ">");
tag_length = tag_end - offset;
tag_text = (char*) malloc(tag_length);
tag_text[tag_length] = '\0';
memcpy(tag_text, offset, tag_length);
return tag_text;
}
/*
* Get tag value, ignore children
*/
static void set_tag_value_src(PTagData tag) {
char* src = tag->src_string;
char* offset = strpbrk(src, ">");
char* tag_ending;
char tag_end_clause[256];
char* tag_value;
size_t val_length;
sprintf(tag_end_clause, "</%s>", tag->tag_name);
tag_ending = strstr(tag->src_string, tag_end_clause);
val_length = tag_ending-offset;
tag_value = (char*) malloc(val_length);
memcpy(tag_value, ++offset, val_length);
tag->tag_value = tag_value;
tag->tag_value[val_length-1] = '\0';
}
char* get_param_by_name(PTagData tag, const char* name) {
t_par_pair temp_pair;
PParPair ret_pair;
temp_pair.name = name;
ret_pair = bsearch(&temp_pair, tag->params, tag->par_nr, sizeof(t_par_pair), compare_params);
return ret_pair->value;
}
/*
* Get all the relevant data from the opening
* of the tag
*/
static PTagData parse_tag_opening(char* offset) {
char* tag_opening = get_tag_opening(offset);
char* params_offset = strpbrk(tag_opening, " ");
char* tag_name;
char* pptr;
short int flag = 0;
unsigned char names[256][256];
unsigned char values[256][1024];
int n_count = -1;
int v_count = -1;
size_t name_length;// = params_offset - tag_opening;
PTagData tag;
if(params_offset==NULL)
name_length = strlen(tag_opening);
else name_length = params_offset - tag_opening;
tag_name = (char*) malloc(name_length);
tag_name[name_length] = '\0';
memcpy(tag_name, offset, name_length);
tag = tag_init(tag_name);
if(offset[name_length]!='>') {
pptr = strtok(params_offset, "\"=");
while(pptr!=NULL) {
if(pptr[0]==' ') ++pptr;
if(!flag) {
strcpy(names[++n_count], pptr);
flag = 1;
}
else {
strcpy(values[++v_count], pptr);
flag = 0;
}
pptr = strtok(NULL, "\"=");
}
for(n_count = 0; n_count<=v_count; n_count++) {
char* name = names[n_count];
char* value = values[n_count];
char* param_name = (char*) malloc(strlen(name)+1);
char* param_value = (char*) malloc(strlen(value)+1);
strcpy(param_name, name);
strcpy(param_value, value);
param_value[strlen(param_value)] = '\0';
add_parameter(tag, param_name, param_value);
}
}
free(tag_opening);
return tag;
}
static char* get_tag_ending(PTagData tag, char* offset) {
char s_string[256];
char* name = tag->tag_name;
char* tag_ending;
memset(s_string, '\0', strlen(s_string));
sprintf(s_string, "</%s>", name);
tag_ending = strstr(offset, s_string);
if(tag_ending==NULL) return NULL;
tag_ending = strpbrk(tag_ending, ">");
return ++tag_ending;
}
PTagData get_tag_data(char* offset) {
char* tag_start = strpbrk(offset, "<");
char* tag_end;
char* tag_string;
size_t tag_size;
PTagData tag = parse_tag_opening(++tag_start);
tag_end = get_tag_ending(tag, offset);
if(tag_end!=NULL) {
tag_size = tag_end - offset;
tag_string = (char*) malloc(tag_size);
tag->src_string = tag_string;
tag->start_offset = tag_start;
tag->end_offset = tag_end;
memcpy(tag_string, offset, tag_size);
tag_string[tag_size] = '\0';
set_tag_value_src(tag);
tag_children_src(tag);
}
return tag;
}
static void tag_append_child(PTagData parent, PTagData child) {
size_t pos = parent->child_count;
parent->children = (PTagData*) realloc(parent->children, ++parent->child_count*sizeof(PTagData));
parent->children[pos] = child;
}
/*
* Strip newlines prior to parsing
* the string
*/
void strip_chars(char* string, char* chars) {
char new_string[strlen(string)];
char* offset;
offset = strtok(string, chars);
while(offset!=NULL) {
strcat(new_string, offset);
offset = strtok(NULL, chars);
}
memset(string, '\0', strlen(string));
strcpy(string, new_string);
}
/*
* Iterate recursively through the inner tags
* and append them to the structure
*/
static void tag_children_src(PTagData tag) {
char* cptr = tag->tag_value;
while((cptr=strpbrk(cptr, "<"))!=NULL) {
PTagData child = get_tag_data(cptr);
tag_append_child(tag, child);
cptr = child->end_offset;
}
}
char* load_html(const char* path) {
size_t file_size;
FILE* html = fopen(path, "r");
if(html==NULL) {
fprintf(stderr, "File not found.\n");
return NULL;
}
fseek(html, 0, SEEK_END);
file_size = ftell(html);
rewind(html);
unsigned char* content = (unsigned char*) malloc(file_size);
memset(content, '\0', file_size);
fread(content, file_size, 1, html);
fclose(html);
return content;
}
/*
* Recursively traverse the DOM structure and
* perform the callback
*/
void tag_traverse(PTagData root, void(*callback)(PTagData tag)) {
int i;
if(root->child_count>0) {
for(i=0; i<root->child_count; i++) {
PTagData tag = root->children[i];
if(tag!=NULL) {
callback(tag);
if(tag->child_count) tag_traverse(tag, callback);
}
}
}
}
PTagData get_root_node(char* html) {
return get_tag_data(html);
}
The get_tag_data function is the entry point for all the further procedures. The parse_tag_opening function acquires the tag name and the parameters of the tag. The tag_children_src function gets all the data for the child nodes of a particular tag and, oddly, it seems to crash with the following HTML (a modified version of the example.com):