I'm trying to solve that problem two days in a row, and at this point i'm completely lost, so i'm asking for help.
The program is some sort of web crawler built with libcurl and pcre.
Struct which handles http response body is
Code:
// struct to save fetch html data to
struct html_data {
char *html;
size_t size;
};
And here is a function which is failing
Code:
char **postings_urls (struct html_data *html_response_data) {
// init array of strings
// there are no more than 100 urls
// and 1 for storing "\0"
char **matched_urls;
matched_urls = calloc(101 , sizeof(*matched_urls));
// setup pcre
int matches_count = 0;
int erroffset , rc , i;
const char *error;
char *numberurl_pattern = "\\d{5,}\\.html";
int ovector[OVECCOUNT];
// init pcre handle
pcre *re;
// compile the pattern
re = pcre_compile(
numberurl_pattern, /* the pattern */
0, /* default options */
&error, /* for error message */
&erroffset, /* for error offset */
NULL /* use default character tables */
);
if (re == NULL) {
printf("PCRE compilation failed at offset %d: %s\n", erroffset, error);
exit(1);
}
// try to match
rc = pcre_exec(
re, /* the compiled pattern */
NULL, /* no extra data - we didn't study the pattern */
html_response_data->html, /* the subject string */
html_response_data->size, /* the length of the subject */
0, /* start at offset 0 in the subject */
0, /* default options */
ovector, /* output vector for substring information */
OVECCOUNT /* number of elements in the output vector */
);
/*
Show substrings stored in the output vector by number. Obviously, in a real
application you might want to do things other than print them.
*/
// match succeded
if (rc > 0) {
i = 0;
// get the string start and length
char *substring_start = html_response_data->html + ovector[2*i];
size_t substring_length = ovector[2*i+1] - ovector[2*i];
// save formated string to temp variable
char *temp = malloc(substring_length * sizeof(char));
sprintf(temp , "%.*s", substring_length, substring_start);
// allocate memory for string
matched_urls[matches_count] = malloc(strlen(temp) + 1);
// copy from temp string
strcpy(matched_urls[matches_count], temp);
// free memory used by temp string
free(temp);
// increase matches count
matches_count++;
}
for (;;) {
int options = 0; /* Normally no options */
int start_offset = ovector[1]; /* Start at end of previous match */
/* If the previous match was for an empty string, we are finished if we are
at the end of the subject. Otherwise, arrange to run another match at the
same point to see if a non-empty match can be found. */
if (ovector[0] == ovector[1]) {
if (ovector[0] == html_response_data->size)
break;
options = PCRE_NOTEMPTY | PCRE_ANCHORED;
}
/* Run the next matching operation */
rc = pcre_exec(
re, /* the compiled pattern */
NULL, /* no extra data - we didn't study the pattern */
html_response_data->html, /* the subject string */
html_response_data->size, /* the length of the subject */
start_offset, /* starting offset in the subject */
options, /* options */
ovector, /* output vector for substring information */
OVECCOUNT /* number of elements in the output vector */
);
/* This time, a result of NOMATCH isn't an error. If the value in "options"
is zero, it just means we have found all possible matches, so the loop ends.
Otherwise, it means we have failed to find a non-empty-string match at a
point where there was a previous empty-string match. In this case, we do what
Perl does: advance the matching position by one, and continue. We do this by
setting the "end of previous match" offset, because that is picked up at the
top of the loop as the point at which to start again. */
if (rc == PCRE_ERROR_NOMATCH) {
if (options == 0)
break;
ovector[1] = start_offset + 1;
continue; /* Go round the loop again */
}
/* Other matching errors are not recoverable. */
if (rc < 0) {
printf("Matching error %d\n", rc);
exit(1);
}
/* Match succeded */
//printf("\nMatch succeeded again at offset %d\n", ovector[0]);
/* The match succeeded, but the output vector wasn't big enough. */
if (rc == 0) {
rc = OVECCOUNT/3;
printf("ovector only has room for %d captured substrings\n", rc - 1);
}
/* As before, show substrings stored in the output vector by number, and then
also any named substrings. */
for (i = 0; i < rc; i++) {
// get the string start and length
char *substring_start = html_response_data->html + ovector[2*i];
size_t substring_length = ovector[2*i+1] - ovector[2*i];
// save formated string to temp variable
char *temp = malloc(substring_length * sizeof(char));
sprintf(temp , "%.*s", substring_length, substring_start);
// allocate memory for string
matched_urls[matches_count] = malloc(strlen(temp) + 1);
// copy from temp string
strcpy(matched_urls[matches_count], temp);
// free memory used by temp string
free(temp);
// increase matches count
matches_count++;
}
}
matched_urls[++matches_count] = '\0';
return matched_urls;
}
I'm calling it like this
Code:
char **pu = postings_urls(&html_response_data);
Now, it fails very differently. Sometime i can call it like 10 times with different content (html_response_data), and recieve what i needed and sometimes it's failing just after the 2/3/4 call, it's completley unpredictable.
When i comment the following lines:
Code:
// allocate memory for string
//matched_urls[matches_count] = malloc(strlen(temp) + 1);
// copy from temp string
//strcpy(matched_urls[matches_count], temp);
everything starts to work just fine. So i believe that the reason for the error is wrong memory allocation. I've tried different allocation algorithms, but no success.
Help!