Thread: parsing images and links with curl

Threaded View

Previous Post Previous Post   Next Post Next Post
  1. #1
    Registered User
    Join Date
    May 2024
    Posts
    4

    parsing images and links with curl

    This code isn't working, and I don't understand why. It is supposed to collect all images and links on the site and indicate the pages where they are located, but for some reason, it is not functioning properly. If anyone can help, I would appreciate any tips on how to fix this code.

    Code:
    #include <iostream>
    #include <string>
    #include <vector>
    #include <curl/curl.h>
    #include <regex>
    
    
    using namespace std;
    
    
    // Write callback handed to libcurl via CURLOPT_WRITEFUNCTION.
    //
    // contents: start of the chunk that was just received.
    // size, nmemb: element size and element count; the chunk is size * nmemb bytes.
    // userp: the pointer registered with CURLOPT_WRITEDATA, which must point
    //        to a std::string that accumulates the response body.
    //
    // Returns the number of bytes consumed, which is always the whole chunk.
    size_t WriteCallback(void* contents, size_t size, size_t nmemb, void* userp) {
        const size_t chunk_bytes = size * nmemb;
        std::string* sink = static_cast<std::string*>(userp);
        const char* chunk = static_cast<const char*>(contents);
        sink->append(chunk, chunk_bytes);
        return chunk_bytes;
    }
    
    
    string fetch_url(const string& url) {
        CURL* curl;
        CURLcode res;
        string readBuffer;
    
    
        curl = curl_easy_init();
        if(curl) {
            curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
            curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback);
            curl_easy_setopt(curl, CURLOPT_WRITEDATA, &readBuffer);
            res = curl_easy_perform(curl);
            curl_easy_cleanup(curl);
        }
        return readBuffer;
    }
    
    
    // Extracts the `src` attribute of every <img> tag found in `html`.
    //
    // Returns one (image URL, page_url) pair per match, in document order.
    // URLs are returned exactly as written in the markup (relative URLs are
    // not resolved against page_url).
    //
    // Matching rules:
    //   - tag and attribute names are case-insensitive;
    //   - the value may be wrapped in double or single quotes;
    //   - `src` must be a standalone attribute, so names that merely end in
    //     "src" (e.g. data-src) are ignored;
    //   - empty values are skipped.
    std::vector<std::pair<std::string, std::string>> parse_images(const std::string& html, const std::string& page_url) {
        // Compiled once: constructing a std::regex is far more expensive than using it.
        static const std::regex img_regex(
            R"re(<img\s+(?:[^>]*?\s)?src\s*=\s*(?:"([^"]+)"|'([^']+)'))re",
            std::regex::icase);

        std::vector<std::pair<std::string, std::string>> images;
        const std::sregex_iterator no_more;
        for (std::sregex_iterator it(html.cbegin(), html.cend(), img_regex); it != no_more; ++it) {
            const std::smatch& match = *it;
            // Group 1 holds a double-quoted value, group 2 a single-quoted one.
            const std::ssub_match& value = match[1].matched ? match[1] : match[2];
            images.emplace_back(value.str(), page_url);
        }
        return images;
    }
    
    
    // Extracts the `href` attribute of every <a> tag found in `html`.
    //
    // Returns one (link URL, page_url) pair per match, in document order.
    // URLs are returned exactly as written in the markup (relative URLs are
    // not resolved against page_url).
    //
    // Matching rules:
    //   - tag and attribute names are case-insensitive;
    //   - only real anchor tags match: "<a" must be followed by whitespace,
    //     so <abbr>, <area>, <address> and similar are ignored;
    //   - the value may be wrapped in double or single quotes;
    //   - `href` must be a standalone attribute, so names that merely end in
    //     "href" (e.g. data-href) are ignored;
    //   - empty values are skipped.
    std::vector<std::pair<std::string, std::string>> parse_links(const std::string& html, const std::string& page_url) {
        // Compiled once: constructing a std::regex is far more expensive than using it.
        static const std::regex link_regex(
            R"re(<a\s+(?:[^>]*?\s)?href\s*=\s*(?:"([^"]+)"|'([^']+)'))re",
            std::regex::icase);

        std::vector<std::pair<std::string, std::string>> links;
        const std::sregex_iterator no_more;
        for (std::sregex_iterator it(html.cbegin(), html.cend(), link_regex); it != no_more; ++it) {
            const std::smatch& match = *it;
            // Group 1 holds a double-quoted value, group 2 a single-quoted one.
            const std::ssub_match& value = match[1].matched ? match[1] : match[2];
            links.emplace_back(value.str(), page_url);
        }
        return links;
    }
    
    
    // Fetches one page and lists every image and link found on it, each
    // paired with the URL of the page it was found on.
    //
    // Usage: program [url]
    //   url - page to scan; defaults to https://www.cprogramming.com
    //
    // Exit status: 0 on success, 1 when no content could be fetched.
    int main(int argc, char* argv[]) {
        const std::string url = (argc > 1) ? argv[1] : "https://www.cprogramming.com";

        const std::string html = fetch_url(url);
        if (html.empty()) {
            // Without this check a failed download looks identical to a page
            // that simply has no images or links.
            std::cerr << "No content received from " << url << "; nothing to parse.\n";
            return 1;
        }

        const std::vector<std::pair<std::string, std::string>> images = parse_images(html, url);
        const std::vector<std::pair<std::string, std::string>> links = parse_links(html, url);

        std::cout << "Images found:\n";
        for (const auto& img : images) {
            std::cout << "Image URL: " << img.first << ", Page URL: " << img.second << "\n";
        }

        std::cout << "\nLinks found:\n";
        for (const auto& link : links) {
            std::cout << "Link URL: " << link.first << ", Page URL: " << link.second << "\n";
        }

        return 0;
    }
    Last edited by Salem; 2 Weeks Ago at 10:58 AM. Reason: Fixed URL

Popular pages Recent additions subscribe to a feed

Similar Threads

  1. Help with cURL segfault
    By willc0de4food in forum C Programming
    Replies: 2
    Last Post: 08-07-2012, 09:25 AM
  2. Replies: 1
    Last Post: 01-25-2012, 11:50 AM
  3. curl problem
    By pramodjha in forum C Programming
    Replies: 1
    Last Post: 03-29-2011, 06:20 AM
  4. cURL and piping
    By carrotcake1029 in forum Windows Programming
    Replies: 1
    Last Post: 09-25-2010, 02:35 PM
  5. String parsing(parsing comments out of HTML file)
    By slcjoey in forum C# Programming
    Replies: 0
    Last Post: 07-29-2006, 08:28 PM

Tags for this Thread