/* libcurl-based web crawler (curl multi interface + libxml2 href extraction) */

#ifndef LIBCURL_CRAWLER_HPP
#define LIBCURL_CRAWLER_HPP

// Crawler tuning knobs.
// NOTE(review): this file carries a header guard (LIBCURL_CRAWLER_HPP) but
// defines non-static globals — including it from more than one translation
// unit would produce duplicate-symbol link errors; confirm single-TU use.
int max_con = 200;              // cap on simultaneous connections (fed to CURLMOPT_MAX_TOTAL_CONNECTIONS)
int max_total = 20000;          // stop scheduling new links once completed+pending reaches this
int max_requests = 500;         // max link-following transfers in flight ("pending") at once
int max_link_per_page = 5;      // links queued per crawled page (see follow_links)
int follow_relative_links = 0;  // when non-zero, resolve relative hrefs against the page URL
char *start_page = "https://www.reuters.com";  // seed URL; NOTE(review): string literal bound to non-const char*

#include <math.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <curl/curl.h>

#include <libxml/HTMLparser.h>
#include <libxml/uri.h>
#include <libxml/xpath.h>


// Set asynchronously by the SIGINT handler, polled by the crawl loop.
// Must be volatile sig_atomic_t: it is the only object type the C standard
// guarantees can be safely written from a signal handler (CERT SIG31-C).
volatile sig_atomic_t pending_interrupt = 0;

// SIGINT handler: request a graceful stop of the crawl loop.
void sighandler(int dummy){
    (void)dummy;  // signal number unused
    pending_interrupt = 1;
}

/* Growable byte buffer used to accumulate one HTTP response body. */
typedef  struct{
    char *pBuf;
    size_t size;
} memory;

/*
 * libcurl CURLOPT_WRITEFUNCTION callback: append the incoming chunk to the
 * memory buffer passed via ctx (CURLOPT_WRITEDATA).
 * Returns the number of bytes consumed, or 0 on allocation failure, which
 * makes libcurl abort the transfer.
 */
size_t grow_buffer(void *contents,size_t sz,size_t nmemb,void *ctx){
    memory *pMem = (memory*)ctx;
    const size_t chunk = sz * nmemb;

    /* grow via a temporary so the old buffer survives a failed realloc */
    char *grown = (char*)realloc(pMem->pBuf, pMem->size + chunk);
    if(grown == NULL){
        printf("not enough memory (realloc returned null)\n");
        return 0;  /* short return tells libcurl to abort this transfer */
    }

    memcpy(grown + pMem->size, contents, chunk);
    pMem->pBuf = grown;
    pMem->size += chunk;
    return chunk;
}

/*
 * Create a configured easy handle for one crawl fetch of `url`.
 * Allocates a `memory` body buffer and attaches it as both WRITEDATA and
 * PRIVATE so the completion loop can recover and free it.
 * Returns NULL on allocation failure (curl_multi_add_handle rejects NULL
 * with an error code rather than crashing).
 */
CURL  *make_handle(char *url){
    CURL *pHandle = curl_easy_init();
    if(!pHandle)
        return NULL;

    //important use http2 over https
    curl_easy_setopt(pHandle,CURLOPT_HTTP_VERSION,CURL_HTTP_VERSION_2TLS);
    curl_easy_setopt(pHandle,CURLOPT_URL,url);  // libcurl copies the string

    //buffer body; ownership passes to the completion loop via CURLINFO_PRIVATE
    memory *pMem = (memory *)malloc(sizeof(memory));
    if(!pMem){
        curl_easy_cleanup(pHandle);
        return NULL;
    }
    pMem->size = 0;
    pMem->pBuf = (char*)malloc(1);
    if(!pMem->pBuf){
        free(pMem);
        curl_easy_cleanup(pHandle);
        return NULL;
    }
    curl_easy_setopt(pHandle,CURLOPT_WRITEFUNCTION,grow_buffer);
    curl_easy_setopt(pHandle,CURLOPT_WRITEDATA,pMem);
    curl_easy_setopt(pHandle,CURLOPT_PRIVATE,pMem);
    curl_easy_setopt(pHandle,CURLOPT_VERBOSE ,1L);
    //for completeness
    curl_easy_setopt(pHandle,CURLOPT_ACCEPT_ENCODING,"");
    curl_easy_setopt(pHandle,CURLOPT_TIMEOUT,5L);
    curl_easy_setopt(pHandle,CURLOPT_FOLLOWLOCATION,1L);
    curl_easy_setopt(pHandle,CURLOPT_MAXREDIRS,10L);
    curl_easy_setopt(pHandle,CURLOPT_CONNECTTIMEOUT,2L);
    curl_easy_setopt(pHandle,CURLOPT_COOKIEFILE,"");  // enable in-memory cookie engine
    curl_easy_setopt(pHandle,CURLOPT_USERAGENT,"mini crawler");
    curl_easy_setopt(pHandle,CURLOPT_HTTPAUTH,CURLAUTH_ANY);
    curl_easy_setopt(pHandle,CURLOPT_UNRESTRICTED_AUTH,1L);
    curl_easy_setopt(pHandle,CURLOPT_PROXYAUTH,CURLAUTH_ANY);
    curl_easy_setopt(pHandle,CURLOPT_EXPECT_100_TIMEOUT_MS,1L);
    // curl_easy_setopt is variadic: these MUST be passed as long, not int
    // (an int 0 here is undefined behavior on ABIs where long is wider).
    // WARNING(security): certificate verification is disabled — the crawler
    // will accept any TLS peer. Acceptable only for throwaway crawling.
    curl_easy_setopt(pHandle, CURLOPT_SSL_VERIFYPEER, 0L);
    curl_easy_setopt(pHandle, CURLOPT_SSL_VERIFYHOST, 0L);
    return pHandle;
}

//href finder implemented in libxml2 but could be any html parser
/*
 * Parse the HTML in `mem`, randomly sample up to max_link_per_page absolute
 * http(s) hrefs, and queue each as a new transfer on `multi_handle`.
 * Returns the number of transfers queued.
 *
 * Fixes vs. the original:
 *  - random index could equal nodeNr when rand() == RAND_MAX (out-of-bounds
 *    read of nodeTab); now scaled into [0, nodeNr).
 *  - pHref leaked on the short-link `continue` and on `break`.
 *  - the parsed document was never freed (xmlFreeDoc on every path).
 *  - `count++ == max` queued max+1 links; now queues exactly max.
 */
size_t follow_links(CURLM *multi_handle,memory * mem,char *url){
    int opts = HTML_PARSE_NOBLANKS |HTML_PARSE_NOERROR|HTML_PARSE_NOWARNING|HTML_PARSE_NONET;
    htmlDocPtr doc = htmlReadMemory(mem->pBuf,mem->size,url,NULL,opts);
    if(!doc)
        return 0;
    xmlChar *pXpath = (xmlChar*)"//a/@href";
    xmlXPathContextPtr context = xmlXPathNewContext(doc);
    xmlXPathObjectPtr result = xmlXPathEvalExpression(pXpath,context);
    xmlXPathFreeContext(context);
    if(!result){
        xmlFreeDoc(doc);
        return 0;
    }
    xmlNodeSetPtr nodeset = result->nodesetval;
    if(xmlXPathNodeSetIsEmpty(nodeset)){
        xmlXPathFreeObject(result);
        xmlFreeDoc(doc);
        return 0;
    }
    size_t count = 0;

    for(int i=0;i<nodeset->nodeNr;i++){
        // random sampling (duplicates possible, by design); index in [0, nodeNr)
        int x = (int)(((double)rand() / ((double)RAND_MAX + 1.0)) * nodeset->nodeNr);
        const xmlNode *pNode = nodeset->nodeTab[x]->xmlChildrenNode;
        xmlChar *pHref = xmlNodeListGetString(doc,pNode,1);
        if(follow_relative_links){
            xmlChar *pOrig = pHref;
            pHref = xmlBuildURI(pHref,(xmlChar*)url);
            xmlFree(pOrig);
        }
        char *link = (char*)pHref;
        if(!link)
            continue;
        if(strlen(link) < 20){  // crude filter for trivial/fragment links
            xmlFree(link);
            continue;
        }
        if(!strncmp(link,"http://",7) || !strncmp(link,"https://",8)){
            // make_handle's CURLOPT_URL copies the string, so freeing link below is safe
            curl_multi_add_handle(multi_handle,make_handle(link));
            if(++count >= (size_t)max_link_per_page){
                xmlFree(link);
                break;
            }
        }
        xmlFree(link);
    }
    xmlXPathFreeObject(result);
    xmlFreeDoc(doc);
    return count;
}

// Heuristic HTML check on a Content-Type header value.
// True only when the string is non-NULL, longer than 10 characters, and
// contains "text/html" (so a bare "text/html" with no parameters fails —
// behavior preserved from the original).
int is_html(char *ctype){
    if(ctype == NULL)
        return 0;
    if(strlen(ctype) <= 10)
        return 0;
    return strstr(ctype,"text/html") != NULL;
}

// Entry point: crawl from start_page using the curl multi interface until no
// transfers remain or the user interrupts with Ctrl-C. Always returns 0.
// NOTE(review): `pending` is decremented for every completed transfer but only
// incremented by links queued in follow_links — the seed page (and pages that
// yield no links) drive it negative; it still works as a rough throttle.
int test_crawler(){
    signal(SIGINT,sighandler);  // allow graceful stop via Ctrl-C
    LIBXML_TEST_VERSION;        // abort if libxml2 headers/runtime mismatch
    curl_global_init(CURL_GLOBAL_ALL);
    CURLM *pMulti_handle = curl_multi_init();
    // NOTE(review): curl_multi_setopt is variadic and expects a long here;
    // max_con is an int — confirm/cast ((long)max_con) on LP64 targets.
    curl_multi_setopt(pMulti_handle,CURLMOPT_MAX_TOTAL_CONNECTIONS,max_con);
    curl_multi_setopt(pMulti_handle,CURLMOPT_MAX_HOST_CONNECTIONS,6L);

    //enable http/2 multiplexing if available
#ifdef CURLPIPE_MULTIPLEX
    curl_multi_setopt(pMulti_handle,CURLMOPT_PIPELINING,CURLPIPE_MULTIPLEX);
#endif

    //sets html start page
    curl_multi_add_handle(pMulti_handle,make_handle(start_page));

    int msgs_left;
    int pending = 0;    // transfers queued by follow_links, used as a throttle
    int complete = 0;   // transfers finished (success or failure)
    int still_running = 1;
    while (still_running && !pending_interrupt) {
        int numfds;
        // block up to 1s for socket activity, then drive the transfers
        curl_multi_wait(pMulti_handle,NULL,0,1000,&numfds);
        curl_multi_perform(pMulti_handle,&still_running);

        //see how the finished transfers went
        CURLMsg *m = NULL;
        while ((m = curl_multi_info_read(pMulti_handle,&msgs_left))) {
            if(m->msg == CURLMSG_DONE){
                CURL *pHandle = m->easy_handle;
                char *url;
                memory *pMem;
                // recover the body buffer stashed by make_handle via CURLOPT_PRIVATE
                curl_easy_getinfo(pHandle,CURLINFO_PRIVATE,&pMem);
                curl_easy_getinfo(pHandle,CURLINFO_EFFECTIVE_URL,&url);
                if(m->data.result == CURLE_OK){
                    long res_status;
                    curl_easy_getinfo(pHandle,CURLINFO_RESPONSE_CODE,&res_status);
                    if(res_status == 200){
                        char *pCtype;
                        curl_easy_getinfo(pHandle,CURLINFO_CONTENT_TYPE,&pCtype);
                        printf("[%d] HTTP 200 (%s): %s\n",complete,pCtype,url);
                        // only recurse into HTML bodies of non-trivial size,
                        // and only while under the in-flight and total caps
                        if(is_html(pCtype) &&pMem->size > 100){
                            if(pending < max_requests && (complete + pending) < max_total){
                                pending += follow_links(pMulti_handle,pMem,url);
                                still_running = 1;  // new handles added; keep looping
                            }
                        }
                    }else{
                        printf("[%d] HTTP %d: %s\n",complete,(int) res_status,url);
                    }
                }else{
                    printf("[%d] Connection failure: %s\n",complete,url);
                }
                // retire the transfer and release the body buffer from make_handle
                curl_multi_remove_handle(pMulti_handle,pHandle);
                curl_easy_cleanup(pHandle);
                free(pMem->pBuf);
                free(pMem);
                complete ++;
                pending --;
            }
        }
    }
    curl_multi_cleanup(pMulti_handle);
    curl_global_cleanup();
    return 0;

}


#endif // LIBCURL_CRAWLER_HPP

 
