• 抓取url的超链接程序(使用到libcurl和libxml2)


    抓取url的超链接程序(使用到libcurl和libxml2)

    分类: Linux程序设计(C/C++) 215人阅读 评论(0) 收藏 举报

    抓取url的超链接程序(使用到libcurl和libxml2)

    写了一个C++小程序,当作练习。
    C++文件:
    #include <iostream>
    #include <stdio.h>
    #include <string.h>
    #include <stdlib.h>
    #include <string>
    #include <vector>
    #include <curl/curl.h>
    #include <libxml/HTMLparser.h>

    using namespace std;

    // COMPARE(a, b): true when the two C strings are equal ignoring case.
    // Picks stricmp when _MSC_VER is defined and strcasecmp otherwise.
    #ifdef _MSC_VER
    #define COMPARE(a, b) (!stricmp((a), (b)))
    #else
    #define COMPARE(a, b) (!strcasecmp((a), (b)))
    #endif

    // One extracted hyperlink: the target URL and the text shown inside the anchor.
    struct LinkStringDefined
    {
        string url;          // value taken from the anchor's attribute list
        string anthor_text;  // text collected between <a> and </a>
    };
    typedef LinkStringDefined LinkString;

    typedef struct ContextDefined
    {
        ContextDefined(): addTitle(false) { }
        bool addTitle;
        string title;
        string url;
        vector<LinkString> terms;
    } Context;

    // Buffer registered with libcurl through CURLOPT_ERRORBUFFER; printed when a
    // request fails.
    static char errorBuffer[CURL_ERROR_SIZE];
    // Receives the downloaded page: registered as CURLOPT_WRITEDATA and later
    // handed to parseHtml().
    static string buffer;
    // libcurl write callback: appends each received chunk to the given string.
    static int writer(char *, size_t, size_t, string *);
    // Creates the easy handle and sets its options for the given URL.
    static bool init(CURL *&, char *);
    // Parses an HTML document and stores the links it finds in the vector.
    static void parseHtml(const string &, vector<LinkString> &);
    // SAX callbacks installed in saxHandler; the void * argument is the Context.
    static void StartElement(void *, const xmlChar *, const xmlChar **);
    static void EndElement(void *, const xmlChar *);
    static void Characters(void *, const xmlChar *, int);
    static void CdataBlock(void *, const xmlChar *, int);

    int main(int argc, char* argv[])
    {
        CURL *conn = NULL;
        CURLcode code;
        vector<LinkString> arr;
        if (argc != 2)
        {
            fprintf(stderr, "Usage: %s <url>/n", argv[0]);
            exit(EXIT_FAILURE);
        }
        curl_global_init(CURL_GLOBAL_DEFAULT);
        if (!init(conn, argv[1]))
        {
            fprintf(stderr, "Connection initializion failed/n");
            exit(EXIT_FAILURE);
        }
        code = curl_easy_perform(conn);
        curl_easy_cleanup(conn);
        if (code != CURLE_OK)
        {
            fprintf(stderr, "Failed to get '%s' [%s]/n", argv[1], errorBuffer);
            exit(EXIT_FAILURE);
        }
        parseHtml(buffer, arr);
        int arr_size = arr.size();
        for(int i = 0; i < arr_size; i ++)
        {
            cout << arr[i].anthor_text << "/t" << arr[i].url << endl;
        }
        return 0;
    }

    static bool init(CURL *&conn, char *url)
    {
        CURLcode code;
        conn = curl_easy_init();
        if (conn == NULL)
        {
            fprintf(stderr, "Failed to create CURL connection/n");
            exit(EXIT_FAILURE);
        }
        code = curl_easy_setopt(conn, CURLOPT_ERRORBUFFER, errorBuffer);
        if (code != CURLE_OK)
        {
            fprintf(stderr, "Failed to set error buffer [%d]/n", code);
            return false;
        }
        code = curl_easy_setopt(conn, CURLOPT_URL, url);
        if (code != CURLE_OK)
        {
            fprintf(stderr, "Failed to set URL [%s]/n", errorBuffer);
            return false;
        }
        code = curl_easy_setopt(conn, CURLOPT_FOLLOWLOCATION, 1);
        if (code != CURLE_OK)
        {
            fprintf(stderr, "Failed to set redirect option [%s]/n", errorBuffer);
            return false;
        }
        code = curl_easy_setopt(conn, CURLOPT_WRITEFUNCTION, writer);
        if (code != CURLE_OK)
        {
            fprintf(stderr, "Failed to set writer [%s]/n", errorBuffer);
            return false;
        }
        code = curl_easy_setopt(conn, CURLOPT_WRITEDATA, &buffer);
        if (code != CURLE_OK)
        {
            fprintf(stderr, "Failed to set write data [%s]/n", errorBuffer);
            return false;
        }
        return true;
    }

    // Write callback: appends size * nmemb bytes from `data` to `*writerData`.
    // Returns the number of bytes appended, or 0 when no target string is given.
    // NOTE(review): libcurl documents this callback as returning size_t; the int
    // return type is kept here because it must match the prototype declared
    // earlier in the file.
    static int writer(char *data, size_t size, size_t nmemb, string *writerData)
    {
        if (!writerData)
        {
            return 0;
        }

        const unsigned long long byteCount = size * nmemb;
        writerData->append(data, byteCount);
        return byteCount;
    }

    // SAX handler table passed to htmlCreatePushParserCtxt(). The initializer is
    // positional, so the order of the entries matters; only four callbacks are
    // installed and every other slot is left NULL.
    // NOTE(review): the slot names in the comments below follow libxml2's
    // xmlSAXHandler field order — confirm against the installed headers.
    static htmlSAXHandler saxHandler =
    {
        NULL,
        NULL,
        NULL,
        NULL,
        NULL,
        NULL,
        NULL,
        NULL,
        NULL,
        NULL,
        NULL,
        NULL,
        NULL,
        NULL,
        StartElement,  // startElement
        EndElement,    // endElement
        NULL,
        Characters,    // characters
        NULL,
        NULL,
        NULL,
        NULL,
        NULL,
        NULL,
        NULL,
        CdataBlock,    // cdataBlock
        NULL
    };

    static void parseHtml(const string &html, vector<LinkString> &arr)
    {
        htmlParserCtxtPtr ctxt;
        Context context;
        ctxt = htmlCreatePushParserCtxt(&saxHandler, &context, "", 0, "", XML_CHAR_ENCODING_NONE);
        htmlParseChunk(ctxt, html.c_str(), html.size(), 0);
        htmlParseChunk(ctxt, "", 0, 1);
        htmlFreeParserCtxt(ctxt);
        arr = context.terms;
    }

    static void StartElement(void *voidContext, const xmlChar *name, const xmlChar **attributes)
    {
    //    int sz_att = sizeof(**attributes) / sizeof(xmlChar);
        Context *context = (Context *)voidContext;
        if (COMPARE((char *)name, "a"))
        {
            context->url = (char *)attributes[1];
            context->title = "";
            context->addTitle = true;
        }
    }

    static void EndElement(void *voidContext, const xmlChar *name)
    {
        Context *context = (Context *)voidContext;
        if (COMPARE((char *)name, "a"))
        {
            context->url = "";
            context->addTitle = false;
        }
    }

    // Adds a run of character data to the anchor currently being collected.
    //
    // The parser may deliver the text of one anchor in several calls (for
    // example around nested inline elements). The first run creates the entry
    // in context->terms; later runs extend that same entry, so each anchor
    // yields one entry instead of one per call with partial text.
    //
    // Does nothing when no anchor is being collected or the run is empty.
    static void handleCharacters(Context *context, const xmlChar *chars, int length)
    {
        if (!context->addTitle || chars == NULL || length <= 0)
        {
            return;
        }

        // title is cleared when an anchor opens, so a non-empty title means an
        // earlier run of this anchor has already pushed the last entry in terms.
        const bool continuation = !context->title.empty();
        context->title.append((const char *)chars, length);

        if (continuation && !context->terms.empty())
        {
            context->terms.back().anthor_text = context->title;
            return;
        }

        LinkString linkString;
        linkString.anthor_text = context->title;
        linkString.url = context->url;
        context->terms.push_back(linkString);
    }

    static void Characters(void *voidContext, const xmlChar *chars, int length)
    {
        Context *context = (Context *)voidContext;
        handleCharacters(context, chars, length);
    }

    static void CdataBlock(void *voidContext, const xmlChar *chars, int length)
    {
        Context *context = (Context *)voidContext;
        handleCharacters(context, chars, length);
    }

    Makefile文件:
    # Builds get_url_info from get_url_info.cpp, linking against libcurl and libxml2.
    # NOTE: in the real Makefile every recipe line must start with a TAB character.
    CXX = g++

    WARNING = -Wall
    GDBDEBUG = -g
    # Libraries are listed here and placed AFTER the object files on the link
    # line: linkers that resolve symbols left to right (static libraries,
    # --as-needed) drop libraries that appear before the objects that need them.
    LIBS = -L. -lcurl -lxml2
    DEFAULT_INCLUDE = -I. -I/usr/include
    ADDED_INCLUDE= -I /usr/include/libxml2
    OPTIMIZE = -O2

    allprog = get_url_info
    object1 = get_url_info.o
    complied = $(CXX) $(GDBDEBUG) $(WARNING) $(OPTIMIZE) $(DEFAULT_INCLUDE) $(ADDED_INCLUDE) -c $*.cpp

    all : ${allprog}
    .cpp.o :
        $(complied)
    get_url_info : $(object1)
        $(CXX) $(GDBDEBUG) $(WARNING) $(OPTIMIZE) -o get_url_info $(object1) $(LIBS)

    .PHONY : all cleanall clean cleanobj
    cleanall :
        -rm *.o ${allprog}
    clean :
        -rm $(object1) get_url_info
    cleanobj :
        -rm *.o
  • 相关阅读:
    读 Kafka 源码写优雅业务代码:配置类
    如何安装FTP服务器,并实现文件共享
    Merge into用法总结
    Insomnia 跟 Postman 类似的软件
    iOS dealloc中初始化weak指针崩溃防护
    Centos7安装febootstrap
    获取 linux 系统 CPU、内存、磁盘 IO 等信息的脚本
    Git本地远程仓库
    网络及服务故障的排查思路
    Git配置远程仓库(密匙链接)
  • 原文地址:https://www.cnblogs.com/moonvan/p/2174461.html
Copyright © 2020-2023  润新知