• 关于从网页上下载数据


      1 #include "curl/curl.h"  
      2 
      3 #include <io.h> 
      4 #include <direct.h>  
      5 #include <stdio.h>
      6 #include <string>
      7 #include <fstream>
      8 #include <stdlib.h> 
      9 
     10 #pragma comment(lib, "libcurl.lib")   
     11 #pragma comment(lib, "wldap32.lib")   
     12 #pragma comment(lib, "ws2_32.lib")   
     13 #pragma comment(lib, "winmm.lib")   
     14 
     // libcurl write callback: appends one received chunk to an open stdio stream.
     //
     // ptr    - start of the received data
     // size   - size of one element (libcurl documents this as always 1)
     // nmemb  - number of elements in the chunk
     // stream - destination stream, as registered through CURLOPT_WRITEDATA
     //
     // Returns the number of BYTES written. libcurl treats any value other than
     // size * nmemb as a write error and aborts the transfer, so returning 0 for
     // a missing buffer or stream fails the transfer cleanly instead of crashing
     // inside fwrite().
     size_t write_func(void *ptr, size_t size, size_t nmemb, FILE *stream)
     {
         if (ptr == nullptr || stream == nullptr)
             return 0;

         // Write byte-wise so the return value is a byte count, which is the unit
         // libcurl compares against size * nmemb.
         return fwrite(ptr, 1, size * nmemb, stream);
     }
     19 
     20 
     // Result of the most recent anchor parse: the link text and the href value.
     // Written by get_name_url() and by linkextract().
     std::string analyze_name;
     std::string analyze_url;
     // Not read by any function visible in this listing (the functions below use
     // same-named parameters or locals instead); kept for compatibility.
     std::string cplocation;
     std::string httpurl;

     // Extracts the relative address and the display name of the first link in a
     // line of HTML.
     //
     // For a line containing  <a href="URL" ...>NAME<  the function stores URL in
     // the global analyze_url and NAME in the global analyze_name. NAME is the
     // text between the '>' that closes the opening tag and the next '<'.
     //
     // If the line does not contain a complete, double-quoted anchor of that form,
     // both globals are left empty so that callers never see a stale result.
     void get_name_url(std::string line)
     {
         const std::string head = "<a href=";
         const std::string::size_type npos = std::string::npos;

         analyze_url.clear();
         analyze_name.clear();

         const std::string::size_type anchor = line.find(head);
         if (anchor == npos)
             return;

         // The attribute value must start with a double quote right after "href=".
         const std::string::size_type quote = anchor + head.size();
         if (quote >= line.size() || line[quote] != '"')
             return;

         const std::string::size_type urlBegin = quote + 1;
         const std::string::size_type urlEnd = line.find('"', urlBegin);
         if (urlEnd == npos)
             return;

         // Search for the end of the opening tag instead of assuming it directly
         // follows the closing quote, so extra attributes do not leak into the name.
         const std::string::size_type tagEnd = line.find('>', urlEnd + 1);
         if (tagEnd == npos)
             return;

         const std::string::size_type nameBegin = tagEnd + 1;
         const std::string::size_type nameEnd = line.find('<', nameBegin);
         if (nameEnd == npos)
             return;

         analyze_url = line.substr(urlBegin, urlEnd - urlBegin);
         analyze_name = line.substr(nameBegin, nameEnd - nameBegin);
     }
     57 
     58 //根据http网址下载网页到本机指定位置
     59 int http_get(std::string httpurl, std::string location)
     60 {
     61     CURL *curl;
     62     CURLcode res;
     63     FILE *outfile;
     64     errno_t err;
     65     err = fopen_s(&outfile, location.c_str(), "wb");
     66     //outfile = fopen(location.c_str(), "wb");
     67     const char *url = httpurl.c_str();
     68     curl = curl_easy_init();
     69     if (curl)
     70     {
     71         curl_easy_setopt(curl, CURLOPT_URL, url);
     72         curl_easy_setopt(curl, CURLOPT_WRITEDATA, outfile);
     73         curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_func);
     74         res = curl_easy_perform(curl);
     75         fclose(outfile);
     76         curl_easy_cleanup(curl);
     77     }
     78     return 0;
     79 }
     80 
     81 
     82 int linkextract(std::string httpfile, std::string httpurl, std::string cplocation)
     83 {
     84     std::ifstream infile;
     85     std::string line, link, name;
     86     infile.open(httpfile, std::ios::in);
     87     std::string directory = "alt="[DIR]"";
     88     std::string file = "alt="[   ]"";
     89 
     90     while (!infile.eof())
     91     {
     92         getline(infile, line, '
    ');
     93         //if (line.find(directory) != -1)//目录
     94         //{
     95         //    get_name_url(line);
     96         //    analyze_name = analyze_name.substr(0, analyze_name.length() - 1);
     97         //    std::string content = cplocation + "/" + analyze_name;
     98         //    _mkdir((char*)content.c_str());
     99         //    httpurl = httpurl + analyze_url;
    100         //    cplocation = cplocation + "/" + analyze_name;
    101         //    http_get(httpurl, cplocation + "/" + analyze_name + ".txt");
    102         //    linkextract(cplocation + "/" + analyze_name + ".txt", httpurl, cplocation);
    103         //}
    104         //else if (line.find(file) != -1)//文件
    105         {
    106             //get_name_url(line);
    107             if (line.find("href="Tile")==-1)
    108                 continue;
    109 
    110             char *linkhead = "<a href=";
    111             char *linkmiddle = """;
    112             char *linktail = "</a>";
    113             std::string head = "<a href=";
    114 
    115             char *s = strstr((char*)line.data(), linkhead);
    116             for (int i = 0; i<9; i++)
    117                 *s++;
    118             int urloffset = 0;
    119             while (*s != *linkmiddle)
    120             {
    121                 *s++;
    122                 urloffset++;
    123             }
    124             analyze_url = line.substr(line.find(head) + 9, urloffset);
    125             for (int i = 0; i<2; i++)
    126                 *s++;
    127             int nameoffset = 0;
    128             while (*s != *linktail)
    129             {
    130                 *s++;
    131                 nameoffset++;
    132             }
    133             analyze_name = line.substr(line.find(head) + 11 + urloffset, nameoffset);
    134 
    135 
    136             http_get(httpurl + analyze_url, cplocation + "/" + analyze_name);
    137         }
    138         /*else
    139             continue;*/
    140     }
    141     infile.close();
    142 
    143     return 0;
    144 }
    145 
    146 
    147 
    148 
    149 int main()
    150 {
    151     std::string httpurl = "http://服务地址";
    152     std::string content = "../DownloadDirectory";
    153     _mkdir((char*)content.c_str());
    154     std::string localdir = "../DownloadDirectory/DownloadDirectory.txt";
    155     http_get(httpurl, localdir);
    156     linkextract(localdir, httpurl, content);
    157     return 0;
    158 }

    局限:该程序只能下载 href 属性中的链接文件(代码中仅处理 href 以 Tile 开头的链接),而且很多网页无法下载;

    使用前需要先下载并编译 curl(libcurl),并在工程中配置其头文件和库文件路径;网上有很好的 curl 下载、编译教程博客,但原链接暂时找不到了;

  • 相关阅读:
    SQL Server 配置管理器不见了
    SQL常用函数使用
    数据库优化
    关于百度地图js api的getCurrentPosition定位不准确的解决方法
    uni-app中使用外部字体
    不定高度,元素垂直居中
    css实现的鼠标悬浮提示
    导入
    下载
    百度地图在地图上标点显示数字
  • 原文地址:https://www.cnblogs.com/ningmouming/p/12100447.html
Copyright © 2020-2023  润新知