• nodejs爬虫笔记(二)---代理设置


    node爬虫代理设置

    最近想爬取YouTube上面的视频信息,利用nodejs爬虫笔记(一)的方法,代码和错误如下

    var request = require(‘request’);
    var cheerio = require(‘cheerio’);****
    var url = ‘https://www.youtube.com ‘;
    function crawler(url,callback){
        var list = [];
        request(url,function(err,res){
            if(err){
                return callback(err);
            }
            var $ = cheerio(res.body.toString());
            var s = $(’*’).text();
            console.log(‘s=’+s);
        });
        callback(null,list);
    };
    
    crawler(url,function(err,list){
        if(err){
            return console.log(err);
        }
        console.log(list);
    });
    错误
    { [Error: connect ETIMEDOUT 8.7.198.45:443 ] code: ‘ETIMEDOUT’, errno: ‘ETIMEDOUT’, syscall: ‘connect’, address: '8.7.198.45', port: 443 }

    由于国内访问youtube的时候需要翻墙,而在代码里我们需要通过设置代理才能获取页面信息。

    1、通过nodejs的http/https模块

    具体使用可以参考http://nodejs.cn/api/http.html#http_http_request_options_callback,我使用的lantern作为翻墙工具。

    var http = require('http'); // 使用http模块,也可以换成https模块
    var opt = {
        host: '127.0.0.1', // 这里是代理服务器的地址
        port: '57939', // 这里是代理服务器的端口号
        method: 'GET', // 这里是发送的方法
        path: 'https://www.youtube.com', // 这里是访问的路径
        headers: {
    //请求头(可以利用Google浏览器打开youtube首页,点击network 查看请求头,把相关信息复制过来即可)

          'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
          'Accept-Encoding':'gzip, deflate, sdch, br',
          'Accept-Language':'zh-CN,zh;q=0.8,zh-TW;q=0.6',
          'Cache-Control':'max-age=0',
          'Cookie':'_ga=GA1.2.1653214693.1476773935; VISITOR_INFO1_LIVE=T3BczuPUIQo; SID=5QR6XEldVgveXzFtqjIcD480cHE18gBRd3xPo398vndcc5JNxOAZ-TgVp5jQx         3CR-ePvgA.; HSID=APr2I8UwM-A-Lypbd; SSID=Ap4H3Td1nrV__-9tN; APISID=8bHyFV90pNBU5Z9p/A2DlJa2MyJLL4-RKP; SAPISID=4tZf4GDX7Dt5bNAt/A5vhaZe_DLzn            -ECul; CONSENT=YES+CN.zh-CN+20160904-14-0; YSC=XVHk_pArWhE; PREF=cvdm=grid&f1=50000000&f6=1&f5=30&al=zh-CN&gl=HK',
          'Upgrade-insecure-requests':'1',
          'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.59 Safari/537.36',
          'X-Chrome-Uma-Enabled':'1',
          'X-Client-Data':'CJa2yQEIorbJAQjBtskBCKmdygE=',
          'Connection': 'keep-alive'

         }

    };
    
    var body = '';
    var req = http.request(opt, function(res) {
        console.log("Got response: " + res.statusCode);
        res.on('data', function(d) {
            body += d;
        }).on('end', function() {
            //console.log(res);
            console.info('============');
    
            console.log(body)
        });
    
    }).on('error', function(e) {
        console.log("Got error: " + e.message);
    })
    
    req.end();

    2、使用SuperAgent以及superagent-proxy模块

    为了使用方便以及加快开发的速度,我们就会引入模块。SuperAgent也是一个可以封装好的http模块,功能和Request模块也差不多。如果要使用代理模块的还,还需要额外的拓展模块SuperAgent-Proxy。

    SuperAgent官网地址
    SuperAgent-proxy官网地址

    var request =require('superagent');
    require('superagent-proxy')(request);
    
    var fs = require('fs');
    
    var proxy = 'http://127.0.0.1:57939';
    
    var header = {
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding':'gzip, deflate, sdch, br',
        'Accept-Language':'zh-CN,zh;q=0.8,zh-TW;q=0.6',
        'Cache-Control':'max-age=0',
        'Cookie':'_ga=GA1.2.1653214693.1476773935; VISITOR_INFO1_LIVE=T3BczuPUIQo; SID=5QR6XEldVgveXzFtqjIcD480cHE18gBRd3xPo398vndcc5JNxOAZ-TgVp5jQx3CR-ePvgA.; HSID=APr2I8UwM-A-Lypbd; SSID=Ap4H3Td1nrV__-9tN; APISID=8bHyFV90pNBU5Z9p/A2DlJa2MyJLL4-RKP; SAPISID=4tZf4GDX7Dt5bNAt/A5vhaZe_DLzn-ECul; CONSENT=YES+CN.zh-CN+20160904-14-0; YSC=XVHk_pArWhE; PREF=cvdm=grid&f1=50000000&f6=1&f5=30&al=zh-CN&gl=HK',
        'Upgrade-insecure-requests':'1',
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.59 Safari/537.36',
        'X-Chrome-Uma-Enabled':'1',
        'X-Client-Data':'CJa2yQEIorbJAQjBtskBCKmdygE=',
        'Connection': 'keep-alive'
    };
    
    request
        .get('https://www.youtube.com')
        .set('header',header)
        .proxy(proxy)
        .end(onresponse);
    
    function onresponse(err,res){
        res.setEncoding('utf-8'); //防止中文乱码
        if(err){
            console.log(err);
        }else{
            console.log('status:'+res.status);
            //console.log(res.headers);
            console.log(res.text);
            //将res.text写入json文件
            fs.writeFile(__dirname+'/data/home.json',JSON.stringify({
                status: 0,
                data: res.text
            }),function(err){
                if(err){
                    return console.log(err);
                }
                console.log('完成');
            });
        }
    }

    运行后会出现以下结果:

    status:200
    -sessionlink="itct=CJUBEJQ1GAEiEwjxtYyGsv7UAhUFKioKHf57D1Yojh4yCmctaGlnaC10cnZaD0ZFd2hhdF90b193YXRjaA" title="欅って、書けない 2017年7月9日 #87" aria-describedby="description-id-25113" dir="ltr">欅って、書けない 2017年7月9日 #87</a><span class="accessible-description" id="description-id-25113"> - 長さ: 27:16。</span></h3><div class="yt-lockup-byline yt-ui-ellipsis yt-ui-ellipsis-2"><a href="/channel/UCmIRpoe4yk09Lfx16doL3SA" class="g-hovercard yt-uix-sessionlink spf-link " data-sessionlink="itct=CJUBEJQ1GAEiEwjxtYyGsv7UAhUFKioKHf57D1Yojh4" data-ytid="UCmIRpoe4yk09Lfx16doL3SA" >5時に夢中</a></div><div class="yt-lockup-meta "><ul class="yt-lockup-meta-info"><li>視聴回数 63,297 回</li><li>17 時間前</li></ul></div></div></div></div></li><li class="yt-shelf-grid-item yt-uix-shelfslider-item"><div class="yt-lockup yt-lockup-grid yt-lockup-video vve-check clearfix" data-context-item-id="6OwlCt4aKfM" data-visibility-tracking="CJQBEJQ1GAIiEwjxtYyGsv7UAhUFKioKHf57D1Yojh5A89Po8K2hifboAQ=="><div class="yt-lockup-dismissable"><div class="yt-lockup-thumbnail contains-addto"><a aria-hidden="true" href="/watch?v=6OwlCt4aKfM" class=" yt-uix-sessionlink spf-link " data-sessionlink="itct=CJQBEJQ1GAIiEwjxtYyGsv7UAhUFKioKHf57D1Yojh4yCmctaGlnaC10cnZaD0ZFd2hhdF90b193YXRjaA" ><div class="yt-thumb video-thumb"><span class="yt-thumb-simple"> <img width="196" alt="" data-ytimg="1" src="https://i.ytimg.com/vi/6OwlCt4aKfM/hqdefault.jpg?sqp=-oaymwEWCMQBEG5IWvKriqkDCQgBFQAAiEIYAQ==&amp;rs=AOn4CLAwVQhAiFdPT0nRTMFB8rX989yXuA" height="110" onload=";window.__ytRIL &amp;&amp; __ytRIL(this)" > <span class="video-time" aria-hidden="true">6:44</span></span></div></a> <span class="thumb-menu dark-overflow-action-menu video-actions"> <button aria-haspopup="true" aria-expanded="false" onclick=";return false;" class="yt-uix-button-reverse flip addto-watch-queue-menu spf-nolink hide-until-delayloaded yt-uix-button yt-uix-button-dark-overflow-action-menu yt-uix-button-size-default yt-uix-button-has-icon no-icon-markup yt-uix-button-empty" type="button" ><span class="yt-uix-button-arrow yt-sprite"></span><ul class="watch-queue-thumb-menu yt-uix-button-menu yt-uix-button-menu-dark-overflow-action-menu hid"><li role="menuitem" class="overflow-menu-choice addto-watch-queue-menu-choice addto-watch-queue-play-next yt-uix-button-menu-item" data-action="play-next" onclick=";return false;" data-video-ids="6OwlCt4aKfM"><span class="addto-watch-queue-menu-text">次に再生</span></li><li role="menuitem" class="overflow-menu-choice addto-watch-queue-menu-choice addto-watch-queue-play-now yt-uix-button-menu-item" data-action="play-now" onclick=";return false;" data-video-ids="6OwlCt4aKfM"><span class="addto-watch-queue-menu-text">今すぐ再生</span></li></ul></button> </span> <button class="yt-uix-button yt-uix-button-size-small yt-uix-button-default yt-uix-button-empty yt-uix-button-has-icon no-icon-markup addto-button video-actions spf-nolink hide-until-delayloaded addto-watch-later-button-sign-in yt-uix-tooltip" type="button" onclick=";return false;" title="後で見る" role="button" data-button-menu-id="shared-addto-watch-later-login" data-video-ids="6OwlCt4aKfM"><span class="yt-uix-button-arrow yt-sprite"></span></button> <button class="yt-uix-button yt-uix-button-size-small yt-uix-button-default yt-uix-button-empty yt-uix-button-has-icon no-icon-markup addto-button addto-queue-button video-actions spf-nolink hide-until-delayloaded addto-tv-queue-button yt-uix-tooltip" type="button" onclick=";return false;" title="キュー" data-style="tv-queue" data-video-ids="6OwlCt4aKfM"></button> </div><div class="yt-lockup-content"><h3 class="yt-lockup-title "><a href="/watch?v=6OwlCt4aKfM" class=" yt-ui-ellipsis yt-ui-ellipsis-2 yt-uix-sessionlink spf-link " data-sessionlink="itct=CJQBEJQ1GAIiEwjxtYyGsv7UAhUFKioKHf57D1Yojh4yCmctaGlnaC10cnZaD0ZFd2hhdF90b193YXRjaA" title="シャンプー中に血のり垂らしたらとんでもないパニックになったwww" aria-describedby="description-id-467161" dir="ltr">シャンプー中に血のり垂らしたらとんでもないパニックになったwww</a><span class="accessible-description" id="description-id-467161"> - 長さ: 6:44。</span></h3><div class="yt-lockup-byline yt-ui-ellipsis yt-ui-ellipsis-2"><a href="/channel/UCpOjLndjOqMoffA-fr8cbKA" class="g-hovercard yt-uix-sessionlink spf-link " data-sessionlink="itct=CJQBEJQ1GAIiEwjxtYyGsv7UAhUFKioKHf57D1Yojh4" data-ytid="UCpOjLndjOqMoffA-fr8cbKA" >水溜りボンド</a>&nbsp;<span title="確認済み" class="yt-uix-tooltip yt-channel-title-icon-verified yt-sprite"></span></div><div class="yt-lockup-meta "><ul class="yt-lockup-meta-info"><li>視聴回数 1,969,790 回</li><li>22 時間前</li></ul></div></div></div></div></li><li class="yt-shelf-grid-item yt-uix-shelfslider-item"><div class="yt-lockup yt-lockup-grid yt-lockup-video vve-check clearfix" data-context-item-id="t-saIu9AD58" data-visibility-tracking="CJMBEJQ1GAMiEwjxtYyGsv7UAhUFKioKHf57D1Yojh5An5-A-q7ExvW3AQ=="><div class="yt-lockup-dismissable"><div class="yt-lockup-thumbnail contains-addto"><a aria-hidden="true" href="/watch?v=t-saIu9AD58" class=" yt-uix-sessionlink spf-link " data-sessionlink="itct=CJMBEJQ1GAMiEwjxtYyGsv7UAhUFKioKHf57D1Yojh4yCmctaGlnaC10cnZaD0ZFd2hhdF90b193YXRjaA" ><div class="yt-thumb video-thumb"><span class="yt-thumb-simple"> <img width="196" alt="" data-ytimg="1" src="https://i.ytimg.com/vi/t-saIu9AD58/hqdefault.jpg?sqp=-oaymwEWCMQBEG5IWvKriqkDCQgBFQAAiEIYAQ==&amp;rs=AOn4CLCICoGcnfWiQ4rKmXIbmq1fspGKiA" height="110" onload=";window.__ytRIL &amp;&amp; __ytRIL(this)" > <span class="video-time" aria-hidden="true">8:40</span></span></div></a> <span class="thumb-menu dark-overflow-action-menu video-actions"> <button aria-haspopup="true" aria-expanded="false" onclick=";return false;" class="yt-uix-button-reverse flip addto-watch-queue-menu spf-nolink hide-until-delayloaded yt-uix-button yt-uix-button-dark-overflow-action-menu yt-uix-button-size-default yt-uix-button-has-icon no-icon-markup yt-uix-button-empty" type="button" ><span class="yt-uix-button-arrow yt-sprite"></span><ul class="watch-queue-thumb-menu yt-uix-button-menu yt-uix-button-menu-dark-overflow-action-menu hid"><li role="menuitem" class="overflow-menu-choice addto-watch-queue-menu-choice addto-watch-queue-play-next yt-uix-button-menu-item" data-action="play-next" onclick=";return false;" data-video-ids="t-saIu9AD58"><span class="addto-watch-queue-menu-text">次に再生</span></li><li role="menuitem" class="overflow-menu-choice addto-watch-queue-menu-choice addto-watch-queue-play-now yt-uix-button-menu-item" data-action="play-now" onclick=";return false;" data-video-ids="t-saIu9AD58"><span class="addto-watch-queue-menu-text">今すぐ再生</span></li></ul></button> </span> .......太长了就省略了

    此时我们已经可以成功连接上YouTube,接下来就可以利用cheerio模块对其进行解析啦。

     请求头的获取:

    代码中的请求头可以利用浏览器,翻墙后打开YouTube首页,点击检查,然后点击network,刷新后我们会发现header里面包含的请求信息。

     

  • 相关阅读:
    通讯录封装实现
    简单通讯录的实现 main..h .m文件全部
    iOS 开发 OC编程 字典和集合 排序方法
    iOS 开发 OC编程 数组冒泡排序.图书管理
    iOS 开发 OC编程 属性和字符串练习
    iOS 开发 OC编程 属性和字符串
    iOS 开发 OC编程 便利构造器以及初始化方法
    iOS 开发 OC编程 方法的书写
    IOS 开发 OC编程 类和对象
    iOS 开发 c语言阶段考试题
  • 原文地址:https://www.cnblogs.com/xiaxuexiaoab/p/7147574.html
Copyright © 2020-2023  润新知