• node.js 爬虫动态代理ip


    参考文章:

      https://andyliwr.github.io/2017/12/05/nodejs_spider_ip/

      https://segmentfault.com/q/1010000008196143

    代码:

    import request from 'request';
    import userAgents from './common/userAgent';
    import Promise from 'bluebird';
    
    // For testing only, the cache lives in plain module variables; a real application should use a proper data cache.
    const expiryTime = 10 * 60 * 1000;// Cache lifetime in milliseconds (10 minutes)
    let ips = null; // Cached proxy IP list
    let time = null;// Timestamp of when the proxy list was stored; used to decide whether it has expired and must be re-fetched
    
    /**
     * Fetch a list of free HTTP proxies from 66ip.cn.
     *
     * The result is kept in the module-level `ips` / `time` variables and reused
     * until it is older than `expiryTime` milliseconds. This in-memory cache is
     * for testing only; a real application should use a proper data cache.
     *
     * @returns {Promise<string[]>} Resolves with a non-empty array of "ip:port"
     *   strings. Rejects when the request fails, the body is not a Buffer, or
     *   the response contains no proxy address (nothing is cached in that case).
     */
    const getProxyList = () => {
        return new Promise((resolve, reject) => {
            // Serve from the cache only while it holds a usable list and is still fresh.
            const cacheIsFresh = Date.now() - time < expiryTime;
            if (Array.isArray(ips) && ips.length > 0 && cacheIsFresh) {
                resolve(ips);
                return;
            }

            const apiURL = 'http://www.66ip.cn/mo.php?sxb=&tqsl=100&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=http%3A%2F%2Fwww.66ip.cn%2F%3Fsxb%3D%26tqsl%3D100%26ports%255B%255D2%3D%26ktip%3D%26sxa%3D%26radio%3Dradio%26submit%3D%25CC%25E1%2B%2B%25C8%25A1';
            const options = { method: 'GET', url: apiURL, gzip: true, encoding: null,
                headers: {
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                    'Accept-Encoding': 'gzip, deflate',
                    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
                    'User-Agent': 'Mozilla/8.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
                    'referer': 'http://www.66ip.cn/'
                },
            };

            request(options, (error, response, body) => {
                // Always settle the promise; otherwise the caller's `await` never returns.
                if (error) {
                    reject(error);
                    return;
                }
                if (!Buffer.isBuffer(body)) {
                    reject(new Error('Proxy list response body is not a Buffer'));
                    return;
                }
                // `\d` and `\.` must be escaped: without the backslashes the pattern
                // matches the letter "d" and any character, so no address is ever found.
                // Ports can have up to five digits (max 65535).
                const ret = body.toString().match(/\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}/g);
                if (!ret) {
                    // Do not cache an empty result; let the next call try again.
                    reject(new Error('No proxy addresses found in the proxy list response'));
                    return;
                }
                ips = ret;
                time = Date.now();
                resolve(ret);
            });
        });
    };
    /**
     * Crawl the target page through a randomly chosen proxy, sending a randomly
     * chosen User-Agent header.
     *
     * @param {string[]} ipList - Candidate proxies as "ip:port" strings; must be a
     *   non-empty array.
     * @returns {Promise<*>} Resolves with the `body` value passed to the request
     *   callback. When the request fails the failure is logged and the promise
     *   still resolves with that `body` value (expected to be empty), so the caller
     *   can retry with another proxy.
     * @throws {TypeError} (as a rejection) when `ipList` is not a non-empty array.
     */
    async function reptile(ipList){
        // Fail fast instead of building the meaningless proxy URL "http://undefined".
        if (!Array.isArray(ipList) || ipList.length === 0) {
            throw new TypeError('reptile: ipList must be a non-empty array of "ip:port" strings');
        }
        return new Promise((resolve) => {
            const userAgent = userAgents[Math.floor(Math.random() * userAgents.length)];
            // Index by the list that was passed in, not by the module-level cache.
            const ip = ipList[Math.floor(Math.random() * ipList.length)];
            const useIp = `http://${ip}`;
            const options = { method: 'GET', url: 'http://www.qcnh1920.com', gzip: true, encoding: null,
                headers: {
                    'User-Agent': userAgent, // randomized browser identification
                },
                proxy: useIp, // randomized proxy
                timeout: 8000
            };
            request(options, (error, response, body) => {
                // Some free proxies are unreachable; report it and let the caller
                // try again with a different one.
                if (error) {
                    console.log(`爬取页面失败,${error},正在重新寻找代理ip... ×`);
                } else {
                    console.log('爬取页面成功,  √');
                }
                resolve(body);
            });
        });
    }
    /**
     * Entry point: obtain the proxy list, crawl the page through a random proxy
     * and print the page body. When a crawl attempt yields no body, it retries.
     *
     * Retries run inside an awaited loop, so the returned promise settles only
     * after the crawl has finished and any error reaches the caller.
     *
     * @param {number} [maxAttempts=Infinity] - Upper bound on crawl attempts. The
     *   default keeps retrying until a body is obtained.
     * @returns {Promise<void>} Resolves once a page body has been printed. Rejects
     *   when `maxAttempts` attempts all failed, or when fetching the proxy list
     *   or crawling rejects.
     */
    async function startFun (maxAttempts = Infinity){
        for (let attempt = 0; attempt < maxAttempts; attempt += 1) {
            const ipList = await getProxyList(); // get the proxy list
            const body = await reptile(ipList); // crawl the page
            if (body) {
                // Parse / consume the HTML here.
                console.log(body.toString());
                return;
            }
        }
        throw new Error(`startFun: no page body obtained after ${maxAttempts} attempt(s)`);
    }
    // Kick off the crawl. Report a failure explicitly rather than leaving the
    // returned promise without a rejection handler.
    startFun().catch((err) => {
        console.error(err);
        process.exitCode = 1;
    });
    common/userAgent.js(上面代码中导入的 User-Agent 列表):
      
    /**
     * Pool of browser User-Agent header strings.
     *
     * Note: a few strings appear more than once in this list, so a uniformly
     * random index picks those strings proportionally more often.
     */
    const userAgents = [
      'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
      'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
      'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20',
      'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6',
      'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
      'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0) ,Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
      'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
      'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
      'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
      'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre',
      'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52',
      'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
      'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
      'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6',
      'Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6',
      'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
      'Opera/9.25 (Windows NT 5.1; U; en), Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
      'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    ]
    
    export default userAgents;
    
    
  • 相关阅读:
    STL map与Boost unordered_map
    static constructors in C++? need to initialize private static objects
    谈谈C++继承中的重载,覆盖和隐藏
    C++继承:公有,私有,保护
    C++ using关键字作用总结
    ++iter的效率比iter++的效率高
    C++中dynamic_cast,static_cast,const_cast,reinterpret_cast
    【转】WinForm中添加闪屏窗口的两种方法
    【转】一个不错的Matlab的gui界面设计实例 (20081003 15:47:30)matlab gui 界面 校园 分类:Matlab实例
    制作一个WinForm的闪屏
  • 原文地址:https://www.cnblogs.com/bruce-gou/p/9315592.html
Copyright © 2020-2023  润新知