• Nodejs实现爬虫的几种方式


    获取代理 IP

    // Requires the axios module - npm install axios --save
    const axios = require('axios')

    // id / secret etc. are obtained from the Apeyun (猿人云) "extract API" console page
    const queries = {
        id: 'xxx',
        secret: 'xxx',
        limit: 1,
        format: 'txt',
        auth_mode: 'auto'
    };

    // Ask the Apeyun tunnel API for one proxy IP and print it.
    (async () => {
        try {
            const response = await axios.get('http://tunnel-api.apeyun.com/q', { params: queries });
            console.log('IP:', response.data);
        } catch (e) {
            console.error('Error:', e);
        }
    })();

    爬虫程序

    • axios


      // Requires the axios module - npm install axios --save
      const axios = require('axios')
      
      // Target page to fetch through the proxy
      const targetUrl = "http://www.baidu.com"
      
      // Proxy server; assume the extracted proxy IP is 123.123.123.123:1234
      const proxyHost = "123.123.123.123"
      const proxyPort = 1234
      
      // Proxy credentials (obtained from the Apeyun console)
      const proxyUser = "xxx"
      const proxyPass = "xxx"
      
      // axios proxy config shape: host/port plus Basic-auth credentials
      const proxy = {
          host: proxyHost,
          port: proxyPort,
          auth: {
              username: proxyUser,
              password: proxyPass
          }
      }
      
      // See the official docs: https://github.com/axios/axios#request-config
      axios.get(targetUrl, { proxy: proxy })
          .then(function (response) {
              // On success, print the page HTML
              console.log(response.data)
          })
          .catch(function (error) {
              // Report failures on stderr rather than stdout
              console.error(error)
          })
      

        

    • http

    • const http = require("http")
      const url  = require("url")
      
      // 要访问的目标页面
      const targetUrl = "http://www.baidu.com"
      const urlParsed   = url.parse(targetUrl)
      
      // 代理服务器,假设提取到的代理ip是123.123.123.123:1234
      const proxyHost = "123.123.123.123"
      const proxyPort = "1234"
      
      // 代理隧道验证信息(猿人云官网获取)
      const proxyUser = "xxx"
      const proxyPass = "xxx"
      
      const base64 = Buffer.from(proxyUser + ":" + proxyPass).toString("base64")
      
      const options = {
          host: proxyHost,
          port: proxyPort,
          path: targetUrl,
          method: "GET",
          headers: {
              "Host": urlParsed.hostname,
              "Proxy-Authorization": "Basic " + base64
          }
      }
      
      http.request(options, function(res) {
          console.log("got response: " + res.statusCode)
      })
      .on("error", function(err) {
          console.log(err)
      })
      .end()
      

        

    • request


    • // 需要安装 request 模块 - npm install request --save
      const request = require("request")
      
      // 要访问的目标页面
      const targetUrl = "http://www.baidu.com"
      
      // 代理服务器,假设提取到的代理ip是123.123.123.123:1234
      const proxyHost = "123.123.123.123"
      const proxyPort = 1234
      
      // 代理隧道验证信息(猿人云官网获取)
      const proxyUser = "xxx"
      const proxyPass = "xxx"
      
      const proxyUrl = "http://" + proxyUser + ":" + proxyPass + "@" + proxyHost + ":" + proxyPort
      
      const proxiedRequest = request.defaults({'proxy': proxyUrl})
      
      const options = {
          url: targetUrl,
          headers: {}
      }
      
      proxiedRequest.get(options, function (err, res, body) {
          console.log("got response: " + res.statusCode)
          console.log("got response: " + body)
      })
      .on("error", function (err) {
          console.log(err);
      })
      

        

    • superagent


    • // 需要安装 superagent 和 superagent-proxy 模块 - npm install superagent superagent-proxy --save
      const request = require("superagent")
      require("superagent-proxy")(request)
      
      // 要访问的目标页面
      const targetUrl = "http://www.baidu.com"
      
      // 代理服务器,假设提取到的代理ip是123.123.123.123:1234
      const proxyHost = "123.123.123.123";
      const proxyPort = 1234;
      
      // 代理隧道验证信息(猿人云官网获取)
      const proxyUser = "xxx"
      const proxyPass = "xxx"
      
      const proxyUrl = "http://" + proxyUser + ":" + proxyPass + "@" + proxyHost + ":" + proxyPort
      
      request.get(targetUrl).proxy(proxyUrl).end(function onResponse(err, res) {
          if (err) {
              return console.log(err)
          }
          console.log(res.status, res.headers)
          console.log(res.text)
      });

    使用 node 运行这个文件,当你的控制台打印出一大段 HTML 代码说明这个爬虫程序成功了

    转自:https://mp.weixin.qq.com/s/JA11NzbbHtKqgijmdmJPlw

  • 相关阅读:
    【转】高性能网络编程4--TCP连接的关闭
    Kubernetes 用了,延迟高了 10 倍,问题在哪?
    多路复用和多路分用
    网络七层模型与四层模型区别
    Go验证包出错 dial tcp 34.64.4.17:443: i/o timeout
    spring Bean配置的三种形式
    Spring容器IOC初始化过程
    Go 特殊语法
    服务发现的基本原理与比较:Eureka vs Consul vs Zookeeper
    docker 常用命令
  • 原文地址:https://www.cnblogs.com/tjp40922/p/16069741.html
Copyright © 2020-2023  润新知