• nodejs crawler


    After studying several other people's write-ups, I wrote my own crawler.

    Modules used:

    utils.js        ---    moment

    module_url.js
      var http = require("http");           // fetch page data
      var cheerio = require("cheerio");     // parse the page and extract content
      var sanitize = require("validator");  // strip useless data such as whitespace
      var fs = require('fs');               // file operations, save the results

    app.js

      var async = require("async");         // async helpers such as each and filter
      var ts = require("timespans");        // measure time spent
      var sanitize = require("validator");  // strip useless data such as whitespace

    Fetch the topic list of every page            --  in parallel
    Fetch each topic's details from those lists   --  also in parallel, but the final output stays in page/topic order (see the sketch below)
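
    The order-preserving trick is simply to write each result into its slot (results[i] = item) instead of pushing, so completion order does not matter. A minimal sketch of the pattern; slowTask is a made-up stand-in for any async fetch:

    var async = require("async");

    // hypothetical slow task: calls back with value * 10 after a random delay
    function slowTask(value, callback) {
        setTimeout(function() {
            callback(null, value * 10);
        }, Math.random() * 100);
    }

    var inputs = [1, 2, 3, 4];
    var results = [];

    async.forEach(Object.keys(inputs), function(i, done) {
        slowTask(inputs[i], function(err, value) {
            results[i] = value;   // assign by index, not push(): order survives
            done(err);
        });
    }, function(err) {
        console.log(results);     // always [ 10, 20, 30, 40 ], whichever finished first
    });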

    The utils module below was copied from elsewhere; it wraps console.log so that every line is prefixed with the output time.

    var moment = require('moment');
    
    // calls back with n + 1 after `timeout` ms (default 200) -- async demo helper
    exports.inc = function(n, callback, timeout) {
        timeout = timeout || 200;
        setTimeout(function() {
            callback(null, n+1);
        }, timeout);
    };
    
    // calls back with `obj` unchanged after `timeout` ms
    exports.fire = function(obj, callback, timeout) {
        timeout = timeout || 200;
        setTimeout(function() {
            callback(null, obj);
        }, timeout);
    };
    
    // calls back with `errMsg` as the error after `timeout` ms
    exports.err = function(errMsg, callback, timeout) {
        timeout = timeout || 200;
        setTimeout(function() {
            callback(errMsg);
        }, timeout);
    };
    
    // console.log wrapper: prefixes each line with the current time (ss.SSS)
    exports.log = function(msg, obj) {
        process.stdout.write(moment().format('ss.SSS')+'> ');
        if(obj!==undefined) {
            process.stdout.write(msg);
            console.log(obj);
        } else {
            console.log(msg);
        }
    };
    
    // busy-wait for `mils` ms -- blocks the event loop, only useful for experiments
    exports.wait = function(mils) {
        var now = new Date;
        while(new Date - now <= mils);
    };
    utils.js
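
    For reference, a quick sketch of how the wrapper is used and what it prints (timestamps are seconds.milliseconds):

    var utils = require("./utils");

    utils.log("plain message");                // e.g. "12.345> plain message"
    utils.log("with an object: ", { a: 1 });   // e.g. "12.360> with an object: { a: 1 }"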

    Fetching the page data

    // fetch page data
    var http = require("http");
    // parse the page and extract content
    var cheerio = require("cheerio");
    // strip useless data such as whitespace
    var sanitize = require("validator");
    // file operations, save the results
    var fs = require('fs');
    
    
    var scrapy = {};
    // GET `url` and call back with the full response body as a single Buffer
    scrapy.get = function(url, callback) {
      http.get(url, function(res) {
    
        var size = 0;
        var chunks = [];
    
        res.on('data', function(chunk) {
          size += chunk.length;
          chunks.push(chunk);
        });
    
        res.on('end', function() {
          var data = Buffer.concat(chunks, size);
          callback(null, data);
        });
    
      }).on('error', function(e) {
        callback(e, null);
      });
    };
    
    var getPage = function(pageUrl, callback){
      scrapy.get(pageUrl, function(err, data){
        if(err){
          return callback(err);   // bail out, or data.toString() below would throw
        }
    
        var html = data.toString();
        var $ = cheerio.load(html);
        // title links, pointing to the detail pages
        var news = $('.cell .topic_title_wrapper a');
        callback(null, news, $);  // hand back $ too, so callers can wrap the elements
      });
    };
    
    var getDetail = function(detailUrl, callback){
      scrapy.get(detailUrl, function(err, data){
        if(err){
          return callback(err);
        }
    
        var html = data.toString();
        var $ = cheerio.load(html);
        var item = {};
        item.href = detailUrl;
        $('.header .topic_full_title .put_top').remove(); // drop the "pinned" label
        item.title = sanitize.escape(sanitize.trim($('.header .topic_full_title').text()));
        item.content = sanitize.escape(sanitize.trim($('.inner.topic .topic_content').text()));
    
        callback(null, item);
      });
    };
    
    // serialize the collected data and write it to `fileName` synchronously
    var save = function(fileName, data) {
      var result = JSON.stringify(data);
      fs.writeFileSync(fileName, result);
    };
    
    exports.getUrl = scrapy.get;
    exports.getPage = getPage;
    exports.getDetail = getDetail;
    exports.save = save;
    module_url.js
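
    Standalone, the module can be exercised like this (a minimal sketch; note that getPage hands back cheerio's $ as a third argument so callers can wrap the returned elements):

    var url = require("./module_url");

    // fetch page 1 of cnodejs and pull the details of the first topic
    url.getPage("http://cnodejs.org/?page=1", function(err, news, $) {
        if (err) return console.error(err);
        console.log("found %d topics", news.length);

        var href = $(news[0]).attr("href");
        url.getDetail("http://cnodejs.org" + href, function(err, item) {
            if (err) return console.error(err);
            console.log(item.title);
            url.save("one-topic.json", item);   // writes the JSON next to the script
        });
    });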

    Main file

    // custom console.log that prefixes every line with a timestamp
    var utils = require("./utils");
    var log = utils.log;
    // async helpers such as each and filter
    var async = require("async");
    // measure time spent
    var ts = require("timespans");
    // strip useless data such as whitespace
    var sanitize = require("validator");
    var url = require("./module_url");
    
    
    var baseUrl = 'http://cnodejs.org';
    var pageUrl = baseUrl + '/?page=';
    var isOnlyTitle = true;
    var pages = [];
    for (var i = 1; i < 4; i++) {
        pages.push(i);
    }
    
    ts.start();
    var titles = {};
    // pages are fetched in parallel
    async.forEach(pages, function(page, callback_each){
        titles[page] = [];
    
        url.getPage(pageUrl + page, function(err, news, $){
            if(err){
                log("page error");
                return callback_each(err);   // must report back, or async.forEach never completes
            }
    
            if (news.length === 0) {
                log("no data for the page:" + page);
                return callback_each(null);
            }
            
            // filter over explicit indexes: the cheerio selection is only
            // array-like, and the index is what keeps the output ordered
            var indexes = [];
            for (var j = 0; j < news.length; j++) {
                indexes.push(j);
            }
    
            async.filter(indexes, function(index, callback){
                var detailUrl = baseUrl + news[index].attribs['href'];
    
                if(isOnlyTitle){
                    var curNew = news[index];
                    var item = {};
                    item.href = detailUrl;
                    $(curNew).find(".put_top").remove();    // drop the "pinned" label
                    item.title = sanitize.escape(sanitize.trim($(curNew).text()));
    
                    titles[page][index] = item;
    
                    callback(true);
                }
                else{
                    url.getDetail(detailUrl, function(err, item){
                        if(err){
                            log("detail error");
                            return callback(false);   // skip this item, keep the filter going
                        }
                        titles[page][index] = item;   // assign by index so order is kept
    
                        callback(true);
                    });
                }
            }, function(result){
                //log("filter news:", result);
                callback_each(null);
            });
            
        });
    }, function(err){
        ts.stop();
        //ts.pause();    ---   ts.continue();
        console.log('total: %s pause: %s used: %s', ts.elapsedtime(), ts.pausetime(), ts.usedtime());
        if(err){
            return log("crawl failed:", err);
        }
        log(titles);
        //url.save("cnodejs.json", titles);
    });
    app.js

    One more thing: I want to support crawling only the topics posted within a given time range; still working on it (see the sketch below)...
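
    One possible shape for that filter: parse each topic's publish time with moment and keep only the items inside [from, to]. A rough sketch; the postedAt field is an assumption (cnodejs only shows relative times like "posted 3 hours ago", so extracting a real timestamp in getDetail is the part still to be solved):

    var moment = require("moment");

    // hypothetical items: assume getDetail has been extended to also scrape
    // a postedAt timestamp from the detail page
    var items = [
        { title: "old topic",    postedAt: "2014-06-20T08:00:00Z" },
        { title: "recent topic", postedAt: moment().subtract(2, "hours").toISOString() }
    ];

    // true if the item was posted inside [from, to]
    function inRange(item, from, to) {
        var t = moment(item.postedAt);
        return t.isValid() && !t.isBefore(from) && !t.isAfter(to);
    }

    var from = moment().subtract(1, "days");   // last 24 hours
    var to = moment();

    var recent = items.filter(function(item) {
        return inRange(item, from, to);
    });
    console.log(recent);   // only "recent topic" survives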
