• 没什么


    去年一时头脑发热(我经常这样),想利用gayhub和node做一个项目。

    大概的想法是,利用nodejs爬取各大招聘网站的招聘需求,然后再用d3.js和github主页功能根据日期来展示各种统计图表出来。

    自动爬取是没有思路的,所以当时的思路是每周手动爬取数据,生成json上传到gayhub的主页repo里面去,页面里面根据日期去判断获取哪一个星期的数据。

    数据初步地爬取到了,然后后面由于众多原因,我没有继续进行下去,当然,这只是为我的三分钟热度找的借口。

    以下是爬取lagou数据的主要文件,现在还能不能用我也懒得去试验,以此备忘顺道更新blog。

    // Dependencies: superagent (HTTP client), cheerio (server-side jQuery-like
    // HTML parsing), async (flow control), fs/path for writing results.
    var superagent = require("superagent");
    var cheerio = require("cheerio");
    var async = require("async");
    
    var fs = require("fs");
    var path = require("path");
    
    var rootUrl = "https://www.lagou.com";
    
    // Shared cheerio root, reassigned every time a new document is loaded.
    var $;
    // Target cities, URI-encoded Chinese names: nationwide, Beijing, Shanghai,
    // Hangzhou, Guangzhou, Shenzhen, Chengdu.
    var locations = [encodeURI('全国'),encodeURI('北京'),encodeURI('上海'),encodeURI('杭州'),encodeURI('广州'),encodeURI('深圳'),encodeURI('成都')];
    // NOTE(review): `content` is never used at top level; parse() shadows it.
    var content = '';
    // Comment-toggle trick: exactly one of the two regions below is active.
    // As written, the first region (offline replay of a saved HTML snapshot)
    // runs; swapping the /**/ markers enables the live crawl instead.
    //for test only
    /**/
    fs.readFile('./result/class_1481010149483.txt',(err, data) => {
      if( err ) console.error(err);
      parse(data);
    });
    /**//*
    scrawlLocation(0);
    // Live path (currently commented out): fetch the lagou landing page once
    // per city via the index_location_city header, save the raw HTML under
    // ./result/ for later replay, parse it, then recurse to the next city.
    function scrawlLocation(index){
      superagent
        .get(rootUrl)
        .set("index_location_city",locations[index])
        .end(function(err, res){
          file = fs.createWriteStream("./result/class_"+Date.now()+".txt");
          console.log(locations[index]);
          parse(res.text,locations[index]);
          file.write(res.text);
          file.end();
          if( index + 1 < locations.length){
            scrawlLocation(index+1);
          }
      });
    }
    /**/
    
    /**/
    var today = new Date();
    var curDir;
    
    /**
     * Parses the lagou landing-page HTML and writes one JSON file per city
     * containing every job category found in the "menu_box" navigation blocks.
     * Side effects: creates ./result/<date>/ on first run of the day, rewrites
     * ./result/config.js with the current revision, and kicks off the
     * per-category count crawl via startScrawlCount().
     *
     * @param {string|Buffer} content        raw HTML of the landing page
     * @param {string} [currentLocation]     URI-encoded city name used for the
     *                                       output file name (may be absent on
     *                                       the offline replay path)
     */
    function parse(content, currentLocation){
      var dataPool = {};
      var file;
      // BUG FIX: Date#getMonth() is 0-based, so the folder/revision string was
      // off by one month (e.g. "2017-2-1" during March). Add 1.
      var todayStr = today.getFullYear() + "-" + (today.getMonth() + 1) + "-" + today.getDate();
      curDir = "./result/" + todayStr + "/";
      if( !fs.existsSync(curDir)){
        fs.mkdirSync(curDir);
    
        // BUG FIX: the revision must be a quoted string. The original emitted
        // `var revision = 2017-3-1;`, which JavaScript evaluates as
        // subtraction (2013), not a date marker.
        file = fs.createWriteStream("./result/config.js");
        file.write("var revision = '" + todayStr + "';");
        file.end();
      }
    
      $ = cheerio.load(content, {ignoreWhitespace: true});
      var mainClass;
      var classData;
      $('div[class="menu_box"]').each(function(k, v){
        mainClass = parserMainClass(v);                    // <h2> category title
        classData = [];
        parseSecondClass($(v).children()[1], classData);   // hidden sub-menu (menu_sub dn)
        dataPool[mainClass] = classData;
      });
    
      // BUG FIX: the replay path calls parse(data) with no location, which
      // previously produced a file literally named "undefined.json".
      var locationName = currentLocation ? decodeURI(currentLocation) : "unknown";
      file = fs.createWriteStream(curDir + locationName + ".json");
      file.write(JSON.stringify(dataPool));
      file.end();
    
      startScrawlCount(curDir);
    }
    
    /**
     * Returns the trimmed text of a menu_box's <h2> heading — the name of the
     * main job category (e.g. "技术").
     */
    function parserMainClass(value){
      // First grandchild of the menu box is the <h2> category heading.
      var heading = $(value).children().children()[0];
      // Its first child node carries the raw text; strip surrounding whitespace.
      return heading.children[0].data.trim();
    }
    
    /**
     * Walks a sub-menu node (a list of <dl> elements) and appends one entry
     * per category link to classArr. A <dt> link is a main sub-category
     * (isMain: 1, taken from the dt's second child); each non-text child of a
     * <dd> is a regular sub-category (isMain: 0). The leading "//" of every
     * href is stripped.
     *
     * @param {object} value     raw DOM node whose .children are the <dl>s
     * @param {Array}  classArr  output array, mutated in place
     */
    function parseSecondClass(value, classArr){
      // Builds one output record from an <a> node and its main/sub flag.
      var toEntry = function(anchor, isMain){
        return {
          name: anchor.children[0].data,
          isMain: isMain,
          href: anchor.attribs["href"].substring(2),
          dataLgTjId: anchor.attribs["data-lg-tj-id"],
          dataLgTjNo: anchor.attribs["data-lg-tj-no"],
          dataLgTjCid: anchor.attribs["data-lg-tj-cid"]
        };
      };
    
      for (var dl of value.children){
        if (dl.type === "text") continue;
        for (var node of dl.children){
          if (node.type === "text") continue;
          if (node.name === "dt"){
            // The dt's second child is the category anchor.
            classArr.push(toEntry(node.children[1], 1));
          } else if (node.name === "dd"){
            for (var anchor of node.children){
              if (anchor.type === "text") continue;
              classArr.push(toEntry(anchor, 0));
            }
          }
        }
      }
    }
    
    // lagou shows at most this many postings per result page; used to estimate
    // a total when the displayed count is truncated (e.g. "500+").
    const JOB_PER_PAGE = 15;
    
    /**
     * Starts the per-city count crawl: lists every JSON file previously written
     * to `dir` and hands the list to scrawlFile, which works through the files
     * sequentially by recursing on the index.
     */
    function startScrawlCount(dir){
      var cityFiles = fs.readdirSync(dir);
      scrawlFile(cityFiles, 0, dir);
    }
    
    /**
     * Processes one per-city JSON file: for every job category stored in it,
     * fetches the lagou listing page (with the city set via the
     * index_location_city header, at most 3 requests in flight), extracts the
     * posting count into item.count, rewrites the file with the counts filled
     * in, and then recurses to the next file so cities are handled one by one.
     *
     * @param {string[]} files  file names inside `dir` (one per city)
     * @param {number}   index  position in `files` to process now
     * @param {string}   dir    directory the files live in (trailing slash)
     */
    function scrawlFile(files, index, dir){
      var file = files[index];
      var location = encodeURI(file.split(".")[0]);
      fs.readFile(dir + file, {encoding: 'utf8', flag: "r+"}, (err, content) => {
        if (err) {
          // BUG FIX: the original logged and fell through into
          // JSON.parse(undefined), which throws. Stop here instead.
          console.error(err);
          return;
        }
        var data = JSON.parse(content);
        var total = 0;
        var complete = 0;
        for (var k in data){
          total++;
          var tarr = data[k];
          async.eachLimit(tarr, 3, function(item, callback){
            superagent
              .get(item.href)
              .set("index_location_city", location)
              .end(function(err, res){
                if (err || !res) {
                  // BUG FIX: on error `res` may be undefined; the original
                  // crashed on res.text. Report and move on.
                  console.error(err);
                  return callback(err);
                }
                $ = cheerio.load(res.text);
                console.log(item.href);
                // BUG FIX: the regex was /d+[+]?/ (a literal "d"); \d+ matches
                // the digit run, and an optional trailing "+" means the shown
                // count is truncated ("500+").
                var arr = $("#tab_pos").text().match(/\d+[+]?/);
                // BUG FIX: match() returns null when nothing matches; the
                // original dereferenced arr.length unconditionally.
                if (arr !== null && arr.length !== 0){
                  var countStr = arr[0];
                  if (countStr.indexOf("+") === -1){
                    item.count = parseInt(countStr, 10);
                  } else {
                    // Exact count truncated: estimate from the highest page
                    // index in the pager times jobs-per-page.
                    var pages = $(".page_no");
                    var maxIndex = 1;
                    for (var i = 0; i < pages.length; i++){
                      var pageIndex = parseInt(pages[i].attribs["data-index"], 10);
                      if (pageIndex > maxIndex) maxIndex = pageIndex;
                    }
                    item.count = maxIndex * JOB_PER_PAGE;
                  }
                }
                callback(err, res);
              });
          }, function(err){
            if (err) console.error(err);
            complete++;
            console.log(files[index] + ":" + complete + "/" + total);
            // Only once every category of this city is done: persist the
            // counts and advance to the next city file.
            if (complete === total){
              var wfile = fs.createWriteStream(dir + file);
              wfile.write(JSON.stringify(data));
              wfile.end();
              if (index + 1 < files.length){
                scrawlFile(files, index + 1, dir);
              }
            }
          });
        }
        // Dead code removed: an unreachable near-duplicate of the loop above
        // (hard-coded to Beijing, guarded by a bare `return;`) used to follow.
      });
    }
    app.js

    这个是npm依赖文件:

    {
      "name": "node-crawl",
      "version": "1.0.0",
      "description": "",
      "main": "app.js",
      "scripts": {
        "test": "echo \"Error: no test specified\" && exit 1"
      },
      "keywords": [],
      "author": "",
      "license": "ISC",
      "dependencies": {
        "async": "^2.1.4",
        "cheerio": "^0.22.0",
        "express": "^4.14.0",
        "superagent": "^3.1.0"
      }
    }
    package.json

    这个是当初爬到的数据的样子:

    感觉自己好没羞耻

  • 相关阅读:
    10 个让人惊讶的 jQuery 插件
    URL编码方法比较
    Java大文件分片上传/多线程上传源码
    Java大文件分片上传/多线程上传代码
    Java大文件分片上传/多线程上传插件
    Java大文件分片上传/多线程上传控件
    python函数
    关于言谈
    Sql语句之select 5种查询
    openstack之网络基础
  • 原文地址:https://www.cnblogs.com/adoontheway/p/6669610.html
Copyright © 2020-2023  润新知