• 使用node.js抓取有路网图书信息(原创)


    之前写过使用python抓取有路网图书信息,见http://www.cnblogs.com/dyf6372/p/3529703.html。

    最近想学习一下Node.js,所以拿它试试手,顺便比较一下两者在http抓取上的性能:采用事件驱动的Node.js比python要好一些。以下是代码(刚学,还未优化):

    var http = require('http');
    var iconv = require('iconv-lite');
    var url = require('./gb2312_url_encode.js');
    
    /**
     * Build the options object passed to `http.request` for a GET request
     * against www.youlu.net.
     *
     * @param {string} path - request path (including any query string).
     * @returns {Object} options with hostname, port, path, method and a
     *     browser-like User-Agent header. A fresh object is returned per call.
     */
    function getHtmlOptions(path){
        // The request identifies itself with a desktop Chrome User-Agent string.
        var requestHeaders = {};
        requestHeaders['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36';

        var requestOptions = {};
        requestOptions.hostname = 'www.youlu.net';
        requestOptions.port = 80;
        requestOptions.path = path;
        requestOptions.method = 'GET';
        requestOptions.headers = requestHeaders;
        return requestOptions;
    }
    
    /**
     * Extract the total number of matching books from a search-result page.
     *
     * Looks for the label "共有图书数量" followed by optional whitespace
     * (including line breaks) and a run of digits.
     *
     * The regex literal used to be split across two physical lines (an escape
     * sequence had been turned into a real line break and the backslash of
     * `\s` was lost), which is a syntax error. The pattern is restored as a
     * single literal, with `\s*` covering the line break.
     *
     * @param {string} str - decoded HTML of the search-result page.
     * @returns {string} the digit string that follows the label, or "0" when
     *     the label or its digits cannot be found (instead of throwing a
     *     TypeError on a null match).
     */
    function getAllNumber(str){
        var counterMatch = /共有图书数量\s*([0-9]+)/.exec(str);
        if(counterMatch === null){
            return '0';
        }
        return counterMatch[1];
    }
    
    /**
     * Scan one search-result page for book detail links and start a detail
     * lookup for every distinct book found on it.
     *
     * A candidate is any text that starts with `"/<digits>` and runs to the
     * last double quote on the same line. The first digit run is used as the
     * detail id, and the text inside `alt="..."` as the book name. Candidates
     * without an `alt` attribute are ignored. Ids are de-duplicated within a
     * single call only.
     *
     * The regex escapes (`\/`, `\d`) had lost their backslashes, which made
     * the literals invalid or made them match the letter "d"; they are
     * restored here.
     *
     * @param {string} str - decoded HTML of one search-result page.
     * @returns {string[]} the distinct detail ids for which `searchDetail`
     *     was called, in order of appearance; empty when nothing matched.
     */
    function getDetailList(str){
        // Regexes are created once per call instead of once per loop iteration.
        var linkPattern = /"\/\d+.*"/g;
        var idPattern = /\d+/;
        var altPattern = /alt=".*"/;

        var result_array = [];
        var n_array = str.match(linkPattern);
        if(n_array === null){
            return result_array;
        }
        for(var i=0;i<n_array.length;i++){
            var tmp = n_array[i];
            // linkPattern requires at least one digit, so this match cannot be null.
            var detail_id = tmp.match(idPattern)[0];
            if(result_array.indexOf(detail_id) >= 0){
                continue;
            }
            var altMatch = tmp.match(altPattern);
            if(altMatch === null){
                continue;
            }
            // Strip the leading `alt="` (5 characters) and the trailing quote.
            var book_name = altMatch[0].substring(5);
            book_name = book_name.substring(0,book_name.length-1);
            result_array.push(detail_id);
            searchDetail(detail_id,book_name);
        }
        return result_array;
    }
    
    /**
     * Read the stock count from a book detail page and print the book when
     * the count is positive.
     *
     * The page is expected to contain a call of the form
     * `startRequestBookBuyLink(a, b, c, <count>, ...)`. The fourth
     * comma-separated piece has its first two characters and its last
     * character removed before being treated as the count.
     * NOTE(review): presumably this strips a leading space plus opening quote
     * and a closing quote; confirm against the real page markup.
     *
     * Fixes over the previous version:
     *  - `re` and `n_array` were implicit globals (a ReferenceError in strict
     *    mode or ES modules); they are now locals.
     *  - the parentheses in the pattern are escaped again so they match
     *    literally instead of forming a capture group.
     *  - a page without the call, or with fewer than four arguments, is
     *    skipped instead of throwing a TypeError.
     *
     * @param {string} str - decoded HTML of the detail page.
     * @param {string} book_name - book title to print.
     * @param {string} url - detail page URL to print.
     * @returns {undefined} output goes to `console.log` only.
     */
    function getRealBookNumber(str,book_name,url){
        var callMatch = str.match(/startRequestBookBuyLink\(.*\)/);
        if(callMatch === null){
            return;
        }
        var callArgs = callMatch[0].split(',');
        if(callArgs.length < 4){
            return;
        }
        var num = callArgs[3];
        num = num.substring(2,num.length-1);
        // Explicit numeric conversion; non-numeric text becomes NaN and is skipped.
        if(Number(num) > 0){
            console.log("------------------------------------");
            console.log(book_name);
            console.log("数目:"+num+" url:"+url);
        }
    }
    
    /**
     * Run the initial search for a book name, print the total hit count and
     * page count, then request every result page via `searchEachPage`.
     *
     * Fixes over the previous version:
     *  - the "\n" escapes inside the log strings had been turned into real
     *    line breaks, leaving unterminated string literals; restored.
     *  - the page count was `(n/20+0.5).toFixed(0)`, which yields a string
     *    and rounds exact multiples of 20 up by one extra page (20 -> "2");
     *    it is now `Math.ceil(n/20)`.
     *  - a non-200 response is drained with `res.resume()` so the socket is
     *    released instead of waiting on an unread body.
     *
     * @param {string} query_book_name - already URL-encoded book name to put
     *     in the `bookName` query parameter.
     * @returns {undefined} results are reported through `console.log`.
     */
    function firstSearch(query_book_name){
        var path = '/search/result/default.aspx?isbn=&publisherName=&author=&bookName='+query_book_name;
        var options = getHtmlOptions(path);
        var req = http.request(options,function(res){
            if(res.statusCode !== 200){
                console.log("请求"+path+"发生错误了:请求返回非200,返回码:"+res.statusCode+"\n");
                // Discard the body so the connection can be reused or closed.
                res.resume();
                return;
            }

            var chunks = [];
            var size = 0;

            res.on('data',function(chunk){
                chunks.push(chunk);
                size += chunk.length;
            });

            res.on('end',function(){
                // The body is decoded as gb2312 once all chunks have arrived.
                var returnMsg = Buffer.concat(chunks,size);
                var nreturnMsg = iconv.decode(returnMsg,'gb2312');
                var allNumber = getAllNumber(nreturnMsg);
                var total = parseInt(allNumber,10);
                if(total > 0){
                    // The original page-count formula divided by 20; that page size is kept.
                    var allPage = Math.ceil(total/20);
                    console.log("该图书数目为:"+allNumber);
                    console.log("共有页数:"+allPage);
                    for(var i = 1; i<=allPage; i++){
                        searchEachPage(query_book_name,i,allNumber);
                    }
                }else{
                    console.log("该图书数目为0."+"\n");
                }
            });
        });

        req.on('error', function(e) {
            console.log('problem with request: ' + e.message);
        });

        req.end();
    }
    
    /**
     * Fetch one page of search results and hand the decoded HTML to
     * `getDetailList`.
     *
     * Fixes over the previous version:
     *  - the "\n" escape inside the error log string had been turned into a
     *    real line break, leaving an unterminated string literal; restored.
     *  - a non-200 response is drained with `res.resume()` so the socket is
     *    released instead of waiting on an unread body.
     *
     * @param {string} query_book_name - already URL-encoded book name.
     * @param {number} pageIndex - page number, sent as `pageIndex`.
     * @param {(string|number)} rowCount - total hit count, sent as `rowCount`.
     * @returns {undefined} results are reported through `console.log`.
     */
    function searchEachPage(query_book_name,pageIndex,rowCount){
        var path = "/search/result/?isbn=&publisherName=&author=&bookName="+query_book_name+"&pageIndex="+pageIndex+"&rowCount="+rowCount+"&searchIn=";
        var options = getHtmlOptions(path);
        var req = http.request(options,function(res){
            if(res.statusCode !== 200){
                console.log("请求"+path+"发生错误了:请求返回非200,返回码:"+res.statusCode+"\n");
                // Discard the body so the connection can be reused or closed.
                res.resume();
                return;
            }

            var chunks = [];
            var size = 0;

            res.on('data',function(chunk){
                chunks.push(chunk);
                size += chunk.length;
            });

            res.on('end',function(){
                // The body is decoded as gb2312 once all chunks have arrived.
                var returnMsg = Buffer.concat(chunks,size);
                var nreturnMsg = iconv.decode(returnMsg,'gb2312');
                getDetailList(nreturnMsg);
            });
        });

        req.on('error', function(e) {
            console.log('problem with request: ' + e.message);
        });

        req.end();
    }
    
    
    /**
     * Fetch the detail page of a single book and pass the decoded HTML to
     * `getRealBookNumber`, which prints the book when it is in stock.
     *
     * Fixes over the previous version:
     *  - the "\n" escape inside the error log string had been turned into a
     *    real line break, leaving an unterminated string literal; restored.
     *  - a non-200 response is drained with `res.resume()` so the socket is
     *    released instead of waiting on an unread body.
     *
     * @param {string} detail_number - numeric id of the book; the page is
     *     requested at "/" + detail_number.
     * @param {string} book_name - book title forwarded for printing.
     * @returns {undefined} results are reported through `console.log`.
     */
    function searchDetail(detail_number,book_name){
        var path = "/"+detail_number;
        var options = getHtmlOptions(path);
        var req = http.request(options,function(res){
            if(res.statusCode !== 200){
                console.log("请求"+path+"发生错误了:请求返回非200,返回码:"+res.statusCode+"\n");
                // Discard the body so the connection can be reused or closed.
                res.resume();
                return;
            }

            var chunks = [];
            var size = 0;

            res.on('data',function(chunk){
                chunks.push(chunk);
                size += chunk.length;
            });

            res.on('end',function(){
                // The body is decoded as gb2312 once all chunks have arrived.
                var returnMsg = Buffer.concat(chunks,size);
                var nreturnMsg = iconv.decode(returnMsg,'gb2312');
                getRealBookNumber(nreturnMsg,book_name,"http://www.youlu.net/"+detail_number);
            });
        });

        req.on('error', function(e) {
            console.log('problem with request: ' + e.message);
        });

        req.end();
    }
    
    // Script entry point: encode the search keyword with the local
    // gb2312_url_encode helper and start the search.
    var encodedKeyword = url.URLEncode('java');
    firstSearch(encodedKeyword);
  • 相关阅读:
    泛式之争
    测试的本质
    动态语言与静态语言
    对象之间的关系
    关于“重复”的一段交流
    装饰器与子类化
    类的设计质量
    抽象跟难
    Unity经典游戏编程之:球球大作战
    关于Unity 中对UGUI制作任务系统的编程
  • 原文地址:https://www.cnblogs.com/dyf6372/p/3536821.html
Copyright © 2020-2023  润新知