• nodejs 小爬虫


    编写爬虫示例:

    // Node's built-in HTTP client; used further down to download the page.
    var http = require('http');
    // HTML parser with a jQuery-style selector API; used by filterblogs to query the markup.
    var cheerio = require('cheerio');
    
    // Address of the blog page that gets downloaded and parsed.
    var url = 'http://www.cnblogs.com/tianxintian22/';
    
    /**
     * Pull the post id out of a post URL of the form ".../p/<id>.html".
     *
     * @param {string|undefined} href - Link target taken from the page; may be missing.
     * @returns {string} The id without the ".html" suffix, or '' when href is
     *     missing or does not contain a "/p/<id>" segment at its end.
     */
    function extractPostId(href) {
        var match = /\/p\/([^\/?#]+?)(?:\.html)?(?:[?#].*)?$/.exec(href || '');
        return match ? match[1] : '';
    }

    /**
     * Parse the blog list page and collect the posts grouped by day.
     *
     * Result shape:
     * [{
     *     dayTitle: '',
     *     dayCont: [{
     *         postId: '',
     *         postTitle: '',
     *         postCont: ''
     *     }]
     * }]
     *
     * @param {string} html - Markup of the downloaded page.
     * @returns {Array} One object per `.day` element, each holding one
     *     `dayCont` entry per `.postTitle` found inside that day.
     */
    function filterblogs(html) {
        var $ = cheerio.load(html);
        var blogDatas = [];

        $('.day').each(function() {
            var day = $(this);
            var blogData = {
                dayTitle: day.find('.dayTitle a').text(),
                dayCont: []
            };

            // A day can hold several posts. Titles and summaries are paired by
            // position so each post gets its own entry instead of being merged.
            // NOTE(review): assumes every .postTitle has a matching .postCon in
            // the same order — confirm against the page template.
            var summaries = day.find('.postCon');

            day.find('.postTitle').each(function(index) {
                var titleLink = $(this).find('a');
                var summary = summaries.eq(index).find('.c_b_p_desc');

                // Prefer the link inside the summary (as before); fall back to
                // the title link so a summary without a link no longer throws.
                var href = summary.find('a').attr('href') || titleLink.attr('href');

                blogData.dayCont.push({
                    postId: extractPostId(href),
                    postTitle: titleLink.text(),
                    postCont: summary.text()
                });
            });

            blogDatas.push(blogData);
        });

        return blogDatas;
    }
    
    /**
     * Print the collected blog data to the console.
     *
     * For every day the day title is printed, followed by two lines per post:
     * "【postId】postTitle" and the post summary, each indented by four spaces.
     * Every printed string ends with an extra '\n' so entries are separated by
     * a blank line.
     *
     * @param {Array} blogDatas - Array of { dayTitle, dayCont: [{ postId, postTitle, postCont }] }.
     */
    function printBlogInfo(blogDatas) {
        blogDatas.forEach(function(item) {
            console.log(item.dayTitle + '\n');

            item.dayCont.forEach(function(blog) {
                console.log('    【' + blog.postId + '】' + blog.postTitle + '\n');
                console.log('    ' + blog.postCont + '\n');
            });
        });
    }
    
    // Download the page at `url`, then parse it with filterblogs and print the
    // result with printBlogInfo once the whole body has arrived.
    http.get(url, function (res) {
        // Anything other than 200 (redirect, error page, ...) is not the blog
        // list, so report it instead of parsing it.
        if (res.statusCode !== 200) {
            console.log('获取博客数据出错', 'HTTP ' + res.statusCode);
            // Drain the response so the socket is released.
            res.resume();
            return;
        }

        var html = '';

        // Decode as a UTF-8 stream; otherwise each Buffer chunk is stringified
        // on its own and a multi-byte character split across two chunks is
        // corrupted.
        res.setEncoding('utf8');

        res.on('data', function(data) {
            html += data;
        });

        res.on('end', function() {
            var blogDatas = filterblogs(html);
            printBlogInfo(blogDatas);
        });
    }).on('error', function(err) {
        // Include the underlying reason so network failures can be diagnosed.
        console.log('获取博客数据出错', err && err.message);
    });
  • 相关阅读:
    js 特殊字符处理
    sql server 查询新增
    idea 很多基础文件找不到
    js 千分位
    Navicat Premium 12新增标识列
    Javascript 树形菜单 (11个)
    Javascript调用后台方法
    Treeview绑定数据库
    Repeater实现GridView编辑修改模式
    如何用JS获取键盘上任意按键的值?兼容FireFox和IE js获取键盘ASCII码?js键盘事件全面控制
  • 原文地址:https://www.cnblogs.com/tianxintian22/p/5121530.html
Copyright © 2020-2023  润新知