• 使用 Node.js 的 http 模块和 cheerio 模块实现一个小爬虫。


    代码:

     // Entry point of the crawler: download the course page, extract its
     // chapter/video outline with filterChapters, then print it.
     var http = require("http");

     var cheerio = require("cheerio");


     // Course page whose chapter list is scraped.
     var url = 'http://www.imooc.com/learn/348';


     http.get(url, function(res){
         // Anything other than 200 (redirect, error page, ...) does not carry
         // the course markup, so report it instead of parsing it.
         if (res.statusCode !== 200) {
             console.error("获取课程数据出错!", 'HTTP ' + res.statusCode);
             // Drain the unread body so the underlying socket is released.
             res.resume();
             return;
         }

         // Have the stream decode chunks itself: concatenating raw Buffers
         // corrupts a multi-byte character that is split across two chunks.
         // NOTE(review): assumes the page is served as UTF-8 - confirm.
         res.setEncoding('utf8');

         var html = '';

         res.on('data', function(chunk){
             html += chunk;
         });

         res.on('end', function(){
             var courseData = filterChapters(html);

             // Human-readable outline first, then the raw structure.
             printCourseInfo(courseData);
             console.log(courseData);
         });
     }).on('error', function(err){
         // Include the underlying reason so failures can be diagnosed.
         console.error("获取课程数据出错!", err.message);
     });
    25 
    /**
     * Extract the chapter/video outline from the course page markup.
     *
     * Relies on the `cheerio` binding created by the file's require lines.
     *
     * @param {string} html - HTML of the course page.
     * @returns {Array<{chapterTitle: string, videos: Array<{title: string, id: (string|undefined)}>}>}
     *     One entry per `.chapter` element, in document order. `id` is the part
     *     of the video link after `video/`, or undefined when the link has no
     *     such segment.
     */
    function filterChapters(html)
    {
        var $ = cheerio.load(html);

        // Strips line breaks from extracted text. The published listing had this
        // regex broken across physical lines (its escapes were rendered as real
        // newlines), so this pattern is a reconstruction.
        // NOTE(review): confirm against the original post if exact parity matters.
        var lineBreaks = /[\r\n]/g;

        var courseData = [];

        // Every chapter on the page
        $('.chapter').each(function(){
            var chapter = $(this);
            var chapterTitle = chapter.find('h3 strong').text().replace(lineBreaks, "").trim();
            var videos = chapter.find(".video").children('li');

            var chapterData = {
                chapterTitle: chapterTitle,
                videos: []
            };

            videos.each(function(){
                var video = $(this).find('.J-media-item');

                // A list item without a media link has nothing to record; calling
                // .split() on its missing href would throw a TypeError.
                if (video.length === 0) {
                    return;
                }

                var videoTitle = video.text().replace(lineBreaks, "").trim();
                var href = video.attr('href') || '';
                var id = href.split('video/')[1];

                chapterData.videos.push({
                    title: videoTitle,
                    id: id
                });
            });

            courseData.push(chapterData);
        });

        return courseData;
    }
    61 
    62 
    /**
     * Print the course outline to the console.
     *
     * Each chapter title is logged followed by an extra blank line, then one
     * indented `   [id]title` line per video in that chapter.
     *
     * @param {Array<{chapterTitle: string, videos: Array<{title: string, id: string}>}>} courseData
     *     Chapters as returned by filterChapters.
     * @returns {void}
     */
    function printCourseInfo(courseData)
    {
        courseData.forEach(function(item){
            var chapterTitle = item.chapterTitle;
            // The trailing '\n' adds a blank line after the title; in the published
            // listing this escape had been rendered as a real line break, leaving
            // an unterminated string literal.
            console.log(chapterTitle + '\n');

            item.videos.forEach(function(video){
                console.log('   [' + video.id + ']' + video.title);
            });
        });
    }

    运行:

    ----------------------------------------------------------------------

    参考链接:

  • 相关阅读:
    文件包含漏洞
    命令执行漏洞详解
    CSRF跨站请求伪造
    XSS跨站脚本攻击详解
    SQL盲注
    字符型注入、数字型注入、搜索型注入
    AWVS13扫描类型profile_id对照表
    Cobalt Strike 和 Metasploit Framework 联动
    msf常用命令
    上传嵌入式python环境进行渗透测试
  • 原文地址:https://www.cnblogs.com/cbza/p/7281367.html
Copyright © 2020-2023  润新知