使用 Node.js 实现的小说爬虫

 1 // Import modules
 2 const http = require('http')
 3 const fs = require('fs')
 4 const cheerio = require('cheerio')
 5 const iconv = require('iconv-lite')
 6 // URL of the first chapter
 7 const url = 'http://www.81zw.com/book/8634/745331.html'
 8 // Current chapter number, starting at 1; used as the chapter id and incremented after each chapter
 9 let i = 1
10 // Maximum number of chapters to fetch
11 let num = 100
12 
/**
 * Entry point of the crawler: hands the given chapter URL to startRequest().
 *
 * @param {string} url - URL of the chapter page to fetch.
 */
function main(url) {
    startRequest(url);
}
16 
/**
 * Download one chapter page, save it as JSON and schedule the next chapter.
 *
 * The page is fetched over HTTP, decoded from GBK, parsed with cheerio and
 * passed to saveContent(). While the module-level counter `i` has not passed
 * `num`, the page's "next chapter" link is requested 100 ms later via main().
 *
 * Failures (network error, non-200 response, page without a `#content`
 * element, missing next-chapter link) are logged and end the crawl instead of
 * surfacing as uncaught exceptions.
 *
 * @param {string} url - Absolute URL of the chapter page to download.
 * @returns {void}
 */
function startRequest(url) {
    const request = http.get(url, res => {
        if (res.statusCode !== 200) {
            console.error(`Request failed with status ${res.statusCode}: ${url}`)
            // Consume the body so the connection is released.
            res.resume()
            return
        }

        // Raw Buffer chunks of the response body.
        const chunks = []
        res.on('data', chunk => {
            chunks.push(chunk)
        })
        res.on('end', () => {
            // The site serves GBK; decode the concatenated Buffer with iconv-lite.
            const page = iconv.decode(Buffer.concat(chunks), 'gbk')
            // cheerio offers a jQuery-like API over the parsed HTML.
            const $ = cheerio.load(page, {decodeEntities: false})

            const content = $("#content").html()
            if (content == null) {
                // No chapter body on this page (error page, layout change, ...).
                console.error(`No #content element found: ${url}`)
                return
            }

            const title = $('.bookname h1').text()
            const bookName = $(".con_top a").eq(2).text()
            // Paragraphs are separated by double <br>; clean each one with trim().
            const paragraphs = content.split('<br><br>').map(part => trim(part))

            // Record that gets persisted for this chapter.
            const obj = {
                id: i,
                err: 0,
                bookName: bookName,
                title: title,
                content: paragraphs
            }

            // Second-to-last path segment of the chapter URL is the book id.
            const segments = url.split('/')
            const bookId = segments[segments.length - 2]
            const link = $(".bottem2 a").eq(2).attr('href')
            if (!link) {
                // Save what we have, but there is nothing further to follow.
                saveContent(obj, undefined)
                console.error(`No next-chapter link found: ${url}`)
                return
            }

            // Address of the next chapter, fetched by calling main() again below.
            const nextLink = `http://www.81zw.com/book/${bookId}/${link}`
            saveContent(obj, nextLink)
            console.log(`第${i + 1}章：${nextLink}`)
            i++
            if (i <= num) {
                setTimeout(() => {
                    main(nextLink)
                }, 100)
            }
        })
    })

    // Without this listener a DNS/connection failure is an uncaught 'error' event.
    request.on('error', err => {
        console.error(`Request error for ${url}: ${err.message}`)
    })
}
66 
/**
 * Write one chapter to `./data/<bookName>/chapter<id>.json`.
 *
 * The book directory (and `data/` itself) is created when missing. The file
 * name is taken from `obj.id`, so it does not depend on the module-level
 * counter at the time of the call.
 *
 * @param {{id: number, err: number, bookName: string, title: string, content: string[]}} obj
 *        Chapter record to serialize.
 * @param {string} nextLink - URL of the following chapter; not used here.
 * @returns {void}
 * @throws {Error} Synchronously if the directory cannot be created; a write
 *         failure is rethrown from the writeFile callback.
 */
function saveContent(obj, nextLink) {
    console.log(`${obj.id}--${obj.title}`)
    const dir = `./data/${obj.bookName}`
    // recursive: true creates missing parents and is a no-op if the directory exists.
    fs.mkdirSync(dir, {recursive: true})
    // Persist the chapter as a JSON file.
    fs.writeFile(`${dir}/chapter${obj.id}.json`, JSON.stringify(obj), 'utf-8', err => {
        if (err) throw err
    })
}
78 
/**
 * Clean one fragment of chapter HTML: remove every `&nbsp;` entity, then
 * strip leading and trailing whitespace.
 *
 * Entities are removed first so that whitespace which was only separated
 * from the edge by `&nbsp;` is stripped as well.
 *
 * @param {string} str - Raw text fragment.
 * @returns {string} The cleaned fragment.
 */
function trim(str) {
    return str.replace(/&nbsp;/g, '').trim()
}
82 
83 main(url) // Start crawling from the first chapter URL

生成文件：运行后，每一章会保存为 data/<书名>/chapter<章节号>.json

相关阅读:
spring的@Transactional注解详细用法
 java中线程池的使用方法
 Java中多线程使用匿名内部类的方式进行创建3种方式
 RISC-V指令集的诞生，"V"也表示变化(variation)和向量(vectors)
雷军致全员公开信：明天，让我们一起见证伟大时刻！（估值543亿美元，中国的山寨机已被彻底消灭，通过生态链产品改变了100多个行业，全面推动了商业效率的提升，上市仅仅是小米新的开始）
C语言利用 void 类型指针实现面向对象类概念与抽象
 Google、Mozilla、Qt、LLVM 这几家的规范是明确禁用异常的
 Qt程序调试之Q_ASSERT断言（它是一个宏，接受布尔值，当其中的布尔值为真时，便什么也不做）
ES 1.7安装ik分词elasticsearch-analysis-ik-1.2.5中文同义词实现
 Windows 64下elasticsearch-1.7.1集群安装、启动、停止
原文地址：https://www.cnblogs.com/tgxh/p/6754649.html