Python 学习之小说爬虫

# coding:utf8
import codecs
import multiprocessing
import os
import time
from multiprocessing.dummy import Pool as ThreadPool

import requests
from lxml import etree

 7 url = 'https://www.biquge5200.cc/79_79883/'  # 要下载的小说章节列表页面url
 8 
 9 
def getsource(url, timeout=10):
    """Download *url* and return its body decoded as GBK.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout a stalled connection would block the caller forever.

    Returns:
        The decoded page text, or ``None`` if the request failed.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Best effort: report the failure and let the caller decide whether
        # to retry. Only network-level errors are caught, so programming
        # errors and KeyboardInterrupt still propagate.
        print('访问异常，跳过~！')
        return None
    # Force GBK decoding instead of relying on the encoding requests guesses.
    response.encoding = 'gbk'
    return response.text


def getlist(url, skip=9):
    """Parse the chapter-list page and fill in the module-level download state.

    Sets the globals ``txtname`` (text of the ``#info`` heading with the
    characters ``\\ / : * ? " < > |`` removed) and ``txtzz`` (first text node
    of the first ``<p>`` under ``#info`` -- presumably the author line), and
    appends one ``'href|title|index'`` string per chapter link to the global
    ``urllist``.

    Args:
        url: Address of the chapter-list page.
        skip: Number of leading links under ``#list`` to ignore.
            NOTE(review): the default of 9 presumably skips a "latest
            chapters" teaser that repeats links from the end of the list --
            confirm against the site layout.

    Raises:
        RuntimeError: If the page could not be downloaded.
        IndexError: If the page has no ``#info`` heading or paragraph text.
    """
    global txtname, txtzz
    html = getsource(url)
    if not html:
        raise RuntimeError('failed to download chapter list: ' + url)
    page = etree.HTML(html)

    # The title is later used as a directory and file name, so drop the
    # characters that Windows does not allow in file names.
    forbidden = str.maketrans('', '', '\\/:*?"<>|')
    txtname = page.xpath('//*[@id="info"]/h1/text()')[0].translate(forbidden)
    txtzz = page.xpath('//*[@id="info"]/p[1]/text()')[0].replace('\xa0', '')

    # Read href and title from the same <a> element so the two can never get
    # out of step with each other.
    anchors = page.xpath('//*[@id="list"]/dl/dd/a[@href]')
    for num, anchor in enumerate(anchors[skip:]):
        entry = anchor.get('href') + '|' + (anchor.text or '') + '|' + str(num)
        urllist.append(entry)
        print(entry)


def downtxt(url, max_retries=5, retry_delay=1.0):
    """Download one chapter and save it as ``<index>.txt`` under ``savepath``.

    A chapter whose file already exists is skipped without any network
    access, so an interrupted run can be resumed.

    Args:
        url: An ``'href|title|index'`` entry in the format stored in the
            global ``urllist``.
        max_retries: How many times to try fetching the chapter before
            giving up.
        retry_delay: Seconds to sleep between two attempts.

    Raises:
        RuntimeError: If no chapter text could be extracted after
            ``max_retries`` attempts.
    """
    global downcount
    # The title is free text and may itself contain '|', so take the href
    # from the front and the index from the back.
    href, rest = url.split('|', 1)
    title, num = rest.rsplit('|', 1)

    target = os.path.join(savepath, num + '.txt')
    if os.path.exists(target):
        print(num + '.txt 已经存在!')
        return

    # Applied in order: indentation runs become line breaks, then characters
    # and leftovers that should not appear in the saved text are removed.
    replacements = (
        ('    ', '\n'),
        ('　　', '\n'),
        ('\xa0', ''),
        ('\ufffd', ''),
        ('\u266a', ''),
        ('readx;', ''),
    )

    content = ''
    for attempt in range(1, max_retries + 1):
        html = getsource(href)
        page = etree.HTML(html) if html else None
        if page is not None:
            content = page.xpath('string(//*[@id="content"])')
            for old, new in replacements:
                content = content.replace(old, new)
        if content:
            break
        if attempt < max_retries:
            time.sleep(retry_delay)
    else:
        raise RuntimeError('failed to download chapter: ' + title + ' (' + href + ')')

    # The file is written in the platform default encoding. errors='ignore'
    # drops characters that encoding cannot represent instead of aborting
    # the chapter with UnicodeEncodeError.
    with open(target, 'a', errors='ignore') as f:
        f.write('\n' + title + '\n' + content)
    print(title + ' 下载完成!')
    # NOTE(review): this counter is a plain global; if downtxt runs on several
    # threads the increment is not protected by a lock.
    downcount += 1


# ---------------------------------------------------------------------------
# Script body: collect the chapter list, download every chapter on a thread
# pool, then merge the per-chapter files into a single text file.
# ---------------------------------------------------------------------------
time_start = time.time()
downcount = 0
urllist = []
getlist(url)

# savepath is used as a plain string prefix (savepath + file name), so it
# must end with a path separator; joining with '' appends one.
savepath = os.path.join(os.getcwd(), txtname, '')
os.makedirs(savepath, exist_ok=True)

pool = ThreadPool(multiprocessing.cpu_count())
try:
    results = pool.map(downtxt, urllist)
finally:
    # Always release the worker threads, even when a download raised.
    pool.close()
    pool.join()

print('开始合并txt...')
# Open with 'w' so that re-running the script replaces the merged file
# instead of appending a second copy of the novel to it.
merged_path = os.path.join(savepath, txtname + '.txt')
with open(merged_path, 'w') as merged:
    merged.write(txtname)
    merged.write('\n')
    merged.write(txtzz)
    merged.write('\n')
    for i in range(len(urllist)):
        chapter_path = os.path.join(savepath, str(i) + '.txt')
        with open(chapter_path, 'r') as chapter:
            merged.write(chapter.read())
        merged.write('===========================')
        # Remove the chapter file only after it has been closed.
        os.remove(chapter_path)
print('小说合并完成~！')

print('')
print('*' * 15 + ' 任务完成，结果如下：' + '*' * 15)
print('')
print('<' + txtname + '> 下载完成' + '，获取并下载章节页面：' + str(downcount) + ' 个')
print('')
print('耗时：' + str(time.time() - time_start) + ' s')
print('')
print('*' * 51)

相关阅读:
iOS设备后台播放音乐方法
 iOS 编译64位FFMPEG
os8 location authorization 错误.
IOS 使用新浪微博SDK
IOS 解析歌词lrc
IOS 通过button获取cell
IOS 解析XML文档
 OC .(点）与->(箭头）用法区别
 黑苹果安装合集
 Hello，World
原文地址：https://www.cnblogs.com/hfct/p/10977974.html