from urllib import request
from urllib.parse import urljoin

import requests
from lxml import etree

# Module-level state shared by the functions below: the HTTP request headers
# and the output file handle.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.44',
}
# NOTE: the file is opened in 'w' mode when the module is loaded, so an
# existing output file is truncated at import time. sto() writes to this
# handle and the __main__ guard closes it.
file = open('./古诗名句.txt', 'w', encoding='utf-8')


# Fetch the listing pages and hand each page's HTML to clean().
def index():
    """Download listing pages 1 through 11 and pass each one to ``clean``.

    Each request uses the module-level ``headers`` and a timeout so a stalled
    connection cannot block the crawl forever. Pages answered with an HTTP
    error status are reported and skipped instead of being parsed.

    Raises:
        requests.RequestException: if a request times out or the connection
            fails.
    """
    for num in range(1, 12):
        # The ``p`` query parameter selects the page number.
        base_url = 'https://so.gushiwen.cn/mingju/default.aspx?p={}&c=&t='.format(num)
        print('正在写入', base_url, '中的数据信息...')
        response = requests.get(base_url, headers=headers, timeout=10)
        if not response.ok:
            # An error page holds no entries; skip it explicitly rather than
            # feeding it to the parser.
            print('跳过', base_url, ': HTTP', response.status_code)
            continue
        response.encoding = 'utf-8'  # force UTF-8 when decoding the body
        clean(response.text)


# Extract the quote entries from one page of HTML and pass them to sto().
def clean(html):
    """Parse *html* and forward the extracted entries to ``sto``.

    For every ``<div class="cont">`` the first ``<a>`` child supplies the
    quote text and its link, and the second ``<a>`` child supplies the poem
    title. The three values are read per container so they stay aligned;
    three independent document-wide queries would shift against each other
    as soon as one entry lacked a link or title.

    Args:
        html: HTML source of a listing page.
    """
    # Blank input yields no usable tree, so there is nothing to extract.
    if not html or not html.strip():
        return
    tree = etree.HTML(html)
    if tree is None:
        return

    mingju_urls = []
    mingjus = []
    poem_titles = []
    for cont in tree.xpath('//div[@class="cont"]'):
        hrefs = cont.xpath('./a[1]/@href')
        # Join in case the anchor text is split into several text nodes.
        mingju = ''.join(cont.xpath('./a[1]/text()'))
        if not hrefs or not mingju:
            # Not a quote entry (no linked quote text); skip it.
            continue
        mingju_urls.append(hrefs[0])
        mingjus.append(mingju)
        # A missing second link gives an empty title rather than dropping
        # or misaligning the quote.
        poem_titles.append(''.join(cont.xpath('./a[2]/text()')))

    sto(mingju_urls, mingjus, poem_titles)


# Write the extracted entries to the output file.
def sto(Mingjus_urls, Mingjus, Poem_titles):
    """Write one two-line record per entry to the module-level ``file``.

    Each record is ``"<quote> <title>"`` on the first line and
    ``"名句网址:<absolute url>"`` on the second. The three sequences are
    consumed in lockstep, so iteration stops at the shortest one.

    Args:
        Mingjus_urls: hrefs of the quote pages (relative or absolute).
        Mingjus: quote texts.
        Poem_titles: poem titles matching the quotes.
    """
    for href, mingju, poem_title in zip(Mingjus_urls, Mingjus, Poem_titles):
        # urljoin resolves root-relative hrefs without producing a double
        # slash and leaves absolute URLs unchanged.
        mingju_url = urljoin('https://so.gushiwen.cn/', href)
        full_info = mingju + ' ' + poem_title + '\n' + '名句网址:' + mingju_url
        file.write(full_info + '\n')

if __name__ == '__main__':
    # Run the crawl; close the output file even if index() raises.
    try:
        index()
    finally:
        file.close()