0x00代码
#coding:utf-8
import requests
import re
def parse_page(url):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'
}
response = requests.get(url,headers=headers)
text = response.text
#print(response.text)
titles = re.findall(r'<td>.*?<a.*?>(.*?)</a>.*?</td>',text,re.DOTALL)
#print(titles)
authors = re.findall(r'<td style="font-weight:bold;">.*?<a.*?>(.*?)</a>',text,re.DOTALL)
#print(authors)
content_tags = re.findall(r'<td style="font-weight:bold;">.*?<td>(.*?)</td>',text,re.DOTALL)
#print(content_tags)
contents = []#建立一个列表
for content in content_tags:
x = re.sub(r'<.*?>',"",content)
contents.append(x.strip())
#print(contents)此处就是去掉多余的标签,我们只要内容。
for value in zip(titles,authors,contents):#将目标转换成一一对应的数组
title,author,content = value #进行解包
#将下面封装的字典装在列表里
poems = []
poem = { #将我们得完整古诗封装在字典里
'title':title,
'author':author,
'content':content
}
poems.append(poem)#append() 方法用于在列表末尾添加新的对象。
for poem in poems:
print(poem)#打印我们得列表
print('='*40)
def main():
url = "https://fanti.dugushici.com/ancient_proses/query?page=1"
for x in range(1,10):
url = "https://fanti.dugushici.com/ancient_proses/query?page=%s" %x
parse_page(url)
if __name__ == "__main__":
main()
0x01运行效果
到这里正则基本已经掌握了,接下来继续深入学习爬虫,明天开始看python官方文档。