##### 爬取古诗文
import re
import requests
def parse_page(url):
    """Fetch one poem listing page and print every poem found on it.

    Each poem is printed as a dict with the keys ``titles``, ``authors``,
    ``dynaties`` and ``content_tags``, followed by a line of 40 asterisks.

    Args:
        url: Address of the listing page to download.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            does not complete within the timeout.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
    }
    # `headers` must be passed by keyword: the second positional parameter
    # of requests.get() is `params`, which would put the User-Agent into
    # the query string instead of the request headers.
    # The timeout keeps an unresponsive server from blocking forever.
    response = requests.get(url, headers=headers, timeout=10)
    for poem in _extract_poems(response.text):
        print(poem)
        print('*' * 40)


def _extract_poems(html):
    """Return one dict per poem (title, author, dynasty, text) found in html."""
    # re.DOTALL lets '.' cross newlines, because each poem's markup spans
    # several lines of the page.
    titles = re.findall(r'<div\sclass="cont">.*?<b>(.*?)</b>', html, re.DOTALL)
    authors = re.findall(
        r'<p\sclass="source">.*?<a\s.*?>(.*?)</a>', html, re.DOTALL)
    # The dynasty is the text of the second link inside the "source" paragraph.
    dynasties = re.findall(
        r'<p\sclass="source">.*?<a\s.*?><a\s.*?>(.*?)</a>', html, re.DOTALL)
    raw_bodies = re.findall(
        r'div\sclass="contson"\sid=.*?>(.*?)</div>', html, re.DOTALL)
    # Drop any tags nested in the poem body and trim surrounding whitespace.
    contents = [re.sub(r'<.*?>', '', body).strip() for body in raw_bodies]
    # zip() stops at the shortest list, so a page where one pattern matched
    # fewer times than the others yields correspondingly fewer records.
    return [
        {
            'titles': title,
            'authors': author,
            'dynaties': dynasty,
            'content_tags': content,
        }
        for title, author, dynasty, content
        in zip(titles, authors, dynasties, contents)
    ]
def main():
    """Scrape gushiwen.cn listing pages 1 through 9, one after another."""
    url_template = 'https://www.gushiwen.cn/default_%d.aspx'
    for page_number in range(1, 10):
        parse_page(url_template % page_number)
# Start scraping only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
# 正则爬取糗事百科段子
import re
import requests
def parse_page(url):
    """Fetch one joke listing page and print every joke found on it.

    Each joke is printed as a dict with the single key ``title`` whose
    value is the joke text with markup removed.

    Args:
        url: Address of the listing page to download.

    Raises:
        requests.exceptions.RequestException: If the download fails or
            does not complete within the timeout.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36'
    }
    # The timeout keeps an unresponsive server from blocking forever.
    response = requests.get(url, headers=headers, timeout=10)
    for joke in _extract_jokes(response.text):
        print(joke)


def _extract_jokes(html):
    """Return one ``{"title": text}`` dict per joke found in html."""
    # re.DOTALL (alias re.S) lets '.' match newlines as well, which is
    # needed because a joke's markup spans several lines.
    fragments = re.findall(
        r'<div\sclass="content">.*?<span>(.*?)</span>', html, re.DOTALL)
    # Iterate the fragments directly: wrapping a single list in zip() would
    # yield 1-tuples and store ('text',) instead of the text itself.
    return [
        {"title": re.sub(r'<.*?>', "", fragment).strip()}
        for fragment in fragments
    ]
def main():
    """Scrape qiushibaike.com text pages 1 through 9, one after another."""
    url_template = 'https://www.qiushibaike.com/text/page/%d/'
    for page_number in range(1, 10):
        parse_page(url_template % page_number)
# Start scraping only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()