# 爬虫之 段子网 (web scraper for the ishuo.cn joke site)
# Pages fetched: https://ishuo.cn/{i} for i in ['duanzi','yulu','joke','xiaozhishi','duanyu']
import requests
import re
# Sub-pages of https://ishuo.cn/ that are scraped, in this order.
PAGES = ('duanzi', 'yulu', 'joke', 'xiaozhishi', 'duanyu')

# "." matches any character except a newline (no re.S flag is used) and "*?"
# is a non-greedy "zero or more", so each capture stops at the first closing
# tag on the same line.
CONTENT_RE = re.compile(r'<div class="content">(.*?)</div>')
TITLE_RE = re.compile(r'<a href="/subject/.*?">(.*?)</a>')

# Only matches 10..59 of the "/subject/" links are kept as titles.
# NOTE(review): presumably the first ten links are not entry titles and a
# page holds at most fifty entries -- confirm against the live markup.
TITLE_SLICE = slice(10, 60)

# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10


def extract_title_content(html: str) -> dict:
    """Return a ``{title: content}`` mapping parsed from one page of HTML.

    Titles are the link texts matched by ``TITLE_RE`` restricted to
    ``TITLE_SLICE``; contents are every match of ``CONTENT_RE``. The two
    lists are paired positionally. Pairing stops at the shorter list, so a
    page with fewer content blocks than titles no longer raises
    ``IndexError``. If a title occurs twice, the later content wins.
    """
    contents = CONTENT_RE.findall(html)
    titles = TITLE_RE.findall(html)[TITLE_SLICE]
    return dict(zip(titles, contents))


def scrape_page(page: str) -> dict:
    """Download ``https://ishuo.cn/<page>`` and return its title/content map.

    Raises whatever ``requests.get`` raises on connection failure or timeout.
    """
    response = requests.get(f'https://ishuo.cn/{page}', timeout=REQUEST_TIMEOUT)
    return extract_title_content(response.text)


def main() -> None:
    """Scrape every page in ``PAGES`` and print one padded row per entry."""
    for page in PAGES:
        for title, content in scrape_page(page).items():
            # Title is left-aligned to 40 columns, content to 1000 columns.
            print(f'{title:<40} | {content:<1000}')


if __name__ == '__main__':
    main()