通过 requests 和 re(正则表达式)爬取“古诗文”网页数据。
详细代码如下:
#!/usr/bin/env python
# author: Simple-Sir
# time: 2019/7/31 22:01
# Scrape poem data from the gushiwen.org listing pages.
import re

# Patterns are compiled once at import time instead of on every call.
# re.DOTALL makes "." match newlines too, since each poem's markup spans
# several lines of HTML.
_TITLE_RE = re.compile(r'<div class="cont">.*?<b>(.*?)</b>', re.DOTALL)
_DYNASTY_RE = re.compile(r'<p class="source">.*?<a.*?>(.*?)</a>', re.DOTALL)
_AUTHOR_RE = re.compile(
    r'<p class="source">.*?<a.*?>.*?<a.*?>(.*?)</a>', re.DOTALL
)
_CONTENT_RE = re.compile(r'<div class="contson".*?>(.*?)</div>', re.DOTALL)
_TAG_RE = re.compile('<.*?>')


def getHtml(page, timeout=10):
    """Download one listing page and return its HTML as text.

    :param page: page number substituted into the listing URL
    :param timeout: seconds to wait for the server before giving up
    :return: the response body decoded as text
    :raises requests.RequestException: on timeout, connection failure,
        or an HTTP error status
    """
    # Imported here so that getText() stays importable (and testable) in
    # environments where the third-party HTTP client is not installed.
    import requests

    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    url = 'https://www.gushiwen.org/default_{}.aspx'.format(page)
    # Without a timeout requests can block forever on a stalled connection.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as poems.
    response.raise_for_status()
    return response.text


def getText(html):
    """Extract the poems found in a listing page.

    :param html: page HTML as text
    :return: list of dicts, one per poem, with the keys '标题' (title),
        '朝代' (dynasty), '作者' (author) and '内容' (content with the
        HTML tags removed and surrounding whitespace stripped)

    The four fields are matched independently and paired up by position;
    zip() stops at the shortest list, so surplus matches are dropped.
    """
    titles = _TITLE_RE.findall(html)
    dynasties = _DYNASTY_RE.findall(html)  # first link in the source line
    authors = _AUTHOR_RE.findall(html)  # second link in the source line
    # Remove inline tags such as <br /> and <p> from each poem body.
    contents = [
        _TAG_RE.sub('', raw).strip() for raw in _CONTENT_RE.findall(html)
    ]
    return [
        {'标题': title, '朝代': dynasty, '作者': author, '内容': content}
        for title, dynasty, author, content in zip(
            titles, dynasties, authors, contents
        )
    ]


def main():
    """Ask how many pages to fetch, then print every poem page by page."""
    p = int(input('您想要获取多少页的数据? '))
    for page in range(1, p + 1):
        print('第{}页数据:'.format(page))
        for poem in getText(getHtml(page)):
            print(poem)


if __name__ == '__main__':
    main()
执行结果: