# -*- coding: utf-8 -*-
# author: zxy
# Date: 2018-9-24
"""Scrape text posts from qiushibaike.com and append them to qiushi.txt."""

import re
import requests

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko)'
                  ' Chrome/67.0.3396.99 Safari/537.36'
}

# Captures the inner HTML of the first <span> that follows each
# <div class="content"> opening tag. re.S lets '.' span line breaks.
_CONTENT_RE = re.compile(r'<div\s+class="content">.*?<span>(.*?)</span>', re.S)

# Matches any single HTML tag (e.g. <br/>) left inside a captured post.
_TAG_RE = re.compile(r'<.*?>')

# Seconds to wait for the server before giving up on a request.
_REQUEST_TIMEOUT = 10


def parse_page(url):
    """Download one listing page and append its posts to ``qiushi.txt``.

    Each post is written on its own line, followed by a rule of 50 ``=``
    characters and three line breaks. The file is opened in append mode
    (UTF-8) relative to the current working directory, and is not touched
    when the page yields no posts.

    Args:
        url: Address of the listing page to fetch.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, headers=HEADERS, timeout=_REQUEST_TIMEOUT)
    contents = _CONTENT_RE.findall(response.text)
    if not contents:
        # Nothing matched: leave the output file alone.
        return

    # Open once per page rather than once per post.
    with open('qiushi.txt', 'a', encoding='utf-8') as f:
        for content in contents:
            # Replace embedded tags such as <br/> with a space.
            text = _TAG_RE.sub(' ', content)
            f.write(text.strip())
            f.write('\n')
            f.write('=' * 50)
            f.write('\n' * 3)


def main():
    """Scrape listing pages 1 through 3."""
    for page in range(1, 4):
        url = "https://www.qiushibaike.com/text/page/%s/" % page
        parse_page(url)


if __name__ == '__main__':
    main()
效果如下所示: