# -*- coding: utf-8 -*-
#author:zxy
#Date:2018-10-19
"""Scrape poems from a gushiwen.org listing page and save them to poems.txt."""

import re

import requests

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/69.0.3497.100 Safari/537.36"
}

# Patterns are compiled once at import time. ``\s+`` matches the whitespace
# that separates the tag name from its ``class`` attribute; without it the
# patterns cannot match markup such as ``<div class="cont">``.
# DOTALL lets ``.`` cross line breaks inside a matched element.
_TITLE_RE = re.compile(r'<div\s+class="cont">.*?<b>(.*?)</b>', re.DOTALL)
_DYNASTY_RE = re.compile(r'<p\s+class="source">.*?<a.*?>(.*?)</a>', re.DOTALL)
_AUTHOR_RE = re.compile(r'<p\s+class="source">.*?<a.*?<a.*?>(.*?)</a>', re.DOTALL)
_CONTENT_RE = re.compile(r'<div\s+class="contson".*?>(.*?)</div>', re.DOTALL)
# Deliberately NOT DOTALL, matching the original tag-stripping substitution.
_TAG_RE = re.compile(r'<.*?>')


def _extract_poems(text):
    """Return a list of poem dicts parsed from the page HTML in *text*.

    Each dict has the keys "title", "dynasty", "author" and "content".
    The four field lists are combined with ``zip``, so the result is as
    long as the shortest of them.
    """
    titles = _TITLE_RE.findall(text)
    # First link inside the "source" paragraph.
    dynasties = _DYNASTY_RE.findall(text)
    # Second link inside the same "source" paragraph.
    authors = _AUTHOR_RE.findall(text)
    # Strip inline tags, put a space after every full stop, trim the ends.
    contents = [
        _TAG_RE.sub('', fragment).replace('。', '。 ').strip()
        for fragment in _CONTENT_RE.findall(text)
    ]
    return [
        {
            "title": title,
            "dynasty": dynasty,
            "author": author,
            "content": content,
        }
        for title, dynasty, author, content in zip(titles, dynasties, authors, contents)
    ]


def _write_poems(poems, path='poems.txt'):
    """Write *poems* to *path* (UTF-8, overwriting) and echo each content to stdout."""
    with open(path, 'w', encoding="utf-8") as f:
        for poem in poems:
            f.write("{} ".format(poem["title"]))
            f.write(" {} ".format(poem["dynasty"]))
            f.write(" {} ".format(poem["author"]))
            print(poem["content"])
            f.write("{} ".format(poem["content"]))


def parse_url(url, timeout=10):
    """Download *url*, extract the poems on the page and save them to poems.txt.

    Args:
        url: Address of the listing page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the script forever.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(url, headers=HEADERS, timeout=timeout)
    # Fail loudly instead of scraping an error page and overwriting
    # poems.txt with nothing useful.
    response.raise_for_status()
    _write_poems(_extract_poems(response.text))


if __name__ == '__main__':
    url = "https://www.gushiwen.org/default_1.aspx"
    parse_url(url)