1 import urllib.request 2 from bs4 import BeautifulSoup 3 import re 4 5 def gethtml(url): 6 page=urllib.request.urlopen(url) 7 html=page.read().decode('gbk') 8 soup=BeautifulSoup(html,"html.parser") 9 #print(soup) 10 return soup 11 12 13 def getcontent(soup,load): #获取章节内容以及章节名称 14 content1="" 15 content=re.findall(r'<div id="content"><div id="adright"></div>(.*?)</div>',str(soup)) 16 for i in range(0,len(content)): 17 content1+=content[i] 18 content2 = re.sub("</?w+[^>]*>", "", content1) 19 content3=content2.replace('。','。 ') 20 #以上获取章节内容 21 zjname = re.findall(r'<div id="title">(.*?)</div>', str(soup)) 22 #获取章节名称 23 24 with open(load, 'a', encoding='utf-8') as f: 25 f.write("