The functionality still needs work. It can scrape the whole novel, but it is far too slow; it uses no proxy and sets no request headers. Fortunately the site places no restrictions on crawlers.
import requests
from bs4 import BeautifulSoup

def get_url_list(url):
    # Collect every chapter link from the novel's table of contents page.
    content = requests.get(url).content
    soup = BeautifulSoup(content, 'lxml')
    url_list = []
    for i in soup.select('#list dl dd a'):
        url_list.append('http://www.biquge.info/0_921/' + i.get('href'))
    return url_list

def get_data(url):
    # Download one chapter and append its title and body to the output file.
    content = requests.get(url).content
    soup = BeautifulSoup(content, 'lxml')
    # get_text() replaces the original chain of str()/replace() calls,
    # which left stray '[' and ']' from the list repr in the saved text.
    text = soup.select_one('#content').get_text(' ')
    title = soup.select_one('.content_read .box_con .bookname h1').get_text()
    with open(r'F:\栋歌第一代爬虫.txt', 'a+', encoding='utf-8') as f:
        f.write(title + ' ' + text)
    print(title)

if __name__ == "__main__":
    url = 'http://www.biquge.info/0_921/'
    url_list = get_url_list(url)
    for i in url_list:
        get_data(i)
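The two weaknesses noted above (no headers, slow sequential fetching) can both be addressed with a shared requests.Session and a thread pool. A minimal sketch, assuming the same page structure and selectors as the script above; the User-Agent string, worker count, and the commented-out proxy address are placeholder choices, not values from the original script:

import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor

# Reuse one connection pool and send a browser-like User-Agent.
session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})
# To route through a proxy, uncomment and fill in (address is hypothetical):
# session.proxies = {'http': 'http://127.0.0.1:1080'}

def fetch_chapter(url):
    soup = BeautifulSoup(session.get(url).content, 'lxml')
    title = soup.select_one('.content_read .box_con .bookname h1').get_text()
    text = soup.select_one('#content').get_text(' ')
    return title, text

def crawl(url_list, path):
    # executor.map preserves input order, so chapters land in the file
    # in sequence even though they are downloaded concurrently.
    with ThreadPoolExecutor(max_workers=8) as executor, \
         open(path, 'w', encoding='utf-8') as f:
        for title, text in executor.map(fetch_chapter, url_list):
            f.write(title + ' ' + text + '\n')
            print(title)

# Usage (reuses get_url_list from the script above):
# crawl(get_url_list('http://www.biquge.info/0_921/'), r'F:\栋歌第一代爬虫.txt')

Even with threads, a polite crawler should keep the worker count small; eight concurrent requests is already a noticeable load on a site that, as noted above, does nothing to limit crawlers.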