Requirements:
Scrape novels from certain websites, fetching them page by page.
Each page has a "next" button; grab the href from each of these next buttons and the pages can be crawled one after another.
Pages are parsed with BeautifulSoup; a minimal sketch of the next-link extraction is shown below.
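A minimal sketch of the parsing step, assuming the next link sits in an <a> inside <li class="next"> (the structure the full script below relies on); the sample HTML here is made up purely for illustration:

from bs4 import BeautifulSoup

# Hypothetical page fragment, only to show the lookup; real pages on the site
# are expected to carry the same <li class="next"><a href="..."> structure.
sample_html = '''
<html><head><title>Chapter 1 - some site</title></head>
<body>
  <div id="view2">chapter text ...</div>
  <ul><li class="next"><a href="/htm/2013/11/2/t02/316552.html">next</a></li></ul>
</body></html>
'''

soup = BeautifulSoup(sample_html, "html.parser")
next_li = soup.find("li", class_="next")        # None when there is no next page
if next_li is not None and next_li.a is not None:
    next_href = 'http://www.vc.com' + next_li.a['href']
    print(next_href)                            # absolute URL of the next page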
from bs4 import BeautifulSoup
import urllib2
import socket
import time
import sys

# example chapter: http://www.vc.com/htm/2016/12/24/t02/367246.html
host_name = 'http://www.vc.com'


def html_process(html_file, url):
    '''Use BeautifulSoup to get the title, the content and the next link from html_file.'''
    global host_name
    soup = BeautifulSoup(html_file, "html.parser")

    text = '/dev/shm/novel.txt'
    out = open(text, 'a')                        # append each chapter to one file
    out.write('######################################\n')
    out.write(url + '\n')

    # get title (part before the first '-')
    title_ret = soup.title.string.split('-')[0].strip()
    out.write('@# ' + title_ret + '\n')

    # get content
    out.write(soup.find("div", id='view2').get_text() + '\n')
    out.close()

    # get next href
    links = soup.find_all("li", class_="next")
    if not links or links[0].a is None:
        print 'next link is None'
        exit(0)
    next_href = host_name + links[0].a['href']
    return next_href


def html_get(url):
    user_agent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0"
    headers = {'User-Agent': user_agent}
    req = urllib2.Request(url, headers=headers)
    try:
        page = urllib2.urlopen(req, timeout=20).read()
        return page
    except urllib2.URLError, e:
        print "error while loading " + url
        exit(1)
    except socket.timeout:
        # retry on a read timeout
        return html_get(url)


def test(url):
    while url is not None:
        html_file = html_get(url)
        if html_file is None:
            print 'ERROR OF READING', url
            exit(1)
        url = html_process(html_file, url)
        time.sleep(5)                            # be polite between requests


if __name__ == '__main__':
    reload(sys)
    sys.setdefaultencoding("utf-8")
    # start-up url
    test("http://www.vc.com/htm/2013/11/2/t02/316551.html")