先上代码:
1 # -*- coding: utf-8 -*- 2 import urllib 3 import urllib2 4 import re 5 import json 6 7 8 class Spider: 9 10 def __init__(self): 11 # 记录爬取每页的开始 12 self.offset = 0 13 self.title = [] 14 15 def spider_page(self): 16 url = 'https://www.zhihu.com/r/roundtables?offset={offset}' 17 headers = { 18 "Host": "www.zhihu.com", 19 "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0", 20 "Accept": "*/*", 21 "Accept-Language": "zh-CN;en-US,en;q=0.5", 22 "X-Requested-With": "XMLHttpRequest", 23 "Referer": "https://www.zhihu.com/roundtable", 24 "Connection": "keep-alive", 25 "Accept-Charset": "utf-8" 26 } 27 28 url = url.format(offset=str(self.offset)) 29 req = urllib2.Request(url, headers=headers) 30 html = urllib2.urlopen(req).read().decode('utf-8').encode('utf-8') 31 html = json.loads(html) 32 33 before = self.offset 34 try: 35 next_offset = re.findall(r'offset=(.*)', html['paging']['next']) 36 self.offset = int(next_offset[0]) 37 except: 38 self.offset = 0 39 40 titles = html['htmls'] 41 temp = "" 42 for title in titles: 43 temp += title 44 45 return {"before": before , "temp": temp} 46 47 def find_title(self, titles): 48 titles = titles[:] 49 title = re.findall(r'<span.*?class="name">(.*?)</span></span></a>', titles) 50 51 for item in title: 52 self.title.append(item) 53 54 55 if __name__ == '__main__': 56 s = Spider() 57 for i in range(6): 58 results = s.spider_page() 59 s.find_title(results["temp"]) 60 61 for item in s.title: 62 print item
总结:
总体来说,这段代码还是比较好理解的,有以下几点需要注意:
- 输出乱码:如果返回报文首部带有 Content-Encoding: gzip,就要检查请求首部 Accept-Encoding 的内容,确认自己能够解压它声明的编码;或者直接告知服务器不要压缩传输。
- 再就是要对返回的 JSON 字符串调用 json.loads,把它解码成 Python 对象(字典、列表等),否则数据只是以字符串的形式存在。