• A first crawler exercise: scraping Zhihu roundtable titles


    The code first:

    # -*- coding: utf-8 -*-
    import urllib2
    import re
    import json


    class Spider:

        def __init__(self):
            # offset marks where the next page of results starts
            self.offset = 0
            self.title = []

        def spider_page(self):
            # Fetch one page of roundtable data and return its HTML fragments
            url = 'https://www.zhihu.com/r/roundtables?offset={offset}'
            headers = {
                "Host": "www.zhihu.com",
                "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:48.0) Gecko/20100101 Firefox/48.0",
                "Accept": "*/*",
                "Accept-Language": "zh-CN;en-US,en;q=0.5",
                "X-Requested-With": "XMLHttpRequest",
                "Referer": "https://www.zhihu.com/roundtable",
                "Connection": "keep-alive",
                "Accept-Charset": "utf-8"
            }

            url = url.format(offset=str(self.offset))
            req = urllib2.Request(url, headers=headers)
            # The endpoint returns a JSON string; decode it into Python objects
            html = json.loads(urllib2.urlopen(req).read().decode('utf-8'))

            before = self.offset
            try:
                # The "next" URL in the paging info carries the offset of the next page
                next_offset = re.findall(r'offset=(.*)', html['paging']['next'])
                self.offset = int(next_offset[0])
            except (KeyError, IndexError, ValueError):
                # No further page: start over from the beginning
                self.offset = 0

            # 'htmls' is a list of HTML fragments, one per roundtable entry
            temp = "".join(html['htmls'])

            return {"before": before, "temp": temp}

        def find_title(self, titles):
            # Pull each roundtable title out of the concatenated HTML fragments
            title = re.findall(r'<span.*?class="name">(.*?)</span></span></a>', titles)

            for item in title:
                self.title.append(item)


    if __name__ == '__main__':
        s = Spider()
        for i in range(6):
            results = s.spider_page()
            s.find_title(results["temp"])

        for item in s.title:
            print item

    Summary:

        Overall the code is fairly easy to follow, but a couple of points deserve attention:

    1.     If the output comes out garbled and the response headers contain Content-Encoding: "gzip", check whether the Accept-Encoding header you send advertises an encoding you can actually decompress, or simply tell the server to return the response uncompressed.
    2.     The response body is a JSON string, so it has to be passed through json.loads to decode it into Python data structures; otherwise the data remains a plain string. A short sketch covering both points follows below.
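
    Both points fit in a few lines. This is a minimal sketch, assuming the same Python 2 / urllib2 environment as the spider above; the header values are illustrative, and the endpoint and paging structure are the ones the spider already relies on:

    # -*- coding: utf-8 -*-
    import gzip
    import json
    import urllib2
    from StringIO import StringIO

    url = 'https://www.zhihu.com/r/roundtables?offset=0'
    req = urllib2.Request(url, headers={
        "User-Agent": "Mozilla/5.0",
        # Either advertise only encodings we can handle ("gzip"),
        # or send "identity" to ask the server not to compress at all.
        "Accept-Encoding": "gzip",
    })

    resp = urllib2.urlopen(req)
    body = resp.read()

    # urllib2 does not decompress for you: if the server gzipped the body,
    # unpack it before decoding.
    if resp.info().get('Content-Encoding') == 'gzip':
        body = gzip.GzipFile(fileobj=StringIO(body)).read()

    # The body is a JSON string; json.loads turns it into dicts/lists.
    data = json.loads(body.decode('utf-8'))
    print data['paging']['next']

    Note that higher-level libraries such as requests decompress gzip responses transparently; with bare urllib2 the decompression is your own responsibility.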
  • Original post: https://www.cnblogs.com/selfimprovement/p/5861132.html