• 糗事百科 (Qiushibaike) example


    Scrape jokes (段子) from 糗事百科; the page URL is http://www.qiushibaike.com/8hr/page/ followed by the page number.

    1. Use requests to fetch the page, and XPath to extract the data

    2. From each post, extract the user avatar link, username, joke content, vote count, and comment count
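
    The script below finds every post container with the XPath predicate contains(@id, "qiushi_tag"), which does substring matching on the attribute value. A minimal, self-contained demo of that predicate (the HTML snippet is made up for illustration):

        from lxml import etree

        # Hypothetical markup mimicking one post container on the page
        snippet = '<div id="qiushi_tag_121336824"><span>some joke text</span></div>'
        root = etree.HTML(snippet)

        # contains(@id, "qiushi_tag") matches any div whose id contains that substring
        print(root.xpath('//div[contains(@id, "qiushi_tag")]/@id'))
        # ['qiushi_tag_121336824']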

    The full script (Python 2):

     # -*- coding:utf-8 -*-
     import requests
     from lxml import etree

     def loadPage(url):
         headers = {
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
             'Accept-Language': 'zh-CN,zh;q=0.8'}
         try:
             response = requests.get(url, headers=headers)
             resHtml = response.text
             html = etree.HTML(resHtml)
             # Every post lives in a div whose id contains "qiushi_tag"
             result = html.xpath('//div[contains(@id,"qiushi_tag")]')
             for site in result:
                 # User avatar link
                 imgUrl = site.xpath('./div/a/img/@src')[0].encode('utf-8')
                 # Username
                 username = site.xpath('.//h2')[0].text
                 # Joke content
                 content = site.xpath('.//div[@class="content"]/span')[0].text.strip().encode('utf-8')
                 # Vote count (by element position)
                 vote = site.xpath('.//i')[0].text
                 # print site.xpath('.//*[@class="number"]')[0].text
                 # Comment count
                 comments = site.xpath('.//i')[1].text
                 print imgUrl, username, content, vote, comments
         except Exception, e:
             print e

     def qiushiSpider(url, beginPage, endPage):
         """
             Spider scheduler: builds and fetches the URL of each page.
             url : the fixed front part of the page URL
             beginPage : first page
             endPage : last page
         """
         for page in range(beginPage, endPage + 1):
             fullurl = url + str(page)
             loadPage(fullurl)

     if __name__ == "__main__":
         beginPage = int(raw_input("Start page: "))
         endPage = int(raw_input("End page: "))
         url = 'http://www.qiushibaike.com/8hr/page/'
         qiushiSpider(url, beginPage, endPage)
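
    The positional indexing above (votes from the first .//i element, comments from the second) breaks as soon as the page layout gains another <i> element. The commented-out selector in the script hints at a steadier approach; a sketch, assuming both counts are rendered with class="number":

        from lxml import etree

        def extract_counts(post_div):
            # Select counts by class instead of element position; assumes
            # (per the commented-out selector above) that both counts carry
            # class="number". Pads with None if fewer than two are found.
            numbers = post_div.xpath('.//*[@class="number"]/text()')
            return (numbers + [None, None])[:2]

        # Hypothetical post markup for illustration:
        html = etree.HTML('<div id="qiushi_tag_1">'
                          '<i class="number">282</i><i class="number">14</i></div>')
        print(extract_counts(html.xpath('//div')[0]))  # ['282', '14']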

     Saving the results to a JSON file (still Python 2):

     #!/usr/bin/env python
     # -*- coding:utf-8 -*-

     import urllib2
     import json
     from lxml import etree


     def loadPage(url):
         headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}

         request = urllib2.Request(url, headers=headers)
         html = urllib2.urlopen(request).read()
         # The response body is a string; parse it into an HTML DOM
         text = etree.HTML(html)
         # All post nodes; contains() does substring matching on the id attribute
         node_list = text.xpath('//div[contains(@id, "qiushi_tag")]')

         for node in node_list:
             # xpath returns a list; take its single element by index (the username)
             username = node.xpath('.//img/@alt')[0]
             # Image links, kept as a list: a post may have no image,
             # in which case indexing [0] would raise IndexError
             image = node.xpath('.//div[@class="thumb"]//@src')
             # Text inside the tag: the joke content
             content = node.xpath('.//div[@class="content"]/span')[0].text
             # Vote count
             zan = node.xpath('.//i')[0].text
             # Comment count
             comments = node.xpath('.//i')[1].text

             items = {
                 "username" : username,
                 "image" : image,
                 "content" : content,
                 "zan" : zan,
                 "comments" : comments
             }

             # Append one JSON object per line
             with open("qiushi.json", "a") as f:
                 f.write(json.dumps(items, ensure_ascii=False).encode("utf-8") + "\n")

     def qiushiSpider(url, beginPage, endPage):
         """
             Spider scheduler: builds and fetches the URL of each page.
             url : the fixed front part of the page URL
             beginPage : first page
             endPage : last page
         """
         for page in range(beginPage, endPage + 1):
             fullurl = url + str(page)
             loadPage(fullurl)

     if __name__ == "__main__":
         beginPage = int(raw_input("Start page: "))
         endPage = int(raw_input("End page: "))
         url = 'http://www.qiushibaike.com/8hr/page/'
         qiushiSpider(url, beginPage, endPage)
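
    Each record lands in qiushi.json as one JSON object per line (the JSON Lines convention), so reading the file back is just line-by-line parsing. A minimal sketch (Python 3 shown; the field names match the dict written above):

        import json

        with open("qiushi.json", encoding="utf-8") as f:
            for line in f:
                item = json.loads(line)
                print(item["username"], item["zan"], item["comments"])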

     Scraping 糗事百科 jokes in Python 3:

     import requests
     import json
     from lxml import etree


     def loadPage(url):
         headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"}

         response = requests.get(url, headers=headers)
         html = response.content
         # The response body is bytes; parse it into an HTML DOM
         text = etree.HTML(html)
         # All post nodes; contains() does substring matching on the id attribute
         node_list = text.xpath('//div[contains(@id, "qiushi_tag")]')

         for node in node_list:
             # xpath returns a list; take its single element by index (the username)
             username = node.xpath('.//img/@alt')[0]
             # Image links, kept as a list: a post may have no image
             image = node.xpath('.//div[@class="thumb"]//@src')
             # Text inside the tag: the joke content
             content = node.xpath('.//div[@class="content"]/span')[0].text
             # Vote count
             zan = node.xpath('.//i')[0].text
             # Comment count
             comments = node.xpath('.//i')[1].text

             items = {
                 "username" : username,
                 "image" : image,
                 "content" : content,
                 "zan" : zan,
                 "comments" : comments
             }

             # In Python 3 json.dumps returns str, so no manual .encode() is needed
             with open("qiushi.json", "a", encoding="utf-8") as f:
                 f.write(json.dumps(items, ensure_ascii=False) + "\n")

     def qiushiSpider(url, beginPage, endPage):
         """
             Spider scheduler: builds and fetches the URL of each page.
             url : the fixed front part of the page URL
             beginPage : first page
             endPage : last page
         """
         for page in range(beginPage, endPage + 1):
             fullurl = url + str(page)
             loadPage(fullurl)

     if __name__ == "__main__":
         beginPage = int(input("Start page: "))
         endPage = int(input("End page: "))
         url = 'http://www.qiushibaike.com/8hr/page/'
         qiushiSpider(url, beginPage, endPage)
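
    The substantive change from the Python 2 version is dropping the manual .encode("utf-8"): in Python 3, json.dumps returns str, and ensure_ascii=False keeps the Chinese text readable in the file instead of \uXXXX escapes. A quick illustration (the sample name is made up):

        import json

        # The default ensure_ascii=True escapes non-ASCII characters
        print(json.dumps({"username": "王小明"}))
        # {"username": "\u738b\u5c0f\u660e"}

        # ensure_ascii=False writes them verbatim
        print(json.dumps({"username": "王小明"}, ensure_ascii=False))
        # {"username": "王小明"}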

    Improved version: a class-based Python 3 rewrite

     # coding=utf-8
     import requests
     from lxml import etree
     import time

     class QiuBai:
         def __init__(self):
             self.temp_url = "http://www.qiushibaike.com/8hr/page/{}"
             self.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36"}

         def get_url_list(self):
             '''Prepare the list of page URLs'''
             return [self.temp_url.format(i) for i in range(1, 14)]

         def parse_url(self, url):
             '''Send the request and return the decoded response'''
             response = requests.get(url, headers=self.headers)
             return response.content.decode()

         def get_content_list(self, html_str):
             '''Extract the data'''
             html = etree.HTML(html_str)
             div_list = html.xpath("//div[@id='content-left']/div")
             content_list = []
             for div in div_list:
                 item = {}
                 item["user_name"] = div.xpath(".//h2/text()")[0].strip()
                 item["content"] = [i.strip() for i in div.xpath(".//div[@class='content']/span/text()")]
                 content_list.append(item)
             return content_list

         def save_content_list(self, content_list):
             '''Save (here: just print) each item'''
             for content in content_list:
                 print(content)

         def run(self):  # main logic
             # 1. Prepare the URL list
             url_list = self.get_url_list()
             # 2. Request each URL in turn and collect the response
             for url in url_list:
                 html_str = self.parse_url(url)
                 # 3. Extract the data
                 content_list = self.get_content_list(html_str)
                 # 4. Save
                 self.save_content_list(content_list)


     if __name__ == '__main__':
         t1 = time.time()
         qiubai = QiuBai()
         qiubai.run()
         print("total cost:", time.time() - t1)
• Original article: https://www.cnblogs.com/wanglinjie/p/9193460.html