• Qiushibaike hot-list spider
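The script below walks the first 13 pages of the Qiushibaike hot list, extracts each post's text, author gender and age, image links, and vote count with lxml XPath queries, and appends the results to qiubai.txt as JSON.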


    # -*- coding:utf-8 -*-
    # Author:Sure Feng

    import requests
    from lxml import etree
    import json


    class QiubaiSpider(object):
        def __init__(self):
            # page-number placeholder is filled in by run()
            self.temp_url = "https://www.qiushibaike.com/8hr/page/{}/"
            self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"}

        def parse_url(self, url):
            """Send the request and return the decoded response body."""
            response = requests.get(url, headers=self.headers)
            return response.content.decode()

        def get_content(self, html_str, num):
            """Extract the posts from one page."""
            html = etree.HTML(html_str)
            div_list = html.xpath("//div[@id='content-left']/div")  # one div per post
            content_list = []
            for div in div_list:
                item = {}
                item["page"] = num
                item["content"] = div.xpath(".//div[@class='content']/span/text()")
                item["content"] = [i.replace("\n", "") for i in item["content"]]
                # gender is encoded in the class name, e.g. "articleGender manIcon"
                item["author_gender"] = div.xpath(".//div[contains(@class, 'articleGender')]/@class")
                item["author_gender"] = item["author_gender"][0].split(" ")[-1].replace("Icon", "") if len(item["author_gender"]) > 0 else None
                item["age"] = div.xpath(".//div[contains(@class, 'articleGender')]/text()")
                item["age"] = item["age"][0] if len(item["age"]) > 0 else None
                # image URLs are protocol-relative ("//pic..."), so prepend the scheme
                item["content_img"] = div.xpath(".//div[@class='thumb']/a/img/@src")
                item["content_img"] = "https:" + item["content_img"][0] if len(item["content_img"]) > 0 else None
                item["author_img"] = div.xpath(".//div[@class='author clearfix']//img/@src")
                item["author_img"] = "https:" + item["author_img"][0] if len(item["author_img"]) > 0 else None
                item["stats_vote"] = div.xpath(".//span[@class='stats-vote']/i/text()")
                item["stats_vote"] = item["stats_vote"][0] if len(item["stats_vote"]) > 0 else None
                content_list.append(item)
            return content_list

        def save_content(self, content_list):
            """Append the extracted items to qiubai.txt as JSON."""
            with open("qiubai.txt", "a", encoding="utf-8") as f:
                for content in content_list:
                    f.write(json.dumps(content, ensure_ascii=False, indent=4))
                    f.write("\n")
            print("saved successfully")

        def run(self):  # main logic
            # build the list of hot-list page URLs, then fetch, parse and save each one
            start_urls = [self.temp_url.format(i) for i in range(1, 14)]
            for num, url in enumerate(start_urls, start=1):
                html_str = self.parse_url(url)                  # send request, get response
                content_list = self.get_content(html_str, num)  # extract data
                self.save_content(content_list)                 # save


    if __name__ == '__main__':
        qiubai_spider = QiubaiSpider()
        qiubai_spider.run()
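Note that save_content appends each record as a pretty-printed JSON object, so qiubai.txt is neither a JSON array nor JSON Lines and cannot be read back with a single json.load. Below is a minimal reader sketch (the load_items helper is a hypothetical name of mine; the file and field names come from the script above) that steps through the concatenated objects with json.JSONDecoder.raw_decode:

    import json

    def load_items(path="qiubai.txt"):
        """Parse concatenated pretty-printed JSON objects out of one file."""
        decoder = json.JSONDecoder()
        with open(path, encoding="utf-8") as f:
            text = f.read()
        items, pos = [], 0
        while pos < len(text):
            # raw_decode does not skip leading whitespace, so do it by hand
            while pos < len(text) and text[pos].isspace():
                pos += 1
            if pos == len(text):
                break
            obj, pos = decoder.raw_decode(text, pos)  # returns (object, end index)
            items.append(obj)
        return items

    if __name__ == "__main__":
        for item in load_items():
            print(item["page"], item["stats_vote"], item["content"])

Writing each item as a single line instead, e.g. f.write(json.dumps(content, ensure_ascii=False) + "\n"), would produce JSON Lines, which a plain per-line json.loads can read without this workaround.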
• Original post: https://www.cnblogs.com/sure-feng/p/10066845.html