# -*- coding:utf-8 -*-
# Author:Sure Feng

import json

import requests
from lxml import etree


class QiubaiSpider(object):
    """Crawl the paginated "8hr" listing of qiushibaike.com.

    Each page is downloaded, split into per-post ``div`` elements, reduced to
    a dict of fields, and appended as JSON to ``qiubai.txt``.
    """

    # Seconds to wait for the server before giving up on a request.
    REQUEST_TIMEOUT = 10
    # Inclusive range of listing pages fetched by ``run``.
    FIRST_PAGE = 1
    LAST_PAGE = 13

    def __init__(self):
        """Set up the page URL template and the request headers."""
        self.tempt_url = "https://www.qiushibaike.com/8hr/page/{}/"
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"}

    @staticmethod
    def _first(values, prefix=""):
        """Return ``prefix + values[0]``, or ``None`` when ``values`` is empty."""
        return prefix + values[0] if values else None

    def parse_url(self, url):
        """Send a GET request to ``url`` and return the decoded response body.

        The headers must be passed by keyword: the second positional
        parameter of ``requests.get`` is ``params``, so passing the dict
        positionally would put the User-Agent in the query string instead of
        sending it as a header.
        """
        respond = requests.get(
            url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        return respond.content.decode()

    def get_content(self, html_str, num):
        """Extract one dict per post from a listing page.

        :param html_str: HTML text of the listing page.
        :param num: page number, stored under the ``"page"`` key of each item.
        :return: list of dicts with the keys ``page``, ``content``,
            ``author_gender``, ``age``, ``content_img``, ``author_img`` and
            ``stats_vote``. Fields with no match are ``None`` (``content`` is
            an empty list instead).
        """
        html = etree.HTML(html_str)
        if html is None:
            # Defensive: nothing to extract if the parser produced no tree.
            return []

        content_list = []
        # Each direct child div of #content-left is treated as one post.
        for div in html.xpath("//div[@id='content-left']/div"):
            # NOTE(review): only plain spaces are stripped from the text
            # nodes; confirm whether newlines were meant to be removed too.
            content = [
                text.replace(" ", "")
                for text in div.xpath(".//div[@class='content']/span/text()")
            ]

            # The last CSS class token with "Icon" removed is used as gender.
            gender_class = self._first(
                div.xpath(".//div[contains(@class, 'articleGender')]/@class")
            )
            if gender_class is None:
                author_gender = None
            else:
                author_gender = gender_class.split(" ")[-1].replace("Icon", "")

            item = {
                "page": num,
                "content": content,
                "author_gender": author_gender,
                "age": self._first(
                    div.xpath(".//div[contains(@class, 'articleGender')]/text()")
                ),
                # Both image URLs get the same "https:" scheme prefix; the
                # original omitted the colon for content_img.
                "content_img": self._first(
                    div.xpath(".//div[@class='thumb']/a/img/@src"), "https:"
                ),
                "author_img": self._first(
                    div.xpath(".//div[@class='author clearfix']//img/@src"),
                    "https:",
                ),
                "stats_vote": self._first(
                    div.xpath(".//span[@class='stats-vote']/i/text()")
                ),
            }
            content_list.append(item)
        return content_list

    def save_conten(self, content_list):
        """Append every item of ``content_list`` to ``qiubai.txt`` as JSON."""
        with open("qiubai.txt", "a", encoding="utf-8") as f:
            for content in content_list:
                f.write(json.dumps(content, ensure_ascii=False, indent=4))
                # NOTE(review): records are separated by a single space;
                # confirm whether a newline separator was intended.
                f.write(" ")
        print("保存成功")

    def run(self):
        """Fetch, parse and save every listing page in turn."""
        for num in range(self.FIRST_PAGE, self.LAST_PAGE + 1):
            # Send the request and get the response.
            html_str = self.parse_url(self.tempt_url.format(num))
            # Extract the data.
            content_list = self.get_content(html_str, num)
            # Save.
            self.save_conten(content_list)


if __name__ == '__main__':
    qiubai_spider = QiubaiSpider()
    qiubai_spider.run()