• 123


    # coding=utf-8

    import requests
    from lxml import etree
    import json

    class TiebaSpider:
        """Crawl the mobile list pages of one Baidu Tieba forum.

        For every thread found on a list page the spider records its title,
        its detail-page URL and the image URLs found on the detail pages, then
        appends the records as JSON to a file named ``"<tieba_name>_"``.
        """

        def __init__(self, tieba_name):
            """Store the forum name and build the URLs and headers used for requests.

            Args:
                tieba_name: Name of the forum; it is substituted into the
                    ``kw`` query parameter of the start URL and used as the
                    prefix of the output file name.
            """
            self.tieba_name = tieba_name
            self.start_url = "http://tieba.baidu.com/mo/q----,sz@320_240-1-3---2/m?kw={}&lp=5009".format(tieba_name)
            self.headers = {"User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 10_3 like Mac OS X) AppleWebKit/602.1.50 (KHTML, like Gecko) CriOS/56.0.2924.75 Mobile/14E5239e Safari/602.1"}
            # Prefix prepended to the relative hrefs extracted from the pages.
            self.part_url = "http://tieba.baidu.com/mo/q----,sz@320_240-1-3---2/"

        def parse_url(self, url):
            """Print ``url``, GET it with the spider's headers and return the raw body bytes."""
            print(url)
            response = requests.get(url, headers=self.headers)
            return response.content

        def get_content_list(self, html_str):
            """Extract the threads of one list page.

            Args:
                html_str: Body of a list page, as returned by ``parse_url``.

            Returns:
                A tuple ``(content_list, next_url)``. ``content_list`` is a
                list of dicts with the keys ``"title"``, ``"href"`` and
                ``"img_list"``; ``next_url`` is the (possibly empty) list of
                hrefs of the next-page link.
            """
            html = etree.HTML(html_str)
            div_list = html.xpath("//div[contains(@class,'i')]")
            content_list = []
            for div in div_list:
                # Evaluate each XPath once instead of once for the test and
                # once more for the value.
                titles = div.xpath("./a/text()")
                hrefs = div.xpath("./a/@href")
                content = {}
                content["title"] = titles[0] if titles else None
                content["href"] = self.part_url + hrefs[0] if hrefs else None
                if content["href"] is None:
                    # A div without a link has no detail page; requesting a
                    # None URL would raise, so record an empty image list.
                    raw_img_list = []
                else:
                    raw_img_list = self.get_img_list(content["href"], [])
                # Keep only the text after the last "src=" and percent-decode it.
                # NOTE(review): presumably the page wraps the real image URL in
                # a "src=" query parameter - confirm against a live page.
                content["img_list"] = [requests.utils.unquote(i.split("src=")[-1]) for i in raw_img_list]
                content_list.append(content)
            # hrefs of the next list page; empty on the last page
            next_url = html.xpath("//a[text()='下一页']/@href")
            return content_list, next_url

        def get_img_list(self, detail_url, img_list):
            """Collect the image ``src`` values from every page of one thread.

            Args:
                detail_url: URL of the first detail page of the thread.
                img_list: List that the found ``src`` values are appended to.

            Returns:
                The same ``img_list`` object, extended in place.
            """
            # Iterate instead of recursing so that a thread with many pages
            # cannot exhaust the interpreter's recursion limit.
            while True:
                # request the current detail page
                detail_html = etree.HTML(self.parse_url(detail_url))
                # collect the images on this page
                img_list.extend(detail_html.xpath("//img[@class='BDE_Image']/@src"))
                # follow the next-page link of the thread, if there is one
                next_url = detail_html.xpath("//a[text()='下一页']/@href")
                if not next_url:
                    return img_list
                detail_url = self.part_url + next_url[0]

        def save_content_list(self, content_list):
            """Append every record of ``content_list`` as indented JSON to the output file.

            The file is named ``"<tieba_name>_"``; each record is followed by a
            newline.
            """
            file_path = "{}_".format(self.tieba_name)
            # ensure_ascii=False emits non-ASCII characters verbatim, so the
            # encoding must not depend on the platform default.
            with open(file_path, "a", encoding="utf-8") as f:
                for content in content_list:
                    f.write(json.dumps(content, ensure_ascii=False, indent=2))
                    f.write("\n")
            print("保存成功")

        def run(self):
            """Crawl list pages from the start URL until no next-page link is found."""
            next_url = [self.start_url]
            while next_url:
                # 1. pick the page URL; extracted hrefs are relative and need the prefix
                page_url = next_url[0]
                if not page_url.startswith("http"):
                    page_url = self.part_url + page_url
                # 2. send the request and get the response
                html_str = self.parse_url(page_url)
                # 3. extract the threads (including their images) and the next-page hrefs
                content_list, next_url = self.get_content_list(html_str)
                # 4. save the data
                self.save_content_list(content_list)
                # 5. loop again with the next list page, if any
    

    # Script entry point: crawl the forum named "123" when run directly.
    if __name__ == '__main__':
        tieba_spider = TiebaSpider("123")
        tieba_spider.run()

  • 相关阅读:
    【嵌入式】arm-linux-gcc/ld/objcopy/objdump参数概述
    【Java】Java复习笔记-第四部分
    【C/C++】C语言复习笔记-17种小算法-解决实际问题
    【Java】Java复习笔记-三大排序算法,堆栈队列,生成无重复的随机数列
    【Java】Java复习笔记-第三部分
    【教程】ubuntu下安装NFS服务器
    【Java】Java复习笔记-第二部分
    【Java】Java复习笔记-第一部分
    【教程】ubuntu下安装samba服务器
    【C/C++】一道试题,深入理解数组和指针
  • 原文地址:https://www.cnblogs.com/RomanticLife/p/9119716.html
Copyright © 2020-2023  润新知