• 爬取糗事百科列表页案例


    import requests
    from lxml import etree
    import re
    
    
    # 爬取糗事百科所有列表页信息
    class Qiushi():
        def __init__(self, base_url):
            self.base_url = base_url
            self.max_page = self.get_max_page()
            self.get_data()
    
        # 获取最大页数
        def get_max_page(self):
            response = requests.get(self.base_url)
            html_str = response.text
            html = etree.HTML(html_str)
            max_page = html.xpath('//ul[@class="pagination"]/li[last()-1]/a/span/text()')
            max_page = int(max_page[0].strip())
            return max_page
    
        # 发起请求获取数据
        def get_data(self):
            # 循环获取每一页的数据
            for page in range(1, self.max_page + 1):
                base_url = 'https://www.qiushibaike.com/8hr/page/{}/'.format(page)
                response = requests.get(base_url)
                html_str = response.text
                html = etree.HTML(html_str)
                result = html.xpath('//div[@class="recommend-article"]/ul/li')
                all_list = []
                for site in result:
                    # print(type(site))
                    # 看看里面是什么
                    # print(etree.tostring(site, pretty_print=True,encoding='utf-8').decode('utf-8'))
                    qiushi_info = {}
                    funny_number = site.xpath('.//div[@class="recmd-num"]/span[1]/text()')  # 搞笑数
                    comment_number = site.xpath('.//div[@class="recmd-num"]/span[4]/text()')  # 评论数
                    content = site.xpath('.//a[@class="recmd-content"]/text()')             # 内容
                    pic = site.xpath('.//a[contains(@class, "recmd-left")]/img/@src')  # 图片
                    username = site.xpath('.//span[@class="recmd-name"]/text()')  # 用户昵称
                    # all函数 所有的都为真 返回真 只要有一个假 则返回假
                    # any函数 只要有一个为真 则返回真
                    # 过滤掉广告
                    if all([funny_number, comment_number, content, pic, username]):
                        qiushi_info['funny_number'] = funny_number[0]
                        qiushi_info['comment_number'] = comment_number[0]
                        qiushi_info['content'] = content[0]
                        # 拼接图片url
                        pic = "https:" + pic[0]
                        # 获取原始图片大小 有些图片没有问号 就是原始图片
                        if "?" in pic:
                            pattern = re.compile('(.*?)?')
                            pic = pattern.findall(pic)[0]
                        qiushi_info['pic'] = pic
                        qiushi_info['username'] = username[0]
                        all_list.append(qiushi_info)
                # 整理输出
                print('-------------------第{}页------------------------'.format(page))
                for i in all_list:
                    print(i)
    
    
    if __name__ == "__main__":
        # Entry point: crawl starting from the site's front page.
        start_url = 'https://www.qiushibaike.com/'
        Qiushi(start_url)
  • 相关阅读:
    Object中的线程等待和Condition
    synchronized锁和Lock锁
    手写二叉排序树(二叉查找树、二叉搜索树)
    JDK源码之ArrayList-Iterator
    JDK源码之ArrayList
    Integer&int,自动装箱&自动拆箱
    学习Zookeeper第一课
    Thumbnailator处理图片
    线程的停止和中断
    BigInteger和BigDecimal
  • 原文地址:https://www.cnblogs.com/zhangboblogs/p/10122382.html
Copyright © 2020-2023  润新知