• Python 爬虫-豆瓣读书


    import requests
    from bs4 import BeautifulSoup
    
    
    def parse_html(num):
        """Fetch one page of the Douban Books Top 250 list and format it as text.

        Args:
            num: Pagination offset (0, 25, 50, ...) passed as the ``start``
                query parameter of the Top 250 URL.

        Returns:
            A string containing one formatted record per book (title,
            publication info, rating, blurb) separated by a divider line.

        Raises:
            requests.HTTPError: if the server responds with an error status.
            requests.Timeout: if the request exceeds the timeout.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
        }
        # timeout keeps a stalled connection from hanging the whole run;
        # raise_for_status fails loudly instead of parsing an error page.
        response = requests.get(
            f'https://book.douban.com/top250?start={num}',
            headers=headers,
            timeout=10,
        )
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'lxml')

        # 书名 (book titles): the full title is in the anchor's title attribute.
        book_names = [div.find('a')['title'] for div in soup.find_all('div', class_='pl2')]

        # 图书信息 (publication info: author / publisher / date / price).
        book_infos = [p.get_text() for p in soup.find_all('p', class_='pl')]

        # 评分 (ratings).
        book_rates = [span.get_text() for span in soup.find_all('span', class_='rating_nums')]

        # 简介 (one-line blurbs).  NOTE(review): some books may lack a blurb,
        # so this list can be shorter than the others; zip() below then
        # silently drops the trailing books — confirm this is acceptable.
        book_inqs = [quote.get_text() for quote in soup.find_all('span', class_='inq')]

        # 组合 (combine).  The original built this with a string literal
        # broken across physical lines (a SyntaxError) and quadratic
        # concatenation; use explicit \n escapes and a single join instead.
        records = [
            f'书名:{name}\n作者:{info}\n评分:{rate}\n简介:{inq}\n=======================\n'
            for name, info, rate, inq in zip(book_names, book_infos, book_rates, book_inqs)
        ]
        return ''.join(records)
    
    
    if __name__ == '__main__':
        # Walk the ten pages of the Top 250 (offsets 0, 25, ..., 225),
        # collect each page's formatted text, and write it all out at once.
        pages = [parse_html(offset) for offset in range(0, 250, 25)]

        output_path = '豆瓣图书Top250.txt'
        with open(output_path, 'w', encoding='utf-8') as out_file:
            out_file.write(''.join(pages))
  • 相关阅读:
    es6.8集群部署(ssl认证)+nfs备份(生产)
    spool
    dataguard unname
    zabbix监控mysql主从同步可用性
    企业微信发送消息
    安装ruby
    binlog2sql
    xtrabackup备份异地恢复+binlog日志应用
    5.7.29重新部署主从
    centos7 图形界面启动
  • 原文地址:https://www.cnblogs.com/Jimc/p/9707847.html
Copyright © 2020-2023  润新知