• 爬取B站排行榜并保存成csv文件,提供数据。


    """
    b站排行榜爬虫(scrapy)
    https://www.bilibili.com/ranking#!/all/0/0/7/
    爬取编号,标题,url,综合评分,播放量,评论数
    存储到mysql数据库

    """
    import requests
    from fake_useragent import FakeUserAgent
    from lxml import etree
    import re
    import csv

    url = 'https://www.bilibili.com/ranking#!/all/0/0/7/'
    # 代理ip
    proxies = {"http":"101.65.24.108:8118"}
    headers = {
    'User-Agent': FakeUserAgent().random
    }
    html = requests.get(url,params=proxies,headers=headers).text
    # print(html)

    # 构造树形结构
    html1=etree.HTML(html)


    """
    爬取编号,标题,url,综合评分,播放量,评论数
    分析编号:
    <div class="num">1</div>
    <div class="num">2</div>
    分析标题:
    <a href="//www.bilibili.com/video/av55443085/" target="_blank" class="title">【党妹】三十变十三!毕业季必须拥有的芒果系JK妆容,成为甜甜山吹女孩!</a>
    <a href="//www.bilibili.com/video/av55210171/" target="_blank" class="title">【中字.迪士尼反派系列2】后妈们的抱怨</a>
    分析评分:
    <div class="">2087768</div>
    <div class="">1715927</div>
    """
    bianhao = html1.xpath('//div[@class="num"]/text()')
    print(bianhao)
    titles = html1.xpath('//a[@class="title"]/text()')
    print(titles)
    urls = html1.xpath('//a[@class="title"]/@href')
    # print(urls)
    # 将url进行处理
    url_list = []
    for url in urls:
    url = url.replace("//","").replace("/","")
    url_list.append(url)
    print(url_list)

    grade = html1.xpath('//div[@class="pts"]/div/text()')
    print(grade)
    # 播放量
    vv = html1.xpath('//div[@class="detail"]/span[1]/text()')
    print(vv)
    # 评论数
    comment = html1.xpath('//div[@class="detail"]/span[2]/text()')
    print(comment)



    # 对数据进行处理保存成csv文件
    # 使用zip函数,让数据一一对应
    data_list = []
    res = zip(bianhao,titles,url_list,grade,vv,comment)
    for data in res:
    data_list.append(data)
    print(data_list)
    # 打开一个csv文件
    with open('../files/data/bzhan.csv','w',encoding='utf-8') as file:
    csv_f = csv.writer(file)
    # 添加第一行
    csv_f.writerow(["id","title","url","grade","vv","comment"])
    for row in data_list:
    csv_f.writerow(row)
    人生苦短,我用python!
  • 相关阅读:
    webpack学习笔记--配置resolve
    常用 Git 命令清单
    彻底解决Webpack打包慢的问题
    vue-cli3安装遇到的问题,卸载不掉旧版本,导致更新不了
    在5分钟内搭建企业内部私有npm仓库
    package.json中你还不清楚的browser,module,main 字段优先级
    细说 peerDependencies
    对peerDependencies的理解
    [转载]罗技Usb Receiver在Win7 64位系统中驱动安装方法
    波峰波谷的判断
  • 原文地址:https://www.cnblogs.com/YangQingHong/p/11048137.html
Copyright © 2020-2023  润新知