• Python crawler: scraping the JSON returned by the Bilibili API, saving paginated results to CSV, and downloading the images


    The endpoint returns JSON directly, so there is no need to findall assorted classes out of HTML with a parser; just process and save the JSON data.

    General:
    Request URL: https://api.bilibili.com/x/space/arc/search?mid=390461123&ps=30&tid=0&pn=17&keyword=&order=pubdate&jsonp=jsonp
    Request Method: GET
    Status Code: 200
    Remote Address: 123.6.7.66:443
    Referrer Policy: no-referrer-when-downgrade

    Response Headers:
    access-control-allow-credentials: true
    access-control-allow-headers: Origin,No-Cache,X-Requested-With,If-Modified-Since,Pragma,Last-Modified,Cache-Control,Expires,Content-Type,Access-Control-Allow-Credentials,DNT,X-CustomHeader,Keep-Alive,User-Agent,X-Cache-Webcdn
    access-control-allow-methods: GET,POST,PUT,DELETE
    access-control-allow-origin: https://space.bilibili.com
    bili-status-code: 0
    bili-trace-id: 4fb516b50d619c81
    cache-control: no-cache
    content-encoding: br
    content-type: application/json; charset=utf-8
    date: Tue, 23 Nov 2021 05:49:54 GMT
    expires: Tue, 23 Nov 2021 05:49:53 GMT
    idc: shjd
    vary: Origin
    x-bili-trace-id: 4fb516b50d619c81
    x-cache-webcdn: BYPASS from blzone02

    Request Headers:
    :authority: api.bilibili.com
    :method: GET
    :path: /x/space/arc/search?mid=390461123&ps=30&tid=0&pn=17&keyword=&order=pubdate&jsonp=jsonp
    :scheme: https
    accept: application/json, text/plain, */*
    accept-encoding: gzip, deflate, br
    accept-language: zh-CN,zh;q=0.9
    cookie: buvid3=89EFA719-1D0F-BB2E-FE21-6C7BDCE8053B38280infoc; CURRENT_FNVAL=976; _uuid=210E48834-E65E-AD99-7F37-6771109799A8837281infoc; video_page_version=v_old_home_11; blackside_state=1; rpdid=|(k||)R|Y|)k0J'uYJ~um~kR|; PVID=1; innersign=0
    origin: https://space.bilibili.com
    referer: https://space.bilibili.com/390461123/video?tid=0&page=17&keyword=&order=pubdate
    sec-ch-ua: "Google Chrome";v="95", "Chromium";v="95", ";Not A Brand";v="99"
    sec-ch-ua-mobile: ?0
    sec-ch-ua-platform: "Windows"
    sec-fetch-dest: empty
    sec-fetch-mode: cors
    sec-fetch-site: same-site
    user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36

    Query String Parameters:
    mid: 390461123
    ps: 30
    tid: 0
    pn: 17
    keyword:
    order: pubdate
    jsonp: jsonp
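
    Before writing any parsing code it is worth probing what the payload actually looks like. A minimal sketch; the data -> list -> vlist path and the aid/pic/title fields are exactly what the examples below rely on, nothing else is assumed:

    import requests

    url = ('https://api.bilibili.com/x/space/arc/search'
           '?mid=390461123&ps=30&tid=0&pn=1&keyword=&order=pubdate&jsonp=jsonp')
    headers = {'User-Agent': 'Mozilla/5.0'}

    data = requests.get(url, headers=headers).json()
    print(data.keys())                        # top-level keys of the response
    entry = data['data']['list']['vlist'][0]  # first video entry
    print(entry['aid'], entry['title'], entry['pic'])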

    Example 1: scraping a single page

    import csv                      # csv writing
    import requests                 # http requests
    
    # https://api.bilibili.com/x/space/arc/search?mid=390461123&ps=30&tid=0&pn=2&keyword=&order=pubdate&jsonp=jsonp
    url = 'https://api.bilibili.com/x/space/arc/search?mid=390461123&ps=30&tid=0&pn=1&keyword=&order=pubdate&jsonp=jsonp'
    fake_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
    }
    
    # request the Bilibili API
    first_request = requests.get(url=url, headers=fake_headers)
    first_data = first_request.json()  # parse the JSON response body into a dict
    # text = first_request.text
    # data = json.loads(text)  # equivalent: str -> dict by hand (needs import json)
    item = first_data['data']['list']['vlist']  # pull the vlist (video list) out of the payload
    
    # open the csv file in write mode; newline='' stops csv from emitting blank rows on Windows
    csv_obj = open('bilibili.csv', 'w', encoding='utf-8', newline='')
    writer = csv.writer(csv_obj)
    # write the header row
    writer.writerow(["aid", "pic url", "title"])
    
    for d in item:
        # write one video's info per row
        print("=============== writing the entry with aid %s ===============" % (d['aid']))
        writer.writerow([d['aid'], d['pic'], d['title']])
        print("====== aid={0} done: {1} ====".format(d['aid'], 'over'))
    # close the file
    csv_obj.close()
    print("finished")
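
    Running this once produces bilibili.csv with a header row plus one row (aid, pic url, title) per video on the first page.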

    The request URL above reveals the pagination pattern; only the pn parameter changes from page to page:

    https://api.bilibili.com/x/space/arc/search?mid=390461123&ps=30&tid=0&pn=<page number>&keyword=&order=pubdate&jsonp=jsonp
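
    Incidentally, instead of formatting the page number into the URL string, you can hand requests a params dict and let it build the query string. A small sketch with a hypothetical fetch_page helper, using only the parameters already shown above:

    import requests

    def fetch_page(page):
        # requests url-encodes the query string from this dict
        params = {'mid': 390461123, 'ps': 30, 'tid': 0, 'pn': page,
                  'keyword': '', 'order': 'pubdate', 'jsonp': 'jsonp'}
        headers = {'User-Agent': 'Mozilla/5.0'}
        return requests.get('https://api.bilibili.com/x/space/arc/search',
                            params=params, headers=headers).json()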

    Example 2: paginated scraping, saving the needed fields to CSV and downloading the images locally

    import csv                      # csv writing
    import os                       # filesystem helpers
    import time                     # sleep between pages
    import requests                 # http requests
    
    # generic fetch: request a url and return the parsed json
    def scrape_api(url):
        fake_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'
        }
        response = requests.get(url=url, headers=fake_headers)
        # text = response.text
        # data = json.loads(text)  # equivalent: str -> dict by hand (needs import json)
        return response.json()  # parse the json the endpoint returns into a dict
    
    # generic pagination: build the url for one page and fetch it
    def scrape_page(page):
        # https://api.bilibili.com/x/space/arc/search?mid=390461123&ps=30&tid=0&pn=2&keyword=&order=pubdate&jsonp=jsonp
        url = 'https://api.bilibili.com/x/space/arc/search?mid=390461123&ps=30&tid=0&pn={page}&keyword=&order=pubdate&jsonp=jsonp'.format(page=page)
        return scrape_api(url)
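
    # (A more defensive fetch would also call response.raise_for_status() and check
    #  the payload's own status code before trusting the body; omitted here for brevity.)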
    
    # write one page's entries into the csv
    def scrape_csv(item):
        # write the header row only when the file does not exist yet;
        # otherwise append mode would repeat the header on every page
        write_header = not os.path.exists('bilibili.csv')
        # open the csv file in append mode
        csv_obj = open('bilibili.csv', 'a+', encoding='utf-8', newline='')
        writer = csv.writer(csv_obj)
        if write_header:
            writer.writerow(["aid", "pic url", "title"])
        for d in item:
            # write one video's info per row
            print("=============== writing the entry with aid %s ===============" % (d['aid']))
            writer.writerow([d['aid'], d['pic'], d['title']])
            # print("====== aid={0} done: {1} ====".format(d['aid'], 'over'))
    
        # close the file
        csv_obj.close()
        print("finished")
    
    # w:  open for writing (truncates the file)
    # a:  open for appending (starts at EOF, creates the file if needed)
    # r+: open for reading and writing
    # w+: open for reading and writing (see w)
    # a+: open for reading and writing (see a)
    # rb: open for reading in binary mode
    # wb: open for writing in binary mode (see w)
    # ab: open for appending in binary mode (see a)
    # rb+: open for reading and writing in binary mode (see r+)
    # wb+: open for reading and writing in binary mode (see w+)
    # ab+: open for reading and writing in binary mode (see a+)
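
    # A tiny aside to see 'w' vs 'a' in action (hypothetical helper, defined but
    # never called in this script; 'demo.txt' is a throwaway illustration file):
    def _demo_modes():
        with open('demo.txt', 'w', encoding='utf-8') as f:
            f.write('first run\n')    # 'w' truncates: the file now holds one line
        with open('demo.txt', 'a', encoding='utf-8') as f:
            f.write('another run\n')  # 'a' appends at EOF: each call adds a line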
    
    # download the images locally
    def download_img(item):
        pic_l = []  # collect every image url in this list
        for dd in item:
            pic_l.append(dd['pic'])
    
        if not os.path.exists(r'picture'):
            os.mkdir(r'picture')
    
        for i in pic_l:
            # i == http://i0.hdslb.com/bfs/archive/c6490a18ce51d821b0edc9701bc8c16353fbea4a.jpg
            pic = requests.get(i)
            # split() slices the string on the given separator and returns the pieces as a list:
            p_name = i.split('/')
            # ['http:', '', 'i0.hdslb.com', 'bfs', 'archive', 'c6490a18ce51d821b0edc9701bc8c16353fbea4a.jpg']
            imgadres = p_name[-1]  # c6490a18ce51d821b0edc9701bc8c16353fbea4a.jpg
            print(imgadres)
            with open(os.path.join('picture', imgadres), 'wb') as f:
                f.write(pic.content)
        print("imgdown_finished")
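
    # An alternative to splitting the url by hand (a sketch, not used above): take
    # the basename of the url path via the standard library, which also copes with
    # urls that carry a query string:
    # from urllib.parse import urlparse
    # imgadres = os.path.basename(urlparse(i).path)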
    
    # drive the whole run, page by page
    def datacsv():
        pages = 28  # total number of pages
        for page in range(1, pages + 1):  # range excludes the end, so +1 to reach the last page
            print("=========== now on page %s =============" % (page))
            indexdata = scrape_page(page)
            allres = indexdata.get('data')
            item = allres.get('list').get('vlist')  # pull the vlist (video list) out of the payload
            scrape_csv(item)      # csv file
            time.sleep(1)
            download_img(item)    # image download
            time.sleep(1)
    
    
    if __name__ == '__main__':
        datacsv()
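
    Hard-coding pages = 28 ties the script to this one uploader. A hedged sketch that derives the page count from the first response instead; the data.page.count and ps fields are an assumption about this endpoint's payload, not something the examples above verify:

    import math

    def total_pages(ps=30):
        # fetch page 1 and read the reported total, if the payload carries one
        first = scrape_page(1)                              # reuses the helper above
        page_info = first.get('data', {}).get('page', {})   # assumed field
        count = page_info.get('count', 0)                    # assumed field
        return math.ceil(count / ps) if count else 1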
  • Original article: https://www.cnblogs.com/yszr/p/15593278.html