• Scraping Alibaba complaint records from the Black Cat (黑猫投诉) platform with a thread pool


    For learning purposes only; please keep the number of threads moderate (a throttled variant is sketched after the code).

    1. Code

    import json
    import time
    from concurrent.futures import ThreadPoolExecutor
    
    import requests
    import urllib3
    from requests_html import HTMLSession
    
    # verify=False is used below, so silence the InsecureRequestWarning noise
    urllib3.disable_warnings()
    
    pool = ThreadPoolExecutor(30)   # 30 worker threads -- keep this moderate
    big_list = []                   # collected complaints (list.append is thread-safe in CPython)
    pool_name_list = []             # futures returned by pool.submit
    session = HTMLSession()
    
    def dewu_company(x):
        '''Fetch one listing page of complaints against the company and scrape each detail page.'''
        try:
    
            print(f'Page {x + 1}')
    
            params = {
                'couid': '1878960481',           # id of the target company on the platform
                'type': '1',
                'page_size': f'{(x + 1) * 10}',
                'page': f'{x + 1}',
                # 'callback':'jQuery11',
            }
            url = 'https://tousu.sina.com.cn/api/company/received_complaints'
            res = requests.get(url, params=params, verify=False)
            info_list = res.json()['result']['data']['complaints']
            for dict_info in info_list:
                # the API returns protocol-relative URLs, so prepend the scheme
                dict_info['main']['url'] = 'https:' + dict_info['main']['url']
                dict_info['author']['avatar'] = 'https:' + dict_info['author']['avatar']
                info_url = dict_info['main']['url']
                print(info_url)
                res = session.get(info_url, verify=False)
                new_dict = dict()
                new_dict['投诉编号'] = res.html.xpath('/html/body/div[2]/div/div/div[1]/div[2]/ul/li[1]/text()')[0]    # complaint number
                new_dict['投诉对象'] = res.html.xpath('/html/body/div[2]/div/div/div[1]/div[2]/ul/li[2]/a/text()')[0]  # complained-about company
                new_dict['投诉问题'] = res.html.xpath('/html/body/div[2]/div/div/div[1]/div[2]/ul/li[3]/text()')[0]    # reported problem
                new_dict['投诉要求'] = res.html.xpath('/html/body/div[2]/div/div/div[1]/div[2]/ul/li[4]/text()')[0]    # requested resolution
                new_dict['涉诉金额'] = res.html.xpath('/html/body/div[2]/div/div/div[1]/div[2]/ul/li[5]/text()')[0]    # amount in dispute
                new_dict['投诉进度'] = res.html.xpath('/html/body/div[2]/div/div/div[1]/div[2]/ul/li[6]/b/text()')[0]  # processing status
                new_dict['投诉进程详情'] = res.html.xpath('//*[@class="ts-d-steplist"]')[0].text                        # status timeline
    
                # image links are also protocol-relative
                not_have_http_img_list = res.html.xpath('//*[@class="example-image-link"]/@href')
                have_http_img_list = []
                for a in not_have_http_img_list:
                    have_http_img_list.append('https:' + a)
                new_dict['投诉图片'] = have_http_img_list   # attached images
    
                # attached videos: look up each video id through Sina's video-play API
                vide_id_list = res.html.xpath('//*[@class="video-ico"]/@data-id')
                print(vide_id_list)
                new_vide_list = []
                if vide_id_list:
                    for vide_id in vide_id_list:
                        t = int(time.time())
                        vide_info_url = f'https://api.ivideo.sina.com.cn/public/video/play?video_id={vide_id}&appver=V11220.191231.02&appname=sinaplayer_pc&applt=web&tags=sinaplayer_pc&player=all&jsonp=&plid=2019090301&prid=&uid=&tid=&pid=1&ran=0.34558379845592846&r=https%3A%2F%2Ftousu.sina.com.cn%2Fcomplaint%2Fview%2F17349160365%2F&referrer=&ssid=gusr_pc_{t}&preload=0&uu=60.180.189.200_1581579174.788519&isAuto=1'
                        res = session.get(vide_info_url, verify=False)
                        try:
                            new_vide_list.append(res.json())
                        except ValueError:
                            pass                            # response was not JSON, skip this video
                new_dict['投诉视频详情'] = new_vide_list     # video metadata
    
                dict_info['投诉详情'] = new_dict            # attach the scraped detail to the listing record
                big_list.append(dict_info)
        except Exception as e:
            print(f'Error on page {x + 1}, skipping: {e}')
    
    def run(page):
        '''page: number of listing pages to crawl'''
        for x in range(page):
            name = pool.submit(dewu_company, x)
            pool_name_list.append(name)
        for name_1 in pool_name_list:
            name_1.result()                                  # wait for every page to finish
        print('All pages done, saving to disk')
        with open('阿里投诉信息.json', 'w', encoding='utf8') as fw:
            json.dump(big_list, fw, ensure_ascii=False)      # ensure_ascii=False keeps the Chinese readable
        print('Saved')
    
    if __name__ == '__main__':
        run(1)      # crawl one listing page
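
    As the note at the top asks, keep concurrency moderate. Below is a minimal sketch of a gentler way to drive the same `dewu_company` function: a smaller pool plus a short pause between page submissions. The name `run_throttled`, the pool size of 5, and the 1-second delay are illustrative choices of mine, not values from the original post.

    # A gentler variant of run(): smaller pool, brief pause between page submissions.
    # Assumes dewu_company and big_list from the script above are already defined;
    # max_workers=5 and delay=1.0 are illustrative values, not from the original.
    import json
    import time
    from concurrent.futures import ThreadPoolExecutor
    
    def run_throttled(page, max_workers=5, delay=1.0):
        slow_pool = ThreadPoolExecutor(max_workers)
        futures = []
        for x in range(page):
            futures.append(slow_pool.submit(dewu_company, x))
            time.sleep(delay)                 # space out the page requests
        for f in futures:
            f.result()                        # wait and surface any uncaught exception
        slow_pool.shutdown()
        with open('阿里投诉信息.json', 'w', encoding='utf8') as fw:
            json.dump(big_list, fw, ensure_ascii=False)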
    
    
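    Once 阿里投诉信息.json has been written, the nested records can be flattened for a quick look. This is a sketch using only the standard library; the column list simply mirrors the detail keys scraped above, and the output name complaints.csv is arbitrary.

    # Flatten the saved complaints into a small CSV for inspection.
    # Reads the 阿里投诉信息.json file produced by run(); complaints.csv is an arbitrary name.
    import csv
    import json
    
    with open('阿里投诉信息.json', encoding='utf8') as f:
        records = json.load(f)
    
    fields = ['投诉编号', '投诉对象', '投诉问题', '投诉要求', '涉诉金额', '投诉进度']
    with open('complaints.csv', 'w', newline='', encoding='utf-8-sig') as f:  # utf-8-sig so Excel shows Chinese correctly
        writer = csv.writer(f)
        writer.writerow(fields)
        for rec in records:
            detail = rec.get('投诉详情', {})
            writer.writerow([detail.get(k, '') for k in fields])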
• Original post: https://www.cnblogs.com/pythonywy/p/12545614.html