• 爬取某APP的数据


    好久没写博客了,也许人还处在迷茫状态,一天到晚浑浑噩噩的。最近写了一个爬虫脚本,爬取某APP的用户厂商数据。由于数据有12W+,再加上sleep的时间,跑起来花费的时间很长,所以我将脚本拆成两个:先抓一级页面的请求参数,再抓二级页面的详细数据。也可以将两个脚本合并,把抓到的请求参数存放在列表中,在第二个脚本里循环读取。数据量过大、频繁抓取必然会遭到反爬,所以我们需要代理IP池。说实在的,去抓取免费的代理IP真的纯属浪费时间,能用的IP少得可怜,那种东西只适合自己写着玩玩。在真正的实际操作中,如果要为公司抓取数据,还是买个代理IP套餐。下图标红的为代理IP的API,调用API获取代理IP,把获取到的IP放进池子里,再通过页面的状态码去甄别有效的IP加以利用。(忘了说,抓APP数据要用Fiddler去找请求头参数:用自己的手机下载并注册APP,连接到跟电脑同一网段的Wi-Fi,再把Wi-Fi的代理配置成自己电脑的IP,这样在打开APP的时候,Fiddler上就会刷出APP的请求记录。)

    import requests
    import urllib3
    import pprint
    import socket
    import pymysql
    import pandas as pd
    import os
    import ssl
    import time
    
    # Module-level state and global network configuration for the id-collection script.
    datapage = []  # not referenced by the main() that follows
    aplist = []  # company ids collected by main()
    # NOTE(review): HTTPAdapter binds DEFAULT_RETRIES as a default argument when
    # requests is imported, so reassigning it here most likely has no effect on
    # requests.post() -- confirm, and mount an adapter on a Session if retries matter.
    requests.adapters.DEFAULT_RETRIES = 5
    timeout = 120
    # Default timeout (in seconds) for sockets created from now on.
    socket.setdefaulttimeout(timeout)
    # Silence urllib3 warnings; main() sends its requests with verify=False.
    requests.packages.urllib3.disable_warnings()
    
    # Makes the standard library's default HTTPS context skip certificate verification.
    ssl._create_default_https_context = ssl._create_unverified_context
    proxy_pool_url = []  # proxy addresses fetched from the proxy API by main()
    
    
    
    def main():
        """Collect company ids from the list endpoint and save them to ``id.json``.

        Reads coordinates from ``city.json`` (``dataes`` column), requests pages
        1-19 of the company list for each coordinate through proxies obtained
        from the proxy API, appends every returned ``_id`` to the module-level
        ``aplist`` and finally writes the ids to ``id.json``.

        The output is JSON of the form ``{"dataes": [{"id": ...}, ...]}``, the
        layout that ``pd.read_json('id.json')`` followed by
        ``df['dataes'][i]['id']`` expects.
        """
        import json  # local import: only needed for writing the result file

        urllib3.disable_warnings()
        # NOTE(review): the path separators were missing in the published source;
        # this is the presumed original working directory -- confirm.
        os.chdir(r'E:\eclipse-workspace\day23\weixiuzhan\venv')
        url1 = 'http://47.106.123.30:8070/app/api/usercompany/finsCompanyListAndVipCompany'
        df = pd.read_json('city.json')
        # Content-Length is left out on purpose: requests derives it from the body.
        headers = {
            'Content-Type': 'application/x-www-form-urlencoded',
            'Connection': 'keep-alive',
            'Accept': '*/*',
            'Accept-Language': 'zh-Hans-CN;q=1, en-CN;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'User-Agent': 'lanaer/15.1.8 (iPhone; iOS 14.4.2; Scale/2.00)',
        }
        # NOTE(review): the proxy API credentials are hard-coded; consider moving
        # them to configuration.
        gurl = 'https://proxyapi.horocn.com/api/v2/proxies?order_id=ZILH1713559550939697&num=1&format=text&line_separator=win&can_repeat=no&user_token=8a61d42fdd4041c67145cf6a44f51d69'
        rate_limited = '您的操作过于频繁,请休息一下吧~'

        def fetch_proxy():
            """Request one proxy address and append it to the shared pool."""
            rep = requests.get(url=gurl)
            # The API is asked for Windows line separators; strip them off.
            proxy_pool_url.append(rep.text.strip())
            time.sleep(1)

        def post_form(params, pool_index):
            """POST the form body through the proxy at pool_index; return the JSON."""
            address = proxy_pool_url[pool_index]
            # requests picks the proxy by the scheme of the target URL; url1 is
            # plain http, so an 'http' entry is required for the proxy to be used.
            proxies = {'http': 'http://' + address, 'https': 'https://' + address}
            response = requests.post(url=url1, allow_redirects=False, proxies=proxies,
                                     data=params, headers=headers, verify=False)
            return response.json()

        sk = 0  # index of the proxy currently in use
        for city in df['dataes']:
            try:
                latitude = city['latitude']
                longitude = city['longitude']
            except (KeyError, TypeError):
                # Entry without usable coordinates: nothing to query.
                continue

            for page_no in range(1, 20):
                params = ("locationX=" + str(latitude) + "&locationY=" + str(longitude)
                          + "&oneselfType=0&pageId=" + str(page_no)
                          + "&pageCount=25&pageSize=20&userType=3")

                fetch_proxy()
                result = post_form(params, sk)
                if result.get('msg') == rate_limited:
                    # Rotate to the next proxy, fetching one if the pool is exhausted.
                    sk += 1
                    while sk >= len(proxy_pool_url):
                        fetch_proxy()
                    result = post_form(params, sk)

                try:
                    companies = result['data']['result']
                except (KeyError, TypeError):
                    continue
                if not companies:
                    continue

                for company in companies:
                    try:
                        aplist.append(company['_id'])
                    except (KeyError, TypeError):
                        # An entry without an id cannot be looked up later; skip it.
                        continue
                time.sleep(2)

        print(aplist)
        with open('id.json', 'w', encoding='utf-8') as output:
            json.dump({'dataes': [{'id': company_id} for company_id in aplist]}, output)
    
    # Run the id-collection script only when the file is executed directly.
    if __name__ == '__main__':
        main()
    import requests
    import urllib3
    import pprint
    import socket
    import pymysql
    import pandas as pd
    import os
    import ssl
    import time
    
    # Module-level state and global network configuration for the detail script.
    datapage = []  # one list of field values per company, filled by main()
    aplist = []  # not referenced by the code that follows
    # NOTE(review): HTTPAdapter binds DEFAULT_RETRIES as a default argument when
    # requests is imported, so reassigning it here most likely has no effect on
    # requests.post() -- confirm, and mount an adapter on a Session if retries matter.
    requests.adapters.DEFAULT_RETRIES = 5
    timeout = 10
    # Default timeout (in seconds) for sockets created from now on.
    socket.setdefaulttimeout(timeout)
    # Silence urllib3 warnings; main() sends its requests with verify=False.
    requests.packages.urllib3.disable_warnings()
    # Makes the standard library's default HTTPS context skip certificate verification.
    ssl._create_default_https_context = ssl._create_unverified_context
    proxy_pool_url = []  # proxy addresses fetched from the proxy API by main()
    
    def main():
        """Fetch the detail record of every company id and store the rows in MySQL.

        Reads ids from ``id.json`` (``dataes`` column, ``id`` key), posts each one
        to the ``getById`` endpoint through proxies obtained from the proxy API,
        appends a seven-field row per company to the module-level ``datapage``
        and finally hands all rows to ``saveData``.
        """
        urllib3.disable_warnings()
        # NOTE(review): the path separators were missing in the published source;
        # this is the presumed original working directory -- confirm.
        os.chdir(r'E:\eclipse-workspace\day23\weixiuzhan\venv')
        df = pd.read_json('id.json')
        # Content-Length is left out on purpose: requests derives it from the body.
        head = {
            'Content-Type': 'application/x-www-form-urlencoded',
            'Connection': 'keep-alive',
            'Accept': '*/*',
            'User-Agent': 'lanaer/15.1.20 (iPhone; iOS 13.4.2; Scale/2.00)',
            'Accept-Language': 'zh-Hans-CN;q=1, en-CN;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
        }
        url2 = 'http://47.106.123.30:8070/app/api/usercompany/v1/getById'
        # NOTE(review): API token, proxy credentials and the database password
        # below are hard-coded; consider moving them to configuration.
        key = "eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJhdWQiOiJBUFAiLCJpc3MiOiJTZXJ2aWNlIiwiZXhwIjoxNjk3MjYzNzkwLCJ1c2VySWQiOiI0ODIwMWUzNy02ODlhLTRlNjEtYmZjYy1mMzVlMmQwYWRlMjEiLCJpYXQiOjE2MzQxOTE3OTAsInRva2VuIjoiMTYzNDE5MTc5MDIzOCJ9.60gGl6hJbvpKcHtwfxRSMQveZ8O-moWrLEBEpVn-PYo"
        gurl = 'https://proxyapi.horocn.com/api/v2/proxies?order_id=ZILH1713559550939697&num=1&format=text&line_separator=win&can_repeat=no&user_token=8a61d42fdd4041c67145cf6a44f51d69'
        rate_limited = '您的操作过于频繁,请休息一下吧~'
        # Order matches the columns written by saveData.
        fields = ('companyName', 'repairTypeName', 'contacts', 'mobile',
                  'workDescribe', 'address', 'location')

        def fetch_proxy():
            """Request one proxy address and append it to the shared pool."""
            rep = requests.get(url=gurl)
            # The API is asked for Windows line separators; strip them off.
            proxy_pool_url.append(rep.text.strip())
            time.sleep(1)

        def post_form(body, pool_index):
            """POST the form body through the proxy at pool_index; return the JSON."""
            address = proxy_pool_url[pool_index]
            # requests picks the proxy by the scheme of the target URL; url2 is
            # plain http, so an 'http' entry is required for the proxy to be used.
            proxies = {'http': 'http://' + address, 'https': 'https://' + address}
            response = requests.post(url=url2, allow_redirects=False, proxies=proxies,
                                     data=body, headers=head, verify=False)
            return response.json()

        sk = 0  # index of the proxy currently in use
        for entry in df['dataes']:
            try:
                company_id = entry['id']
            except (KeyError, TypeError):
                # Entry without an id: nothing to look up.
                continue
            param = "companyType=3&id=" + str(company_id) + "&key=" + key

            fetch_proxy()
            result2 = post_form(param, sk)
            if result2.get('msg') == rate_limited:
                # Rotate to the next proxy, fetching one if the pool is exhausted.
                sk += 1
                while sk >= len(proxy_pool_url):
                    fetch_proxy()
                result2 = post_form(param, sk)

            print(result2)

            data = result2.get('data')
            if not isinstance(data, dict):
                data = {}

            addert = []
            for field in fields:
                value = data.get(field)
                if value is None:
                    # Missing or null field: keep the single-space placeholder.
                    value = ' '
                elif field == 'workDescribe' and isinstance(value, str):
                    value = value.strip('\n')
                addert.append(value)

            datapage.append(addert)
            time.sleep(1)

        dbpath = pymysql.connect(host='192.168.1.202', port=3306, user='root', password='Password@123', database='wxzhan')
        saveData(datapage, dbpath)
    
    
    # Create the data table
    def init_db(dbpath):
        """Create the ``weixiuz`` table if it does not exist yet.

        Args:
            dbpath: an open DB connection (as returned by ``pymysql.connect``).

        The connection is left open for the caller: the caller created it and
        keeps using it afterwards, so closing it here would force a reconnect.
        """
        # IF NOT EXISTS keeps a second run from failing on the existing table.
        sql = '''
            CREATE TABLE IF NOT EXISTS `weixiuz` (id int unsigned not null auto_increment primary key,
            `company`  mediumtext NULL ,
            `type`  mediumtext NULL ,
            `contact`  mediumtext NULL ,
            `mobile`   mediumtext NULL ,
            `describe`  longtext NULL ,
            `address`  mediumtext NULL ,
            `location`  longtext NULL
            )
        '''
        dbpath.ping(reconnect=True)  # make sure the connection is alive first
        c = dbpath.cursor()
        try:
            c.execute(sql)
            dbpath.commit()
        finally:
            c.close()
    
    # Save the data
    def saveData(datapage, dbpath):
        """Insert every row of ``datapage`` into the ``weixiuz`` table.

        Args:
            datapage: iterable of rows; each row holds the seven values for
                company, type, contact, mobile, describe, address and location.
            dbpath: DB connection (as returned by ``pymysql.connect``). It is
                closed before this function returns, also when an insert fails.

        Values are sent as query parameters instead of being pasted into the
        SQL text, so quotes in scraped data can neither break nor alter the
        statement. Empty and ``None`` values are stored as empty strings; all
        other values are stored as ``str(value)``. The rows passed in are not
        modified.
        """
        init_db(dbpath)
        sql = '''
            insert into `weixiuz` (company, `type`, contact, `mobile`, `describe`, `address`, `location`)
            values (%s, %s, %s, %s, %s, %s, %s)'''
        # Reconnect if needed before asking for a cursor.
        dbpath.ping(reconnect=True)
        cur = dbpath.cursor()
        try:
            for page in datapage:
                row = []
                for value in page:
                    if value is None:
                        row.append('')
                        continue
                    try:
                        is_empty = len(value) == 0
                    except TypeError:
                        # Numbers and other unsized values are never "empty".
                        is_empty = False
                    row.append('' if is_empty else str(value))
                print(row)
                dbpath.ping(reconnect=True)
                cur.execute(sql, row)
                dbpath.commit()
        finally:
            cur.close()
            dbpath.close()
    
    # Run the detail-scraping script only when the file is executed directly.
    if __name__ == '__main__':
        main()
    我们以为我们是我们,但其实真的是这样吗?
  • 相关阅读:
    170619、springboot编程之HelloWorld
    170616、解决 java.lang.IllegalArgumentException: No converter found for return value of type: class java.util.ArrayList
    170615、spring不同数据库数据源动态切换
    pytest文档10-命令行传参
    pytest文档9-参数化parametrize
    pytest文档8-html报告报错截图+失败重跑
    pytest文档7-pytest-html生成html报告
    定位对应关系
    ADB 无线连接
    command failed shell "ps 'uiautomator'"的解决方式
  • 原文地址:https://www.cnblogs.com/FireLL/p/15411338.html
Copyright © 2020-2023  润新知