It's been a long time since my last blog post; maybe I'm still drifting through my days in a bit of a daze. Recently I wrote a crawler script to scrape user/vendor data from a certain APP. The dataset has 120,000+ records, and with the sleep calls added in, a full run takes a very long time, so I split the work into two scripts: the first grabs the request parameters from the first-level pages, the second grabs the detailed data from the second-level pages. You could also merge the two scripts, storing the captured request parameters in a list that the second part reads back in a loop.

Scraping this much data this frequently is bound to trip anti-crawling measures, so we need a proxy IP pool. Honestly, scraping free proxy IPs is a pure waste of time: the number of usable IPs is pitifully small, and that sort of thing is only good for tinkering on your own. In real work, when you're scraping data for a company, just buy a proxy IP plan. The part marked in red in the figure below is the proxy provider's API: call it to fetch proxy IPs, put them into the pool, and use the page's HTTP status code to weed out the valid IPs before using them.

(I forgot to mention: to capture APP traffic you need Fiddler to find the request header parameters. Download and register the APP on your own phone, connect the phone to the same WiFi network segment as your computer, and set the WiFi's proxy to your computer's IP. Then, when you open the APP, its requests will scroll by in Fiddler.)
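Here's a minimal sketch of that "weed out valid IPs by status code" step. The is_proxy_alive helper, the probe URL, and the timeout are my own illustrative assumptions, not from the proxy vendor's docs:

import requests

def is_proxy_alive(proxy_ip, test_url='https://www.baidu.com', timeout=5):
    """Probe a proxy once; keep it only if the test page answers HTTP 200.
    test_url and timeout are illustrative assumptions."""
    proxies = {'https': 'https://' + proxy_ip}
    try:
        resp = requests.get(test_url, proxies=proxies, timeout=timeout, verify=False)
        return resp.status_code == 200
    except requests.RequestException:
        return False

# keep only the proxies that pass the probe before putting them in the pool
# pool = [ip for ip in raw_ips if is_proxy_alive(ip)]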
The first script pulls the company ids out of the first-level list pages:

import json
import os
import socket
import ssl
import time

import pandas as pd
import requests
import urllib3

aplist = []
requests.adapters.DEFAULT_RETRIES = 5
timeout = 120
socket.setdefaulttimeout(timeout)
requests.packages.urllib3.disable_warnings()
ssl._create_default_https_context = ssl._create_unverified_context
proxy_pool_url = []


def main():
    urllib3.disable_warnings()
    os.chdir(r'E:\eclipse-workspace\day23\weixiuzhan\venv')
    url1 = 'http://47.106.123.30:8070/app/api/usercompany/finsCompanyListAndVipCompany'
    df = pd.read_json('city.json')
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Connection': 'keep-alive',
        'Accept': '*/*',
        'Accept-Language': 'zh-Hans-CN;q=1, en-CN;q=0.9',
        'Content-Length': '86',
        'Accept-Encoding': 'gzip, deflate',
        'User-Agent': 'lanaer/15.1.8 (iPhone; iOS 14.4.2; Scale/2.00)'
    }
    sk = 0
    for i in range(len(df.dataes)):
        for Num in range(1, 20):
            try:
                # build the POST body from each city's coordinates and page number
                params = "locationX=" + str(df['dataes'][i]['latitude']) \
                         + "&locationY=" + str(df['dataes'][i]['longitude']) \
                         + "&oneselfType=0&pageId=" + str(Num) \
                         + "&pageCount=25&pageSize=20&userType=3"
            except KeyError:
                continue
            # pull one proxy IP from the paid proxy API and append it to the pool
            gurl = 'https://proxyapi.horocn.com/api/v2/proxies?order_id=ZILH1713559550939697&num=1&format=text&line_separator=win&can_repeat=no&user_token=8a61d42fdd4041c67145cf6a44f51d69'
            rep = requests.get(url=gurl)
            proxy_pool_url.append(rep.text)
            time.sleep(1)
            proxies = {'https': 'https://' + proxy_pool_url[sk]}
            response = requests.post(url=url1, allow_redirects=False, proxies=proxies,
                                     data=params, headers=headers, verify=False)
            result = response.json()
            code_status = result['msg']
            if code_status == '您的操作过于频繁,请休息一下吧~':
                # this proxy got rate-limited: move to the next one and retry once
                sk = sk + 1
                proxies1 = {'https': 'https://' + proxy_pool_url[sk]}
                response1 = requests.post(url=url1, allow_redirects=False, proxies=proxies1,
                                          data=params, headers=headers, verify=False)
                result2 = response1.json()
            else:
                result2 = result
            try:
                result2['data']['result'][0]  # skip pages with no results
            except (IndexError, KeyError, TypeError):
                continue
            # collect the company ids on this page; use j, not i, so the outer
            # city index isn't clobbered
            for j in range(len(result2['data']['result'])):
                try:
                    aplist.append(result2['data']['result'][j]['_id'])
                except (IndexError, KeyError):
                    aplist.append(' ')
            time.sleep(2)
            print(aplist)
    # write the ids in the {"dataes": [{"id": ...}, ...]} shape that the
    # second script reads back with pd.read_json
    with open('id.json', 'w', encoding='utf-8') as output:
        json.dump({'dataes': [{'id': v} for v in aplist]}, output, ensure_ascii=False)


if __name__ == '__main__':
    main()
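The "switch proxy when rate-limited" branch above is repeated verbatim in both scripts; if you wanted to factor it out, one possible shape would be the helper below. Only the rate-limit message comes from the scripts; the fetch_proxy callable and the max_tries cap are hypothetical:

import time
import requests

RATE_LIMIT_MSG = '您的操作过于频繁,请休息一下吧~'

def post_with_rotation(url, data, headers, fetch_proxy, max_tries=5):
    """POST through a proxy, pulling a fresh proxy from fetch_proxy()
    whenever the server answers with the rate-limit message.
    max_tries is an illustrative cap, not from the original scripts."""
    for _ in range(max_tries):
        proxies = {'https': 'https://' + fetch_proxy()}
        resp = requests.post(url, data=data, headers=headers,
                             proxies=proxies, allow_redirects=False, verify=False)
        result = resp.json()
        if result.get('msg') != RATE_LIMIT_MSG:
            return result
        time.sleep(1)  # brief pause before rotating to the next proxy
    raise RuntimeError('all proxies rate-limited')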
The second script reads id.json back, fetches each company's detail page, and writes the fields into MySQL:

import os
import socket
import ssl
import time

import pandas as pd
import pymysql
import requests
import urllib3

datapage = []
requests.adapters.DEFAULT_RETRIES = 5
timeout = 10
socket.setdefaulttimeout(timeout)
requests.packages.urllib3.disable_warnings()
ssl._create_default_https_context = ssl._create_unverified_context
proxy_pool_url = []


def main():
    urllib3.disable_warnings()
    os.chdir(r'E:\eclipse-workspace\day23\weixiuzhan\venv')
    df = pd.read_json('id.json')
    head = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Connection': 'keep-alive',
        'Accept': '*/*',
        'User-Agent': 'lanaer/15.1.20 (iPhone; iOS 13.4.2; Scale/2.00)',
        'Accept-Language': 'zh-Hans-CN;q=1, en-CN;q=0.9',
        'Content-Length': '319',
        'Accept-Encoding': 'gzip, deflate'
    }
    url2 = 'http://47.106.123.30:8070/app/api/usercompany/v1/getById'
    sk = 0
    for i in range(len(df.dataes)):
        # detail request: the company id plus the fixed JWT key captured in Fiddler
        param = "companyType=3&id=" + str(df['dataes'][i]['id']) + "&key=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJhdWQiOiJBUFAiLCJpc3MiOiJTZXJ2aWNlIiwiZXhwIjoxNjk3MjYzNzkwLCJ1c2VySWQiOiI0ODIwMWUzNy02ODlhLTRlNjEtYmZjYy1mMzVlMmQwYWRlMjEiLCJpYXQiOjE2MzQxOTE3OTAsInRva2VuIjoiMTYzNDE5MTc5MDIzOCJ9.60gGl6hJbvpKcHtwfxRSMQveZ8O-moWrLEBEpVn-PYo"
        # pull one proxy IP from the paid proxy API and append it to the pool
        gurl = 'https://proxyapi.horocn.com/api/v2/proxies?order_id=ZILH1713559550939697&num=1&format=text&line_separator=win&can_repeat=no&user_token=8a61d42fdd4041c67145cf6a44f51d69'
        rep = requests.get(url=gurl)
        proxy_pool_url.append(rep.text)
        time.sleep(1)
        proxies = {'https': 'https://' + proxy_pool_url[sk]}
        response = requests.post(url=url2, allow_redirects=False, proxies=proxies,
                                 data=param, headers=head, verify=False)
        result = response.json()
        code_status = result['msg']
        if code_status == '您的操作过于频繁,请休息一下吧~':
            # rate-limited: rotate to the next proxy in the pool and retry once
            sk = sk + 1
            proxies1 = {'https': 'https://' + proxy_pool_url[sk]}
            response1 = requests.post(url=url2, allow_redirects=False, proxies=proxies1,
                                      data=param, headers=head, verify=False)
            result2 = response1.json()
        else:
            result2 = result
        print(proxies)
        print(result2)
        # pull out each detail field, padding with a blank when it is missing
        addert = []
        for field in ('companyName', 'repairTypeName', 'contacts',
                      'mobile', 'workDescribe', 'address', 'location'):
            try:
                value = result2['data'][field]
                if field == 'workDescribe':
                    value = value.strip(' ')
                addert.append(value)
            except (IndexError, KeyError):
                addert.append(' ')
        datapage.append(addert)
        time.sleep(1)
    dbpath = pymysql.connect(host='192.168.1.202', port=3306, user='root',
                             password='Password@123', database='wxzhan')
    saveData(datapage, dbpath)


# create the data table
def init_db(dbpath):
    c = dbpath.cursor()  # grab a cursor
    sql = '''
        CREATE TABLE `weixiuz`
        (
            id         int unsigned not null auto_increment primary key,
            `company`  mediumtext NULL,
            `type`     mediumtext NULL,
            `contact`  mediumtext NULL,
            `mobile`   mediumtext NULL,
            `describe` longtext   NULL,
            `address`  mediumtext NULL,
            `location` longtext   NULL
        )
    '''
    dbpath.ping(reconnect=True)
    c.execute(sql)   # run the CREATE TABLE
    dbpath.commit()  # commit the DDL
    dbpath.close()   # close; saveData pings to reconnect


# save the data
def saveData(datapage, dbpath):
    init_db(dbpath)
    cur = dbpath.cursor()
    for page in datapage:
        for index in range(len(page)):
            if len(page[index]) != 0:
                page[index] = '"' + str(page[index]) + '"'
            else:
                page[index] = '""'
        sql = '''insert into `weixiuz`
                 (company, `type`, contact, `mobile`, `describe`, `address`, `location`)
                 values (%s)''' % ",".join(page)
        print(sql)
        dbpath.ping(reconnect=True)
        cur.execute(sql)
        dbpath.commit()
    cur.close()
    dbpath.close()


if __name__ == '__main__':
    main()
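One note on the insert: building the SQL by wrapping each field in double quotes and joining the strings breaks as soon as a field itself contains a quote, and it's open to SQL injection. A parameterized version with placeholders (same table and columns as above; using executemany here is just one option) could look like this:

import pymysql

def save_rows(rows, conn):
    """Insert the scraped rows with %s placeholders instead of string
    concatenation, so quotes inside a field can't break the statement."""
    sql = ('insert into `weixiuz` '
           '(company, `type`, contact, `mobile`, `describe`, `address`, `location`) '
           'values (%s, %s, %s, %s, %s, %s, %s)')
    with conn.cursor() as cur:
        cur.executemany(sql, [tuple(str(v) for v in row) for row in rows])
    conn.commit()

The driver escapes each value itself, so the per-field quoting loop in saveData becomes unnecessary.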