• 如何爬取需要 cookie 登录和验证码的页面


    实例:

    需求:获取人人网用户登录过后的个人主页数据

     1 #云打码平台登录,直接下载引用就好,无需更改
     2 import http.client, mimetypes, urllib, json, time, requests
     class YDMHttp:
         """Client for the Yundama captcha-recognition HTTP API.

         Every call POSTs a form to ``apiurl`` that carries the account
         credentials plus a ``method`` field, and parses the reply as JSON.
         Most methods return the API's negative ``ret`` code on failure, or
         -9001 when the parsed reply is empty.
         """

         apiurl = 'http://api.yundama.com/api.php'
         username = ''
         password = ''
         appid = ''
         appkey = ''

         def __init__(self, username, password, appid, appkey):
             """Store the account credentials; ``appid`` is kept as a string."""
             self.username = username
             self.password = password
             self.appid = str(appid)
             self.appkey = appkey

         def _payload(self, method, **extra):
             """Build the form fields shared by every API call."""
             fields = {
                 'method': method,
                 'username': self.username,
                 'password': self.password,
                 'appid': self.appid,
                 'appkey': self.appkey,
             }
             fields.update(extra)
             return fields

         @staticmethod
         def _field_or_error(response, key):
             """Return ``response[key]``, the negative ``ret`` code, or -9001.

             -9001 is returned when the parsed response is empty/falsy.
             """
             if not response:
                 return -9001
             ret = response['ret']
             if ret and ret < 0:
                 return ret
             return response[key]

         def request(self, fields, files=None):
             """POST ``fields`` (and optional ``files``) and return the parsed JSON.

             ``files`` maps form field names to file paths; see ``post_url``.
             """
             return json.loads(self.post_url(self.apiurl, fields, files))

         def balance(self):
             """Return the account balance, or a negative error code."""
             return self._field_or_error(
                 self.request(self._payload('balance')), 'balance')

         def login(self):
             """Log in and return the user id, or a negative error code."""
             return self._field_or_error(
                 self.request(self._payload('login')), 'uid')

         def upload(self, filename, codetype, timeout):
             """Upload a captcha image and return its ``cid``, or an error code."""
             fields = self._payload(
                 'upload', codetype=str(codetype), timeout=str(timeout))
             response = self.request(fields, {'file': filename})
             return self._field_or_error(response, 'cid')

         def result(self, cid):
             """Return the recognised text for ``cid``, or '' if none is available."""
             response = self.request(self._payload('result', cid=str(cid)))
             if not response:
                 return ''
             return response.get('text') or ''

         def decode(self, filename, codetype, timeout):
             """Upload ``filename`` and poll once per second for the result.

             Returns ``(cid, text)`` on success, ``(error_code, '')`` when the
             upload fails, and ``(-3003, '')`` when no text arrives within
             ``timeout`` polls.
             """
             cid = self.upload(filename, codetype, timeout)
             if cid <= 0:
                 return cid, ''
             for _ in range(timeout):
                 text = self.result(cid)
                 if text != '':
                     return cid, text
                 time.sleep(1)
             return -3003, ''

         def report(self, cid):
             """Report ``cid`` with ``flag='0'``; return the API ``ret`` or -9001."""
             response = self.request(
                 self._payload('report', cid=str(cid), flag='0'))
             if not response:
                 return -9001
             return response['ret']

         def post_url(self, url, fields, files=None):
             """POST ``fields`` to ``url`` and return the response body as text.

             ``files`` maps form field names to paths of files to upload.  The
             files are opened here and always closed afterwards; the caller's
             mapping is left untouched.
             """
             handles = {}
             try:
                 for name, path in (files or {}).items():
                     handles[name] = open(path, 'rb')
                 res = requests.post(url, files=handles, data=fields)
             finally:
                 # Close every handle even if opening a later file or the POST fails.
                 for handle in handles.values():
                     handle.close()
             return res.text
    云打码平台代码
     def parse_codeImg(imgPath, username='xxxx', password='xxxx', appid=6372,
                       appkey='9b672eb204d7eede7ddeda5a87d7be08',
                       codetype=2004, timeout=30):
         """Recognise the captcha image at ``imgPath`` via the Yundama service.

         Args:
             imgPath: Path of the captcha image file to upload.
             username: Registered Yundama user name.  The defaults are
                 placeholders and must be replaced with a real account.
             password: Registered Yundama password.
             appid: Software ID from the developer console ("My software").
             appkey: Software key from the developer console.
             codetype: Captcha type id (e.g. 1004 means 4 alphanumeric
                 characters); the full list is at
                 http://www.yundama.com/price.html.
             timeout: Recognition timeout in seconds.

         Returns:
             The recognised text, or '' when the credentials are still the
             placeholders or recognition did not produce a result.
         """
         # Refuse to call the paid API with the unconfigured placeholder account.
         if username in ('xxxx', 'username'):
             print('请设置好相关参数再测试')
             return ''

         yundama = YDMHttp(username, password, appid, appkey)

         # Log in to Yundama.
         uid = yundama.login()
         print('uid: %s' % uid)

         # Query the remaining balance.
         balance = yundama.balance()
         print('balance: %s' % balance)

         # Upload the image and wait for the recognition result.
         cid, result = yundama.decode(imgPath, codetype, timeout)
         print('cid: %s, result: %s' % (cid, result))
         return result
    云打码平台代码2
     1 import requests
     2 from lxml import etree
     3 import json
     4 import time
     # Create a session object; it stores cookies automatically, so every
     # request below goes through it and shares the same cookies.
     session = requests.session()
     # Renren home page URL.
     url = 'http://www.renren.com'
     # Spoofed browser headers.
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
     }
     # Fetch the login page through the session so the captcha is issued for
     # the same cookies that the login request will send.
     page_text = session.get(url=url, headers=headers).text
     # Locate the captcha image and save it locally.
     tree = etree.HTML(page_text)
     code_img_url = tree.xpath('//img[@id="verifyPic_login"]/@src')[0]
     code_img_data = session.get(url=code_img_url, headers=headers).content
     with open('./code.png', 'wb') as fp:
         fp.write(code_img_data)
         print("验证码存储成功!!")
     code_text = parse_codeImg('./code.png')
     print(code_text)
     # Log in to obtain the cookie.
     # This URL and the form data were captured with the Fiddler proxy tool.
     login_url = "http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=20181131725329"
     data = {
         "email": "18526303496",
         "icode": code_text,
         "origURL": "http://www.renren.com/home",
         "domain": "renren.com",
         "key_id": "1",
         "captcha_type": "web_login",
         "password": "3f06abf49c06d3f2dfce6554f070677f2459a14159d738eb08f8f7922280f5b7",
         "rkey": "3ca02f6d93a15caf7d0c0b3637abf5a8",
         "f": 'http%3A%2F%2Fwww.renren.com%2F969092014',
     }
     # Post through the session so the login cookie is stored on it.
     session.post(url=login_url, headers=headers, data=data)

     # Scrape the personal profile page using the logged-in session.
     personal_url = 'http://www.renren.com/969092014/profile'
     page_text = session.get(url=personal_url, headers=headers).text
     with open('./renren.html', 'w', encoding='utf-8') as fp:
         fp.write(page_text)
         print('over')
    主代码
  • 相关阅读:
    [nodejs]npm国内npm安装nodejs modules终极解决方案
    [nodejs]解决mysql和连接池(pool)自动断开问题
    [nodejs]国内npm安装nodejs modules失败的几个解决方案
    [less]用webstorm自动编译less产出css和sourcemap
    [javascript] Promise API
    [javascript]巧用sourcemap快速定位javascript中的问题
    Gruntjs提高生产力(四)
    Gruntjs提高生产力(三)
    Gruntjs提高生产力(二)
    Gruntjs提高生产力(一)
  • 原文地址:https://www.cnblogs.com/duanhaoxin/p/10110909.html
Copyright © 2020-2023  润新知