• requests模块的进阶使用


    代理

    • 代理服务器,可以接受请求后将其转发,从而更换我们请求所对应的ip
    • 匿名度
      • 高匿:服务器既不知道我使用了代理,也不知道我的真实ip
      • 匿名:服务器会知道我使用了代理但是不知道我的真实ip
      • 透明:服务器会知道我使用了代理并且知道我的真实ip
    • 类型
      • http:意味着这个服务器只能转发http协议的请求
      • https:只能转发https协议的请求
    • 免费代理
    • 为了避免ip失效,我们应该构建一个代理池,里面存放可用的代理
    • 代理池的构建同样是通过爬虫实现的:我们可以去快代理网站爬取免费代理,但是频繁访问快代理同样会被封ip,所以需要先购买少量代理,再借助这些代理去爬取快代理,有点以小博大的意思(这里选用的代理网站仅用于举例,重点在于方法)
    # 代理池的用法
    # ip_pool = [
    #     {'https':'https://171.35.149.66:9999'},
    #     {'https':'https://171.35.149.66:9999'},
    #     ...
    # ]
    
    # url = 'https://www.baidu.com/s?wd=ip'
    # page_text = requests.get(url=url,headers=headers,proxies=random.choice(ip_pool)).text
    # with open('ip.html','w',encoding='utf-8') as fp:
    #     fp.write(page_text)
    
    import requests
    from lxml import etree
    import random
    
    # Send a desktop-browser User-Agent and ask the server to close the
    # connection after each response.
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36',
        'Connection':'close'
    }

    # Step 1: fetch the purchased proxies.
    # The vendor generates an extraction URL after purchase; paste it here in
    # place of the placeholder.
    ip_url = '.....'
    page_text = requests.get(url=ip_url, headers=headers).text
    tree = etree.HTML(page_text)
    # requests expects `proxies` to be a {scheme: proxy_url} dict, so wrap every
    # extracted entry instead of passing the bare text node.
    # NOTE(review): assumes each text node is a single "ip:port" entry --
    # confirm against the vendor's actual response format.
    ip_pool = [
        {'https': 'https://' + address.strip()}
        for address in tree.xpath('//body//text()')
        if address.strip()
    ]
    if not ip_pool:
        raise RuntimeError('no purchased proxies could be extracted from ip_url')

    # Step 2: crawl the free-proxy listing through a randomly chosen purchased proxy.
    url = 'https://www.kuaidaili.com/free/inha/%d/'
    proxy_http_list = []
    proxy_https_list = []
    for page in range(1, 20):
        new_url = url % page
        page_text = requests.get(new_url, headers=headers, proxies=random.choice(ip_pool)).text
        tree = etree.HTML(page_text)
        # [1:] skips the first row of the table.
        # Original author's note: `tbody` must not appear in the XPath expression.
        tr_list = tree.xpath('//*[@id="list"]/table//tr')[1:]

        for tr in tr_list:
            ip = tr.xpath('./td[1]/text()')[0]
            port = tr.xpath('./td[2]/text()')[0]
            # The page lists the type in upper case; requests looks proxies up by
            # the lower-case URL scheme, so normalise before using it as a key.
            scheme = tr.xpath('./td[4]/text()')[0].strip().lower()
            dic = {scheme: scheme + '://' + ip + ':' + port}
            if scheme == 'http':
                proxy_http_list.append(dic)
            else:
                proxy_https_list.append(dic)

    print(len(proxy_http_list), len(proxy_https_list))   # how many of each kind were collected

    # Step 3: check which proxies actually work.
    # Let each proxy visit some site (http and https proxies must be checked
    # separately, against a URL of the matching scheme); a 200 status code
    # means the proxy is usable.
    usable_http_proxies = []
    for proxy in proxy_http_list:
        try:
            # The timeout keeps one dead proxy from stalling the whole check.
            response = requests.get('http://...', headers=headers, proxies=proxy, timeout=5)
        except requests.RequestException:
            # Unreachable or broken proxy: skip it rather than abort the loop.
            continue
        # status_code is an int, so compare with 200, not the string '200'.
        if response.status_code == 200:
            usable_http_proxies.append(proxy)   # keep the usable proxy
    
    • cookie的处理

      • cookie的作用:可以让浏览器记录和保存我们当前客户端的某些状态
      • 手动处理:将cookie封装到headers中
      • 自动处理:创建一个session对象,该对象可以像requests一样进行请求发送.不同之处在于如果使用session进行请求发送的过程中产生了cookie,则cookie会自动保存在session对象中.
    • 模拟登陆

    • 实现验证码识别

    • 动态变化的请求参数

      • 通常情况下动态变化的请求参数都会被隐藏在前台页面源码中
    • 模拟登陆失败的话检查以下原因

      • 核对用户名和密码有无出错
      • 核对验证码是否正确
      • 检查请求参数是否为动态变化的
      • 最后检查cookie
    # 对雪球网的新闻数据进行爬取  https://xueqiu.com/
    # 手动处理cookie
    import requests
    # Manual cookie handling: the cookie is sent explicitly as a request header.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36',
        'Cookie': 'acw_tc=2760820e15998126544901640efe534c6e7b7ac5b225fee97814d7b22bf4a5; xq_a_token=4db837b914fc72624d814986f5b37e2a3d9e9944; xqat=4db837b914fc72624d814986f5b37e2a3d9e9944; xq_r_token=2d6d6cc8e57501dfe571d2881cabc6a5f2542bf8; xq_id_token=eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJ1aWQiOi0xLCJpc3MiOiJ1YyIsImV4cCI6MTYwMDQ4MzAwNywiY3RtIjoxNTk5ODEyNjE5NDkzLCJjaWQiOiJkOWQwbjRBWnVwIn0.Uf32VSRz3a-_uxv4lrdqq3K8GTOBcz4EIhTbL1pV3eTbFOv5JU89CmD2svrgJ6cBpUys0LGNuoR9l82uemyMjz5d3vtOijWOLui-VEcFGm1_U4gV90OsB9cH7CG7FE8xy0_O300VyaFjHtV6dDOfbrJdUz3ijLMntMoF3X_0454gUxqYZ78cAY5zwKYpt-BEUQFfJ2eAccbO2Gu432-fK3I6YT5kjkxHr-L0dIZIuEkqKNZBkzTG_dyoADaUG7vs-RL7N8gdWNaL3dfyeB2i8WsoLNiML8fyK5r-ujROfrhUKC8AcIof2fXT5a0mRXfG2ZLUzVE3udvqxyPhrc7XZA; u=601599812654691; Hm_lvt_1db88642e346389874251b5a1eded6e3=1599812658; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1599812658; device_id=24700f9f1986800ab4fcc880530dd0ed',
    }

    # JSON endpoint for the hot-news list.
    url = 'https://xueqiu.com/statuses/hot/listV2.json?since_id=-1&max_id=105684&size=15'
    response = requests.get(url=url, headers=headers)
    # Despite its name, page_text holds the decoded JSON payload, not raw text.
    page_text = response.json()
    print(page_text)
    
    #创建session对象,自动处理cookie
    import requests
    # Automatic cookie handling: only a User-Agent is supplied by hand.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36',
    }

    session = requests.Session()
    # Visit the home page first so that the session picks up the site's cookies.
    session.get('https://xueqiu.com/', headers=headers)

    # The same session (and therefore its stored cookies) is reused for the data request.
    url = 'https://xueqiu.com/statuses/hot/listV2.json?since_id=-1&max_id=105684&size=15'
    response = session.get(url=url, headers=headers)
    # Despite its name, page_text holds the decoded JSON payload, not raw text.
    page_text = response.json()
    print(page_text)
    

    模拟登陆

    # 超级鹰定义的类
    import requests
    from hashlib import md5
    
    class Chaojiying_Client(object):
        """Small HTTP client for the Chaojiying captcha-recognition service."""

        def __init__(self, username, password, soft_id):
            """Remember the account details used by every later request.

            username: Chaojiying account name.
            password: plain-text password; only its MD5 hex digest is stored.
            soft_id: software id generated in the Chaojiying user centre.
            """
            self.username = username
            self.password = md5(password.encode('utf8')).hexdigest()
            self.soft_id = soft_id
            # Form fields that accompany every request.
            self.base_params = {
                'user': self.username,
                'pass2': self.password,
                'softid': self.soft_id,
            }
            self.headers = {
                'Connection': 'Keep-Alive',
                'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
            }

        def PostPic(self, im, codetype):
            """Upload an image for recognition and return the decoded JSON reply.

            im: image bytes.
            codetype: question type, see http://www.chaojiying.com/price.html
            """
            # base_params is merged in last, exactly as an update() call would do.
            form = {'codetype': codetype, **self.base_params}
            upload = {'userfile': ('ccc.jpg', im)}
            reply = requests.post(
                'http://upload.chaojiying.net/Upload/Processing.php',
                data=form,
                files=upload,
                headers=self.headers,
            )
            return reply.json()

        def ReportError(self, im_id):
            """Report a wrongly recognised image and return the decoded JSON reply.

            im_id: picture id of the question being reported.
            """
            form = {'id': im_id, **self.base_params}
            reply = requests.post(
                'http://upload.chaojiying.net/Upload/ReportError.php',
                data=form,
                headers=self.headers,
            )
            return reply.json()
    
    # 识别古诗文网的验证码 https://www.gushiwen.org/
    # 模拟登陆
    
    # 获取验证码
    def transformImgData(img_path,img_type):
        """Recognise the captcha stored at ``img_path`` and return its text.

        img_path: path of the image file; it is read in binary mode.
        img_type: Chaojiying question-type code, forwarded to ``PostPic``.

        Returns the ``'pic_str'`` field of the service's JSON reply.
        """
        # Fill in, in order: the Chaojiying user name, password and generated software id.
        chaojiying = Chaojiying_Client(UserName, Password, 软件id)
        # Use a context manager so the file handle is closed even if read() fails.
        with open(img_path, 'rb') as fp:
            im = fp.read()
        return chaojiying.PostPic(im, img_type)['pic_str']
    
    import requests
    # Simulated login to so.gushiwen.cn: load the login page, download and
    # recognise its captcha, then POST the login form -- all through one session.
    # NOTE(review): `etree` is used below but is not imported in this snippet;
    # it relies on the earlier `from lxml import etree`.
    headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36',
    }
    url = 'https://so.gushiwen.cn/user/login.aspx'
    
    # Use a session so that cookies produced by the requests below are kept
    # and sent again automatically.
    session = requests.Session()
    
    # Fetch the login page and download the captcha image it references.
    page_text = session.get(url=url,headers=headers).text
    tree = etree.HTML(page_text)
    img_url = 'https://so.gushiwen.cn'+tree.xpath('//*[@id="imgCode"]/@src')[0]  # captcha image URL (site prefix + the src attribute)
    img_data = session.get(url=img_url,headers=headers).content
    with open('./code.jpg','wb') as fp:
        fp.write(img_data)
    
    # Extract the dynamically changing form parameters from the same page.
    # NOTE(review): these keep the raw xpath() results (no [0] indexing,
    # unlike img_url above) and are posted as-is.
    __VIEWSTATEGENERATOR = tree.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value')  
    __VIEWSTATE = tree.xpath('//*[@id="__VIEWSTATE"]/@value')
    
    # Recognise the captcha that was just saved to disk.
    code = transformImgData('./code.jpg',1004)
    print(code)
    
    # Simulate the login by posting the form through the same session.
    login_url = 'https://so.gushiwen.cn/user/login.aspx?from='
    data = {
        '__VIEWSTATE': __VIEWSTATE,
        '__VIEWSTATEGENERATOR': __VIEWSTATEGENERATOR,
        'from': '',
        'email': '...',      # fill in the gushiwen account and password
        'pwd': '...',
        'code': code,
        'denglu': '登录',
    }
    login_page = session.post(url=login_url,headers=headers,data=data).text
    # Save the response page so the login result can be inspected.
    with open('./login.html','w',encoding='utf-8') as fp:
        fp.write(login_page)
    
  • 相关阅读:
    js----定义变量的几种方式
    Vue----项目增加百度统计
    Vuex----核心概念和API
    Vuex----理解
    回到学校,国庆收假的第一天
    再次回到武汉
    收获的季节,最忙其实也是最没有收获的时光
    虚无缥缈的自信,一落千丈的打击
    愤怒、愤怒,终于适应了奔波
    总是骗人的你
  • 原文地址:https://www.cnblogs.com/straightup/p/13664753.html
Copyright © 2020-2023  润新知