• 2 The GET and POST methods of requests


    The GET method of requests

    1 Searching for a keyword on Baidu and saving the current results page

    import requests
    
    keywords = input('Enter a keyword >>> ').strip()
    response = requests.get(
        'https://www.baidu.com/s',
        # requests URL-encodes this dict into the query string
        params={
            'wd': keywords,
            'pn': 20,
        },
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
        },
    )
    
    if response.status_code == 200:
        with open('b.html', 'wt', encoding='utf-8') as f:
            f.write(response.text)
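
    To see exactly what query string the params dict produced, you can inspect response.url; a minimal sketch (the keyword is just an example):

    import requests
    
    response = requests.get('https://www.baidu.com/s',
                            params={'wd': 'python', 'pn': 20})
    # requests percent-encodes the dict into the final URL
    print(response.url)  # https://www.baidu.com/s?wd=python&pn=20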

    2 A GET request to Zhihu

    import requests
    
    response = requests.get('https://www.zhihu.com',
                            headers={
                                'Referer': 'https://www.zhihu.com/',
                                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
                            })
    
    with open('c.html', 'wt', encoding='utf-8') as f:
        f.write(response.text)
    
    print(response.status_code)
    print(response.text)

    3 A GET request to GitHub

    import requests
    
    response = requests.get(url='https://github.com/',
                            headers={
                                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
                                'Cookie': '_octo=GH1.1.1333562301.1559296277; _ga=GA1.2.392559115.1559296287; has_recent_activity=1; _gat=1; tz=Asia%2FShanghai; _device_id=0dcf09aab9c4d288aaa33f26fecd1309; user_session=Yp-WRUHkznMCmRXO6-WsL8QRfVCau3k7gQ56zIZHMHfVTRCB; __Host-user_session_same_site=Yp-WRUHkznMCmRXO6-WsL8QRfVCau3k7gQ56zIZHMHfVTRCB; logged_in=yes; dotcom_user=andygouyong; _gh_sess=TTFoakY4c0ZtcHVMc2wrdjJiMmtSejhvN0VsVnhqU01PdW9yL01CMFNHYjZOaUNGUTFmNjlQK0o5NXFmVU40L1AzeUxCV2x0VHBka2VkR3ZBRUtxVnU2YUJPTUM0T3RWM0E5OVJtSklJTmswMXl6WS9lY3lrMGYvd1FoU0NnNVNla0lrZE13TzlIekhoRDA5a1JHcXBIeDNBUXlLZnoxVkd5elNNRmdCUHVZbGttREtyd2JDUWcxS1ZaZFpJZ3pnWUx1Z2p3MEppTGZOZkVMWEMrQ01HRGJxcU5kMWJPa3V5d001OHVsNElaWUowYitYYlFxeDgxNXd4YVdlZEJ5bFViVFdtTCtGQTFHYWZWTjFiSzhodVBPNXdQLzMxSkx3ZkJCeFpUdWJQdzR2dkRhcFhTeTUvZkROczZpWC9GMlVaZjgzTmxhWG5wakh1WnpDOFZpdzZ3PT0tLVFZRmowSjkva3RGY3dqaU15b0VHTkE9PQ%3D%3D--4508766204caae7d9c3ecc0c6e7c0fc8ae887a7f'
                            })
    print(response.status_code)
    print(response.text)
    with open('d.html', 'wt', encoding='utf-8') as f:
        f.write(response.text)
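
    Pasting the raw Cookie header works, but requests can also take cookies as a dict through the cookies= parameter, which is easier to read; a minimal sketch (the values are placeholders, not real session data):

    import requests
    
    # each key/value pair becomes one cookie; requests assembles the Cookie header
    cookies = {'logged_in': 'yes', 'dotcom_user': 'andygouyong'}
    response = requests.get('https://github.com/',
                            cookies=cookies,
                            headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'})
    print(response.status_code)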

     The POST method of requests (simulating a GitHub login)

    #!/usr/bin/env python3
    # -*- coding: utf-8 -*-
    
    
    import re
    import requests
    
    # First fetch the login page to get the authenticity_token.
    # The url requested is 'https://github.com/login'
    # and the request method is GET.
    
    r1 = requests.get('https://github.com/login',
                      headers={
                          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
                      })
    
    # pull the CSRF token out of the login form
    authenticity_token = re.findall('name="authenticity_token" value="(.*?)"', r1.text, re.S)[0]
    
    r1_cookies = r1.cookies.get_dict()
    print(authenticity_token)
    print(r1_cookies)
    
    # Submit the form to complete the login.
    # Request method: POST
    # url: https://github.com/session
    # Request headers:
    # Referer: https://github.com/login
    # User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36
    
    # cookies=r1_cookies
    # Request body
    # form data
    #     commit: Sign in
    #     utf8: ✓
    #     authenticity_token: qGeaCNP3aTAb5B13GiLwYrrO9uth09TU9Wm0CnXBg3cNQowPJJDHHMj0BXjziy1M6uuQVpEScoa9SzubrXDNMg==
    #     login: your GitHub username
    #     password: your GitHub password
    
    r2 = requests.post(
        # the request url
        'https://github.com/session',
        # the request headers
        headers={
            'Referer': 'https://github.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
        },
        # the cookies from the GET request above
        cookies=r1_cookies,
        # the request body: pay special attention here
        data={
            'commit': 'Sign in',
            'utf8': '✓',  # matches the captured form data above
            # the authenticity_token extracted from the login page in the GET request
            'authenticity_token': authenticity_token,
            'login': 'your GitHub username',
            'password': "did you really think I would paste my password here?",
        },
        allow_redirects=True
    )
    
    with open('e.html', 'wt', encoding='utf-8') as f:
        f.write(r2.text)
    print(r2.status_code)
    print('Repositories' in r2.text)
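
    The same two-step flow can also be written with requests.Session(), which carries cookies across requests automatically, so there is no need to pass cookies= by hand; a minimal sketch of the idea (not the original code above):

    import re
    import requests
    
    session = requests.Session()  # one cookie jar shared across all requests
    
    r1 = session.get('https://github.com/login')
    token = re.findall('name="authenticity_token" value="(.*?)"', r1.text, re.S)[0]
    
    # the cookies set by r1 are sent automatically with this POST
    r2 = session.post('https://github.com/session', data={
        'commit': 'Sign in',
        'utf8': '✓',
        'authenticity_token': token,
        'login': 'your GitHub username',
        'password': 'your GitHub password',
    })
    print(r2.status_code)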

     3 Crawling Pearvideo

    Enough talk, here is the code:

    import requests
    import re
    import os
    from threading import Thread
    
    ppth = os.path.dirname(__file__)
    
    
    def get_index_page(url):
        # send a request to the target site
        response = requests.get(url)
        # a 200 status code means the request succeeded
        if response.status_code == 200:
            return response.text
    
    
    def parse_index_page(htmll):
        url = re.findall('class="vervideo-bd".*?href="(.*?)"', htmll, re.S)
        return url
    
    
    def get_detail_page(url):
        movie_text = requests.get(url).text
        return movie_text
    
    
    def parse_detail_page(text):
        movie_mp4 = re.findall('srcUrl="(.*?)"', text, re.S)
        title = re.findall('<h1 class="video-tt">(.*?)</h1>', text, re.S)
        # print(title)
        if movie_mp4:
            # print(movie_mp4[0])
            return {'title': title[0], 'movie': movie_mp4[0]}
    
    
    def download(movie_mp4):
        print(movie_mp4)
        title = movie_mp4['title']
        movie_url = movie_mp4['movie']
        response = requests.get(movie_url)
        if response.status_code == 200:
            # strip quote characters so the title is usable as a filename
            title = title.replace('"', ' ').replace("'", ' ').strip()
            print(title)
            filename = ppth + '/Download/' + title + '.mp4'
            with open(filename, 'wb') as f:
                f.write(response.content)
    
    
    # The original single-threaded version, kept for reference:
    # def main():
    #     # the base url
    #     base_url = 'https://www.pearvideo.com/category_{page}'
    #     for i in range(5):
    #         # fetch five listing pages
    #         url = base_url.format(page=i)
    #         # get the page's html
    #         htmll = get_index_page(url)
    #         # parse out the video page paths
    #         video_num = parse_index_page(htmll)
    #         for j in video_num:
    #             # build the full url of each video page
    #             url_end = base_url[0:26] + j
    #             # print(url_end)
    #             # parse the video page to find the .mp4 address
    #             movie_text = get_detail_page(url_end)
    #             # a dict: {'title': ..., 'movie': ...}
    #             movie_mp4 = parse_detail_page(movie_text)
    #             # print(movie_mp4)
    #             if movie_mp4:
    #                 download(movie_mp4)
    
    def main(base_url, i):
        # build the url of one listing page
        url = base_url.format(page=i)
        # get the page's html
        htmll = get_index_page(url)
        # parse out the video page paths
        video_num = parse_index_page(htmll)
        for j in video_num:
            # build the full url of each video page
            url_end = base_url[0:26] + j
            # print(url_end)
            # parse the video page to find the .mp4 address
            movie_text = get_detail_page(url_end)
            # a dict: {'title': ..., 'movie': ...}
            movie_mp4 = parse_detail_page(movie_text)
            # print(movie_mp4)
            if movie_mp4:
                download(movie_mp4)
    
    if __name__ == '__main__':
    
        # the base url
        base_url = 'https://www.pearvideo.com/category_{page}'
        # one thread per listing page, five pages in total
        for i in range(5):
            t = Thread(target=main, args=(base_url, i))
            t.start()
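
    Scraped titles can also contain other characters that are illegal in filenames (\ / : * ? < > | on Windows, for instance); a more thorough cleanup with re.sub, as a small sketch (sanitize_title is a hypothetical helper, not part of the original script):

    import re
    
    def sanitize_title(title):
        # replace anything unsafe in a filename with a space
        return re.sub(r'[\\/:*?"<>|\']', ' ', title).strip()
    
    print(sanitize_title('some "title"/with:bad*chars'))  # prints the cleaned title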

     4 The response object

    1 Attributes of response

    import requests
    
    response = requests.get('http://www.jianshu.com')
    # response attributes
    print(response.text)
    print(response.content)
    
    print(response.status_code)
    print(response.headers)
    print(response.cookies)
    print(response.cookies.get_dict())
    print(response.cookies.items())
    
    print(response.url)
    print(response.history)
    
    print(response.encoding)
    
    # to close the connection: response.close()
    from contextlib import closing
    with closing(requests.get('xxx', stream=True)) as response:
        for line in response.iter_content():
            pass

    2 Encoding issues

    # encoding issues
    import requests
    
    response = requests.get(
        'https://www.autohome.com.cn/shanghai/',
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
        }
    )
    # Autohome serves its pages as gb2312, while requests defaults to ISO-8859-1,
    # so the Chinese text is garbled unless the encoding is set to gbk.
    
    response.encoding = 'gbk'
    print(response.text)
    with open('f.html', 'wt', encoding='gbk') as f:
        f.write(response.text)
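
    If you prefer not to hard-code the charset, requests can guess it from the body via response.apparent_encoding (chardet-based); a minimal sketch:

    import requests
    
    response = requests.get('https://www.autohome.com.cn/shanghai/')
    # apparent_encoding is detected from the bytes of the body itself
    response.encoding = response.apparent_encoding
    print(response.encoding)  # e.g. GB2312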

    3 Fetching binary data

    import requests
    
    response = requests.get('https://images.cnblogs.com/cnblogs_com/ouyang99-/1395591/o_1521768608804.jpg')
    with open('a.jpg', 'wb') as f:
        # use content when writing binary data
        f.write(response.content)

    4 When the data is large, reading it all at once can blow up memory; in that case, stream it:

    import requests
    
    # stream=True keeps the body on the wire instead of loading it all into memory up front
    response = requests.get('https://images.cnblogs.com/cnblogs_com/ouyang99-/1395591/o_1521768608804.jpg',
                            stream=True)
    with open('a.jpg', 'wb') as f:
        # iter_content yields the binary data chunk by chunk
        for line in response.iter_content(chunk_size=1024):
            f.write(line)
    # writing the file piece by piece like this avoids the memory problem above

    5 Parsing JSON

    # parsing json
    import requests
    
    response = requests.get('http://httpbin.org/get')
    
    import json
    res1 = json.loads(response.text)  # the long way round
    
    res2 = response.json()  # get the parsed json directly
    
    print(res1 == res2)  # True
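
    Note that response.json() raises a ValueError when the body is not valid json, so it is worth guarding; a minimal sketch (httpbin's /html endpoint returns html, not json):

    import requests
    
    response = requests.get('http://httpbin.org/html')  # returns html, not json
    try:
        data = response.json()
    except ValueError:  # raised when the body is not valid json
        data = None
    print(data)  # None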

    5 Advanced usage of response

    1、SSL Cert Verification

    # certificate verification (most sites are https)
    import requests
    
    response = requests.get('https://www.12306.cn')  # for an https request the certificate is checked first; if it is invalid, an error is raised and the program stops
    
    
    # improvement 1: suppress the error, but a warning is still printed
    import requests
    
    response = requests.get('https://www.12306.cn', verify=False)  # skip certificate verification; warns, but returns 200
    print(response.status_code)
    
    
    # improvement 2: suppress both the error and the warning
    import requests
    from requests.packages import urllib3
    urllib3.disable_warnings()  # silence the warning
    response = requests.get('https://www.12306.cn', verify=False)
    print(response.status_code)
    
    # improvement 3: supply the certificate
    # Many https sites can be visited without a client certificate; for most of them, carrying one is optional.
    # Zhihu, Baidu and the like work either way.
    # Some sites require it: only designated users who have been issued a certificate may access them.
    import requests
    
    response = requests.get('https://www.12306.cn',
                            cert=('/path/server.crt',
                                  '/path/key'))
    print(response.status_code)
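    Besides True/False, verify can also be a path to a CA bundle, which lets you verify against a private CA without turning verification off; a minimal sketch (the path is a placeholder):

    import requests
    
    # point verify at a CA bundle (PEM file) to trust a private CA
    response = requests.get('https://www.12306.cn',
                            verify='/path/to/ca-bundle.crt')
    print(response.status_code)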

    2、Using a proxy

    # official docs: http://docs.python-requests.org/en/master/user/advanced/#proxies
    
    # proxy setup: the request is sent to the proxy first, and the proxy forwards it (getting an ip banned is common)
    import requests
    
    proxies = {
        # with a username and password, the part before the @ is user:password
        # (note: the original listed two 'http' keys; the second silently overrides the first)
        # 'http': 'http://egon:123@localhost:9743',
        'http': 'http://localhost:9743',
        'https': 'https://localhost:9743',
    }
    response = requests.get('https://www.12306.cn',
                            proxies=proxies)
    
    print(response.status_code)
    
    
    
    # socks proxies are supported too; install the extra first: pip install requests[socks]
    import requests
    
    proxies = {
        'http': 'socks5://user:pass@host:port',
        'https': 'socks5://user:pass@host:port'
    }
    response = requests.get('https://www.12306.cn',
                            proxies=proxies)
    
    print(response.status_code)
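    requests also honors the standard proxy environment variables, so a proxy can be configured without changing the code; a minimal sketch:

    import os
    import requests
    
    # set before the request (or export them in the shell); requests picks these up
    os.environ['HTTP_PROXY'] = 'http://localhost:9743'
    os.environ['HTTPS_PROXY'] = 'http://localhost:9743'
    
    response = requests.get('https://www.12306.cn')  # goes through the proxy above
    print(response.status_code)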

    3、Timeout settings

    # timeout settings
    # two forms of timeout: a float or a tuple
    # timeout=0.1        # a single float applies to both the connect and the read timeout
    # timeout=(0.1,0.2)  # 0.1 is the connect timeout, 0.2 is the read timeout
    
    import requests
    
    response = requests.get('https://www.baidu.com',
                            timeout=0.0001)

    4、Authentication

    # official docs: http://docs.python-requests.org/en/master/user/authentication/
    
    # Authentication: some sites pop up a dialog asking for a username and password
    # (much like alert) when you visit them, and the html cannot be fetched until you pass it.
    # Under the hood this is just another request header:
    #         r.headers['Authorization'] = _basic_auth_str(self.username, self.password)
    # Most sites do not use this default scheme; they roll their own.
    # To handle those, mimic the site's scheme yourself: write your own counterpart
    # of _basic_auth_str and put the resulting string into the header:
    #         r.headers['Authorization'] = func('.....')
    
    # here is the default (basic auth) scheme, even though most sites do not use it
    import requests
    from requests.auth import HTTPBasicAuth
    
    r = requests.get('xxx', auth=HTTPBasicAuth('user', 'password'))
    print(r.status_code)
    
    # HTTPBasicAuth can be shortened to:
    import requests
    
    r = requests.get('xxx', auth=('user', 'password'))
    print(r.status_code)

    5、Exception handling

    # exception handling
    import requests
    from requests.exceptions import *  # see requests.exceptions for the available exception types
    
    try:
        r = requests.get('http://www.baidu.com', timeout=0.00001)
    except ReadTimeout:
        print('===:')
    # except ConnectionError:  # network unreachable
    #     print('-----')
    # except Timeout:
    #     print('aaaaa')
    
    except RequestException:
        print('Error')

    6、Uploading files

    import requests
    
    files = {'file': open('a.jpg', 'rb')}
    response = requests.post('http://httpbin.org/post', files=files)
    print(response.status_code)
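
    To also control the filename and content type that go into the multipart body, the value may be a (filename, fileobj, content_type) tuple; a minimal sketch:

    import requests
    
    # (filename, file object, content type) for the multipart field
    files = {'file': ('a.jpg', open('a.jpg', 'rb'), 'image/jpeg')}
    response = requests.post('http://httpbin.org/post', files=files)
    print(response.status_code)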