• requests模块的简单使用+爬虫小程序


    爬虫之requests

    各种请求方式

    get

    # Target site for the GET example.
    host_url = 'https://www.pearvideo.com/'
    # Browser version and related client information, sent as the User-Agent header.
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
    }
    # Issue the GET request with the custom headers; `res` holds the response object.
    res = requests.get(host_url, headers=headers)
    

    post

     # Send a POST request carrying the key/value payload to httpbin's /post endpoint.
     r = requests.post('http://httpbin.org/post', data = {'key':'value'})
    

    delete

    # Send a DELETE request to httpbin's /delete endpoint.
    r = requests.delete('http://httpbin.org/delete')
    

    put

    # Send a PUT request carrying the key/value payload to httpbin's /put endpoint.
    r = requests.put('http://httpbin.org/put', data = {'key':'value'})
    

    响应response的属性

    import requests
    # Fetch a page so the attributes of the returned response can be inspected.
    response = requests.get('http://www.jianshu.com')

    # Response body as a text string.
    print(response.text)
    # Response body as raw bytes.
    print(response.content)
    # HTTP status code of the response.
    print(response.status_code)

    # Response headers, followed by the response's cookie jar.
    print(response.headers)
    print(response.cookies)
    # The same cookies as a plain dict, then as (name, value) items.
    print(response.cookies.get_dict())
    print(response.cookies.items())

    # URL of the response and the `history` attribute recorded on it.
    print(response.url)
    print(response.history)

    # Encoding attribute of the response.
    print(response.encoding)
    

    爬取梨视频首页视频

    import os
    import re
    from concurrent.futures import ThreadPoolExecutor
    
    import requests
    
    host_url = 'https://www.pearvideo.com/'
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
    }
    
    
    def get_index():
        """Fetch the page at ``host_url`` and return its body as text.

        The module-level ``headers`` are sent with the request.
        """
        index_response = requests.get(host_url, headers=headers)
        page_html = index_response.text
        return page_html
    
    
    def parser_index(text):
        """Extract the video detail-page URLs from the home page HTML.

        Args:
            text: HTML source of the home page.

        Returns:
            A list of absolute URLs, one per ``vervideo-lilink actplay``
            anchor, in the order the anchors appear in ``text``.
        """
        # Imported locally so the function does not rely on a file-level import.
        from urllib.parse import urljoin

        hrefs = re.findall(r'<a href="(.*?)" class="vervideo-lilink actplay">', text)
        # urljoin resolves relative and root-relative hrefs against the site
        # root; plain concatenation would produce "//" for hrefs starting
        # with "/".
        return [urljoin(host_url, href) for href in hrefs]
    
    
    def get_detail(html_text):
        """Pull the video download URL and the title out of a detail page.

        Args:
            html_text: HTML source of a single video detail page.

        Returns:
            A dict with two keys: ``download_index`` (the ``.mp4`` URL found
            in the ``srcUrl="..."`` entry) and ``title`` (the text of the
            ``<h1 class="video-tt">`` heading).

        Raises:
            ValueError: if the page contains no ``srcUrl`` ``.mp4`` entry or
                no title heading.
        """
        # Download address of the video. The dot is escaped so that only a
        # literal ".mp4" suffix matches, not any character followed by "mp4".
        url_match = re.search(r'srcUrl="(.*?\.mp4)"', html_text)
        if url_match is None:
            raise ValueError('no srcUrl .mp4 address found in the page')

        # Title of the video.
        title_match = re.search(r'<h1 class="video-tt">(.*?)</h1>', html_text)
        if title_match is None:
            raise ValueError('no video title found in the page')

        download_index = url_match.group(1)
        title = title_match.group(1)
        print('成功链接到[%s]视频文件' % title)
        return {
            'download_index': download_index,
            'title': title,
        }
    
    
    def get_video(video_url, title):
        """Download one video and save it as ``down_pearvideos/<title>.mp4``.

        Args:
            video_url: Direct URL of the video file.
            title: Video title, used (after sanitising) as the file name.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        # exist_ok avoids the check-then-create race when several pool
        # workers reach this point at the same time.
        os.makedirs('down_pearvideos', exist_ok=True)

        # Path separators and characters that are illegal in file names would
        # make open() fail or write outside the download directory.
        safe_title = re.sub(r'[\\/:*?"<>|\r\n]', '_', title).strip() or 'untitled'
        file_path = os.path.join('down_pearvideos', safe_title) + '.mp4'

        # Stream the body to disk in chunks so a whole video is never held in
        # memory; the timeout keeps a stalled connection from hanging a worker.
        with requests.get(video_url, stream=True, timeout=30) as video_response:
            # Fail here so an error page is not saved as a video file.
            video_response.raise_for_status()
            with open(file_path, 'wb') as f:
                for chunk in video_response.iter_content(chunk_size=64 * 1024):
                    f.write(chunk)
        print(file_path + '下载成功!')
    
    
    if __name__ == '__main__':
        # Fetch the home page and collect the detail-page URLs it links to.
        text = get_index()
        url_list = parser_index(text)
        # The with-block waits for every queued download and shuts the pool
        # down on exit, including when an exception leaves the loop early.
        with ThreadPoolExecutor(10) as pool:
            futures = []
            for url in url_list:
                response = requests.get(url, headers=headers).text
                content_dic = get_detail(response)
                # Downloads run on worker threads so they overlap with the
                # fetching of the remaining detail pages.
                futures.append(
                    pool.submit(get_video, content_dic['download_index'], content_dic['title'])
                )
            # A worker's exception is only stored on its future; report it
            # here so a failed download is not silently dropped.
            for url, future in zip(url_list, futures):
                try:
                    future.result()
                except Exception as exc:
                    print('[%s]下载失败: %s' % (url, exc))
    
    

    模拟登陆github

    import re
    
    import requests
    
    # User-Agent header sent with both requests of the login flow.
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36"
    }

    login_url = 'https://github.com/login'

    # Step 1: load the login page to obtain the form token and the cookies.
    login_response = requests.get(login_url, headers=headers)
    token_match = re.search('name="authenticity_token" value="(.*?)"', login_response.text)
    if token_match is None:
        # Without the token the POST below cannot succeed; fail with a clear
        # message instead of an AttributeError on None.
        raise RuntimeError('authenticity_token not found on the login page')
    login_token = token_match.group(1)
    print(login_token)
    login_cookie = login_response.cookies.get_dict()
    print(login_cookie)

    session_url = 'https://github.com/session'

    # Step 2: submit the login form, sending back the cookies and the token
    # obtained in step 1.
    # NOTE(review): the credentials below are hard-coded in source; move them
    # to environment variables or a prompt before real use.
    session_response = requests.post(
        session_url,
        headers=headers,
        cookies=login_cookie,
        data={
            "commit": "Sign in",
            "utf8": "✓",
            "authenticity_token": login_token,
            "login": "yangyuanhu",
            "password": "123654asd"
        }
    )

    print(session_response.text)
    
    
  • 相关阅读:
    jQuery库冲突解决办法
    jquery源码 整体架构
    中文版Chrome浏览器不支持12px以下字体的解决方案
    html5 localStorage
    Git创建分支/GIT提交分支
    Git直接拉取远程分支
    vscode关闭后未打开上次界面的解决办法
    MAC升级nodejs和npm到最新版
    hadoop hue切换中文版
    Hdfs dfs命令使用
  • 原文地址:https://www.cnblogs.com/jianhaozhou/p/10302640.html
Copyright © 2020-2023  润新知