• 爬虫之requests


    发送请求

    '''
    1.发送请求:
        - pip3 install requests
    '''
    
    import requests
    from urllib.parse import urlencode
    
    # 1. Request URL (GET). The search keyword "美女" is already percent-encoded
    #    (UTF-8) in the query string.
    url = 'https://www.baidu.com/s?wd=%E7%BE%8E%E5%A5%B3'

    # Request headers mimicking a desktop Chrome browser.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
    }

    # 2. Send the GET request with the requests module; the return value is a
    #    Response object.
    #    A timeout is passed explicitly because requests waits indefinitely by
    #    default, which would hang the script on a stalled connection.
    response = requests.get(url, headers=headers, timeout=10)

    print(response.status_code)  # 200 on success
    # Print the decoded response body.
    print(response.text)

    # Keep a copy of the page on disk so it can be opened in a browser.
    with open('baidu.html', 'w', encoding='utf-8') as f:
        f.write(response.text)
    
    
    
    demo2
    import requests
    from urllib.parse import urlencode
    
    # 1. Request URL (GET). urlencode() percent-encodes the non-ASCII keyword,
    #    so the query string does not have to be written by hand.
    url = 'https://www.baidu.com/s?' + urlencode({'wd': '美女'})

    # Request headers mimicking a desktop Chrome browser.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
    }

    # 2. Send the GET request with the requests module; the return value is a
    #    Response object.
    #    A timeout is passed explicitly because requests waits indefinitely by
    #    default, which would hang the script on a stalled connection.
    response = requests.get(url, headers=headers, timeout=10)

    print(response.status_code)  # 200 on success
    # Print the decoded response body.
    print(response.text)

    # Keep a copy of the page on disk so it can be opened in a browser.
    with open('meinv.html', 'w', encoding='utf-8') as f:
        f.write(response.text)
    
    
    
    demo3
    import requests
    
    # 1. Request URL (GET). The query string is not part of the URL here; it is
    #    supplied through the `params` argument below.
    url = 'https://www.baidu.com/s?'

    # Request headers mimicking a desktop Chrome browser.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
    }

    # 2. Send the GET request with the requests module; the return value is a
    #    Response object. requests encodes `params` and appends it to the URL.
    #    A timeout is passed explicitly because requests waits indefinitely by
    #    default, which would hang the script on a stalled connection.
    response = requests.get(
        url,
        headers=headers,
        params={'wd': '美女'},
        timeout=10,
    )

    print(response.status_code)  # 200 on success
    # Print the decoded response body.
    print(response.text)

    # Keep a copy of the page on disk so it can be opened in a browser.
    with open('girl.html', 'w', encoding='utf-8') as f:
        f.write(response.text)
    
    
    

    通过get请求,绕过github登录

    '''
    1、携带github登录过后的cookies信息,访问github的邮箱设置页面(settings/emails),绕过登录;
    
    用户名:aaa
    邮箱:bbb
    密码:ccc
    
    
    
    1.请求url:
        - https://github.com/settings/emails
    
    2.请求方式:
        - GET
    
    3.请求头:
        - cookies:
            Cookie: _octo=GH1.1.816288935.1571284371; has_recent_activity=1; tz=Asia%2FShanghai; _device_id=a00ea80a1caf3404bd890d1a520083a6; user_session=y_4qkWDvHylVnzfIezSMSCaNdAnH0fTBw7pmm9yk4ZLmh6Ha; __Host-user_session_same_site=y_4qkWDvHylVnzfIezSMSCaNdAnH0fTBw7pmm9yk4ZLmh6Ha; logged_in=yes; dotcom_user=Michaeljy; _gh_sess=Vy82WmxrcFZmREZvV0dnTUtIaENDVWhPY3M1dEpwRVNWelVmYjk3TWQzQVpVVS84S3hhOXgrMkhRR2lHZUhZNUwvQzU1U3RHUWxWNmhITDFVV3FFcTUreFdPaWsyT2hTZlllQ2Fld3dFVmR5Q0w2RDdKbEduZENidUxySkd6MXVOKzdXQURsRVpSZDVUUjY4TE83c0JCK3BrcWlmeFVBZzhxVGFiVDRPd01HUlI0M3FrVFNLZ3dyNlJ6K0pQaWdQZHFYUFdISFRHT1ZZc3k4UDNFaVB4eWhLY3NTZnRaK0JTb3JlZzhGd3o1eFBaeHh1eCtReFZ3dWFsQXBqSWdNc0ltODdVTTk2bjVrdytFekNyWGlEZnFSdGZLSTZ6eW1jY1pvUGJYWWRIU0RtK01BWC9yOFRqb2FPNENJRUZra3F2V250dVZWMys3ajl5ZHVXQThUVGRQSHhmSDNwUXdnTUFUb0grMytlT1Y5U1RqVEYzak4zV1RIRURVbktycXR5dG1NZVJYNWZLdGxVR1hZeFFmemV0dFhiZXhiZHRKTGw0RHZvYmh6NEJmdHRPVkdFSENzaHhRSjJzVWlya2R4N3N3S3FWVmxpVkl0cWs0bmUvNmhlMGpwU3hNdTFZKzI5eCt1c0RjRVViVjY4Z2E4YzFtbkRhVmtybUZabGg4VVVlaUtLV3FEUUVMdjRvUnU1amdpbnJUTStOV1NIbGpTRGNRc25pbjVlazZ5MzV0b2tVek1qaW1HUDhMWVU1bzhjN3BsYkQ1MXVkaWg2STROdDRCNWVxMUtCTUt2L0o1Y01aTWsxM1BQNUJSZUxVcnBWMXRYdStlTlA2ZFhiVERYRVcwYUZMeFlmNjNxUTN1a0dNSVQySlZQTnpiTTY2UDk2WWMycG9iQzB4SWlRNnFhM3RVRXFxUHF4ZHBmd2R6Rk0wcFQ3b0VvNkJCd1NQZGhGNThKaHdzcEtpSXZMUUxRWVVlOHUyVFFRQmZBdURBZnNXRk9JRTljQVMzWmExSXprWFI2RVMrYkNMaVJybGR0cC9vN09rWTVvYXdHRWxaMmQwdmYzenM5dDVLMXVpTEhkWnNXVkNyWTRVd3JiSWFUVkhDak9Qb0xUekxjTk5PWWxtb2d6V1hZSGNOMkhkdWlVNXY1TTExREc0RW9mUGlVK1FoYVNYLzRSM21OMGFabkZyWHBEOFBpNXBRSU9rYkRXYVczVi8yc3ZPYURJZGUzY1AwN3l0S0szci8yc1J6WmhkQ3ZFeVJuT1pkR2FkMys3d0xTa21QNE80M3hVby82TmJ4MG1DWFZ6ZDlDR3RMaUJBNXhSVHhnMmRwMWxNSTRTR2hRM1JXdncyRklEWlA3cTU2cGRpU0wyVUljZmtGVVNXN2ltTG54cW52QzloWitSZjRVdUh3NnNrcEVWdTFoM1JXNnFqVW55SXRDTmY5WXEvakQ0bk9lWlpFRTMwbm9qWkp0QXlWZkE2MzlQNXJUbk51WkttTFE4QVB3Y1pzZDc0Zlp0ei81U1FKS3JnUVV6SUZoMkhSZXBsL0M0VXBLTUNKd2YvQ2JJTmpVK2JtZ1J6VkE4MW9lVTlZRVVlOHRHd1dNUDJLYkFSbTFGbExuYmFtOHkyc0p6OTdOYnp4ZWJlTjdxR0V2eWZRYXlqeEhmTC92
QlJGa2lmaVU9LS1HQmU4cXZCL0trdVVTMW5QVXkzWUdnPT0%3D--e417927d7291752b7527f64c960dbe38450f2543
        - User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36
    
        - Referer: https://github.com/settings/profile
    
    '''
    import requests
    
    import os

    url = 'https://github.com/settings/emails'

    # The Cookie header of a logged-in github.com browser session is read from
    # the GITHUB_COOKIE environment variable instead of being pasted into the
    # source. A session cookie is a credential and should not live in a file
    # that gets shared, and a literal that long is easy to corrupt: a line
    # break inside a single-quoted string is a syntax error.
    github_cookie = os.environ.get('GITHUB_COOKIE')
    if not github_cookie:
        raise SystemExit(
            'Set the GITHUB_COOKIE environment variable to the Cookie request '
            'header copied from a logged-in github.com browser session.'
        )

    github_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
        'Cookie': github_cookie,
    }

    # 1. Send the request; the cookie travels in the `headers` keyword argument.
    #    A timeout is passed explicitly because requests waits indefinitely by
    #    default.
    response = requests.get(
        url,
        headers=github_headers,
        timeout=10,
    )

    print(response.status_code)  # 200
    # 'bbb' is the placeholder for the account's email address; it is expected
    # to appear on the page only when the cookie was accepted.
    print('bbb' in response.text)  # True
    with open('emails.html', 'w', encoding='utf-8') as f:
        f.write(response.text)
    
    
    

    通过post请求永久绕过github登录

    '''
    用户名:aaa
    邮箱:bbb
    密码:ccc
    
    1.先分析 http 的请求流程
        - 请求url:
            Request URL: https://github.com/session
    
        - 请求方式:
            Request Method: POST
    
        - 请求头:
            - Referer: https://github.com/login
            - User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36
    
        - 请求体: (form data):
            commit: Sign in
            utf8: ✓
            authenticity_token: finQLn5TxHAabDmefQ2EbBXV27jDGlWm6DUu+u5u4J6QnPXBmkc76/QlDpx61v1NFf3AP8r+vg1Cq31G9Wxenw==
            ga_id:
            login: aaa
            password: ccc
            webauthn-support: supported
            webauthn-iuvpaa-support: supported
            required_field_a359:
            timestamp: 1577696492100  # 时间戳
            timestamp_secret: 03e50e82485174cadc2dda90916b93bfeadef0ac92643cbfde40e6c7f598bbb6
    
    
        - 1) 先往https://github.com/login页面发送get请求,获取authenticity_token与timestamp_secret随机加密字符串
        - 2) 携带加密字符串与请求体所有的信息,一并通过post请求访问https://github.com/session
    
    
    2.再写爬虫代码
    
    '''
    import requests
    import re
    
    # Placeholder credentials (user name: aaa, email: bbb, password: ccc);
    # replace them with a real account before running.
    login = 'aaa'
    password = 'ccc'


    def _hidden_input(name, html):
        """Return the value of the hidden <input> named *name* found in *html*.

        Raises RuntimeError with the field name when the input is missing, so
        a change in the page markup is reported clearly instead of surfacing
        as a bare IndexError.
        """
        match = re.search(
            r'<input type="hidden" name="%s" value="(.*?)"' % re.escape(name),
            html,
            re.S,
        )
        if match is None:
            raise RuntimeError('hidden input %r not found on the login page' % name)
        return match.group(1)


    # A Session keeps one cookie jar for every request below. Response.cookies
    # only contains the cookies set by that final response, so cookies set on
    # an intermediate redirect (presumably how a successful login answers)
    # would be lost when they are forwarded by hand.
    session = requests.Session()
    login_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
    }
    session.headers.update(login_headers)

    # 1) GET https://github.com/login to obtain the per-page random values
    #    (authenticity_token, timestamp, timestamp_secret).
    url = 'https://github.com/login'
    login_response = session.get(url, timeout=10)

    # print(login_response.status_code)
    # print(type(login_response.text))

    # 2) Parse the hidden form fields with the re module.
    authenticity_token = _hidden_input('authenticity_token', login_response.text)
    timestamp_secret = _hidden_input('timestamp_secret', login_response.text)
    # The timestamp is taken from the same page as timestamp_secret rather
    # than hard-coded. NOTE(review): presumably the server validates the two
    # as a pair, so a stale literal would not match a fresh secret - confirm.
    timestamp = _hidden_input('timestamp', login_response.text)

    # print(authenticity_token)
    # print(timestamp_secret)


    # 3) POST the scraped values together with the rest of the form body to
    #    https://github.com/session.
    form_data = {
        'commit': 'Sign in',
        'utf8': '✓',
        'authenticity_token': authenticity_token,
        'ga_id': '',
        'login': login,
        'password': password,
        'webauthn-support': 'supported',
        'webauthn-iuvpaa-support': 'supported',
        'required_field_a359': '',
        'timestamp': timestamp,
        'timestamp_secret': timestamp_secret
    }

    session_url = 'https://github.com/session'
    session_response = session.post(
        session_url,
        data=form_data,
        # Same Referer as the browser request analysed in the notes above.
        headers={'Referer': url},
        timeout=10,
    )

    print(session_response.status_code)
    # Print the session jar: it accumulates the cookies of every response,
    # including redirects, unlike session_response.cookies.
    print(session.cookies)

    # 4) After logging in, other pages can be fetched with the same session,
    #    which sends the login cookies automatically, e.g.:
    # index_response = session.get('https://github.com/', timeout=10)
    # print(index_response.text)
    # with open('github.html', 'w', encoding='utf-8') as f:
    #     f.write(index_response.text)


    # 5) Verify the login by requesting the email settings page and checking
    #    that the account's user name appears in it.
    emails_response = session.get('https://github.com/settings/emails', timeout=10)
    print(login in emails_response.text)
    
    
    
    

    爬取梨视频

    #爬取视频
    #https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=1&start=48&mrd=0.9993282952193101&filterIds=1625835,1625642,1625837,1625841,1625870,1625869,1625813,1625844,1625801,1625856,1625857,1625847,1625838,1625827,1625787
    #https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=1&start=0
    #获取视频
    import re

    import requests

    # Fetch the first page of the category listing (start=0).
    res = requests.get(
        'https://www.pearvideo.com/category_loading.jsp?reqType=5&categoryId=1&start=0',
        timeout=10,
    )

    # Each video entry links to its detail page through this anchor.
    reg_text = '<a href="(.*?)" class="vervideo-lilink actplay">'

    obj = re.findall(reg_text, res.text)
    print(obj)
    for url in obj:
        # The captured href is relative; turn it into an absolute URL.
        url = 'https://www.pearvideo.com/' + url
        res1 = requests.get(url, timeout=10)
        obj1 = re.findall('srcUrl="(.*?)"', res1.text)
        if not obj1:
            # A detail page without a srcUrl entry is skipped so that one odd
            # page does not abort the whole run with an IndexError.
            print('no srcUrl found on', url)
            continue
        video_url = obj1[0]
        print(video_url)
        # Use the last path segment of the video URL as the local file name.
        name = video_url.rsplit('/', 1)[1]
        print(name)
        # stream=True avoids loading the whole video into memory, and an
        # explicit chunk size replaces iter_content()'s default of one byte
        # per chunk, which made the download extremely slow.
        with requests.get(video_url, stream=True, timeout=10) as res2:
            with open(name, 'wb') as f:
                for chunk in res2.iter_content(chunk_size=64 * 1024):
                    f.write(chunk)
    
  • 相关阅读:
    [bzoj 3048] [Usaco2013 Jan]Cow Lineup
    [bzoj 3192] [JLOI2013]删除物品
    搬迁至新博客的原因
    洛谷 P3317 [SDOI2014]重建(矩阵树定理+数学推导) [bzoj3534]
    [bzoj1002]: [FJOI2007]轮状病毒(矩阵树定理)
    [bzoj1006]: [HNOI2008]神奇的国度(最大势算法)
    高精度板子
    洛谷 P3211 [HNOI2011]XOR和路径(推dp+高斯消元)
    字符串--manacher算法(回文串匹配)
    洛谷 P2633 Count on a tree[bzoj2588](倍增lca+主席树)
  • 原文地址:https://www.cnblogs.com/michealjy/p/12121393.html
Copyright © 2020-2023  润新知