• lxf web-crawler study notes


    Crawler steps

    • Determine the target URL to crawl
    • Send a request from Python code to fetch the data
    • Parse the fetched data (extract exactly the data you need)
      • Find new targets (new URLs), go back to step 1 and fetch again -- automation
    • Persist the data
      • python3 (built-in module): urllib.request
        • urlopen
          • returns a response object
          • response.read()
          • bytes.decode('utf-8')
        • GET: passing parameters
          • Chinese characters raise an error: ASCII has no Chinese characters, so the URL must be percent-encoded with urllib.parse.quote(url, safe=string.printable)
          • Passing a dict of parameters: urllib.parse.urlencode()
        • POST
          • urllib.request.urlopen(url, data=<bytes the server expects>)
        • Custom handlers
          • User-Agent
            • Mimic a real browser when sending requests (search Baidu for user-agent lists)
            • request.add_header(key, value) -- add header data dynamically
          • Response headers: response.headers
          • Create a request: request = urllib.request.Request(url)
          • Proxy IPs:
            • free proxy IPs
            • paid proxy IPs
          • handler -- opener -- opener.open(url)
        • URLError
      • python2 (built-in): urllib2
    • requests (third-party library)
    • Data parsing: xpath, bs4
    • Data storage: json, CSV, MongoDB, MySQL (a minimal parse-and-store sketch follows this list)
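
    The parsing and storage steps above are only listed and never shown in the code that follows, so here is a minimal sketch of that part of the flow. It assumes the third-party packages requests and bs4 are installed (pip install requests beautifulsoup4); the URL and the link-extraction target are only illustrative.

    import csv
    import json

    import requests
    from bs4 import BeautifulSoup

    ## 1. fetch the page (illustrative URL)
    html = requests.get("http://example.com/").content.decode("utf-8")

    ## 2. parse: pull out the data we care about (here, every link's text and href)
    soup = BeautifulSoup(html, "html.parser")
    items = [{"text": a.get_text(strip=True), "href": a.get("href")}
             for a in soup.find_all("a")]

    ## 3. persist: json and CSV as two simple storage options
    with open("items.json", "w", encoding="utf-8") as f:
        json.dump(items, f, ensure_ascii=False, indent=2)

    with open("items.csv", "w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=["text", "href"])
        writer.writeheader()
        writer.writerows(items)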

    Code

    • python3.6

    1. urllib.request

    01-url_open_code.py

    ## request without parameters
    import urllib.request
    
    def load_data():
        #url = "https://www.baidu.com/"  ## https返回内容考虑安全性,http是全部的
        url = "http://www.baidu.com/" 
        ## get请求
        ## http请求
        ## response:http响应的对象
        response = urllib.request.urlopen(url)
        print(response)
        ## 读取内容 bytes类型
        data = response.read()
        #print(data)
        ## 将文件获取的内容转成字符串
        str_data = data.decode('utf-8')
        #print(str_data)
        ## 将数据写入文件
        with open("baidu.html","w",encoding="utf-8") as f:
            f.write(str_data)
    
        ## 将字符串类型转为二进制
        str_name = "baidu"
        bytes_name = str_name.encode('utf-8')
        print(bytes_name)
    
        ## python 爬取的类型 :str bytes
        ##如果爬取结果是bytes类型,但写入需要str类型,就decode('utf-8')
        ##如果爬取结果是str类型,但写入需要bytes类型,就encode('utf-8')
    load_data()
    

    02-get_paras.py

    import urllib.request
    import urllib.parse ## for percent-encoding Chinese characters in URLs
    import string       ## provides string.printable (characters to leave unquoted)
    
    ## pass a single parameter
    
    def get_method_paras():
        url = "http://www.baidu.com/s?wd="
        ## concatenate the query string (contains Chinese characters)
    
        name = "美女"
        final_url = url + name
        print(final_url)
        ## a URL Python can actually send contains no Chinese characters: https://www.baidu.com/s?wd=%E7%BE%8E%E5%A5%B3
        ## the code sends the request, but the URL contains Chinese; ASCII has no Chinese characters, so the URL must be percent-encoded
        encode_new_url = urllib.parse.quote(final_url,safe=string.printable)
        print(encode_new_url)   ## result: the Chinese "美女" is now percent-encoded ASCII
    
        ## send the request from code
        ## response = urllib.request.urlopen(final_url)  ## contains Chinese characters and raises an error; use the percent-encoded encode_new_url instead
        response = urllib.request.urlopen(encode_new_url)
        print(response)     ## the result is a response object
        data = response.read().decode()  ## read the content; decode() defaults to utf-8
        print(data)
        with open("02-encode.html","w",encoding='utf-8') as f:
            f.write(data)
    
    get_method_paras()
    

    02-get_params2.py

    import urllib.request
    import urllib.parse
    import string
    
    ## pass multiple parameters, assembled from a dict
    
    def get_params():
        url = "http://www.baidu.com/s?"
    
        ## for multiple URL parameters, use a dict, then turn each key: value pair into key=value
        params = {
            'wd':'中文',
            'key':'zhang',
            'value':'san'
        }
    
        ## turn the colons into equals signs and join the pairs with &
        str_params = urllib.parse.urlencode(params)
        print(str_params)   ## result: wd=%E4%B8%AD%E6%96%87&key=zhang&value=san
    
        final_url = url + str_params ## concatenate the URL
        new_url = urllib.parse.quote(final_url,safe=string.printable) ## percent-encode any remaining Chinese characters in the URL
        print(new_url)
    
        response = urllib.request.urlopen(new_url)  ## request the page; returns a response object
        print(response) 
    
        data = response.read().decode('utf-8') ## read the response body
        print(data)
    
    
    
    get_params()
    

    03-request_header.py

    import urllib.request
    
    ## create the request only, without adding request headers
    def load_baidu():
        url = "http://www.baidu.com"
    
        ## create a request object from the url
        request = urllib.request.Request(url)
        
        ## request the data
        ## response = urllib.request.urlopen(url)
        response = urllib.request.urlopen(request)
        print(response)
        data = response.read().decode('utf-8')
        
        ## response headers
        print(response.headers)
    
        ## get the request headers
        req_headers = request.headers
        print(req_headers)  ## the request headers are empty (none were set)
    
        with open("03-headers.html","w",encoding='utf-8')as f:
            f.write(data)
    
    load_baidu()
    

    03-request_header_2.py

    import urllib.request
    
    ## create a request and add request headers (headers specified explicitly)
    
    def load_baidu():
        url = "http://www.baidu.com"
    
        header = {
            ## browser version string
            'haha':'hehe', ## junk entry, only here to test reading request headers back
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
        }
    
        ## create the request object from the url, passing the headers directly
        #request = urllib.request.Request(url,headers=header)    
    
        ## create the request object from the url, then add the request header dynamically
        request = urllib.request.Request(url)
        request.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36')
        
        ## get the full url
        full_url = request.get_full_url()
        print(full_url)
    
        ## request the data
        ## response = urllib.request.urlopen(url)
        ## the headers cannot be added here, because urlopen has no parameter for them
        response = urllib.request.urlopen(request)
        print(response)
        data = response.read().decode('utf-8')
        
        ## response headers
        print(response.headers)
    
        ## get the request headers
        request_headers = request.headers   ## all request headers, returned as a dict
        print(request_headers)  
        
        request_headers_User_agent = request.get_header("User-agent") ## get one specific request header
        print(request_headers_User_agent)  
        
    
        # with open("03-headers.html","w",encoding='utf-8')as f:
        #     f.write(data)
    
    load_baidu()
    

    04-random_user_agent.py

    import urllib.request
    import random
    
    def load_baidu():
    
        url = "https://www.baidu.com"
    
        ## list of user-agent strings
        user_agent_list = [
            ## Windows 7
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50'
        ]
        ## use a different browser identity on each request
        random_user_agent = random.choice(user_agent_list)
        
        ## create the request object
        request = urllib.request.Request(url)
        ## add the request header dynamically
        request.add_header('User-Agent',random_user_agent)
        ## read the request header back
        print(request.get_header('User-agent'))     ## note: "User-agent" here must have only its first letter capitalized; the rest must be lowercase
    
        ## request the data
        response = urllib.request.urlopen(request)
        print(response)
    
    load_baidu()
    

    05-handler_openner.py

    import urllib.request
    
    ## Purpose: add a proxy
    
    ## the built-in urlopen has no way to add a proxy, so we define that ourselves
    ## SSL: Secure Sockets Layer; HTTPS relies on third-party CA certificates
    ## HTTP uses port 80, HTTPS uses port 443
    ## urlopen can request data because a handler does the work underneath
    ## so we build our own opener to request data
    
    def handler_openner():
        url = "https://www.cnblogs.com/moox/tag"
        #urllib.request.urlopen()
    
        ## create our own handler
        handler = urllib.request.HTTPHandler()
    
        ## create our own opener
        opener = urllib.request.build_opener(handler)
    
        ## use our opener's open method to request the data
        response = opener.open(url)
    
        data = response.read()
        print(data)
    
    handler_openner()
    

    06-proxy_handler.py

    import urllib.request
    
    ### 06-proxy_handler
    def create_proxy_handler():
        url = "https://www.baidu.com"
    
        ## add a free proxy
        proxy = {
            ## free-proxy style (e.g. from the Xici free proxy list)
            # "http":"http://120.77.249.46:8080"
            "http":"120.77.249.46:8080" ## shorthand
        }
    
        ## the proxy handler
        proxy_handler = urllib.request.ProxyHandler(proxy)
    
        ## create our own opener
        opener = urllib.request.build_opener(proxy_handler)
    
        data = opener.open(url).read()
        print(data)
    
    create_proxy_handler()
    

    06-random_user_proxy

    import urllib.request
    
    
    
    def create_proxy():
        
        url = "https://www.cnblogs.com/moox/tag"
    
        # add several free proxies (06-random_user_proxy)
        proxy_list = [
            {"http":"1.0.9.41:8080"},
            {"http":"120.77.249.42:8080"},
            {"http":"120.77.249.43:8080"}
        ]
    
        for proxy in proxy_list:
            print(proxy)
            proxy_handler = urllib.request.ProxyHandler(proxy)
            opener = urllib.request.build_opener(proxy_handler)
            try:
                opener.open(url,timeout=0.1)
                print("haha")
            except Exception as e:
                print(e)
    
    create_proxy()
    

    07-money_proxy_handler_1

    import urllib.request
    
    ## sending requests through a paid proxy (07-money_proxy_handler_1)
    # 1. send with a username and password
    
    def money_prosy_use():
        url = "https://www.baidu.com"
    
    
        ## first approach
        print("First approach")
        # 1. the proxy ip
        money_proxy = {"http":"username:pwd@192.168.12.11:8080"}
        # 2. the proxy handler
        proxy_handler = urllib.request.ProxyHandler(money_proxy)
        # 3. build an opener from the handler
        opener = urllib.request.build_opener(proxy_handler)
        # 4. send the request with open
        try:
            response = opener.open(url)
            print(response)
        except Exception as e:
            print(e)
    
        
        ## second approach
        print("Second approach")
        ## 1. credentials
        username = "abcname"
        pwd = "123456"
        proxy_money = "123.123.123.123:8888"
    
        ## 2. create a password manager and add the username and password
        password_manager = urllib.request.HTTPPasswordMgrWithDefaultRealm()
        password_manager.add_password(None,proxy_money,username,pwd)
    
        ## 3. create a handler that can authenticate against the proxy ip
        handler_auth_proxy = urllib.request.ProxyBasicAuthHandler(password_manager)
    
        ## 4. build an opener from the handler
        opener_auth = urllib.request.build_opener(handler_auth_proxy)
    
        ## 5. send the request
        response = opener_auth.open(url)
        print(response.read())
    
    
    
        ## crawling your own company's internal data for analysis:
        ## see 07-auth_use.py
    
    money_prosy_use()
    

    07-auth_use_nei_wang2

    import urllib.request
    
    ## intranet request: crawl your own company's data (07-auth_use_nei_wang2)
    def auth_nei_wang():
    
        # 1. username and password
        user = "admin"
        pwd = "admin123"
        nei_url = "http://172.168.179.66"
    
        # 2. create a password manager
        pwd_manager = urllib.request.HTTPPasswordMgrWithDefaultRealm()
        pwd_manager.add_password(None,nei_url,user,pwd)
    
        # 3. create the auth handler (in practice the requests library is used more often for this)
        auth_handler = urllib.request.HTTPBasicAuthHandler(pwd_manager)
    
        # 4. build an opener from the handler
        opener = urllib.request.build_opener(auth_handler)
    
        # 5. send the request
        response = opener.open(nei_url)
        print(response)
    
    auth_nei_wang()
    
    

    08-cookies_1

    import urllib.request
    
    # 08-cookies_1
    ## without cookies, the fetched page shows the logged-out view even if you are logged in via the browser
    
    # 1. the target url
    url = "https://www.yaozh.com/member/"   ## the member page shown after login
    
    # 2. add request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
    }
    
    # 3. build the request object
    request = urllib.request.Request(url,headers=headers)
    
    # 4. send the request
    response = urllib.request.urlopen(request)
    
    # 5. read the data
    data = response.read()
    print(type(data))       
    
    ## note: data is bytes here, but writing as text needs str
    ## option 1: data = response.read().decode("utf-8") (watch out for gbk pages)
    ## option 2: write the bytes directly by opening the file in "wb" mode
    
    # 6. save to a file to verify the data
    with open("08-cookies.html","wb") as f:
        f.write(data)
    
    
    

    08-cookies_2

    import urllib.request
    
    ## 08-cookies_2
    '''
        After logging in, fetch the member page directly.
        Manually copy the cookies captured from the browser (PC packet capture)
        and put them in the request object's headers.
    '''
    
    # 1. the target url
    url = "https://www.yaozh.com/member/"       ## the member page shown after login
    
    # 2. add request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
        ,
        'Cookie': 'acw_tc=2f624a2915951280821954516e4c2e329d585f7a1c18a5ab4d48c854a73574; PHPSESSID=qp7vu5k86b80o99nk3ne2nqeo6; Hm_lvt_65968db3ac154c3089d7f9a4cbb98c94=1595128086; _ga=GA1.2.1668375763.1595128086; _gid=GA1.2.1688163507.1595128086; yaozh_logintime=1595128155; yaozh_user=956263%09moox2020; yaozh_userId=956263; yaozh_jobstatus=kptta67UcJieW6zKnFSe2JyXnoaabJtnm5eHnKZxanJT1qeSoMZYoNdzb5tan9LU2pOUlpFZoKifnZ%2BDn5iorJDVop6Yg3HYnmpnm1pjmJ6eCB926858ECd33fF24d2161B6ecf9232XkpackmyaV6DXn5VtWamhnsZbbKabZ5ieW2iWcWeUl5qSmZuYaJ1XoOE%3D6e6fa20636976fac57f639153c479218; _gat=1; Hm_lpvt_65968db3ac154c3089d7f9a4cbb98c94=1595128158; yaozh_uidhas=1; yaozh_mylogin=1595128161; acw_tc=2f624a2915951280821954516e4c2e329d585f7a1c18a5ab4d48c854a73574'
    }
    
    # 3. build the request object
    request = urllib.request.Request(url,headers=headers)
    
    # 4. send the request
    response = urllib.request.urlopen(request)
    
    # 5. read the data
    data = response.read()
    print(type(data))       
    
    ## note: data is bytes here, but writing as text needs str
    ## option 1: data = response.read().decode("utf-8") (watch out for gbk pages)
    ## option 2: write the bytes directly by opening the file in "wb" mode
    
    # 6. save to a file to verify the data
    with open("08-cookies.html","wb") as f:
        f.write(data)
    
    
    

    08-cookies_3

    import urllib.request
    from http import cookiejar      ## cookiejar can save cookies automatically
    from urllib import parse
    
    ## 08-cookies_3
    '''
        Simulate the login in code, then fetch the member page directly.
        
        1.  Log in with code; a successful login yields valid cookies.
        2.  Carry those cookies automatically when requesting the member page
            (they travel in the request headers).
        3.  When capturing packets, tick "Preserve log" so the previous page's requests are kept; otherwise the login request disappears after the refresh.
    '''
    
    def login_cookie():
    # 1. Log in with code
        # 1.1 the login url
        login_url = "https://www.yaozh.com/login"
        
        # 1.2 the login parameters
        ## Note: find the login parameters before logging in, but also check what actually gets sent when you log in.
        ## The login page url before logging in: https://www.yaozh.com/login
        ## Here the url is the same before and after login; the backend tells them apart by the request method:
        ## a GET request returns the login page, a POST request performs the login and returns its result
        """
            Parameters observed when logging in:
            username: moox2020
            pwd: 
            formhash: 609FDC2169
            backurl: https%3A%2F%2Fwww.yaozh.com%2F
        """
        # formhash and backurl are found in the Elements panel of the pre-login page
        login_form_data = {
            "username":"moox2020",
            "pwd":"",
            "formhash":"4D19C2552E",
            "backurl":"https%3A%2F%2Fwww.yaozh.com%2F"
        }
        ## Note: 1. the dict of parameters must be encoded with parse; 2. POST data must be bytes; 3. Chinese characters must be percent-encoded
        login_str = parse.urlencode(login_form_data)
        ## otherwise: TypeError: POST data should be bytes, an iterable of bytes, or a file object. It cannot be of type str.
        login_bytes = login_str.encode('utf-8')
    
    
        # 1.3 add a cookiejar and send the login POST request
        ## the cookiejar stores the cookies for us
        cook_jar = cookiejar.CookieJar()
        ## a handler that knows how to attach cookies
        cook_handler = urllib.request.HTTPCookieProcessor(cook_jar)
        ## build an opener from the handler
        opener = urllib.request.build_opener(cook_handler)
    
        ## add request headers
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
        }
        ## build the request
        login_request = urllib.request.Request(login_url,headers=headers,data=login_bytes) 
        
        ## send the POST request with the login parameters
        ## if the login succeeds, the cookiejar saves the cookies automatically
        opener.open(login_request)  ## the opener now carries the cookies; we only care that this succeeds, not about the response body
    
    
    # 2. Use the cookies to visit the member page
        ## in essence: grab the cookie from the login response and attach it to a new request by hand; that is tedious, so use cookiejar
        ## cookiejar is Python's built-in way to do this automatically
    
        center_url = "https://www.yaozh.com/member/"
        center_request = urllib.request.Request(center_url,headers=headers)
        response = opener.open(center_request)
        ## the response body is bytes; convert to str if needed
        ## data = response.read().decode('utf-8')  ## the charset can be seen in the page's head element
        data = response.read()
    
        with open("08-cookies_login.html","wb") as f: ## "w"的方式报错,所以不提前转化格式,直接"wb"来写
            f.write(data)
    
    login_cookie()
    

    09-http_error

    ## urllib raises two error types,
    # HTTPError and URLError (both defined in urllib.error); HTTPError is a subclass of URLError
    
    # 09-http_error
    import urllib.request
    import urllib.error
    
    '''
    url = "http://www.zhongsaannghh.com.cn"
    response = urllib.request.urlopen(url)
    raise URLError(err)
    urllib.error.URLError: <urlopen error [Errno 11004] getaddrinfo failed>
    '''
    
    url = "https://mbd.baidu.cn/newspage" 
    try:
        response = urllib.request.urlopen(url)
    except urllib.error.HTTPError as e:
        print(e.code)
    
    except urllib.error.URLError as e:
        print(e)
    

    2. requests

    10.1-requests_content_text

    # 1. Install the requests module: pip install requests
    # basic usage of requests: content, text
    
    ## 10.1-requests_content_text
    import requests
    
    
    url = "http://www.baidu.com"
    
    response = requests.get(url)
    
    ## the content attribute returns bytes; decode("...") when you need str
    # data = response.content
    data = response.content.decode('utf-8')             ## content is an attribute: not content(), and not read()
    print(type(data))
    
    ## the text attribute returns str, but the guessed encoding can be wrong (mojibake); prefer content
    data = response.text
    print(type(data))
    
    ## requests with request headers
    ## 10.2-requests_headers_cookie
    import requests
    
    class RequestSpider(object):
        def __init__(self):
            url = "http://www.baidu.com"
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
            }
            self.response = requests.get(url,headers=headers)
    
        def run(self):
            self.data = self.response.content
            print(type(self.data))
    
            ## 1. get the request headers
            request_headers = self.response.request.headers
            print(request_headers)
    
            ## 2. get the response headers
            response_headers = self.response.headers
            print(response_headers)
    
            ## 3. get the response status code
            code = self.response.status_code
            print(code)
    
            ## 4. the cookies sent with the request
            request_cookie = self.response.request._cookies
            print(request_cookie)
    
            ## 5. the cookies set by the response
            response_cookie = self.response.cookies
            print(response_cookie)
    
    # instantiate the class, then call run
    # b = RequestSpider()
    # b.run()
    
    # or call run directly on a throwaway instance
    RequestSpider().run()
    

    10.3-requests_params

    ## URL parameters are percent-encoded automatically
    ## 10.3-requests_params
    
    import requests
    
    # url = "https://www.baidu.com/s?wd=%E7%BE%8E%E5%A5%B3"
    
    ## 1. parameter: the Chinese "美女" is percent-encoded automatically
    url = "https://www.baidu.com/s?wd=美女"
    
    ## a dict of params passed separately is percent-encoded automatically too
    url_base = "https://www.baidu.com/s"
    url_params = {
        "wd":"美女"
    }
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
    }
    
    # response = requests.get(url,headers=headers)
    response = requests.get(url_base,headers=headers,params=url_params)
    data = response.content
    
    with open("baidu_params-10.html","wb") as f:
        f.write(data)
    
    
    

    10.4-requests_json

    ## convert a json response into a dict or a list
    ## requests.post(url, data=<dict of form params>, json=<json-serializable body>) -- a sketch of both POST styles is at the end of this file
    ## 10.4-requests_json
    
    '''
    The content of https://api.github.com/user is not HTML but standard json:
    {
      "message": "Requires authentication",
      "documentation_url": "https://developer.github.com/v3/users/#get-the-authenticated-user"
    }
    '''
    import requests
    import json
    
    url = "https://api.github.com/user"
    headers = {
        #'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
        # 'User-Agent':'Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50'
        'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0'
    }
    response = requests.get(url,headers=headers)
    
    # 1. use content to get the value of the json field "message"
    # ## str 
    # data = response.content.decode("utf-8")
    # print(data)     ## the result is a json-formatted string; to read just "message", convert it to a dict
    # ## str --> dict 
    # data_dict = json.loads(data)
    # print(data_dict['message'])
    
    # 2. use json() directly; it automatically converts the string into a Python dict or list
    data = response.json()
    print(type(data))   ## the type is <class 'dict'> directly
    print(data)
    print(data['message'])
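
    ## The comment at the top also mentions requests.post(url, data=..., json=...);
    ## this file only sends a GET, so below is a minimal sketch of the two POST styles.
    ## httpbin.org is used purely as an illustrative echo service.
    post_url = "https://httpbin.org/post"
    
    ## data=: sent as a form-encoded body (Content-Type: application/x-www-form-urlencoded)
    form_response = requests.post(post_url, headers=headers, data={"wd": "美女"})
    print(form_response.json()["form"])
    
    ## json=: the dict is serialized into a json body (Content-Type: application/json)
    json_response = requests.post(post_url, headers=headers, json={"wd": "美女"})
    print(json_response.json()["json"])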
    
    
    
    

    11.1-requests-auth

    import requests
    ## 11.1-requests-auth
    
    ## send a POST request (this block is a template: fill in url and data before running)
    url = ""
    data = {
    
    }
    
    response = requests.post(url,data=data) 
    
    ## use auth when an intranet service requires authentication
    # auth = (user,pwd)       ## a (username, password) tuple
    # response = requests.get(url,auth=auth)
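
    ## A minimal runnable sketch of the auth usage hinted at above; httpbin.org's
    ## basic-auth endpoint stands in for an intranet service protected by HTTP Basic
    ## Auth, and the credentials are simply the ones that endpoint expects.
    auth_url = "https://httpbin.org/basic-auth/user/passwd"
    auth_response = requests.get(auth_url, auth=("user", "passwd"))
    print(auth_response.status_code)   ## 200 when the credentials are accepted
    print(auth_response.json())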
    

    11.2-requests-proxy

    import requests
    ## 11.2-requests-proxy
    
    ## send a request through a free proxy
    url = "https://www.baidu.com"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
    }
    
    free_proxy = {'http':'27.17.45.90:43411'}
    response = requests.get(url,headers=headers,proxies=free_proxy)
    print(response.status_code) ## 200 means success
    
    
    

    11.3-requests_ssl

    ## sites that require SSL certificate handling to access
    ## 11.3-requests_ssl
    
    import requests
    
    url = "https://www.12306.cn/"
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
    }
    
    ## when a certificate error is raised:
    ## https normally relies on a third-party CA certificate, but 12306 used to issue its own certificate
    ## workaround: tell the client that the certificate is self-issued and should be skipped, i.e. ignore certificate verification
    # response = requests.get(url,headers=headers) ## in fact this now works without ignoring the certificate
    response = requests.get(url,headers=headers,verify=False)
    data = response.content
    # print(data)
    
    with open("11.3-ssl.html","wb") as f:
        f.write(data)
    
    
    ## note: if all you get back is JS code, check whether the url is correct.
    

    11.4-requests_cookies

    import requests
    
    ## 11.4-requests_cookies
    ## paste the Cookie string copied from the browser after logging in
    
    url = "https://www.yaozh.com/member/"
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
    }
    cookies = 'PHPSESSID=qp7vu5k86b80o99nk3ne2nqeo6; Hm_lvt_65968db3ac154c3089d7f9a4cbb98c94=1595128086; _ga=GA1.2.1668375763.1595128086; _gid=GA1.2.1688163507.1595128086; yaozh_userId=956263; _gat=1; Hm_lpvt_65968db3ac154c3089d7f9a4cbb98c94=1595128158; yaozh_uidhas=1; yaozh_mylogin=1595128161; acw_tc=2f624a2915951280821954516e4c2e329d585f7a1c18a5ab4d48c854a73574; UtzD_f52b_saltkey=gtc7UEo6; UtzD_f52b_lastvisit=1595126252; _ga=GA1.1.402802462.1595129856; _gid=GA1.1.1215858104.1595129856; UtzD_f52b_ulastactivity=1595128155%7C0; UtzD_f52b_creditnotice=0D0D2D0D0D0D0D0D0D799970; UtzD_f52b_creditbase=0D0D0D0D0D0D0D0D0; UtzD_f52b_creditrule=%E6%AF%8F%E5%A4%A9%E7%99%BB%E5%BD%95; yaozh_user=956263%09moox2020; db_w_auth=799970%09moox2020; yaozh_logintime=1595143780; yaozh_jobstatus=kptta67UcJieW6zKnFSe2JyXnoaabJtnm5eHnKZxanJT1qeSoMZYoNdzb5tan9LU2pOUlpFZoKifnZ%2BDn5iorJDVop6Yg3HYnmpnm1pjmJ6824f776949bb1CF89325c86aF17C1CB7XkpiWmWiYV6DXn5VtWamhnsZbbKabZ5ieW2iWcWeUmZWYnJWabZlXoOE%3D268bbfec91229863de4864edb7fed7c2; UtzD_f52b_lastact=1595143781%09uc.php%09; UtzD_f52b_auth=e555PLVOXByCsyZ5dlANKt5j1jodJkCYtvA%2B8h7Gd0svI4J%2FQA9SPzcUIlFOd8l2cZdPn7W2nKBuF7N5Zfe9e2MbhSQ'
    ## the cookies argument must be a dict or CookieJar; the Cookie string above cannot be used directly and must be converted to a dict
    # cookies_dict = {
    #     'PHPSESSID':'qp7vu5k86b80o99nk3ne2nqeo6', 
    #     'Hm_lvt_65968db3ac154c3089d7f9a4cbb98c94':'1595128086',
    #     '_ga':'GA1.2.''.1595128086', 
    #     '_gid':'GA1.2.1688163507.1595128086', 
    #     'yaozh_userId':'956263', 
    #     '_gat':'1', 
    #     'Hm_lpvt_65968db3ac154c3089d7f9a4cbb98c94':'1595128158', 
    #     'yaozh_uidhas':'1', 
    #     'yaozh_mylogin':'1595128161', 
    #     'acw_tc':'2f624a2915951280821954516e4c2e329d585f7a1c18a5ab4d48c854a73574', 
    #     'UtzD_f52b_saltkey':'gtc7UEo6', 
    #     'UtzD_f52b_lastvisit':'1595126252', 
    #     '_ga':'GA1.1.402802462.1595129856', 
    #     '_gid':'GA1.1.1215858104.1595129856', 
    #     'UtzD_f52b_ulastactivity':'1595128155%7C0', 
    #     'UtzD_f52b_creditnotice':'0D0D2D0D0D0D0D0D0D799970', 
    #     'UtzD_f52b_creditbase':'0D0D0D0D0D0D0D0D0', 
    #     'UtzD_f52b_creditrule':'%E6%AF%8F%E5%A4%A9%E7%99%BB%E5%BD%95', 
    #     'yaozh_user':'956263%09moox2020', 
    #     'db_w_auth':'799970%09moox2020', 
    #     'yaozh_logintime':'1595143780', 
    #     'yaozh_jobstatus':'kptta67UcJieW6zKnFSe2JyXnoaabJtnm5eHnKZxanJT1qeSoMZYoNdzb5tan9LU2pOUlpFZoKifnZ%2BDn5iorJDVop6Yg3HYnmpnm1pjmJ6824f776949bb1CF89325c86aF17C1CB7XkpiWmWiYV6DXn5VtWamhnsZbbKabZ5ieW2iWcWeUmZWYnJWabZlXoOE%3D268bbfec91229863de4864edb7fed7c2', 
    #     'UtzD_f52b_lastact':'1595143781%09uc.php%09', 
    #     'UtzD_f52b_auth':'e555PLVOXByCsyZ5dlANKt5j1jodJkCYtvA%2B8h7Gd0svI4J%2FQA9SPzcUIlFOd8l2cZdPn7W2nKBuF7N5Zfe9e2MbhSQ'
    # }
    
    ## building cookies_dict by hand as above is tedious; split the Cookie string into a dict instead
    ## Option 1: a loop (don't reuse the name "cookies" as the loop variable, or it overwrites the source string)
    cookies_dict = {}
    cookies_list = cookies.split("; ")
    for item in cookies_list:
        ## split on the first "=" only, since cookie values may themselves contain "="
        cookies_dict[item.split("=", 1)[0]] = item.split("=", 1)[1]
    
    ## Option 2: a dict comprehension
    cookies_dict = {
        item.split("=", 1)[0]: item.split("=", 1)[1] for item in cookies_list
    }
    response = requests.get(url,headers=headers,cookies=cookies_dict)
    data = response.content
    
    with open("11.4_cookies_dict.html","wb") as f:
        f.write(data)
    

    11.5-requests_cookies_auto_login

    import requests
    
    ## 11.5-requests_cookies_auto_login
    ## simulate the login in code, then request with the resulting cookies
    
    ## the session object saves cookies automatically, similar to cookiejar with urllib.request
    session = requests.session()
    # 1. log in with code
    login_url = "https://www.yaozh.com/login"
    member_url = "https://www.yaozh.com/member/"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
    }
    login_form_data = {
        'username': 'moox2020',
        'pwd': '',
        'formhash': 'E350124FCF',    ## formhash and backurl are found in the pre-login page source
        'backurl': '%2F%2Fwww.yaozh.com%2F',
    }
    
    ## note: after a successful login the cookie is stored in the response; here we use a session instead of calling requests.post directly
    # url_response = requests.post(login_url,data=login_form_data)
    ## once the login succeeds, the session holds the valid cookies; keep using the session for later requests
    login_response = session.post(login_url,headers=headers,data=login_form_data)
    print(login_response.content.decode())  ## on a successful login the json contains "state":"success"
    
    ## the json can be inspected with a formatter such as http://www.bejson.com/
    
    
    
    # 2. after logging in, request the target page with the valid cookies via the session
    data = session.get(member_url,headers=headers).content
    
    with open("11.5_cookies_auto_login.html","wb") as f:
        f.write(data)
    

    3. re
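
    A minimal sketch of regex-based parsing with Python's built-in re module, complementing the xpath/bs4 parsing mentioned at the top; the target page and the patterns are only illustrative:

    import re
    
    import requests
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
    }
    html = requests.get("http://www.baidu.com", headers=headers).content.decode("utf-8")
    
    ## extract the page title with a non-greedy group
    match = re.search(r"<title>(.*?)</title>", html, re.S)
    if match:
        print(match.group(1))
    
    ## findall returns every href value on the page as a list of strings
    links = re.findall(r'href="(.*?)"', html)
    print(len(links), links[:5])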

  • Original article: https://www.cnblogs.com/moox/p/13343123.html