• Crawling with parameters, headers, proxies, and cookies


    1. Passing GET parameters

    (1) Chinese characters raise an error: the interpreter's ASCII set has no Chinese characters, so Chinese in the URL must be encoded

    urllib.parse.quote(url, safe=string.printable)

    (2) Passing parameters as a dict

    urllib.parse.urlencode()

    POST:

    urllib.request.urlopen(url, data="the data the server accepts") (a stand-alone sketch follows the first GET example below)

    handler: customizing handlers:

    User-Agent:

    (1) Simulate a real browser sending the request: (a) batch Baidu searches, (b) Inspect Element (search Baidu for a User-Agent list)

    (2) request.add_header() (adds header data dynamically)

    (3) Response headers: response.headers

    (4) Creating a request: urllib.request.Request(url)

    2. IP proxies:

    (1) Free IPs: short-lived, high error rate

    (2) Paid IPs: cost money, and some still expire and become unusable

    Proxy IP types:

    Transparent: the target site knows our real IP

    Anonymous: the target site does not know our real IP, but knows we are using a proxy

    Elite (high anonymity): the target site knows neither our real IP nor that we are using a proxy

    handler:

    (1) The built-in urlopen() does not support adding a proxy

    Create the corresponding handler:

    1. Proxy handler: ProxyHandler

    2. Use the ProxyHandler to create an opener: build_opener()

    3. opener.open(url) then requests the data

    Auth (authentication) handler

    Cookie handler (HTTPCookieProcessor)

    URLError

    requests (third-party module): simple and easy to use (a short sketch follows these notes)

    Data parsing:

    Data storage: JSON, CSV, MongoDB, Redis, MySQL (see the sketch right after these notes)
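
    The notes above mention the requests module and JSON/CSV storage, but every example below uses urllib only. Here is a minimal sketch under assumptions that are not part of the original notes: requests is installed (pip install requests), the Baidu search URL is reused from the GET example, and the record fields are placeholders chosen for illustration.

    import csv
    import json

    import requests  # third-party module: pip install requests


    def fetch_and_store():
        # requests encodes query parameters (including Chinese) on its own
        response = requests.get(
            "http://www.baidu.com/s",
            params={"wd": "中文"},
            headers={"User-Agent": "Mozilla/5.0"},  # placeholder User-Agent
            timeout=5,
        )

        # illustrative record; real parsing (lxml, bs4, ...) would go here
        record = {"url": response.url, "status": response.status_code, "length": len(response.text)}

        # store as JSON
        with open("result.json", "w", encoding="utf-8") as f:
            json.dump(record, f, ensure_ascii=False, indent=2)

        # store as CSV
        with open("result.csv", "w", encoding="utf-8", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=record.keys())
            writer.writeheader()
            writer.writerow(record)


    fetch_and_store()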

    import urllib.request
    import urllib.parse
    import string
    
    
    def get_params():
        url = "http://www.baidu.com/s?"
    
        params = {
            "wd":"中文",
            "key":"zhang",
            "value":"san"
    
        }
        # urlencode already percent-encodes the dict values (including the Chinese one)
        str_params = urllib.parse.urlencode(params)
        print(str_params)
        final_url = url + str_params

        # escape any remaining characters outside string.printable so the url can be opened
        end_url = urllib.parse.quote(final_url,safe=string.printable)
    
        response = urllib.request.urlopen(end_url)
    
        data = response.read().decode("utf-8")
        print(data)
    
    
    get_params()
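
    The POST note near the top says urllib.request.urlopen(url, data=...) switches the request to POST; the cookie login example at the end of this page does exactly that against a real site. As a smaller stand-alone sketch (my addition, assuming https://httpbin.org/post as a throwaway test endpoint and made-up form fields):

    import urllib.parse
    import urllib.request


    def post_params():
        # echo endpoint used only for testing (assumption, not from the notes)
        post_url = "https://httpbin.org/post"
        form = {"key": "zhang", "value": "san"}

        # urlencode the dict, then encode to bytes, as urlopen requires for data
        post_data = urllib.parse.urlencode(form).encode("utf-8")

        # passing data turns the request into a POST
        response = urllib.request.urlopen(post_url, data=post_data)
        print(response.read().decode("utf-8"))


    post_params()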
    import urllib.request
    
    def load_baidu():
        url= "https://www.baidu.com"
        header = {
            # browser version string (kept here for reference; added below via add_header)
            "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
            # "haha":"hehe"
        }


        # create the request object
        request = urllib.request.Request(url)
        # dynamically add header information
        request.add_header("User-Agent","Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36")
        # request the data (urlopen itself takes no parameter for extra request headers)
        response = urllib.request.urlopen(request)
        print(response)
        data = response.read().decode("utf-8")

        # get the full url
        final_url = request.get_full_url()
        print(final_url)

        # response headers
        # print(response.headers)
        # request headers (all of them)
        # request_headers = request.headers
        # print(request_headers)
        # (2) second way to read a single request header
        # note: the first letter must be uppercase and the rest lowercase
        request_headers = request.get_header("User-agent")
        # print(request_headers)
        with open("02header.html","w")as f:
            f.write(data)
    
    
    
    load_baidu()
    import urllib.request
    
    def load_baidu():
        url= "http://www.baidu.com"
        # request-header info would be added here


        # create the request object
        request = urllib.request.Request(url)
        # request the data
        response = urllib.request.urlopen(request)
        print(response)
        data = response.read().decode("utf-8")

        # response headers
        # print(response.headers)
        # request headers
        request_headers = request.headers
        print(request_headers)
        with open("02header.html","w")as f:
            f.write(data)
    
    
    
    load_baidu()
    import urllib.request
    import random
    
    def load_baidu():
    
        url = "http://www.baidu.com"
        user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
            "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50"
    
        ]
        # pick a different browser identity for each request
        random_user_agent = random.choice(user_agent_list)

        request = urllib.request.Request(url)

        # add the corresponding request header (User-Agent)
        request.add_header("User-Agent",random_user_agent)

        # request the data
        response = urllib.request.urlopen(request)
        # print the request header that was actually set
        print(request.get_header("User-agent"))
    
    load_baidu()
    import urllib.request
    
    def handler_openner():
    
        # the built-in urlopen does not support adding a proxy, so we build that feature ourselves
        # SSL (secure sockets layer): third-party CA digital certificates
        # http uses port 80, https uses port 443
        # urlopen can request data because it is built on handlers
        # here we request data with our own opener

        # urllib.request.urlopen()
        url = "https://blog.csdn.net/m0_37499059/article/details/79003731"

        # create our own handler
        handler = urllib.request.HTTPHandler()
        # create our own opener
        opener=urllib.request.build_opener(handler)
        # call open() on our opener to request the data
        response = opener.open(url)
        # data = response.read()
        data = response.read().decode("utf-8")
    
    
        with open("02header.html", "w")as f:
            f.write(data)
    
    handler_openner()
    import urllib.request
    
    
    def create_proxy_handler():
        url = "https://blog.csdn.net/m0_37499059/article/details/79003731"
    
        # add the proxy
        proxy = {
            # free proxy style
            "http":""
            # "http":"120.77.249.46:8080"
            # paid proxy style
            # "http":"xiaoming":123@115.


        }
        # proxy handler
        proxy_handler = urllib.request.ProxyHandler(proxy)

        # create our own opener
        opener = urllib.request.build_opener(proxy_handler)
        # send the request through the proxy ip
        response = opener.open(url)
        data = response.read().decode("utf-8")
    
    
        with open("03header.html", "w")as f:
            f.write(data)
    
    create_proxy_handler()
    import urllib.request
    
    def proxy_user():
    
        proxy_list = [
            {"https":""},
            # {"https":"106.75.226.36:808"},
            # {"https":"61.135.217.7:80"},
            # {"https":"125.70.13.77:8080"},
            # {"https":"118.190.95.35:9001"}
        ]
        for proxy in proxy_list:
            print(proxy)
            # create a handler from the proxy ip we iterate over
            proxy_handler = urllib.request.ProxyHandler(proxy)
            # create the opener
            opener = urllib.request.build_opener(proxy_handler)
    
            try:
                data = opener.open("http://www.baidu.com",timeout=1)
    
                haha = data.read()
                print(haha)
            except Exception as e:
                print(e)
    
    
    proxy_user()

     Sending requests through a paid proxy

    import urllib.request
    
    # sending requests through a paid proxy
    # 1. carry the username and password
    # 2. send through an auth-capable handler

    def money_proxy_use():
        # # first way: send the request with the paid proxy embedded in the url
        # # 1. proxy ip
        # money_proxy ={"http":"username:pwd@192.168.12.11:8080"}
        # # 2. proxy handler
        # proxy_handler=urllib.request.ProxyHandler(money_proxy)
        #
        # # 3. create the opener from the handler
        # opener = urllib.request.build_opener(proxy_handler)
        # # 4. open() sends the request
        # opener.open("http://www.baidu.com")
        # # second way: send with the paid proxy ip plus a password manager
        use_name = "abcname"
        pwd = "123456"
        proxy_money = "123.158.63.130:8888"
        # 2. create a password manager and add the username and password
        password_manager = urllib.request.HTTPPasswordMgrWithDefaultRealm()
        # uri = uniform resource identifier, a superset of url
        # url = resource locator
        password_manager.add_password(None,proxy_money,use_name,pwd)
        # 3. create a handler that can authenticate against the proxy ip
        handle_auth_proxy = urllib.request.ProxyBasicAuthHandler(password_manager)
        # 4. create the opener from the handler
        opener_auth = urllib.request.build_opener(handle_auth_proxy)
        # 5. send the request
        response = opener_auth.open("http://www.baidu.com")
        print(response.read())

        # useful for crawling your own company's data for analysis
        # admin
    
    money_proxy_use()

    Crawling your own site (HTTP basic auth)

    import urllib.request
    
    def auth_nei_wang():
        # 1. username and password
        user = "admin"
        pwd = "adimin123"
        nei_url = "http://192.168.179.66"


        # 2. create a password manager
        pwd_manager = urllib.request.HTTPPasswordMgrWithDefaultRealm()

        pwd_manager.add_password(None,nei_url,user,pwd)

        # create the basic-auth handler
        auth_handler = urllib.request.HTTPBasicAuthHandler(pwd_manager)
    
        opener = urllib.request.build_opener(auth_handler)
    
        response = opener.open(nei_url)
        print(response)
    
    
    auth_nei_wang()

     cookie

    Method 1:

    """
        直接获取 个人中心的页面
        手动粘贴 复制 PC 抓包的 cookies
        放在  request对象的请求头里面 
    
    """
    
    import urllib.request
    
    # 1.数据url
    url = 'https://www.yaozh.com/member/'
    # 2.添加请求头
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
        ,
        'Cookie': '_ga=GA1.2.1820447474.1535025127; MEIQIA_EXTRA_TRACK_ID=199Tty9OyANCXtHaSobJs67FU7J; UtzD_f52b_ulastactivity=1511944816%7C0; WAF_SESSION_ID=7d88ae0fc48bffa022729657cf09807d; PHPSESSID=7jsc60esmb6krgthnj99dfq7r3; _gid=GA1.2.358950482.1540209934; _gat=1; MEIQIA_VISIT_ID=1BviNX3zYEKVS7bQVpTRHOTFV8M; yaozh_logintime=1540209949; yaozh_user=381740%09xiaomaoera12; yaozh_userId=381740; db_w_auth=368675%09xiaomaoera12; UtzD_f52b_saltkey=CfYyYFY2; UtzD_f52b_lastvisit=1540206351; UtzD_f52b_lastact=1540209951%09uc.php%09; UtzD_f52b_auth=2e13RFf%2F3R%2BNjohcx%2BuoLcVRx%2FhF0NvwUbslgSZX%2FOUMkCRRcgh5Ayg6RGnklcG3d2DkUFAXJxjhlIS8fPvr9rrwa%2FY; yaozh_uidhas=1; yaozh_mylogin=1540209953; MEIQIA_EXTRA_TRACK_ID=199Tty9OyANCXtHaSobJs67FU7J; WAF_SESSION_ID=7d88ae0fc48bffa022729657cf09807d; Hm_lvt_65968db3ac154c3089d7f9a4cbb98c94=1535025126%2C1535283389%2C1535283401%2C1539351081%2C1539512967%2C1540209934; MEIQIA_VISIT_ID=1BviNX3zYEKVS7bQVpTRHOTFV8M; Hm_lpvt_65968db3ac154c3089d7f9a4cbb98c94=1540209958'
    }
    
    # 3. build the request object
    request = urllib.request.Request(url, headers=headers)

    # 4. send the request
    response = urllib.request.urlopen(request)

    # 5. read the data
    data = response.read()
    print(type(data))

    # save to a file to verify the data
    with open('01cook.html', 'wb') as f:
        f.write(data)

    Method 2:

    """
        获取 个人中心的页面
        
        1. 代码登录  登录成功 cookie(有效)
        2. 自动带着cookie 去请求个人中心
        
        
        cookiejar 自动保存这个cookie
    
    """
    import urllib.request
    from http import cookiejar
    from urllib import parse
    
    # 登录之前的 登录页的网址https://www.yaozh.com/login/
    # 找登录 参数
    
    # 后台 根据你发送的请求方式来判断的 如果你是get(登录页面),如果POST(登录结果)
    
    # 1. 代码登录
    # 1.1 登录的网址
    login_url = 'https://www.yaozh.com/login'
    # 1.2 登录的参数
    login_form_data = {
        "username": "3253212",
        "pwd": "56uhjyh",
        "formhash": "CE3ADF28C5",
        "backurl": "https%3A%2F%2Fwww.yaozh.com%2F"
    
    }
    # 1.3 发送登录请求POST
    cook_jar = cookiejar.CookieJar()
    # 定义有添加  cook 功能的 处理器
    cook_hanlder = urllib.request.HTTPCookieProcessor(cook_jar)
    # 根据处理器 生成 opener
    opener = urllib.request.build_opener(cook_hanlder)
    
    # send the POST request with the parameters
    # add the request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
    }
    # 1. the parameters must be url-encoded; 2. the data of a POST request must be bytes
    login_str = parse.urlencode(login_form_data).encode('utf-8')

    login_request = urllib.request.Request(login_url, headers=headers, data=login_str)
    # if the login succeeds, the cookiejar saves the cookie automatically
    opener.open(login_request)

    # 2. visit the member center carrying the saved cookie
    center_url = 'https://www.yaozh.com/member/'
    center_request = urllib.request.Request(center_url, headers=headers)
    # open the request object (not the bare url) so the headers are actually sent
    response = opener.open(center_request)
    # bytes --> str
    data = response.read().decode()

    with open('02cook.html', 'w') as f:
        f.write(data)


    # one user logging in non-stop from different locations (IPs: Fujian, Shanghai, Hangzhou, Henan)
    # and different browsers is clearly non-human behaviour, and the account gets banned;
    # real crawlers rotate across N accounts
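
    The cookies collected above live only in memory for the lifetime of this opener. As a hedged side note that is not in the original post: http.cookiejar also provides MozillaCookieJar, which can write cookies to a file and load them back in a later run; "cookies.txt" below is a placeholder filename.

    import urllib.request
    from http import cookiejar

    cookie_file = "cookies.txt"

    # first run: collect cookies through an opener, then persist them to disk
    save_jar = cookiejar.MozillaCookieJar(cookie_file)
    save_opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(save_jar))
    save_opener.open("http://www.baidu.com")
    save_jar.save(ignore_discard=True, ignore_expires=True)

    # later run: reload the saved cookies and attach them to a fresh opener
    load_jar = cookiejar.MozillaCookieJar()
    load_jar.load(cookie_file, ignore_discard=True, ignore_expires=True)
    load_opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(load_jar))
    print(len(load_opener.open("http://www.baidu.com").read()))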

    Error handling

    # urllib.request raises HTTPError and URLError (both defined in urllib.error)
    """
         raise URLError(err)
    urllib.error.URLError: <urlopen error [Errno 8] nodename nor servname provided, or not known>

        raise HTTPError(req.full_url, code, msg, hdrs, fp)
    urllib.error.HTTPError: HTTP Error 404: Not Found

    """

    import urllib.error
    import urllib.request


    url = 'https://blog.csdn.net/zjsxxzh/article/details/110'
    # the second assignment overrides the first; only this url is requested
    url = 'https://affdsfsfsdfd.cn'

    try:
        response = urllib.request.urlopen(url)

    # HTTPError is a subclass of URLError, so catch it first
    except urllib.error.HTTPError as error:
        print(error.code)


    except urllib.error.URLError as error:
        print(error)
  • Original post: https://www.cnblogs.com/sunBinary/p/10555671.html