python3 urllib module usage


    urllib module usage

    urllib.request

    urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)

    import urllib.request
    
    url = 'http://httpbin.org/ip'
    response = urllib.request.urlopen(url)
    html = response.read()  # read() returns data of type bytes
    print(html)
    
    url = 'http://www.baidu.com'
    response = urllib.request.urlopen(url)
    html = response.read().decode('utf-8') # decode() converts the bytes data to str
    print(html)
    

    Sending POST data

    import urllib.request
    import urllib.parse
    
    url = 'http://httpbin.org/post'
    
    data = {
        'name' : "小明",
        'age' : 30
    }
    # data = urllib.parse.urlencode(data)  # raises TypeError: POST data should be bytes, an iterable of bytes, or a file object. It cannot be of type str
    # data = urllib.parse.urlencode(data).encode('utf-8')  # equivalent alternative
    data = bytes(urllib.parse.urlencode(data), encoding="utf-8")
    response = urllib.request.urlopen(url, data=data)
    html = response.read().decode('utf-8')
    print(html)
    

    Setting a timeout

    import urllib.request
    
    url = 'http://httpbin.org/get'
    response = urllib.request.urlopen(url, timeout=1)
    html = response.read().decode('utf-8')
    print(html)
    
    import socket
    import urllib.request
    import urllib.error
    
    url = 'http://httpbin.org/get'
    try:
        response = urllib.request.urlopen(url, timeout=0.1)
        html = response.read().decode('utf-8')
        print(html)
    except urllib.error.URLError as e:
        print("捕获异常....")
        print(e.reason)
        if isinstance(e.reason, socket.timeout):
            print("请求超时")
    
    
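    Besides the per-call timeout argument, a process-wide default for all new sockets can be set with socket.setdefaulttimeout(); a minimal sketch (the 5-second value is an arbitrary choice for illustration):

    import socket
    import urllib.request
    
    # applies to every socket created afterwards that does not set its own timeout
    socket.setdefaulttimeout(5)
    
    response = urllib.request.urlopen('http://httpbin.org/get')  # uses the 5s default
    print(response.read().decode('utf-8'))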

    Response

    Response type, status code, response headers, and the URL actually fetched

    import urllib.request
    
    url = 'http://www.python.org'
    response = urllib.request.urlopen(url)
    # response type
    response_type = type(response)
    print(response_type)  # <class 'http.client.HTTPResponse'>
    # status code
    status_code = response.getcode()
    print(status_code)
    # reason phrase for the status code
    status = response.reason
    print(status)    # e.g. 200 -> OK, 404 -> Not Found
    # response headers
    response_headers = response.getheaders()  # returns a list of (name, value) tuples
    print(response_headers)
    server_type = response.getheader('Server') # getheader() fetches one specific header
    print(server_type)
    print(type(response.headers))  # <class 'http.client.HTTPMessage'>
    content_type = response.headers['Content-Type'] # the headers attribute also supports dict-style access
    print(content_type)
    # the URL actually fetched -- useful to check whether a redirect occurred
    actual_url = response.geturl()
    print(actual_url)
    

    class urllib.request.Request(url, data=None, headers={}, origin_req_host=None, unverifiable=False, method=None)

    import urllib.request
    
    url = 'http://httpbin.org/get'
    request = urllib.request.Request(url)  # build a Request object
    response = urllib.request.urlopen(request) # send the request
    html = response.read().decode('utf-8')
    print(html)
    # the default User-Agent is "Python-urllib/x.x", where x.x is the Python version
    
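    The Request signature above also accepts a method argument, which overrides the default verb (GET without data, POST with data). A minimal sketch, assuming httpbin's /put echo endpoint just for illustration:

    import urllib.request
    import urllib.parse
    
    url = 'http://httpbin.org/put'
    data = bytes(urllib.parse.urlencode({'name': 'peter'}), encoding='utf-8')
    # method='PUT' forces the HTTP verb regardless of whether data is present
    request = urllib.request.Request(url, data=data, method='PUT')
    response = urllib.request.urlopen(request)
    print(response.read().decode('utf-8'))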

    Sending POST data with a Request object

    import urllib.request
    import urllib.parse
    
    url = 'http://httpbin.org/post'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36',
    }
    data = {
        'name' : 'peter', 
        'age' : 20
    }
    
    data = bytes(urllib.parse.urlencode(data), encoding="utf-8") # POST data should be bytes, an iterable of bytes, or a file object. It cannot be of type str
    request = urllib.request.Request(url, data=data, headers=headers)
    response = urllib.request.urlopen(request)
    html = response.read().decode('utf-8')
    print(html)
    # when form data is POSTed this way, the Content-Type is "application/x-www-form-urlencoded"
    
    

    urllib.request.Request object methods

    import urllib.request
    
    
    url = 'http://httpbin.org/get'
    request = urllib.request.Request(url)
    # add_header(key, val)   # adds a request header
    request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36')
    response = urllib.request.urlopen(request)
    html = response.read().decode('utf-8')
    print(html)
    
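    Besides add_header(), a Request object offers a few introspection methods: get_method(), get_full_url(), has_header() and get_header(). A quick sketch (note that add_header() stores header names in capitalize()d form, e.g. 'User-agent'):

    import urllib.request
    
    request = urllib.request.Request('http://httpbin.org/get')
    request.add_header('User-Agent', 'my-agent/1.0')
    
    print(request.get_method())              # GET (POST once data is set)
    print(request.get_full_url())            # http://httpbin.org/get
    print(request.has_header('User-agent'))  # True -- stored as 'User-agent'
    print(request.get_header('User-agent'))  # my-agent/1.0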

    Handlers

    ProxyHandler (proxies)

    import urllib.request
    
    # dict: key is the protocol, value is ip:port
    proxy_dict = {
        'http': '127.0.0.1:6688',
        'https': '127.0.0.1:6688',
    }
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36',
    }
    
    
    proxy_handler = urllib.request.ProxyHandler(proxy_dict)
    opener = urllib.request.build_opener(proxy_handler)
    urllib.request.install_opener(opener)  # install_opener() makes this opener the default for urlopen()
    
    opener.addheaders = list(headers.items())   # set request headers; addheaders expects a list of (name, value) tuples
    
    url = 'http://www.whatismyip.com.tw/' # echoes the client IP; shows the proxy IP when a proxy is used
    response = urllib.request.urlopen(url)
    print(response.read().decode('utf-8'))
    
    # Common error:
    # HTTPError: HTTP Error 403: Forbidden -- the proxy server has likely restricted access and the current IP is not on its allow list
    
    

    Proxies requiring authentication

    # Error: HTTPError: HTTP Error 407: Proxy Authentication Required
    
    # Method 1: embed the credentials in the proxy URL: http://username:password@host:port
    import urllib.request
    
    # dict: key is the protocol, value is the proxy URL
    proxy_dict = {
        'http': 'http://name:password@127.0.0.1:6688',
        'https': 'http://name:password@127.0.0.1:6688',
    }
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36',
    }
    
    
    proxy_handler = urllib.request.ProxyHandler(proxy_dict)
    opener = urllib.request.build_opener(proxy_handler)
    urllib.request.install_opener(opener)
    
    opener.addheaders = list(headers.items())   # set request headers
    
    url = 'http://www.whatismyip.com.tw/' # echoes the client IP; shows the proxy IP when a proxy is used
    response = opener.open(url)
    print(response.read().decode('utf-8'))
    
    
    # Method 2: use ProxyBasicAuthHandler for proxy authentication (supply the username and password)
    import urllib.request
    
    # dict: key is the protocol, value is the proxy URL
    proxy_dict = {
        'http': 'http://127.0.0.1:6688',
    }
    
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36',
    }
    
    
    proxy_handler = urllib.request.ProxyHandler(proxy_dict)
    password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
    password_mgr.add_password(None, 'http://127.0.0.1:6688', 'name', 'password') # realm can simply be None
    proxy_auth_handler = urllib.request.ProxyBasicAuthHandler(password_mgr)
    opener = urllib.request.build_opener(proxy_handler, proxy_auth_handler)
    urllib.request.install_opener(opener)
    
    opener.addheaders = list(headers.items())   # set request headers
    
    url = 'http://www.whatismyip.com.tw/' # echoes the client IP; shows the proxy IP when a proxy is used
    response = opener.open(url)
    print(response.read().decode('utf-8'))
    

    HTTPBasicAuthHandler

    Used for HTTP Basic authentication when accessing a web server

    import urllib.request
    
    url = 'http://127.0.0.1/test/'
    password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
    password_mgr.add_password(None, url, 'admin','password')  # register the username and password for this url
    http_auth_handler = urllib.request.HTTPBasicAuthHandler(password_mgr)
    opener = urllib.request.build_opener(http_auth_handler)
    response = opener.open(url)
    print(response.read().decode('utf-8'))
    
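    For experimenting without a local server, httpbin also exposes a Basic-Auth test endpoint at /basic-auth/<user>/<passwd>; a sketch using the throwaway credentials user/passwd:

    import urllib.request
    
    url = 'http://httpbin.org/basic-auth/user/passwd'
    password_mgr = urllib.request.HTTPPasswordMgrWithDefaultRealm()
    password_mgr.add_password(None, url, 'user', 'passwd')
    http_auth_handler = urllib.request.HTTPBasicAuthHandler(password_mgr)
    opener = urllib.request.build_opener(http_auth_handler)
    response = opener.open(url)
    print(response.read().decode('utf-8'))  # {"authenticated": true, "user": "user"}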

    FTPHandler

    import urllib.request
    
    
    username = 'ftp1.linuxidc.com'
    password = 'www.linuxidc.com'
    
    # embed the credentials in the URL: ftp://username:password@host
    ftp_url = 'ftp://%s:%s@ftp1.linuxidc.com' % (username, password)
    ftp_handler = urllib.request.FTPHandler()  
    opener = urllib.request.build_opener(ftp_handler)
    response = opener.open(ftp_url)
    print(response.read().decode('utf-8', 'ignore'))
    

    HTTPHandler, HTTPSHandler

    import urllib.request
    
    
    url = 'http://www.baidu.com'
    # with debuglevel=1 the debug log is switched on, so the packets sent and received are printed to the screen -- handy for debugging
    http_handler = urllib.request.HTTPHandler(debuglevel=1)
    https_handler = urllib.request.HTTPSHandler(debuglevel=1)
    opener = urllib.request.build_opener(http_handler, https_handler)
    response = opener.open(url)
    
    '''
    Output:
    send: b'GET / HTTP/1.1
    Accept-Encoding: identity
    Host: www.baidu.com
    User-Agent: Python-urllib/3.6
    Connection: close
    
    '
    reply: 'HTTP/1.1 200 OK
    '
    header: Date header: Content-Type header: Transfer-Encoding header: Connection header: Vary header: Set-Cookie header: Set-Cookie header: Set-Cookie header: Set-Cookie header: Set-Cookie header: Set-Cookie header: P3P header: Cache-Control header: Cxy_all header: Expires header: X-Powered-By header: Server header: X-UA-Compatible header: BDPAGETYPE header: BDQID header: BDUSERID 
    '''
    

    CookieJar

    import urllib.request
    import http.cookiejar
    
    
    url = 'http://www.baidu.com'
    cookie = http.cookiejar.CookieJar()
    cookie_handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(cookie_handler)
    response = opener.open(url)
    print(response.getcode())
    for item in cookie:  # each item is a <class 'http.cookiejar.Cookie'>
        print(item.name, item.value, sep=" : ")
    
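    Because the CookieJar is attached to the opener, cookies received on the first request are sent back automatically on later requests made through the same opener; a minimal sketch:

    import urllib.request
    import http.cookiejar
    
    cookie = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
    
    opener.open('http://www.baidu.com')             # first request: the server sets cookies
    response = opener.open('http://www.baidu.com')  # later requests carry them automatically
    print(len(cookie), 'cookies in the jar')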

    MozillaCookieJar

    Creates a FileCookieJar instance compatible with the Mozilla cookies.txt file format

    import urllib.request
    import http.cookiejar
    
    
    url = 'https://www.zhihu.com/settings/profile'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36'
    }
    
    cookie = http.cookiejar.MozillaCookieJar("zhihu_cookie.txt")
    cookie_handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(cookie_handler)
    opener.addheaders = headers.items()
    
    try:
        cookie.load()    # load the cookie data from the file into memory -- essential
    except http.cookiejar.LoadError as e:
        print('failed to load the cookie file')
    except IOError as e:
        print("cookie file does not exist")
    
    response = opener.open(url)
    print(response.geturl())  # compare geturl() with url to check whether login succeeded; on failure the request is redirected to Zhihu's login page
    html = response.read().decode('utf-8')
    print(html)
    
    # after a successful login, call the MozillaCookieJar object's save() method to write the in-memory cookies to the file (see the sketch below)
    
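    A sketch of that save step: after logging in through the same opener (the login request itself is site-specific and omitted here), persist the in-memory cookies to zhihu_cookie.txt so a later run can load() them:

    import urllib.request
    import http.cookiejar
    
    cookie = http.cookiejar.MozillaCookieJar("zhihu_cookie.txt")
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
    
    # ... perform the site-specific login request through `opener` here ...
    
    # ignore_discard: also save session cookies; ignore_expires: save even expired ones
    cookie.save(ignore_discard=True, ignore_expires=True)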

    LWPCookieJar

    Creates a FileCookieJar instance compatible with the libwww-perl Set-Cookie3 file format

    import urllib.request
    import http.cookiejar
    
    
    url = 'http://www.baidu.com'
    cookie = http.cookiejar.LWPCookieJar("cookies.txt")
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
    response = opener.open(url)
    # save() must be called to write the in-memory cookies to the local file; on the next run, just call load() to bring them back into memory (see the sketch below)
    cookie.save(ignore_discard=True, ignore_expires=True)
    
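    And the corresponding load step for the next run; a minimal sketch:

    import urllib.request
    import http.cookiejar
    
    cookie = http.cookiejar.LWPCookieJar("cookies.txt")
    cookie.load(ignore_discard=True, ignore_expires=True)  # restore the cookies saved earlier
    opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
    response = opener.open('http://www.baidu.com')  # the restored cookies are sent with the request
    print(response.getcode())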

    Exception handling

    URLError

    Typical causes of a URLError: there is no network connection (the local machine cannot get online), or the target server does not exist. In these cases the exception object has a reason attribute, usually the underlying socket error (e.g. a socket.gaierror carrying an error code and message). The exception can be caught as follows:

    import urllib.request
    import urllib.error
    
    
    try:
        response = urllib.request.urlopen('http://www.hello_world.org')
    except urllib.error.URLError as e:
        print(type(e.reason))  # <class 'socket.gaierror'>
        print(e.reason)  # [Errno 11001] getaddrinfo failed
    

    HTTPError

    HTTPError is a subclass of URLError. Every call to urlopen() that reaches the server produces a response carrying a numeric status code.
    Common codes include 200 (request succeeded), 302 (redirect), and 304 (the document has not changed since the last visit, or given the request's conditions).
    Some status codes indicate that the server could not fulfill the request; in those cases urlopen raises an HTTPError.
    Typical errors include 404 (page not found), 403 (request forbidden), 401 (authentication required), 407 (proxy authentication required), and 500 (internal server error).
    
    # Approach 1
    import urllib.request
    import urllib.error
    
    
    url = 'http://www.hello_world.org'
    # url = 'http://example.com/test.html'
    try:
        response = urllib.request.urlopen(url)
    # HTTPError is a subclass of URLError, so it must be handled first
    except urllib.error.HTTPError as e:
        print("The server cannot fulfill the request...")
        print("Error code: ", e.code)
        print("Reason: ", e.reason)
    except urllib.error.URLError as e:
        print("failed to fetch the server...")
        print("Reason: ", e.reason)
    
    
    # Approach 2
    import urllib.request
    import urllib.error
    
    
    url = 'http://www.hello_world.org'
    # url = 'http://example.com/test.html'
    try:
        response = urllib.request.urlopen(url)
    except urllib.error.URLError as e:
        if hasattr(e, 'code'):
            print("The server cannot fulfill the request...")
            print("Error code: ", e.code)
            print("Reason: ", e.reason)
        else:
            print("failed to fetch the server...")
            print("Reason: ", e.reason)     
    
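    An HTTPError is itself a file-like response object: besides code and reason it carries the response headers, and its body can be read just like a normal response. A sketch, assuming httpbin's /status/404 endpoint for illustration:

    import urllib.request
    import urllib.error
    
    try:
        urllib.request.urlopen('http://httpbin.org/status/404')
    except urllib.error.HTTPError as e:
        print(e.code)               # 404
        print(e.headers['Server'])  # the response headers are still available
        print(e.read()[:100])       # the error body can be read like a normal response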

    urllib.parse

    urllib.parse.urlparse(urlstring, scheme='', allow_fragments=True)

    Parses a URL into its components

    from urllib.parse import urlparse
    
    # def urlparse(url, scheme='', allow_fragments=True)
    # splits the url into 6 parts: <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    # returns a 6-tuple: (scheme, netloc, path, params, query, fragment)
    
    
    result = urlparse('http://www.baidu.com/index.html;user?id=100#comment')
    print(type(result))  # <class 'urllib.parse.ParseResult'>
    print(result)   # ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=100', fragment='comment')
    
    # netloc is only recognized when it is introduced by //
    result = urlparse(
        '//www.baidu.com/index.html;user?id=100#comment',
        scheme="https")
    print(result)  # ParseResult(scheme='https', netloc='www.baidu.com', path='/index.html', params='user', query='id=100', fragment='comment')
    
    
    result = urlparse(
        'www.baidu.com/index.html;user?id=100#comment',
        scheme="https")
    print(result)  # ParseResult(scheme='https', netloc='', path='www.baidu.com/index.html', params='user', query='id=100', fragment='comment')
    
    # if the url already contains a scheme, the existing scheme is kept
    result = urlparse(
        'http://www.baidu.com/index.html;user?id=100#comment',
        scheme="https")
    print(result)  # ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=100', fragment='comment')
    
    result = urlparse(
        "http://www.baidu.com/index.html;user?id=100#comment",
        allow_fragments=False)
    print(result)  # ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=100#comment', fragment='')
    
    result = urlparse(
        "http://www.baidu.com/index.html#comment",
        allow_fragments=False)
    print(result)  # ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html#comment', params='', query='', fragment='')
    
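    ParseResult is a named tuple, so the six parts can be read by attribute, by index, or by unpacking; a quick sketch:

    from urllib.parse import urlparse
    
    result = urlparse('http://www.baidu.com/index.html;user?id=100#comment')
    print(result.scheme, result.netloc)  # attribute access: http www.baidu.com
    print(result[0], result[1])          # index access: same values
    scheme, netloc, path, params, query, fragment = result  # tuple unpacking also works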

    urllib.parse.urlunparse(parts)

    from urllib.parse import urlunparse
    
    
    data = ("http", 'www.baidu.com','index.html', 'user','id=100','comment')
    url = urlunparse(data)
    print(url)  # http://www.baidu.com/index.html;user?id=100#comment
    
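    urlunparse() is the inverse of urlparse(): feeding urlparse()'s result back reconstructs the original URL; a quick check:

    from urllib.parse import urlparse, urlunparse
    
    url = 'http://www.baidu.com/index.html;user?id=100#comment'
    assert urlunparse(urlparse(url)) == url  # round-trips cleanly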

    urllib.parse.urljoin(base, url, allow_fragments=True)

    # The relative url takes precedence; the base url supplies the parts the relative url lacks (scheme, netloc) to build the complete url
    from urllib.parse import urljoin
    
    
    print(urljoin("http://www.baidu.com","FAQ.html"))
    print(urljoin("http://www.baidu.com/index.html","FAQ.html"))
    print(urljoin("http://www.baiud.com/index.html", "http://www.google.com/FAQ.html"))
    print(urljoin("http://www.baidu.com/index.html", "http://www.google.com/FAQ.html?question=2"))
    print(urljoin("http://www.baidu.com/index.html?wd=abc", "http://www.google.com/FAQ.html"))
    print(urljoin("http://www.baidu.com/", "?category=5#comment"))
    print(urljoin("http://www.baidu.com/#comment", "?category=5"))
    
    '''
    http://www.baidu.com/FAQ.html
    http://www.baidu.com/FAQ.html
    http://www.google.com/FAQ.html
    http://www.google.com/FAQ.html?question=2
    http://www.google.com/FAQ.html
    http://www.baidu.com/?category=5#comment
    http://www.baidu.com/?category=5
    '''
    

    urllib.parse.urlencode(query, doseq=False, safe='', encoding=None, errors=None, quote_via=quote_plus)

    from urllib.parse import urlencode
    
    basic_url = 'http://httpbin.org/get'
    data = {
        "key": '天气',
    }
    data = urlencode(data)
    full_url = '%s?%s' % (basic_url, data)
    print(full_url) # http://httpbin.org/get?key=%E5%A4%A9%E6%B0%94
    
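    Two of the optional parameters in the signature above are worth a note: doseq=True expands sequence values into repeated keys, and quote_via swaps the quoting function (the default quote_plus encodes spaces as '+', while quote encodes them as '%20'); a quick sketch:

    from urllib.parse import urlencode, quote
    
    print(urlencode({'tag': ['a', 'b']}, doseq=True))        # tag=a&tag=b
    print(urlencode({'q': 'hello world'}))                   # q=hello+world (quote_plus is the default)
    print(urlencode({'q': 'hello world'}, quote_via=quote))  # q=hello%20world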