• The urllib Library


    urllib is a package that collects several modules for working with URLs:

    urllib.request      opens and reads URLs
    urllib.error        contains the exceptions raised by urllib.request
    urllib.parse        parses URLs
    urllib.robotparser  parses robots.txt files
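
    Of these modules, urllib.robotparser gets no exercise below, so a minimal
    sketch of it follows (the target site is only an illustration):

    import urllib.robotparser
    
    # Parse a site's robots.txt and ask whether a crawler may fetch a path
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url("https://www.python.org/robots.txt")
    rp.read()
    print(rp.can_fetch("*", "https://www.python.org/"))
    A robotparser sketch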
    

     URLOPEN Exercises

    import urllib.request
    
    # Fetch a page and decode the response body as UTF-8
    response = urllib.request.urlopen("http://www.baidu.com")
    print(response.read().decode("utf-8"))
    The first crawler
    import urllib.request
    import urllib.parse
    
    # Passing a bytes data payload makes urlopen issue a POST
    data = bytes(urllib.parse.urlencode({"word": "hello"}), encoding="utf8")
    response = urllib.request.urlopen("http://httpbin.org/post", data=data)
    print(response.read())
    A POST request
    import urllib.request
    
    # Give up if no response arrives within one second
    response = urllib.request.urlopen("http://httpbin.org/get", timeout=1)
    print(response.read())
    A simple timeout
    import socket
    import urllib.request
    import urllib.error
    
    # An unrealistically short timeout forces a URLError whose reason
    # is a socket timeout
    try:
        response = urllib.request.urlopen("http://httpbin.org/get", timeout=0.1)
    except urllib.error.URLError as e:
        if isinstance(e.reason, socket.timeout):
            print("TIME OUT")
    A simple exception

     Response Exercises

    import urllib.request
    
    # urlopen returns an http.client.HTTPResponse object
    response = urllib.request.urlopen("https://www.python.org")
    print(type(response))
    The response type
    import urllib.request
    
    # Inspect the status code and the response headers
    response = urllib.request.urlopen("https://www.python.org")
    print(response.status)
    print(response.getheaders())
    print(response.getheader("Server"))
    Getting the status code and response headers
    import urllib.request
    
    # Wrap the URL in a Request object, then open it
    request = urllib.request.Request("https://www.python.org")
    response = urllib.request.urlopen(request)
    print(response.read().decode("utf-8"))
    Getting the response body

    Request Exercises

    import urllib.request
    
    request = urllib.request.Request("https://python.org")
    response = urllib.request.urlopen(request)
    print(response.read().decode("utf-8"))
    A simple request
    from urllib import request, parse
    
    url = "http://httpbin.org/post"
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)",
        "Host": "httpbin.org"
    }
    # Named "form" to avoid shadowing the built-in dict
    form = {
        "name": "Germey"
    }
    data = bytes(parse.urlencode(form), encoding="utf8")
    req = request.Request(url=url, data=data, headers=headers, method="POST")
    response = request.urlopen(req)
    print(response.read().decode("utf-8"))
    Headers
    from urllib import request, parse
    
    url = "http://httpbin.org/post"
    form = {
        "name": "Germey"
    }
    data = bytes(parse.urlencode(form), encoding="utf8")
    req = request.Request(url=url, data=data, method="POST")
    # Headers can also be attached one at a time with add_header
    req.add_header("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)")
    response = request.urlopen(req)
    print(response.read().decode("utf-8"))
    The add_header method

    Proxy Handler

    import urllib.request
    
    # Route HTTP and HTTPS traffic through a local proxy
    proxy_handler = urllib.request.ProxyHandler({
        "http": "http://127.0.0.1:9743",
        "https": "https://127.0.0.1:9743"
    })
    opener = urllib.request.build_opener(proxy_handler)
    response = opener.open("http://www.douyu.com")
    print(response.read())
    Proxies

    Cookies

    import http.cookiejar
    import urllib.request
    
    # Collect the cookies the server sets into a CookieJar
    cookie = http.cookiejar.CookieJar()
    handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(handler)
    response = opener.open("http://www.baidu.com")
    for item in cookie:
        print(item.name + "=" + item.value)
    Cookies

     

    Exception Handling

    from urllib import request, error
    
    # URLError catches failures such as unreachable hosts
    try:
        response = request.urlopen("http://chuiqingcai.com/index.htm")
    except error.URLError as e:
        print(e.reason)
    Exception 1
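
    HTTPError is a subclass of URLError that additionally carries the status
    code and response headers, so it is usually caught first. A minimal sketch,
    reusing the same URL as above:

    from urllib import request, error
    
    # Catch the more specific HTTPError before the general URLError
    try:
        response = request.urlopen("http://chuiqingcai.com/index.htm")
    except error.HTTPError as e:
        print(e.code, e.reason, e.headers, sep="\n")
    except error.URLError as e:
        print(e.reason)
    else:
        print("Request succeeded")
    Exception 2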

    URL Parsing

    1.URLPARSE

     urllib.parse.urlparse(urlstring, scheme="", allow_fragments=True)

    from urllib.parse import urlparse
    
    # Split a URL into scheme, netloc, path, params, query, and fragment
    result = urlparse("http://www.baidu.com/index.html;user?id=5#comment")
    print(type(result), result)
    Splitting a URL
    from urllib.parse import urlparse
    
    # The scheme argument supplies a default protocol for URLs that lack one
    result = urlparse("www.baidu.com/index.html;user?id=5#comment", scheme="https")
    print(result)
    Filling in a default scheme (a scheme already present in the URL is kept)
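
    To confirm that scheme only fills a gap and never overrides, a quick check:

    from urllib.parse import urlparse
    
    # The scheme already present in the URL wins over the argument
    result = urlparse("http://www.baidu.com/index.html", scheme="https")
    print(result.scheme)
    Prints "http", not "https"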

    2.URLUNPARSE

    from urllib.parse import urlunparse
    
    # Assemble a URL from its six components:
    # scheme, netloc, path, params, query, fragment
    data = ["http", "www.baidu.com", "index.html", "user", "a=6", "comment"]
    print(urlunparse(data))
    Joining components

    3.URLJOIN
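
    urljoin resolves a second, possibly relative, URL against a base URL; any
    component present in the second argument takes precedence. A minimal sketch:

    from urllib.parse import urljoin
    
    # A relative path is resolved against the base URL
    print(urljoin("http://www.baidu.com", "FAQ.html"))
    # An absolute URL in the second argument replaces the base entirely
    print(urljoin("http://www.baidu.com", "https://www.python.org/FAQ.html"))
    # A bare query string is attached to the base path
    print(urljoin("http://www.baidu.com/about.html", "?category=2"))
    Joining URLs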

    4.URLENCODE
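
    urlencode serializes a dict of parameters into a query string, which is
    convenient when building GET URLs. A minimal sketch:

    from urllib.parse import urlencode
    
    # Serialize the parameters and append them to a base URL
    params = {
        "name": "Germey",
        "age": 22
    }
    base_url = "http://www.baidu.com?"
    url = base_url + urlencode(params)
    print(url)
    Encoding query parameters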
