• Python爬虫学习第一章


    # Chapter 4
    # 4.2 Quick start: fetching a web page with urllib
    # (rewritten from a REPL transcript into a runnable script;
    #  the interpreter's echoed outputs are kept as comments)

    # (1)
    import urllib.request

    # (2) Open the URL; returns an http.client.HTTPResponse-like object.
    file = urllib.request.urlopen("http://www.baidu.com")

    # (3) read() consumes the whole body, so the following readline()
    # continues from the current stream position (already at the end).
    data = file.read()
    dataline = file.readline()

    # (4)
    print(dataline)

    # (5)
    print(data)

    # (6) Save the fetched bytes; a context manager guarantees the
    # handle is closed even if write() fails.
    with open("D:/Python35/myweb/part4/1.html", "wb") as fhandle:
        fhandle.write(data)  # write() returns the byte count, e.g. 99437

    # (7) urlretrieve() downloads straight into a local file.
    filename = urllib.request.urlretrieve(
        "http://edu.51cto.com",
        filename="D:/Python35/myweb/part4/2.html")

    # (8) Clear the cache left behind by urlretrieve().
    urllib.request.urlcleanup()

    # (9) Response headers object.
    file.info()     # e.g. <http.client.HTTPMessage object at 0x...>

    # (10) HTTP status code.
    file.getcode()  # e.g. 200

    # (11) URL actually fetched (after redirects, if any).
    file.geturl()   # e.g. 'http://www.baidu.com'

    # (12) Percent-encode a string for use inside a URL.
    urllib.request.quote("http://www.sina.com.cn")      # 'http%3A//www.sina.com.cn'

    # (13) Reverse the percent-encoding.
    urllib.request.unquote("http%3A//www.sina.com.cn")  # 'http://www.sina.com.cn'

     

    # 4.3 Fully impersonating a browser -- the Headers attribute

    # (1) Plain request (some servers reject urllib's default User-Agent).
    import urllib.request

    url = "http://blog.csdn.net/weiwei_pig/article/details/51178226"
    file = urllib.request.urlopen(url)

    # (2) Method one: attach the header via a custom opener's addheaders.
    # (fixed typo in the original: "mport urllib.request")
    headers = ("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    data = opener.open(url).read()

    # (3) Save the fetched page; the context manager closes the handle.
    with open("D:/Python35/myweb/part4/3.html", "wb") as fhandle:
        fhandle.write(data)  # write() returns the byte count, e.g. 47630

    # (4) Method two: set the header on a Request object instead.
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0')
    data = urllib.request.urlopen(req).read()

     

    # 4.4 Setting a timeout
    # (the loop/try bodies had lost their indentation; restored here)

    # (1) 1-second timeout: responses slower than that raise an exception.
    import urllib.request

    for i in range(1, 100):
        try:
            file = urllib.request.urlopen("http://yum.iqianyue.com", timeout=1)
            data = file.read()
            print(len(data))
        except Exception as e:
            print("出现异常-->" + str(e))

    # (2) Generous 30-second timeout: requests rarely time out.
    for i in range(1, 100):
        try:
            file = urllib.request.urlopen("http://yum.iqianyue.com", timeout=30)
            data = file.read()
            print(len(data))
        except Exception as e:
            print("出现异常-->" + str(e))

     

    # 4.5 Hands-on with HTTP protocol requests

    # (1) GET request with an ASCII-only keyword.
    # (fixed typo in the original: "mport urllib.request")
    import urllib.request

    keywd = "hello"
    url = "http://www.baidu.com/s?wd=" + keywd
    req = urllib.request.Request(url)
    data = urllib.request.urlopen(req).read()
    with open("D:/Python35/myweb/part4/4.html", "wb") as fhandle:
        fhandle.write(data)

    # (2) GET request with a non-ASCII keyword: it must be
    # percent-encoded before being placed in the URL.
    url = "http://www.baidu.com/s?wd="
    key = "韦玮老师"
    key_code = urllib.request.quote(key)
    url_all = url + key_code
    req = urllib.request.Request(url_all)
    data = urllib.request.urlopen(req).read()
    with open("D:/Python35/myweb/part4/5.html", "wb") as fh:
        fh.write(data)

    # (3) POST request: urlencode the form fields, then encode the
    # result to UTF-8 bytes (urlopen requires bytes for POST data).
    import urllib.parse

    url = "http://www.iqianyue.com/mypost/"
    postdata = urllib.parse.urlencode({
        "name": "ceo@iqianyue.com",
        "pass": "aA123456",
    }).encode('utf-8')
    # Passing a data argument makes urllib send a POST instead of a GET.
    req = urllib.request.Request(url, postdata)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0')
    data = urllib.request.urlopen(req).read()
    with open("D:/Python35/myweb/part4/6.html", "wb") as fhandle:
        fhandle.write(data)

     

    # 4.6 Hiding behind a proxy server
    # (the function body had lost its indentation; restored here)

    # (1)
    def use_proxy(proxy_addr, url):
        """Fetch *url* through the HTTP proxy at *proxy_addr* ("host:port")
        and return the response body decoded as UTF-8."""
        import urllib.request
        proxy = urllib.request.ProxyHandler({'http': proxy_addr})
        opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
        # install_opener() makes the proxy global: every later
        # urllib.request.urlopen() call in the process uses it.
        urllib.request.install_opener(opener)
        data = urllib.request.urlopen(url).read().decode('utf-8')
        return data

    proxy_addr = "202.75.210.45:7777"
    data = use_proxy(proxy_addr, "http://www.baidu.com")
    print(len(data))

     

    # 4.7 DebugLog in practice

    # (1) debuglevel=1 makes the handlers print each HTTP(S)
    # request/response exchange to stdout as it happens.

    import urllib.request

    httphd=urllib.request.HTTPHandler(debuglevel=1)

    httpshd=urllib.request.HTTPSHandler(debuglevel=1)

    opener=urllib.request.build_opener(httphd,httpshd)

    # Install globally so a plain urlopen() below uses the debug handlers.
    urllib.request.install_opener(opener)

    # NOTE(review): this binds the response object itself, not its body;
    # call .read() if the page content is actually needed.
    data=urllib.request.urlopen("http://edu.51cto.com")

     

    # 4.8 The exception-handling workhorse -- URLError in practice
    # (the try/except bodies had lost their indentation; restored here)

    # (1) Catch URLError. Note: e.code only exists when the failure is an
    # HTTP error (HTTPError subclasses URLError); for a pure network
    # failure this print would itself raise AttributeError -- see (7).
    import urllib.request
    import urllib.error

    try:
        urllib.request.urlopen("http://blog.csdn.net")
    except urllib.error.URLError as e:
        print(e.code)
        print(e.reason)

    # (2) Catch HTTPError directly: both code and reason are guaranteed.
    try:
        urllib.request.urlopen("http://blog.csdn.net")
    except urllib.error.HTTPError as e:
        print(e.code)
        print(e.reason)

    # (3) A non-existent host raises URLError, not HTTPError,
    # so this handler does not fire and the exception propagates.
    try:
        urllib.request.urlopen("http://blog.baidusss.net")
    except urllib.error.HTTPError as e:
        print(e.reason)

    # (4) URLError does catch the unknown-host failure.
    try:
        urllib.request.urlopen("http://blog.baidusss.net")
    except urllib.error.URLError as e:
        print(e.reason)

    # (5) Handle both: HTTPError first (the subclass), then URLError.
    try:
        urllib.request.urlopen("http://blog.baidusss.net")
    except urllib.error.HTTPError as e:
        print(e.code)
        print(e.reason)
    except urllib.error.URLError as e:
        print(e.reason)

    # (6) Pitfall on display: a plain URLError has no .code attribute,
    # so print(e.code) raises AttributeError for DNS-level failures.
    try:
        urllib.request.urlopen("http://www.baidussssss.net")
    except urllib.error.URLError as e:
        print(e.code)
        print(e.reason)

    # (7) Robust form: probe with hasattr() before touching each attribute.
    try:
        urllib.request.urlopen("http://blog.csdn.net")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)

  • 相关阅读:
    redis 日常使用
    centos7新装mysql 5.7
    spring 的常用功能
    软件安装教程
    每日成长17年1月
    ubuntu下不用拔盘就可以重新识别usb设备
    使用Linux遇到的一些问题和解决方案
    在XEN上启动guest时loopback设备不足
    使用virtualenv搭建python虚拟开发环境
    Linux局域网登陆响应时间过长
  • 原文地址:https://www.cnblogs.com/ifconfig/p/14987935.html
Copyright © 2020-2023  润新知