• Python Learning Notes 012 - Networking & Exceptions


    1. How does Python access the Internet?

    url + lib =  urllib

    >>> # Use the request module from the urllib package
    >>> 
    >>> import urllib.request
    >>> 
    >>> # Use urlopen(); its first argument is a URL string or a Request object
    >>> 
    >>> response = urllib.request.urlopen("http://www.baidu.com/")
    >>> print(response)
    <http.client.HTTPResponse object at 0x02927610>
    >>> # Read the response
    >>> # read() returns raw bytes, so it has to be decoded
    >>> html = response.read()
    >>> html = html.decode("utf-8")
    >>> print(html)

    2. Downloading a cat

    We can use the site http://placekitten.com/ : just append width and height parameters to the URL and you get a cat picture of exactly that size.

    For example: http://placekitten.com/g/200/300     http://placekitten.com/400/300

    # Write a download_cat.py that downloads one cat picture; its contents are as follows:
    
    import urllib.request
    
    response = urllib.request.urlopen("http://placekitten.com/g/200/300")
    cat_img = response.read()
    
    with open("cat_200_300.jpg","wb") as f:
        f.write(cat_img)
    >>> 
    >>> # Besides read(), the response object also provides these methods:
    >>> # geturl()  info()  getcode()
    >>> 
    >>> response.geturl()
    'http://placekitten.com/g/200/300'
    >>> response.info()
    <http.client.HTTPMessage object at 0x028A6E50>
    >>> print(response.info())
    Date: Tue, 02 Aug 2016 08:57:00 GMT
    Content-Type: image/jpeg
    Content-Length: 9162
    Connection: close
    Set-Cookie: __cfduid=d58fa9ee9079943b9db4ce64366aa85f61470128220; expires=Wed, 02-Aug-17 08:57:00 GMT; path=/; domain=.placekitten.com; HttpOnly
    Accept-Ranges: bytes
    X-Powered-By: PleskLin
    Access-Control-Allow-Origin: *
    Cache-Control: public
    Expires: Thu, 31 Dec 2020 20:00:00 GMT
    Server: cloudflare-nginx
    CF-RAY: 2cc051e22cad22a0-LAX
    >>> 
    >>> response.getcode()
    200
    >>> 

    3. Simulating Youdao Translate with a POST request

    When the data parameter of urllib.request.urlopen(url, data) is supplied, the request is sent as a POST. data must be in application/x-www-form-urlencoded format, which urllib.parse.urlencode() can produce.

    import urllib.request
    import urllib.parse
    import json
    
    url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=null'
    
    content = input('Enter the text to translate: ')
    
    data = {'type':'AUTO','i':content,'doctype':'json','xmlVersion':'1.8',
            'keyfrom':'fanyi.web','ue':'UTF-8','action':'FY_BY_CLICKBUTTON',
            'typoResult':'true'}
    
    # encode() turns a Unicode string into bytes in the given encoding
    data = urllib.parse.urlencode(data).encode('utf-8')
    response = urllib.request.urlopen(url,data)
    
    # decode() turns bytes back into a Unicode string
    html = response.read().decode("utf-8")
    
    
    # html is a JSON-formatted string
    target = json.loads(html)
    
    print("翻译结果: %s" %(target['translateResult'][0][0]['tgt']))

    A summary of solutions to Python encoding problems: http://bbs.fishc.com/thread-56452-1-1.html
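
    To make the encode()/decode() pair used above concrete, here is a minimal sketch of the round trip between str and bytes (the example string is arbitrary):

    text = '翻译'                    # a Unicode str
    raw = text.encode('utf-8')       # str -> bytes (what urlopen() sends and returns)
    print(raw)                       # b'\xe7\xbf\xbb\xe8\xaf\x91'
    restored = raw.decode('utf-8')   # bytes -> str
    print(restored == text)          # True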

     

    4. Setting headers

     

    headers is a dict. There are two ways to set it: the first is to pass it as the third argument when constructing a Request object, Request(url, data, headers); the second is to create the Request object first and then call add_header(key, value) on it.

    Servers usually tell whether a request comes from a script or from a browser by inspecting the request headers, most commonly:

    1. User-Agent:
      Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36
    header = {}
    header['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36'
    
    req = urllib.request.Request(url,data,header)
    response = urllib.request.urlopen(req)
    >>> req.headers
    {'User-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36'}
    >>> 
    req = urllib.request.Request(url,data)
    req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36')

    time.sleep(5) from the time module pauses execution for 5 seconds; it is handy for spacing out requests so the server is not hit too quickly.
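
    A minimal sketch of spacing out requests with time.sleep() (the URLs below are placeholders reusing the placekitten address from above):

    import time
    import urllib.request
    
    urls = ['http://placekitten.com/200/300', 'http://placekitten.com/300/400']
    
    for u in urls:
        response = urllib.request.urlopen(u)
        print(u, response.getcode())
        time.sleep(5)   # pause 5 seconds before the next request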

    5. Proxies

      Steps:

      1. The parameter is a dict of the form {'type': 'proxy IP:port'}, where the type is the URL scheme (e.g. 'http')

      proxy_support = urllib.request.ProxyHandler({})

      2. Build a customized opener

      opener = urllib.request.build_opener(proxy_support)

      3a. Install the opener so that it is used by all later urlopen() calls

      urllib.request.install_opener(opener)

      3b. Or call the opener directly

      opener.open(url)

     Searching online for proxy IPs turns up plenty of free ones, for example: http://www.xicidaili.com/

     

    import urllib.request
    import random
    
    # This URL is a service that reports the visitor's IP address
    url = 'http://www.whatismyip.com.tw'
    
    iplist = ['183.129.178.14:8080','123.57.190.51:7777','101.231.250.102:80']
    
    proxy_support = urllib.request.ProxyHandler({'http':random.choice(iplist)})
    
    
    opener = urllib.request.build_opener(proxy_support)
    opener.addheaders=[('User-Agent','Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36')]
    
    urllib.request.install_opener(opener)
    
    response = urllib.request.urlopen(url)
    html = response.read().decode("utf-8")
    print(html)

    A small application: download the images from jandan.net ( http://jandan.net/ )

    import urllib.request
    import os
    import random
    '''
    Open a URL through a random proxy and return the raw response body (bytes).
    '''
    def open_url(url):
    
       
        iplist = ['121.193.143.249:80','119.6.136.122:80','101.231.250.102:80']
    
        proxy_support = urllib.request.ProxyHandler({'http':random.choice(iplist)})
    
    
        opener = urllib.request.build_opener(proxy_support)
        opener.addheaders=[('User-Agent','Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36')]
    
        urllib.request.install_opener(opener)
    
        response = urllib.request.urlopen(url)
        html = response.read()
    
        #req = urllib.request.Request(url)
        #req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36')
        #response = urllib.request.urlopen(req)
        #html = response.read()
    
        return html
    
    '''
    Get the number of the current (latest) comment page.
    '''
    def get_page(url):
        
        html = open_url(url).decode('utf-8')
    
        # Look for <span class="current-comment-page">[2081]</span> in the html
        a = html.find('current-comment-page') + 23
        # Starting from a, find the closing bracket ]
        b = html.find(']',a)
        
        return html[a:b]
    
    def find_imgs(page_url):
    
        html = open_url(page_url).decode('utf-8')
    
        # Look for <img src="XXX.jpg"> tags in the html
        img_addrs = []
    
        find_start = html.find('ol class="commentlist"')
        find_end = html.find('/ol',find_start)
        
        a = html.find('img src=',find_start,find_end)
    
        while a != -1:
            # Search at most 255 characters forward from a for '.jpg'
            b = html.find('.jpg',a,a+255)
            if b != -1:
               
                img_addrs.append(html[a+9:b+4])
            
            else:
                b =a + 9
    
            a = html.find('img src=', b,find_end)
    
        return img_addrs
    
    def save_imgs(folder,img_addrs):
        print(folder)
        for each in img_addrs:
            # Split the image URL on '/' and keep the last segment as the filename
            img_name = each.split('/')[-1]
      
            with open(img_name,'wb') as f:
                img = open_url(each)
                f.write(img)
    
    
    
        
    '''
    Download images into the folder directory, fetching `pages` pages of them.
    '''
    def download_mm(folder='ooxx' , pages = 10):
        # Use the os module to create the output folder and switch into it
        # os.mkdir(folder)   # uncomment on the first run, when the folder does not exist yet
        os.chdir(folder)
    
        url = 'http://jandan.net/ooxx/'
        # Get the number of the current (latest) page
        page_num = int(get_page(url))
    
        # Assemble the URL of each page and download it
        for i in range(pages):
            page_url = url + 'page-' + str(page_num - i) + '#comments'
            # Collect the image addresses found on that page
            image_addrs = find_imgs(page_url)
            # Download and save the images
            save_imgs(folder,image_addrs)
    
        print('--download__over--')
    
    if __name__ == '__main__':
        download_mm()

    Network exceptions: URLError and HTTPError. HTTPError is a subclass of URLError, so when handling both, catch HTTPError first.
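
    A minimal sketch of catching both around urlopen() (the URL is just a placeholder); because HTTPError is the subclass, it must come before URLError:

    import urllib.request
    import urllib.error
    
    url = 'http://www.example.com/not-there'
    
    try:
        response = urllib.request.urlopen(url)
    except urllib.error.HTTPError as e:
        # The server responded, but with an error status code (404, 500, ...)
        print('HTTPError:', e.code)
    except urllib.error.URLError as e:
        # The server could not be reached at all (DNS failure, connection refused, ...)
        print('URLError:', e.reason)
    else:
        print(response.getcode())   # 200 on success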
