• Python Study Notes 012_Networking_Exceptions


    1. How does Python access the internet?

    url + lib =  urllib

    >>> # Use the request module from the urllib package
    >>> 
    >>> import urllib.request
    >>> 
    >>> # Call urlopen(); the first argument is a URL string or a Request object
    >>> 
    >>> response = urllib.request.urlopen("http://www.baidu.com/")
    >>> print(response)
    <http.client.HTTPResponse object at 0x02927610>
    >>> # Read the returned data
    >>> # read() returns raw bytes, which need to be decoded
    >>> html = response.read()
    >>> html = html.decode("utf-8")
    >>> print(html)

     2. Download a cat

     We can use the site http://placekitten.com/ : just append a width and a height to the URL to get a made-to-measure cat picture.

    For example: http://placekitten.com/g/200/300     http://placekitten.com/400/300

    # Write a script download_cat.py that downloads one cat picture:
    
    import urllib.request
    
    response = urllib.request.urlopen("http://placekitten.com/g/200/300")
    cat_img = response.read()
    
    with open("cat_200_300.jpg","wb") as f:
        f.write(cat_img)
    >>> 
    >>> # Besides read(), the response object also offers these methods:
    >>> # geturl()  info()  getcode()
    >>> 
    >>> response.geturl()
    'http://placekitten.com/g/200/300'
    >>> response.info()
    <http.client.HTTPMessage object at 0x028A6E50>
    >>> print(response.info())
    Date: Tue, 02 Aug 2016 08:57:00 GMT
    Content-Type: image/jpeg
    Content-Length: 9162
    Connection: close
    Set-Cookie: __cfduid=d58fa9ee9079943b9db4ce64366aa85f61470128220; expires=Wed, 02-Aug-17 08:57:00 GMT; path=/; domain=.placekitten.com; HttpOnly
    Accept-Ranges: bytes
    X-Powered-By: PleskLin
    Access-Control-Allow-Origin: *
    Cache-Control: public
    Expires: Thu, 31 Dec 2020 20:00:00 GMT
    Server: cloudflare-nginx
    CF-RAY: 2cc051e22cad22a0-LAX
    >>> 
    >>> response.getcode()
    200
    >>> 

     3. Simulating Youdao Translate with a POST request

    If the data argument of urllib.request.urlopen(url, data) is given, the request is sent as a POST, and data must be in application/x-www-form-urlencoded format; urllib.parse.urlencode() builds that query string from a dict (it still has to be encoded to bytes, as the code below does).

    import urllib.request
    import urllib.parse
    import json
    
    url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&smartresult=ugc&sessionFrom=null'
    
    content = input('Enter the text to translate: ')
    
    data = {'type':'AUTO','i':content,'doctype':'json','xmlVersion':'1.8',
            'keyfrom':'fanyi.web','ue':'UTF-8','action':'FY_BY_CLICKBUTTON',
            'typoResult':'true'}
    
    # encode converts str (Unicode) to bytes in the given encoding
    data = urllib.parse.urlencode(data).encode('utf-8')
    response = urllib.request.urlopen(url,data)
    
    # decode converts bytes back to str (Unicode)
    html = response.read().decode("utf-8")
    
    
    # html is a JSON-formatted string
    target = json.loads(html)
    
    print("Translation: %s" % (target['translateResult'][0][0]['tgt']))
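
    A quick interactive demo of the encode/decode pair used above (a minimal sketch; assumes a UTF-8 environment):

    >>> s = '鱼'
    >>> s.encode('utf-8')                 # encode: str (Unicode) -> bytes
    b'\xe9\xb1\xbc'
    >>> b'\xe9\xb1\xbc'.decode('utf-8')   # decode: bytes -> str
    '鱼'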

    A summary of solutions to Python encoding problems: http://bbs.fishc.com/thread-56452-1-1.html

     

      4. Setting headers

     

    headers is a dict, and there are two ways to set it: the first is to pass it as the third argument when constructing a Request object, Request(url, data, headers); the second is to create the Request first and then call add_header(key, value) on it.

    Servers usually tell program access apart from browser access by checking the request's

    1. User-Agent:
      Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36
    header = {}
    header['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36'
    
    req = urllib.request.Request(url,data,header)
    response = urllib.request.urlopen(req)
    >>> req.headers
    {'User-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36'}
    >>> 
    req = urllib.request.Request(url,data)
    req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36')

    sleep(5) from the time module pauses execution for 5 seconds; useful for spacing out repeated requests.
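
    For example, to translate in a loop without hammering the server (a minimal sketch; the request itself is elided here, and the 5-second interval is arbitrary):

    import time

    while True:
        content = input('Text to translate (q to quit): ')
        if content == 'q':
            break
        # ... build data and call urllib.request.urlopen(url, data) as in section 3 ...
        time.sleep(5)   # pause 5 seconds between requests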

    5. Proxies

      Steps:

      1. The parameter is a dict of the form {'type': 'proxy IP:port'}:

      proxy_support = urllib.request.ProxyHandler({'http': 'proxy IP:port'})

      2. Customize and create an opener

      opener = urllib.request.build_opener(proxy_support)

      3a. Install the opener globally

      urllib.request.install_opener(opener)

      3b. Or call the opener directly (see the sketch after the full example below)

      opener.open(url)

     Searching the web for "proxy IP" turns up plenty of free proxies, for example: http://www.xicidaili.com/

     

    import urllib.request
    import random
    
    # This URL is a service that reports your IP address
    url = 'http://www.whatismyip.com.tw'
    
    iplist = ['183.129.178.14:8080','123.57.190.51:7777','101.231.250.102:80']
    
    proxy_support = urllib.request.ProxyHandler({'http':random.choice(iplist)})
    
    
    opener = urllib.request.build_opener(proxy_support)
    opener.addheaders=[('User-Agent','Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36')]
    
    urllib.request.install_opener(opener)
    
    response = urllib.request.urlopen(url)
    html = response.read().decode("utf-8")
    print(html)
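
    If you would rather not change the global default (step 3b above), skip install_opener() and call the opener directly; opener.open() accepts the same arguments as urlopen(). A minimal sketch reusing the placeholder proxy list:

    import urllib.request
    import random

    iplist = ['183.129.178.14:8080','123.57.190.51:7777','101.231.250.102:80']

    opener = urllib.request.build_opener(
        urllib.request.ProxyHandler({'http':random.choice(iplist)}))

    # only requests made through this opener go via the proxy
    response = opener.open('http://www.whatismyip.com.tw')
    print(response.read().decode('utf-8'))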

    A small application: download the girl pictures ('ooxx' section) from jandan.net, http://jandan.net/

    import urllib.request
    import os
    import random
    '''
    Open a URL and return the raw page bytes.
    '''
    def open_url(url):
    
       
        iplist = ['121.193.143.249:80','119.6.136.122:80','101.231.250.102:80']
    
        proxy_support = urllib.request.ProxyHandler({'http':random.choice(iplist)})
    
    
        opener = urllib.request.build_opener(proxy_support)
        opener.addheaders=[('User-Agent','Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36')]
    
        urllib.request.install_opener(opener)
    
        response = urllib.request.urlopen(url)
        html = response.read()
    
        # Alternative without a proxy:
        #req = urllib.request.Request(url)
        #req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36')
        #response = urllib.request.urlopen(req)
        #html = response.read()
    
        return html
    
    '''
    Get the number of the current (newest) comment page.
    '''
    def get_page(url):
        
        html = open_url(url).decode('utf-8')
    
        # Look for <span class="current-comment-page">[2081]</span> in the html;
        # +23 skips past 'current-comment-page">[' to the first digit
        a = html.find('current-comment-page') + 23
        # from position a, find the closing bracket ]
        b = html.find(']',a)
        
        return html[a:b]
    
    def find_imgs(page_url):
    
        html = open_url(page_url).decode('utf-8')
    
        # Look for <img src="XXX.jpg"> in the html and collect the addresses
        img_addrs = []
    
        # limit the search to the comment list: <ol class="commentlist"> ... </ol>
        find_start = html.find('ol class="commentlist"')
        find_end = html.find('/ol',find_start)
        
        a = html.find('img src=',find_start,find_end)
    
        while a != -1:
            # search from a, at most 255 characters ahead
            b = html.find('.jpg',a,a+255)
            if b != -1:
                # a+9 skips past 'img src="'; b+4 keeps the '.jpg' extension
                img_addrs.append(html[a+9:b+4])
            else:
                b = a + 9
    
            a = html.find('img src=', b,find_end)
    
        return img_addrs
    
    def save_imgs(folder,img_addrs):
        print(folder)
        for each in img_addrs:
            # split the image URL on '/' and keep the last part as the filename
            img_name = each.split('/')[-1]
      
            with open(img_name,'wb') as f:
                img = open_url(each)
                f.write(img)
    
    
    
        
    '''
    Download the girl images into the given folder, fetching the given number of pages.
    '''
    def download_mm(folder='ooxx', pages=10):
        # use the os module to create the folder (if it doesn't exist) and switch into it
        os.makedirs(folder, exist_ok=True)
        os.chdir(folder)
    
        url = 'http://jandan.net/ooxx/'
        # get the number of the newest page
        page_num = int(get_page(url))
    
        # build each page URL, counting down from the newest page
        for i in range(pages):
            page_url = url + 'page-' + str(page_num - i) + '#comments'
            # collect the image addresses on that page
            image_addrs = find_imgs(page_url)
            # download and save the images
            save_imgs(folder,image_addrs)
    
        print('--download__over--')
    
    if __name__ == '__main__':
        download_mm()

    Network exceptions: URLError and HTTPError. HTTPError is a subclass of URLError, so when catching both, handle HTTPError first.
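
    A minimal sketch of catching both (the subclass must come first, or the URLError clause would swallow it):

    import urllib.request
    import urllib.error

    try:
        response = urllib.request.urlopen('http://jandan.net/ooxx/')
    except urllib.error.HTTPError as e:
        # the server responded, but with an error status (404, 500, ...)
        print('HTTPError:', e.code, e.reason)
    except urllib.error.URLError as e:
        # the server could not be reached (DNS failure, connection refused, ...)
        print('URLError:', e.reason)
    else:
        print(response.getcode())   # 200 on success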
