• urllib2基础操作


    Urllib2基础操作

    1、打开网页(urlopen)

    打开一个网页

    # Fetch a page with urllib2 (Python 2) and print its raw HTML.
    import urllib2
    response = urllib2.urlopen('http://www.baidu.com')
    html= response.read()
    print html
    

    urlopen一般常用的有三个参数,它的参数如下:

    urllib.request.urlopen(url,data,timeout)

    data参数的使用(GET)

    import urllib  
    import urllib2  
    
    # GET request: the url-encoded parameters are appended to the URL
    # as a query string (assumes `uri` is defined by the surrounding code).
    data = {'email':'myemail', 'password':'password'}  
    # BUG FIX: urlencode must receive `data`; the original passed the
    # not-yet-defined name `params`, which raises a NameError.
    params = urllib.urlencode(data)
    response= urllib.urlopen("%s?%s"%(uri, params))
    code = response.getcode()

     data参数的使用(POST)

    import urllib  
    import urllib2  
    
    # POST request: passing the url-encoded body as the second argument to
    # urlopen switches the request method from GET to POST
    # (assumes `uri` is defined by the surrounding code).
    data = {'email':'myemail', 'password':'password'}  
    params = urllib.urlencode(data) 
    response= urllib.urlopen(uri, params)
    code = response.getcode() 
    

     所以如果我们添加data参数的时候就是以post请求方式请求,如果没有data参数就是get请求方式

    timeout参数的使用

    在某些网络情况不好或者服务器端异常的情况下会出现请求慢的情况,因此需要给请求设置一个超时时间

    import urllib2
    
    # timeout (in seconds) makes the request fail fast instead of blocking
    # indefinitely when the network or server is slow.
    response = urllib2.urlopen('http://www.baidu.com', timeout=1)
    print(response.read())
    

     2、打开网页(request)

    打开一个网页

    # NOTE(review): this snippet uses the Python 3 urllib.request API,
    # unlike the rest of the article which uses Python 2's urllib2.
    import urllib.request
    
    # Wrap the URL in a Request object, then open it.
    request = urllib.request.Request('https://www.baidu.com')
    response = urllib.request.urlopen(request)
    print(response.read().decode('utf-8'))
    

     指定请求头

    import urllib2
    
    # Set the request headers (spoof a browser User-Agent)
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64)"}
    
    # Build the Request object with the custom headers
    # (assumes `url` is defined by the surrounding code)
    request = urllib2.Request(url=url, headers=headers)
    response = urllib2.urlopen(request)
    content = response.read().decode('utf-8')
    print content
    

     3、进阶

    增加代理

    # Custom request headers.
    headers = {
        'Host':'www.dianping.com',
        # BUG FIX: the original was missing the comma after the Cookie value;
        # the adjacent string literals were implicitly concatenated with the
        # following 'User-Agent' key, making the dict literal a syntax error.
        'Cookie': 'JSESSIONID=F1C38C2F1A7F7BF3BCB0C4E3CCDBE245 aburl=1; cy=2;',
        'User-Agent': "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
        }
    
    # Route all subsequent requests through an HTTP proxy.
    proxy_handler = urllib2.ProxyHandler({'http': 'http://host:port'})
    opener = urllib2.build_opener(proxy_handler)
    urllib2.install_opener(opener)  # install globally so urlopen uses the proxy
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    content = response.read().decode('utf-8')
    

     操作cookie

    import urllib2
    import cookielib
    import json
    
    # Capture the cookies the server sets into an in-memory jar.
    cookie = cookielib.CookieJar()
    cookie_s = urllib2.HTTPCookieProcessor(cookie)  # cookie handler
    opener = urllib2.build_opener(cookie_s)
    # Install the opener globally so urlopen uses it
    urllib2.install_opener(opener)
    response= urllib2.urlopen('http://www.dianping.com').read()  # fetch the page body
    print response    # page HTML
    
    # Inspect the cookies captured during the request
    print cookie, type(cookie)
    for item in cookie:
        print 'name:' + item.name + '-value:' + item.value
    

     保存cookie

    def saveCookie():
        # Fetch a page and persist the cookies it sets to a local file.
        # File the cookies are written to
        filename = 'cookie.txt'
        # MozillaCookieJar can save cookies to disk in Mozilla format
        cookie = cookielib.MozillaCookieJar(filename)
        # Cookie handler
        handler = urllib2.HTTPCookieProcessor(cookie)
        # Build the opener
        opener = urllib2.build_opener(handler)
        # Make the request
        res = opener.open('http://www.baidu.com')
        # Persist the cookies:
        # ignore_discard=True saves cookies even if they are marked to be discarded
        # ignore_expires=True writes cookies even if they already exist in the file
        cookie.save(ignore_discard=True, ignore_expires=True)
    

     在文件中取出cookie

    def getCookie():
        # Load cookies from the file written by saveCookie() and reuse
        # them for a new request.
        cookie = cookielib.MozillaCookieJar()
        # Read the cookie file into the jar
        cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)
        # Print the cookies to confirm the load succeeded
        for item in cookie:
            print 'name:' + item.name + '-value:' + item.value
        # Build an opener that sends the loaded cookies with the request
        handler = urllib2.HTTPCookieProcessor(cookie)
        opener = urllib2.build_opener(handler)
        res = opener.open('http://www.baidu.com')
        print res.read()
    

     来个实例

    def my_cookie_test():
        # Full example: request a page through a proxy with browser-like
        # headers, capture the cookies the server returns, and save them
        # to a local file as JSON.
        headers = {
            'User-Agent': "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
            'Connection': 'keep-alive',
            'Cookie': 'cy=2; _lxsdk_cuid=16000a1a16cc8-0629d2ca3b9f7-40544230-100200-16000a1a16dc8; _lxsdk=16000a1a16cc8-0629d2ca3b9f7-40544230-100200-16000a1a16dc8; _lxsdk_s=16000a1a16f-c56-870-2aa%7C%7C23; _hc.v=44792549-7147-7394-ac0a-eefed1fa19a2.1511839081; s_ViewType=10',
            'Host': 'www.dianping.com',
            'Referer': 'http://www.dianping.com/shop',
            'Upgrade-Insecure-Requests': 1
        }
        # Jar + handler to capture the cookies of the request
        cj_a = cookielib.CookieJar()
        cj_s = urllib2.HTTPCookieProcessor(cj_a)
        # Proxy handler (placeholder address)
        proxy_s = urllib2.ProxyHandler({'http': '0.0.0.0:8080'})
        opener = urllib2.build_opener(proxy_s, cj_s)
        urllib2.install_opener(opener)
        try:
            request = urllib2.Request("http://www.dianping.com/shop/000000/", headers=headers)
            response = urllib2.urlopen(request)
            content = response.read().decode('utf-8')
            # Page HTML
            print content
            # Collect captured cookies into a plain dict
            cookie_data = {}
            for item in cj_a:
                cookie_data[item.name] = item.value
            # Serialize and save to a local file
            cookie_str = json.dumps(cookie_data)
            with open('cookie.txt', 'w') as f:
                f.write(cookie_str)
            print("cookies信息已保存到本地")
        except Exception as e:
            print e
    

    网页信息抽取。。。待下期。。。

  • 相关阅读:
    form 表单验证常用正则记录
    定位某一项值在多维数据中的位置
    jquery weui picker多次动态赋值
    页面旋转立方体图片
    微信开发者工具中的正则表达式解析
    Jquery WEUI 滚动加载(infinite)不触发
    背景线条实现
    进入博客
    tomcat 修改内存配置
    win10配置jdk环境变量
  • 原文地址:https://www.cnblogs.com/shangpolu/p/7929272.html
Copyright © 2020-2023  润新知