• The urllib library



    Introduction to urllib

    import urllib.request

    # Send a request to the given URL and return the server's response data
    url = 'http://www.baidu.com/'
    response = urllib.request.urlopen(url)
    # First way to read: read the entire body as bytes
    data = response.read()
    # Second way to read: read a single line
    # data = response.readline()
    # Third way to read: read the whole body, assigning the lines to a list variable
    # data = response.readlines()
    with open(r'/Users/mac/PycharmProjects/TEST/TEST/爬虫day/file/file.html', 'wb') as f:
        f.write(data)

    Commonly used response attributes and methods

    # Return the response headers
    print(response.info())
    
    # Return the status code
    print(response.getcode())
    # Return the URL actually being crawled
    print(response.geturl())
    
    # Decode percent-encoded characters (e.g. Chinese) in a URL
    print(urllib.request.unquote('url'))
    
    # Percent-encode a string for use in a URL
    print(urllib.request.quote('url'))
    
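
    A quick round trip makes quote and unquote concrete. The query value 王月 below is borrowed from the urlencode example later in these notes, not from a real request:

    # Percent-encode the non-ASCII part of a URL, leaving URL syntax characters intact
    encoded = urllib.request.quote('https://www.baidu.com/s?wd=王月', safe=':/?=&')
    print(encoded)  # https://www.baidu.com/s?wd=%E7%8E%8B%E6%9C%88
    # Decode it back to the original string
    print(urllib.request.unquote(encoded))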
    

    Writing a crawled page directly to a file

    # urllib.request.urlretrieve('http://www.baidu.com',
    #                            filename=r'/Users/mac/PycharmProjects/TEST/TEST/爬虫day/file/file2.html')
    # Clear the cache left behind by urlretrieve
    # urllib.request.urlcleanup()
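
    urlretrieve also accepts a reporthook callback that is called once per downloaded block, which is handy for rough progress reporting. A minimal sketch (the filename here is a hypothetical local path):

    import urllib.request

    def report(block_num, block_size, total_size):
        # Print an approximate download percentage when the server sends a Content-Length
        if total_size > 0:
            percent = min(block_num * block_size / total_size * 100, 100)
            print('%.1f%%' % percent)

    urllib.request.urlretrieve('http://www.baidu.com', filename='file2.html', reporthook=report)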
    

     Simulating a browser

    # url = 'http://www.baidu.com'
    # # Request headers
    # headers = {
    #     'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) "
    #                   "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    # }
    # # Build a Request object carrying the headers
    # req = urllib.request.Request(url, headers=headers)
    # # Send the request
    # response = urllib.request.urlopen(req)
    # data = response.read()
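
    A common extension of this pattern is rotating through several User-Agent strings so repeated requests look less uniform; a sketch with illustrative UA strings:

    import random
    import urllib.request

    user_agents = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
    ]
    # Pick a random User-Agent for each request
    headers = {'User-Agent': random.choice(user_agents)}
    req = urllib.request.Request('http://www.baidu.com', headers=headers)
    response = urllib.request.urlopen(req)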

     

    # Set a timeout
    # If the page does not respond for too long, the request is judged to have
    # timed out and the page cannot be crawled
    # for i in range(1, 100):
    #     try:
    #         response = urllib.request.urlopen(
    #             "http://www.baidu.com", timeout=0.5)
    #         print(len(response.read().decode('utf-8')))
    #     except:
    #         print('Request timed out; moving on to the next crawl')
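
    A bare except also swallows errors that have nothing to do with timeouts. A sketch that catches only the timeout-related exceptions (urlopen raises socket.timeout directly, or a urllib.error.URLError wrapping it):

    import socket
    import urllib.error
    import urllib.request

    try:
        response = urllib.request.urlopen('http://www.baidu.com', timeout=0.5)
        print(len(response.read().decode('utf-8')))
    except (socket.timeout, urllib.error.URLError) as e:
        # Timed out; a real crawler would log this and move on to the next URL
        print('Request timed out:', e)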
    

    Packing request parameters

    # url = "http://www.sunck.wang:8085/form"
    # data = {
    #     "username": "sunck",
    #     "passwd": "666"
    # }
    # # 对要发送的数据进行打包, 注意编码
    # postData = urllib.parse.urlencode(data).encode('utf-8')
    # # 请求体
    # # 请求
    # headers = {
    #     'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) "
    #                   "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    # }
    # # 设置一个请求体
    # req = urllib.request.Request(url, headers=headers, data=postData)
    # # 发起请求
    # response = urllib.request.urlopen(req)
    # data = response.read()
    # print(data.decode('utf-8'))
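
    For a GET request, the same urlencode output is simply appended to the URL as a query string instead of being sent as a body. A minimal sketch reusing the example form server above:

    import urllib.parse
    import urllib.request

    # Build the query string and attach it to the URL
    query = urllib.parse.urlencode({'username': 'sunck', 'passwd': '666'})
    url = 'http://www.sunck.wang:8085/form?' + query
    response = urllib.request.urlopen(url)
    print(response.read().decode('utf-8'))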


    urlencode encoding
    Example:

    import urllib.parse
    param = {'name': '王月'}
    print(urllib.parse.urlencode(param))
    Result: name=%E7%8E%8B%E6%9C%88

    parse_qs decoding
    Example:

    name = urllib.parse.urlencode(param)
    print(urllib.parse.parse_qs(name))
    Result: {'name': ['王月']}
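
    parse_qs returns every value as a list. If ordered (key, value) pairs are more convenient, urllib.parse.parse_qsl returns a list of tuples instead; the extra age key below is just an illustrative addition:

    import urllib.parse
    qs = urllib.parse.urlencode({'name': '王月', 'age': '20'})
    print(urllib.parse.parse_qs(qs))   # {'name': ['王月'], 'age': ['20']}
    print(urllib.parse.parse_qsl(qs))  # [('name', '王月'), ('age', '20')]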

     Handling HTTPS URLs

    import ssl
    import json
    #
    # def ajaxCrawler(url):
    #
    #     headers = {
    #         'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) "
    #         "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
    #     }
    #     req = urllib.request.Request(url, headers=headers)
    #
    #     # Use an unverified SSL context to skip certificate verification
    #     context = ssl._create_unverified_context()
    #     response = urllib.request.urlopen(req, context=context)
    #     json_str = response.read().decode('utf-8')
    #     json_data = json.loads(json_str)
    #     #
    #     return json_data
    #
    # url = "https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start=20&limit=20"
    # info = ajaxCrawler(url)
    # print(info)
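
    Passing context on every urlopen call gets repetitive. The ssl module also allows swapping the default HTTPS context globally, a blunt but common shortcut in crawler scripts (it disables certificate verification process-wide, so use with care):

    import ssl
    # Every subsequent urlopen of an https:// URL now skips certificate verification
    ssl._create_default_https_context = ssl._create_unverified_context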
    
    
    # Automatically crawl the next page
    
    # for i in range(10):
    #     url2 = "https://movie.douban.com/j/chart/top_list?type=11&interval_id=100%3A90&action=&start=" + str(
    #         i * 20) + "&limit=20"
    #     info = ajaxCrawler(url2)
    #     print(len(info))
    
    #
    
    def joyCrawler(url):
        headers = {
            'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
        }
        req = urllib.request.Request(url, headers=headers)
        # Use an unverified SSL context for the HTTPS request
        context = ssl._create_unverified_context()
        response = urllib.request.urlopen(req, context=context)
        html = response.read().decode('utf-8')
        # Save the crawled page to a local file
        with open(r'/Users/mac/PycharmProjects/TEST/TEST/爬虫day/file/file3.html', 'w') as f:
            f.write(html)
    
    
    url = 'https://www.qiushibaike.com/8hr/page/2/'
    joyCrawler(url)  # the function writes file3.html and returns nothing
    

     The urlparse and urlsplit functions:

    url = 'http://www.baidu.com/s?wd=python&username=abc#1'
    result = urllib.parse.urlparse(url)
    print(result)
    Result: ParseResult(scheme='http', netloc='www.baidu.com', path='/s', params='', query='wd=python&username=abc', fragment='1')
    
    Accessing a component:
    print('path:', result.path)
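
    The query component pairs naturally with parse_qs; continuing from the result object above:

    print('query:', result.query)               # wd=python&username=abc
    print(urllib.parse.parse_qs(result.query))  # {'wd': ['python'], 'username': ['abc']}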
    
    
    
    
    
    
    result2 = urllib.parse.urlsplit(url)
    print(result2)
    Result: SplitResult(scheme='http', netloc='www.baidu.com', path='/s', query='wd=python&username=abc', fragment='1')
    
    Difference: urlsplit's result has no params attribute, while urlparse's does.
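
    Both result objects can be reassembled into the original URL with the matching inverse functions:

    print(urllib.parse.urlunparse(result))    # http://www.baidu.com/s?wd=python&username=abc#1
    print(urllib.parse.urlunsplit(result2))   # http://www.baidu.com/s?wd=python&username=abc#1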
    
  • Original post: https://www.cnblogs.com/wangyue0925/p/11065499.html