• requests爬取数据与aiohttp爬取数据对比


    # 同步
    
    from datetime import datetime
    
    import requests
    from lxml import etree
    
    # Chrome-style User-Agent header passed along with each request below.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/72.0.3626.121 Safari/537.36"
        ),
    }
    
    
    def get_movie_url(timeout=10):
        """Fetch the Douban chart page and return the links found in its table.

        Args:
            timeout: Seconds to wait for the server; forwarded to
                ``requests.get``. Defaults to 10.

        Returns:
            The list of ``href`` values matched by the chart-table XPath.

        Raises:
            requests.exceptions.Timeout: If the server does not answer within
                ``timeout`` seconds.
        """
        req_url = "https://movie.douban.com/chart"
        # requests applies no timeout by default, so a stalled connection
        # would otherwise block the whole crawl indefinitely.
        response = requests.get(url=req_url, headers=headers, timeout=timeout)
        html = etree.HTML(response.text)
        return html.xpath(
            "//*[@id='content']/div/div[1]/div/div/table/tr/td/a/@href")
    
    
    def get_movie_content(movie_url, timeout=10):
        """Download one movie page and extract its title and credit text.

        Args:
            movie_url: URL of the movie page to fetch.
            timeout: Seconds to wait for the server; forwarded to
                ``requests.get``. Defaults to 10.

        Returns:
            A dict with two keys, ``"name"`` and ``"author"``, each holding the
            list of text nodes matched by the corresponding XPath.

        Raises:
            requests.exceptions.Timeout: If the server does not answer within
                ``timeout`` seconds.
        """
        # requests applies no timeout by default, so a stalled connection
        # would otherwise block the whole crawl indefinitely.
        response = requests.get(movie_url, headers=headers, timeout=timeout)
        result = etree.HTML(response.text)
        return {
            "name": result.xpath('//*[@id="content"]/h1/span[1]//text()'),
            "author": result.xpath('//*[@id="info"]/span[1]/span[2]//text()'),
        }
    
    
    if __name__ == '__main__':
        # Fetch every chart entry one after another and time the whole run.
        started_at = datetime.now()
        movies = {link: get_movie_content(link) for link in get_movie_url()}
        print(movies)
        elapsed = datetime.now() - started_at
        print("同步用时为:{}".format(elapsed))
    
    # 看一下同步的结果:
    #
    # E:\venv\spider\Scripts\python.exe E:/python_project/filetest/douban.py
    # [{'name': ['小丑 Joker'], 'author': ['托德·菲利普斯']},
    # {'name': ['好莱坞往事 Once Upon a Time... in Hollywood'], 'author': ['昆汀·塔伦蒂诺']},
    # {'name': ['爱尔兰人 The Irishman'], 'author': ['马丁·斯科塞斯']},
    # {'name': ['准备好了没 Ready or Not'], 'author': ['马特·贝蒂内利-奥尔平', ' / ', '泰勒·吉勒特']},
    # {'name': ['82年生的金智英 82년생 김지영'], 'author': ['金度英']},
    # {'name': ['克劳斯:圣诞节的秘密 Klaus'], 'author': ['塞尔希奥·巴勃罗斯', ' / ', '卡洛斯·马丁内斯·洛佩斯']},
    # {'name': ['寄生虫 기생충'], 'author': ['奉俊昊']},
    # {'name': ['骡子 The Mule'], 'author': ['克林特·伊斯特伍德']},
    # {'name': ['别告诉她 The Farewell'], 'author': ['王子逸']},
    # {'name': ['犯罪现场 犯罪現場'], 'author': ['冯志强']}]
    # 同步用时为:0:00:08.765342
    # Process finished with exit code 0
    
    # 异步
    # 异步也很简单,关于异步的文章我还在整理,因为涉及到太多的东西了。先看这个爬虫代码:
    
    import asyncio
    from datetime import datetime
    
    import aiohttp
    from lxml import etree
    
    # Chrome-style User-Agent header passed along with each request below.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/72.0.3626.121 Safari/537.36"
        ),
    }
    
    
    async def get_movie_url():
        """Fetch the Douban chart page with aiohttp and return its table links.

        Returns:
            The list of ``href`` values matched by the chart-table XPath.
        """
        chart_url = "https://movie.douban.com/chart"
        link_xpath = "//*[@id='content']/div/div[1]/div/div/table/tr/td/a/@href"
        async with aiohttp.ClientSession(headers=headers) as session:
            async with session.get(url=chart_url, headers=headers) as response:
                # Read and parse the body while the response is still open.
                tree = etree.HTML(await response.text())
            return tree.xpath(link_xpath)
    
    
    async def get_movie_content(movie_url):
        """Download one movie page with aiohttp and extract title and credit text.

        Args:
            movie_url: URL of the movie page to fetch.

        Returns:
            A dict with two keys, ``"name"`` and ``"author"``, each holding the
            list of text nodes matched by the corresponding XPath.
        """
        async with aiohttp.ClientSession(headers=headers) as session:
            async with session.get(url=movie_url, headers=headers) as response:
                # Read and parse the body while the response is still open.
                page = etree.HTML(await response.text())
            details = {
                "name": page.xpath('//*[@id="content"]/h1/span[1]//text()'),
                "author": page.xpath('//*[@id="info"]/span[1]/span[2]//text()'),
            }
        return details
    
    
    async def _crawl_all():
        """Fetch the chart links, then download every movie page concurrently.

        Returns:
            The list of per-movie dicts, in the same order as the chart links.
        """
        movie_url_list = await get_movie_url()
        tasks = [get_movie_content(url) for url in movie_url_list]
        return await asyncio.gather(*tasks)


    if __name__ == '__main__':
        start = datetime.now()
        # asyncio.run creates and closes the event loop itself. Calling
        # asyncio.get_event_loop() with no running loop is deprecated and
        # raises RuntimeError on recent Python versions.
        movies = asyncio.run(_crawl_all())
        print(movies)
        print("异步用时为:{}".format(datetime.now() - start))
    
    # 看一下结果,你就知道差距了:
    #
    # E:\venv\spider\Scripts\python.exe E:/python_project/filetest/aio_douban.py
    # [{'name': ['小丑 Joker'], 'author': ['托德·菲利普斯']},
    # {'name': ['好莱坞往事 Once Upon a Time... in Hollywood'], 'author': ['昆汀·塔伦蒂诺']},
    # {'name': ['爱尔兰人 The Irishman'], 'author': ['马丁·斯科塞斯']},
    # {'name': ['准备好了没 Ready or Not'], 'author': ['马特·贝蒂内利-奥尔平', ' / ', '泰勒·吉勒特']},
    # {'name': ['82年生的金智英 82년생 김지영'], 'author': ['金度英']},
    # {'name': ['克劳斯:圣诞节的秘密 Klaus'], 'author': ['塞尔希奥·巴勃罗斯', ' / ', '卡洛斯·马丁内斯·洛佩斯']},
    # {'name': ['寄生虫 기생충'], 'author': ['奉俊昊']},
    # {'name': ['骡子 The Mule'], 'author': ['克林特·伊斯特伍德']},
    # {'name': ['别告诉她 The Farewell'], 'author': ['王子逸']},
    # {'name': ['犯罪现场 犯罪現場'], 'author': ['冯志强']}]
    # 异步用时为:0:00:02.230956
    
    抟扶摇而上者九万里
  • 相关阅读:
    宝物筛选
    [HAOI2008]糖果传递
    线段树(区间查询,区间修改)——标记永久化版
    图的割边
    图的割点
    P2066 机器分配
    SP1700 TRSTAGE
    P4568 [JLOI2011]飞行路线
    POJ 2533 Longest Ordered Subsequence
    HDU 2512 一卡通大冒险
  • 原文地址:https://www.cnblogs.com/fengting0913/p/15392590.html
Copyright © 2020-2023  润新知