• asyncio crawler: fetch, deduplicate, and store to MySQL


    # asyncio crawler: fetch pages, deduplicate URLs, store results in MySQL
    import asyncio
    import aiohttp
    import aiomysql
    import re
    from pyquery import PyQuery
    
    stopping = False
    start_url = 'http://www.jobbole.com/'
    
    waiting_urls = []
    seen_urls = set()
    # URL deduplication -- at larger scale the set could be replaced with a Bloom filter (see the sketch after the listing)
    
    sem = asyncio.Semaphore(3)  # limit the number of concurrent requests
    
    async def fetch(url, session):
        # Fetch one page, bounded by the semaphore; return the HTML text or None on failure.
        async with sem:
            try:
                async with session.get(url) as resp:
                    print(resp.status)
                    if resp.status in [200, 201]:
                        data = await resp.text()
                        return data
            except Exception as e:
                print(e)
            return None
    
    
    # Parsing links is CPU-bound rather than IO-bound, so a plain (non-async) function is enough
    def extract_urls(html):
        urls = []
        pq = PyQuery(html)
        for link in pq.items('a'):
            url = link.attr('href')
            if url and url.startswith('http') and url not in seen_urls:
                urls.append(url)
                waiting_urls.append(url)
        return urls
    
    
    async def init_urls(url, session):
        html = await fetch(url, session)
        seen_urls.add(url)
        if html:
            extract_urls(html)
    
    async def article_handler(url, session, pool):
        # Fetch an article detail page, collect new links, and store its title in MySQL.
        html = await fetch(url, session)
        seen_urls.add(url)
        if not html:
            return
        extract_urls(html)
        pq = PyQuery(html)
        title = pq('title').text()
        async with pool.acquire() as conn:
            async with conn.cursor() as cur:
                # Parameterized query avoids SQL injection via the page title.
                insert_sql = 'INSERT INTO aiomysql_test(title) VALUES (%s)'
                await cur.execute(insert_sql, (title,))
    
    
    async def consumer(pool):
        # Pull URLs off the waiting list and dispatch them to the right handler.
        async with aiohttp.ClientSession() as session:
            while not stopping:
                if len(waiting_urls) == 0:
                    await asyncio.sleep(0.5)
                    continue
                url = waiting_urls.pop()
                print('start get url: {}'.format(url))
                # Article detail pages look like http://...jobbole.com/<numeric id>/
                if re.match(r'http://.*?jobbole\.com/\d+/', url):
                    if url not in seen_urls:
                        asyncio.ensure_future(article_handler(url, session, pool))
                        await asyncio.sleep(0.5)
                else:
                    if url not in seen_urls:
                        asyncio.ensure_future(init_urls(url, session))
    
    
    async def main(loop):
        # Create the aiomysql connection pool before crawling starts
        pool = await aiomysql.create_pool(host='127.0.0.1', port=3306,
                                          user='root', password='123456',
                                          db='aiomysql_test', loop=loop,
                                          charset='utf8', autocommit=True)
        async with aiohttp.ClientSession() as session:
            html = await fetch(start_url, session)
            seen_urls.add(start_url)
            if html:
                extract_urls(html)
    
        asyncio.ensure_future(consumer(pool))
    
    if __name__ == "__main__":
        loop = asyncio.get_event_loop()
        asyncio.ensure_future(main(loop))
        loop.run_forever()
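
The comment near the top of the listing suggests replacing the in-memory seen_urls set with a Bloom filter once the number of crawled URLs grows. Below is a minimal pure-Python sketch of that idea; the BloomFilter class, its default sizes, and the double-hashing scheme are illustrative assumptions rather than part of the original crawler, and a production setup would more likely use an existing library (e.g. pybloom-live) or a Redis-backed filter.

    import hashlib

    # Sketch only: not part of the original crawler.
    class BloomFilter:
        """Fixed-size bit array with num_hashes derived bit positions per item."""

        def __init__(self, size=1 << 20, num_hashes=5):
            self.size = size                      # number of bits
            self.num_hashes = num_hashes          # hash positions per item
            self.bits = bytearray(size // 8 + 1)  # backing bit array

        def _positions(self, item):
            # Derive num_hashes bit positions from one md5 digest via double hashing.
            digest = hashlib.md5(item.encode('utf-8')).digest()
            h1 = int.from_bytes(digest[:8], 'big')
            h2 = int.from_bytes(digest[8:], 'big')
            return [(h1 + i * h2) % self.size for i in range(self.num_hashes)]

        def add(self, item):
            for pos in self._positions(item):
                self.bits[pos // 8] |= 1 << (pos % 8)

        def __contains__(self, item):
            return all(self.bits[pos // 8] & (1 << (pos % 8))
                       for pos in self._positions(item))

Swapping it in would mean replacing seen_urls = set() with seen_urls = BloomFilter(); the add() calls and "url not in seen_urls" checks keep the same shape. The trade-off is a small false-positive rate, so a few URLs may occasionally be skipped even though they were never crawled.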
• Original article: https://www.cnblogs.com/Erick-L/p/8939607.html