• Python: a simple reusable framework for hand-rolled crawlers (async with gevent)


    A crawler can generally be broken down into the following steps:

    1. Open the target page

    2. Parse the page

    3. Process/store the data and add newly discovered pages as tasks

    For asynchronous crawling, a scheduler is also needed.

    For a simple crawler there is no need to deal with complex captchas: if the site can be accessed with requests/urllib after adjusting the cookie and headers, then one opener and one parser are enough. Data handling and new-task generation can be written directly in the parser class, and gevent can drive it asynchronously.
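    As a hedged illustration of that flow (not part of the project code; the URL and XPaths below are placeholders), the whole open / parse / store-and-enqueue cycle fits in a few lines:

    # illustrative sketch only: generic open -> parse -> store / enqueue loop
    import requests
    from lxml import html

    def crawl(start_url):
        tasks, seen, results = [start_url], set(), []
        while tasks:
            url = tasks.pop()                      # take one task
            if url in seen:
                continue
            seen.add(url)
            resp = requests.get(url, timeout=10)   # 1. open the page
            tree = html.document_fromstring(resp.text)
            results.append({'url': url,            # 2./3. parse and store
                            'title': tree.xpath('//title/text()')})
            tasks.extend(tree.xpath('//a/@href'))  # 3. enqueue newly found pages
            break   # stop after one page in this sketch
        return results

    if __name__ == '__main__':
        print crawl('http://httpbin.org/html')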

    Project path: ur'D:python_pymy_scrapy/scrapy_tools'

    # add __init__.py under scrapy_tools so it can be used as a package
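    The layout then looks roughly like this (only the two modules below are shown in this post; the example script name is illustrative):

    my_scrapy/
        scrapy_tools/
            __init__.py      # empty; makes scrapy_tools importable as a package
            itemparse.py     # ItemParse base class: parsing, paging, saving
            web_opener.py    # SessionFopener: requests.Session wrapper
        dzdp_example.py      # the Dianping example script (name is illustrative)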

    itemparse.py

    Build the XPath selectors to mirror the structure of the target data: one XPath for the record node, plus one XPath per field inside a record.
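    For instance, if each record in the target page looked like the made-up fragment below, items_node_x would select the outer div and each entry in item_xs would select one field relative to it:

    # made-up HTML fragment and matching selectors, only to show how the XPaths map to the data
    from lxml import html

    fragment = '''
    <div class="feed-main">
      <a class="question_link" href="/question/1">What is gevent?</a>
      <div class="zm-item-vote-info" data-votecount="42"></div>
    </div>'''

    node = html.document_fromstring(fragment).xpath('//div[@class="feed-main"]')[0]
    print node.xpath('.//a[@class="question_link"]/text()')                  # ['What is gevent?']
    print node.xpath('.//div[@class="zm-item-vote-info"]/@data-votecount')   # ['42']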

    # -*- coding: utf-8 -*-
    """
    Created on Fri Jul 07 17:24:34 2017
    
    @author: willowj
    """
    import sys
    # Python 2 hack: reload sys to get setdefaultencoding back, keeping the std streams intact
    stdout, stdin, stderr = sys.stdout, sys.stdin, sys.stderr
    reload(sys)
    sys.stdout, sys.stdin, sys.stderr = stdout, stdin, stderr
    sys.setdefaultencoding('utf8')
    
    
    import gevent
    import pandas as pd 
    import numpy as np
    from lxml import html
    import time
    import codecs
    import json
    
    
    def list_0e(list_):
        """Return the first element of a list (None if empty); non-lists pass through unchanged."""
        if isinstance(list_, list):
            if not list_:
                return None
            else:
                if len(list_)>1:
                    print 'warning : list>1,list[1]:', list_[1] #,len(list_)
                return list_[0]
        else:
            return list_
    
    
    class ItemParse(object):
        """docstring for zhihu_topi"""
        name = 'ItemParse'
    
        base_url = 'https://www.zhihu.com/topic/19551147/top-answers'
        pageN_x = '//div[@class="zm-invite-pager"]//span[last()-1]/a/text()'
        new_urls_x = None
    
        # XPath for the node that wraps one record, and the fields of each record below
        items_node_x = '//div[@class="feed-main"]'
        # note: each field is searched inside a single record node, so its XPath starts with '.'
        item_xs = dict(
            question_name = '''.//a[@class='question_link']/text()''', 
            #question_href = '''.//a[@class='question_link']/@href''', 
            author = './/div[@data-action="/answer/content"]/@data-author-name',
            author_href = '''.//a[@class='author-link']/@href''',  
            ups_x = './/div[@class="zm-item-vote-info"]/@data-votecount',
            answers_text = ".//textarea/text()",
            commentN = './/a[@name="addcomment"]/text()[last()]',
            entry_url = './/div[@data-action="/answer/content"]/@data-entry-url',
    
            #re:
            #z = re.compile('.')
            )    
        
        # URL pattern for paging
        def getnextpages(self):
            # site-specific paging rule; return an empty list if there is only one page
            if self.pageN > 1:
                urls = [self.base_url + '?page=%s' % n
                            for n in range(self.pageN, 1, -1)
                        ]
                return urls
            return []
    
    
        def __init__(self, html_):
            #self.item_atrr_xpath()
            self.results = []
            self.new_urls = []
            self.pageN = self.update_page_n(html_)
            self.nextpages = self.getnextpages()
            self.parase(html_)
    
    
        def parase(self, html_):
            # prefer XPath, fall back to compiled regex; items not found end up as None
            etree = html.document_fromstring(html_)
            items_nodes = etree.xpath(self.items_node_x)
            #results = []
            for ee in items_nodes:
                ee_str = None
                ite = {}
                for item, itemx in self.item_xs.items():
                    # each selector is either a compiled regex or an xpath string
                    if hasattr(itemx, 'findall'):
                        # regex: search the serialized HTML of this record node
                        if ee_str is None:
                            ee_str = html.tostring(ee)
                        ite[item] = itemx.findall(ee_str)
                    # xpath
                    elif isinstance(itemx, (str, unicode)):
                        if itemx.startswith('./'):
                            ite[item] = ee.xpath(itemx)
                        else:
                            print item
                            raise ValueError('xpath does not start with ./')
                    else:
                        print item
                        raise TypeError('item_xs value is neither a compiled regex nor an xpath string')

                    # normalize: nothing found -> None, one hit -> scalar, several -> joined string
                    if len(ite[item]) == 0:
                        ite[item] = None
                    elif len(ite[item]) == 1:
                        ite[item] = ite[item][0]
                    else:
                        ite[item] = '\n'.join([str(__i) for __i in ite[item]])
                    
                self.results.append(ite)
            
            # collect new task URLs if a selector is defined
            if self.new_urls_x:
                self.new_urls.extend(etree.xpath(self.new_urls_x)) 
    
        # find out how many pages there are
        def update_page_n(self, html_):
            if self.pageN_x:
                etree = html.document_fromstring(html_)
                pages = etree.xpath(self.pageN_x)
                pages = list_0e(pages)
                if isinstance(pages, basestring):
                    pages = pages.strip()
                if pages and pages.isdigit():
                    return int(pages)
            # no page-count selector, or the count could not be parsed: assume a single page
            return 1
    
        # synchronous: fetch all remaining pages one by one
        def get_nextpages(self, opener, sleep_sec=None):
            for url in self.nextpages:
                if sleep_sec:
                    time.sleep(sleep_sec)
                #if not hasattr(opener, 'get')    
                _re = opener.get(url)
                print _re.status_code,  _re.url
                self.parase(_re.text)
                print time.time()
        # the async driver and the save methods live in this class for now
        # gevent worker: pops URLs from the shared list until it is empty
        def __gevent_get_nextpages(self, opener):
            print id(opener)
            while self.nextpages:
                #start_time = time.time()
                url = self.nextpages.pop()
                print gevent.getcurrent()
                zhihu_re = opener.get(url)
                #gevent.sleep(5)
                print zhihu_re.status_code,  url
                self.parase(zhihu_re.text) 
                print time.time()
        # gevent entry point: spawn g_n workers, each with its own opener instance
        def get_nextpages_by_gevent(self, opener_class, g_n=4):
            '''
            param:  opener_class : class used to create a page opener (one instance per greenlet)
                    g_n : number of greenlets, 4 by default
            '''
            # patch the standard library so blocking IO in requests yields to other greenlets
            from gevent import monkey; monkey.patch_all()
              
            start_time = time.time()
            gs = [gevent.spawn(self.__gevent_get_nextpages, opener_class())
                    for i in range(g_n)
                    ]
            gevent.joinall(gs)    
    
            print time.time() - start_time 
            self.save_to_excel()
    
        def save_to_excel(self, path=None):
            if path:
                save_name = path
            else:
                save_name = (u'' + self.name
                             + time.strftime('%Y%m%d_%H_%M', time.localtime())
                             + '.xlsx')
            print save_name
            result_pd = pd.DataFrame(self.results)
            print 'pd ok'
            result_pd.to_excel(u'' + save_name, encoding='gb18030')
            print 'saved to ' + save_name
    
    
        def save_to_json(self, path=None):
            if path:
                save_name = path
            else:
                save_name = (u'' + self.name
                             + time.strftime('%Y%m%d_%H_%M', time.localtime())
                             + '.json')
            print save_name
            with codecs.open(save_name, 'w', encoding='gb18030') as f:
                f.write(json.dumps(self.results))

            print 'saved to ' + save_name

    To use it, subclass ItemParse and override the class attributes and the getnextpages paging method, as in the skeleton below; the Dianping example further down does the same for a real site.
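    A minimal skeleton of such a subclass (the class name, URL and XPath values here are placeholders, not part of the project):

    class MySiteItemParse(ItemParse):
        """Skeleton subclass: only the selectors and the paging rule change per site."""
        name = 'MySiteItemParse'

        base_url = 'https://example.com/list'                 # placeholder start page
        pageN_x = '//div[@class="pager"]/a[last()]/text()'    # XPath yielding the page count
        new_urls_x = None

        items_node_x = '//div[@class="item"]'                 # one node per record
        item_xs = dict(
            title = './/h2/a/text()',                         # fields are relative to a record node
            link = './/h2/a/@href',
            )

        def getnextpages(self):
            # paging-URL pattern for this site; empty list when there is only one page
            if self.pageN > 1:
                return [self.base_url + '?page=%s' % n
                        for n in range(self.pageN, 1, -1)]
            return []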

    
    
    

    web_opener.py

    It wraps requests.Session; keeping one session alive (connection reuse) makes fetching roughly twice as fast as opening each page with a fresh connection.

    For the gevent async mode, one session is created per greenlet, so each greenlet fetches pages without interfering with the others. The async driver itself currently lives in itemparse.py.
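    A quick, hedged way to check the session-reuse claim on your own machine (httpbin.org is only a stand-in target; absolute timings will differ per site):

    # rough comparison: a fresh connection per request vs. one reused Session
    import time
    import requests

    URL = 'http://httpbin.org/get'   # stand-in URL
    N = 10

    t0 = time.time()
    for _ in range(N):
        requests.get(URL)            # new TCP connection every time
    t1 = time.time()

    sess = requests.Session()        # keep-alive: the connection is reused
    for _ in range(N):
        sess.get(URL)
    t2 = time.time()

    print 'without session: %.2fs   with session: %.2fs' % (t1 - t0, t2 - t1)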

    # -*- coding: utf-8 -*-
    """
    2017年8月17日星期四
    下午 17:22
    @author: willowj
    """
    import sys
    stdout, stdin, stderr = sys.stdout, sys.stdin, sys.stderr
    reload(sys)  # reload to restore setdefaultencoding (Python 2), then put the std streams back
    sys.stdout, sys.stdin, sys.stderr = stdout, stdin, stderr
    sys.setdefaultencoding('utf8')
    
    import requests
    #from requests.cookies import (
    #    cookiejar_from_dict, extract_cookies_to_jar, RequestsCookieJar, merge_cookies)
    
    class SessionFopener(object):
        """requests 封装的网页打开器
        param: headers 默认使用类属性,实例化的时候自己可以传入
               cookie_dic 默认禁用
               proxies 默认无
        """
        headers = {
            'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding':'gzip, deflate, sdch',
            'Accept-Language':'zh-CN,zh;q=0.8,en;q=0.6',
            'Cache-Control':'max-age=0',
            'Connection':'keep-alive',
            #'Cookie':'q'
            #'Host':'www.zhihu.com',
            'Upgrade-Insecure-Requests':'1',
            'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36',
            }
    
        def __init__(self, headers=None, cookie_dic=None, proxies=None):
        
            self.req_s = requests.Session()
            self.req_s.adapters.DEFAULT_RETRIES = 3
            self.req_s.keep_alive = True 
    
            if headers:
                self.req_s.headers = headers 
            else:     
                self.req_s.headers = self.headers
    
            if not cookie_dic:
                cookie_dic = {}
            self.req_s.cookies = requests.cookies.cookiejar_from_dict(cookie_dic)
    
            if proxies:
                self.req_s.proxies = proxies
    
        def close(self):
            self.req_s.close()
    
        def get(self, *arg, **karg):
            return self.req_s.get(*arg, **karg)
    
        def post(self, *arg, **karg):
            return self.req_s.post(*arg, **karg)
    
        def set_cookiejar(self, cookie_dic={}):
            self.req_s.cookies = requests.cookies.cookiejar_from_dict(cookie_dic)
    
        def add_cookiejar(self, cookie_dic):
            self.req_s.cookies = requests.cookies.merge_cookies(self.req_s.cookies, cookie_dic)
    
    
        def set_headers(self, headers={}):
            self.req_s.headers = headers
    
        def add_headers(self, headers_dic):
            for k, v in headers_dic.items():
                self.req_s.headers[k] = v
    
    
        def set_proxies(self, proxies):
            self.req_s.proxies = proxies
    
    
        @classmethod    
        def cookiejar_from_dict(cls, cookie_dic):
            return requests.cookies.cookiejar_from_dict(cookie_dic)
    
        def __enter__(self):
            print 'enter'
            return self
    
        def __exit__(self, *used):
            self.req_s.close()
            del self.req_s
            print 'exit'
    
    
    if __name__ == '__main__':
        with  SessionFopener() as req_o:
            res_p = req_o.get('http://httpbin.org/get')
            
        print res_p.json()  

    Example: crawling shop reviews from Dianping (大众点评):

    Only the parsing nodes and the paging-URL pattern need to be overridden in the subclass.

    External links are not handled yet.

    # -*- coding: utf-8 -*-
    """
    Created
    
    2017年8月17日星期四
    下午 19:33
    
    @author: Administrator
    """
    import sys
    stdout, stdin, stderr = sys.stdout, sys.stdin, sys.stderr
    reload(sys)
    sys.stdout, sys.stdin, sys.stderr = stdout, stdin, stderr
    sys.setdefaultencoding('utf8')  
    
    sys.path.append(ur'D:python_pymy_scrapy')
    from scrapy_tools.web_opener import SessionFopener
    from scrapy_tools.itemparse import ItemParse
    
    class DzdpItemParse(ItemParse):
        """广州酒家(文昌店)的点评
        docstring for zhihu_topi"""
        name = u'DzdpItemParse广州酒家'
    
        base_url = 'https://www.dianping.com/shop/516983/review_more'
        pageN_x = ".//a[@class='PageLink'][last()]/text()"
        new_urls_x = None
    
        # XPath for the node that wraps one review, and the fields of each review below
        items_node_x = './/div[@class="comment-list"]/ul/li'
        # note: each field is searched inside a single review node, so its XPath starts with '.'
        item_xs = dict(
            user_id = '''.//*[@class="J_card"]/@user-id''',
            #question_href = '''.//a[@class='question_link']/@href''' ,

            comm_per = """.//span[@class='comm-per']/text()""",
            total_mark = """.//*[@class="user-info"]/span[1]/@class""",
            taste = """.//*[@class="comment-rst"]/span[1]/text()""",
            environment = """.//*[@class="comment-rst"]/span[2]/text()""",
            service = """.//*[@class="comment-rst"]/span[3]/text()""",

            comments_agree = '''.//span[@class="heart-num"]/text()''',
            comment_text = """.//*[@class="J_brief-cont"]/text()""",
            comment_date = '''.//*[@class="time"]/text()''',
            recommend_food =
            u'''.//*[@class="comment-recommend"
            and (contains(text(), "推荐")
            or contains(text(), "喜欢"))]
            [1]/a/text()'''
            # Chinese text inside an XPath must be a quoted unicode literal
            #re:
            #z = re.compile('.')
            )
    
        
        def getnextpages(self):
            # paging rule for this site; return an empty list if there is only one page
            if self.pageN > 1:
                urls = [self.base_url + '?pageno=%s' % n
                        for n in range(self.pageN, 1, -1)
                        ]
                return urls
            return []
    
    
    open_s = SessionFopener()   # create an opener
    respon_ = open_s.get(DzdpItemParse.base_url)  # fetch the start page
    gzjj_item = DzdpItemParse(respon_.text)  # build the parser from the start page's html

    # synchronous mode: plain method
    gzjj_item.get_nextpages(open_s, sleep_sec=None)

    # async mode:
    #gzjj_item.get_nextpages_by_gevent(SessionFopener)  # async method on the instance

    Result: opening a single page originally took about 0.5279 s; with four greenlets, 613 pages were crawled in 77.71 s, roughly 0.13 s per page, about a 4x speedup.

    200 https://www.dianping.com/shop/516983/review_more?pageno=600
    1503074965.07
    <Greenlet at 0x9c44620: <bound method DzdpItemParse.__gevent_get_nextpages of <__main__.DzdpItemParse object at 0x094BF470>>(<scrapy_tools.web_opener.SessionFopener object at )>
    200 https://www.dianping.com/shop/516983/review_more?pageno=602
    1503074965.1
    <Greenlet at 0x9c445d0: <bound method DzdpItemParse.__gevent_get_nextpages of <__main__.DzdpItemParse object at 0x094BF470>>(<scrapy_tools.web_opener.SessionFopener object at )>
    200 https://www.dianping.com/shop/516983/review_more?pageno=601
    1503074965.14
    <Greenlet at 0x9c44670: <bound method DzdpItemParse.__gevent_get_nextpages of <__main__.DzdpItemParse object at 0x094BF470>>(<scrapy_tools.web_opener.SessionFopener object at )>
    200 https://www.dianping.com/shop/516983/review_more?pageno=604
    1503074965.54
    <Greenlet at 0x9c44440: <bound method DzdpItemParse.__gevent_get_nextpages of <__main__.DzdpItemParse object at 0x094BF470>>(<scrapy_tools.web_opener.SessionFopener object at )>
    200 https://www.dianping.com/shop/516983/review_more?pageno=607
    1503074965.59
    <Greenlet at 0x9c44670: <bound method DzdpItemParse.__gevent_get_nextpages of <__main__.DzdpItemParse object at 0x094BF470>>(<scrapy_tools.web_opener.SessionFopener object at )>
    200 https://www.dianping.com/shop/516983/review_more?pageno=605
    1503074965.64
    <Greenlet at 0x9c44620: <bound method DzdpItemParse.__gevent_get_nextpages of <__main__.DzdpItemParse object at 0x094BF470>>(<scrapy_tools.web_opener.SessionFopener object at )>
    200 https://www.dianping.com/shop/516983/review_more?pageno=606
    1503074965.67
    <Greenlet at 0x9c445d0: <bound method DzdpItemParse.__gevent_get_nextpages of <__main__.DzdpItemParse object at 0x094BF470>>(<scrapy_tools.web_opener.SessionFopener object at )>
    200 https://www.dianping.com/shop/516983/review_more?pageno=611
    1503074966.1
    <Greenlet at 0x9c445d0: <bound method DzdpItemParse.__gevent_get_nextpages of <__main__.DzdpItemParse object at 0x094BF470>>(<scrapy_tools.web_opener.SessionFopener object at )>
    200 https://www.dianping.com/shop/516983/review_more?pageno=609
    1503074966.15
    <Greenlet at 0x9c44670: <bound method DzdpItemParse.__gevent_get_nextpages of <__main__.DzdpItemParse object at 0x094BF470>>(<scrapy_tools.web_opener.SessionFopener object at )>
    200 https://www.dianping.com/shop/516983/review_more?pageno=610
    1503074966.18
    <Greenlet at 0x9c44620: <bound method DzdpItemParse.__gevent_get_nextpages of <__main__.DzdpItemParse object at 0x094BF470>>(<scrapy_tools.web_opener.SessionFopener object at )>
    200 https://www.dianping.com/shop/516983/review_more?pageno=608
    1503074966.22
    <Greenlet at 0x9c44440: <bound method DzdpItemParse.__gevent_get_nextpages of <__main__.DzdpItemParse object at 0x094BF470>>(<scrapy_tools.web_opener.SessionFopener object at )>
    200 https://www.dianping.com/shop/516983/review_more?pageno=612
    1503074966.7
    200 https://www.dianping.com/shop/516983/review_more?pageno=614
    1503074966.74
    200 https://www.dianping.com/shop/516983/review_more?pageno=615
    1503074967.05
    200 https://www.dianping.com/shop/516983/review_more?pageno=613
    1503074967.09
    77.7100000381
    DzdpItemParse广州酒家20170819_00_49.xlsx
    pd ok
    saved to DzdpItemParse广州酒家20170819_00_49.xlsx

    For distributed or multi-process crawling, or for loading results into a database, a dedicated scheduler and a data-access module would still have to be written separately; a rough sketch of the storage side is shown below.
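    As an illustration only (not part of the project), results shaped like ItemParse.results (a list of dicts) could be pushed into SQLite with the standard library; the table name and schema below are made up:

    # illustrative sketch: store a list of dicts (like ItemParse.results) in SQLite
    import json
    import sqlite3

    def save_results_to_sqlite(results, db_path='crawl.db', table='reviews'):
        conn = sqlite3.connect(db_path)
        try:
            conn.execute(
                'CREATE TABLE IF NOT EXISTS %s (id INTEGER PRIMARY KEY, item TEXT)' % table)
            # each record is stored as one JSON blob; a real schema would use one column per field
            conn.executemany(
                'INSERT INTO %s (item) VALUES (?)' % table,
                [(json.dumps(r),) for r in results])
            conn.commit()
        finally:
            conn.close()

    # usage: save_results_to_sqlite(gzjj_item.results)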
