• scrapy在重复爬取的时候删除掉之前爬的旧数据,在爬虫结束的时候收集统计信息


    问题:想在启动scrapy后重复爬取某一天的数据,但是爬取之前需要删除掉之前的旧数据,在哪里实现删除呢?

    可以在pipeline的open_spider(self,spider)中删除,则在爬虫启动的时候会删除。

    以下是pipelines.py 文件

    # -*- coding: utf-8 -*-
    import sys
    sys.path.append("/apps/jr_python/riskspiders")
    from riskspiders.utils import DButil
    from riskspiders.settings import DATABASE_PRM
    import logging
    import hashlib
    logger = logging.getLogger(__name__)
    # Define your item pipelines here
    #
    # Don't forget to add your pipeline to the ITEM_PIPELINES setting
    # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
    
    
    class RiskspidersPipeline(object):
        """Default no-op pipeline: hands every item through unchanged."""

        # Shared database handle, created once at class-definition time —
        # i.e. the connection is opened as soon as this module is imported.
        db = DButil(DATABASE_PRM)

        def process_item(self, item, spider):
            """Return the item untouched so later pipelines can see it."""
            return item
    
    class RiskspidersMySQLPipeline(object):
        """Persist scraped items to MySQL, purging stale rows on start.

        ``open_spider`` deletes previously crawled rows for each target day so
        that re-crawling the same day replaces the old data instead of piling
        up duplicates; ``close_spider`` releases the connection and dumps the
        crawl statistics collected so far.
        """

        def open_spider(self, spider):
            """Open the DB connection and delete old rows for each target day.

            Runs once when the spider starts — this is where the "delete the
            previous crawl's data before re-crawling a day" requirement lives.
            Assumes the spider defines ``day_list`` (iterable of day strings).
            """
            print("open_spider, %s" % spider.name)
            self.db = DButil(DATABASE_PRM)
            # Parameterized query instead of str.format interpolation:
            # avoids SQL injection and quoting bugs (DButil.execute already
            # accepts (sql, params), as the insert paths below show).
            sql_del = "delete from riskinfo where spider = %s and release_time = %s;"
            for day in spider.day_list:
                try:
                    self.db.execute(sql_del, (spider.name, day))
                except Exception as e:
                    # Best-effort cleanup: log and keep going so one bad day
                    # does not abort the whole crawl.
                    print(e)
                    logger.error("delete of old rows failed for %s/%s: %s",
                                 spider.name, day, e)

        def close_spider(self, spider):
            """Close the DB connection and print the crawl statistics."""
            self.db.close()
            # get_stats() returns the whole stats dict; get_value() requires a
            # key argument and would raise TypeError here.  Note that entries
            # such as finish_time are not yet set because the crawl has not
            # fully finished when this hook runs.
            print(spider.crawler.stats.get_stats())

        def process_item(self, item, spider):
            """Route one item to the matching upsert and pass it downstream.

            Returning the item is required by the pipeline contract so any
            later pipelines still receive it (the original returned None).
            """
            # NOTE(review): a fresh connection per item is expensive —
            # consider reusing self.db opened in open_spider.
            db = DButil(DATABASE_PRM)
            try:
                if spider.name == 'hexun_bankdata':
                    self._insert_bankdata(db, item)
                else:
                    self._insert_riskinfo(db, item)
            finally:
                db.close()
            return item

        def _insert_bankdata(self, db, item):
            """Upsert one hexun_bankdata row (updates cra_time on duplicate)."""
            logger.info('***** item_bank insert MySQL')
            pa = (
                item["source"], item["spider"], item['website_menu'],
                item["disclosure_period"], item["bank_abbreviation"],
                item["total_assets"], item["capital_adequancy_ratio"],
                item["core_capital_adequancy_ratio"], item["bad_loan_ratio"],
                item["provision_coverage"], item["url"],
                # cra_time appears twice: once for the insert columns, once
                # for the ON DUPLICATE KEY UPDATE clause.
                item["cra_time"], item["cra_time"])
            # The original split this assignment across two statements
            # (`sql_data =` on its own line), which is a SyntaxError.
            sql_data = """insert into hexun_bankdata(source,spider,website_menu,disclosure_period, bank_abbreviation,total_assets,capital_adequancy_ratio,core_capital_adequancy_ratio,bad_loan_ratio,provision_coverage,url,cra_time) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)  on duplicate key update  cra_time = %s;"""
            try:
                db.execute(sql_data, pa)
            except Exception as e:
                print(e)
                logger.error(e)

        def _insert_riskinfo(self, db, item):
            """Upsert one riskinfo row, deduplicated by md5(title + content)."""
            str1 = '%s%s' % (item['title'], item['content'])
            md = hashlib.md5()
            # md5 requires bytes on Python 3; the original passed str,
            # which raises TypeError.
            md.update(str1.encode('utf-8'))
            md_value = md.hexdigest()
            logger.info('***** item_bank insert MySQL')
            params = (
                item['source'], item['spider'], item['website_menu'],
                item['release_time'], item['key_words'], item['neg_key_words'],
                item['title'].strip(), item['source_type'], item['f_name'],
                item['is_include_tbl'], item['content'].strip(),
                item['content_web'], item['url'], item['father_url'],
                item['cra_time'], md_value, item['cra_time']
            )
            try:
                db.execute(
                    """
                    insert into riskinfo
                    (source, spider,website_menu, release_time, key_words,neg_key_words, title, source_type,f_name, is_include_tbl,content,content_web, url,father_url,cra_time,content_id)
                    values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) on duplicate key update  cra_time = %s;
                    """, params
                )
            except Exception as e:
                print(e)
                logger.error(e)
  • 相关阅读:
    oracle lpad函数和rpad函数
    ORACLE SUBSTR()函数应用-截取字符函数
    oracle常用数值函数
    Oracle 分析函数row_number() over (partition by order by )
    oracle中decode函数用法
    oracle数据字典信息整理
    python学习遇到的英文词汇
    读书随想
    常用css列表
    爬虫趣事
  • 原文地址:https://www.cnblogs.com/yoyowin/p/12521207.html
Copyright © 2020-2023  润新知