• 爬取知名社区技术文章_pipelines_4


    获取字段的存储处理和获取图片的下载路径

    #!/usr/bin/python3
    # -*- coding: utf-8 -*-
    
    import pymysql
    import gevent
    import pymysql
    from gevent import monkey
    from scrapy.pipelines.images import ImagesPipeline
    import pymysql.cursors
    
    
    class JobboleImagerPipeline(ImagesPipeline):
        """Image pipeline that records the storage path of downloaded images.

        After the image requests for an item have finished, the path of the
        stored image is copied into ``item['img_path']``.
        """

        def item_completed(self, results, item, info):
            """Copy the stored image path from ``results`` into the item.

            Args:
                results: iterable of ``(success, value)`` 2-tuples. For a
                    successful download ``value`` is a dict that contains a
                    ``'path'`` key; for a failed one it is a failure object
                    that cannot be subscripted.
                item: the scraped item; only touched when it has ``'img_url'``.
                info: pipeline bookkeeping object (unused here).

            Returns:
                The same ``item``. ``item['img_path']`` is set to the path of
                the last successfully downloaded image, and is left unset when
                every download failed.
            """
            if 'img_url' in item:
                for ok, value in results:
                    # A failed download carries a failure object, not a dict;
                    # subscripting it would raise and drop the whole item.
                    if not ok:
                        continue
                    item['img_path'] = value['path']
            return item
            
    
    # class SqlSave(object):
    #     """常规同步方式存入数据库"""
    #     def __init__(self):
    #         SQL_DBA = {
    #             'host': 'localhost',
    #             'db': 'jobole',
    #             'user': 'root',
    #             'password': 'password',
    #             'use_unicode': True,
    #             'charset': 'utf8'
    #         }
    #         self.conn = pymysql.connect(**SQL_DBA)
    #         self.cursor = self.conn.cursor()
    #
    #     def process_item(self, item, spider):
    #         sql = self.get_sql(item)
    #         print(sql)
    #         self.cursor.execute(sql)
    #         self.conn.commit()
    #
    #         return item
    #
    #     def get_sql(self, item):
    #         sql = """insert into article(cont_id, cont_url, title, publish_time, cont, img_url, img_path, like_num, collection_num, comment_num) value ('%s','%s','%s','%s','%s','%s','%s', %d, %d, %d)
    #         """ % (item['cont_id'], item['cont_url'],item['title'],item['publish_time'],item['cont'],item['img_url'][0],item['img_path'],item['link_num'],item['collection_num'],item['comment_num'],)
    #         return sql
    
    
    class SqlSave(object):
        """Item pipeline that inserts scraped articles into MySQL.

        Each item is written through a gevent greenlet. One connection and
        one cursor are opened when the pipeline is created and released in
        ``close_spider``.
        """

        def __init__(self):
            """Open the MySQL connection and cursor used for every insert."""
            # The connection parameters could instead live in settings.py and
            # be injected through a ``from_settings`` classmethod, e.g.:
            #
            #     @classmethod
            #     def from_settings(cls, settings):
            #         return cls(settings['SQL_DBA'])
            #
            # which would require ``__init__`` to accept that extra argument.
            SQL_DBA = {
                'host': 'localhost',
                'db': 'jobole',
                'user': 'root',
                'password': 'password',
                'use_unicode': True,
                'charset': 'utf8'
            }
            self.conn = pymysql.connect(**SQL_DBA)
            self.cursor = self.conn.cursor()

        def process_item(self, item, spider):
            """Insert ``item`` into the ``article`` table and return it.

            Args:
                item: scraped item providing the fields listed in ``__go_sql``.
                spider: the spider that produced the item (unused here).

            Returns:
                The unchanged ``item``.
            """
            sql = self.__get_sql(item)
            # joinall() waits for the spawned greenlet, so the insert has
            # finished (or failed) before this method returns.
            # NOTE(review): no monkey patching is visible in this file, so the
            # database call presumably still blocks -- confirm.
            gevent.joinall([
                gevent.spawn(self.__go_sql, self.cursor, self.conn, sql, item),
            ])
            return item

        def close_spider(self, spider):
            """Release the cursor and the connection opened in ``__init__``.

            Args:
                spider: the spider being closed (unused here).
            """
            try:
                self.cursor.close()
            finally:
                # Close the connection even if closing the cursor failed.
                self.conn.close()

        def __go_sql(self, cursor, conn, sql, item):
            """Execute the parameterized insert for ``item`` and commit it.

            Any error (missing item field, database error) is printed rather
            than raised, so one bad item does not stop the crawl; the
            transaction is rolled back so the connection is left clean.

            Args:
                cursor: database cursor used to execute the statement.
                conn: connection that owns ``cursor``; committed on success.
                sql: insert statement with ten ``%s`` placeholders.
                item: scraped item supplying the ten values.
            """
            try:
                # NOTE(review): the item key is 'link_num' while the target
                # column is 'like_num' -- confirm against the item definition.
                cursor.execute(sql,
                               (item['cont_id'], item['cont_url'], item['title'], item['publish_time'],
                                item['cont'], item['img_url'][0], item['img_path'], item['link_num'],
                                item['collection_num'], item['comment_num']))
                conn.commit()
            except Exception as e:
                print(e)
                # Discard the failed statement; rollback is guarded so the
                # best-effort behaviour is kept even if it fails too.
                try:
                    conn.rollback()
                except Exception as rollback_error:
                    print(rollback_error)

        def __get_sql(self, item):
            """Return the parameterized insert statement for the article table.

            Args:
                item: accepted for symmetry with the caller; not used.

            Returns:
                The SQL string with ten ``%s`` placeholders.
            """
            sql = """insert into
                      article(cont_id, cont_url, title, publish_time,
                      cont, img_url, img_path, like_num,
                      collection_num, comment_num)
                    value
                      (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
            return sql
    

      

  • 相关阅读:
    【ceph | 运维】部署osd
    【osd | 运维】osd数据均衡
    leveldb——leveldb入门篇之Linux下编译配置和使用
    【Linux系统编程】预分配磁盘空间
    【filestore】源码剖析
    【Linux】Linux Page Cache的理解
    ceph internal 之 底层对象
    【Linux】磁盘基础知识
    Spring Cloud Alibaba学习08Seata基本使用
    Spring Cloud Alibaba学习05Sentinel基本使用
  • 原文地址:https://www.cnblogs.com/2bjiujiu/p/7233321.html
Copyright © 2020-2023  润新知