from twisted.enterprise import adbapi


class MySQLAsyncPipeline:
    def open_spider(self, spider):
        # Read connection parameters from the Scrapy settings, with defaults.
        db = spider.settings.get('MYSQL_DB_NAME', 'scrapy_default')
        host = spider.settings.get('MYSQL_HOST', 'localhost')
        port = spider.settings.get('MYSQL_PORT', 3306)
        user = spider.settings.get('MYSQL_USER', 'root')
        passwd = spider.settings.get('MYSQL_PASSWORD', 'root')
        # adbapi wraps the blocking MySQLdb driver in a Twisted thread pool,
        # so inserts run off the reactor thread and do not block the crawl.
        self.dbpool = adbapi.ConnectionPool('MySQLdb', host=host, db=db,
                                            user=user, passwd=passwd,
                                            port=port, charset='utf8')

    def close_spider(self, spider):
        self.dbpool.close()

    def process_item(self, item, spider):
        # runInteraction schedules insert_db in a worker thread and passes it
        # a transaction object; each interaction is committed on success.
        self.dbpool.runInteraction(self.insert_db, item)
        return item

    def insert_db(self, tx, item):
        # Assumes a two-column books table matching the f1 and f2 fields.
        values = (
            item['f1'],
            item['f2'],
        )
        sql = 'INSERT INTO books VALUES (%s,%s)'
        tx.execute(sql, values)
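To activate the pipeline, register it in the project's settings.py and supply the MYSQL_* options it reads above. A minimal sketch, assuming the pipeline lives in a project named myproject (the module path and the priority number 300 are placeholders):

# settings.py (sketch; 'myproject' is a placeholder project name)
ITEM_PIPELINES = {
    'myproject.pipelines.MySQLAsyncPipeline': 300,
}
MYSQL_DB_NAME = 'scrapy_default'
MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'root'

The INSERT statement also assumes the books table already exists in that database with exactly two columns corresponding to f1 and f2.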
# Store items in Redis
import redis
from scrapy import Item


class RedisPipeline:
    def open_spider(self, spider):
        # Read connection parameters from the Scrapy settings, with defaults.
        db_host = spider.settings.get('REDIS_HOST', 'localhost')
        db_port = spider.settings.get('REDIS_PORT', 6379)
        db_index = spider.settings.get('REDIS_DB_INDEX', 0)
        self.db_conn = redis.StrictRedis(host=db_host, port=db_port,
                                         db=db_index)
        # Counter used to build a unique key for each stored item.
        self.item_i = 0

    def close_spider(self, spider):
        self.db_conn.connection_pool.disconnect()

    def process_item(self, item, spider):
        self.insert_db(item)
        return item

    def insert_db(self, item):
        # Redis hashes take plain dicts, so convert Scrapy Item objects first.
        if isinstance(item, Item):
            item = dict(item)
        self.item_i += 1
        # hmset is deprecated in redis-py 3.5+; hset with mapping= replaces it.
        self.db_conn.hset('book:%s' % self.item_i, mapping=item)
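As with the MySQL pipeline, this one is enabled through settings.py. A minimal sketch under the same placeholder project name:

# settings.py (sketch; 'myproject' is a placeholder project name)
ITEM_PIPELINES = {
    'myproject.pipelines.RedisPipeline': 400,
}
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
REDIS_DB_INDEX = 0

Each item is then stored as a hash under the keys book:1, book:2, and so on; running HGETALL book:1 in redis-cli shows the first stored item. Note that the counter restarts at zero on every run, so a second crawl overwrites the keys written by the first.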