import scrapy

from qiubaiPro.items import QiubaiproItem


class QiubaiSpider(scrapy.Spider):
    """Spider that scrapes joke author/content pairs from qiushibaike.com/text."""

    name = 'qiubai'
    # allowed_domains = ['www.qiushibaike.com/text']
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        # xpath parsing is recommended here because the framework integrates
        # an xpath interface directly on the response object.
        div_list = response.xpath('//div[@id="content-left"]/div')
        for div in div_list:
            # extract() returns a list of strings (index [0] for the first);
            # extract_first() is equivalent to extract()[0] but returns None
            # instead of raising IndexError when nothing matches.
            # author = div.xpath('./div/a[2]/h2/text()').extract()[0]
            author = div.xpath('./div/a[2]/h2/text()').extract_first()
            content = div.xpath('.//div[@class="content"]/span/text()').extract_first()
            # 1. Store the parsed values (author and content) on an item;
            #    the fields must be declared on QiubaiproItem.
            item = QiubaiproItem()
            item['author'] = author
            item['content'] = content
            # 2. Hand the item to the pipeline (see the pipelines module).
            yield item
# Pipeline-based storage: persists items to a local text file.
class QiubaiproPipeline(object):
    # File handle, opened once per crawl in open_spider.
    fp = None

    def open_spider(self, spider):
        # Called exactly once, when the spider starts.
        print('开始爬虫')
        self.fp = open('./qiubai_pipe.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Persist one item handed over by the spider.

        Called once for every item the spider yields. Returns the item so
        any later pipelines still receive it.
        """
        # extract_first() in the spider may produce None when an xpath does
        # not match; fall back to '' so the concatenation below cannot
        # raise TypeError.
        author = item['author'] or ''
        content = item['content'] or ''
        self.fp.write(author + ':' + content + ' ')
        return item

    def close_spider(self, spider):
        # Called exactly once, when the spider finishes. Guard against a
        # crawl where open_spider never ran (fp still None).
        print('爬虫结束')
        if self.fp is not None:
            self.fp.close()
# Pipeline that persists items to a MySQL database.
import pymysql


class QiubaiproPipeline(object):
    # Connection and cursor, created once per crawl in open_spider.
    conn = None
    cursor = None

    def open_spider(self, spider):
        # Called once at spider start: open the connection and a single
        # reusable cursor (the database 'qiubai' and its table must exist).
        print('开始爬虫')
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root',
                                    password='123', db='qiubai')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert one item into the qiubai table and pass the item on."""
        # Parameterized query: the scraped text is untrusted input, so the
        # SQL string must never be built with %-interpolation (SQL
        # injection and quote-breakage); the driver escapes the values.
        sql = 'insert into qiubai values(%s,%s)'
        try:
            self.cursor.execute(sql, (item['author'], item['content']))
            self.conn.commit()
        except Exception as e:
            # Best-effort storage: log the error and undo the failed insert,
            # but keep the crawl running.
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        # Called once at spider end. Guard against a crawl where
        # open_spider never ran (cursor/conn still None).
        print('爬虫结束')
        if self.cursor is not None:
            self.cursor.close()  # close the cursor object
        if self.conn is not None:
            self.conn.close()  # close the connection object
# Pipeline that persists items to a Redis list.
import json

import redis


class QiubaiproPipeline(object):
    # Redis connection, created once per crawl in open_spider.
    conn = None

    def open_spider(self, spider):
        # Called once at spider start: connect to the local Redis server.
        print('开始爬虫')
        self.conn = redis.Redis(host='127.0.0.1', port=6379)

    def process_item(self, item, spider):
        """Push one item onto the Redis list named 'data' and pass it on."""
        # redis-py can only store bytes/str/int/float — pushing a raw dict
        # raises DataError — so the record is serialized to JSON first.
        # (Avoid naming the local 'dict': it shadows the builtin.)
        record = {
            'author': item['author'],
            'content': item['content'],
        }
        self.conn.lpush('data', json.dumps(record, ensure_ascii=False))
        return item