# -*- coding: utf-8 -*-
import scrapy

from daomu.items import DaomuItem


def _format_chapter_title(link_text):
    """Return ``<second token>:<last token>`` of a chapter link text.

    The text is split on single spaces. Text that is missing, empty, or has
    fewer than two tokens is returned unchanged instead of raising.
    NOTE(review): presumably the link text looks like
    "<book> <chapter number> <chapter name>" -- confirm against the site.
    """
    if not link_text:
        return link_text
    tokens = link_text.split(' ')
    if len(tokens) < 2:
        return link_text
    return tokens[1] + ':' + tokens[-1]


class DaomuspiderSpider(scrapy.Spider):
    """Spider that walks index page -> book pages -> chapter pages.

    One ``DaomuItem`` is emitted per chapter, carrying the book fields
    (``book_title``, ``book_info``, ``book_url``), the chapter fields
    (``chapter_title``, ``chapter_url``) and the chapter ``content``.
    """

    name = "daomuspider"
    # allowed_domains = ["www.daomubiji.com"]
    start_urls = ['http://www.daomubiji.com/']
    index_url = 'http://www.daomubiji.com/'

    def start_requests(self):
        """Start the crawl at ``index_url``, handled by ``parse_book``."""
        yield scrapy.Request(url=self.index_url, callback=self.parse_book)

    def parse_book(self, response):
        """Request every link found inside ``.article-content``.

        Each followed page is handled by ``parse_chapter``.
        """
        for href in response.css('.article-content a::attr(href)').extract():
            if not href:
                continue
            # urljoin leaves absolute URLs untouched and resolves relative
            # ones, which scrapy.Request would otherwise reject.
            yield scrapy.Request(url=response.urljoin(href),
                                 callback=self.parse_chapter)

    def parse_chapter(self, response):
        """Request every chapter listed on a book page.

        A partially filled item travels with each request in ``meta`` and
        is completed and emitted by ``parse_detail``.
        """
        book_title = response.css(
            '.focusbox .container h1::text').extract_first()
        book_info = response.css(
            '.focusbox .container .focusbox-text::text').extract_first()
        book_url = response.url

        for chapter in response.css('.excerpts-wrapper .excerpts .excerpt'):
            chapter_href = chapter.css('a::attr(href)').extract_first()
            if not chapter_href:
                # Without a link there is nothing to fetch for this entry.
                continue
            chapter_url = response.urljoin(chapter_href)
            link_text = chapter.css('a::text').extract_first()

            # A fresh item per chapter: requests run asynchronously, so a
            # single shared item would be overwritten by later iterations
            # before parse_detail gets to read it.
            item = DaomuItem()
            item['book_title'] = book_title
            item['book_info'] = book_info
            item['book_url'] = book_url
            item['chapter_title'] = _format_chapter_title(link_text)
            item['chapter_url'] = chapter_url

            # Key point: the item is handed to the next callback via meta.
            # It is emitted only once, from parse_detail, after the content
            # has been added.
            yield scrapy.Request(url=chapter_url,
                                 callback=self.parse_detail,
                                 meta={'item': item})

    def parse_detail(self, response):
        """Add the chapter text to the item from ``meta`` and emit it.

        ``content`` is the list of text nodes of all ``.article-content p``
        elements.
        """
        item = response.meta['item']
        item['content'] = response.css('.article-content p::text').extract()
        yield item
import pymongo


class DaomuPipeline(object):
    """Item pipeline that stores every scraped item in MongoDB.

    Each item is written to a collection named after the item's class.
    """

    def __init__(self, mongo_uri='localhost', mongo_db='daomu'):
        """Remember the connection settings; no connection is opened yet.

        :param mongo_uri: host or URI passed to ``pymongo.MongoClient``.
        :param mongo_db: name of the database to write to.
        """
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.client = None
        self.db = None

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler settings.

        Reads ``MONGO_URI`` and ``MONGO_DB`` and falls back to the
        constructor defaults when a setting is absent.
        """
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI', 'localhost'),
            mongo_db=crawler.settings.get('MONGO_DB', 'daomu'),
        )

    def open_spider(self, spider):
        """Create the MongoDB client and select the target database."""
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        """Insert ``item`` into the collection named after its class.

        :returns: the unchanged item, so later pipelines still receive it.
        """
        collection_name = item.__class__.__name__
        # The item must be converted to a plain dict before insertion.
        # insert_one replaces Collection.insert, which PyMongo deprecated
        # in 3.0 and removed in 4.0.
        self.db[collection_name].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        """Close the MongoDB client if one was opened."""
        if self.client is not None:
            self.client.close()