Scrapy selector usage

The .selector attribute is optional: response.xpath() and response.css() are shortcuts for response.selector.xpath() and response.selector.css().

    response.selector.xpath("//title/text()").extract_first()
    response.selector.css("title::text").extract_first()
    response.xpath("//title/text()").extract_first()

    # when chaining selectors, use a relative XPath (".//"), otherwise "//" searches the whole document again
    response.xpath("//div[@id='images']").xpath(".//img/@src").extract()[0]

    # data that needs extra processing can be post-processed with .re()
    response.css('a::text').re('Name:(.*)')

    # .re() returns a list of strings instead of a SelectorList, so extract() can no longer be called on the result
    response.xpath("//a/text()").re('Name:(.*)')

Spider usage

A spider holds the crawling logic (which URLs to request) and the parsing of the downloaded pages (parse).

1. Flow: the initial start_urls are downloaded and parsed by parse(); parse() yields items and new scrapy.Request(url, callback=...) objects, and those requests are handed back to the downloader.
2. name --- uniquely identifies the spider.
3. allowed_domains --- list of domains the spider is allowed to crawl.
4. start_urls --- crawling starts here; a list of URLs that are requested one by one, as GET requests.
5. custom_settings --- a dict (written the same way as the project settings, e.g. the headers settings) that overrides the project settings; for example, the request headers can be defined inside the spider itself (see the sketch after this list).
6. crawler --- the Crawler object the spider is bound to.
7. settings --- the settings the spider is run with.
8. from_crawler() --- class method that gives access to the global configuration (crawler and settings).
9. start_requests() --- override it to make the first request a POST instead of a GET:

    def start_requests(self):
        yield scrapy.Request(url='http://httpbin.org/post', method='POST', callback=self.parse_post)

    def parse_post(self, response):
        print('OK', response.status)

10. make_requests_from_url() --- override it to change the default callback (it must return a single Request); note that it is never called if start_requests() is overridden:

    def make_requests_from_url(self, url):
        return scrapy.Request(url=url, callback=self.parse_index)

    def parse_index(self, response):
        print('OK', response.status)

11. parse() --- the default callback; returns an iterable of items and/or requests (the requests are added to the download queue).
12. logger --- the spider's log output (info, debug, ...): self.logger.info(response.status).
13. closed() --- called when the spider is closed.
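Tying several of these attributes together, here is a minimal sketch of a spider that uses custom_settings, the spider logger and closed(); the spider name, domain and header values are made-up examples:

    import scrapy

    class MySpider(scrapy.Spider):
        name = 'myspider'                       # 2. unique spider name (example)
        allowed_domains = ['httpbin.org']       # 3. allowed domains (example)
        start_urls = ['http://httpbin.org/get']

        # 5. per-spider settings that override the project settings,
        #    e.g. default request headers (values are just illustrations)
        custom_settings = {
            'DEFAULT_REQUEST_HEADERS': {
                'Accept-Language': 'en',
                'Referer': 'http://httpbin.org/',
            },
        }

        def parse(self, response):
            # 12. spider-level logger
            self.logger.info('got %s with status %s', response.url, response.status)
            yield {'url': response.url}

        def closed(self, reason):
            # 13. called when the spider is closed
            self.logger.info('spider closed: %s', reason)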
Item Pipeline usage

Typical uses of an item pipeline: cleaning data, checking for duplicates, and storing items in a database.

1. process_item() --- every item is passed to it automatically.
2. open_spider() --- called when the spider is opened.
3. close_spider() --- called when the spider is closed.
4. from_crawler() --- class method, used to get the project settings.

Standard example (drop items that have no price, adjust prices for VAT):

    from scrapy.exceptions import DropItem

    class PricePipeline(object):

        vat_factor = 1.15

        def process_item(self, item, spider):
            if item.get('price'):
                if item.get('price_excludes_vat'):
                    item['price'] = item['price'] * self.vat_factor
                return item
            else:
                raise DropItem("Missing price in %s" % item)

Write items to a JSON Lines file:

    import json

    class JsonWriterPipeline(object):

        def open_spider(self, spider):
            self.file = open('items.jl', 'w')

        def close_spider(self, spider):
            self.file.close()

        def process_item(self, item, spider):
            line = json.dumps(dict(item)) + "\n"
            self.file.write(line)
            return item

Store items in MongoDB:

    import pymongo

    class MongoPipeline(object):

        collection_name = 'scrapy_items'

        def __init__(self, mongo_uri, mongo_db):
            self.mongo_uri = mongo_uri
            self.mongo_db = mongo_db

        @classmethod
        def from_crawler(cls, crawler):
            return cls(
                mongo_uri=crawler.settings.get('MONGO_URI'),
                mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
            )

        def open_spider(self, spider):
            self.client = pymongo.MongoClient(self.mongo_uri)
            self.db = self.client[self.mongo_db]

        def close_spider(self, spider):
            self.client.close()

        def process_item(self, item, spider):
            self.db[self.collection_name].insert_one(dict(item))
            return item

Request a page and save a screenshot of it:

    import hashlib
    from urllib.parse import quote

    import scrapy

    class ScreenshotPipeline(object):
        """Pipeline that uses Splash to render a screenshot of every Scrapy item."""

        SPLASH_URL = "http://localhost:8050/render.png?url={}"

        def process_item(self, item, spider):
            encoded_item_url = quote(item["url"])
            screenshot_url = self.SPLASH_URL.format(encoded_item_url)
            request = scrapy.Request(screenshot_url)
            dfd = spider.crawler.engine.download(request, spider)
            dfd.addBoth(self.return_item, item)
            return dfd

        def return_item(self, response, item):
            if response.status != 200:
                # Error happened, return item.
                return item

            # Save screenshot to file, filename will be hash of url.
            url = item["url"]
            url_hash = hashlib.md5(url.encode("utf8")).hexdigest()
            filename = "{}.png".format(url_hash)
            with open(filename, "wb") as f:
                f.write(response.body)

            # Store filename in item.
            item["screenshot_filename"] = filename
            return item

Drop duplicate items:

    from scrapy.exceptions import DropItem

    class DuplicatesPipeline(object):

        def __init__(self):
            self.ids_seen = set()

        def process_item(self, item, spider):
            if item['id'] in self.ids_seen:
                raise DropItem("Duplicate item found: %s" % item)
            else:
                self.ids_seen.add(item['id'])
                return item

Enable the pipelines with the ITEM_PIPELINES setting; the integer value determines the order they run in (lower numbers run first, values are usually chosen in the 0-1000 range):

    ITEM_PIPELINES = {
        'myproject.pipelines.PricePipeline': 300,
        'myproject.pipelines.JsonWriterPipeline': 800,
    }
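The pipeline examples above read and write item fields such as price, id, url and screenshot_filename. A minimal sketch of a matching item class (the class name and field list are hypothetical, assembled from the fields those examples use; plain dicts work as well):

    import scrapy

    class ProductItem(scrapy.Item):
        # Hypothetical item whose fields match the pipeline examples above.
        id = scrapy.Field()
        url = scrapy.Field()
        price = scrapy.Field()
        price_excludes_vat = scrapy.Field()
        screenshot_filename = scrapy.Field()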