1 Basic workflow
- Create a project (cmd); the project here is named firstblood: scrapy startproject firstblood
- Enter the project directory (cmd): cd firstblood
- Create a spider file (cmd): scrapy genspider first www.xxx.com (first is the spider name, www.xxx.com is the start URL)
- Open the project in PyCharm, go into the spiders directory, find the first spider file and write the crawling code; comment out allowed_domains (see the sketch below this list)
- Run the spider (cmd): scrapy crawl first
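For reference, a minimal sketch of roughly what the generated spiders/first.py looks like after the edits above (the parse body is illustrative, not part of the generated template):

# -*- coding: utf-8 -*-
import scrapy

class FirstSpider(scrapy.Spider):
    name = 'first'                       # spider name used by `scrapy crawl first`
    # allowed_domains = ['www.xxx.com']  # commented out so off-domain URLs are not filtered out
    start_urls = ['https://www.xxx.com/']

    def parse(self, response):
        # parsing logic goes here (illustrative)
        print(response)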
*** How to run it from inside PyCharm
# Create entrypoint.py in the project root directory:
from scrapy.cmdline import execute
execute(['scrapy', 'crawl', 'spider_name'])
2 Spider settings for getting around basic anti-crawling measures
- robots.txt
In settings.py change: ROBOTSTXT_OBEY = False
- User-Agent spoofing
In settings.py, change USER_AGENT = 'firstblood (+http://www.yourdomain.com)' to:
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36'
To customize the request headers, override the start_requests method:
def start_requests(self):
    headers = {
        'Host': 'www.amazon.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36',
    }
    url = 'https://www.amazon.cn/s/ref=nb_sb_noss?__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&url=search-alias%3Daps&field-keywords=iphone-x'
    request = scrapy.Request(url=url, headers=headers)
    yield request
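A per-spider alternative (a minimal sketch, reusing the firstblood spider above; not from the original notes): a spider can override individual settings such as USER_AGENT through the custom_settings class attribute, leaving the global settings.py untouched.

import scrapy

class FirstSpider(scrapy.Spider):
    name = 'first'
    start_urls = ['https://www.xxx.com/']

    # settings declared here override the project-wide values for this spider only
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36',
    }

    def parse(self, response):
        print(response)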
3 Command summary
- scrapy startproject firstblood # create a new project
- scrapy genspider first www.xxx.com # create a new spider file
- scrapy crawl first # run the spider and print the log
- scrapy crawl first --nolog # run the spider without printing the log
- scrapy crawl qiubai -o qiushibaike.csv # save the return value of the parse function to a csv file
- scrapy genspider -t crawl chouti www.xxx.com # create a CrawlSpider-based spider (see the sketch after this list)
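For reference, a minimal sketch of roughly what the -t crawl template generates (the allow pattern and callback body are illustrative; the URL is the placeholder from the command above):

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class ChoutiSpider(CrawlSpider):
    name = 'chouti'
    start_urls = ['https://www.xxx.com/']

    # every link matched by the LinkExtractor is followed and handed to parse_item
    rules = (
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = {}
        # extraction logic goes here (illustrative)
        return item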
4 Storage
- Terminal-command-based persistence (only the return value of the parse function is persisted locally)
Command: scrapy crawl qiubai -o qiushibaike.csv
Limitation: only these file extensions are supported ('json', 'jsonlines', 'jl', 'csv', 'xml', 'marshal', 'pickle')
class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        div_list = response.xpath("//div[@id='content-left']/div")
        res_list = []
        for div in div_list:
            # author = div.xpath('./div[1]/a[2]/h2/text()')[0]
            # xpath in scrapy returns Selector objects:
            # <Selector xpath='./div[1]/a[2]/h2/text()' data=' 胡子灬哥 '>
            # Getting the data out of the Selector object:
            # Option 1: author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
            # Option 2: author = div.xpath('./div[1]/a[2]/h2/text()').extract_first()
            author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
            content = div.xpath('./a[1]/div[@class="content"]/span//text()').extract()
            content = "".join(content)
            # print("author......", author)
            # print("content......", content)
            # break
            dic = {
                'author': author,
                'content': content
            }
            res_list.append(dic)
        return res_list
- Pipeline-based persistence (the persistence logic must live in the pipeline file)
Recommended:
pip install redis==2.10.6
(newer redis-py releases reject a plain dict passed to lpush; see the json-serialization sketch at the end of this section)
How to wrap the data in an item object
1. Define the storage fields in items.py
class QiubaiproItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()   (a field declared with scrapy's generic Field class)
    author = scrapy.Field()
    content = scrapy.Field()
2. Import the item class in the spider file spiders/qiubai.py:
from qiubaiPro.items import QiubaiproItem
3. Instantiate the item object
# instantiate an item object
item = QiubaiproItem()
item['author'] = author
item['content'] = content
# note: one item object per record; the pipeline stores one record for each item it receives
4. Submit the item to the pipeline; Scrapy delivers it automatically, we only need to write:
yield item  # one yield per record
5. Write the storage logic in pipelines.py (three storage options)
class QiubaiproPipeline(object):
    def process_item(self, item, spider):
        print(item)
        return item


import pymysql

class Mysql_PipeLine(object):
    # connection and cursor are defined at class level
    conn = None
    cursor = None

    def open_spider(self, spider):
        # the port is a number, not a string
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='123', db='scrapy')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # print(item)
        try:
            self.cursor.execute('insert into qiubai values ("%s","%s");' % (item['author'], item['content']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        # self.cursor.close()
        self.conn.close()


from redis import Redis

class Redis_PipeLine(object):
    conn = None

    def open_spider(self, spider):
        # connect to Redis
        self.conn = Redis(host='127.0.0.1', port=6379)

    def process_item(self, item, spider):
        dic = {
            'author': item['author'],
            'content': item['content']
        }
        self.conn.lpush('qiubai', dic)
        return item  # pass the item on so any later pipeline still receives it
6. Enable ITEM_PIPELINES in settings.py (multiple pipelines, i.e. multiple storage options, may be registered)
ITEM_PIPELINES = {
    'qiubaiPro.pipelines.QiubaiproPipeline': 300,  # 'pipeline path.pipeline class': priority
}
ITEM_PIPELINES = {
    'qiubaiPro.pipelines.QiubaiproPipeline': 300,
    # the newly added pipelines
    'qiubaiPro.pipelines.Mysql_PipeLine': 301,
    'qiubaiPro.pipelines.Redis_PipeLine': 302,
}
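With a newer redis-py release, the lpush call above fails because a dict is not an accepted value type; a minimal sketch of a workaround (using the standard json module, not part of the original notes) is to serialize the record first:

import json
from redis import Redis

class Redis_PipeLine(object):
    def open_spider(self, spider):
        self.conn = Redis(host='127.0.0.1', port=6379)

    def process_item(self, item, spider):
        # newer redis-py versions only accept bytes/str/numbers, so serialize the dict
        dic = {'author': item['author'], 'content': item['content']}
        self.conn.lpush('qiubai', json.dumps(dic, ensure_ascii=False))
        return item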
5 A simple example
- The spider file qiubai.py
# -*- coding: utf-8 -*-
import scrapy
from qiubaiPro.items import QiubaiproItem

'''
1 Terminal-command-based persistence (only the return value of the parse function is persisted locally)
  Command: scrapy crawl qiubai -o qiushibaike.csv
  Limitation: only these file extensions are supported ('json', 'jsonlines', 'jl', 'csv', 'xml', 'marshal', 'pickle')
'''
# Version 1: terminal-command-based persistence
class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        div_list = response.xpath("//div[@id='content-left']/div")
        res_list = []
        for div in div_list:
            author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
            content = div.xpath('./a[1]/div[@class="content"]/span//text()').extract()
            content = "".join(content)
            dic = {
                'author': author,
                'content': content
            }
            res_list.append(dic)
        return res_list


# Version 2: pipeline-based persistence (the persistence logic must live in the pipeline file)
class QiubaiSpider(scrapy.Spider):
    name = 'qiubai'
    start_urls = ['https://www.qiushibaike.com/text/']

    def parse(self, response):
        div_list = response.xpath("//div[@id='content-left']/div")
        for div in div_list:
            try:
                author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()
            except Exception as e:
                # print(e)
                # fall back to an alternative xpath when the first one finds no author node
                author = div.xpath('./div[1]/span[2]/h2/text()')[0].extract()
            content = div.xpath('./a[1]/div[@class="content"]/span//text()').extract()
            content = "".join(content)
            # instantiate an item object
            item = QiubaiproItem()
            item['author'] = author
            item['content'] = content
            # print(item['author'])
            # submit the item to the pipeline
            yield item
- items.py
import scrapy

class QiubaiproItem(scrapy.Item):
    author = scrapy.Field()
    content = scrapy.Field()
- pipelines.py
# -*- coding: utf-8 -*-
import pymysql
from redis import Redis

# one class per storage option

# store into the file qiubai.txt
class QiubaiproPipeline(object):
    fp = None  # file handle

    # open_spider overrides the parent-class method and runs only once per crawl
    def open_spider(self, spider):
        self.fp = open('qiubai.txt', 'w', encoding='utf-8')

    # process_item runs once per item, so the file must not be opened and closed
    # here, otherwise execution would be far too slow
    def process_item(self, item, spider):
        # print(item)
        self.fp.write(item['author'] + ':' + item['content'])
        return item

    # close_spider overrides the parent-class method and runs only once per crawl
    def close_spider(self, spider):
        self.fp.close()


# store into a MySQL database
# remember to register this pipeline in settings as well
class Mysql_PipeLine(object):
    # connection and cursor are defined at class level
    conn = None
    cursor = None

    def open_spider(self, spider):
        # the port is a number, not a string
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='123', db='scrapy')
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # print(item)
        try:
            # note: passing parameters, e.g. execute(sql, (a, b)), is safer than string interpolation
            self.cursor.execute('insert into qiubai values ("%s","%s");' % (item['author'], item['content']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        # self.cursor.close()
        self.conn.close()


# store into Redis
class Redis_PipeLine(object):
    conn = None

    def open_spider(self, spider):
        # connect to Redis
        self.conn = Redis(host='127.0.0.1', port=6379)

    def process_item(self, item, spider):
        dic = {
            'author': item['author'],
            'content': item['content']
        }
        self.conn.lpush('qiubai', dic)
        return item
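The Mysql_PipeLine assumes that a qiubai table already exists in the scrapy database; a minimal one-off setup sketch (the two-column schema is illustrative, matching the item fields, and is not taken from the original notes):

import pymysql

# create the table that Mysql_PipeLine inserts into (illustrative schema)
conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='123', db='scrapy')
cursor = conn.cursor()
cursor.execute('''
    CREATE TABLE IF NOT EXISTS qiubai (
        author  VARCHAR(100),
        content TEXT
    ) CHARACTER SET utf8mb4;
''')
conn.commit()
conn.close()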
6 How XPath in Scrapy differs
- An XPath expression in Scrapy does not return plain tag objects; it returns Selector objects
author = div.xpath('./div[1]/a[2]/h2/text()')[0]  # <Selector xpath='./div[1]/a[2]/h2/text()' data=' 胡子灬哥 '>
- Getting the data out of a Selector object
Option 1: author = div.xpath('./div[1]/a[2]/h2/text()')[0].extract()    # index first, then extract -> a single string
Option 2: author = div.xpath('./div[1]/a[2]/h2/text()').extract_first() # -> the first matching string
Note: without indexing, author = div.xpath('./div[1]/a[2]/h2/text()').extract() returns a list of strings
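A small self-contained sketch (the HTML snippet is made up) showing the difference between Selector objects, extract() and extract_first():

from scrapy.selector import Selector

html = '<div><h2> author_a </h2><h2> author_b </h2></div>'
sel = Selector(text=html)

print(sel.xpath('//h2/text()'))                   # SelectorList of Selector objects
print(sel.xpath('//h2/text()').extract())         # [' author_a ', ' author_b ']  (list of strings)
print(sel.xpath('//h2/text()').extract_first())   # ' author_a '  (first string)
print(sel.xpath('//h2/text()')[0].extract())      # ' author_a '  (same as above)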