Install Scrapy
pip install scrapy
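A quick way to confirm the install worked is to ask Scrapy for its version:

scrapy version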
Create a new project
(python36) E:\www>scrapy startproject fileDownload
New Scrapy project 'fileDownload', using template directory 'c:\users\brady\.conda\envs\python36\lib\site-packages\scrapy\templates\project', created in:
    E:\www\fileDownload

You can start your first spider with:
    cd fileDownload
    scrapy genspider example example.com

(python36) E:\www>
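The spider below is a CrawlSpider, so it can be scaffolded with the crawl template (the name and domain here simply match the example that follows):

cd fileDownload
scrapy genspider -t crawl pexels www.pexels.com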
Edit the spider to extract the content
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from fileDownload.items import FiledownloadItem


class PexelsSpider(CrawlSpider):
    name = 'pexels'
    allowed_domains = ['www.pexels.com']
    start_urls = ['https://www.pexels.com/photo/white-concrete-building-2559175/']

    rules = (
        Rule(LinkExtractor(allow=r'/photo/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        print(response.url)
        # collect every <img> whose src contains 'photos'
        url = response.xpath("//img[contains(@src,'photos')]/@src").extract()
        item = FiledownloadItem()
        try:
            item['file_urls'] = url
            print("scraped image list: " + str(url))  # url is a list, so cast it before concatenating
            yield item
        except Exception as e:
            print(str(e))
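Once the item and pipeline settings below are in place, the spider runs from the project root:

scrapy crawl pexels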
Configure the item
class FiledownloadItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    file_urls = scrapy.Field()
    files = scrapy.Field()  # populated by FilesPipeline with the download results
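Once a download succeeds, FilesPipeline fills the files field with one dict per file; the shape is roughly the following (the values shown are placeholders, not real output):

files = [{
    'url': 'https://images.pexels.com/photos/.../photo.jpg',  # original URL
    'path': 'full/<sha1-of-url>.jpg',                         # default on-disk name
    'checksum': '<md5-of-contents>',
}]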
settings.py
Enable the built-in files pipeline:

ITEM_PIPELINES = {
    'scrapy.pipelines.files.FilesPipeline': 2,  # files pipeline
}
FILES_STORE = ''  # storage path (must be set, see below)
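An empty FILES_STORE saves nothing; for the Windows setup in this post a value like the following would work (the exact folder is an assumption, not from the original):

FILES_STORE = 'E:/www/fileDownload/files'  # hypothetical download directory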
In the item, both fields are needed:

file_urls = scrapy.Field()
files = scrapy.Field()
In the spider, the image URLs are handed to the pipeline through the file_urls field.
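A minimal sketch of that hand-off, using the same XPath as the spider above:

def parse_item(self, response):
    item = FiledownloadItem()
    # every URL placed in 'file_urls' is downloaded by the files pipeline
    item['file_urls'] = response.xpath("//img[contains(@src,'photos')]/@src").extract()
    yield item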
Override the files pipeline so files keep their original names. By default FilesPipeline names every file after the SHA-1 hash of its URL, so keeping the original image name means overriding its file_path() method.
In pipelines.py, add your own pipeline class that inherits from FilesPipeline:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.pipelines.files import FilesPipeline


class FiledownloadPipeline(object):
    def process_item(self, item, spider):
        # strip query strings so the last URL segment is a clean file name
        tmp = item['file_urls']
        item['file_urls'] = []
        for i in tmp:
            if "?" in i:
                item['file_urls'].append(i.split('?')[0])
            else:
                item['file_urls'].append(i)
        print(item)
        return item


class MyFilesPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None):
        # name the file after the last segment of its URL
        file_path = request.url
        file_path = file_path.split('/')[-1]
        print("downloading image " + file_path)
        return 'full/%s' % file_path
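To see why the query string must be stripped before naming, trace one hypothetical Pexels URL through both pipelines:

url = 'https://images.pexels.com/photos/2559175/pexels-photo-2559175.jpeg?auto=compress'
url = url.split('?')[0]    # FiledownloadPipeline removes the query string
print(url.split('/')[-1])  # MyFilesPipeline keeps the last segment: pexels-photo-2559175.jpeg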
In settings.py, switch to your own pipelines:
ITEM_PIPELINES = {
    'fileDownload.pipelines.FiledownloadPipeline': 1,
    'fileDownload.pipelines.MyFilesPipeline': 2,
    # 'scrapy.pipelines.files.FilesPipeline': 2,
}
Fetching a full photo set (gallery)
# -*- coding: utf-8 -*-
from time import sleep

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class AngelSpider(CrawlSpider):
    name = 'angel'
    allowed_domains = ['angelimg.spbeen.com']
    start_urls = ['http://angelimg.spbeen.com/']
    base_url = "http://angelimg.spbeen.com"

    rules = (
        Rule(LinkExtractor(allow=r'^http://angelimg.spbeen.com/ang/\d+$'),
             callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        # the partially-filled item travels along with each pagination request
        item = response.meta.get('item', False)
        if not item:
            item = {}
            item['files'] = []
            item['file_urls'] = []
        print(response.url)
        img_url = response.xpath('.//div[@id="content"]/a/img/@src').extract_first()
        item['file_urls'].append(img_url)
        # if there is a next page, follow it; otherwise hand the finished item to the pipeline
        next_url = response.xpath('.//div[@class="page"]//a[contains(@class,"next")]/@href').extract_first()
        if next_url:
            next_url = self.base_url + next_url
            yield scrapy.Request(next_url, callback=self.parse_item, meta={'item': item})
        else:
            print(item)
            yield item

    def parse_next_response(self, response):
        # unused debugging helper kept from the repository
        item = response.meta.get('item')
        print(item, response.url)
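It runs the same way as the first spider; each gallery yields a single item whose file_urls holds the image from every page:

scrapy crawl angel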
GitHub repository
https://github.com/brady-wang/spider-fileDownload