Scrapy file download configuration
Crawling the matplotlib example gallery
matplotlib is a very useful plotting library, and its website provides many examples, listed at http://matplotlib.org/examples/index.html. Here we download all of those example files to local disk so they are easy to find and reuse later.
1 pipelines.py: the pipeline module. We subclass the built-in file download pipeline; the main point of the custom class is to rename the downloaded files.
from urllib import parse
from os.path import dirname, basename, join

from scrapy.pipelines.files import FilesPipeline


class MatpDownloadPipeline(FilesPipeline):
    def file_path(self, request, response=None, info=None):
        # URL path, e.g. /examples/animation/animate_decay.py
        path = parse.urlparse(request.url).path
        dir_name = dirname(path)    # /examples/animation
        base_name = basename(path)  # animate_decay.py
        # Save as <category>/<filename>, e.g. animation/animate_decay.py
        return join(basename(dir_name), base_name)
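
The effect of file_path is to keep each example's category directory instead of Scrapy's default hash-based name. A quick standalone check of that mapping (an illustrative snippet, not part of the project):

from urllib import parse
from os.path import dirname, basename, join

url = "https://matplotlib.org/examples/animation/animate_decay.py"
path = parse.urlparse(url).path  # /examples/animation/animate_decay.py
print(join(basename(dirname(path)), basename(path)))  # animation/animate_decay.py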
2 settings.py configuration
# Register the custom download pipeline
ITEM_PIPELINES = {
    'img.pipelines.MatpDownloadPipeline': 200,
}

# Directory the downloaded files are stored under
FILES_STORE = 'examples_src'
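
The spider below imports MatpItem from the project's items.py, which the original post does not show. A minimal sketch of what it presumably looks like, assuming the standard FilesPipeline item fields (file_urls as input, files filled in with download results):

# items.py -- a minimal sketch; not shown in the original post
import scrapy

class MatpItem(scrapy.Item):
    file_urls = scrapy.Field()  # input: URLs for FilesPipeline to download
    files = scrapy.Field()      # output: download results added by the pipeline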
3 Spider code
import scrapy
from scrapy.linkextractors import LinkExtractor

from ..items import MatpItem


class AnmSpider(scrapy.Spider):
    name = 'anm'
    allowed_domains = ['matplotlib.org']
    start_urls = ['https://matplotlib.org/examples/index.html']

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # Collect the links to the individual example pages from the
        # index page's table of contents.
        le = LinkExtractor(restrict_xpaths="//div[@class='toctree-wrapper compound']/ul/li/ul/li/a")
        links = le.extract_links(response)
        for link in links:
            yield scrapy.Request(link.url, callback=self.parse_detail)

    def parse_detail(self, response):
        # Each example page has a "source code" link to the raw .py file.
        source_url = response.xpath("//a[@class='reference external']/@href").extract_first()
        source = response.urljoin(source_url)
        item = MatpItem()
        item["file_urls"] = [source]
        return item
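
With all three pieces in place, running the spider from the project root (for example with scrapy crawl anm) downloads every example script into FILES_STORE: thanks to the overridden file_path, a script such as animation/animate_decay.py lands at examples_src/animation/animate_decay.py instead of Scrapy's default full/<hash> layout.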