主要原因:需要下载文件并保留原有的后缀名,但 Scrapy 自带的文件下载管道(FilesPipeline)没有提供这个选项,因此需要继承 FilesPipeline 并重写其 file_path 方法;下面的实现参考了其他人的代码。
import time
from urllib import parse

from scrapy.pipelines.files import FilesPipeline


class FileRenamePipeline(FilesPipeline):
    """Files pipeline that keeps the original file name and extension.

    The download URL is expected to carry the original file name between
    double quotes in its second ``;``-separated segment (for example
    ``...;filename="report.pdf"``), possibly percent-encoded twice.  The
    stored file is named ``<stem>_<millisecond timestamp>.<extension>``
    inside the ``full/`` sub-directory of ``FILES_STORE``.
    """

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path, relative to ``FILES_STORE``, for a file.

        Args:
            request: The download request; only ``request.url`` is read.
            response: Unused here; forwarded to the base class on fallback.
            info: Unused here; forwarded to the base class on fallback.
            item: Unused.  Accepted so the override also works with Scrapy
                versions that pass the item as a keyword argument.

        Returns:
            ``'full/<stem>_<timestamp>.<ext>'`` when the URL embeds a file
            name (``'full/<name>_<timestamp>'`` if that name has no dot);
            otherwise whatever the base ``FilesPipeline.file_path`` returns.
        """
        name = self._embedded_name(request.url)
        if not name:
            # The URL does not follow the expected pattern: use the default
            # naming of the base class instead of failing with IndexError.
            return super().file_path(request, response, info)

        timest = str(int(time.time() * 1000))
        # Split on the LAST dot so multi-dot names such as "a.b.pdf" keep
        # their real extension ("pdf") rather than the second piece ("b").
        stem, dot, ext = name.rpartition('.')
        if dot:
            file_name = stem + '_' + timest + '.' + ext
        else:
            file_name = name + '_' + timest
        return 'full/' + file_name

    @staticmethod
    def _embedded_name(url):
        """Extract the quoted file name embedded in *url*, or ``''`` if absent."""
        segments = parse.unquote(url).split(';')
        if len(segments) < 2:
            return ''
        # The second segment may still be percent-encoded, so decode it again.
        quoted = parse.unquote(segments[1]).split('"')
        if len(quoted) < 2:
            return ''
        # The name comes from the remote side: drop any directory components
        # so it cannot point outside the "full/" directory.
        return quoted[1].replace('\\', '/').rsplit('/', 1)[-1]
# Per-spider settings: enable the renaming files pipeline and choose where
# downloaded files are stored.
custom_settings = {
    'ITEM_PIPELINES': {
        # The number is the pipeline's order value in ITEM_PIPELINES.
        'spider_dataPlat.pipelines.FileRenamePipeline': 2,
    },
    # Directory where downloaded files are saved.
    # NOTE(review): 'E:下载' has no separator after the drive letter, so on
    # Windows it is resolved relative to the current directory of drive E:.
    # Confirm whether an absolute path such as 'E:/下载' was intended.
    'FILES_STORE': 'E:下载',
}
items = SpiderFileItem() items['file_urls'] = [final_url] items['files'] = name.split('.')[0] yield items