scrapy 破解图片网站防盗链下载
语录:
你不优秀,是不可能出头的,这条规则不仅适用于人事,也适于互联网,那就点进去一探究竟!
爬虫防盗链破解:
防盗链就是为了防止A站把B站的资源直接拿去用,而做的一种阻止技术
防盗链的核心是判断你请求的地址是不是来自本服务器,若是,则给你图片,不是则不给
每下载一张图片都先伪造一个服务器的请求,然后再下载:
scrapy 实现:
class AoisolasSpiderMiddleware(object):
    """Downloader middleware that defeats referer-based hotlink protection.

    The image server only serves a picture when the Referer header appears
    to come from its own site; pointing the referer at the requested URL
    itself satisfies that check.
    """

    def process_request(self, request, spider):
        # Forge the Referer header as the request's own URL.
        target = request.url
        if target:
            request.headers['referer'] = target
settings.py 启动中间件:
# Register the referer-forging downloader middleware; priority 1 runs it
# early so the forged header is in place before the download happens.
DOWNLOADER_MIDDLEWARES = {
'AoiSolas.middlewares.AoisolasSpiderMiddleware': 1,
}
MM网站实例:
import scrapy
class AoisolasItem(scrapy.Item):
    """Item describing one photo album scraped from mm131.com."""

    # Album title; also used as the destination folder name by the pipeline.
    name = scrapy.Field()
    # List of image URLs found on the album page.
    ImgUrl = scrapy.Field()
    # BUG FIX: MyImagesPipeline.item_completed() assigns
    # item['image_paths']; without this declaration a scrapy.Item raises
    # KeyError on that assignment.
    image_paths = scrapy.Field()
spider.py:
import scrapy
from AoiSolas.items import AoisolasItem
class AoisolaspiderSpider(scrapy.Spider):
    """Crawl mm131.com category listings and follow each album's pages."""

    name = "AoiSola"
    allowed_domains = ["www.mm131.com"]
    start_urls = ['http://www.mm131.com/xinggan/',
                  'http://www.mm131.com/qingchun/',
                  'http://www.mm131.com/xiaohua/',
                  'http://www.mm131.com/chemo/',
                  'http://www.mm131.com/qipao/',
                  'http://www.mm131.com/mingxing/'
                  ]

    def parse(self, response):
        """Parse a category listing page: request every album and the next page."""
        # Each <dd> (except the pager) holds one album link.
        # (Renamed from `list`, which shadowed the builtin.)
        entries = response.css(".list-left dd:not(.page)")
        for entry in entries:
            album_url = entry.css("a::attr(href)").extract_first()
            # BUG FIX: the album request must be yielded per entry, inside
            # the loop; in the flattened original it sat after the loop and
            # would only ever request the last album.
            if album_url is not None:
                yield scrapy.Request(str(album_url), callback=self.content)
        # 下一页 (next listing page)
        next_url = response.css(".page-en:nth-last-child(2)::attr(href)").extract_first()
        if next_url is not None:
            yield response.follow(next_url, callback=self.parse)

    def content(self, response):
        """Parse one album page: emit an item, then follow the next page."""
        item = AoisolasItem()
        item['name'] = response.css(".content h5::text").extract_first()
        item['ImgUrl'] = response.css(".content-pic img::attr(src)").extract()
        yield item
        # 下一页 (next page of the same album)
        next_url = response.css(".page-ch:last-child::attr(href)").extract_first()
        if next_url is not None:
            yield response.follow(next_url, callback=self.content)
pipelines.py:
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request
import re
class MyImagesPipeline(ImagesPipeline):
    """Image pipeline that stores each album's pictures under full/<album name>/."""

    def get_media_requests(self, item, info):
        """Yield one download request per image URL.

        The album name rides along in request.meta so file_path() can build
        the destination folder.
        """
        for image_url in item['ImgUrl']:
            yield Request(image_url, meta={'item': item['name']})

    def file_path(self, request, response=None, info=None):
        """Return the storage path 'full/<sanitized album name>/<file name>'."""
        name = request.meta['item']
        # Strip characters that are illegal in Windows file names, plus
        # digits and brackets. BUG FIX: the original character class only
        # listed the full-width quote “; also strip the ASCII quote " and
        # the closing full-width quote ”.
        name = re.sub(r'[?\*|"“”<>:/()0123456789]', '', name)
        # Last path segment of the image URL is the file name.
        image_guid = request.url.split('/')[-1]
        return u'full/{0}/{1}'.format(name, image_guid)

    def item_completed(self, results, item, info):
        """Record the stored paths on the item; drop items with no images.

        NOTE(review): this assumes the item declares an 'image_paths'
        field; a plain scrapy.Item without it raises KeyError here.
        """
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Item contains no images')
        item['image_paths'] = image_paths
        return item
settings.py:
# Root directory where downloaded images are stored.
# NOTE(review): 'D:meizi2' is a drive-relative Windows path; the article
# almost certainly meant r'D:\meizi2' and the backslash was lost in
# transcription — confirm before running.
IMAGES_STORE = 'D:meizi2'
# Enable the custom image pipeline (priority 300).
ITEM_PIPELINES = {
'AoiSolas.pipelines.MyImagesPipeline': 300,
}