爬虫图片案例:
煎蛋网(http://jandan.net)
spider.py:
import scrapy
from jiandan.items import JiandanItem
from scrapy.crawler import CrawlerProcess
class jiandanSpider(scrapy.Spider):
    """Crawl the jandan.net "ooxx" board and collect image links page by page.

    Each visited page yields one ``JiandanItem`` whose ``image_urls`` field
    holds every ``<img src>`` found on it. The spider then follows the
    "previous-comment-page" link until a page has none.
    """

    name = 'jiandan'
    allowed_domains = []  # left empty, so requests are not restricted by domain
    start_urls = ["http://jandan.net/ooxx"]

    def parse(self, response):
        """Yield the page's image item, then a request for the previous page.

        :param response: the downloaded page.
        :returns: a generator producing one ``JiandanItem`` and, when a pager
            link is present, one ``scrapy.Request`` handled by this method.
        """
        item = JiandanItem()
        # Resolve every src against the page URL so that relative and
        # protocol-relative ("//host/path") links become absolute URLs that
        # a downstream pipeline can actually download.
        item['image_urls'] = [
            response.urljoin(src)
            for src in response.xpath('//img//@src').extract()
        ]
        yield item

        # Pagination: link to the previous comment page, if any.
        new_url = response.xpath(
            '//a[@class="previous-comment-page"]//@href').extract_first()
        if new_url:
            # scrapy.Request rejects URLs without a scheme, so make the href
            # absolute before scheduling it.
            yield scrapy.Request(response.urljoin(new_url),
                                 callback=self.parse)
items.py:
import scrapy
class JiandanItem(scrapy.Item):
    """Scrapy item carrying the image links collected by the spider."""

    # Links of the images.
    image_urls = scrapy.Field()
pipelines.py:
import os
import urllib
from jiandan import settings
class JiandanPipeline(object):
    """Item pipeline that saves every image referenced by an item to disk.

    Files are written to ``<settings.IMAGES_STORE>/<spider.name>/`` and are
    named after the last path segment of each image URL.
    """

    def process_item(self, item, spider):
        """Download each URL in ``item['image_urls']`` that is not on disk yet.

        :param item: item exposing an ``image_urls`` list of URL strings.
        :param spider: the running spider; its ``name`` is used as the
            sub-directory under ``settings.IMAGES_STORE``.
        :returns: the same ``item``, unchanged.
        :raises IOError: propagated from the download or the file write.
        """
        # Function-scope imports keep this block self-contained. ``urlopen``
        # lives in a different module on Python 2 and Python 3.
        from contextlib import closing
        try:
            from urllib.request import urlopen  # Python 3
        except ImportError:
            from urllib import urlopen  # Python 2

        # Storage directory for this spider's images.
        dir_path = '%s/%s' % (settings.IMAGES_STORE, spider.name)
        # Single-argument form prints the same text on Python 2 and 3.
        print('dir_path %s' % dir_path)
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

        for image_url in item['image_urls']:
            # Image name is the last segment of the URL.
            file_name = image_url.split('/')[-1]
            if not file_name:
                # URL ends with '/': there is nothing to name the file after.
                continue
            file_path = '%s/%s' % (dir_path, file_name)
            # Test the full destination path; the bare file name would be
            # resolved against the current working directory instead.
            if os.path.exists(file_path):
                continue
            # Fetch first, write second: a failed download must not leave a
            # zero-byte file behind that later runs would treat as complete.
            with closing(urlopen(image_url)) as conn:
                data = conn.read()
            with open(file_path, 'wb') as file_writer:
                file_writer.write(data)
        return item
settings.py:
# Scrapy settings for the "jiandan" project.
BOT_NAME = 'jiandan'

# Module where new spiders are created, and modules searched for spiders.
NEWSPIDER_MODULE = 'jiandan.spiders'
SPIDER_MODULES = ['jiandan.spiders']

# Enabled item pipelines, mapped to their order value.
ITEM_PIPELINES = {'jiandan.pipelines.JiandanPipeline': 1}

# Base directory for saved images; JiandanPipeline appends "/<spider name>".
IMAGES_STORE = 'E:'

# Delay between consecutive requests (Scrapy setting, in seconds).
DOWNLOAD_DELAY = 0.25
main.py:
scrapy crawl jiandan