# Spider file
# -*- coding: utf-8 -*-
import scrapy
import os
from urllib import request
from lianjia.items import LianjiaItem


class LianjiaspiderSpider(scrapy.Spider):
    """Crawl Beijing rental listings on bj.lianjia.com.

    The first listing page schedules listing pages 2-100. Every listing page
    yields one detail-page request per listing, and every detail page yields
    one ``LianjiaItem`` holding the listing data plus its picture URLs.
    """

    name = 'lianjiaSpider'
    # allowed_domains = ['www.xxx.com']
    # The URL must not carry trailing whitespace.
    start_urls = ['https://bj.lianjia.com/zufang/l1rp5/#contentList']

    # Characters removed from every text fragment of the location paragraph.
    _LOCATION_JUNK = str.maketrans('', '', ' -/')

    def parse(self, response):
        """Handle the first listing page and schedule listing pages 2-100.

        Pagination is scheduled only here, so the follow-up pages do not each
        re-emit the same 99 requests and lean on the duplicate filter.
        """
        yield from self._parse_listing(response)

        # URLs of pages 2-100.
        for i in range(2, 101):
            next_url = "https://bj.lianjia.com/zufang/pg%dl1rp5/#contentList" % i
            yield scrapy.Request(url=next_url, callback=self._parse_listing)

    def _parse_listing(self, response):
        """Yield one detail-page request for every listing on a listing page."""
        div_list = response.xpath(
            '//div[@class="content__list"]/div[@class="content__list--item"]'
        )
        for div in div_list:
            main = div.xpath('.//div[@class="content__list--item--main"]')

            title = main.xpath('./p[1]/a/text()').get()
            href = main.xpath('./p[1]/a/@href').get()
            if title is None or href is None:
                # Entry without a title link: nothing to follow.
                continue
            title = title.strip()
            # Resolve the link against the page URL instead of concatenating.
            detail_url = response.urljoin(href)

            fragments = main.xpath('./p[2]//text()').getall()
            location = "".join(
                text.translate(self._LOCATION_JUNK).strip() for text in fragments
            )

            # The first two text nodes are joined (presumably amount and unit);
            # slicing avoids an IndexError when fewer nodes are present.
            price_parts = main.xpath('./span//text()').getall()
            price = "".join(price_parts[:2])

            yield scrapy.Request(
                url=detail_url,
                callback=self.parse_detail,
                meta={'info': (title, location, price, detail_url)},
            )

    def parse_detail(self, response):
        """Build the item from the listing data in ``meta`` and the page's picture URLs."""
        title, location, price, detail_url = response.meta.get("info")
        pic_srcs = response.xpath(
            "//div[@class='content__thumb--box']/ul//img/@src"
        ).getall()
        print('房源链接:', detail_url)

        item = LianjiaItem()
        item["title"] = title
        item["location"] = location
        item["price"] = price
        item['detail_url'] = detail_url
        # item['pic_srcs'] = pic_srcs
        # ``image_urls`` is the field the images pipeline reads.
        item['image_urls'] = pic_srcs
        yield item
# Pipeline file
# Save pictures with a plain download (no Scrapy images pipeline).
import hashlib
import logging
import os
from urllib import request

logger = logging.getLogger(__name__)


class LianjiaPipeline(object):
    """Download every picture of an item into ``images/<location>/``."""

    def __init__(self):
        # os.path.dirname(__file__) is the directory holding this pipeline
        # file; its parent is the outermost project directory. The pictures
        # go into an ``images`` folder inside that outermost directory.
        self.path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images')
        if not os.path.exists(self.path):
            print("images文件夹不存在")
            # exist_ok avoids a crash if the folder appears in the meantime.
            os.makedirs(self.path, exist_ok=True)

    @staticmethod
    def _picture_name(url):
        """Derive a short file name from *url* so pictures do not overwrite each other."""
        parts = url.split('.')
        # Same slice as before: eight characters taken from the third
        # dot-separated part of the URL.
        name = parts[2][-9:-1] if len(parts) > 2 else ''
        if not name:
            # URL shape differs from the expected one: fall back to a hash.
            name = hashlib.md5(url.encode('utf-8')).hexdigest()[:8]
        return name

    def process_item(self, item, spider):
        """Download all pictures listed in ``item['pic_srcs']`` and return the item."""
        location = item['location']
        urls = item.get('pic_srcs') or []

        # normpath converts separators to the platform's own. The previous
        # hand-written replace('/', '\') was an unterminated string literal.
        per_house_pic_path = os.path.normpath(os.path.join(self.path, location))
        print('每一个户型图的保存路径:', per_house_pic_path)
        os.makedirs(per_house_pic_path, exist_ok=True)

        for url in urls:
            # Request the larger rendition of each picture.
            url = url.replace('126x86.jpg', '780x439.jpg')
            filename = os.path.join(per_house_pic_path, self._picture_name(url) + '.png')
            try:
                # NOTE: urlretrieve blocks until the download finishes.
                request.urlretrieve(url=url, filename=filename)
            except OSError as err:
                # One failed picture must not abort the remaining downloads.
                logger.warning("could not download %s: %s", url, err)
        return item


# Item file
import scrapy


class LianjiaItem(scrapy.Item):
    """Fields of one rental listing (plain-download variant)."""

    # Plain fields.
    title = scrapy.Field()
    detail_url = scrapy.Field()
    location = scrapy.Field()
    price = scrapy.Field()
    pic_srcs = scrapy.Field()


# In settings
ITEM_PIPELINES = {
    'lianjia.pipelines.LianjiaPipeline': 300,
}
# Save pictures with Scrapy's images pipeline.
import os
from urllib import request
from scrapy.pipelines.images import ImagesPipeline
from lianjia import settings


class LjImagesPipeline(ImagesPipeline):
    """Images pipeline that stores each picture under ``<IMAGES_STORE>/<location>/``."""

    def get_media_requests(self, item, info):
        """Create the download requests and attach the item to each of them.

        Called before the download requests are sent. The item is bound to
        every request so that ``file_path`` can read it back from the request.
        """
        # Materialise the result so it can be iterated here and returned.
        request_objs = list(super().get_media_requests(item, info))
        for request_obj in request_objs:
            request_obj.item = item
        return request_objs

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path of a picture, relative to ``IMAGES_STORE``.

        Called when a picture is about to be stored. ``item`` is accepted as a
        keyword for Scrapy versions that pass it; otherwise the item attached
        in ``get_media_requests`` is used. Without a usable ``location`` the
        default path from the parent class is returned unchanged.
        """
        path = super().file_path(request, response, info)
        if item is None:
            item = getattr(request, 'item', None)
        location = item.get('location') if item is not None else None
        if not location:
            return path

        # Default path looks like 'full/<hash>.jpg'; keep only the file name.
        image_name = os.path.basename(path)
        # A relative path is returned; the store resolves it against
        # IMAGES_STORE and creates the per-listing directory itself.
        return '/'.join((location, image_name))


# Matching item file
import scrapy


class LianjiaItem(scrapy.Item):
    """Fields of one rental listing (images-pipeline variant)."""

    # Plain fields.
    title = scrapy.Field()
    detail_url = scrapy.Field()
    location = scrapy.Field()
    price = scrapy.Field()
    # pic_srcs = scrapy.Field()
    # Fields required by the images pipeline.
    image_urls = scrapy.Field()
    images = scrapy.Field()


# Settings file
ITEM_PIPELINES = {
    # 'lianjia.pipelines.LianjiaPipeline': 300,
    # "scrapy.pipelines.images.ImagesPipeline": 1  # would bypass the pipeline file
    'lianjia.pipelines.LjImagesPipeline': 1,  # runs the two overridden methods
}

# Download location for pictures, used by the images pipeline.
import os

# Picture storage path: the ``images`` folder in the outermost project directory.
IMAGES_STORE = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images')
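# 总结:
# 1. process_item() 方法中的 self.path 是 images 文件夹的路径, 每一个房源的户型图都要保存在这个文件夹下面 (按房源位置各建一个子文件夹).
# 2. 在 Windows 系统中拼接路径时, os.path.join() 生成的路径用 print 打印出来是单个反斜杠 \, 而在字符串字面量或 repr 中要写成 (显示为) 双反斜杠 \\; 如果手动拼接时混用了正斜杠 / 或者反斜杠没有转义, 系统就可能找不到路径而报错, 可以用 os.path.normpath() 统一路径分隔符.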