# Spider file
# -*- coding: utf-8 -*-
import scrapy
import re
from soufangwang.items import NewHouseItem, SecondhandHouseItem


class FangspiderSpider(scrapy.Spider):
    """Crawl fang.com: the province/city index page, then each city's
    new-house (newhouse) and second-hand (esf) listing pages."""
    name = 'fangSpider'
    allowed_domains = ['fang.com']
    start_urls = ['https://www.fang.com/SoufunFamily.html']

    def parse(self, response):
        """Parse the province/city index and schedule per-city requests."""
        trs = response.xpath("//div[@class='outCont']//tr")
        province = None
        for tr in trs:
            # td cells WITHOUT a class attribute hold the province/city data
            tds = tr.xpath(".//td[not(@class)]")
            province_td = tds[0]
            province_text = province_td.xpath('.//text()').get()
            # BUG FIX: original used re.sub(r's', ...) which deleted the
            # letter 's'; r'\s' strips whitespace as the comment intended.
            # `or ''` guards against a missing text node (re.sub rejects None).
            province_text = re.sub(r'\s', '', province_text or '')
            if province_text:
                province = province_text
            # Skip the overseas section ("其它" = "other")
            if province == '其它':
                continue
            city_td = tds[1]
            city_links = city_td.xpath('.//a')
            for city_link in city_links:
                city = city_link.xpath('.//text()').get()
                city_url = city_link.xpath('.//@href').get()
                # Split the city domain to rebuild subdomain-based URLs
                new_url = city_url.split(".")
                if new_url[0].endswith('bj'):
                    # Beijing uses fixed subdomains instead of a city prefix
                    newhouse_url = "https://newhouse.fang.com/house/s/"
                    secondhand_url = "https://esf.fang.com/"
                else:
                    # New-house listing URL
                    newhouse_url = new_url[0] + ".newhouse." + new_url[1] + "." + new_url[2] + "house/s/"
                    # Second-hand (esf) listing URL
                    secondhand_url = new_url[0] + ".esf." + new_url[1] + "." + new_url[2]
                print(city, '新房链接:', newhouse_url)
                print(city, '二手房链接:', secondhand_url)
                yield scrapy.Request(url=newhouse_url,
                                     callback=self.parse_newhouse,
                                     meta={'info': (province, city, newhouse_url)})
                yield scrapy.Request(url=secondhand_url,
                                     callback=self.parse_esf,
                                     dont_filter=True,
                                     meta={'info': (province, city, secondhand_url)})

    def parse_newhouse(self, response):
        """Parse one page of new-house listings and follow pagination."""
        try:
            # province/city/base-url passed along via request meta
            province, city, newhouse_url = response.meta.get('info')
            li_list = response.xpath('//div[contains(@class,"nl_con")]/ul/li')
            for li in li_list:
                try:
                    name = li.xpath('.//div[@class="nlcd_name"]/a/text()').get().strip()
                    house_type_list = li.xpath('.//div[contains(@class,"house_type")]//text()').getall()
                    # NOTE(review): the two replace() targets render identically
                    # here; one was presumably a non-breaking/full-width space —
                    # confirm against the live page markup.
                    house_type_list = list(map(lambda x: x.replace(" ", "").replace(" ", ""), house_type_list))
                    house_type_str = ''.join(house_type_list).strip().split("-")
                    house_type = house_type_str[0]
                    area = house_type_str[1]
                    address = li.xpath('.//div[@class="address"]/a/text()').getall()
                    address = list(map(lambda x: x.replace(" ", "").replace(" ", ""), address))[1]
                    district = li.xpath('.//div[@class="address"]/a/span/text()').get().strip()
                    district = district[1:-1]  # strip the surrounding brackets
                    sale = li.xpath('.//div[@class="fangyuan"]/span[1]/text()').get()
                    price = li.xpath('.//div[@class="nhouse_price"]//text()').getall()
                    price = list(map(lambda x: x.replace(" ", "").replace(" ", ""), price))
                    price = "".join(price)
                    origin_url = li.xpath('.//div[@class="nlcd_name"]/a/@href').get()
                    origin_url = "https:" + origin_url  # hrefs are scheme-relative
                    item = NewHouseItem(province=province, city=city, name=name, price=price,
                                        rooms=house_type, area=area, address=address,
                                        district=district, sale=sale, origin_url=origin_url)
                    yield item
                except (AttributeError, IndexError, TypeError):
                    # Malformed listing (ad slot / missing fields): skip this <li>
                    # instead of the original bare `except: pass`.
                    continue
            # Pagination
            next_url = response.xpath('//div[@class="page"]//a[@class="next"]/@href').get()
            if next_url:
                next_url = next_url.split("/")[-2]
                yield scrapy.Request(url=newhouse_url + next_url,
                                     callback=self.parse_newhouse,
                                     meta={'info': (province, city, newhouse_url)})
        except Exception:
            # Page without the expected structure — keep best-effort behavior,
            # but no longer a bare `except:` (which also swallowed SystemExit).
            print("无房子信息")

    def parse_esf(self, response):
        """Parse one page of second-hand (esf) listings and follow pagination."""
        try:
            # province/city/base-url passed along via request meta
            province, city, secondhand_url = response.meta.get('info')
            dls = response.xpath('//div[@class="main945 floatl"]/div[@class="shop_list shop_list_4"]/dl[@dataflag="bg"]')
            for dl in dls:
                name = dl.xpath('.//h4[@class="clearfix"]//span[@class="tit_shop"]/text()').get()
                price = dl.xpath('.//dd[@class="price_right"]/span//text()').getall()
                unit = price[-1]  # last text node carries the per-m2 unit price
                price = list(map(lambda x: x.replace(" ", "").replace(" ", ""), price))[:-1]
                price = "".join(price)
                room_info = dl.xpath('.//p[@class="tel_shop"]/text()').getall()
                room_info = list(map(lambda x: x.replace(" ", "").replace(" ", "").strip(), room_info))[:5]
                # BUG FIX: initialise all layout fields up front.  The original
                # left `year` unbound in the "独栋" branch (and every field
                # unbound on short rows), so building the item raised NameError
                # and the outer except silently dropped the whole page.
                rooms = area = floor = toward = year = None
                if room_info and room_info[0] == "独栋":
                    # "独栋" (detached house) rows use a different field order
                    rooms = room_info[1]
                    area = room_info[3]
                    floor = room_info[0]
                    toward = room_info[4]
                elif room_info:
                    try:
                        rooms = room_info[0]
                        area = room_info[1]
                        floor = room_info[2]
                        toward = room_info[3]
                        year = room_info[4]
                    except IndexError:
                        pass  # some listings omit trailing fields; keep what we have
                address = dl.xpath('.//p[@class="add_shop"]/span/text()').get()
                origin_url = dl.xpath('.//h4[@class="clearfix"]/a/@href').get()
                # base url ends with '/', href starts with '/': drop one
                origin_url = secondhand_url[:-1] + origin_url
                item = SecondhandHouseItem(province=province, city=city, name=name, price=price,
                                           unit=unit, rooms=rooms, area=area, floor=floor,
                                           toward=toward, year=year, address=address,
                                           origin_url=origin_url)
                yield item
            # Pagination
            next_url = response.xpath('//div[@class="page_al"]/p/a/@href').get()
            if next_url:
                yield scrapy.Request(url=secondhand_url[:-1] + next_url,
                                     callback=self.parse_esf,
                                     meta={'info': (province, city, secondhand_url)})
        except Exception:
            print("本页没有房源信息")
# Pipeline file
from scrapy.exporters import JsonLinesItemExporter

from soufangwang.items import NewHouseItem


class SoufangwangPipeline(object):
    """Write new-house and second-hand items to separate JSON-lines files."""

    def __init__(self):
        # Binary mode: JsonLinesItemExporter writes encoded bytes.
        self.newhouse_fp = open('newhouse.json', 'wb')
        self.esfhouse_fp = open('esfhouse.json', 'wb')
        self.newhouse_exporter = JsonLinesItemExporter(
            self.newhouse_fp, ensure_ascii=False
        )
        self.esfhouse_exporter = JsonLinesItemExporter(
            self.esfhouse_fp, ensure_ascii=False
        )

    def process_item(self, item, spider):
        # BUG FIX: the original exported every item to BOTH files, so each
        # record was duplicated into the wrong output.  Route by item type.
        if isinstance(item, NewHouseItem):
            self.newhouse_exporter.export_item(item)
        else:
            self.esfhouse_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # Release both file handles when the spider finishes.
        self.newhouse_fp.close()
        self.esfhouse_fp.close()
# item文件 # -*- coding: utf-8 -*- # Define here the models for your scraped items # # See documentation in: # https://doc.scrapy.org/en/latest/topics/items.html import scrapy class NewHouseItem(scrapy.Item): # define the fields for your item here like: # 省份 province = scrapy.Field() # 城市 city = scrapy.Field() # 小区名字 name = scrapy.Field() # 价格 price = scrapy.Field() # 几居室, 是个列表 rooms = scrapy.Field() # 面积 area = scrapy.Field() # 地址 address = scrapy.Field() # 行政区 district = scrapy.Field() # 是否在售 sale = scrapy.Field() # 房天下详情页面 origin_url = scrapy.Field() class SecondhandHouseItem(scrapy.Item): # define the fields for your item here like: # 省份 province = scrapy.Field() # 城市 city = scrapy.Field() # 小区名字 name = scrapy.Field() # 价格 price = scrapy.Field() # 单价 unit = scrapy.Field() # 几居室, 是个列表 rooms = scrapy.Field() # 层 floor = scrapy.Field() # 朝向 toward = scrapy.Field() # 年代 year = scrapy.Field() # 面积 area = scrapy.Field() # 地址 address = scrapy.Field() # 房天下详情页面 origin_url = scrapy.Field()
# Downloader middleware: attach a randomly chosen User-Agent header
# -*- coding: utf-8 -*-
import random


class UserAgentDownloadMiddleware(object):
    """Rotate the User-Agent header so outgoing requests look less uniform."""

    # Pool of desktop-browser UA strings; one is drawn at random per request.
    USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]

    def process_request(self, request, spider):
        """Scrapy downloader-middleware hook: stamp a random UA on the request."""
        request.headers['User-Agent'] = random.choice(self.USER_AGENTS)