# Spider file
# -*- coding: utf-8 -*-
import scrapy
import re
from soufangwang.items import NewHouseItem, SecondhandHouseItem


class FangspiderSpider(scrapy.Spider):
    """Crawl fang.com: the province/city index page, then each city's
    new-house (newhouse) and second-hand (esf) listing pages."""
    name = 'fangSpider'
    allowed_domains = ['fang.com']
    start_urls = ['https://www.fang.com/SoufunFamily.html']

    def parse(self, response):
        """Parse the province/city index and schedule per-city requests."""
        trs = response.xpath("//div[@class='outCont']//tr")
        province = None
        for tr in trs:
            # td cells WITHOUT a class attribute hold the province/city data
            tds = tr.xpath(".//td[not(@class)]")
            province_td = tds[0]
            province_text = province_td.xpath('.//text()').get()
            # BUG FIX: original used re.sub(r's', ...) which deleted the
            # letter 's'; r'\s' strips whitespace as the comment intended.
            # `or ''` guards against a missing text node (re.sub rejects None).
            province_text = re.sub(r'\s', '', province_text or '')
            if province_text:
                province = province_text
            # Skip the overseas section ("其它" = "other")
            if province == '其它':
                continue
            city_td = tds[1]
            city_links = city_td.xpath('.//a')
            for city_link in city_links:
                city = city_link.xpath('.//text()').get()
                city_url = city_link.xpath('.//@href').get()
                # Split the city domain to rebuild subdomain-based URLs
                new_url = city_url.split(".")
                if new_url[0].endswith('bj'):
                    # Beijing uses fixed subdomains instead of a city prefix
                    newhouse_url = "https://newhouse.fang.com/house/s/"
                    secondhand_url = "https://esf.fang.com/"
                else:
                    # New-house listing URL
                    newhouse_url = new_url[0] + ".newhouse." + new_url[1] + "." + new_url[2] + "house/s/"
                    # Second-hand (esf) listing URL
                    secondhand_url = new_url[0] + ".esf." + new_url[1] + "." + new_url[2]
                print(city, '新房链接:', newhouse_url)
                print(city, '二手房链接:', secondhand_url)
                yield scrapy.Request(url=newhouse_url,
                                     callback=self.parse_newhouse,
                                     meta={'info': (province, city, newhouse_url)})
                yield scrapy.Request(url=secondhand_url,
                                     callback=self.parse_esf,
                                     dont_filter=True,
                                     meta={'info': (province, city, secondhand_url)})

    def parse_newhouse(self, response):
        """Parse one page of new-house listings and follow pagination."""
        try:
            # province/city/base-url passed along via request meta
            province, city, newhouse_url = response.meta.get('info')
            li_list = response.xpath('//div[contains(@class,"nl_con")]/ul/li')
            for li in li_list:
                try:
                    name = li.xpath('.//div[@class="nlcd_name"]/a/text()').get().strip()
                    house_type_list = li.xpath('.//div[contains(@class,"house_type")]//text()').getall()
                    # NOTE(review): the two replace() targets render identically
                    # here; one was presumably a non-breaking/full-width space —
                    # confirm against the live page markup.
                    house_type_list = list(map(lambda x: x.replace(" ", "").replace(" ", ""), house_type_list))
                    house_type_str = ''.join(house_type_list).strip().split("-")
                    house_type = house_type_str[0]
                    area = house_type_str[1]
                    address = li.xpath('.//div[@class="address"]/a/text()').getall()
                    address = list(map(lambda x: x.replace(" ", "").replace(" ", ""), address))[1]
                    district = li.xpath('.//div[@class="address"]/a/span/text()').get().strip()
                    district = district[1:-1]  # strip the surrounding brackets
                    sale = li.xpath('.//div[@class="fangyuan"]/span[1]/text()').get()
                    price = li.xpath('.//div[@class="nhouse_price"]//text()').getall()
                    price = list(map(lambda x: x.replace(" ", "").replace(" ", ""), price))
                    price = "".join(price)
                    origin_url = li.xpath('.//div[@class="nlcd_name"]/a/@href').get()
                    origin_url = "https:" + origin_url  # hrefs are scheme-relative
                    item = NewHouseItem(province=province, city=city, name=name, price=price,
                                        rooms=house_type, area=area, address=address,
                                        district=district, sale=sale, origin_url=origin_url)
                    yield item
                except (AttributeError, IndexError, TypeError):
                    # Malformed listing (ad slot / missing fields): skip this <li>
                    # instead of the original bare `except: pass`.
                    continue
            # Pagination
            next_url = response.xpath('//div[@class="page"]//a[@class="next"]/@href').get()
            if next_url:
                next_url = next_url.split("/")[-2]
                yield scrapy.Request(url=newhouse_url + next_url,
                                     callback=self.parse_newhouse,
                                     meta={'info': (province, city, newhouse_url)})
        except Exception:
            # Page without the expected structure — keep best-effort behavior,
            # but no longer a bare `except:` (which also swallowed SystemExit).
            print("无房子信息")

    def parse_esf(self, response):
        """Parse one page of second-hand (esf) listings and follow pagination."""
        try:
            # province/city/base-url passed along via request meta
            province, city, secondhand_url = response.meta.get('info')
            dls = response.xpath('//div[@class="main945 floatl"]/div[@class="shop_list shop_list_4"]/dl[@dataflag="bg"]')
            for dl in dls:
                name = dl.xpath('.//h4[@class="clearfix"]//span[@class="tit_shop"]/text()').get()
                price = dl.xpath('.//dd[@class="price_right"]/span//text()').getall()
                unit = price[-1]  # last text node carries the per-m2 unit price
                price = list(map(lambda x: x.replace(" ", "").replace(" ", ""), price))[:-1]
                price = "".join(price)
                room_info = dl.xpath('.//p[@class="tel_shop"]/text()').getall()
                room_info = list(map(lambda x: x.replace(" ", "").replace(" ", "").strip(), room_info))[:5]
                # BUG FIX: initialise all layout fields up front.  The original
                # left `year` unbound in the "独栋" branch (and every field
                # unbound on short rows), so building the item raised NameError
                # and the outer except silently dropped the whole page.
                rooms = area = floor = toward = year = None
                if room_info and room_info[0] == "独栋":
                    # "独栋" (detached house) rows use a different field order
                    rooms = room_info[1]
                    area = room_info[3]
                    floor = room_info[0]
                    toward = room_info[4]
                elif room_info:
                    try:
                        rooms = room_info[0]
                        area = room_info[1]
                        floor = room_info[2]
                        toward = room_info[3]
                        year = room_info[4]
                    except IndexError:
                        pass  # some listings omit trailing fields; keep what we have
                address = dl.xpath('.//p[@class="add_shop"]/span/text()').get()
                origin_url = dl.xpath('.//h4[@class="clearfix"]/a/@href').get()
                # base url ends with '/', href starts with '/': drop one
                origin_url = secondhand_url[:-1] + origin_url
                item = SecondhandHouseItem(province=province, city=city, name=name, price=price,
                                           unit=unit, rooms=rooms, area=area, floor=floor,
                                           toward=toward, year=year, address=address,
                                           origin_url=origin_url)
                yield item
            # Pagination
            next_url = response.xpath('//div[@class="page_al"]/p/a/@href').get()
            if next_url:
                yield scrapy.Request(url=secondhand_url[:-1] + next_url,
                                     callback=self.parse_esf,
                                     meta={'info': (province, city, secondhand_url)})
        except Exception:
            print("本页没有房源信息")
# Pipeline file
from scrapy.exporters import JsonLinesItemExporter

from soufangwang.items import NewHouseItem


class SoufangwangPipeline(object):
    """Write new-house and second-hand items to separate JSON-lines files."""

    def __init__(self):
        # Binary mode: JsonLinesItemExporter writes encoded bytes.
        self.newhouse_fp = open('newhouse.json', 'wb')
        self.esfhouse_fp = open('esfhouse.json', 'wb')
        self.newhouse_exporter = JsonLinesItemExporter(
            self.newhouse_fp, ensure_ascii=False
        )
        self.esfhouse_exporter = JsonLinesItemExporter(
            self.esfhouse_fp, ensure_ascii=False
        )

    def process_item(self, item, spider):
        # BUG FIX: the original exported every item to BOTH files, so each
        # record was duplicated into the wrong output.  Route by item type.
        if isinstance(item, NewHouseItem):
            self.newhouse_exporter.export_item(item)
        else:
            self.esfhouse_exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # Release both file handles when the spider finishes.
        self.newhouse_fp.close()
        self.esfhouse_fp.close()
# item文件 # -*- coding: utf-8 -*- # Define here the models for your scraped items # # See documentation in: # https://doc.scrapy.org/en/latest/topics/items.html import scrapy class NewHouseItem(scrapy.Item): # define the fields for your item here like: # 省份 province = scrapy.Field() # 城市 city = scrapy.Field() # 小区名字 name = scrapy.Field() # 价格 price = scrapy.Field() # 几居室, 是个列表 rooms = scrapy.Field() # 面积 area = scrapy.Field() # 地址 address = scrapy.Field() # 行政区 district = scrapy.Field() # 是否在售 sale = scrapy.Field() # 房天下详情页面 origin_url = scrapy.Field() class SecondhandHouseItem(scrapy.Item): # define the fields for your item here like: # 省份 province = scrapy.Field() # 城市 city = scrapy.Field() # 小区名字 name = scrapy.Field() # 价格 price = scrapy.Field() # 单价 unit = scrapy.Field() # 几居室, 是个列表 rooms = scrapy.Field() # 层 floor = scrapy.Field() # 朝向 toward = scrapy.Field() # 年代 year = scrapy.Field() # 面积 area = scrapy.Field() # 地址 address = scrapy.Field() # 房天下详情页面 origin_url = scrapy.Field()
# Downloader middleware: attach a randomly chosen User-Agent header
# -*- coding: utf-8 -*-
import random


class UserAgentDownloadMiddleware(object):
    """Rotate the User-Agent header so outgoing requests look less uniform."""

    # Pool of desktop-browser UA strings; one is drawn at random per request.
    USER_AGENTS = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]

    def process_request(self, request, spider):
        """Scrapy downloader-middleware hook: stamp a random UA on the request."""
        request.headers['User-Agent'] = random.choice(self.USER_AGENTS)