• 房天下新房和二手房


    # 爬虫文件
    # -*- coding: utf-8 -*-
    import scrapy
    import re
    from soufangwang.items import NewHouseItem,SecondhandHouseItem
    
    
    class FangspiderSpider(scrapy.Spider):
        """Crawl fang.com: discover every city from the directory page, then
        scrape new-house and second-hand-house listings for each city.

        Yields NewHouseItem / SecondhandHouseItem instances.
        """
        name = 'fangSpider'
        allowed_domains = ['fang.com']
        start_urls = ['https://www.fang.com/SoufunFamily.html']

        def parse(self, response):
            """Parse the city directory and schedule per-city listing requests."""
            trs = response.xpath("//div[@class='outCont']//tr")
            province = None
            for tr in trs:
                # Only td cells WITHOUT a class attribute carry province/city data
                # (the class='font01' cells are decorative headers).
                tds = tr.xpath(".//td[not(@class)]")
                province_td = tds[0]
                province_text = province_td.xpath('.//text()').get()
                # Strip ALL whitespace. Fix: the pattern must be r'\s'
                # (the original r's' would delete every letter 's' instead).
                province_text = re.sub(r'\s', '', province_text)
                if province_text:
                    # Province cells use rowspan, so remember the last non-empty one.
                    province = province_text

                # Skip the "overseas" section.
                if province == '其它':
                    continue

                city_td = tds[1]
                for city_link in city_td.xpath('.//a'):
                    city = city_link.xpath('.//text()').get()
                    city_url = city_link.xpath('.//@href').get()

                    # city_url looks like "https://bj.fang.com/"; rebuild it into
                    # the new-house and second-hand subdomain URLs.
                    parts = city_url.split(".")
                    if parts[0].endswith('bj'):
                        # Beijing is special-cased: it uses the bare subdomains.
                        newhouse_url = "https://newhouse.fang.com/house/s/"
                        secondhand_url = "https://esf.fang.com/"
                    else:
                        newhouse_url = parts[0] + ".newhouse." + parts[1] + "." + parts[2] + "house/s/"
                        # second-hand (esf) listing url
                        secondhand_url = parts[0] + ".esf." + parts[1] + "." + parts[2]
                    print(city, '新房链接:', newhouse_url)
                    print(city, '二手房链接:', secondhand_url)

                    yield scrapy.Request(url=newhouse_url, callback=self.parse_newhouse,
                                         meta={'info': (province, city, newhouse_url)})
                    yield scrapy.Request(url=secondhand_url, callback=self.parse_esf,
                                         dont_filter=True,
                                         meta={'info': (province, city, secondhand_url)})

        # Parse one page of new-house listings.
        def parse_newhouse(self, response):
            """Extract NewHouseItem records from a new-house search page and
            follow pagination via the newhouse_url carried in request meta."""
            province, city, newhouse_url = response.meta.get('info')
            li_list = response.xpath('//div[contains(@class,"nl_con")]/ul/li')
            for li in li_list:
                try:
                    name = li.xpath('.//div[@class="nlcd_name"]/a/text()').get().strip()
                    house_type_list = li.xpath('.//div[contains(@class,"house_type")]//text()').getall()
                    house_type_list = [t.replace("\n", "").replace("\t", "") for t in house_type_list]
                    # The cell reads like "3居/4居—90~120平米": rooms and area are
                    # separated by an em dash — TODO confirm separator on live pages.
                    house_type_str = ''.join(house_type_list).strip().split("—")
                    house_type = house_type_str[0]
                    area = house_type_str[1]
                    address_list = li.xpath('.//div[@class="address"]/a/text()').getall()
                    # Index 1 holds the street address (index 0 is the district link text).
                    address = [t.replace("\n", "").replace("\t", "") for t in address_list][1]
                    district = li.xpath('.//div[@class="address"]/a/span/text()').get().strip()
                    # District is rendered as "[朝阳]": drop the surrounding brackets.
                    district = district[1:-1]
                    sale = li.xpath('.//div[@class="fangyuan"]/span[1]/text()').get()
                    price_parts = li.xpath('.//div[@class="nhouse_price"]//text()').getall()
                    price = "".join(t.replace("\n", "").replace("\t", "") for t in price_parts)
                    origin_url = li.xpath('.//div[@class="nlcd_name"]/a/@href').get()
                    # hrefs are protocol-relative ("//newhouse.fang.com/...").
                    origin_url = "https:" + origin_url
                    yield NewHouseItem(province=province, city=city, name=name, price=price,
                                       rooms=house_type, area=area, address=address,
                                       district=district, sale=sale, origin_url=origin_url)
                except (AttributeError, IndexError):
                    # Malformed / advertisement card — skip it, keep crawling.
                    continue

            # Pagination: the "next" link's last path segment (e.g. "b92") is
            # appended to the base new-house URL.
            next_url = response.xpath('//div[@class="page"]//a[@class="next"]/@href').get()
            if next_url:
                next_url = newhouse_url + next_url.split("/")[-2]
                yield scrapy.Request(url=next_url, callback=self.parse_newhouse,
                                     meta={'info': (province, city, newhouse_url)})


        # Parse one page of second-hand (esf) listings.
        def parse_esf(self, response):
            """Extract SecondhandHouseItem records from an esf search page and
            follow pagination via the secondhand_url carried in request meta."""
            province, city, secondhand_url = response.meta.get('info')
            dls = response.xpath('//div[@class="main945 floatl"]/div[@class="shop_list shop_list_4"]/dl[@dataflag="bg"]')

            for dl in dls:
                name = dl.xpath('.//h4[@class="clearfix"]//span[@class="tit_shop"]/text()').get()
                price_parts = dl.xpath('.//dd[@class="price_right"]/span//text()').getall()
                if not price_parts:
                    continue
                # Last text node is the per-square-meter unit price.
                unit = price_parts[-1]
                price = "".join(t.replace("\n", "").replace("\t", "") for t in price_parts[:-1])
                room_info = dl.xpath('.//p[@class="tel_shop"]/text()').getall()
                room_info = [t.replace("\n", "").replace("\r", "").strip() for t in room_info][:5]
                # Fix: initialize every field so none is unbound when the layout
                # differs (the original crashed on NameError for 'year').
                rooms = area = floor = toward = year = None
                if room_info and room_info[0] == "独栋":
                    # Detached houses list "独栋" first and omit the build year.
                    floor = room_info[0]
                    rooms = room_info[1]
                    area = room_info[3]
                    toward = room_info[4]
                else:
                    try:
                        rooms = room_info[0]
                        area = room_info[1]
                        floor = room_info[2]
                        toward = room_info[3]
                        year = room_info[4]
                    except IndexError:
                        # Listing exposes fewer fields than usual; keep what we got.
                        pass

                address = dl.xpath('.//p[@class="add_shop"]/span/text()').get()
                origin_url = dl.xpath('.//h4[@class="clearfix"]/a/@href').get()
                # secondhand_url ends with "/", the href starts with one: drop one.
                origin_url = secondhand_url[:-1] + origin_url

                yield SecondhandHouseItem(province=province, city=city, name=name, price=price,
                                          unit=unit, rooms=rooms, area=area, floor=floor,
                                          toward=toward, year=year, address=address,
                                          origin_url=origin_url)

            # Pagination. Fix: the original yielded the Request OUTSIDE this
            # guard, producing url=None on the last page.
            next_url = response.xpath('//div[@class="page_al"]/p/a/@href').get()
            if next_url:
                next_url = secondhand_url[:-1] + next_url
                yield scrapy.Request(url=next_url, callback=self.parse_esf,
                                     meta={'info': (province, city, secondhand_url)})
    # 管道文件
    
    from scrapy.exporters import JsonLinesItemExporter
    
    class SoufangwangPipeline(object):
        """Persist scraped items as JSON lines, one output file per item type."""

        def __init__(self):
            # Binary mode: JsonLinesItemExporter writes encoded bytes itself.
            self.newhouse_fp = open('newhouse.json', 'wb')
            self.esfhouse_fp = open('esfhouse.json', 'wb')

            self.newhouse_exporter = JsonLinesItemExporter(
                self.newhouse_fp, ensure_ascii=False
            )
            self.esfhouse_exporter = JsonLinesItemExporter(
                self.esfhouse_fp, ensure_ascii=False
            )

        def process_item(self, item, spider):
            # Fix: the original exported EVERY item to BOTH files, duplicating
            # each record in each output. Route by the item's class instead.
            # (Matched by name to avoid importing the items module here.)
            if type(item).__name__ == 'NewHouseItem':
                self.newhouse_exporter.export_item(item)
            else:
                self.esfhouse_exporter.export_item(item)
            return item

        def close_spider(self, spider):
            # Flush and release both output files when the spider finishes.
            self.newhouse_fp.close()
            self.esfhouse_fp.close()
    # item文件
    # -*- coding: utf-8 -*-
    
    # Define here the models for your scraped items
    #
    # See documentation in:
    # https://doc.scrapy.org/en/latest/topics/items.html
    
    import scrapy
    
    
    class NewHouseItem(scrapy.Item):
        """A single new-house listing scraped from newhouse.fang.com."""
        # province the city belongs to
        province = scrapy.Field()
        # city name
        city = scrapy.Field()
        # name of the residential complex
        name = scrapy.Field()
        # listed price text
        price = scrapy.Field()
        # room layout (e.g. "3居/4居")
        rooms = scrapy.Field()
        # floor area
        area = scrapy.Field()
        # street address
        address = scrapy.Field()
        # administrative district
        district = scrapy.Field()
        # sale status (on sale / sold out)
        sale = scrapy.Field()
        # fang.com detail-page url
        origin_url = scrapy.Field()
    
    
    class SecondhandHouseItem(scrapy.Item):
        """A single second-hand (esf) listing scraped from esf.fang.com."""
        # province the city belongs to
        province = scrapy.Field()
        # city name
        city = scrapy.Field()
        # name of the residential complex
        name = scrapy.Field()
        # total price text
        price = scrapy.Field()
        # per-square-meter unit price
        unit = scrapy.Field()
        # room layout (e.g. "3室2厅")
        rooms = scrapy.Field()
        # floor description
        floor = scrapy.Field()
        # orientation the unit faces
        toward = scrapy.Field()
        # year of construction
        year = scrapy.Field()
        # floor area
        area = scrapy.Field()
        # street address
        address = scrapy.Field()
        # fang.com detail-page url
        origin_url = scrapy.Field()
    # 中间件  设置随机请求头
    # -*- coding: utf-8 -*-
    
    import random
    
    class UserAgentDownloadMiddleware(object):
        """Downloader middleware that stamps each outgoing request with a
        User-Agent picked at random from a fixed pool of browser strings."""

        # Pool of desktop-browser User-Agent strings to rotate through.
        USER_AGENTS = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
            "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
            "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
            "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
            "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
            "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
            "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
            "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
            "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
            "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
            "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
        ]

        def process_request(self, request, spider):
            # Rotate identities per request; returning None lets Scrapy
            # continue processing the request through the remaining middlewares.
            request.headers['User-Agent'] = random.choice(self.USER_AGENTS)
            return None
  • 相关阅读:
    web(零)---tornado使用
    web(一)----tornado nginx配置
    pb- 使用
    Python排序算法之直接插入排序
    敏捷测试中发现的一些问题及改进办法
    加密算法
    共享内存与存储映射(mmap)
    mysql索引的性能分析
    mysql索引
    Innodb的存储及缓存
  • 原文地址:https://www.cnblogs.com/kenD/p/11123690.html
Copyright © 2020-2023  润新知