• Ganji rentals spider (赶集租房)


    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    
    # Author : zhibo.wang
    # E-mail : d_1206@qq.com
    # Date   : 18/04/24 09:22:58
    # Desc   : Ganji rental listings spider (赶集租房)
    
    import hashlib
    import socket
    import time
    import scrapy
    import copy
    from house_spider.agent import get_agent
    from bs4 import BeautifulSoup
    from utils.time_convert import get_time_stamp
    from utils.conmongodb import mongo_con_keepalive
    
    
    class GanJiSpider(scrapy.Spider):
        name = 'ganji_rent_spider'
        allowed_domains = ['ganji.com']
        custom_settings = {
            "DOWNLOAD_DELAY": 0.4,
        }
    
        def __init__(self):
            self.db = mongo_con_keepalive()
            self.start_data = [{"cityname": i.get("cityname"),
                                "city_code": i.get("ganji_code"),
                                "url": "http://{0}.ganji.com/fang1/m1/".format(i.get("ganji_code"))}
                               for i in self.db.get_collection("para_city_list").find({}, {"_id":0})]
    
            self.efs = self.db.get_collection('pathdir_dict')
            self.ganji_header = get_agent('ganji_header')
            self.path_dir = "realstate/house/ganji/rent/{0}/".format(get_time_stamp())
            self.efs.insert_one({'pathdir': self.path_dir, 'website': 'ganji_rent', 'flag': False})

            super(GanJiSpider, self).__init__()
    
        def start_requests(self):
            reqs = []
            for start_ in self.start_data:
                hd = copy.deepcopy(self.ganji_header[0])
                url = start_.get("url")
                hd['Host'] = url.split("/")[2]
                data = {"cityname": start_.get("cityname"), "city_code": start_.get("city_code"), "url": url, "host": hd['Host']}
                reqs.append(scrapy.Request(url, headers=hd, callback=self.parse_district, meta=data))
    
            return reqs
    
        def parse_district(self, response):
            hd = copy.deepcopy(self.ganji_header[0])
            host = response.request.meta.get("host")
            cityname = response.request.meta.get("cityname")
            city_code = response.request.meta.get("city_code")
            hd['Host'] = host
            hd["Referer"] = response.request.meta.get("url")
            soup = BeautifulSoup(response.text, 'html.parser')
            district_list = soup.find("div", class_="thr-list").find_all("li", class_="item")
            for district_ in district_list:
                url = "http://{0}.ganji.com{1}".format(city_code, district_.find("a").get("href"))
                data = {"cityname": cityname, "city_code": city_code, "url": url, "host": hd['Host'], "page": True}
                yield scrapy.Request(url, headers=hd, callback=self.parse_page, meta=data)

        def parse_page(self, response):
            # process one listing page: queue every detail page, then fan out pagination
    
            hd = copy.deepcopy(self.ganji_header[0])
            host = response.request.meta.get("host")
            cityname = response.request.meta.get("cityname")
            hd['Host'] = host
            hd["Referer"] = response.request.meta.get("url")
            soup = BeautifulSoup(response.text, 'html.parser')
            city_code = response.request.meta.get("city_code")
    
            try:
                house_list = soup.find("div", class_="f-list js-tips-list").find_all("div", class_="f-list-item ershoufang-list")
                # collect the detail-page URL of every listing on this page
                house_datas = [
                    "http://{0}.ganji.com{1}".format(city_code, i.get("href"))
                    for i in house_list
                ]
                for house_data in house_datas:
                    yield scrapy.Request(house_data, headers=hd, callback=self.parse_item)
            except AttributeError:
                self.logger.info("no rental listings on %s", response.url)
    
            if response.request.meta.get("page"):
                try:
                    all_count = int(
                        soup.find("p", class_="m-result f-fr").find("span", class_="num").get_text())
                    end_page = int(all_count / 50)
                    if all_count % 50 != 0:
                        end_page += 1
                    # pagination URLs: m1/ -> m1o2/, m1o3/, ... (only the first
                    # half of the computed page count is requested here)
                    page_urls = [
                        response.request.meta.get("url").replace("m1/", "m1o{0}/".format(i))
                        for i in range(2, int(end_page / 2))]
                    for page_url in page_urls:
                        data = {"cityname": cityname, "city_code": city_code, "url": page_url, "host": hd['Host'], "page": False}
                        yield scrapy.Request(page_url, headers=hd, callback=self.parse_page, meta=data)
                except Exception as e:
                    self.logger.info("no pagination on %s: %s", response.url, e)
    
    
        def parse_item(self, response):
            # store the raw detail page; the source URL is appended as an
            # HTML comment so it can be recovered from the saved file
            fname = '%s_%s_%s_%s.html' % (socket.gethostname(),
                                          response.url.split('//')[-1].split('/')[0].replace('.', '-'),
                                          hashlib.md5(response.url.encode()).hexdigest(),
                                          str(time.time()).split('.')[0])
            doc = response.text + '\n<!--%s-->' % response.url
            return {'path': self.path_dir + fname, 'data': doc}
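
The spider imports three project-local helpers (get_agent, get_time_stamp, mongo_con_keepalive) whose source the post never shows. Below is a minimal sketch of what they might look like, with names and return shapes inferred purely from how the spider calls them; the connection string, database name, and header values are placeholders, not the author's actual configuration.

    import time

    import pymongo


    def get_agent(key):
        # returns a list of ready-made request-header dicts; the spider
        # always takes index 0 and fills in Host/Referer per request
        headers = {
            "ganji_header": [{
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            }],
        }
        return headers[key]


    def get_time_stamp():
        # a sortable timestamp string used as the per-run output directory
        return time.strftime("%Y%m%d%H%M%S")


    def mongo_con_keepalive():
        # returns a pymongo database handle; host and db name are assumptions
        client = pymongo.MongoClient("mongodb://localhost:27017/")
        return client["house"]
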
    # ETL: parse one saved detail page into a structured record.
    # json and the coordinate converters bd09togcj02 / gcj02towgs84 are
    # required here; a sketch of the converters follows this function.
    import json

    from bs4 import BeautifulSoup


    def etl(html):
        soup = BeautifulSoup(html, "html.parser")
        try:
            card_top = soup.find("div", class_="card-top")
            name = card_top.find("p", class_="card-title").find("i").get_text()
            card_pay = card_top.find("ul", class_="card-pay f-clear")
            price_num = card_pay.find("span", class_="num").get_text()
            price_rmb = card_pay.find("span", class_="rmb").get_text()
            price_month = card_pay.find("span", class_="month").get_text()
            er_ = card_top.find("ul", class_="er-list f-clear")
            item_list = er_.find_all("li", class_="item f-fl")
            item = [{i.find("span", class_="t").get_text().strip().replace(":", ""):
                         i.find("span", class_="content").get_text().strip().replace(";", "").replace("\xa0", "")}
                    for i in item_list]
            info_dict = {}
            for i in item:
                for k, v in i.items():
                    info_dict[k] = v
            field_mapping = {
                "户型": "huxing",
                "面积": "buildingArea",
                "朝向": "orientation",
                "楼层": "floor",
                "电梯情况": "is_Elevator",
                "装修情况": "renovation",
                "入住时间": "in_house_date",
                "看房时间": "see_house_date",
            }
            # map Chinese labels to English field names; absent labels become None
            _info = {v: info_dict.get(k) for k, v in field_mapping.items()}

            address_data = card_top.find("ul", class_="er-list-two f-clear")
            add = [i.get_text().replace("\n", "") for i in address_data.find_all("li", class_="er-item f-fl")]
            xiaoqu_name, address = None, None
            for i in add:
                if "小区名称:" in i:
                    xiaoqu_name = i.replace("小区名称:", "").split(" ")[0].strip()
                elif "所在地址:" in i:
                    address = i.replace("所在地址:", "")
            others = soup.find("div", class_="f-main-left f-fl f-w970")
            house_peizhi = ",".join([i.get_text()
                                     for i in others.find("div",
                                                          {"class":"f-group", "id":"js-house-peizhi"}).find_all("p", class_="text")])
            describe = others.find("div",
                                   {"class": "f-group", "id": "js-house-describe"}).find("div", class_="describe").get_text().strip().replace("\n", "")
            try:
                lo = json.loads(others.find("div", class_="col-sub map-wrap").find("div", class_="map-content js-map-tab js-so-map-tab").get("data-ref"))
                lnglat = lo.get("lnglat")
                lng_b, lat_b = [float(i.replace("b", "")) for i in lnglat.split(",")]
            except Exception:
                lng_b, lat_b = None, None
            if lng_b:
                lng_a, lat_a = bd09togcj02(lng_b, lat_b)  # AMap (GCJ-02)
                lng_g, lat_g = gcj02towgs84(lng_a, lat_a)  # GPS (WGS-84)
            else:
                lng_a, lat_a = None, None
                lng_g, lat_g = None, None
            urlPath = soup.find("div", class_="f-crumbs f-w1190").get_text().strip().replace("\n", "")
            try:
                cityname = urlPath.split(">")[0].replace("赶集", "")
            except Exception:
                cityname = None
            end_json = {
                "cityname": cityname,
                "lng_a": lng_a,  # 高德
                "lat_a": lat_a,
                "lng_b": lng_b,  # 百度
                "lat_b": lat_b,
                "lng_g": lng_g,  # GPS
                "lat_g": lat_g,
                "gps_s": "b",
                "urlPath": urlPath,
                "name": name,
                "price": price_num,
                "price_rmb": price_rmb,
                "pricetype": price_month,
                "peitao": house_peizhi,
                "describe": describe,
                "projname": xiaoqu_name,
                "address": address
            }
    
            end_data = end_json.copy()
            end_data.update(_info)
        except Exception:
            end_data = None
        return end_data
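
etl depends on bd09togcj02 and gcj02towgs84, which convert Baidu BD-09 coordinates to GCJ-02 (AMap) and GCJ-02 to WGS-84 (GPS). The post does not include them; the sketch below is the widely circulated public snippet for these transforms and may differ from the exact module the author used.

    import math

    x_pi = 3.14159265358979324 * 3000.0 / 180.0
    pi = 3.1415926535897932384626   # pi
    a = 6378245.0                   # Krasovsky semi-major axis
    ee = 0.00669342162296594323     # first eccentricity squared


    def bd09togcj02(bd_lon, bd_lat):
        # Baidu BD-09 -> GCJ-02 (AMap)
        x = bd_lon - 0.0065
        y = bd_lat - 0.006
        z = math.sqrt(x * x + y * y) - 0.00002 * math.sin(y * x_pi)
        theta = math.atan2(y, x) - 0.000003 * math.cos(x * x_pi)
        return z * math.cos(theta), z * math.sin(theta)


    def gcj02towgs84(lng, lat):
        # GCJ-02 -> WGS-84 (GPS), by subtracting the estimated offset
        dlat = _transformlat(lng - 105.0, lat - 35.0)
        dlng = _transformlng(lng - 105.0, lat - 35.0)
        radlat = lat / 180.0 * pi
        magic = 1 - ee * math.sin(radlat) ** 2
        sqrtmagic = math.sqrt(magic)
        dlat = (dlat * 180.0) / ((a * (1 - ee)) / (magic * sqrtmagic) * pi)
        dlng = (dlng * 180.0) / (a / sqrtmagic * math.cos(radlat) * pi)
        return lng - dlng, lat - dlat


    def _transformlat(lng, lat):
        ret = (-100.0 + 2.0 * lng + 3.0 * lat + 0.2 * lat * lat +
               0.1 * lng * lat + 0.2 * math.sqrt(math.fabs(lng)))
        ret += (20.0 * math.sin(6.0 * lng * pi) + 20.0 * math.sin(2.0 * lng * pi)) * 2.0 / 3.0
        ret += (20.0 * math.sin(lat * pi) + 40.0 * math.sin(lat / 3.0 * pi)) * 2.0 / 3.0
        ret += (160.0 * math.sin(lat / 12.0 * pi) + 320 * math.sin(lat * pi / 30.0)) * 2.0 / 3.0
        return ret


    def _transformlng(lng, lat):
        ret = (300.0 + lng + 2.0 * lat + 0.1 * lng * lng +
               0.1 * lng * lat + 0.1 * math.sqrt(math.fabs(lng)))
        ret += (20.0 * math.sin(6.0 * lng * pi) + 20.0 * math.sin(2.0 * lng * pi)) * 2.0 / 3.0
        ret += (20.0 * math.sin(lng * pi) + 40.0 * math.sin(lng / 3.0 * pi)) * 2.0 / 3.0
        ret += (150.0 * math.sin(lng / 12.0 * pi) + 300.0 * math.sin(lng / 30.0 * pi)) * 2.0 / 3.0
        return ret

The document below is a sample of the final stored record. Note that crawlTime, recordBatchNo, and province do not come from etl itself; they are evidently added by a later batch step.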
    

      

    {
        "crawlTime" : "2018-04-23 17:46:14",
        "recordBatchNo" : "17",
        "province" : "广东省",
        "cityname" : "云浮 ",
        "lng_a" : 112.03291232909547,
        "lat_a" : 22.931326338916666,
        "lng_b" : 112.039476574,
        "lat_b" : 22.9370670779,
        "lng_g" : 112.02761893741206,
        "lat_g" : 22.934005392048782,
        "gps_s" : "b",
        "urlPath" : "云浮赶集 > 云浮房产 > 云浮租房 > 云城租房",
        "name" : "富临花园交通便利物业很好",
        "price" : "700",
        "price_rmb" : "¥",
        "pricetype" : "/月",
        "peitao" : "电视,空调,热水器,洗衣机,冰箱,床,沙发,衣柜,暖气,宽带网,可做饭,独立阳台,独卫",
        "describe" : "147家电齐全 有空调 洗衣机 电视机(富临花园) 热水器,带一个阳台,视野无遮挡。",
        "projname" : "富临花园",
        "address" : null,
        "huxing" : "1室1厅1卫",
        "buildingArea" : "整租51㎡",
        "orientation" : "南北向",
        "floor" : "低层/共20层",
        "is_Elevator" : "有电梯",
        "renovation" : "精装修",
        "in_house_date" : "2018-04-09",
        "see_house_date" : "周六/日"
    }
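
For illustration, the saved pages could be re-parsed in bulk with etl. Everything here is hypothetical scaffolding: the glob pattern simply mirrors the realstate/house/ganji/rent/<timestamp>/ layout that path_dir encodes.

    import glob
    import json

    # hypothetical batch job: re-parse every saved page and print the records
    for path in glob.glob("realstate/house/ganji/rent/*/*.html"):
        with open(path, encoding="utf-8") as f:
            record = etl(f.read())
        if record:
            print(json.dumps(record, ensure_ascii=False))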
    

    This tutorial is for technical study and research only; if anything here infringes your rights, contact the author for removal.

  • Original article: https://www.cnblogs.com/dockers/p/9238454.html