• Ganji rentals spider (赶集租房)


    #!/usr/bin/env python
    # -*- coding:utf-8 -*-
    
    # Author : zhibo.wang
    # E-mail : d_1206@qq.com
    # Date   : 18/04/24 09:22:58
    # Desc   : Ganji rental listings spider (赶集租房)
    
    import hashlib
    import socket
    import time
    import scrapy
    import copy
    from house_spider.agent import get_agent
    from bs4 import BeautifulSoup
    from utils.time_convert import get_time_stamp
    from utils.conmongodb import mongo_con_keepalive
    
    
    class GanJiSpider(scrapy.Spider):
        name = 'ganji_rent_spider'
        allowed_domains = ['ganji.com']
        custom_settings = {
            "DOWNLOAD_DELAY": 0.4,
        }
    
        def __init__(self):
            self.db = mongo_con_keepalive()
            self.start_data = [{"cityname": i.get("cityname"),
                                "city_code": i.get("ganji_code"),
                                "url": "http://{0}.ganji.com/fang1/m1/".format(i.get("ganji_code"))}
                               for i in self.db.get_collection("para_city_list").find({}, {"_id":0})]
    
            self.efs = self.db.get_collection('pathdir_dict')
            self.ganji_header = get_agent('ganji_header')
            self.path_dir = "realstate/house/ganji/rent/{0}/".format(get_time_stamp())
            self.efs.insert_one({'pathdir': self.path_dir, 'website': 'ganji_rent', 'flag': False})

            super(GanJiSpider, self).__init__()
    
        def start_requests(self):
            reqs = []
            for start_ in self.start_data:
                hd = copy.deepcopy(self.ganji_header[0])
                url = start_.get("url")
                hd['Host'] = url.split("/")[2]
                data = {"cityname": start_.get("cityname"), "city_code": start_.get("city_code"), "url": url, "host": hd['Host']}
                reqs.append(scrapy.Request(url, headers=hd, callback=self.parse_district, meta=data))
    
            return reqs
    
        def parse_district(self, response):
            hd = copy.deepcopy(self.ganji_header[0])
            host = response.request.meta.get("host")
            cityname = response.request.meta.get("cityname")
            city_code = response.request.meta.get("city_code")
            hd['Host'] = host
            hd["Referer"] = response.request.meta.get("url")
            soup = BeautifulSoup(response.text, 'html.parser')
            district_list = soup.find("div", class_="thr-list").find_all("li", class_="item")
            for district_ in district_list:
                url = "http://{0}.ganji.com{1}".format(city_code, district_.find("a").get("href"))
                data = {"cityname": cityname, "city_code": city_code, "url": url, "host": hd['Host'], "page": True}
                yield scrapy.Request(url, headers=hd, callback=self.parse_page, meta=data)

        def parse_page(self, response):
            # process one listing page: queue every detail page, then fan out pagination
    
            hd = copy.deepcopy(self.ganji_header[0])
            host = response.request.meta.get("host")
            cityname = response.request.meta.get("cityname")
            hd['Host'] = host
            hd["Referer"] = response.request.meta.get("url")
            soup = BeautifulSoup(response.text, 'html.parser')
            city_code = response.request.meta.get("city_code")
    
            try:
                house_list = soup.find("div", class_="f-list js-tips-list").find_all("div", class_="f-list-item ershoufang-list")
                # collect the detail-page URL of every listing on this page
                house_datas = [
                    "http://{0}.ganji.com{1}".format(city_code, i.get("href"))
                    for i in house_list
                ]
                for house_data in house_datas:
                    yield scrapy.Request(house_data, headers=hd, callback=self.parse_item)
            except AttributeError:
                self.logger.info("no rental listings on %s", response.url)
    
            if response.request.meta.get("page"):
                try:
                    all_count = int(
                        soup.find("p", class_="m-result f-fr").find("span", class_="num").get_text())
                    end_page = int(all_count / 50)
                    if all_count % 50 != 0:
                        end_page += 1
                    # pagination URLs: m1/ -> m1o2/, m1o3/, ... (only the first
                    # half of the computed page count is requested here)
                    page_urls = [
                        response.request.meta.get("url").replace("m1/", "m1o{0}/".format(i))
                        for i in range(2, int(end_page / 2))]
                    for page_url in page_urls:
                        data = {"cityname": cityname, "city_code": city_code, "url": page_url, "host": hd['Host'], "page": False}
                        yield scrapy.Request(page_url, headers=hd, callback=self.parse_page, meta=data)
                except Exception as e:
                    self.logger.info("no pagination on %s: %s", response.url, e)
    
    
        def parse_item(self, response):
            # store the raw detail page; the source URL is appended as an
            # HTML comment so it can be recovered from the saved file
            fname = '%s_%s_%s_%s.html' % (socket.gethostname(),
                                          response.url.split('//')[-1].split('/')[0].replace('.', '-'),
                                          hashlib.md5(response.url.encode()).hexdigest(),
                                          str(time.time()).split('.')[0])
            doc = response.text + '\n<!--%s-->' % response.url
            return {'path': self.path_dir + fname, 'data': doc}
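
The spider imports three project-local helpers (get_agent, get_time_stamp, mongo_con_keepalive) whose source the post never shows. Below is a minimal sketch of what they might look like, with names and return shapes inferred purely from how the spider calls them; the connection string, database name, and header values are placeholders, not the author's actual configuration.

    import time

    import pymongo


    def get_agent(key):
        # returns a list of ready-made request-header dicts; the spider
        # always takes index 0 and fills in Host/Referer per request
        headers = {
            "ganji_header": [{
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            }],
        }
        return headers[key]


    def get_time_stamp():
        # a sortable timestamp string used as the per-run output directory
        return time.strftime("%Y%m%d%H%M%S")


    def mongo_con_keepalive():
        # returns a pymongo database handle; host and db name are assumptions
        client = pymongo.MongoClient("mongodb://localhost:27017/")
        return client["house"]
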
    # ETL: parse one saved detail page into a structured record.
    # json and the coordinate converters bd09togcj02 / gcj02towgs84 are
    # required here; a sketch of the converters follows this function.
    import json

    from bs4 import BeautifulSoup


    def etl(html):
        soup = BeautifulSoup(html, "html.parser")
        try:
            card_top = soup.find("div", class_="card-top")
            name = card_top.find("p", class_="card-title").find("i").get_text()
            card_pay = card_top.find("ul", class_="card-pay f-clear")
            price_num = card_pay.find("span", class_="num").get_text()
            price_rmb = card_pay.find("span", class_="rmb").get_text()
            price_month = card_pay.find("span", class_="month").get_text()
            er_ = card_top.find("ul", class_="er-list f-clear")
            item_list = er_.find_all("li", class_="item f-fl")
            item = [{i.find("span", class_="t").get_text().strip().replace(":", ""):
                         i.find("span", class_="content").get_text().strip().replace(";", "").replace("\xa0", "")}
                    for i in item_list]
            info_dict = {}
            for i in item:
                for k, v in i.items():
                    info_dict[k] = v
            field_mapping = {
                "户型": "huxing",
                "面积": "buildingArea",
                "朝向": "orientation",
                "楼层": "floor",
                "电梯情况": "is_Elevator",
                "装修情况": "renovation",
                "入住时间": "in_house_date",
                "看房时间": "see_house_date",
            }
            # map Chinese labels to English field names; absent labels become None
            _info = {v: info_dict.get(k) for k, v in field_mapping.items()}

            address_data = card_top.find("ul", class_="er-list-two f-clear")
            add = [i.get_text().replace("\n", "") for i in address_data.find_all("li", class_="er-item f-fl")]
            xiaoqu_name, address = None, None
            for i in add:
                if "小区名称:" in i:
                    xiaoqu_name = i.replace("小区名称:", "").split(" ")[0].strip()
                elif "所在地址:" in i:
                    address = i.replace("所在地址:", "")
            others = soup.find("div", class_="f-main-left f-fl f-w970")
            house_peizhi = ",".join([i.get_text()
                                     for i in others.find("div",
                                                          {"class":"f-group", "id":"js-house-peizhi"}).find_all("p", class_="text")])
            describe = others.find("div",
                                   {"class": "f-group", "id": "js-house-describe"}).find("div", class_="describe").get_text().strip().replace("\n", "")
            try:
                lo = json.loads(others.find("div", class_="col-sub map-wrap").find("div", class_="map-content js-map-tab js-so-map-tab").get("data-ref"))
                lnglat = lo.get("lnglat")
                lng_b, lat_b = [float(i.replace("b", "")) for i in lnglat.split(",")]
            except Exception:
                lng_b, lat_b = None, None
            if lng_b:
                lng_a, lat_a = bd09togcj02(lng_b, lat_b)  # AMap (GCJ-02)
                lng_g, lat_g = gcj02towgs84(lng_a, lat_a)  # GPS (WGS-84)
            else:
                lng_a, lat_a = None, None
                lng_g, lat_g = None, None
            urlPath = soup.find("div", class_="f-crumbs f-w1190").get_text().strip().replace("\n", "")
            try:
                cityname = urlPath.split(">")[0].replace("赶集", "")
            except Exception:
                cityname = None
            end_json = {
                "cityname": cityname,
                "lng_a": lng_a,  # 高德
                "lat_a": lat_a,
                "lng_b": lng_b,  # 百度
                "lat_b": lat_b,
                "lng_g": lng_g,  # GPS
                "lat_g": lat_g,
                "gps_s": "b",
                "urlPath": urlPath,
                "name": name,
                "price": price_num,
                "price_rmb": price_rmb,
                "pricetype": price_month,
                "peitao": house_peizhi,
                "describe": describe,
                "projname": xiaoqu_name,
                "address": address
            }
    
            end_data = end_json.copy()
            end_data.update(_info)
        except Exception:
            end_data = None
        return end_data
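
etl depends on bd09togcj02 and gcj02towgs84, which convert Baidu BD-09 coordinates to GCJ-02 (AMap) and GCJ-02 to WGS-84 (GPS). The post does not include them; the sketch below is the widely circulated public snippet for these transforms and may differ from the exact module the author used.

    import math

    x_pi = 3.14159265358979324 * 3000.0 / 180.0
    pi = 3.1415926535897932384626   # pi
    a = 6378245.0                   # Krasovsky semi-major axis
    ee = 0.00669342162296594323     # first eccentricity squared


    def bd09togcj02(bd_lon, bd_lat):
        # Baidu BD-09 -> GCJ-02 (AMap)
        x = bd_lon - 0.0065
        y = bd_lat - 0.006
        z = math.sqrt(x * x + y * y) - 0.00002 * math.sin(y * x_pi)
        theta = math.atan2(y, x) - 0.000003 * math.cos(x * x_pi)
        return z * math.cos(theta), z * math.sin(theta)


    def gcj02towgs84(lng, lat):
        # GCJ-02 -> WGS-84 (GPS), by subtracting the estimated offset
        dlat = _transformlat(lng - 105.0, lat - 35.0)
        dlng = _transformlng(lng - 105.0, lat - 35.0)
        radlat = lat / 180.0 * pi
        magic = 1 - ee * math.sin(radlat) ** 2
        sqrtmagic = math.sqrt(magic)
        dlat = (dlat * 180.0) / ((a * (1 - ee)) / (magic * sqrtmagic) * pi)
        dlng = (dlng * 180.0) / (a / sqrtmagic * math.cos(radlat) * pi)
        return lng - dlng, lat - dlat


    def _transformlat(lng, lat):
        ret = (-100.0 + 2.0 * lng + 3.0 * lat + 0.2 * lat * lat +
               0.1 * lng * lat + 0.2 * math.sqrt(math.fabs(lng)))
        ret += (20.0 * math.sin(6.0 * lng * pi) + 20.0 * math.sin(2.0 * lng * pi)) * 2.0 / 3.0
        ret += (20.0 * math.sin(lat * pi) + 40.0 * math.sin(lat / 3.0 * pi)) * 2.0 / 3.0
        ret += (160.0 * math.sin(lat / 12.0 * pi) + 320 * math.sin(lat * pi / 30.0)) * 2.0 / 3.0
        return ret


    def _transformlng(lng, lat):
        ret = (300.0 + lng + 2.0 * lat + 0.1 * lng * lng +
               0.1 * lng * lat + 0.1 * math.sqrt(math.fabs(lng)))
        ret += (20.0 * math.sin(6.0 * lng * pi) + 20.0 * math.sin(2.0 * lng * pi)) * 2.0 / 3.0
        ret += (20.0 * math.sin(lng * pi) + 40.0 * math.sin(lng / 3.0 * pi)) * 2.0 / 3.0
        ret += (150.0 * math.sin(lng / 12.0 * pi) + 300.0 * math.sin(lng / 30.0 * pi)) * 2.0 / 3.0
        return ret

The document below is a sample of the final stored record. Note that crawlTime, recordBatchNo, and province do not come from etl itself; they are evidently added by a later batch step.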
    

      

    {
        "crawlTime" : "2018-04-23 17:46:14",
        "recordBatchNo" : "17",
        "province" : "广东省",
        "cityname" : "云浮 ",
        "lng_a" : 112.03291232909547,
        "lat_a" : 22.931326338916666,
        "lng_b" : 112.039476574,
        "lat_b" : 22.9370670779,
        "lng_g" : 112.02761893741206,
        "lat_g" : 22.934005392048782,
        "gps_s" : "b",
        "urlPath" : "云浮赶集 > 云浮房产 > 云浮租房 > 云城租房",
        "name" : "富临花园交通便利物业很好",
        "price" : "700",
        "price_rmb" : "¥",
        "pricetype" : "/月",
        "peitao" : "电视,空调,热水器,洗衣机,冰箱,床,沙发,衣柜,暖气,宽带网,可做饭,独立阳台,独卫",
        "describe" : "147家电齐全 有空调 洗衣机 电视机(富临花园) 热水器,带一个阳台,视野无遮挡。",
        "projname" : "富临花园",
        "address" : null,
        "huxing" : "1室1厅1卫",
        "buildingArea" : "整租51㎡",
        "orientation" : "南北向",
        "floor" : "低层/共20层",
        "is_Elevator" : "有电梯",
        "renovation" : "精装修",
        "in_house_date" : "2018-04-09",
        "see_house_date" : "周六/日"
    }
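
For illustration, the saved pages could be re-parsed in bulk with etl. Everything here is hypothetical scaffolding: the glob pattern simply mirrors the realstate/house/ganji/rent/<timestamp>/ layout that path_dir encodes.

    import glob
    import json

    # hypothetical batch job: re-parse every saved page and print the records
    for path in glob.glob("realstate/house/ganji/rent/*/*.html"):
        with open(path, encoding="utf-8") as f:
            record = etl(f.read())
        if record:
            print(json.dumps(record, ensure_ascii=False))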
    

    This tutorial is for technical study and research only; if anything here infringes your rights, contact the author for removal.

  • Original article: https://www.cnblogs.com/dockers/p/9238454.html