• Python + Selenium 爬取房天下新房详情


    新房详情

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from time import sleep
    import json
    from datetime import datetime
    import re

    option = webdriver.ChromeOptions()
    # Suppress Chrome's "controlled by automated software" banner and the
    # noisy DevTools console logging.
    option.add_experimental_option("excludeSwitches", ['enable-automation', 'enable-logging'])

    # NOTE(review): the original path had its backslash separators stripped
    # ("D:chrome_driver_win32chromedriver.exe"); assumed to be the path below —
    # confirm the actual chromedriver location.
    b = webdriver.Chrome(executable_path=r"D:\chrome_driver_win32\chromedriver.exe",
                         chrome_options=option)

    # Listing-page number to scrape; the "b9{num}" URL segment selects the page.
    num = 1
    base_urls = "https://nanjing.newhouse.fang.com/house/s/b9{}/".format(num)

    # Collect the detail-page URL of every property shown on the listing page.
    b.get(base_urls)
    name = b.find_elements_by_xpath('//*[@class="nl_con clearfix"]/ul/li/div/div[1]/a')
    house_lst = [anchor.get_attribute('href') for anchor in name]

    data_list = []
    for url in house_lst:
        b.get(url)
        data = {}
        # District breadcrumb text ends with a 2-character suffix (e.g. "楼盘");
        # slice it off to keep only the district name.
        quyu = b.find_element_by_xpath(
            '//div[@class="br_left"]//ul[@class="tf f12"]//li[3]/a').text
        data['subarea'] = quyu[:-2]
        data['area'] = b.find_element_by_xpath('//div[@class="s2"]/div/a').text  # current city

        try:
            # Follow the "楼盘详情" (property details) navigation tab.
            fangyuan_url = b.find_element_by_xpath(
                "//*[@class='main_1200 tf']//div[@class='cxfnav']//a[contains(text(),'楼盘详情')]")
            href1 = fangyuan_url.get_attribute('href')
            b.get(href1)

            # Dump every attribute row of the details panel (debug output only;
            # nothing here is stored in `data` yet).
            main_items = b.find_elements_by_xpath('//div[@class="main_1200 tf"]//div[@class="main_1200"]//div[@class="main-cont clearfix"]//div[@class="main-left"]//div[@class="main-item"]')
            for item in main_items:
                # ".//" scopes the XPath to the current element.
                for node in item.find_elements_by_xpath('.//ul//li'):
                    print(node.text)
                    print('-' * 50)

            # Location & surroundings: the map iframe's src page embeds the
            # property coordinates as "mapx"/"mapy" values in its source.
            dingwei_url = b.find_element_by_xpath('//div[@class="mapbox_dt"]/iframe').get_attribute(
                "src")
            b.get(dingwei_url)
            sound_code = b.page_source
            re_search = re.search(r'"mapx":"(.*?)","mapy":"(.*?)"', sound_code, re.DOTALL)
            # Guard against pages where the coordinate pattern is absent —
            # re.search returns None and group() would raise AttributeError.
            if re_search is not None:
                data['housecoord'] = re_search.group(2) + "," + re_search.group(1)

        except Exception as e:
            # Best-effort: a property missing the details tab or the map iframe
            # must not abort the whole crawl, but the failure is reported
            # instead of silently swallowed.
            print('detail scrape failed for {}: {}'.format(url, e))

        data_list.append(data)
        break   # debug: stop after the first property

    print(data_list)

    # Append one JSON object per line (jsonlines format); ensure_ascii=False
    # keeps the Chinese text readable in the output file.
    with open('详情(南京).jsonlines', 'a', encoding='utf8') as f:
        for data in data_list:
            json.dump(data, f, ensure_ascii=False)
            f.write('\n')

    b.quit()

     

  • 相关阅读:
    mysql week 的使用方法
    获取某个周在本年的开始日期和结束日期
    question and answer
    系统安装
    adblock 下载地址
    windows新增/修改/删除系统环境变量bat示例,一键配置JAVA_HOME
    SpringMVC 复杂对象数据绑定
    IntelliJ IDEA default settings 全局默认设置
    Spring整合Ehcache管理缓存
    label的for属性
  • 原文地址:https://www.cnblogs.com/yansc/p/14889490.html
Copyright © 2020-2023  润新知