• Python + Selenium 爬取房天下新房详情


    新房详情

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from time import sleep
    import json
    from datetime import datetime
    import re

    option = webdriver.ChromeOptions()
    # Suppress Chrome's "controlled by automated software" banner and the
    # noisy DevTools console logging.
    option.add_experimental_option("excludeSwitches", ['enable-automation', 'enable-logging'])

    # NOTE(review): the original path had its backslash separators stripped
    # ("D:chrome_driver_win32chromedriver.exe"); assumed to be the path below —
    # confirm the actual chromedriver location.
    b = webdriver.Chrome(executable_path=r"D:\chrome_driver_win32\chromedriver.exe",
                         chrome_options=option)

    # Listing-page number to scrape; the "b9{num}" URL segment selects the page.
    num = 1
    base_urls = "https://nanjing.newhouse.fang.com/house/s/b9{}/".format(num)

    # Collect the detail-page URL of every property shown on the listing page.
    b.get(base_urls)
    name = b.find_elements_by_xpath('//*[@class="nl_con clearfix"]/ul/li/div/div[1]/a')
    house_lst = [anchor.get_attribute('href') for anchor in name]

    data_list = []
    for url in house_lst:
        b.get(url)
        data = {}
        # District breadcrumb text ends with a 2-character suffix (e.g. "楼盘");
        # slice it off to keep only the district name.
        quyu = b.find_element_by_xpath(
            '//div[@class="br_left"]//ul[@class="tf f12"]//li[3]/a').text
        data['subarea'] = quyu[:-2]
        data['area'] = b.find_element_by_xpath('//div[@class="s2"]/div/a').text  # current city

        try:
            # Follow the "楼盘详情" (property details) navigation tab.
            fangyuan_url = b.find_element_by_xpath(
                "//*[@class='main_1200 tf']//div[@class='cxfnav']//a[contains(text(),'楼盘详情')]")
            href1 = fangyuan_url.get_attribute('href')
            b.get(href1)

            # Dump every attribute row of the details panel (debug output only;
            # nothing here is stored in `data` yet).
            main_items = b.find_elements_by_xpath('//div[@class="main_1200 tf"]//div[@class="main_1200"]//div[@class="main-cont clearfix"]//div[@class="main-left"]//div[@class="main-item"]')
            for item in main_items:
                # ".//" scopes the XPath to the current element.
                for node in item.find_elements_by_xpath('.//ul//li'):
                    print(node.text)
                    print('-' * 50)

            # Location & surroundings: the map iframe's src page embeds the
            # property coordinates as "mapx"/"mapy" values in its source.
            dingwei_url = b.find_element_by_xpath('//div[@class="mapbox_dt"]/iframe').get_attribute(
                "src")
            b.get(dingwei_url)
            sound_code = b.page_source
            re_search = re.search(r'"mapx":"(.*?)","mapy":"(.*?)"', sound_code, re.DOTALL)
            # Guard against pages where the coordinate pattern is absent —
            # re.search returns None and group() would raise AttributeError.
            if re_search is not None:
                data['housecoord'] = re_search.group(2) + "," + re_search.group(1)

        except Exception as e:
            # Best-effort: a property missing the details tab or the map iframe
            # must not abort the whole crawl, but the failure is reported
            # instead of silently swallowed.
            print('detail scrape failed for {}: {}'.format(url, e))

        data_list.append(data)
        break   # debug: stop after the first property

    print(data_list)

    # Append one JSON object per line (jsonlines format); ensure_ascii=False
    # keeps the Chinese text readable in the output file.
    with open('详情(南京).jsonlines', 'a', encoding='utf8') as f:
        for data in data_list:
            json.dump(data, f, ensure_ascii=False)
            f.write('\n')

    b.quit()

     

  • 相关阅读:
    mysql week 的使用方法
    获取某个周在本年的开始日期和结束日期
    question and answer
    系统安装
    adblock 下载地址
    windows新增/修改/删除系统环境变量bat示例,一键配置JAVA_HOME
    SpringMVC 复杂对象数据绑定
    IntelliJ IDEA default settings 全局默认设置
    Spring整合Ehcache管理缓存
    label的for属性
  • 原文地址:https://www.cnblogs.com/yansc/p/14889490.html
Copyright © 2020-2023  润新知