• 爬取链家任意城市二手房数据(天津)


      1 #!/usr/bin/env python
      2 # -*- coding: utf-8 -*-
      3 # @Time    : 2019-08-16 12:40
      4 # @Author  : Anthony
      5 # @Email   : ianghont7@163.com
      6 # @File    : 爬取链家任意城市二手房数据.py
      7 
      8 
      9 import requests
     10 from lxml import etree
     11 import time
     12 import xlrd
     13 import os
     14 import xlwt
     15 from xlutils.copy import copy
     16 
# Request headers sent with every page fetch; the User-Agent makes the
# scraper present itself as a desktop browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.7 Safari/537.36'
}

# Module-level state shared between functions: if_xls_exits() stores the
# workbook file name under the "xlsName" key and catchHouseDetail() reads it.
xlsInfo = {}
     23 
     24 def catchHouseDetail(url):
     25     # 通过requests模块模拟get请求
     26     page_text = requests.get(url, headers=headers, stream=True)
     27 
     28     # 将互联网上获取的页面数据加载到etree对象中
     29     tree = etree.HTML(page_text.text)
     30 
     31     # 定位页面标签位置装入一个list中
     32     li_list = tree.xpath('//div[@class="leftContent"]/ul/li')
     33     all_house_list = []
     34     # 遍历列表中每一个字段
     35     for li in li_list:
     36         info = []
     37         # info = {}
     38         # info["房屋标题"] = li.xpath('.//div[@class="info clear"]/div[@class="title"]/a/text()')[0]
     39         # info["小区名称"] = li.xpath('.//div[@class="address"]/div[@class="houseInfo"]/text()')[0].split('|')[1]
     40         # info['建筑面积'] = li.xpath('.//div[@class="address"]/div[@class="houseInfo"]/text()')[0].split('|')[2]
     41         # info['房屋朝向'] = li.xpath('.//div[@class="address"]/div[@class="houseInfo"]/text()')[0].split('|')[3]
     42         # info['装修情况'] = li.xpath('.//div[@class="address"]/div[@class="houseInfo"]/text()')[0].split('|')[4]
     43         # info['所在楼层'] = li.xpath('.//div[@class="flood"]/div[@class="positionInfo"]/text()')[0].split(' ')[0]
     44         # info['所在区域'] = li.xpath('.//div[@class="flood"]/div[@class="positionInfo"]/a/text()')[0]
     45         # info['总价'] = li.xpath('.//div[@class="priceInfo"]/div[@class="totalPrice"]/span/text()')[0] + '万'
     46         # info['每平米售价'] = li.xpath('.//div[@class="priceInfo"]/div[@class="unitPrice"]/span/text()')[0]
     47         # info['房屋关注人数'] = li.xpath('.//div[@class="followInfo"]/text()')[0].split('/')[0]
     48         # info['房屋发布时间'] = li.xpath('.//div[@class="followInfo"]/text()')[0].split('/')[1]
     49 
     50         #房屋标题
     51         houseTitle = li.xpath('.//div[@class="info clear"]/div[@class="title"]/a/text()')[0]
     52         #小区名称
     53         houseName = li.xpath('.//div[@class="address"]/div[@class="houseInfo"]/text()')[0].split('|')[1]
     54         #建筑面积
     55         houseArea = li.xpath('.//div[@class="address"]/div[@class="houseInfo"]/text()')[0].split('|')[2]
     56         #房屋朝向
     57         houseTowards = li.xpath('.//div[@class="address"]/div[@class="houseInfo"]/text()')[0].split('|')[3]
     58         #装修情况
     59         houseFinish = li.xpath('.//div[@class="address"]/div[@class="houseInfo"]/text()')[0].split('|')[4]
     60         #所在楼层
     61         houseFloor = li.xpath('.//div[@class="flood"]/div[@class="positionInfo"]/text()')[0].split(' ')[0]
     62         #所在区域
     63         houseSite = li.xpath('.//div[@class="flood"]/div[@class="positionInfo"]/a/text()')[0]
     64         #总价
     65         housePrices = li.xpath('.//div[@class="priceInfo"]/div[@class="totalPrice"]/span/text()')[0] + ''
     66         #每平米售价
     67         houseSquarePrices = li.xpath('.//div[@class="priceInfo"]/div[@class="unitPrice"]/span/text()')[0]
     68         #房屋关注人数
     69         houseFollowers = li.xpath('.//div[@class="followInfo"]/text()')[0].split('/')[0]
     70         #房屋发布时间
     71         houseTime = li.xpath('.//div[@class="followInfo"]/text()')[0].split('/')[1]
     72         info.append(houseTitle)
     73         info.append(houseName)
     74         info.append(houseArea)
     75         info.append(houseTowards)
     76         info.append(houseFinish)
     77         info.append(houseFloor)
     78         info.append(houseSite)
     79         info.append(housePrices)
     80         info.append(houseSquarePrices)
     81         info.append(houseFollowers)
     82         info.append(houseTime)
     83         all_house_list.append(info)
     84     if if_xls_exits() == True:
     85         write_excel_xls_append(xlsInfo["xlsName"],all_house_list)
     86 
     87 
     88 #获取数据写入xls表格中
     89 def write_excel_xls(path, sheet_name, value):
     90     index = len(value)  # 获取需要写入数据的行数
     91     workbook = xlwt.Workbook()  # 新建一个工作簿
     92     sheet = workbook.add_sheet(sheet_name)  # 在工作簿中新建一个表格
     93     for i in range(0, index):
     94         for j in range(0, len(value[i])):
     95             sheet.write(i, j, value[i][j])  # 像表格中写入数据(对应的行和列)
     96     workbook.save(path)  # 保存工作簿
     97     print("xls格式表格写入数据成功!")
     98 
     99 
    100 
    101 def write_excel_xls_append(path, value):
    102     index = len(value)  # 获取需要写入数据的行数
    103     workbook = xlrd.open_workbook(path)  # 打开工作簿
    104     sheets = workbook.sheet_names()  # 获取工作簿中的所有表格
    105     worksheet = workbook.sheet_by_name(sheets[0])  # 获取工作簿中所有表格中的的第一个表格
    106     rows_old = worksheet.nrows  # 获取表格中已存在的数据的行数
    107     new_workbook = copy(workbook)  # 将xlrd对象拷贝转化为xlwt对象
    108     new_worksheet = new_workbook.get_sheet(0)  # 获取转化后工作簿中的第一个表格
    109     for i in range(0, index):
    110         for j in range(0, len(value[i])):
    111             new_worksheet.write(i + rows_old, j, value[i][j])  # 追加写入数据,注意是从i+rows_old行开始写入
    112     new_workbook.save(path)  # 保存工作簿
    113     print("xls格式表格【追加】写入数据成功!")
    114 
    115 
    116 
    117 
    118 def if_xls_exits():
    119     while True:
    120         book_name_xls = '天津链家二手房信息表.xls'
    121         sheet_name_xls = '房屋信息'
    122         value_title = [["房屋标题", "房屋户型", "建筑面积", "房屋朝向", "装修情况", "所在楼层", "所在区域", "总价", "每平米售价", "房屋关注人数", "房屋发布时间"], ]
    123         if os.path.exists('./%s'%book_name_xls):
    124             xlsInfo["xlsName"] = book_name_xls
    125             return True
    126         else:
    127             write_excel_xls(book_name_xls, sheet_name_xls, value_title)
    128             continue
    129 
    130 
    131 
    132 def catch():
    133     pages = ['https://tj.lianjia.com/ershoufang/pg{}/'.format(x) for x in range(1, 1000)]
    134     for page in pages:
    135         try:
    136             info = catchHouseDetail(page)
    137         except:
    138             pass
    139         time.sleep(3)
    140 
    141 
# Start the crawl only when the file is executed as a script, not on import.
if __name__ == '__main__':
    catch()

    效果图:

  • 相关阅读:
    P2155 [SDOI2008]沙拉公主的困惑
    P4345 [SHOI2015]超能粒子炮·改
    乘法逆元
    P1608 路径统计
    P1342 请柬
    一些网址
    20/08/02测试
    ivqBlog 开源博客 (angularjs + express + mongodb)
    angularjs, nodejs, express, gulp, karma, jasmine 前端方案整合
    参照nopCommerce框架开发(NextCMS)
  • 原文地址:https://www.cnblogs.com/ipyanthony/p/11365962.html
Copyright © 2020-2023  润新知