• python 爬取链家


    import json
    
    import  requests
    from lxml import etree
    from time import sleep
    
    
    # First listing page; fetched once up front to read the pagination info.
    url = "https://sz.lianjia.com/ershoufang/rs/"
    headers = {
        # NOTE(review): left empty in the original; presumably a real browser
        # User-Agent string should be filled in before running -- confirm.
        "User-Agent":"",
        # The standard HTTP request header is spelled "Referer"; the original
        # key "Refer" is not a header servers recognise.
        "Referer":"https://sz.lianjia.com/ershoufang/pg2/"
    }
    # A timeout keeps the script from blocking forever on a stalled connection.
    resp = requests.get(url, headers=headers, timeout=10)
    
    # URL template for page N of the listing (filled in with str.format).
    base_url = "https://sz.lianjia.com/ershoufang/pg{}/"
    html = etree.HTML(resp.text)
    # The pagination element carries a JSON string in its `page-data` attribute.
    data = html.xpath('//*[@id="content"]//div[@class="page-box fr"]/div/@page-data')
    if not data:
        # Same exception type as the original bare data[0], but with a message
        # that says what was missing.
        raise IndexError("pagination 'page-data' attribute not found on " + url)
    data = json.loads(data[0])
    totalPage = data['totalPage']
    curPage = data['curPage']
    
    def get_data(url):
        """Fetch one listing page and return its entries as a list of dicts.

        Each dict has the keys ``face``, ``title``, ``position``,
        ``house_info``, ``follow_info``, ``price`` and ``unit_price``; a
        ``tag`` key is added only when the entry has at least one tag.
        A field whose XPath matches nothing is stored as ``None`` instead of
        raising ``IndexError``. ``<li>`` elements without a title are skipped.

        The request is sent with the module-level ``headers``.

        :param url: URL of the listing page to fetch.
        :return: list of dicts, one per listing entry found on the page.
        """
        def first(nodes):
            # XPath queries return a list; an empty list means "no match".
            return nodes[0] if nodes else None

        # A timeout keeps one stalled page from hanging the whole crawl.
        resp = requests.get(url, headers=headers, timeout=10)
        html = etree.HTML(resp.text)

        results = []
        for li in html.xpath('.//ul[@class="sellListContent"]/li'):
            title = first(li.xpath('.//div[@class="title"]/a/text()'))
            if title is None:
                # NOTE(review): presumably a non-listing item (e.g. a promo
                # slot) inside the same <ul> -- confirm against the live page.
                continue
            content = {
                "face": first(li.xpath('./a/img/@src')),
                "title": title,
                "position": first(li.xpath('.//div[@class="positionInfo"]/a/text()')),
                "house_info": first(li.xpath('.//div[@class="houseInfo"]/text()')),
                "follow_info": first(li.xpath('.//div[@class="followInfo"]/text()')),
                "price": first(li.xpath('.//div[@class="priceInfo"]/div[@class="totalPrice"]/span/text()')),
                "unit_price": first(li.xpath('.//div[@class="priceInfo"]/div[@class="unitPrice"]/span/text()')),
            }
            tag = li.xpath('.//div[@class="tag"]//span/text()')
            if tag:
                # Only the first tag is kept, as in the original.
                content['tag'] = tag[0]
            results.append(content)
        return results
    
    # Crawl every page reported by the pagination data and collect all entries.
    totalList = []
    for i in range(1,totalPage+1):
        url = base_url.format(i)
        print("crawl url  " + url)
        cur_list = get_data(url)
        print(cur_list)
        # extend() appends in place; `totalList + cur_list` re-copied every
        # previously collected entry on each page.
        totalList.extend(cur_list)
    
    # NOTE(review): `url` is reset to page 1 here but is not read again in the
    # visible code; kept so the module-level state matches the original.
    url = base_url.format(1)
    
    print(totalList)

  • 相关阅读:
    HTML screenX 事件属性
    CSS Display(显示) 与 Visibility(可见性)
    JavaScript手册 | JS Array 对象中的sort() 方法
    IntelliJ IDEA收费版本Ultimate的安装和破解
    ASP.NET Style 控件
    HTML area hreflang 属性
    Shell test 命令
    ftplib (Internet) – Python 中文开发手册
    Java 之 Collection 接口
    java 之 集合概述
  • 原文地址:https://www.cnblogs.com/brady-wang/p/12491105.html
Copyright © 2020-2023  润新知