• Python爬取小猪短租全网数据


    爬取时需要进行的操作:

    1:输入你是要爬取国内的还是海外的,1表示国内,2表示海外;
    2:然后输入你要爬取的城市名称,就可以了;

    每个函数的功能:

    choose_area函数根据你输入的是国内还是海外,输出不同的区域名称
    url_list函数根据你输入的城市名称,查询该城市有多少房源,从而判断有几页数据;因为小猪短租网最多只显示13页数据,超过13页的话也只显示13页,所以需要做个判断
    get_url函数根据你输入的城市名称和页码,来构建你输入的城市每页的url

    get_html函数就是获取每页的html数据
    get_zf_url函数根据每页的html数据来爬取每个租房的url链接
    get_zf_message函数,通过传入的租房url链接来获取每个租房的信息

    '''
    获取每个城市的url
    '''
    import  re
    import requests
    
    # Endpoint whose response text contains the `citys[...]=new Array(...)`
    # entries that choose_area() parses.
    url = 'http://jci.xiaozhustatic1.com/e17061601/xzjs?k=Front_Search&httphost=bj.xiaozhu.com'     # link used to obtain the city names
    
    # Both statements below run as top-level statements: the user is prompted
    # first, then the city data is downloaded.
    ser = input('输入你要查找的地区(1:国内;2:海外):')            # region choice: '1' = domestic, '2' = overseas (compared as a string in choose_area)
    html = requests.get(url).text       # raw response text that choose_area() scans for city entries
    
    def choose_area():
        """Yield one ``{city: [pinyin, listing_count]}`` dict per city in the chosen region.

        Reads two module-level values: ``html`` (the downloaded city-data text)
        and ``ser`` (``'1'`` for domestic, ``'2'`` for overseas). An entry that
        contains a ``digits:digits`` token is treated as an overseas city, an
        entry without one as a domestic city. Any other ``ser`` value yields
        nothing.

        Yields:
            dict: a single-key dict mapping the Chinese city name to
            ``[pinyin_slug, listing_count]``; both values are strings.

        Raises:
            IndexError: if a selected entry lacks a Chinese name, a second
                lowercase word, or a number.
        """
        # Raw strings with explicit escapes. Without the backslashes these
        # patterns collapse into plain character classes (e.g. ``[0-9]d*``
        # matches a digit followed by the letter "d") and never match the data.
        entry_re = re.compile(r'citys\[[0-9]\d*\]=new Array(.*?);')
        time_re = re.compile(r'[0-9]\d*:[0-9]\d*')
        name_re = re.compile(r'[\u4E00-\u9FA5]+')
        slug_re = re.compile(r'[a-z]\w*')
        count_re = re.compile(r'[0-9]\d*')

        if ser not in ('1', '2'):
            return
        want_overseas = ser == '2'

        # The first 29 entries are skipped
        # (presumably they are not city records - TODO confirm against the feed).
        for city_name in entry_re.findall(html)[29:]:
            # NOTE(review): the digits:digits token looks like a time/offset
            # field that only overseas entries carry - confirm.
            is_overseas = bool(time_re.search(city_name))
            if is_overseas != want_overseas:
                continue
            city = name_re.findall(city_name)[0]        # Chinese city name
            # Second lowercase word is used as the pinyin slug; the first one
            # is presumably a short code - verify against the feed.
            city_jc = slug_re.findall(city_name)[1]
            city_zf = count_re.findall(city_name)[0]    # number of listings
            yield {city: [city_jc, city_zf]}
    
    def get_url(city_jc,page):
        """Return the search-result URL for a city slug and a page number.

        Args:
            city_jc: value placed in the sub-domain position of the URL.
            page: value placed after ``-p`` in the URL path.
        """
        template = 'http://{}.xiaozhu.com/search-duanzufang-p{}-0/'
        return template.format(city_jc, page)
    
    # Build the search-page URLs for one city. The page count is derived from
    # the city's listing count and capped, because the site only shows the
    # first 13 result pages.
    def url_list(city_name, per_page=24, max_pages=13):
        """Yield the search-page URLs to crawl for ``city_name``.

        Looks the city up in the dicts produced by ``choose_area()`` and
        yields one URL (built by ``get_url``) per result page.

        Args:
            city_name: Chinese city name, matched against the dict keys.
            per_page: listings assumed per result page (default 24).
            max_pages: highest page number that is generated (default 13).

        Yields:
            str: page URLs for pages ``1 .. min(total_pages, max_pages)``.
            Nothing is yielded when the city is not found.
        """
        for city in choose_area():
            if city_name not in city:
                continue
            city_jc, city_zf = city[city_name]
            # Ceiling division, so a final partially filled page is still
            # crawled and cities with fewer than ``per_page`` listings get
            # page 1.
            total_pages = -(-int(city_zf) // per_page)
            for page in range(1, min(total_pages, max_pages) + 1):
                yield get_url(city_jc, page)
    import requests
    from lxml import etree
    import re
    
    # Request headers sent by get_html(); only the user-agent is overridden.
    _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36'
    headers = {'user-agent': _USER_AGENT}
    
    def get_html(url, timeout=10):
        """Fetch ``url`` and return its decoded text, or ``None`` on failure.

        Args:
            url: address to request; the module-level ``headers`` are sent.
            timeout: seconds passed to ``requests.get`` so that a stalled
                connection cannot block the crawl forever.

        Returns:
            str | None: the response body decoded with the encoding detected
            by ``response.apparent_encoding``; ``None`` when the request
            fails or the status code is not 200 (a message is printed).
        """
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            # Network failures follow the same "print and return None" path
            # as a non-200 response.
            print('没有获取到HTML', exc)
            return None
        if response.status_code != 200:
            print('没有获取到HTML')
            return None
        # Use the detected encoding instead of the header-declared one.
        response.encoding = response.apparent_encoding
        return response.text
    
    def get_zf_url(url):
        """Return the listing links found on one search-result page.

        Args:
            url: search-result page URL, fetched via ``get_html``.

        Returns:
            list: the ``href`` values of the anchors under the element with
            id ``page_list``; an empty list when the page could not be
            fetched.
        """
        html = get_html(url)
        if not html:
            # get_html() returns None when the download failed; parsing that
            # would raise and abort the whole crawl, so report "no links".
            return []
        return etree.HTML(html).xpath('//*[@id="page_list"]/ul/li/a/@href')
    
    def get_zf_message(zf_url):
        """Scrape one listing page and return its details as a dict.

        Args:
            zf_url: URL of a single listing page, fetched via ``get_html``.

        Returns:
            dict: keys '标题', '价格', '地址', '图片', '房东', '房东链接',
            each holding the first match of its XPath or regex.

        Raises:
            IndexError: when one of the lookups finds nothing on the page.
        """
        page = get_html(zf_url)
        tree = etree.HTML(page)     # parse once and reuse for every XPath lookup

        def first(expr):
            # First XPath hit; raises IndexError when there is none.
            return tree.xpath(expr)[0]

        address = first('//div[@class="pho_info"]/p/@title')
        image = first('//*[@id="curBigImage"]/@src')
        # The landlord anchor is read with regexes on the raw HTML; the
        # original author noted that lxml could not retrieve the landlord name.
        landlord = re.findall('<a class="lorder_name" href=".*?" title="(.*?)" target="_blank">.*?</a>', page)[0]
        landlord_link = re.findall('<a class="lorder_name" href="(.*?)" title=".*?" target="_blank">.*?</a>', page)[0]
        price = first('//*[@id="pricePart"]/div[1]/span/text()')
        title = first('//div[@class="pho_info"]/h4/em/text()')
        return {
            '标题': title,
            '价格': price,
            '地址': address,
            '图片': image,
            '房东': landlord,
            '房东链接': landlord_link,
        }
    from spider_ziaozu import *
    from get_url import *
    
    
    if __name__ == '__main__':
        # Ask for a city, walk each of its search-result pages, then print
        # the details of every listing found on those pages.
        target_city = input('输入你想爬取的城市名称:')
        for page_url in url_list(target_city):
            print(page_url)
            # get_zf_url returns the listing links of this page as a list.
            for listing_url in get_zf_url(page_url):
                print(listing_url)
                try:
                    print(get_zf_message(listing_url))
                except Exception as err:
                    # One bad listing must not stop the crawl: report it
                    # and move on to the next one.
                    print(err)

    下面是城市的数据:

    domestic_list = [
    {'北京': ['beijing', '8221']},
    {'上海': ['shanghai', '6996']},
    {'广州': ['guangzhou', '2727']},
    {'成都': ['chengdu', '5369']},
    {'深圳': ['shenzhen', '2522']},
    {'西安': ['xian', '2562']},
    {'南京': ['nanjing', '1675']},
    {'杭州': ['hangzhou', '2455']},
    {'重庆': ['chongqing', '3171']},
    {'武汉': ['wuhan', '1901']},
    {'苏州': ['suzhou', '1603']},
    {'无锡': ['wuxi', '240']},
    {'青岛': ['qingdao', '3712']},
    {'厦门': ['xiamen', '1548']},
    {'三亚': ['sanya', '2384']},
    {'大连': ['dalian', '1034']},
    {'哈尔滨': ['haerbin', '790']},
    {'秦皇岛': ['qinhuangdao', '1924']},
    {'天津': ['tianjin', '485']},
    {'昆明': ['kunming', '819']},
    {'香港': ['xianggang', '164']},
    {'长春': ['changchun', '350']},
    {'沈阳': ['shenyang', '562']},
    {'合肥': ['hefei', '533']},
    {'郑州': ['zhengzhou', '370']},
    {'太原': ['taiyuan', '470']},
    {'威海': ['weihai', '821']},
    {'丽江': ['lijiang', '632']},
    {'大理': ['dali', '576']},
    {'桂林': ['guilin', '518']},
    {'澳门': ['aomen', '33']},
    {'福州': ['fuzhou', '410']},
    {'宁波': ['ningbo', '233']},
    {'珠海': ['zhuhai', '595']},
    {'长沙': ['changsha', '727']},
    {'石家庄': ['shijiazhuang', '288']},
    {'拉萨': ['lasa', '47']},
    {'常州': ['changzhou', '137']},
    {'扬州': ['yangzhou', '224']},
    {'东莞': ['dongguan', '70']},
    {'海口': ['haikou', '518']},
    {'兰州': ['lanzhou', '198']},
    {'洛阳': ['luoyang', '195']},
    {'乌鲁木齐': ['wulumuqi', '213']},
    {'徐州': ['xuzhou', '51']},
    {'贵阳': ['guiyang', '503']},
    {'呼和浩特': ['huhehaote', '82']},
    {'济南': ['jinan', '375']},
    {'唐山': ['tangshan', '102']},
    {'保定': ['baoding', '83']},
    {'南昌': ['nanchang', '206']},
    {'邯郸': ['handan', '12']},
    {'南宁': ['nanning', '168']},
    {'潍坊': ['weifang', '65']},
    {'锦州': ['jinzhou', '54']},
    {'日照': ['rizhao', '508']},
    {'临沂': ['linyi', '41']},
    {'鞍山': ['anshan', '23']},
    {'廊坊': ['langfang', '101']},
    {'大庆': ['daqing', '29']},
    {'北海': ['beihai', '436']},
    {'中山': ['zhongshan', '70']},
    {'西宁': ['xining', '362']},
    {'金华': ['jinhua', '71']},
    {'丹东': ['dandong', '181']},
    {'承德': ['chengde', '437']},
    {'盘锦': ['panjin', '35']},
    {'淄博': ['zibo', '30']},
    {'株洲': ['zhuzhou', '17']},
    {'佛山': ['foshan', '127']},
    {'吉林': ['jilinshi', '50']},
    {'邢台': ['xingtai', '9']},
    {'齐齐哈尔': ['qiqihaer', '8']},
    {'宜昌': ['yichang', '42']},
    {'大同': ['datong', '83']},
    {'烟台': ['yantai', '803']},
    {'银川': ['yinchuan', '76']},
    {'温州': ['wenzhou', '52']},
    {'淮安': ['huaian', '37']},
    {'绵阳': ['mianyang', '121']},
    {'包头': ['baotou', '40']},
    {'抚顺': ['fushun', '5']},
    {'泰安': ['taian', '103']},
    {'济宁': ['jining', '11']},
    {'连云港': ['lianyungang', '33']},
    {'泉州': ['quanzhou', '95']},
    {'安阳': ['anyang', '24']},
    {'惠州': ['huizhou', '537']},
    {'葫芦岛': ['huludao', '595']},
    {'嘉兴': ['jiaxing', '405']},
    {'南通': ['nantong', '143']},
    {'攀枝花': ['panzhihua', '15']},
    {'柳州': ['liuzhou', '19']},
    {'东营': ['dongying', '1']},
    {'佳木斯': ['jiamusi', '5']},
    {'通辽': ['tongliao', '5']},
    {'德州': ['dezhou', '22']},
    {'赣州': ['ganzhou', '6']},
    {'滨州': ['binzhou', '3']},
    {'咸阳': ['xianyang', '23']},
    {'江门': ['jiangmen', '17']},
    {'漳州': ['zhangzhou', '84']},
    {'新乡': ['xinxiang', '8']},
    {'襄樊': ['xiangfan', '4']},
    {'南充': ['nanchong', '29']},
    {'聊城': ['liaocheng', '17']},
    {'张家口': ['zhangjiakou', '196']},
    {'沧州': ['cangzhou', '22']},
    {'石河子': ['shihezi', '4']},
    {'宝鸡': ['baoji', '5']},
    {'赤峰': ['chifeng', '22']},
    {'湛江': ['zhanjiang', '41']},
    {'商丘': ['shangqiu', '5']},
    {'平顶山': ['pingdingshan', '4']},
    {'信阳': ['xinyang', '13']},
    {'九江': ['jiujiang', '29']},
    {'营口': ['yingkou', '500']},
    {'本溪': ['benxi', '6']},
    {'钦州': ['qinzhou', '2']},
    {'衡阳': ['hengyang', '19']},
    {'汕头': ['shantou', '63']},
    {'芜湖': ['wuhu', '18']},
    {'呼伦贝尔': ['hulunbeier', '124']},
    {'湘潭': ['xiangtan', '11']},
    {'朝阳市': ['chaoyang', '2']},
    {'清远': ['qingyuan', '137']},
    {'遂宁': ['suining', '6']},
    {'泰州': ['jstaizhou', '6']},
    {'莆田': ['putian', '3']},
    {'枣庄': ['zaozhuang', '14']},
    {'泸州': ['luzhou', '52']},
    {'舟山': ['zhoushan', '304']},
    {'镇江': ['zhenjiang', '19']},
    {'开封': ['kaifeng', '130']},
    {'鄂尔多斯': ['eerduosi', '3']},
    {'十堰': ['shiyan', '14']},
    {'延边': ['yanbian', '75']},
    {'淮北': ['huaibei', '5']},
    {'临汾': ['linfen', '21']},
    {'常德': ['changde', '5']},
    {'荆州': ['jingzhou', '3']},
    {'郴州': ['chenzhou', '46']},
    {'德阳': ['deyang', '13']},
    {'绍兴': ['shaoxing', '33']},
    {'南阳': ['nanyang', '7']},
    {'菏泽': ['heze', '1']},
    {'台州': ['zjtaizhou', '36']},
    {'遵义': ['zunyi', '7']},
    {'阜新': ['fuxin', '2']},
    {'盐城': ['yancheng', '6']},
    {'宿迁': ['suqian', '2']},
    {'焦作': ['jiaozuo', '16']},
    {'长治': ['changzhi', '26']},
    {'吉安': ['jian', '14']},
    {'驻马店': ['zhumadian', '1']},
    {'汉中': ['hanzhong', '28']},
    {'河源': ['heyuan', '11']},
    {'铁岭': ['tieling', '2']},
    {'晋中': ['jinzhong', '63']},
    {'安康': ['ankang', '4']},
    {'岳阳': ['yueyang', '13']},
    {'肇庆': ['zhaoqing', '15']},
    {'衡水': ['hengshui', '21']},
    {'牡丹江': ['mudanjiang', '24']},
    {'安庆': ['anqing', '11']},
    {'黄冈': ['huanggang', '2']},
    {'娄底': ['loudi', '3']},
    {'乐山': ['leshan', '187']},
    {'蚌埠': ['bengbu', '14']},
    {'昌吉': ['changji', '1']},
    {'韶关': ['shaoguan', '28']},
    {'阳江': ['yangjiang', '87']},
    {'潮州': ['chaozhou', '15']},
    {'张家界': ['zhangjiajie', '171']},
    {'怀化': ['huaihua', '2']},
    {'西双版纳': ['xishuangbanna', '141']},
    {'三明': ['sanming', '9']},
    {'运城': ['yuncheng', '15']},
    {'眉山': ['meishan', '17']},
    {'许昌': ['xuchang', '11']},
    {'防城港': ['fangchenggang', '16']},
    {'永州': ['yongzhou', '1']},
    {'益阳': ['yiyang', '5']},
    {'上饶': ['shangrao', '45']},
    {'衢州': ['quzhou', '1']},
    {'六盘水': ['liupanshui', '10']},
    {'白山': ['baishan', '37']},
    {'六安': ['luan', '1']},
    {'铜陵': ['tongling', '1']},
    {'池州': ['chizhou', '5']},
    {'晋城': ['jincheng', '3']},
    {'黄石': ['huangshi', '10']},
    {'湘西': ['xiangxi', '24']},
    {'宜春': ['jxyichun', '18']},
    {'茂名': ['maoming', '1']},
    {'梅州': ['meizhou', '2']},
    {'凉山': ['liangshan', '330']},
    {'宜宾': ['yibin', '22']},
    {'湖州': ['huzhou', '296']},
    {'海拉尔': ['hailaer', '2']},
    {'延安': ['yanan', '5']},
    {'内江': ['neijiang', '9']},
    {'南平': ['nanping', '11']},
    {'三门峡': ['sanmenxia', '2']},
    {'松原': ['songyuan', '5']},
    {'阜阳': ['fuyang', '3']},
    {'黄山': ['huangshan', '105']},
    {'巴彦淖尔': ['bayannaoer', '1']},
    {'渭南': ['weinan', '8']},
    {'咸宁': ['xianning', '12']},
    {'恩施': ['enshi', '29']},
    {'抚州': ['jxfuzhou', '5']},
    {'龙岩': ['longyan', '13']},
    {'通化': ['tonghua', '18']},
    {'莱芜': ['laiwu', '1']},
    {'宣城': ['xuancheng', '8']},
    {'锡林郭勒': ['xilinguole', '18']},
    {'景德镇': ['jingdezhen', '21']},
    {'曲靖': ['qujing', '3']},
    {'广元': ['guangyuan', '9']},
    {'巴中': ['bazhong', '5']},
    {'济源': ['jiyuan', '3']},
    {'鹤岗': ['hegang', '2']},
    {'黑河': ['heihe', '6']},
    {'吕梁': ['lvliang', '3']},
    {'天水': ['tianshui', '11']},
    {'榆林': ['sxyulin', '4']},
    {'萍乡': ['pingxiang', '4']},
    {'哈密': ['hami', '7']},
    {'自贡': ['zigong', '23']},
    {'阿坝': ['aba', '260']},
    {'宁德': ['ningde', '20']},
    {'马鞍山': ['maanshan', '1']},
    {'阿拉善': ['alashan', '12']},
    {'阳泉': ['yangquan', '2']},
    {'新余': ['xinyu', '1']},
    {'喀什': ['kashi', '2']},
    {'黔西南': ['qianxinan', '14']},
    {'鸡西': ['jixi', '1']},
    {'伊春': ['hljyichun', '24']},
    {'大兴安岭': ['daxinganling', '5']},
    {'宿州': ['ahsuzhou', '2']},
    {'梧州': ['wuzhou', '2']},
    {'阿克苏': ['akesu', '1']},
    {'汕尾': ['shanwei', '12']},
    {'广安': ['guangan', '4']},
    {'资阳': ['ziyang', '3']},
    {'安顺': ['anshun', '7']},
    {'黔东南': ['qiandongnan', '15']},
    {'七台河': ['qitaihe', '1']},
    {'河池': ['hechi', '4']},
    {'张掖': ['zhangye', '27']},
    {'酒泉': ['jiuquan', '113']},
    {'陇南': ['longnan', '4']},
    {'神农架': ['shennongjia', '12']},
    {'克拉玛依': ['kelamayi', '4']},
    {'伊犁': ['yili', '19']},
    {'雅安': ['yaan', '8']},
    {'甘孜': ['ganzi', '94']},
    {'丽水': ['lishui', '39']},
    {'瓦房店': ['wafangdian', '2']},
    {'武夷山': ['wuyishan', '34']},
    {'亳州': ['bozhou', '1']},
    {'贺州': ['hezhou', '6']},
    {'石嘴山': ['shizuishan', '1']},
    {'中卫': ['zhongwei', '20']},
    {'平凉': ['pingliang', '1']},
    {'铜川': ['tongchuan', '3']},
    {'昭通': ['zhaotong', '1']},
    {'巴音郭楞': ['bayinguoleng', '2']},
    {'日喀则': ['rikaze', '4']},
    {'铜仁': ['tongren', '6']},
    {'忻州': ['xinzhou', '15']},
    {'吴忠': ['wuzhong', '1']},
    {'玉树': ['yushu', '1']},
    {'海西': ['haixi', '11']},
    {'玉溪': ['yuxi', '11']},
    {'红河': ['honghe', '7']},
    {'德宏': ['dehong', '8']},
    {'吐鲁番': ['tulufan', '2']},
    {'黔南': ['qiannan', '9']},
    {'张北': ['zhangbei', '3']},
    {'鹤壁': ['hebi', '1']},
    {'五指山': ['wuzhishan', '4']},
    {'兴安': ['xingan', '6']},
    {'嘉峪关': ['jiayuguan', '20']},
    {'商洛': ['shangluo', '1']},
    {'海东': ['haidong', '5']},
    {'海北': ['haibei', '17']},
    {'随州': ['suizhou', '1']},
    {'保山': ['baoshan', '25']},
    {'楚雄': ['chuxiong', '2']},
    {'普洱': ['puer', '3']},
    {'文山': ['wenshan', '1']},
    {'迪庆': ['diqing', '14']},
    {'和田': ['hetian', '1']},
    {'阿拉尔': ['alaer', '1']},
    {'文昌': ['wenchang', '39']},
    {'琼海': ['qionghai', '30']},
    {'儋州': ['danzhou', '1']},
    {'万宁': ['wanning', '17']},
    {'东方': ['dongfang', '5']},
    {'安定': ['anding', '1']},
    {'澄迈': ['chengmai', '7']},
    {'临高': ['lingao', '1']},
    {'白沙': ['baisha', '2']},
    {'昌江': ['changjiang', '10']},
    {'乐东': ['ledong', '5']},
    {'陵水': ['lingshui', '60']},
    {'保亭': ['baoting', '2']},
    {'琼中': ['qiongzhong', '1']},
    {'长白山': ['changbaishan', '113']},
    {'台北': ['taibei', '14']},
    {'新北': ['xinbei', '4']},
    {'台中': ['taizhong', '9']},
    {'高雄': ['gaoxiong', '2']},
    {'新竹': ['xinzhu', '1']},
    {'嘉义': ['jiayi', '1']},
    {'花莲乡': ['hualianxiang', '20']},
    {'台东县': ['taidongxian', '1']},
    {'澎湖县': ['penghuxian', '1']},
    ]
    
    overseas_list = [
    {'墨尔本': ['moerben', '4']},
    {'悉尼': ['xini', '3']},
    {'维多利亚': ['weiduoliya', '2']},
    {'巴黎': ['bali', '856']},
    {'巴厘岛': ['balidaobalidao', '1']},
    {'佛罗伦萨': ['foluolunsa', '1']},
    {'名古屋': ['nagoya', '2']},
    {'福冈': ['fugang', '1']},
    {'吉隆坡': ['jilongpo', '19']},
    {'马累': ['malei', '1']},
    {'奥克兰': ['aokelan', '4']},
    {'圣彼得堡': ['shengbidebao', '1']},
    {'普吉岛': ['phuket', '5']},
    {'芭堤雅': ['badiya', '10']},
    {'伦敦': ['lundun', '1']},
    {'旧金山': ['jiujinshan', '1']},
    {'拉斯维加斯': ['lasiweijiasi', '1']},
    {'文莱': ['wenlai', '4']},
    {'弗雷德里顿': ['fuleidelidun', '1']},
    {'杜塞尔多夫': ['dusaierduofu', '3']},
    {'雅加达': ['yajiada', '1']},
    {'埼玉': ['qiyu', '2']},
    {'广岛': ['guangdao', '1']},
    {'千叶': ['qianye', '2']},
    {'堺': ['jie', '1']},
    {'相模原': ['xiangmoyuan', '1']},
    {'船桥': ['chuanqiao', '1']},
    {'东大阪': ['dongdaban', '12']},
    {'暹粒': ['xianli', '3']},
    {'哥打基纳巴鲁': ['gedajinabalu', '4']},
    {'奥兰多': ['aolanduo', '3']},
    {'圣何塞': ['shenghs', '1']},
    {'立川': ['lichuan', '1']},
    {'调布': ['diaobu', '1']},
    {'日野': ['riye', '1']},
    {'马塔兰': ['mataram', '1']},
    ]
    View Code

     因为每个城市的房源时刻都在变化,所以上面的数据不一定准确

  • 相关阅读:
    我开博客了,啦啦啦.
    cf593div2
    Comet OJ
    cf591div2abc
    cfround586ac
    cf589div2
    cf573div2
    Codeforces Round #569 (Div. 2)
    uva11729 水题
    luogu1984 [SDOI2008] 烧水问题
  • 原文地址:https://www.cnblogs.com/114811yayi/p/7061674.html
Copyright © 2020-2023  润新知