• 使用requests和BeautifulSoup爬取“小猪短租”的房屋信息


    爬取小猪短租的租房信息,爬取第1~4页房屋,共96个房子的信息,包括标题、地址、租金、房东姓名、图片链接等。
    使用requests和BeautifulSoup。

    小猪短租首页:http://bj.xiaozhu.com/ ;房源详情页示例:http://bj.xiaozhu.com/fangzi/134350372103.html
    import requests
    from bs4 import BeautifulSoup
    import pandas as pd
    
    
    # 获取单个房子的相关信息,返回dict
    # Fetch one listing's detail page and return its fields as a dict.
    # The dict keys are the Chinese column headers used in the final CSV.
    #
    # Raises requests.Timeout / requests.HTTPError on a slow or failed
    # request instead of silently parsing an error page.
    def get_fangzi_info(fangzi_url):
        # timeout prevents one dead page from hanging the whole crawl
        resp = requests.get(fangzi_url, timeout=10)
        resp.raise_for_status()

        soup = BeautifulSoup(resp.text, 'lxml')
        title = soup.select_one('div.pho_info > h4').get_text().strip()
        address = soup.select_one('div.pho_info > p > span').get_text().strip()
        price = soup.select_one('div.day_l').get_text().strip()
        img_link = soup.select_one('img#curBigImage').get('src').strip()
        fangdong_link = soup.select_one('div.member_pic > a > img').get('src').strip()
        fangdong_name = soup.select_one('h6 > a').get_text().strip()
        # the "boy" icon element only appears on pages of male hosts
        fangdong_sex = '男' if soup.select('span.member_boy_ico') else '女'

        fangzi_dict = {
                '标题':title,
                '地址':address,
                '日租金':price,
                '房子图片':img_link,
                '房东姓名':fangdong_name,
                '房东性别':fangdong_sex,
                '房东图片':fangdong_link
        }
        return fangzi_dict
    
    # Given a listing-overview page URL, return a list of the detail-page
    # URLs of every house shown on it.
    #
    # Raises requests.Timeout / requests.HTTPError on a slow or failed request.
    def get_fangzi_urls(multi_fangzi_url):
        # timeout prevents one dead page from hanging the whole crawl
        resp = requests.get(multi_fangzi_url, timeout=10)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, 'lxml')
        # each house <li> carries a lodgeunitid attribute; its <a> child
        # links to the detail page
        return [a.get('href') for a in soup.select('li[lodgeunitid]>a')]
    
    
    index_url = 'http://bj.xiaozhu.com/'
    # pages 2-4; page 1 is the plain index URL
    multi_fangzi_urls = ['http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(i) for i in range(2, 5)]

    fangzi_dicts = []
    # Crawl page 1 and pages 2-4 with a single loop (each overview page
    # lists 24 houses), instead of duplicating the per-page loop.
    for page_url in [index_url] + multi_fangzi_urls:
        for fangzi_url in get_fangzi_urls(page_url):
            fangzi_dicts.append(get_fangzi_info(fangzi_url))

    print(fangzi_dicts)
    print('len(fangzi_dicts):',len(fangzi_dicts))

    # utf-8-sig adds a BOM so the Chinese headers open correctly in Excel
    df = pd.DataFrame(fangzi_dicts)
    df.to_csv('./xiaozhu.csv', index=False, mode='w', encoding='utf-8-sig')
    

      

     
     
  • 相关阅读:
    20201004 助教一周总结(第五周)
    20200906助教一周总结(第一周)
    如何在Linux下增加Apache的虚拟主机
    在Windows下编译OpenSSL
    Windows 应用
    祭5.12地震死难者文[转]
    VC++中控制控件台字体颜色(转)
    RSA算法简述
    如何解决在IE6下不自动换行的问题
    重庆旅游网新版上线,欢迎测试!
  • 原文地址:https://www.cnblogs.com/djlbolgs/p/12513434.html
Copyright © 2020-2023  润新知