Scrape rental listings from Xiaozhu (小猪短租): pages 1 through 4, 96 listings in total (24 per page), collecting each listing's title, address, daily price, host name, image links, and so on.
Uses requests and BeautifulSoup.
Xiaozhu homepage: http://bj.xiaozhu.com/
Example listing detail page: http://bj.xiaozhu.com/fangzi/134350372103.html
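Before running the full script, it can help to confirm that the CSS selectors still match the page markup against a single detail page. A minimal sketch, assuming the example URL above still resolves and the page structure is unchanged:

import requests
from bs4 import BeautifulSoup

# Fetch one detail page and test the title selector used by the full script
url = 'http://bj.xiaozhu.com/fangzi/134350372103.html'  # example listing from above
soup = BeautifulSoup(requests.get(url).text, 'lxml')
title_eles = soup.select('div.pho_info > h4')  # same selector as get_fangzi_info
print(title_eles[0].get_text().strip() if title_eles else 'selector matched nothing')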
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Fetch one listing's detail page and return its fields as a dict
def get_fangzi_info(fangzi_url):
    raw_page = requests.get(fangzi_url).text
    soup = BeautifulSoup(raw_page, 'lxml')
    title = soup.select('div.pho_info > h4')[0].get_text().strip()
    address = soup.select('div.pho_info > p > span')[0].get_text().strip()
    price = soup.select('div.day_l')[0].get_text().strip()
    img_link = soup.select('img[id="curBigImage"]')[0].get('src').strip()
    fangdong_link = soup.select('div.member_pic > a > img')[0].get('src').strip()
    fangdong_name = soup.select('h6 > a')[0].get_text().strip()
    # The page marks male hosts with span.member_boy_ico; absence means female
    fangdong_sex = '男' if soup.select('span.member_boy_ico') else '女'  # male / female
    fangzi_dict = {
        '标题': title,              # title
        '地址': address,            # address
        '日租金': price,            # daily price
        '房子图片': img_link,       # listing image URL
        '房东姓名': fangdong_name,  # host name
        '房东性别': fangdong_sex,   # host gender
        '房东图片': fangdong_link   # host avatar URL
    }
    return fangzi_dict

# Given a listings overview page, return the detail-page URLs of all listings on it
def get_fangzi_urls(multi_fangzi_url):
    raw_page = requests.get(multi_fangzi_url).text
    soup = BeautifulSoup(raw_page, 'lxml')
    fangzi_eles = soup.select('li[lodgeunitid] > a')
    return [f.get('href') for f in fangzi_eles]

index_url = 'http://bj.xiaozhu.com/'
# Overview pages 2, 3 and 4
multi_fangzi_urls = ['http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(i) for i in range(2, 5)]

fangzi_dicts = []

# Scrape the 24 listings on the front page first
first_page_urls = get_fangzi_urls(index_url)
for fangzi_url in first_page_urls:
    fangzi_dicts.append(get_fangzi_info(fangzi_url))

# Then scrape the listings on pages 2 through 4
for multi_fangzi_url in multi_fangzi_urls:
    post_page_urls = get_fangzi_urls(multi_fangzi_url)
    for fangzi_url in post_page_urls:
        fangzi_dicts.append(get_fangzi_info(fangzi_url))

print(fangzi_dicts)
print('len(fangzi_dicts):', len(fangzi_dicts))

# Write all 96 listings to CSV; utf-8-sig keeps the Chinese headers readable in Excel
df = pd.DataFrame(fangzi_dicts)
df.to_csv('./xiaozhu.csv', index=False, mode='w', encoding='utf-8-sig')
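The script above fires roughly 100 sequential requests with no delay, timeout, or User-Agent header, which real sites often throttle or block. A hedged sketch of a more polite fetch helper; the header string, retry count, and delay below are illustrative assumptions, not values from the original script:

import time
import requests

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # illustrative UA string, not from the original

def fetch(url, retries=3, delay=1.0):
    # Retry transient failures and pause between attempts to stay polite
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=HEADERS, timeout=10)
            resp.raise_for_status()
            return resp.text
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(delay)

To use it, replace each requests.get(...).text call in the script with fetch(...); everything else stays the same.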