knn算法不需要进行训练, 但预测时比较耗时(需要与每一个训练数据计算距离), 适用于多标签分类情况
1. 将输入的单个测试数据与每一个训练数据依据特征计算欧式距离
2. 将求得的欧式距离进行升序排序(从小到大),取前n个,即距离最近的n个训练数据
3. 计算这前n个的y值的平均值(回归)或者出现最多的类别(分类),获得测试数据的预测值
4.根据测试数据的实际值和测试数据的预测值计算当前的rmse,判断该方法的好坏
使用Airbnb的房子的特征与房价做演示:
演示1.首先使用accommodates属性对一个数据做演示,采用的距离是绝对值距离
import pandas as pd
import numpy as np

df_listings = pd.read_csv('listings.csv')

# Keep only the columns used throughout the demos.
features = ['accommodates', 'bedrooms', 'bathrooms', 'beds', 'price',
            'minimum_nights', 'maximum_nights', 'number_of_reviews']
df_listings = df_listings[features]

# Demo 1 looks at `accommodates` only: the listing to price sleeps 3 people.
new_accomodates = 3
# Absolute difference is the one-dimensional distance to every listing.
df_listings['distance'] = np.abs(df_listings['accommodates'] - new_accomodates)

# Count how many listings sit at each distance, smallest distance first.
# As a bare expression this is only displayed in an interactive session.
df_listings.distance.value_counts().sort_index()

# Shuffle with a fixed seed, then order by distance so the nearest listings
# come first (presumably the shuffle keeps tied rows out of file order).
df_listings = df_listings.sample(frac=1, random_state=0)
df_listings = df_listings.sort_values('distance')
print(df_listings.price.head())

# Prices are strings such as "$1,150.00". Strip '$' and ',' with an explicit
# character class: in a regex a bare '$' is the end-of-string anchor, and with
# regex disabled the pattern '$|,' would be searched for literally, so the
# original pattern never removed the dollar sign and astype(float) failed.
df_listings['price'] = (
    df_listings['price']
    .str.replace(r'[$,]', '', regex=True)
    .astype(float)
)

# Predicted price = mean price of the 5 nearest listings.
price_mean_5 = df_listings['price'].iloc[:5].mean()
print(price_mean_5)
演示2 将住房数据分为训练集和测试集, 使用单个特征进行测试
# Drop the helper column left over from Demo 1.
df_listings = df_listings.drop('distance', axis=1)

# Demo 1 left the frame sorted by distance to `accommodates == 3`; splitting
# it as-is would put only the farthest listings in the test set. Reshuffle
# with a fixed seed so the split is unbiased and reproducible.
df_listings = df_listings.sample(frac=1, random_state=0)

# Split into train / test. Copy the slices so that adding columns later does
# not write into (or warn about) a view of df_listings.
train_df = df_listings[:2792].copy()
test_df = df_listings[2792:].copy()


def predict_price(test_content, feature_name):
    """Predict a price from the 5 nearest training listings on one feature.

    Args:
        test_content: Scalar value of the feature for the listing to price.
        feature_name: Name of the column in ``train_df`` to compare against.

    Returns:
        Mean ``price`` of the 5 training rows whose ``feature_name`` value has
        the smallest absolute difference to ``test_content``; NaN when
        ``test_content`` itself is missing.
    """
    # A missing query value makes every distance NaN, so the "nearest" rows
    # would be arbitrary; report the prediction as missing instead.
    if pd.isna(test_content):
        return np.nan
    # assign() returns a new frame, so the global train_df is never mutated.
    temp_df = train_df.assign(
        distance=np.abs(test_content - train_df[feature_name])
    )
    # Smallest distance first; rows with a NaN distance are sorted last.
    temp_df = temp_df.sort_values('distance')
    price_mean_5 = temp_df.price.iloc[:5].mean()
    return price_mean_5


cols = ['accommodates']
# Series.apply feeds each value in as the first argument and forwards
# feature_name as a keyword argument.
test_df['predict_price'] = test_df[cols[0]].apply(predict_price, feature_name='accommodates')
print(test_df['predict_price'])

# Root-mean-squared error of the predictions (NaN predictions are skipped by
# the pandas mean).
mse = ((test_df['predict_price'] - test_df['price']) ** 2).mean()
rmse = mse ** (1 / 2)
print(rmse)

# Compare how well each single feature predicts the price.
for feature in ['accommodates', 'bedrooms', 'bathrooms', 'number_of_reviews']:
    test_df['predict_price'] = test_df[feature].apply(predict_price, feature_name=feature)
    print(test_df['predict_price'])
    mse = ((test_df['predict_price'] - test_df['price']) ** 2).mean()
    rmse = mse ** (1 / 2)
    print('where{}:{}'.format(feature, rmse))
演示3:在上面的基础上,添加数据标准化(z-score)操作,标准化的意思是先减去均值,然后再除以标准差。同时引入多变量操作
使用的包有: from sklearn.metrics import mean_squared_error 用于求均方误差
from scipy.spatial import distance 用于求欧式距离
from sklearn.preprocessing import StandardScaler 用于进行标准化操作
import pandas as pd
from scipy.spatial import distance
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

df_listings = pd.read_csv('listings.csv')

# Keep only the columns used in the demo.
features = ['accommodates', 'bedrooms', 'bathrooms', 'beds', 'price',
            'minimum_nights', 'maximum_nights', 'number_of_reviews']
df_listings = df_listings[features]

# Prices are strings such as "$1,150.00". Use an explicit character class:
# a bare '$' in a regex is the end-of-string anchor, so the pattern '$|,'
# never removed the dollar sign and astype(float) failed.
df_listings['price'] = (
    df_listings['price']
    .str.replace(r'[$,]', '', regex=True)
    .astype(float)
)

# Drop rows with any missing value.
df_listings = df_listings.dropna()

# Split first, and copy the slices so later column assignments do not write
# into a view of df_listings.
train_df = df_listings[:2792].copy()
test_df = df_listings[2792:].copy()

# Standardise the predictor columns only, with statistics learned from the
# training rows: fitting on the full frame leaks test information, and
# scaling `price` would report the RMSE in z-score units instead of dollars.
feature_cols = [name for name in features if name != 'price']
scaler = StandardScaler().fit(train_df[feature_cols])
train_df[feature_cols] = scaler.transform(train_df[feature_cols])
test_df[feature_cols] = scaler.transform(test_df[feature_cols])


def predict_price(new_content, feature_name):
    """Predict a price from the 5 nearest training listings on several features.

    Args:
        new_content: One listing (a row) indexable by column name.
        feature_name: Column name, or list of column names, used to measure
            the Euclidean distance to every row of ``train_df``.

    Returns:
        Mean ``price`` of the 5 training rows closest to ``new_content``.
    """
    # Accept a single column name as well as a list of names.
    if isinstance(feature_name, str):
        feature_name = [feature_name]
    temp_df = train_df.copy()
    # cdist returns an (n_train, 1) matrix for a single query row; flatten it
    # so it is stored as an ordinary column.
    temp_df['distance'] = distance.cdist(
        temp_df[feature_name], [new_content[feature_name]]
    ).ravel()
    temp_df = temp_df.sort_values('distance')
    price_mean_5 = temp_df.price.iloc[:5].mean()
    return price_mean_5


# Use two of the features.
cols = ['accommodates', 'bathrooms']
test_df['predict_price'] = test_df.apply(predict_price, feature_name=cols, axis=1)
mse = mean_squared_error(test_df['price'], test_df['predict_price'])
rmse = mse ** (1 / 2)
print(rmse)
演示4 使用sklearn附带的knn进行运算
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler

df_listings = pd.read_csv('listings.csv')

# Keep only the columns used in the demo.
features = ['accommodates', 'bedrooms', 'bathrooms', 'beds', 'price',
            'minimum_nights', 'maximum_nights', 'number_of_reviews']
df_listings = df_listings[features]

# Prices are strings such as "$1,150.00". Use an explicit character class:
# a bare '$' in a regex is the end-of-string anchor, so the pattern '$|,'
# never removed the dollar sign and astype(float) failed.
df_listings['price'] = (
    df_listings['price']
    .str.replace(r'[$,]', '', regex=True)
    .astype(float)
)

# Drop rows with any missing value.
df_listings = df_listings.dropna()

# Split first, and copy the slices so later column assignments do not write
# into a view of df_listings.
train_df = df_listings[:2792].copy()
test_df = df_listings[2792:].copy()

# Standardise the predictor columns only, with statistics learned from the
# training rows: fitting on the full frame leaks test information, and
# scaling `price` would report the RMSE in z-score units instead of dollars.
feature_cols = [name for name in features if name != 'price']
scaler = StandardScaler().fit(train_df[feature_cols])
train_df[feature_cols] = scaler.transform(train_df[feature_cols])
test_df[feature_cols] = scaler.transform(test_df[feature_cols])
print(test_df.head())

# Two-feature model; n_neighbors sets k.
cols = ['accommodates', 'bathrooms']
knn = KNeighborsRegressor(n_neighbors=10)
knn.fit(train_df[cols], train_df['price'])
test_df['predict_price'] = knn.predict(test_df[cols])
mse = mean_squared_error(test_df['price'], test_df['predict_price'])
rmse = mse ** (1 / 2)
print(rmse)

# Same model on every predictor, for comparison.
cols = ['accommodates', 'bedrooms', 'bathrooms', 'beds',
        'minimum_nights', 'maximum_nights', 'number_of_reviews']
knn = KNeighborsRegressor(n_neighbors=10)
knn.fit(train_df[cols], train_df['price'])
test_df['predict_price'] = knn.predict(test_df[cols])
mse = mean_squared_error(test_df['price'], test_df['predict_price'])
rmse = mse ** (1 / 2)
print(rmse)