#!/usr/bin/env python
# coding: utf-8

# In[1]:
# 1. Define the problem
# 2. Load the data

# Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler

get_ipython().run_line_magic('matplotlib', 'inline')

import warnings
warnings.filterwarnings('ignore')

# Show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None)

# Load the training and test sets
train_data = pd.read_csv('../data/train.csv')
test_data = pd.read_csv('../data/test.csv')


# In[2]:
# 3. Understand the data
# Column dtypes and non-null counts
train_data.info()


# In[3]:
# Dimensions (rows, columns)
train_data.shape


# In[4]:
# First 5 rows
train_data.head(5)


# In[5]:
# Descriptive statistics, one row per column
train_data.describe().T


# In[6]:
# 4. Visualise the data
# Distribution of the target, SalePrice
train_data['SalePrice'].describe()
# NOTE(review): distplot is deprecated in recent seaborn releases; consider
# sns.histplot(..., kde=True) once the minimum seaborn version is confirmed.
sns.distplot(train_data['SalePrice'])
plt.show()


# In[7]:
# Correlation matrix.
# Restrict to numeric columns explicitly: newer pandas raises when corr() is
# called on a frame that still contains string columns. The matrix is computed
# once here and reused by the feature-selection cell below.
corr = train_data.select_dtypes(include=[np.number]).corr()
f, ax = plt.subplots(figsize=(20, 9))
sns.heatmap(corr, vmax=1, vmin=-1, square=True)


# In[8]:
# Feature selection.
# Keep the rows whose correlation with SalePrice exceeds 0.5 in absolute
# value (the original author counted ten such features). abs() must wrap the
# correlations, not the threshold, so strong negative correlations are kept.
corr[corr['SalePrice'].abs() > 0.5]


# In[9]:
# The selected features, with the target SalePrice as the last column
cols = ['OverallQual', 'YearBuilt', 'YearRemodAdd', 'TotalBsmtSF', '1stFlrSF',
        'GrLivArea', 'FullBath', 'TotRmsAbvGrd', 'GarageCars', 'GarageArea',
        'SalePrice']
train_data = train_data[cols]
train_data.info()


# In[10]:
# 5. Build the model
# Split features (first ten columns) from the target (last column)
X = train_data.values[:, 0:10]
Y = train_data.values[:, 10]
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42)

# Fit an ordinary least-squares model on the raw (unscaled) features
model = LinearRegression()
model.fit(X_train, Y_train)

# Predict on the hold-out split and report the mean absolute error
y_pred = model.predict(X_test)
print('cost:' + str(np.mean(np.abs(y_pred - Y_test))))


# In[11]:
# The error on the raw data is large, so the next cell standardises the data.
# (Original author's note: the error was observed to grow after scaling and
# the cause was unknown.)
# Standardise features and target. The fitted scalers are kept so that
# predictions can be mapped back to price units afterwards.
# NOTE(review): the scalers are fitted on the full data set before the split,
# so hold-out statistics leak into the transform; fit on the training split
# only if this pipeline is reused with other estimators.
x_scaler = StandardScaler()
y_scaler = StandardScaler()
X_scaled = x_scaler.fit_transform(X)
Y_scaled = y_scaler.fit_transform(Y.reshape(-1, 1))
X_scaled_train, X_scaled_test, Y_scaled_train, Y_scaled_test = train_test_split(
    X_scaled, Y_scaled, test_size=0.33, random_state=42)

model_scaled = LinearRegression()
model_scaled.fit(X_scaled_train, Y_scaled_train)

# Predict with the model trained on scaled data. Using the raw-data `model`
# here fed standardised inputs to coefficients fitted on raw inputs, and its
# 1-D output broadcast against the (n, 1) target into an (n, n) matrix --
# which is why the reported error used to grow after scaling.
y_pred = model_scaled.predict(X_scaled_test)
y_pred

# Flatten both sides so the subtraction is element-wise
scaled_abs_err = np.abs(np.ravel(y_pred) - np.ravel(Y_scaled_test))
print('cost:' + str(np.mean(scaled_abs_err)))

# The figure above is in standard-deviation units of SalePrice; map the
# predictions back to prices so it can be compared with the raw-data model.
price_pred = y_scaler.inverse_transform(np.reshape(y_pred, (-1, 1)))
price_true = y_scaler.inverse_transform(np.reshape(Y_scaled_test, (-1, 1)))
print('cost (original units):' + str(np.mean(np.abs(price_pred - price_true))))


# In[12]:
# Keep only the model's feature columns from the test set. Work on an explicit
# copy so the assignments below modify this frame and not a view of the
# original one.
feature_cols = [name for name in cols if name != 'SalePrice']
test_data = test_data[feature_cols].copy()

# Fill missing values with the column median. Assign the result back instead
# of calling fillna(inplace=True) on a selected column: that chained in-place
# form does not update the parent frame under pandas Copy-on-Write.
for col_name in ('TotalBsmtSF', 'GarageCars', 'GarageArea'):
    test_data[col_name] = test_data[col_name].fillna(test_data[col_name].median())


# In[13]:
# Predict sale prices for the test set with the raw-data model, which was
# trained on unscaled features in the same column order.
X = test_data[feature_cols].to_numpy(dtype=float)
y_test_pre = model.predict(X)
test_data['SalePrice'] = y_test_pre
test_data.head(10)