5 Building Baseline Models and Trying Multiple Algorithms
# The earlier parts focused on data preparation; here the focus shifts to
# modeling. Import the required packages.

# Data analysis libraries
import pandas as pd
import numpy as np

# Silence pandas chained-assignment warnings and widen the column display
pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', 60)

# Visualization
import matplotlib.pyplot as plt
%matplotlib inline

# Font size for plots
plt.rcParams['font.size'] = 24
from IPython.core.pylabtools import figsize

# Seaborn for higher-level visualization
import seaborn as sns
sns.set(font_scale = 2)

# Preprocessing: missing-value imputation and min-max scaling
from sklearn.preprocessing import Imputer, MinMaxScaler

# Machine learning algorithms
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

# Hyperparameter tuning utilities
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

import warnings
warnings.filterwarnings("ignore")
- Load the data saved in the previous part
# Read in data into dataframes
train_features = pd.read_csv('data/training_features.csv')
test_features = pd.read_csv('data/testing_features.csv')
train_labels = pd.read_csv('data/training_labels.csv')
test_labels = pd.read_csv('data/testing_labels.csv')

# Display sizes of data
print('Training Feature Size: ', train_features.shape)
print('Testing Feature Size: ', test_features.shape)
print('Training Labels Size: ', train_labels.shape)
print('Testing Labels Size: ', test_labels.shape)
Training Feature Size:  (6622, 67)
Testing Feature Size:  (2839, 67)
Training Labels Size:  (6622, 1)
Testing Labels Size:  (2839, 1)
train_features.head()
5.1 Filling In Missing Values
# The data contain outliers in both directions, so the median is a more
# robust fill value than the mean
imputer = Imputer(strategy='median')

# Fit the imputer on the training features only
imputer.fit(train_features)

# Transform both sets, replacing missing values with the training medians
X = imputer.transform(train_features)
X_test = imputer.transform(test_features)
# np.isnan checks each value for NaN; a sum of 0 means imputation is complete
print('Missing values in training features: ', np.sum(np.isnan(X)))
print('Missing values in testing features: ', np.sum(np.isnan(X_test)))
Missing values in training features:  0
Missing values in testing features:  0
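A compatibility note: Imputer was deprecated in scikit-learn 0.20 and removed in 0.22. On current versions the drop-in replacement lives in sklearn.impute; a minimal sketch of the equivalent code:

from sklearn.impute import SimpleImputer

# Same median strategy as above, for scikit-learn >= 0.20
imputer = SimpleImputer(strategy='median')
X = imputer.fit_transform(train_features)
X_test = imputer.transform(test_features)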
5.2 Feature Scaling
scaler = MinMaxScaler(feature_range=(0, 1))

# Fit the scaler on the training data only, then map both sets into [0, 1]
scaler.fit(X)
X = scaler.transform(X)
X_test = scaler.transform(X_test)
# The labels come in as single-column DataFrames; reshape((-1,)) flattens
# them into 1-D arrays, which is what scikit-learn expects for y
y = np.array(train_labels).reshape((-1,))
y_test = np.array(test_labels).reshape((-1,))
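The two preprocessing steps above (median imputation, then min-max scaling) can also be chained into a single scikit-learn Pipeline, which guarantees both are fitted on the training data only. A minimal sketch; the names preprocess, X_alt, and X_test_alt are illustrative, not part of the original notebook:

from sklearn.pipeline import Pipeline

# Chain imputation and scaling; fit learns the medians and min/max from the
# training data, transform reuses those statistics on the test data
preprocess = Pipeline([('impute', Imputer(strategy='median')),
                       ('scale', MinMaxScaler(feature_range=(0, 1)))])

X_alt = preprocess.fit_transform(train_features)
X_test_alt = preprocess.transform(test_features)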
6 Baseline Models: Trying Multiple Algorithms (a Regression Problem)
1. Linear Regression
2. Support Vector Machine Regression
3. Random Forest Regression
4. Gradient Boosting Regression
5. K-Nearest Neighbors Regression
6.1 Building the Loss Function
# The loss used here is MAE (mean absolute error); abs() takes the absolute value
def mae(y_true, y_pred):
    return np.mean(abs(y_true - y_pred))

# Fit a model on the training data, then evaluate it on the test set
def fit_and_evaluate(model):
    # Train the model
    model.fit(X, y)

    # Predict on the test data and compute the MAE
    model_pred = model.predict(X_test)
    model_mae = mae(y_test, model_pred)

    return model_mae
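Note that fit_and_evaluate ranks models by their error on the test set, so the test data effectively take part in model selection. A leak-free alternative is to score each candidate with cross-validation on the training data only; a minimal sketch (cv_mae is a hypothetical helper, not used in the rest of this walkthrough):

from sklearn.model_selection import cross_val_score

def cv_mae(model):
    # 5-fold cross-validation on the training data; scikit-learn reports
    # negated MAE so that larger is better, hence the sign flip
    scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=5)
    return -np.mean(scores)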
6.2 Selecting a Machine Learning Algorithm
lr = LinearRegression()  # linear regression baseline
lr_mae = fit_and_evaluate(lr)

print('Linear Regression Performance on the test set: MAE = %0.4f' % lr_mae)
Linear Regression Performance on the test set: MAE = 12.9817
svm = SVR(C = 1000, gamma = 0.1)  # support vector machine regression
svm_mae = fit_and_evaluate(svm)

print('Support Vector Machine Regression Performance on the test set: MAE = %0.4f' % svm_mae)
Support Vector Machine Regression Performance on the test set: MAE = 10.9336
random_forest = RandomForestRegressor(random_state=60)  # random forest ensemble
random_forest_mae = fit_and_evaluate(random_forest)

print('Random Forest Regression Performance on the test set: MAE = %0.4f' % random_forest_mae)
Random Forest Regression Performance on the test set: MAE = 10.0941
gradient_boosted = GradientBoostingRegressor(random_state=60)  # gradient boosted trees
gradient_boosted_mae = fit_and_evaluate(gradient_boosted)

print('Gradient Boosted Regression Performance on the test set: MAE = %0.4f' % gradient_boosted_mae)
Gradient Boosted Regression Performance on the test set: MAE = 9.9893
knn = KNeighborsRegressor(n_neighbors=10)  # k-nearest neighbors
knn_mae = fit_and_evaluate(knn)

print('K-Nearest Neighbors Regression Performance on the test set: MAE = %0.4f' % knn_mae)
K-Nearest Neighbors Regression Performance on the test set: MAE = 12.6722
plt.style.use('fivethirtyeight')
figsize(8, 6)

# Collect the test-set MAE of each model
model_comparison = pd.DataFrame({'model': ['Linear Regression', 'Support Vector Machine',
                                           'Random Forest', 'Gradient Boosted',
                                           'K-Nearest Neighbors'],
                                 'mae': [lr_mae, svm_mae, random_forest_mae,
                                         gradient_boosted_mae, knn_mae]})

# Sort descending so the lowest (best) MAE ends up at the top of the
# horizontal bar chart (kind='barh')
model_comparison.sort_values('mae', ascending = False).plot(x = 'model', y = 'mae',
                                                            kind = 'barh',
                                                            color = 'red', edgecolor = 'black')

# y-axis: model names; x-axis: MAE on the test set
plt.ylabel(''); plt.yticks(size = 14); plt.xlabel('Mean Absolute Error'); plt.xticks(size = 14)
plt.title('Model Comparison on Test MAE', size = 20);
7 Hyperparameter Tuning
7.1 Randomized Search
# Loss functions to try: 'ls' (least squares), 'lad' (least absolute
# deviation) and 'huber' (a combination of the two)
loss = ['ls', 'lad', 'huber']

# Number of weak learners (decision trees) in the ensemble
n_estimators = [100, 500, 900, 1100, 1500]

# Maximum depth of each decision tree
max_depth = [2, 3, 5, 10, 15]

# Minimum number of samples required at a leaf node
min_samples_leaf = [1, 2, 4, 6, 8]

# Minimum number of samples required to split a node
min_samples_split = [2, 4, 6, 10]

hyperparameter_grid = {'loss': loss,
                       'n_estimators': n_estimators,
                       'max_depth': max_depth,
                       'min_samples_leaf': min_samples_leaf,
                       'min_samples_split': min_samples_split}
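For a sense of why randomized rather than exhaustive search is used below: the grid spans 3 × 5 × 5 × 5 × 4 = 1500 combinations, of which the random search samples only 25. A quick check:

# Total number of hyperparameter combinations in the grid above
n_combinations = (len(loss) * len(n_estimators) * len(max_depth)
                  * len(min_samples_leaf) * len(min_samples_split))
print(n_combinations)  # 1500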
model = GradientBoostingRegressor(random_state = 42)

random_cv = RandomizedSearchCV(estimator=model,
                               param_distributions=hyperparameter_grid,
                               cv=4, n_iter=25,
                               scoring = 'neg_mean_absolute_error',  # metric used to rank candidates
                               n_jobs = -1, verbose = 1,
                               return_train_score = True,
                               random_state=42)
# Note: this is slow to run (around 14 minutes)
random_cv.fit(X, y)
RandomizedSearchCV(cv=4, error_score='raise-deprecating',
    estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse',
        init=None, learning_rate=0.1, loss='ls', max_depth=3,
        max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0,
        min_impurity_split=None, min_samples_leaf=1, min_samples_split=2,
        min_weight_fraction_leaf=0.0, n_estimators=100, ...
        verbose=0, warm_start=False),
    iid='warn', n_iter=25, n_jobs=-1,
    param_distributions={'loss': ['ls', 'lad', 'huber'],
                         'max_depth': [2, 3, 5, 10, 15],
                         'min_samples_leaf': [1, 2, 4, 6, 8],
                         'min_samples_split': [2, 4, 6, 10],
                         'n_estimators': [100, 500, 900, 1100, 1500]},
    pre_dispatch='2*n_jobs', random_state=42, refit=True,
    return_train_score=True, scoring='neg_mean_absolute_error', verbose=1)
random_cv.best_estimator_  # the best configuration found
Output:

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
    learning_rate=0.1, loss='lad', max_depth=5, max_features=None,
    max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None,
    min_samples_leaf=4, min_samples_split=2, min_weight_fraction_leaf=0.0,
    n_estimators=500, n_iter_no_change=None, presort='auto', random_state=42,
    subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0,
    warm_start=False)
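Instead of reading the full repr, the search object can also be queried directly for the winning settings and their cross-validated error; a small sketch using the standard RandomizedSearchCV attributes:

print(random_cv.best_params_)   # the sampled hyperparameter combination that won
print(-random_cv.best_score_)   # its mean cross-validated MAE (sign flipped back)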
# Sweep only the number of trees, holding the other hyperparameters close to
# the values suggested by the random search
trees_grid = {'n_estimators': [100, 150, 200, 250, 300, 350, 400, 450, 500,
                               550, 600, 650, 700, 750, 800]}

# Build the model ('lad' minimizes least absolute deviation, i.e. MAE)
model = GradientBoostingRegressor(loss = 'lad', max_depth = 5,
                                  min_samples_leaf = 6,
                                  min_samples_split = 6,
                                  max_features = None,
                                  random_state = 42)

# Grid search with 4-fold cross-validation
grid_search = GridSearchCV(estimator = model, param_grid=trees_grid, cv = 4,
                           scoring = 'neg_mean_absolute_error', verbose = 1,
                           n_jobs = -1, return_train_score = True)
# Takes around 3 minutes
grid_search.fit(X, y)
Output:
GridSearchCV(cv=4, error_score='raise-deprecating',
    estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse',
        init=None, learning_rate=0.1, loss='lad', max_depth=5,
        max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0,
        min_impurity_split=None, min_samples_leaf=6, min_samples_split=6,
        min_weight_fraction_leaf=0.0, n_estimators=100,
        n_iter_no_change=None, presort='auto', random_state=42,
        subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0,
        warm_start=False),
    iid='warn', n_jobs=-1,
    param_grid={'n_estimators': [100, 150, 200, 250, 300, 350, 400, 450,
                                 500, 550, 600, 650, 700, 750, 800]},
    pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
    scoring='neg_mean_absolute_error', verbose=1)
7.2 Comparing Training and Testing Error
# Collect the cross-validation results into a DataFrame
results = pd.DataFrame(grid_search.cv_results_)

# Plot training vs. testing error against the number of trees
figsize(8, 8)
plt.style.use('fivethirtyeight')
plt.plot(results['param_n_estimators'], -1 * results['mean_test_score'], label = 'Testing Error')
plt.plot(results['param_n_estimators'], -1 * results['mean_train_score'], label = 'Training Error')

# x-axis: number of trees; y-axis: MAE
plt.xlabel('Number of Trees'); plt.ylabel('Mean Absolute Error'); plt.legend();
plt.title('Performance vs Number of Trees');

# The training error keeps falling while the testing error flattens out, so
# the gap between the two curves widens as trees are added: overfitting.
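One common remedy for the overfitting visible in this curve is stochastic gradient boosting combined with early stopping. A sketch with illustrative, untuned values (assuming scikit-learn >= 0.20, where n_iter_no_change is available):

model_es = GradientBoostingRegressor(loss='lad', max_depth=5,
                                     subsample=0.8,        # fit each tree on 80% of the rows
                                     n_estimators=800,     # upper bound; may stop earlier
                                     n_iter_no_change=10,  # stop if the validation score stalls
                                     validation_fraction=0.1,
                                     random_state=42)
model_es.fit(X, y)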
8 Evaluation and Testing: Plotting the Gap Between Predictions and True Values
# Default (untuned) model for comparison
default_model = GradientBoostingRegressor(random_state = 42)
default_model.fit(X, y)

# Model with the best hyperparameters from the grid search
final_model = grid_search.best_estimator_
final_model
Output:
GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
    learning_rate=0.1, loss='lad', max_depth=5, max_features=None,
    max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None,
    min_samples_leaf=6, min_samples_split=6, min_weight_fraction_leaf=0.0,
    n_estimators=800, n_iter_no_change=None, presort='auto', random_state=42,
    subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0,
    warm_start=False)
default_pred = default_model.predict(X_test)
final_pred = final_model.predict(X_test)

print('Default model performance on the test set: MAE = %0.4f.' % mae(y_test, default_pred))
print('Final model performance on the test set: MAE = %0.4f.' % mae(y_test, final_pred))
Default model performance on the test set: MAE = 9.9864.
Final model performance on the test set: MAE = 9.1172.
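Relative to the untuned default, tuning reduced the test MAE by about 8.7%; computed directly from the two predictions above:

default_mae = mae(y_test, default_pred)
final_mae = mae(y_test, final_pred)
print('Relative improvement: %0.2f%%' % (100 * (default_mae - final_mae) / default_mae))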
# figsize is the pylabtools helper, so call it rather than assigning a tuple
figsize(6, 6)

# Residuals = predictions - true values; most errors fall within roughly +/- 25
residuals = final_pred - y_test

plt.hist(residuals, color = 'red', bins = 20, edgecolor = 'black')
plt.xlabel('Error'); plt.ylabel('Count')
plt.title('Distribution of Residuals');
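To quantify the spread of the errors rather than reading it off the histogram, the residual percentiles can be printed; a small addition to the cell above:

# Median and central 95% interval of the prediction errors
print(np.percentile(residuals, [2.5, 50, 97.5]))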
9 Interpreting the Model: Importance-Based Feature Selection
import pandas as pd
import numpy as np

pd.options.mode.chained_assignment = None
pd.set_option('display.max_columns', 60)

import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['font.size'] = 24
from IPython.core.pylabtools import figsize

import seaborn as sns
sns.set(font_scale = 2)

from sklearn.preprocessing import Imputer, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import tree

import warnings
warnings.filterwarnings("ignore")
- First, reload the data and rebuild the final model from the previous part
train_features = pd.read_csv('data/training_features.csv')
test_features = pd.read_csv('data/testing_features.csv')
train_labels = pd.read_csv('data/training_labels.csv')
test_labels = pd.read_csv('data/testing_labels.csv')
# Replace missing values with the training medians
imputer = Imputer(strategy='median')
imputer.fit(train_features)

X = imputer.transform(train_features)
X_test = imputer.transform(test_features)

# Flatten the label columns into 1-D arrays
y = np.array(train_labels).reshape((-1,))
y_test = np.array(test_labels).reshape((-1,))
def mae(y_true, y_pred):
    return np.mean(abs(y_true - y_pred))
model = GradientBoostingRegressor(loss='lad', max_depth=5, max_features=None,
                                  min_samples_leaf=6, min_samples_split=6,
                                  n_estimators=800, random_state=42)

model.fit(X, y)
Output:
GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
    learning_rate=0.1, loss='lad', max_depth=5, max_features=None,
    max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None,
    min_samples_leaf=6, min_samples_split=6, min_weight_fraction_leaf=0.0,
    n_estimators=800, n_iter_no_change=None, presort='auto', random_state=42,
    subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0,
    warm_start=False)
# The tuned GBDT is the final model
model_pred = model.predict(X_test)

print('Final Model Performance on the test set: MAE = %0.4f' % mae(y_test, model_pred))
Final Model Performance on the test set: MAE = 9.1485
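Since refitting 800 trees takes minutes, it may be worth persisting the trained model; a sketch using joblib (the file name gbdt_final.pkl is hypothetical; on scikit-learn < 0.21 the same module also ships as sklearn.externals.joblib):

import joblib

joblib.dump(model, 'gbdt_final.pkl')   # serialize the fitted model to disk
model = joblib.load('gbdt_final.pkl')  # reload later without refitting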
# Feature importances from the trained model
feature_results = pd.DataFrame({'feature': list(train_features.columns),
                                'importance': model.feature_importances_})

# Sort descending and show the ten most important features
feature_results = feature_results.sort_values('importance',
                                              ascending = False).reset_index(drop=True)
feature_results.head(10)
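Beyond the top-10 list, the cumulative importance shows how many features are needed to cover most of the signal; a small sketch (the 95% threshold is an arbitrary illustrative choice):

# How many of the sorted features account for 95% of the total importance
cumulative = feature_results['importance'].cumsum()
n_for_95 = int((cumulative <= 0.95).sum()) + 1
print('Features needed to reach 95% of total importance:', n_for_95)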
figsize(12, 10)
plt.style.use('fivethirtyeight')

# Plot the ten most important features as a horizontal bar chart
feature_results.loc[:9, :].plot(x = 'feature', y = 'importance',
                                edgecolor = 'k', kind='barh', color = 'blue');
plt.xlabel('Relative Importance', size = 20); plt.ylabel('')
plt.title('Feature Importances from Gradient Boosting', size = 30);
# Names of the ten most important features
most_important_features = feature_results['feature'][:10]

# Column indices of those features in the original feature matrix
indices = [list(train_features.columns).index(x) for x in most_important_features]

X_reduced = X[:, indices]
X_test_reduced = X_test[:, indices]

print('Most important training features shape: ', X_reduced.shape)
print('Most important testing features shape: ', X_test_reduced.shape)
Most important training features shape:  (6622, 10)
Most important testing features shape:  (2839, 10)
lr = LinearRegression()

# Fit on the full feature set
lr.fit(X, y)
lr_full_pred = lr.predict(X_test)

# Fit on only the ten most important features
lr.fit(X_reduced, y)
lr_reduced_pred = lr.predict(X_test_reduced)

print('Linear Regression Full Results: MAE = %0.4f.' % mae(y_test, lr_full_pred))
print('Linear Regression Reduced Results: MAE = %0.4f.' % mae(y_test, lr_reduced_pred))
Linear Regression Full Results: MAE = 12.9817.
Linear Regression Reduced Results: MAE = 14.4826.

With only the ten most important features, the linear model's MAE rises from 12.98 to 14.48, so the features that were dropped still carry useful signal.