[学习笔记][Python机器学习：预测分析核心算法][利用Python集成方法工具包构建梯度提升模型]

参考：

1、《Python机器学习：预测分析核心算法》 P258-P266

1 import numpy
2 
3 #from sklearn.cross_validation import train_test_split
4 from sklearn.model_selection import train_test_split
5 
6 #这里是可以用来构建GradientBoostingRegressor模型
7 from sklearn import ensemble
8 from sklearn.metrics import mean_squared_error
9 import pylab as plot

# Read the raw wine-quality CSV from the local working directory.
# The context manager guarantees the file handle is closed even if
# reading raises, which the manual open()/close() pair did not.
with open('winequality-red.csv', 'r') as target_file:
    # One string per line of the file; the first line is the header.
    data = target_file.readlines()

 # Split the raw lines into an attribute table (xList) and a label list (labels).
# The column names from the header line are stored in names.
xList = []
labels = []
names = []
for lineNo, line in enumerate(data):
    stripped = line.strip()
    if lineNo == 0:
        # Header line: every column name, split on semicolons
        # (the whole header is kept, so the last entry is the label column).
        names = stripped.split(";")
        continue
    if not stripped:
        # Skip blank lines (e.g. a trailing empty line); float('') would raise.
        continue
    # Data line: semicolon-separated numeric fields.
    fields = stripped.split(";")
    # The last field is the label; everything before it is an attribute.
    labels.append(float(fields[-1]))
    xList.append([float(num) for num in fields[:-1]])

# Number of rows and columns of the attribute table.
nrows = len(xList)
ncols = len(xList[0])

# Convert the Python lists to numpy arrays: this is the form the
# GradientBoostingRegressor trained below is given, and sklearn's
# train_test_split can split these objects into training and test sets.
X = numpy.asarray(xList)
y = numpy.asarray(labels)
wineNames = numpy.asarray(names)

# Hold out a fixed 30% of the data rows as the test set; the rest is for training.
# random_state is pinned to a specific integer instead of letting the random
# number generator pick an unrepeatable internal value, so re-running the code
# gives the same split. That helps while tuning during development; otherwise
# the randomness would mask the effect of each change.
# Caveat: fixing random_state also fixes the test set, so repeated tuning
# against it risks overfitting to that test set.
xTrain, xTest, yTrain, yTest = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=531,
)

 # Train a gradient boosting model that minimizes mean squared error.
#
# Advice on setting and tuning gradient boosting parameters:
# 1. Apart from setting subsample to 0.5, start training from the defaults.
# 2. After training, look at how out-of-sample (oos) performance on the test
#    data changes with the number of trees.
# 3. If test performance is still improving quickly at the right edge of the
#    plot, increase n_estimators or learning_rate.
# 4. If test performance deteriorates quickly at the right edge of the plot,
#    reduce learning_rate.
# 5. Once the test curve has improved overall and is roughly flat on the
#    right, try changing max_depth and max_features.

nEst = 2000
depth = 7
learnRate = 0.01
subSamp = 0.5

# `loss` is deliberately left at its default, which is least-squares
# regression. The explicit spelling 'ls' was renamed to 'squared_error' in
# newer scikit-learn releases and then removed, so omitting the argument
# keeps this snippet working on both old and new versions.
# NOTE(review): no random_state is passed here, so with subsample < 1 the
# fitted model (and the numbers reported below) may vary between runs.
wineGBMModel = ensemble.GradientBoostingRegressor(
    n_estimators=nEst,
    max_depth=depth,
    learning_rate=learnRate,
    subsample=subSamp,
)

wineGBMModel.fit(xTrain, yTrain)

运行后显示：

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.01, loss='ls', max_depth=7,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=2000,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=0.5, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

 # Compute the test-set MSE after each boosting stage.
# staged_predict is an iterator yielding the test predictions stage by stage,
# which shows how the test error relates to the number of trees.
msError = [
    mean_squared_error(yTest, stagePrediction)
    for stagePrediction in wineGBMModel.staged_predict(xTest)
]

# Report the lowest test MSE and the (0-based) list index where it occurs.
lowestError = min(msError)
print("MSE" )
print(lowestError)
print(msError.index(lowestError))

运行后显示：

MSE
0.3155227031463733
826

# Plot training and test errors against the number of trees in the ensemble.
# Both curves share the same x axis: 1 .. nEst trees.
treeCounts = range(1, nEst + 1)

plot.figure(figsize=(12,8))
plot.plot(treeCounts, wineGBMModel.train_score_, label='Training Set MSE')
plot.plot(treeCounts, msError, label='Test Set MSE')
plot.legend(loc='upper right')
plot.xlabel('Number of Trees in Ensemble')
plot.ylabel('Mean Squared Error')
plot.show()

运行后显示（训练集与测试集的MSE随决策树数目变化的曲线图，原图略）：

 # Plot the feature importances of the fitted model as a horizontal bar chart.
rawImportance = wineGBMModel.feature_importances_

# Scale so that the most important feature has importance 1.0.
featureImportance = rawImportance / rawImportance.max()

# Indices that order the features from least to most important.
idxSorted = numpy.argsort(featureImportance)

# Bar centres at 0.5, 1.5, 2.5, ... (one per feature).
barPos = numpy.arange(len(idxSorted)) + 0.5

plot.barh(barPos, featureImportance[idxSorted], align='center')
# Label each bar with the matching column name from the CSV header.
plot.yticks(barPos, wineNames[idxSorted])
plot.xlabel('Variable Importance')
plot.subplots_adjust(left=0.2, right=0.9, top=0.9, bottom=0.1)
plot.show()

运行后显示（各属性相对重要性的水平条形图，原图略）：

相关阅读:
linux cpu load学习笔记
 P1064 金明的预算方案
 P1757 通天之分组背包
 P1352 没有上司的舞会
 P1651 塔
 P1250 种树
 P1938 [USACO09NOV]找工就业Job Hunt
P4392 [BOI2007]Sound 静音问题
 P3884 [JLOI2009]二叉树问题
 P2880 [USACO07JAN]平衡的阵容Balanced Lineup
原文地址：https://www.cnblogs.com/jaysonguan/p/12443464.html