• [学习笔记][Python机器学习:预测分析核心算法][利用Python集成方法工具包构建梯度提升模型]


    参考:

    1、《Python机器学习:预测分析核心算法》 P258-P266

    1 import numpy
    2 
    3 #from sklearn.cross_validation import train_test_split
    4 from sklearn.model_selection import train_test_split
    5 
    6 #这里是可以用来构建GradientBoostingRegressor模型
    7 from sklearn import ensemble
    8 from sklearn.metrics import mean_squared_error
    9 import pylab as plot
    1 # 从本机读取数据
    2 target_file = open('winequality-red.csv','r')
    3 data = target_file.readlines()
    4 target_file.close()
     1 #整理原始数据,将原始数据分为属性列表(xList),标签列表(labels)
     2 #将各个属性的名称存入names列表
     3 xList = []
     4 labels = []
     5 names = []
     6 firstLine = True
     7 for line in data:
     8     if firstLine:
     9         names = line.strip().split(";")
    10         firstLine = False
    11     else:
    12         #split on semi-colon
    13         row = line.strip().split(";")
    14         #put labels in separate array
    15         labels.append(float(row[-1]))
    16         #remove label from row
    17         row.pop()
    18         #convert row to floats
    19         floatRow = [float(num) for num in row]
    20         xList.append(floatRow)
    21 
    22 #计算属性列表的行数和列数
    23 nrows = len(xList)
    24 ncols = len(xList[0])
    1 #将各列表转为numpy数组形式,此形式是RandomForestRegressor的要求
    2 #并且这些对象可以使用sklearn的train_test_split构建训练和测试集
    3 X = numpy.array(xList)
    4 y = numpy.array(labels)
    5 wineNames = numpy.array(names)
    1 #构建test集为30%规模的训练集和测试集
    2 #random_state设置为一个特殊整数,而不是让随机数生成器自己选择一个不可重复的内部值
    3 #这样重复代码可以获得同样的结果,便于开发阶段的调整,否则随机性会掩盖所做的改变
    4 #固定random_state就固定了测试集,会对测试数据集过度训练
    5 #take fixed holdout set 30% of data rows
    6 xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size=0.30, random_state=531)
     1 # Train gradient boosting model to minimize mean squared error
     2 
     3 #对梯度提升法参数设置和调整的建议:
     4 #1、除了设置subsample为0.5的情况,其他情况都以缺省值开始训练。
     5 #2、模型训练完成后,观察模型在测试数据(out_of_sample, oos)下的预测性能与决策树数目的关系及变化。
     6 #3、如果测试数据性能在图的右侧迅速提高,增加n_estimators或者learning_rate
     7 #4、如果测试数据性能在图的右侧迅速恶化,则减少learning_rate
     8 #5、一旦测试数据的性能曲线在整体都有改善,并且图的右侧基本持平,则尝试改变max_depth和max_features
     9 
    10 nEst = 2000
    11 depth = 7
    12 learnRate = 0.01
    13 subSamp = 0.5
    14 
    15 wineGBMModel = ensemble.GradientBoostingRegressor(n_estimators=nEst,
    16                                                   max_depth=depth,
    17                                                   learning_rate=learnRate,
    18                                                   subsample = subSamp,
    19                                                   loss='ls')
    20 
    21 wineGBMModel.fit(xTrain, yTrain)

    运行后显示:

    GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                              init=None, learning_rate=0.01, loss='ls', max_depth=7,
                              max_features=None, max_leaf_nodes=None,
                              min_impurity_decrease=0.0, min_impurity_split=None,
                              min_samples_leaf=1, min_samples_split=2,
                              min_weight_fraction_leaf=0.0, n_estimators=2000,
                              n_iter_no_change=None, presort='deprecated',
                              random_state=None, subsample=0.5, tol=0.0001,
                              validation_fraction=0.1, verbose=0, warm_start=False)

     1 # compute mse on test set
     2 msError = []
     3 #利用迭代器staged_predict观察测试数据误差与决策树数目的关系
     4 predictions = wineGBMModel.staged_predict(xTest)
     5 for p in predictions:
     6     msError.append(mean_squared_error(yTest, p))
     7 
     8 print("MSE" )
     9 print(min(msError))
    10 print(msError.index(min(msError)))

    运行后显示:

    MSE
    0.3155227031463733
    826
    1 #plot training and test errors vs number of trees in ensemble
    2 plot.figure(figsize=(12,8))
    3 plot.plot(range(1, nEst + 1), wineGBMModel.train_score_, label='Training Set MSE')
    4 plot.plot(range(1, nEst + 1), msError, label='Test Set MSE')
    5 plot.legend(loc='upper right')
    6 plot.xlabel('Number of Trees in Ensemble')
    7 plot.ylabel('Mean Squared Error')
    8 plot.show()

    运行后显示:

     1 # Plot feature importance
     2 featureImportance = wineGBMModel.feature_importances_
     3 
     4 # normalize by max importance
     5 featureImportance = featureImportance / featureImportance.max()
     6 idxSorted = numpy.argsort(featureImportance)
     7 barPos = numpy.arange(idxSorted.shape[0]) + .5
     8 plot.barh(barPos, featureImportance[idxSorted], align='center')
     9 plot.yticks(barPos, wineNames[idxSorted])
    10 plot.xlabel('Variable Importance')
    11 plot.subplots_adjust(left=0.2, right=0.9, top=0.9, bottom=0.1)
    12 plot.show()

    运行后显示:

  • 相关阅读:
    linux下ipc
    c各种打印集合
    linux环境下,利用gsoap生成webservice客户端进行应用程序开发的小结 (轉)
    Linux C函数之错误处理函数
    Linux操作系统文件系统基础知识详解
    常用字符串操作函数
    ASP.NET偷懒大法二
    ASP.NET偷懒大法三 (利用Attribute特性简化多查询条件拼接sql语句的麻烦)
    用JavaScript获取Asp.net服务器端控件CheckBoxList的选中值数组
    ASP.NET偷懒大法四(动态生成表单对象)
  • 原文地址:https://www.cnblogs.com/jaysonguan/p/12443464.html
Copyright © 2020-2023  润新知