  • sklearn-GBDT


    GBDT can handle both classification and regression problems.

    Regression

    def __init__(self, loss='ls', learning_rate=0.1, n_estimators=100,
                     subsample=1.0, criterion='friedman_mse', min_samples_split=2,
                     min_samples_leaf=1, min_weight_fraction_leaf=0.,
                     max_depth=3, min_impurity_decrease=0.,
                     min_impurity_split=None, init=None, random_state=None,
                     max_features=None, alpha=0.9, verbose=0, max_leaf_nodes=None,
                     warm_start=False, presort='auto')

    Example

    import numpy as np
    from sklearn.metrics import mean_squared_error
    from sklearn.datasets import make_friedman1
    from sklearn.ensemble import GradientBoostingRegressor
    
    X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
    X_train, X_test = X[:200], X[200:]
    y_train, y_test = y[:200], y[200:]
    
    ### Loss functions
    # Absolute error: L = |y - f(x)|; its negative gradient is sign(y - f(x)), i.e. either 1 or -1. In sklearn this is loss='lad'
    # Huber loss: loss='huber' in sklearn
    # Squared error: loss='ls' in sklearn
    est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=1, random_state=0, loss='huber').fit(X_train, y_train)
    
    pred = est.predict(X_test)
    error = mean_squared_error(pred, y_test)
    
    print(max(y_test), min(y_test))  # 27.214332670044374 0.8719243023544349
    print(error)
    # loss='ls' 5.009154859960321
    # loss='lad' 5.817510629608294
    # loss='huber' 4.690823542377095
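
    The three MSE values above come from refitting the same model with each loss in turn. A minimal sketch of that loop, reusing X_train/X_test/y_train/y_test from the example above (note: in newer sklearn releases the aliases 'ls' and 'lad' were renamed to 'squared_error' and 'absolute_error'):

    # Sketch: compare the three losses discussed above on the same split
    for loss in ('ls', 'lad', 'huber'):
        est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=1,
                                        random_state=0, loss=loss).fit(X_train, y_train)
        print(loss, mean_squared_error(y_test, est.predict(X_test)))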

    Classification

    def __init__(self, loss='deviance', learning_rate=0.1, n_estimators=100,
                     subsample=1.0, criterion='friedman_mse', min_samples_split=2,
                     min_samples_leaf=1, min_weight_fraction_leaf=0.,
                     max_depth=3, min_impurity_decrease=0.,
                     min_impurity_split=None, init=None,
                     random_state=None, max_features=None, verbose=0,
                     max_leaf_nodes=None, warm_start=False,
                     presort='auto')
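
    With the default loss='deviance' (the logistic/deviance loss), the fitted classifier also exposes class probabilities via predict_proba. A minimal sketch on a synthetic dataset; make_classification and the hyperparameters here are illustrative choices, not from the original:

    from sklearn.datasets import make_classification
    from sklearn.ensemble import GradientBoostingClassifier

    # Illustrative toy data; any binary classification set works
    X, y = make_classification(n_samples=1000, random_state=0)
    clf = GradientBoostingClassifier(loss='deviance', n_estimators=100,
                                     learning_rate=0.1, max_depth=3,
                                     random_state=0).fit(X[:800], y[:800])
    print(clf.predict_proba(X[800:803]))  # per-class probabilities (deviance is a probabilistic loss)
    print(clf.score(X[800:], y[800:]))    # mean accuracy on the held-out rows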

    Example

    from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
    from sklearn.model_selection import GridSearchCV, train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.metrics import accuracy_score, mean_squared_error
    from time import time
    import numpy as np
    import pandas as pd
    import mnist  # helper module from the repo linked under References
    
    
    if __name__ == "__main__":
        # Load the MNIST dataset to test the GBDT classification model
        mnistSet = mnist.loadLecunMnistSet()
        train_X, train_Y, test_X, test_Y = mnistSet[0], mnistSet[1], mnistSet[2], mnistSet[3]
    
        m, n = np.shape(train_X)
        idx = list(range(m))  # materialize the range so it can be shuffled in place
        np.random.shuffle(idx)
    
        # Optional: reduce dimensionality with PCA (requires `from sklearn.decomposition import PCA`)
        # num = 30000
        # pca = PCA(n_components=0.9, whiten=True, random_state=0)
        # for i in range(int(np.ceil(1.0 * m / num))):
        #     minEnd = min((i + 1) * num, m)
        #     sub_idx = idx[i * num:minEnd]
        #     train_pca_X = pca.fit_transform(train_X[sub_idx])
        #     print(np.shape(train_pca_X))
    
        print("\n**********Testing the GradientBoostingClassifier class**********")
        t = time()
        # Parameter grids tried one at a time when tuning with GridSearchCV:
        # param_grid1 = {"n_estimators": range(1000, 2001, 100)}
        # param_grid2 = {'max_depth': range(30, 71, 10), 'min_samples_split': range(4, 9, 2)}
        # param_grid3 = {'min_samples_split': range(4, 9, 2), 'min_samples_leaf': range(3, 12, 2)}
        # param_grid4 = {'subsample': np.arange(0.6, 1.0, 0.05)}
        # model = GridSearchCV(
        #     estimator=GradientBoostingClassifier(max_features=90, max_depth=40, min_samples_split=8, learning_rate=0.1,
        #                                          n_estimators=1800),
        #     param_grid=param_grid4, cv=3)
        # # Fit the training set
        # model.fit(train_X, train_Y)
        # print("Best parameters: %s, score: %0.2f" % (model.best_params_, model.best_score_))
        model = GradientBoostingClassifier(max_features=90, max_depth=40, min_samples_split=8, min_samples_leaf=3,
                                           n_estimators=1200, learning_rate=0.05, subsample=0.95)
        # Fit the training set
        model.fit(train_X, train_Y)
        # Predict on the (shuffled) training set
        train_Y_hat = model.predict(train_X[idx])
        print("Training accuracy: ", accuracy_score(train_Y[idx], train_Y_hat))
        # Predict on the test set
        test_Y_hat = model.predict(test_X)
        print("Test accuracy: ", accuracy_score(test_Y, test_Y_hat))
        print("Total time:", time() - t)
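
    The script above depends on the author's local mnist helper module (from the repo linked under References). For a self-contained run, here is a minimal sketch on sklearn's bundled digits dataset; the smaller hyperparameters are illustrative choices for the much smaller data, not the author's settings:

    from sklearn.datasets import load_digits
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score

    # 8x8 digits: 1797 samples, 64 features — small enough to train in seconds
    X, y = load_digits(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    clf = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, max_depth=3,
                                     subsample=0.95, random_state=0)
    clf.fit(X_train, y_train)
    print("Training accuracy:", accuracy_score(y_train, clf.predict(X_train)))
    print("Test accuracy:", accuracy_score(y_test, clf.predict(X_test)))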

    References:

    https://github.com/haidawyl/Mnist  — usage examples for various models
