• 使用 learning_curve 对 load_boston 进行 linear_regression


    原创转载请注明出处:https://www.cnblogs.com/agilestyle/p/12697359.html

    准备数据

    import matplotlib.pyplot as plt
    import numpy as np
    from sklearn.datasets import load_boston
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import learning_curve
    from sklearn.model_selection import train_test_split
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import PolynomialFeatures
    
    # 准备数据
    samples = load_boston()
    
    samples
    
    # dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])
    samples.keys()
    # (506, 13)
    samples['data'].shape
    # (506,)
    samples['target'].shape
    
    # print(samples['DESCR'])

    分割训练集和测试集

    # 分割训练集和测试集
    X_train, X_test, y_train, y_test = train_test_split(samples['data'], samples['target'], test_size=0.2, random_state=3)
    # (404, 13)
    X_train.shape

    建模训练

    # 建模训练
    lr_model = LinearRegression(normalize=True)
    # LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)
    lr_model.fit(X_train, y_train)

    评估模型

    # 评估模型
    train_score = lr_model.score(X_train, y_train)
    test_score = lr_model.score(X_test, y_test)
    # 0.7239410298290111
    train_score
    # 0.7952617563243858
    test_score

    欠拟合,高偏差,构建多项式特征

    def polynomial_model(degree=1):
        polynomial_features = PolynomialFeatures(degree=degree, include_bias=False, interaction_only=False)
        linear_regression = LinearRegression(normalize=True)
        pipeline = Pipeline([('polynomial_features', polynomial_features),
                             ('linear_regression', linear_regression)])
        return pipeline
    
    
    p2_model = polynomial_model(2)
    p2_model.fit(X_train, y_train)
    train_score = p2_model.score(X_train, y_train)
    test_score = p2_model.score(X_test, y_test)
    
    # 0.9305468799409319
    print('score on train set: ', train_score)
    # 0.8600492818189014
    print('score on test set: ', test_score)

    绘制学习曲线

    学习曲线是用来判断训练模型的一种方法,通过观察绘制出来的学习曲线图,可以比较直观的了解到模型处于一个什么样的状态,如:过拟合(overfitting)或欠拟合(underfitting)。可以很好的表示,当训练数据集增加时,模型对训练数据集拟合的准确性,以及对交叉验证数据集预测的准确性的变化规律。

    # array([0.1  , 0.325, 0.55 , 0.775, 1.   ])
    train_sizes = np.linspace(0.1, 1.0, 5)
    train_sizes, train_scores, test_scores = learning_curve(p2_model, X_train, y_train, cv=10, train_sizes=train_sizes)
    
    train_scores_mean = np.mean(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    plt.grid()
    
    plt.ylim(-1, 1)
    plt.plot(train_sizes, train_scores_mean, 'ro-', label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'go-', label="Cross-validation score")
    
    plt.legend()

    Reference

    https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.learning_curve.html

    https://blog.csdn.net/qq_36523839/article/details/82556932

  • 相关阅读:
    别人的代码
    ZOJ 1914 Arctic Network
    今日所得 2.22
    poj 2031 Building a Space Station
    POJ 1251 Jungle Roads
    优秀的开发者 vs. 糟糕的开发者
    有关读书求知的一些想法
    开发者拒绝写技术博客的常见理由
    写代码如坐禅:你是哪一类程序员?
    [C++] c++ new() 与new[]的区别
  • 原文地址:https://www.cnblogs.com/agilestyle/p/12697359.html
Copyright © 2020-2023  润新知