• House Price Prediction (Advanced Version, Test)


    # coding: utf-8
    
    import numpy as np
    import pandas as pd
    from sklearn.linear_model import Ridge
    from sklearn.model_selection import cross_val_score
    import matplotlib.pyplot as plt
    from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor
    from xgboost import XGBRegressor
    
    # Drop the first column (Id); it is only used as the index.
    train_df = pd.read_csv('./input/train.csv', index_col=0)
    test_df = pd.read_csv('./input/test.csv', index_col=0)
    prices = pd.DataFrame({'price':train_df['SalePrice'], 'log(price + 1)':np.log1p(train_df['SalePrice'])})
    # print(train_df.columns)
    # prices.hist()
    # print('ok')
    # print(train_df.index)
    # print(test_df.index)
    
    y_train = np.log1p(train_df.pop('SalePrice'))
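    # Note: log1p compresses the right-skewed SalePrice distribution into a roughly
    # normal target; predictions are mapped back with np.expm1 at the end.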
    # print(y_train.shape)
    # print(train_df.index)
    
    all_df = pd.concat((train_df,test_df), axis=0)
    # Feature transformation
    print(train_df.index)
    print(test_df.index)
    
    # print(all_df['MSSubClass'].dtypes)
    all_df['MSSubClass'] = all_df['MSSubClass'].astype(str)
    # print(all_df.shape)
    # print(all_df['MSSubClass'].value_counts())
    # print(all_df['MSSubClass'].dtypes)
    # print(pd.get_dummies(all_df['MSSubClass'], prefix='MSSubClass').head())
    # When numbers are used to encode categorical variables, beware: numbers carry an
    # inherent order, so misusing them can mislead the model later. Instead, represent
    # categories with One-Hot encoding (see the toy illustration below).
    # pandas' built-in get_dummies does One-Hot encoding in a single call.
    # One-Hot encode all the categorical columns:
    all_dummy_df = pd.get_dummies(all_df)
    # print(all_dummy_df.head())
    # print(all_dummy_df.isnull().sum().sort_values(ascending=False).head(10))
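    # Toy illustration of get_dummies (assumed example data, not part of the pipeline):
    # pd.get_dummies(pd.Series(['a', 'b', 'a'], name='col')) gives one 0/1 column per level:
    #    col_a  col_b
    # 0      1      0
    # 1      0      1
    # 2      1      0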
    # Handle missing values
    mean_cols = all_dummy_df.mean()
    # print(mean_cols)
    all_dummy_df = all_dummy_df.fillna(mean_cols)
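    # Note: the dummy columns produced by get_dummies contain no NaNs (a missing category
    # becomes an all-zero row), so the mean fill effectively touches only the numeric columns.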
    # print(all_dummy_df.isnull().sum().sum())
    # Standardize the numerical data. The 0/1 One-Hot columns should of course not be
    # standardized; the targets are the columns that were numerical to begin with.
    # First, find out which columns are numerical:
    numeric_cols = all_df.columns[all_df.dtypes != 'object']
    # print(numeric_cols)
    # print(train_df.index)
    numeric_col_means = all_dummy_df.loc[:, numeric_cols].mean()
    numeric_col_std = all_dummy_df.loc[:, numeric_cols].std()
    all_dummy_df.loc[:, numeric_cols] = (all_dummy_df.loc[:, numeric_cols] - numeric_col_means) / numeric_col_std
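    # Optional sanity check (an addition, not in the original): after standardization
    # the numeric columns should have mean ~0 and std ~1.
    # print(all_dummy_df.loc[:, numeric_cols].mean().abs().max())
    # print(all_dummy_df.loc[:, numeric_cols].std().mean())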
    
    dummy_train_df = all_dummy_df.loc[train_df.index]
    dummy_test_df = all_dummy_df.loc[test_df.index]
    # print(train_df.index)
    # print(test_df.index)
    # print(dummy_train_df.shape)
    # print(dummy_test_df.shape)
    # print(type(dummy_train_df))
    
    X_train = dummy_train_df.values
    X_test = dummy_test_df.values
    # print(type(X_train))
    
    print(X_train.shape)
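    # Ridge: scan alpha on a log grid; the score below is 10-fold CV RMSE on the
    # log1p(price) target, so smaller is better.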
    alphas = np.logspace(-3, 2, 50)
    test_scores = []
    for alpha in alphas:
        clf = Ridge(alpha=alpha)
        test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
        test_scores.append(np.mean(test_score))
    
    plt.plot(alphas, test_scores)
    plt.title('Alpha vs CV Error')
    plt.show()
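    # Report the best alpha found by CV (a convenience addition, not in the original):
    best_alpha = alphas[int(np.argmin(test_scores))]
    print('best alpha: %.3f (CV RMSE %.5f)' % (best_alpha, min(test_scores)))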
    
    max_features = [.1, .3, .5, .7, .9, .99]
    test_scores = []
    for max_feat in max_features:
        clf = RandomForestRegressor(n_estimators=200, max_features=max_feat)
        test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=5, scoring='neg_mean_squared_error'))
        test_scores.append(np.mean(test_score))
    
    plt.plot(max_features, test_scores)
    plt.title("Max Features vs CV Error")
    plt.show()
    
    # Now for a slightly fancier ensemble.
    # base_estimator can be left unset here (the built-in default is used), but the result
    # is worse than the already-tuned ridge base_estimator, as the plots below confirm.
    ridge = Ridge(alpha=15)
    
    # Bagging
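    # Note (an API-drift aside, an addition): scikit-learn >= 1.2 renames the
    # base_estimator argument of BaggingRegressor/AdaBoostRegressor to estimator.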
    params = [1, 10, 15, 20, 25, 30, 40]
    test_scores = []
    for param in params:
        clf = BaggingRegressor(n_estimators=param, base_estimator=ridge)
        test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
        test_scores.append(np.mean(test_score))
    
    plt.plot(params, test_scores)
    plt.title("Bagging n_estimators vs CV Error")
    plt.show()
    
    # Boosting (AdaBoost)
    params = [10, 15, 20, 25, 30, 35, 40, 45, 50]
    test_scores = []
    for param in params:
        clf = AdaBoostRegressor(n_estimators=param, base_estimator=ridge)
        test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
        test_scores.append(np.mean(test_score))
    
    plt.plot(params, test_scores)
    plt.title("AdaBoost n_estimators vs CV Error")
    plt.show()
    
    # XGBoost
    params = [1,2,3,4,5,6]
    test_scores = []
    for param in params:
        clf = XGBRegressor(max_depth=param)
        test_score = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=10, scoring='neg_mean_squared_error'))
        test_scores.append(np.mean(test_score))
    
    plt.plot(params, test_scores)
    plt.title("XGBoost max_depth vs CV Error")
    plt.show()
    
    
    """
    rf = RandomForestRegressor(n_estimators=500, max_features=.3)
    
    ridge.fit(X_train, y_train)
    rf.fit(X_train, y_train)
    
    y_ridge = np.expm1(ridge.predict(X_test))
    y_rf = np.expm1(rf.predict(X_test))
    y_final = (y_ridge + y_rf) / 2
    """
• Original article: https://www.cnblogs.com/TMatrix52/p/7718848.html