• 机器学习预测时label错位对未来数据做预测


    前言

      这篇文章是承接上一篇《机器学习经典模型使用归一化的影响》。这次又有了新的任务:通过将label错位来对未来数据做预测。

    实验过程

      使用不同的归一化方法、不同的模型,将测试集label错位,计算出MSE的大小;

      不断增大错位的数据的个数,每次都计算出MSE并画图。通过比较MSE(均方误差,mean squared error)的大小来得出结论。

    过程及结果

    数据处理(和上一篇的处理方式相同):

    1 test_sort_data = sort_data[:5000]
    2 test_sort_target = sort_target[:5000]
    3 
    4 sort_data1 = _sort_data[5000:16060]
    5 sort_data2 = _sort_data[16060:]
    6 sort_target1 = _sort_target[5000:16060]
    7 sort_target2 = _sort_target[16060:]
    View Code

    完整数据处理代码:

     1 #按时间排序
     2 sort_data = data.sort_values(by = 'time',ascending = True)
     3 
     4 sort_data.reset_index(inplace = True,drop = True)
     5 target = data['T1AOMW_AV']
     6 sort_target = sort_data['T1AOMW_AV']
     7 del data['T1AOMW_AV']
     8 del sort_data['T1AOMW_AV']
     9 
    10 from sklearn.model_selection import train_test_split
    11 test_sort_data = sort_data[16160:]
    12 test_sort_target = sort_target[16160:]
    13 
    14 _sort_data = sort_data[:16160]
    15 _sort_target = sort_target[:16160]
    16 
    17 from sklearn.model_selection import train_test_split
    18 test_sort_data = sort_data[:5000]
    19 test_sort_target = sort_target[:5000]
    20 
    21 sort_data1 = _sort_data[5000:16060]
    22 sort_data2 = _sort_data[16060:]
    23 sort_target1 = _sort_target[5000:16060]
    24 sort_target2 = _sort_target[16060:]
    25 
    26 import scipy.stats as stats
    27 dict_corr = {
    28     'spearman' : [],
    29     'pearson' : [],
    30     'kendall' : [],
    31     'columns' : []
    32 }
    33 
    34 for i in data.columns:
    35     corr_pear,pval = stats.pearsonr(sort_data[i],sort_target)
    36     corr_spear,pval = stats.spearmanr(sort_data[i],sort_target)
    37     corr_kendall,pval = stats.kendalltau(sort_data[i],sort_target)
    38     
    39     dict_corr['pearson'].append(abs(corr_pear))
    40     dict_corr['spearman'].append(abs(corr_spear))
    41     dict_corr['kendall'].append(abs(corr_kendall))
    42     
    43     dict_corr['columns'].append(i)
    44     
    45 # 筛选新属性  
    46 dict_corr =pd.DataFrame(dict_corr)
    47 dict_corr.describe()
    View Code

    选取三种相关系数都在25%分位数以上的特征(阈值应取自上面describe()的输出);

    1 new_fea = list(dict_corr[(dict_corr['pearson']>0.41) & (dict_corr['spearman']>0.45) & (dict_corr['kendall']>0.29)]['columns'].values)
    View Code

    定义下面几个列表,用来保存各模型的MSE,以便后面画图:

    1 import matplotlib.pyplot as plt 
    2 lr_plt=[]
    3 ridge_plt=[]
    4 svr_plt=[]
    5 RF_plt=[]
    View Code

    正常的计算mse(label没有移动):

     1 from sklearn.linear_model import LinearRegression,Lasso,Ridge
     2 from sklearn.preprocessing import MinMaxScaler,StandardScaler,MaxAbsScaler
     3 from sklearn.metrics import mean_squared_error as mse
     4 from sklearn.svm import SVR
     5 from sklearn.ensemble import RandomForestRegressor
     6 import xgboost as xgb
     7 #最大最小归一化
     8 mm = MinMaxScaler()
     9 
    10 lr = Lasso(alpha=0.5)
    11 lr.fit(mm.fit_transform(sort_data1[new_fea]), sort_target1)
    12 lr_ans = lr.predict(mm.transform(sort_data2[new_fea]))
    13 lr_mse=mse(lr_ans,sort_target2)
    14 lr_plt.append(lr_mse)
    15 print('lr:',lr_mse)
    16 
    17 ridge = Ridge(alpha=0.5)
    18 ridge.fit(mm.fit_transform(sort_data1[new_fea]),sort_target1)
    19 ridge_ans = ridge.predict(mm.transform(sort_data2[new_fea]))
    20 ridge_mse=mse(ridge_ans,sort_target2)
    21 ridge_plt.append(ridge_mse)
    22 print('ridge:',ridge_mse)
    23 
    24 svr = SVR(kernel='rbf',C=100,epsilon=0.1).fit(mm.fit_transform(sort_data1[new_fea]),sort_target1)
    25 svr_ans = svr.predict(mm.transform(sort_data2[new_fea]))
    26 svr_mse=mse(svr_ans,sort_target2)
    27 svr_plt.append(svr_mse)
    28 print('svr:',svr_mse)
    29 
    30 estimator_RF = RandomForestRegressor().fit(mm.fit_transform(sort_data1[new_fea]),sort_target1)
    31 predict_RF = estimator_RF.predict(mm.transform(sort_data2[new_fea]))
    32 RF_mse=mse(predict_RF,sort_target2)
    33 RF_plt.append(RF_mse)
    34 print('RF:',RF_mse)
    35 
    36 bst = xgb.XGBRegressor(learning_rate=0.1, n_estimators=550, max_depth=4, min_child_weight=5, seed=0,
    37                              subsample=0.7, colsample_bytree=0.7, gamma=0.1, reg_alpha=1, reg_lambda=1)
    38 bst.fit(mm.fit_transform(sort_data1[new_fea]),sort_target1)
    39 bst_ans = bst.predict(mm.transform(sort_data2[new_fea]))
    40 print('bst:',mse(bst_ans,sort_target2))
    View Code

    先让label移动5个:

    1 change_sort_data2 = sort_data2.shift(periods=5,axis=0)
    2 change_sort_target2 = sort_target2.shift(periods=-5,axis=0)
    3 change_sort_data2.dropna(inplace=True)
    4 change_sort_target2.dropna(inplace=True)
    View Code

    让label以5的倍数移动:

     1 mm = MinMaxScaler()
     2 
     3 for i in range(0,45,5):
     4     print(i)
     5     lr = Lasso(alpha=0.5)
     6     lr.fit(mm.fit_transform(sort_data1[new_fea]), sort_target1)
     7     lr_ans = lr.predict(mm.transform(change_sort_data2[new_fea]))
     8     lr_mse=mse(lr_ans,change_sort_target2)
     9     lr_plt.append(lr_mse)
    10     print('lr:',lr_mse)
    11     
    12     ridge = Ridge(alpha=0.5)
    13     ridge.fit(mm.fit_transform(sort_data1[new_fea]),sort_target1)
    14     ridge_ans = ridge.predict(mm.transform(change_sort_data2[new_fea]))
    15     ridge_mse=mse(ridge_ans,change_sort_target2)
    16     ridge_plt.append(ridge_mse)
    17     print('ridge:',ridge_mse)
    18     
    19     svr = SVR(kernel='rbf',C=100,epsilon=0.1).fit(mm.fit_transform(sort_data1[new_fea]),sort_target1)
    20     svr_ans = svr.predict(mm.transform(change_sort_data2[new_fea]))
    21     svr_mse=mse(svr_ans,change_sort_target2)
    22     svr_plt.append(svr_mse)
    23     print('svr:',svr_mse)
    24     
    25     estimator_RF = RandomForestRegressor().fit(mm.fit_transform(sort_data1[new_fea]),sort_target1)
    26     predict_RF = estimator_RF.predict(mm.transform(change_sort_data2[new_fea]))
    27     RF_mse=mse(predict_RF,change_sort_target2)
    28     RF_plt.append(RF_mse)
    29     print('RF:',RF_mse)
    30     
    31 #     bst = xgb.XGBRegressor(learning_rate=0.1, n_estimators=550, max_depth=4, min_child_weight=5, seed=0,
    32 #                              subsample=0.7, colsample_bytree=0.7, gamma=0.1, reg_alpha=1, reg_lambda=1)
    33 #     bst.fit(mm.fit_transform(sort_data1[new_fea]),sort_target1)
    34 #     bst_ans = bst.predict(mm.transform(change_sort_data2[new_fea]))
    35 #     print('bst:',mse(bst_ans,change_sort_target2))
    36     
    37     change_sort_target2=change_sort_target2.shift(periods=-5,axis=0)
    38     change_sort_target2.dropna(inplace=True)
    39     change_sort_data2 = change_sort_data2.shift(periods=5,axis=0)
    40     change_sort_data2.dropna(inplace=True)
    View Code

    结果如图:

    然后就是画图了(代码中的x为label错位的个数,文中没有给出定义;若上面的代码各运行一次,每个列表应有10个结果,可取 x = list(range(0, 50, 5))):

    1 plt.plot(x,lr_plt,label='lr',color='r',marker='o')
    2 plt.plot(x,ridge_plt,label='ridge',color='b',marker='o')
    3 plt.plot(x,svr_plt,label='svr',color='g',marker='o')
    4 plt.plot(x,RF_plt,label='RF',color='y',marker='o')
    5 plt.legend()
    6 plt.show()
    View Code

    舍去lr,并扩大纵坐标:

    1 #plt.plot(x,lr_plt,label='lr',color='r',marker='o')
    2 plt.plot(x,ridge_plt,label='ridge',color='b',marker='o')
    3 plt.plot(x,svr_plt,label='svr',color='g',marker='o')
    4 plt.plot(x,RF_plt,label='RF',color='y',marker='o')
    5 plt.legend()
    6 plt.show()
    View Code

    其他归一化方法只需将MinMaxScaler改为MaxAbsScaler或StandardScaler即可;

    总的来说,label的移动会使得MSE增加,大约在label错位10个的时候差异最小,结果最理想。

  • 相关阅读:
    贪心算法部分题目及知识点总结
    贪心算法(农夫修泥塘)
    贪心算法部分知识点
    丑数运算 一、((输出丑数n的下标)(给定丑数输下标)) 二、((求第n个丑数是谁)(给定下标求丑数))
    关于学习STL部分学到的零碎知识点
    STL中set与map的使用以及优先队列的部分补充内容以及重载运算符的使用
    回文素数与接水问题(OJ)
    关于字符串与整数转化的问题与一些常用字符串处理函数
    部分STL简单应用知识点
    【Python小游戏】俄罗斯方块
  • 原文地址:https://www.cnblogs.com/csushl/p/10005356.html
Copyright © 2020-2023  润新知