任务4:使用特征工程对比赛字段进行编码
对数据集中的类别字段(取值空间大于2)进行one-hot操作
对类别特征进行One-Hot编码
# One-hot encode the categorical columns of the training frame: every
# distinct value of a listed column becomes its own indicator column.
Train_data_onehot = pd.get_dummies(
    data=Train_data,
    columns=[
        'model',
        'brand',
        'bodyType',
        'fuelType',
        'gearbox',
        'notRepairedDamage',
    ],
)
Train_data_onehot
SaleID | name | regDate | power | kilometer | regionCode | seller | offerType | creatDate | price | ... | fuelType_2.0 | fuelType_3.0 | fuelType_4.0 | fuelType_5.0 | fuelType_6.0 | gearbox_0.0 | gearbox_1.0 | notRepairedDamage_- | notRepairedDamage_0.0 | notRepairedDamage_1.0 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 736 | 20040402 | 60 | 12.5 | 1046 | 0 | 0 | 20160404 | 1850 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
1 | 1 | 2262 | 20030301 | 0 | 15.0 | 4366 | 0 | 0 | 20160309 | 3600 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
2 | 2 | 14874 | 20040403 | 163 | 12.5 | 2806 | 0 | 0 | 20160402 | 6222 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
3 | 3 | 71865 | 19960908 | 193 | 15.0 | 434 | 0 | 0 | 20160312 | 2400 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
4 | 4 | 111080 | 20120103 | 68 | 5.0 | 6977 | 0 | 0 | 20160313 | 5200 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
149995 | 149995 | 163978 | 20000607 | 163 | 15.0 | 4576 | 0 | 0 | 20160327 | 5900 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
149996 | 149996 | 184535 | 20091102 | 125 | 10.0 | 2826 | 0 | 0 | 20160312 | 9500 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
149997 | 149997 | 147587 | 20101003 | 90 | 6.0 | 3302 | 0 | 0 | 20160328 | 7500 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
149998 | 149998 | 45907 | 20060312 | 156 | 15.0 | 1877 | 0 | 0 | 20160401 | 4999 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
149999 | 149999 | 177672 | 19990204 | 193 | 12.5 | 235 | 0 | 0 | 20160305 | 4700 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
150000 rows × 334 columns
# One-hot encode the categorical columns of the test frame.
Test_data_onehot = pd.get_dummies(
    Test_data,
    columns=['model', 'brand', 'bodyType', 'fuelType',
             'gearbox', 'notRepairedDamage'],
)
# Train and test are encoded independently, so a category value that occurs
# in only one of them yields an indicator column the other frame lacks.
# Align the test frame to the training feature columns (everything except
# the label): indicators missing from test are added as all-zero columns and
# indicators never seen in training are dropped.
Test_data_onehot = Test_data_onehot.reindex(
    columns=Train_data_onehot.columns.drop('price', errors='ignore'),
    fill_value=0,
)
Test_data_onehot
SaleID | name | regDate | power | kilometer | regionCode | seller | offerType | creatDate | v_0 | ... | fuelType_2.0 | fuelType_3.0 | fuelType_4.0 | fuelType_5.0 | fuelType_6.0 | gearbox_0.0 | gearbox_1.0 | notRepairedDamage_- | notRepairedDamage_0.0 | notRepairedDamage_1.0 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 200000 | 133777 | 20000501 | 101 | 15.0 | 5019 | 0 | 0 | 20160308 | 42.142061 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
1 | 200001 | 61206 | 19950211 | 73 | 6.0 | 1505 | 0 | 0 | 20160310 | 43.907034 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
2 | 200002 | 67829 | 20090606 | 120 | 5.0 | 1776 | 0 | 0 | 20160309 | 45.389665 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
3 | 200003 | 8892 | 20020601 | 58 | 15.0 | 26 | 0 | 0 | 20160314 | 42.788775 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
4 | 200004 | 76998 | 20030301 | 116 | 15.0 | 738 | 0 | 0 | 20160306 | 43.670763 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
49995 | 249995 | 111443 | 20041005 | 150 | 15.0 | 5564 | 0 | 0 | 20160309 | 46.321013 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 |
49996 | 249996 | 152834 | 20130409 | 179 | 4.0 | 5220 | 0 | 0 | 20160323 | 48.086547 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
49997 | 249997 | 132531 | 20041211 | 147 | 12.5 | 3795 | 0 | 0 | 20160316 | 46.145279 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
49998 | 249998 | 143405 | 20020702 | 176 | 15.0 | 61 | 0 | 0 | 20160327 | 45.507088 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
49999 | 249999 | 78202 | 20090708 | 0 | 3.0 | 4158 | 0 | 0 | 20160401 | 44.289471 | ... | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
50000 rows × 329 columns
对日期特征提取年月日等信息
# Parse the YYYYMMDD-formatted date columns of both frames into datetime
# Series. errors='coerce' turns values that do not match the format into NaT
# instead of raising.
Train_data_create, Test_data_create = (
    pd.to_datetime(frame['creatDate'], format='%Y%m%d', errors='coerce')
    for frame in (Train_data, Test_data)
)
Train_data_reg, Test_data_reg = (
    pd.to_datetime(frame['regDate'], format='%Y%m%d', errors='coerce')
    for frame in (Train_data, Test_data)
)
Train_data_create
0 2016-04-04
1 2016-03-09
2 2016-04-02
3 2016-03-12
4 2016-03-13
...
149995 2016-03-27
149996 2016-03-12
149997 2016-03-28
149998 2016-04-01
149999 2016-03-05
Name: creatDate, Length: 150000, dtype: datetime64[ns]
Train_data_reg
0 2004-04-02
1 2003-03-01
2 2004-04-03
3 1996-09-08
4 2012-01-03
...
149995 2000-06-07
149996 2009-11-02
149997 2010-10-03
149998 2006-03-12
149999 1999-02-04
Name: regDate, Length: 150000, dtype: datetime64[ns]
任务5:使用Sklearn中基础树模型完成训练和预测
学会五折交叉验证的数据划分方法(KFold)
import numpy as np
from sklearn.model_selection import KFold
# Toy data: five samples with two features each, and one label per sample.
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [3, 4]])
y = np.array([1, 2, 3, 4, 5])

# Five folds over five samples, so every fold holds out a single sample.
kf = KFold(n_splits=5)
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    # Index the arrays with the fold's positions to build each subset.
    X_train = X[train_index]
    X_test = X[test_index]
    y_train = y[train_index]
    y_test = y[test_index]
    print(X_train, X_test)
    print(y_train, y_test)
TRAIN: [1 2 3 4] TEST: [0]
[[3 4]
[1 2]
[3 4]
[3 4]] [[1 2]]
[2 3 4 5] [1]
TRAIN: [0 2 3 4] TEST: [1]
[[1 2]
[1 2]
[3 4]
[3 4]] [[3 4]]
[1 3 4 5] [2]
TRAIN: [0 1 3 4] TEST: [2]
[[1 2]
[3 4]
[3 4]
[3 4]] [[1 2]]
[1 2 4 5] [3]
TRAIN: [0 1 2 4] TEST: [3]
[[1 2]
[3 4]
[1 2]
[3 4]] [[3 4]]
[1 2 3 5] [4]
TRAIN: [0 1 2 3] TEST: [4]
[[1 2]
[3 4]
[1 2]
[3 4]] [[3 4]]
[1 2 3 4] [5]
对标签price按照大小划分成10等分,然后使用StratifiedKFold进行划分
# Split the price labels into ten equally sized chunks, ordered by price.
Y_data = Train_data['price']
Y_data = Y_data.sort_values()

# Derive the chunk boundaries from the data length instead of hard-coding
# 15000 rows per chunk, so the ten chunks still cover every row when the
# training set does not have exactly 150000 rows. .iloc makes the positional
# slicing of the (re-ordered) Series explicit.
n_parts = 10
n_rows = len(Y_data)
Y_data_dict = {
    part: Y_data.iloc[part * n_rows // n_parts:(part + 1) * n_rows // n_parts]
    for part in range(n_parts)
}

# Chunk ids (0..9) and the chunk contents as plain Python lists.
Y_data_iloc = list(Y_data_dict)
Y_data_list = [list(chunk) for chunk in Y_data_dict.values()]
Y_data_iloc
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
# Display the number of prices held by the first chunk.
len(Y_data_list[0])
15000
import numpy as np
from sklearn.model_selection import StratifiedKFold

# One alternating 0/1 label per price chunk, used as the stratification
# target below.
# NOTE(review): these labels are not derived from the prices, and the split
# operates on the ten chunks rather than on individual samples - confirm this
# is the intended demonstration.
Y_data_iloc = np.array([0, 1] * 5)
Y_data_list = np.array(Y_data_list)

# Shuffled, seeded 5-fold stratified splitter.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for train_index, test_index in skf.split(Y_data_list, Y_data_iloc):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train = Y_data_list[train_index]
    X_test = Y_data_list[test_index]
    y_train = Y_data_iloc[train_index]
    y_test = Y_data_iloc[test_index]
TRAIN: [0 3 4 5 6 7 8 9] TEST: [1 2]
TRAIN: [0 1 2 3 6 7 8 9] TEST: [4 5]
TRAIN: [1 2 4 5 6 7 8 9] TEST: [0 3]
TRAIN: [0 1 2 3 4 5 7 8] TEST: [6 9]
TRAIN: [0 1 2 3 4 5 6 9] TEST: [7 8]
学会使用sklearn中的随机森林模型
学习博客链接:https://www.cnblogs.com/banshaohuan/p/13308680.html
任务6:成功将树模型的预测结果文件提交到天池
使用StratifiedKFold配合随机森林完成模型的训练和预测
在每折记录下模型对验证集和测试集的预测结果
# Replace missing feature values in the training matrix with the sentinel -1.
X_data = X_data.fillna(value=-1)
X_data
gearbox | power | kilometer | v_0 | v_1 | v_2 | v_3 | v_4 | v_5 | v_6 | v_7 | v_8 | v_9 | v_10 | v_11 | v_12 | v_13 | v_14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 60 | 12.5 | 43.357796 | 3.966344 | 0.050257 | 2.159744 | 1.143786 | 0.235676 | 0.101988 | 0.129549 | 0.022816 | 0.097462 | -2.881803 | 2.804097 | -2.420821 | 0.795292 | 0.914762 |
1 | 0.0 | 0 | 15.0 | 45.305273 | 5.236112 | 0.137925 | 1.380657 | -1.422165 | 0.264777 | 0.121004 | 0.135731 | 0.026597 | 0.020582 | -4.900482 | 2.096338 | -1.030483 | -1.722674 | 0.245522 |
2 | 0.0 | 163 | 12.5 | 45.978359 | 4.823792 | 1.319524 | -0.998467 | -0.996911 | 0.251410 | 0.114912 | 0.165147 | 0.062173 | 0.027075 | -4.846749 | 1.803559 | 1.565330 | -0.832687 | -0.229963 |
3 | 1.0 | 193 | 15.0 | 45.687478 | 4.492574 | -0.050616 | 0.883600 | -2.228079 | 0.274293 | 0.110300 | 0.121964 | 0.033395 | 0.000000 | -4.509599 | 1.285940 | -0.501868 | -2.438353 | -0.478699 |
4 | 0.0 | 68 | 5.0 | 44.383511 | 2.031433 | 0.572169 | -1.571239 | 2.246088 | 0.228036 | 0.073205 | 0.091880 | 0.078819 | 0.121534 | -1.896240 | 0.910783 | 0.931110 | 2.834518 | 1.923482 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
149995 | 1.0 | 163 | 15.0 | 45.316543 | -3.139095 | -1.269707 | -0.736609 | -1.505820 | 0.280264 | 0.000310 | 0.048441 | 0.071158 | 0.019174 | 1.988114 | -2.983973 | 0.589167 | -1.304370 | -0.302592 |
149996 | 0.0 | 125 | 10.0 | 45.972058 | -3.143764 | -0.023523 | -2.366699 | 0.698012 | 0.253217 | 0.000777 | 0.084079 | 0.099681 | 0.079371 | 1.839166 | -2.774615 | 2.553994 | 0.924196 | -0.272160 |
149997 | 0.0 | 90 | 6.0 | 44.733481 | -3.105721 | 0.595454 | -2.279091 | 1.423661 | 0.233353 | 0.000705 | 0.118872 | 0.100118 | 0.097914 | 2.439812 | -1.630677 | 2.290197 | 1.891922 | 0.414931 |
149998 | 0.0 | 156 | 15.0 | 45.658634 | -3.204785 | -0.441680 | -1.179812 | 0.620680 | 0.256369 | 0.000252 | 0.081479 | 0.083558 | 0.081498 | 2.075380 | -2.633719 | 1.414937 | 0.431981 | -1.659014 |
149999 | 1.0 | 193 | 12.5 | 45.536383 | -3.200326 | -1.612893 | -0.067144 | -1.396166 | 0.284475 | 0.000000 | 0.040072 | 0.062543 | 0.025819 | 1.978453 | -3.179913 | 0.031724 | -1.483350 | -0.342674 |
150000 rows × 18 columns
# Regression target: the price column of the training frame.
Y_data = Train_data.loc[:, 'price']
Y_data
0 1850
1 3600
2 6222
3 2400
4 5200
...
149995 5900
149996 9500
149997 7500
149998 4999
149999 4700
Name: price, Length: 150000, dtype: int64
# For every fold, record the model's error on the training and validation
# parts, then report the mean over all folds.
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Random-forest regressor; the same estimator object is re-fitted per fold.
randomforest = RandomForestRegressor(n_estimators=50, random_state=0)

# StratifiedKFold stratifies on class labels. Passing the raw price would
# treat every distinct price as its own class, so bin the price into ten
# equal-frequency buckets and stratify on the bucket id instead. Ranking with
# method='first' breaks ties so the quantile edges are always unique.
price_bins = pd.qcut(Y_data.rank(method='first'), q=10, labels=False)

scores_train = []
scores_val = []
for train_ind, val_ind in skf.split(X_data, price_bins):
    train_x = X_data.iloc[train_ind].values
    train_y = Y_data.iloc[train_ind]
    val_x = X_data.iloc[val_ind].values
    val_y = Y_data.iloc[val_ind]

    randomforest.fit(train_x, train_y)
    pred_train_random = randomforest.predict(train_x)
    pred_val_random = randomforest.predict(val_x)

    # Mean absolute error on the data the model saw and on the held-out fold.
    score_train = mean_absolute_error(train_y, pred_train_random)
    scores_train.append(score_train)
    score = mean_absolute_error(val_y, pred_val_random)
    scores_val.append(score)

print('Train mae:', np.mean(scores_train))
print('Val mae', np.mean(scores_val))
Train mae: 253.97298353737415
Val mae 667.4268940009031
# Select the feature columns listed in feature_cols from the test frame.
X_Test_data = Test_data.loc[:, feature_cols]
X_Test_data
gearbox | power | kilometer | v_0 | v_1 | v_2 | v_3 | v_4 | v_5 | v_6 | v_7 | v_8 | v_9 | v_10 | v_11 | v_12 | v_13 | v_14 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 101 | 15.0 | 42.142061 | -3.094739 | -0.721300 | 1.466344 | 1.009846 | 0.236520 | 0.000241 | 0.105319 | 0.046233 | 0.094522 | 3.619512 | -0.280607 | -2.019761 | 0.978828 | 0.803322 |
1 | 0.0 | 73 | 6.0 | 43.907034 | -3.244605 | -0.766430 | 1.276718 | -1.065338 | 0.261518 | 0.000000 | 0.120323 | 0.046784 | 0.035385 | 2.997376 | -1.406705 | -1.020884 | -1.349990 | -0.200542 |
2 | 0.0 | 120 | 5.0 | 45.389665 | 3.372384 | -0.965565 | -2.447316 | 0.624268 | 0.261691 | 0.090836 | 0.000000 | 0.079655 | 0.073586 | -3.951084 | -0.433467 | 0.918964 | 1.634604 | 1.027173 |
3 | 0.0 | 58 | 15.0 | 42.788775 | 4.035052 | -0.217403 | 1.708806 | 1.119165 | 0.236050 | 0.101777 | 0.098950 | 0.026830 | 0.096614 | -2.846788 | 2.800267 | -2.524610 | 1.076819 | 0.461610 |
4 | 0.0 | 116 | 15.0 | 43.670763 | -3.135382 | -1.134107 | 0.470315 | 0.134032 | 0.257000 | 0.000000 | 0.066732 | 0.057771 | 0.068852 | 2.839010 | -1.659801 | -0.924142 | 0.199423 | 0.451014 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
49995 | 1.0 | 150 | 15.0 | 46.321013 | -3.304401 | 0.073363 | -0.622359 | -0.778349 | 0.263668 | 0.000292 | 0.141804 | 0.076393 | 0.039272 | 2.072901 | -2.531869 | 1.716978 | -1.063437 | 0.326587 |
49996 | 0.0 | 179 | 4.0 | 48.086547 | -3.318641 | 0.965881 | -2.672160 | 0.357440 | 0.255310 | 0.000991 | 0.155868 | 0.108425 | 0.067841 | 1.358504 | -3.290295 | 4.269809 | 0.140524 | 0.556221 |
49997 | 1.0 | 147 | 12.5 | 46.145279 | -3.305263 | -0.015283 | -0.288329 | -0.687112 | 0.262933 | 0.000318 | 0.141872 | 0.071968 | 0.042966 | 2.165658 | -2.417885 | 1.370612 | -1.073133 | 0.270602 |
49998 | 1.0 | 176 | 15.0 | 45.507088 | -3.197006 | -1.141252 | -0.434930 | -1.845040 | 0.282106 | 0.000023 | 0.067483 | 0.067526 | 0.009006 | 2.030114 | -2.939244 | 0.569078 | -1.718245 | 0.316379 |
49999 | 0.0 | 0 | 3.0 | 44.289471 | 4.181452 | 0.547068 | -0.775841 | 1.789601 | 0.231449 | 0.103947 | 0.096027 | 0.062328 | 0.110180 | -3.689090 | 2.032376 | 0.109157 | 2.202828 | 0.847469 |
50000 rows × 18 columns
# Fill missing test features with -1, the same sentinel used for the training
# features (X_data.fillna(-1)). Filling with 0 here would present missing
# values to the model differently from how it saw them during training.
X_Test_data = X_Test_data.fillna(-1)
X_Test_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 18 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 gearbox 50000 non-null float64
1 power 50000 non-null int64
2 kilometer 50000 non-null float64
3 v_0 50000 non-null float64
4 v_1 50000 non-null float64
5 v_2 50000 non-null float64
6 v_3 50000 non-null float64
7 v_4 50000 non-null float64
8 v_5 50000 non-null float64
9 v_6 50000 non-null float64
10 v_7 50000 non-null float64
11 v_8 50000 non-null float64
12 v_9 50000 non-null float64
13 v_10 50000 non-null float64
14 v_11 50000 non-null float64
15 v_12 50000 non-null float64
16 v_13 50000 non-null float64
17 v_14 50000 non-null float64
dtypes: float64(17), int64(1)
memory usage: 6.9 MB
from sklearn.model_selection import train_test_split
# Model-building helper.
def build_model_randomforest(x_train, y_train, *, n_estimators=50, random_state=0):
    """Fit a random-forest regressor on the given training data.

    Args:
        x_train: Feature matrix passed to ``fit``.
        y_train: Target values passed to ``fit``.
        n_estimators: Number of trees in the forest. Defaults to 50, the
            value that was previously hard-coded.
        random_state: Seed handed to the estimator. Defaults to 0, the value
            that was previously hard-coded.

    Returns:
        The fitted ``RandomForestRegressor``.
    """
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        random_state=random_state,
    )
    model.fit(x_train, y_train)
    return model


# Fit one model on the full training set and predict the test-set prices.
model_random_pre = build_model_randomforest(X_data, Y_data)
subpre = model_random_pre.predict(X_Test_data)
subpre
array([1227.94 , 1832.4 , 8610.005 , ..., 5474.99 ,
5055.48 , 5637.44666667])
将多折测试集的预测结果求均值,并写入csv文件提交到天池
# Build the submission table (sale id plus predicted price), write it to
# disk without the index column, and preview the first rows.
sub = pd.DataFrame({
    'SaleID': Test_data.SaleID,
    'price': subpre,
})
sub.to_csv('submit.csv', index=False)
sub.head()
SaleID | price | |
---|---|---|
0 | 200000 | 1227.940 |
1 | 200001 | 1832.400 |
2 | 200002 | 8610.005 |
3 | 200003 | 929.880 |
4 | 200004 | 2075.360 |
提交天池结果