#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 31 21:19:09 2018

@author: hello4720

Train a LightGBM binary classifier on four training CSVs, report the log loss
on a held-out offline split, write predictions for the fifth (online test)
CSV, and save the model together with its feature importances.
"""
import os

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn import metrics
from sklearn.model_selection import train_test_split

try:
    # scikit-learn >= 0.23 no longer ships `sklearn.externals.joblib`.
    import joblib
except ImportError:
    from sklearn.externals import joblib

DATA_DIR = 'G:/ML/ML_match/IJCAI/data3.22/3.22ICJAI/data/'
OUTPUT_DIR = './data'
ID_COLUMN = 'instance_id'
LABEL_COLUMN = 'is_trade'


def _load_dataset(index):
    """Read ``7_train_data<index>.csv`` from DATA_DIR and drop duplicate rows."""
    frame = pd.read_csv(DATA_DIR + '7_train_data%d.csv' % index)
    return frame.drop_duplicates()


### Load data
print("载入数据")
dataset1 = _load_dataset(1)
dataset2 = _load_dataset(2)
dataset3 = _load_dataset(3)
dataset4 = _load_dataset(4)
dataset5 = _load_dataset(5)

### Merge data
print("数据合并")
# Files 1-4 form the labelled training pool; file 5 is the online test set.
trains = pd.concat([dataset1, dataset2, dataset3, dataset4], axis=0)
online_test = dataset5

### Split data
print("数据拆分")
# First carve out 20% as an offline test set, then 20% of the remainder
# as the validation set handed to LightGBM.
train_xy, offline_test = train_test_split(trains, test_size=0.2, random_state=21)
train, val = train_test_split(train_xy, test_size=0.2, random_state=21)

print("训练集")
y = train[LABEL_COLUMN]                                # training labels
X = train.drop([ID_COLUMN, LABEL_COLUMN], axis=1)      # training features

print("验证集")
val_y = val[LABEL_COLUMN]                              # validation labels
val_X = val.drop([ID_COLUMN, LABEL_COLUMN], axis=1)    # validation features

print("测试集")
# Offline test features (labels are kept in `offline_test` for scoring).
offline_test_X = offline_test.drop([ID_COLUMN, LABEL_COLUMN], axis=1)
# Online test features (this file has no label column to drop).
online_test_X = online_test.drop([ID_COLUMN], axis=1)

### Convert to LightGBM datasets
lgb_train = lgb.Dataset(X, y, free_raw_data=False)
lgb_eval = lgb.Dataset(val_X, val_y, reference=lgb_train, free_raw_data=False)

### Training
print('设置参数')
params = {
    # The original dict also set 'boosting_type': 'gbdt'. That key is an
    # alias of 'boosting'; LightGBM keeps the primary name and ignores the
    # alias, so 'dart' was the value actually in effect. The dead entry is
    # removed to avoid the contradiction.
    # NOTE(review): confirm 'dart' (not 'gbdt') is the intended booster.
    'boosting': 'dart',
    'objective': 'binary',
    'metric': 'binary_logloss',

    'learning_rate': 0.01,
    'num_leaves': 25,
    'max_depth': 3,

    'max_bin': 10,
    'min_data_in_leaf': 8,

    'feature_fraction': 0.6,
    'bagging_fraction': 1,
    'bagging_freq': 0,

    'lambda_l1': 0,
    'lambda_l2': 0,
    'min_split_gain': 0,
}

print("开始训练")
# Early stopping is requested through the callback API because the
# `early_stopping_rounds=` keyword was removed from `lgb.train` in
# LightGBM 4. NOTE(review): LightGBM documents that early stopping is not
# available in dart mode, so with the booster above all 2000 rounds are
# expected to run - confirm this is intended.
gbm = lgb.train(params,                               # parameter dict
                lgb_train,                            # training set
                num_boost_round=2000,                 # max boosting rounds
                valid_sets=[lgb_eval],                # validation set
                callbacks=[lgb.early_stopping(30)])   # early-stopping patience

### Offline prediction
print("线下预测")
preds_offline = gbm.predict(offline_test_X, num_iteration=gbm.best_iteration)  # probabilities
# Explicit copy: assigning new columns into a slice of `offline_test`
# triggers SettingWithCopyWarning and is not guaranteed to take effect.
offline = offline_test[[ID_COLUMN, LABEL_COLUMN]].copy()
offline['preds'] = preds_offline
offline[LABEL_COLUMN] = offline[LABEL_COLUMN].astype(np.float64)
print('log_loss', metrics.log_loss(offline[LABEL_COLUMN], offline['preds']))

### Online prediction
print("线上预测")
preds_online = gbm.predict(online_test_X, num_iteration=gbm.best_iteration)  # probabilities
online = online_test[[ID_COLUMN]].copy()
online['predicted_score'] = preds_online
# Make sure the output directory exists so a long training run is not
# lost to a missing folder at write time.
if not os.path.isdir(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)
online.to_csv("./data/20180405.txt", index=False, sep=' ')

### Save model
joblib.dump(gbm, 'gbm.pkl')

### Feature selection
# Rank features by the booster's importance scores, most important first.
df = pd.DataFrame(X.columns.tolist(), columns=['feature'])
df['importance'] = list(gbm.feature_importance())
df = df.sort_values(by='importance', ascending=False)
df.to_csv("./data/feature_score_20180405.csv", index=False, encoding='gbk')