• DataFountain: User Purchase Prediction in a Consumer Finance Scenario


    Competition link: https://www.datafountain.cn/competitions/287/details

    Task: using China Merchants Bank (CMB) customers' personal attributes, credit-card consumption data, and one month of operation logs from some of these customers on the CMB Life (掌上生活) app, design a feature-engineering and modeling solution to predict whether a customer will purchase a coupon (meal vouchers, movie tickets, etc.) on the CMB Life app during the following week (April 1-7). To protect customer privacy, the personal-attribute and credit-card consumption data have been anonymized and standardized into the numeric attributes V1, V2, …, V30, and some fields of the app behavior logs are encrypted as well.

    Time got tight toward the end of the competition and my ranking kept slipping; I finished 75th on the B leaderboard. This post records the process and some takeaways from the competition.

    Competition data overview

    The competition provides a training set and a test set; the training set consists of the following parts:

    (1) Personal attributes and credit-card consumption data: covering 80,000 credit-card customers, with both categorical and numeric features, all converted to numbers, anonymized, and standardized.

    (2) App operation logs: for the subset of those customers who have bound the CMB Life app, all click logs within roughly a one-month window.

    (3) Labels: customer IDs and labels, where the label indicates whether the customer will purchase a coupon on the CMB Life app in the following week.

    Evaluation metric: AUC
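
    AUC can be reproduced offline with scikit-learn's roc_auc_score, which is also what the cross-validation code below uses; a toy example (with made-up labels and scores) of the ranking-based metric:

    from sklearn.metrics import roc_auc_score

    # Toy labels and predicted purchase probabilities.
    y_true = [0, 0, 1, 1, 0, 1]
    y_score = [0.10, 0.40, 0.35, 0.80, 0.20, 0.65]

    # AUC depends only on how the positives are ranked against the negatives,
    # not on the absolute probability values.
    print(roc_auc_score(y_true, y_score))  # 8 of 9 positive/negative pairs ranked correctly -> 0.888...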

    (一) Exploratory data analysis (EDA)

    I did not spend much effort on this part: load the data, check missing values and data types — the usual routine. pandas_profiling.ProfileReport gives a very convenient overview of each table. After that the tables are merged. Since I worked in Jupyter Notebook, this part of the code is rather scattered; the full code is on GitHub.
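
    A minimal sketch of that routine (the file paths and tab separator are taken from the feature-engineering code below):

    import pandas as pd
    import missingno as msno
    import pandas_profiling

    # Load the raw tables (tab-separated, same paths as in the next section).
    train_agg = pd.read_csv('../data/train/train_agg.csv', sep='\t')
    train_log = pd.read_csv('../data/train/train_log.csv', sep='\t')
    train_flg = pd.read_csv('../data/train/train_flg.csv', sep='\t')

    # Basic checks: shapes, dtypes, missing values.
    print(train_agg.shape, train_log.shape, train_flg.shape)
    train_agg.info()
    msno.matrix(train_log)   # visual missing-value matrix

    # One-call overview of every column (rendered inline in Jupyter).
    pandas_profiling.ProfileReport(train_agg)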

    (二) Feature engineering

    This part covers the processing of each table, in particular mining the operation behavior logs in the log table, including some sliding-time-window features.

    import time
    from datetime import datetime
    from sklearn.externals import joblib
    from sklearn.preprocessing import StandardScaler  
    import pandas as pd
    import numpy as np
    import pickle   # serialization
    import os
    import missingno as msno
    import pandas_profiling
    from xgboost import XGBClassifier
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import KFold
    import math
    
    if not os.path.exists('tmp'):
        os.mkdir('tmp')
    
    # log-table features (app operation behavior logs)
    train_log_path = '../data/train/train_log.csv'
    test_log_path = '../data/test/test_log.csv'
    
    def gen_log_feat():
        dump_path = './tmp/log_feat.pkl'
        if os.path.exists(dump_path):
            log_copy = pickle.load(open(dump_path,'rb'))
        else:
            train_log = pd.read_csv(train_log_path,sep='\t')
            test_log = pd.read_csv(test_log_path,sep='\t')
            log = pd.concat([train_log,test_log],copy=False)
            EVT_LBL = log[['USRID','EVT_LBL']]
            getdummies_EVT_LBL = pd.get_dummies(EVT_LBL)
            getdummies_EVT_LBL1 = getdummies_EVT_LBL.copy()
            L1 = getdummies_EVT_LBL1.groupby('USRID',as_index=False).sum()
            
            # USRID count 7-9
    #         EVT_LBL2 = EVT_LBL.copy()
    #         USRID_count = EVT_LBL2.groupby(['USRID'],as_index=False)['USRID'].agg({'cnt':'count'})
    
    #         log['EVT_LBL_0'] = log['EVT_LBL'].apply(lambda x: x.split('-')[0])
    #         log['EVT_LBL_1'] = log['EVT_LBL'].apply(lambda x: x.split('-')[1])
    #         log['EVT_LBL_2'] = log['EVT_LBL'].apply(lambda x: x.split('-')[2])
    #         del log['EVT_LBL']
            # convert timestamps to seconds and compute the time gap to the user's next event
            log['OCC_TIM'] = log['OCC_TIM'].apply(lambda x:time.mktime(time.strptime(x,"%Y-%m-%d %H:%M:%S")))
            log = log.sort_values(['USRID','OCC_TIM'])
            log['next_time'] = log.groupby(['USRID'])['OCC_TIM'].diff(-1).apply(np.abs)
            statistic_log = log.copy()
            log_copy = log.copy()
            stat_feat = ['min','mean','max','std','median']
            statistic_log = statistic_log.groupby(['USRID'],as_index=False)['next_time'].agg(stat_feat).reset_index()
            statistic_log.columns = ['USRID'] + ['next_time_' + col for col in stat_feat]
            log_copy = pd.merge(log_copy,statistic_log,how='outer',on='USRID')
            log_copy = log_copy.groupby(['USRID'],as_index=False).mean()
            log_copy = pd.merge(log_copy,L1,how='left',on='USRID')
    #         log_copy = pd.merge(log_copy,USRID_count,how='left',on='USRID')
            pickle.dump(log_copy,open(dump_path,'wb'))
        return log_copy
    
    log_copy = gen_log_feat()
    
    # agg-table features (personal attributes and credit-card consumption data)
    train_agg_path = '../data/train/train_agg.csv'
    test_agg_path = '../data/test/test_agg.csv'
    
    def get_stat_feat(df,values,action,days1,days2):  # days1: window start day (exclusive), days2: window end day (inclusive)
        df = df[df['day'] > days1]
        df = df[df['day'] <= days2]
        stat_feat = ['min','mean','max','median','count','sum','std','var']
        df = df.groupby('USRID')[values].agg(stat_feat).reset_index()   # USRID is not unique before this aggregation (one row per log event)
        df.columns = ['USRID'] + ['%s_%s_%s_' % (values,action,days2) + col for col in stat_feat]  # e.g. V28_agg_7_min, V28_agg_7_max
        return df
    
    def gen_filter_agg_feat():
        dump_path = './tmp/filter_agg_feat.pkl'
        if os.path.exists(dump_path):
            filter_agg = pickle.load(open(dump_path,'rb'))
        else:
            train_agg = pd.read_csv(train_agg_path,sep='\t')
            test_agg = pd.read_csv(test_agg_path,sep='\t')
            agg = pd.concat([train_agg,test_agg],copy=False)
            
            # handle skewed features (7-9)
    #         agg_columns = agg.columns
    #         skewed_feats = agg[agg_columns].apply(lambda x: x.skew())
    #         skewed_feats = skewed_feats[skewed_feats > 10 ]
    #         skewed_feats = skewed_feats.index
    #         agg[skewed_feats] = np.log1p(agg[skewed_feats])
            
            aggV28 = agg[['USRID','V28']]
            aggV25 = agg[['USRID','V25']]
            aggV20 = agg[['USRID','V20']]
    #         aggV19 = agg[['USRID','V19']]
    #         aggV18 = agg[['USRID','V18']]
            
            train_log = pd.read_csv(train_log_path,sep='\t')
            test_log = pd.read_csv(test_log_path,sep='\t')
            log = pd.concat([train_log,test_log],copy=False)
            
            log2 = log.copy()
            log2['day'] = log2['OCC_TIM'].map(lambda x:int(x.split('-')[2].split(' ')[0]))
            log1 = log2[['USRID','day']]
            aggV28_day = pd.merge(log1,aggV28,on=['USRID'],how='left',copy=False)
            aggV25_day = pd.merge(log1,aggV25,on=['USRID'],how='left',copy=False)
            aggV20_day = pd.merge(log1,aggV20,on=['USRID'],how='left',copy=False)
    #         aggV19_day = pd.merge(log1,aggV19,on=['USRID'],how='left',copy=False)
    #         aggV18_day = pd.merge(log1,aggV18,on=['USRID'],how='left',copy=False)
            USRID = aggV28_day['USRID'].unique()
            exclu1 = [1]*len(USRID)
            exclu2 = [1]*len(USRID)
            exclu3 = [1]*len(USRID)
    #         exclu4 = [1]*len(USRID)
    #         exclu5 = [1]*len(USRID)
            
            days_df1 = pd.DataFrame({'USRID':USRID,'exclu1':exclu1})
            days_df2 = pd.DataFrame({'USRID':USRID,'exclu2':exclu2})
            days_df3 = pd.DataFrame({'USRID':USRID,'exclu3':exclu3})
    #         days_df4 = pd.DataFrame({'USRID':USRID,'exclu4':exclu4})
    #         days_df5 = pd.DataFrame({'USRID':USRID,'exclu5':exclu5})
            
            day_list = [0,3,7,14,21,28,31]
            for i in range(len(day_list)-1):
                days1 = day_list[i]
                days2 = day_list[i+1]
                df_V28 = aggV28_day.copy()
                df_V25 = aggV25_day.copy()
                df_V20 = aggV20_day.copy()
    #             df_V19 = aggV19_day.copy()
    #             df_V18 = aggV18_day.copy()
    #             VS = ['V28','V25']
    #             for Vi in VS:
                day_dfV28 = get_stat_feat(df_V28,'V28','agg',days1,days2)
                day_dfV25 = get_stat_feat(df_V25,'V25','agg',days1,days2)
                day_dfV20 = get_stat_feat(df_V20,'V20','agg',days1,days2)
    #             day_dfV19 = get_stat_feat(df_V19,'V19','agg',days1,days2)
    #             day_dfV18 = get_stat_feat(df_V18,'V18','agg',days1,days2)
    
                days_df1 = pd.merge(days_df1,day_dfV28,how='left',on='USRID')
                days_df2 = pd.merge(days_df2,day_dfV25,how='left',on='USRID')
                days_df3 = pd.merge(days_df3,day_dfV20,how='left',on='USRID')
    #             days_df4 = pd.merge(days_df4,day_dfV19,how='left',on='USRID')
    #             days_df5 = pd.merge(days_df5,day_dfV18,how='left',on='USRID')
            days_df1 = days_df1.fillna(0.)
            days_df2 = days_df2.fillna(0.)
            days_df3 = days_df3.fillna(0.)
    #         days_df4 = days_df4.fillna(0.)
    #         days_df5 = days_df5.fillna(0.)
            del days_df1['exclu1']
            del days_df2['exclu2']
            del days_df3['exclu3']
    #         del days_df4['exclu4']
    #         del days_df5['exclu5']
            filter_agg1 = pd.merge(agg,days_df1,how='left',on='USRID')
            filter_agg2 = pd.merge(filter_agg1,days_df2,how='left',on='USRID')
    #         filter_agg3 = pd.merge(filter_agg2,days_df3,how='left',on='USRID')
    #         filter_agg4 = pd.merge(filter_agg3,days_df4,how='left',on='USRID')
            filter_agg = pd.merge(filter_agg2,days_df3,how='left',on='USRID')
            
    #         agg_V3 = agg[['USRID','V3']]
    #         agg_V3["VV3"]=agg_V3["V3"].astype(str).astype("str")
    #         getdummies_agg_V3 = pd.get_dummies(agg_V3)
    #         filter_agg = pd.merge(filter_agg,getdummies_agg_V3,how='left',on='USRID')
            #del filter_agg['V3']
            
        
            filter_agg = filter_agg.fillna(0.)
            pickle.dump(filter_agg,open(dump_path,'wb'))
        return filter_agg
    
    # V20,V25,V28
    
    filter_agg = gen_filter_agg_feat()
    print("sucessful!!!")
    
    # flg table (label data)
    train_flg_path = '../data/train/train_flg.csv'
    test_flg_path = '../data/submit_sample.csv'
    
    def gen_flg():
        dump_path = './tmp/flg.pkl'
        if os.path.exists(dump_path):
            flg = pickle.load(open(dump_path,'rb'))
        else:
            train_flg = pd.read_csv(train_flg_path,sep='\t')
            test_flg = pd.read_csv(test_flg_path,sep='\t')
            test_flg['FLAG']=-1
            del test_flg['RST']
            flg = pd.concat([train_flg,test_flg],copy=False)
            pickle.dump(flg,open(dump_path,'wb'))
        return flg
    
    log_copy = gen_log_feat()
    
    # merge all tables
    def make_data():
        dump_path = './tmp/data.pkl'
        if os.path.exists(dump_path):
            data = pickle.load(open(dump_path,'rb'))
        else:
            log_copy = gen_log_feat()
            filter_agg = gen_filter_agg_feat()
            flg = gen_flg()
            data = pd.merge(filter_agg,flg,how='left',on='USRID')
            data = pd.merge(data,log_copy,how='left',on='USRID')
            pickle.dump(data,open(dump_path,'wb'))
            
        return data
    
    data = make_data()
    #train = data[data['FLAG']!=-1]
    #test = data[data['FLAG']==-1]
    #test = test.drop(['FLAG'],axis=1)
    #labels = train.pop('FLAG')
    #labels = labels[:len(train)]
    #target = np.zeros([len(labels), len(np.unique(labels))])
    #target[:, 0] = labels == 0
    #target[:, 1] = labels == 1
    

    (三) Building the XGBoost model

    import pickle   # serialization
    import os
    import time
    import math
    import numpy as np
    import missingno as msno
    import pandas_profiling
    from xgboost import XGBClassifier
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import KFold
    import lightgbm as lgb
    
    data_path = './tmp/data.pkl'
    
    data = pickle.load(open(data_path,'rb'))
    data = data.fillna(0.)
    train = data[data['FLAG']!=-1]
    test = data[data['FLAG']==-1]
    y = train.pop('FLAG')
    col = train.columns
    X = train[col].values
    test = test.drop(['FLAG'],axis=1)
    
    folds = KFold(n_splits=6,shuffle=True,random_state=546799)
    oof_preds = np.zeros(train.shape[0])
    sub_preds = np.zeros(test.shape[0])
    print("oof_preds.shape:",oof_preds.shape)
    print("sub_preds.shape:",sub_preds.shape)
    
    ignore_features = ['USRID']
    features=[f for f in train.columns if f not in ignore_features]
    for n_fold,(trn_idx,val_idx) in enumerate(folds.split(train)):
        print("trn_idx:",trn_idx)
        trn_x,trn_y = train[features].iloc[trn_idx],y.iloc[trn_idx]
        val_x,val_y = train[features].iloc[val_idx],y.iloc[val_idx]
        clf = XGBClassifier(
            objective = 'binary:logistic',
            booster = "gbtree",
            eval_metric = 'auc',
            # nthread = 8,    # omit this parameter to let XGBoost use all available CPU cores
            eta = 0.025,      # alias of learning_rate (also set below)
            gamma = 0,        # minimum loss reduction to make a further split; larger is more conservative (typically 0.1-0.2)
            # reg_lambda = 2, # L2 regularization on leaf weights; larger values make overfitting less likely
            max_depth = 6,
            subsample = 0.8,            # row subsampling ratio (was 0.8)
            colsample_bytree = 0.632,   # column subsampling per tree (was 0.632)
            colsample_bylevel = 0.8,    # column subsampling per tree level
            min_child_weight = 19,      # was 19; minimum sum of instance weights (hessian) in a leaf
            # for an imbalanced 0-1 problem where h is around 0.01, min_child_weight = 1 roughly means at least
            # 100 samples per leaf; the smaller this value, the easier it is to overfit
            alpha = 0,
            #random_state = 42,
            # reg_alpha = 100,
            # nrounds = 8000,  # not a parameter of the sklearn API; n_estimators controls the number of trees
            scale_pos_weight = 1,
            seed = 4396,      # also tried 2018
            n_estimators = 1000,
            learning_rate = 0.1
            # silent = 0,     # 0 prints training information, 1 suppresses it
        )
        clf.fit(trn_x,trn_y,eval_set = [(trn_x,trn_y),(val_x,val_y)],verbose=10,early_stopping_rounds=30)
        oof_preds[val_idx] = clf.predict_proba(val_x)[:,1]
        sub_preds+=clf.predict_proba(test[features])[:,1] / folds.n_splits
        print('Fold %2d AUC: %.6f' % (n_fold + 1,roc_auc_score(val_y,oof_preds[val_idx])))
        del clf,trn_x,trn_y,val_x,val_y
    print('Full AUC score %.6f' % roc_auc_score(y,oof_preds))
    
    test['RST'] = sub_preds
    time_date = time.strftime('%Y-%m-%d',time.localtime(time.time()))
    test[['USRID','RST']].to_csv('../submit/%s_%s.csv'%(str(time_date),str(roc_auc_score(y,oof_preds)).split('.')[1]),index=False,sep='\t')
    
    #test[['USRID','RST']].to_csv("F:/Jupyter_Notebook_dir/DataFountain_JN/submit/submission_03.csv",index = False,float_format = '%.8f',sep='	')
    

    (四) Grid search and parameter tuning

    import os
    import gc
    import time
    import math
    import pickle
    import warnings
    import numpy as np
    import pandas as pd
    import xgboost as xgb
    from xgboost import XGBClassifier, plot_importance
    import matplotlib as mpl
    from matplotlib import pyplot as plt
    from matplotlib.pyplot import GridSpec
    import seaborn as sns
    import missingno as msno
    import pandas_profiling
    from pandas.core.frame import DataFrame
    from sklearn import metrics
    from sklearn.datasets import make_blobs
    from sklearn.metrics import roc_auc_score, accuracy_score
    from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV, train_test_split
    warnings.filterwarnings('ignore')
    sns.set_context("poster",font_scale=1.3)
    
    data_path = 'G:/xjl_docunment/DataFountain/code/tmp/data.pkl'
    data = pickle.load(open(data_path,'rb'))
    train = data[data['FLAG']!=-1]
    test = data[data['FLAG']==-1]
    # y = train.pop('FLAG')
    col = train.columns
    X = train[col].values
    test = test.drop(['FLAG'],axis=1)
    print('Successful')
    
    def modelfit(alg,dtrain,predictors,useTrainCV=True,cv_folds=5,early_stopping_rounds=50):
        if useTrainCV:
            xgb_param = alg.get_xgb_params()
            xgbtrain = xgb.DMatrix(dtrain[predictors].values,label=dtrain[target].values)
            cvresult = xgb.cv(xgb_param,xgbtrain,num_boost_round=alg.get_params()['n_estimators'],
                              nfold=cv_folds,metrics='auc',early_stopping_rounds=early_stopping_rounds,
                              show_stdv=False)
            alg.set_params(n_estimators=cvresult.shape[0])
        # Fit the algorithm on the data
        alg.fit(dtrain[predictors],dtrain['FLAG'],eval_metric='auc')

        # Predict on the training set:
        dtrain_predictions = alg.predict(dtrain[predictors])
        dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]

        # Print model report:
        print("\nModel Report")
        print("Accuracy : %.4g" % metrics.accuracy_score(dtrain['FLAG'].values, dtrain_predictions))
        print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain['FLAG'], dtrain_predprob))

        print('successful')
    
        feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
        feat_imp.plot(kind='bar', title='Feature Importances')
        plt.ylabel('Feature Importance Score')
            
    from matplotlib.pylab import rcParams
    rcParams['figure.figsize'] = 12, 4
    target = 'FLAG'
    IDcol = 'USRID'
    #Choose all predictors except target & IDcols
    predictors = [x for x in train.columns if x not in [target,IDcol]]
    xgb1 = XGBClassifier(
     learning_rate =0.1,
     n_estimators=1000,
     max_depth=5,
     min_child_weight=1,
     gamma=0,
     subsample=0.8,
     colsample_bytree=0.8,
     objective= 'binary:logistic',
    #  nthread=4,
     scale_pos_weight=1,
     seed=27)
    modelfit(xgb1, train, predictors)
    print('Successful')
    

    1) Tuning max_depth and min_child_weight: tune these two parameters first because they have a large impact on the final result; do a coarse search over a wide range, then fine-tune over a narrower one.

    param_test1 = {
        'max_depth':range(3,10,2),
        'min_child_weight':range(1,20,3)
    }
    gsearch1 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1,n_estimators=1000,max_depth=5,
                                                   min_child_weight=1,gamma=0,subsample=0.8,colsample_bytree=0.8,
                                                   objective='binary:logistic',scale_pos_weight=1,seed=27),
                           param_grid=param_test1,scoring='roc_auc',n_jobs=8,iid=False,cv=5)
    gsearch1.fit(train[predictors],train['FLAG'])
    gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_
    

     

    param_test11 = {
        'max_depth':range(3,10,2),
        'min_child_weight':range(1,20,3)
    }
    gsearch1 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1,n_estimators=140,max_depth=5,
                                                   min_child_weight=1,gamma=0,subsample=0.8,colsample_bytree=0.8,
                                                   objective='binary:logistic',scale_pos_weight=1,seed=27),
                           param_grid=param_test11,scoring='roc_auc',n_jobs=8,iid=False,cv=5)
    gsearch1.fit(train[predictors],train['FLAG'])
    gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

      

    params_test2 = {
        'max_depth':[3,5,6],
        'min_child_weight':[1,5,6]
    }
    gsearch2 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1,n_estimators=1000,max_depth=5,
                                                  min_child_weight=2,gamma=0,subsample=0.8,
                                                   colsample_bytree=0.632,objective = 'binary:logistic',
                                                   scale_pos_weight=1,seed=27),param_grid = params_test2,
                            scoring='roc_auc',iid=False, cv=5)
    
    gsearch2.fit(train[predictors],train[target])
    gsearch2.grid_scores_, gsearch2.best_params_,gsearch2.best_score_
    

      

    param_test2b = {
     'min_child_weight':[1,5,8,10,12,13,19,21]
     }
    gsearch2b = GridSearchCV(estimator = XGBClassifier(learning_rate=0.1, n_estimators=1000, max_depth=3,
     min_child_weight=5, gamma=0, subsample=0.8, colsample_bytree=0.632, objective= 'binary:logistic',
                                                       scale_pos_weight=1,seed=27),
                             param_grid = param_test2b, scoring='roc_auc',n_jobs=16,iid=False, cv=5)
    
    gsearch2b.fit(train[predictors],train[target])
    modelfit(gsearch2b.best_estimator_, train, predictors)
    gsearch2b.grid_scores_, gsearch2b.best_params_, gsearch2b.best_score_
    

    2) Tuning gamma: with the parameters tuned so far held fixed, tune gamma; it can take values over a wide range.

    param_test3 = {
        'gamma':[i/10.0 for i in range(0,5)]
    }
    gsearch3 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=1000, max_depth=3,
                                                      min_child_weight=19, gamma=0, subsample=0.8, 
                                                      colsample_bytree=0.8, objective= 'binary:logistic', 
                                                      scale_pos_weight=1,seed=27), param_grid = param_test3, 
                            scoring='roc_auc',iid=False, cv=5)
    
    gsearch3.fit(train[predictors],train[target])
    gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_
    

    3) Tuning subsample and colsample_bytree: this is done in two stages, both starting from the candidate values 0.6, 0.7, 0.8, 0.9.

     

    param_test4 = {
        'subsample':[i/10.0 for i in range(6,10)],
        'colsample_bytree':[i/10.0 for i in range(6,10)]
    }
    
    gsearch4 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=1000, max_depth=3,
                                                      min_child_weight=19, gamma=0.2, subsample=0.8, 
                                                      colsample_bytree=0.8, objective= 'binary:logistic', 
                                                      scale_pos_weight=1,seed=27), 
                            param_grid = param_test4, scoring='roc_auc',n_jobs=16,iid=False, cv=5)
    
    gsearch4.fit(train[predictors],train[target])
    gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_
    

      

    param_test5 = {
        'subsample':[i/100.0 for i in range(75,90,5)],
        'colsample_bytree':[i/100.0 for i in range(65,90,5)]
    }
    gsearch5 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=1000, 
                                                      max_depth=3, min_child_weight=19, gamma=0, 
                                                      subsample=0.9, colsample_bytree=0.7, 
                                                      objective= 'binary:logistic', scale_pos_weight=1,
                                                      seed=27), 
                            param_grid = param_test5, scoring='roc_auc',n_jobs=16,iid=False, cv=5)
    gsearch5.fit(train[predictors],train[target])
    gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_
    

    4) Tuning the regularization parameters: apply regularization to reduce overfitting, although gamma above already provides a fairly effective way of controlling it.

    param_test6 = {
        'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
    }
    gsearch6 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=1000, max_depth=4, 
                                                      min_child_weight=19, gamma=0.2, subsample=0.85, 
                                                      colsample_bytree=0.65, objective= 'binary:logistic', 
                                                      scale_pos_weight=1,seed=27), 
                            param_grid = param_test6, scoring='roc_auc',n_jobs=16,iid=False, cv=5)
    
    gsearch6.fit(train[predictors],train[target])
    gsearch6.grid_scores_, gsearch6.best_params_, gsearch6.best_score_
    

      

    param_test7 = {
        'reg_alpha':[100,150,200,500]
    }
    gsearch7 = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1, n_estimators=1000, max_depth=4, 
                                                      min_child_weight=19, gamma=0.2, subsample=0.85, 
                                                      colsample_bytree=0.65, objective= 'binary:logistic', 
                                                      scale_pos_weight=1,seed=27), 
                            param_grid = param_test7, scoring='roc_auc',n_jobs=16,iid=False, cv=5)
    
    gsearch7.fit(train[predictors],train[target])
    gsearch7.grid_scores_, gsearch7.best_params_, gsearch7.best_score_
    

    5) Lowering the learning rate: finally, use a lower learning rate together with more trees; XGBoost's cv function (wrapped in modelfit above) takes care of this step.

     

    xgb4 = XGBClassifier(learning_rate =0.01,n_estimators=1000,max_depth=4,min_child_weight=19,gamma=0.2,subsample=0.85,colsample_bytree=0.65,
                         objective= 'binary:logistic',reg_alpha=100,nthread=16,scale_pos_weight=1,seed=27)
    modelfit(xgb4, train, predictors)
    

    Honestly, grid search brought only a modest improvement here, far less than feature engineering did, but I kept it in anyway.

    (五) Model fusion (stacking)

    Here stacking is used to combine RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, and SVC.

    import numpy as np
    import xgboost as xgb
    from sklearn.ensemble import (RandomForestClassifier, ExtraTreesClassifier,
                                  AdaBoostClassifier, GradientBoostingClassifier)
    from sklearn.svm import SVC
    from sklearn.model_selection import KFold

    SEED = 0       # random seed shared by all base models
    NFOLDS = 5     # number of folds for the out-of-fold predictions

    # Class to extend the Sklearn classifier
    class SklearnHelper(object):
        def __init__(self,clf,seed=0,params=None):
            params['random_state'] = seed
            self.clf = clf(**params)
        
        def train(self,x_train,y_train):
            self.clf.fit(x_train,y_train)
        
        def predict(self,x):
            return self.clf.predict(x)
        
        def fit(self,x,y):
            return self.clf.fit(x,y)
        
        def feature_importances(self,x,y):
            print(self.clf.fit(x,y).feature_importances_)
    
    # Generate 5-fold out-of-fold (OOF) predictions for each base classifier
    ntrain = train.shape[0]
    ntest = test.shape[0]
    kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

    def get_oof(clf,x_train,y_train,x_test):
        oof_train = np.zeros((ntrain,))           # OOF predictions on the training set
        oof_test = np.zeros((ntest,))
        oof_test_skf = np.empty((NFOLDS,ntest))   # per-fold predictions on the test set

        for i,(train_index,test_index) in enumerate(kf.split(x_train)):
            x_tr = x_train[train_index]
            y_tr = y_train[train_index]
            x_te = x_train[test_index]

            clf.train(x_tr,y_tr)

            oof_train[test_index] = clf.predict(x_te)
            oof_test_skf[i,:] = clf.predict(x_test)   # this fold's predictions on the test set

        oof_test[:] = oof_test_skf.mean(axis=0)   # average the per-fold test-set predictions
        return oof_train.reshape(-1,1),oof_test.reshape(-1,1)   # OOF train and averaged test predictions become meta-features
    
    # Random Forest parameters
    rf_params = {
        'n_jobs':-1,
        'n_estimators':500,
        'warm_start':True,     # True: keep the existing trees and add more on refit; False (default): retrain from scratch
        'max_depth':6,
        'min_samples_leaf':2,
        'max_features':'sqrt',
        'verbose':0
    }
    
    # Extra Trees Parameters
    et_params = {
        'n_jobs': -1,
        'n_estimators':500,
        #'max_features': 0.5,
        'max_depth': 8,
        'min_samples_leaf': 2,
        'verbose': 0
    }
    
    # AdaBoost parameters
    ada_params = {
        'n_estimators': 500,
        'learning_rate' : 0.75
    }
    
    # Gradient Boosting parameters
    gb_params = {
        'n_estimators': 500,
         #'max_features': 0.2,
        'max_depth': 5,
        'min_samples_leaf': 2,
        'verbose': 0
    }
    
    # Support Vector Classifier parameters 
    svc_params = {
        'kernel' : 'linear',
        'C' : 0.025
        }
        
    # Create 5 objects that represent our 5 models
    rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
    et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
    ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
    gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
    svc = SklearnHelper(clf=SVC, seed=SEED, params=svc_params)
    
    # Create Numpy arrays of train, test and target (FLAG) dataframes to feed into our models
    y_train = y.ravel()      # y = train.pop('FLAG') from the XGBoost section above
    # train = train.drop(['FLAG'], axis=1)
    x_train = train.values   # Creates an array of the train data
    x_test = test.values     # Creates an array of the test data; .values turns a DataFrame/Series into a NumPy array
    # train.as_matrix() also works (deprecated in newer pandas)
    
    # Create our OOF train and test predictions. These base results will be used as new features
    et_oof_train, et_oof_test = get_oof(et, x_train, y_train, x_test) # Extra Trees
    rf_oof_train, rf_oof_test = get_oof(rf,x_train, y_train, x_test) # Random Forest
    ada_oof_train, ada_oof_test = get_oof(ada, x_train, y_train, x_test) # AdaBoost 
    gb_oof_train, gb_oof_test = get_oof(gb,x_train, y_train, x_test) # Gradient Boost

    x_train = np.concatenate(( et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train), axis=1)
    x_test = np.concatenate(( et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test), axis=1)

    gbm = xgb.XGBClassifier(
    #learning_rate = 0.02,
    n_estimators= 2000,
    max_depth= 4,
    min_child_weight= 2,
    #gamma=1,
    gamma=0.9,
    subsample=0.8,
    colsample_bytree=0.8,
    objective= 'binary:logistic',
    nthread= -1,
    scale_pos_weight=1).fit(x_train, y_train)
    predictions = gbm.predict_proba(x_test)[:,1]
    print("sucessful")

    USRID = test['USRID']

    import time
    test['RST'] = predictions
    time_date = time.strftime('%Y-%m-%d',time.localtime(time.time()))
    test[['USRID','RST']].to_csv('../submit/StackingSubmission.csv',index=False,sep='\t')

      
