• 7.2.py 树模型衍生变量


    主要是通过树模型衍生变量,然后和逻辑回归模型融合在一起。树模型(LightGBM)衍生变量,主要是使用 LightGBM 模型对原始数据进行训练,把每个样本落在的叶节点的位置记为 1,有 N 棵树就有 N 个位置,这样每个样本就得到一个 1xN(N 是树的棵数)的向量;然后通过 PSI、特征重要性去筛选变量,最后将筛选后的变量放入逻辑回归模型中去。虽然模型有所提升,但是还不如直接使用集成模型——毕竟再怎么提升,逻辑回归算法的上限也在此。

    其中,计算 PSI 可以使用自己的方法;关于阈值的选定,可以选择未使用树模型衍生变量前各特征对应指标的最小值作为阈值。

    # -*- coding: utf-8 -*-
    """
    Created on Tue Dec 24 15:57:09 2019
    
    @author: zixing.mei
    """
    
    import lightgbm as lgb  
    import random  
    import pandas as pd  
    import numpy as np  
    from sklearn.model_selection import train_test_split  
    from sklearn.metrics import mean_squared_error  
    from sklearn.linear_model import LogisticRegression  
    from sklearn import metrics  
    from sklearn.metrics import roc_curve  
    from matplotlib import pyplot as plt  
    import math  
    
    # Load the scorecard dataset; obs_mth is the observation month used for
    # the out-of-time split below. NOTE(review): 'xxx/' is a placeholder path.
    data = pd.read_csv('xxx/Acard.txt')  
      
    # Out-of-time split: the last month (2018-11-30) is held out as test.
    df_train = data[data.obs_mth != '2018-11-30'].reset_index().copy()    
    df_test = data[data.obs_mth == '2018-11-30'].reset_index().copy()    
    # The four raw features used to train the LightGBM booster.
    NUMERIC_COLS = ['person_info','finance_info','credit_info','act_info']
    
    #%%
    # Train a LightGBM model on the training set, then map every sample to
    # its leaf index per tree, producing a higher-dimensional feature space.
    from sklearn.preprocessing import OneHotEncoder,LabelEncoder  
      
    lgb_train = lgb.Dataset(df_train[NUMERIC_COLS], 
                              df_train['bad_ind'], free_raw_data=False)  
    # Shallow stumps (max_depth=1, 2 leaves) so each of the 50 boosting
    # rounds contributes one binary split -> 50 leaf-index columns.
    params = {  
        'num_boost_round': 50,  
        'boosting_type': 'gbdt',  
        'objective': 'binary',  
        'num_leaves': 2,  
        'metric': 'auc',  
        'max_depth':1,  
        'feature_fraction':1,  
        'bagging_fraction':1, } 
    model = lgb.train(params,lgb_train)  
    # pred_leaf=True returns, for every sample, the leaf index in each tree.
    leaf = model.predict(df_train[NUMERIC_COLS],pred_leaf=True)  
    lgb_enc = OneHotEncoder()  
    # One-hot encode the leaf indices (the tree-derived "crossed" features).
    lgb_enc.fit(leaf)
    # Concatenate with the raw features; one-hot encoding doubles the 50
    # leaf columns to 100 (2 leaves per tree x 50 trees).
    data_leaf = np.hstack((lgb_enc.transform(leaf).toarray(),df_train[NUMERIC_COLS]))  
    
    #%% Map the test set into the same leaf-index feature space.
    leaf_test = model.predict(df_test[NUMERIC_COLS],pred_leaf=True)  
    # BUG FIX: reuse the encoder fitted on the TRAINING leaves instead of
    # refitting a fresh OneHotEncoder on the test leaves. Categories are
    # learned per-fit, so refitting on test data can produce a different
    # column layout and silently misalign train/test features for the
    # downstream logistic regression.
    data_leaf_test = np.hstack((lgb_enc.transform(leaf_test).toarray(),
                                  df_test[NUMERIC_COLS]))  
    
    #%% Fit a logistic regression on the augmented (leaf + raw) features.
    train = data_leaf.copy()  
    train_y = df_train['bad_ind'].copy()  
    val = data_leaf_test.copy()  
    val_y = df_test['bad_ind'].copy()  
    
    # L2-regularized LR; class_weight='balanced' compensates for the rare
    # positive (bad) class typical of credit-risk data.
    lgb_lm = LogisticRegression(penalty='l2',C=0.2, class_weight='balanced',solver='liblinear')
    lgb_lm.fit(train, train_y)  
    y_pred_lgb_lm_train = lgb_lm.predict_proba(train)[:, 1]  
    fpr_lgb_lm_train, tpr_lgb_lm_train, _ = roc_curve(train_y,y_pred_lgb_lm_train)
    y_pred_lgb_lm = lgb_lm.predict_proba(val)[:,1]  
    fpr_lgb_lm,tpr_lgb_lm,_ = roc_curve(val_y,y_pred_lgb_lm)  
    # Plot train vs. test ROC curves.
    plt.figure(1)  
    plt.plot([0, 1], [0, 1], 'k--')  
    plt.plot(fpr_lgb_lm_train,tpr_lgb_lm_train,label='LGB + LR train')  
    plt.plot(fpr_lgb_lm, tpr_lgb_lm, label='LGB + LR test')  
    plt.xlabel('False positive rate')  
    plt.ylabel('True positive rate')  
    plt.title('ROC curve')  
    plt.legend(loc='best')  
    plt.show()  
    # KS = max |TPR - FPR|; AUC via trapezoidal rule on the ROC points.
    print('LGB+LR train ks:',abs(fpr_lgb_lm_train - tpr_lgb_lm_train).max(),
                                   'LGB+LR AUC:', metrics.auc(fpr_lgb_lm_train, tpr_lgb_lm_train))
    print('LGB+LR test ks:',abs(fpr_lgb_lm - tpr_lgb_lm).max(),
                                  'LGB+LR AUC:', metrics.auc(fpr_lgb_lm, tpr_lgb_lm))
    
    # Reference output from the original run:
    '''
    LGB+LR train ks: 0.4812287054151174 LGB+LR AUC: 0.8116320314831054
    LGB+LR test ks: 0.4441149787927866 LGB+LR AUC: 0.7776214991730668
    '''
    
    #%% Name every generated column ("ft0", "ft1", ...) so the feature
    # selection steps below can refer to features by name.
    dff_train = pd.DataFrame(train).add_prefix('ft')
    
    dff_val = pd.DataFrame(val).add_prefix('ft')
    
    #%%
    # Build the two-bin frequency table consumed by var_PSI.
    def make_psi_data(dff_train):  
        """Return a 2-row DataFrame: per column, row 0 is the count of 0s
        and row 1 the count of 1s (the inputs are mostly one-hot columns)."""
        counts = {}
        for col in dff_train.columns:
            counts[col] = np.array([(dff_train[col] == 0).sum(),
                                    (dff_train[col] == 1).sum()])
        return pd.DataFrame(counts, columns=dff_train.columns)
    # Two-bin count tables for train and out-of-time validation sets.
    psi_data_train = make_psi_data(dff_train)  
    psi_data_val = make_psi_data(dff_val) 
    
    def var_PSI(dev_data, val_data):  
        """Population Stability Index between two binned count vectors.

        Args:
            dev_data: per-bin counts on the development (train) set.
            val_data: per-bin counts on the validation set.

        Returns:
            The PSI value, or 0 when either side has no observations.
        """
        dev_cnt, val_cnt = sum(dev_data), sum(val_data)  
        # Degenerate input: one side is all zeros -> PSI undefined, return 0.
        if dev_cnt * val_cnt == 0:  
            return 0  
        PSI = 0  
        for i in range(len(dev_data)):  
            # BUG FIX: the original only smoothed val_ratio, so an empty
            # development bin made math.log(0) raise ValueError. Smooth
            # both ratios symmetrically with the same epsilon.
            dev_ratio = dev_data[i] / dev_cnt + 1e-10  
            val_ratio = val_data[i] / val_cnt + 1e-10  
            PSI += (dev_ratio - val_ratio) * math.log(dev_ratio / val_ratio)
        return PSI  
    # PSI per feature between train and out-of-time validation.
    psi_dct = {}  
    for col in dff_train.columns:  
        psi_dct[col] = var_PSI(psi_data_train[col],psi_data_val[col]) 
        
    # Sort features by PSI ascending (smaller = more stable) and keep those
    # below the 0.6 quantile of all PSI values.
    f = zip(psi_dct.keys(),psi_dct.values())  
    f = sorted(f,key = lambda x:x[1],reverse = False)  
    psi_df = pd.DataFrame(f)  
    psi_df.columns = pd.Series(['变量名','PSI'])  
    # NOTE(review): psi_df.quantile(0.6)[0] relies on 'PSI' being the only
    # numeric column and on positional Series indexing — verify this still
    # behaves the same on the pandas version in use.
    feature_lst = list(psi_df[psi_df['PSI']<psi_df.quantile(0.6)[0]]['变量名'])  
    train = dff_train[feature_lst].copy()  
    train_y = df_train['bad_ind'].copy()  
    
    val = dff_val[feature_lst].copy()  
    val_y = df_test['bad_ind'].copy()  
    # Refit the logistic regression on the PSI-filtered feature set.
    lgb_lm = LogisticRegression(C = 0.3,class_weight='balanced',solver='liblinear')
    lgb_lm.fit(train, train_y)  
    y_pred_lgb_lm_train = lgb_lm.predict_proba(train)[:, 1]  
    fpr_lgb_lm_train, tpr_lgb_lm_train, _ = roc_curve(train_y, y_pred_lgb_lm_train)
    y_pred_lgb_lm = lgb_lm.predict_proba(val)[:, 1]  
    fpr_lgb_lm, tpr_lgb_lm, _ = roc_curve(val_y, y_pred_lgb_lm)  
    # Plot train vs. test ROC curves for the filtered model.
    plt.figure(1)  
    plt.plot([0, 1], [0, 1], 'k--')  
    plt.plot(fpr_lgb_lm_train, tpr_lgb_lm_train, label='LGB + LR train')  
    plt.plot(fpr_lgb_lm, tpr_lgb_lm, label='LGB + LR test')  
    plt.xlabel('False positive rate')  
    plt.ylabel('True positive rate')  
    plt.title('ROC curve')  
    plt.legend(loc='best')  
    plt.show()  
    print('LGB+LR train ks:',abs(fpr_lgb_lm_train - tpr_lgb_lm_train).max(),
                                   'LGB+LR AUC:', metrics.auc(fpr_lgb_lm_train, tpr_lgb_lm_train))
    print('LGB+LR test ks:',abs(fpr_lgb_lm - tpr_lgb_lm).max(),'LGB+LR AUC:',
                                  metrics.auc(fpr_lgb_lm, tpr_lgb_lm))
    
    # Reference output from the original run:
    '''
    LGB+LR train ks: 0.47632382032329534 LGB+LR AUC: 0.807277659727129
    LGB+LR test ks: 0.4463346827179526 LGB+LR AUC: 0.7794119538226763
    
    '''
    
    #%% Second selection pass: filter features by LightGBM importance.
    x = train  
    y = train_y  
      
    val_x =  val  
    # No-op self-assignment kept from the original script.
    val_y = val_y  
      
    # Helper: fit an LGBM classifier with early stopping.
    def LGB_test(train_x,train_y,test_x,test_y):  
        """Train an LGBMClassifier and return it with its best valid AUC.

        Returns:
            (fitted classifier, best 'valid_1' AUC reached before early stop).
        """
        from multiprocessing import cpu_count  
        # BUG FIX: 'reg_Ap' was a typo for 'reg_alpha' — LightGBM silently
        # ignored the unknown keyword, so L1 regularization was never set
        # explicitly. Also dropped 'max_features', which is not a LightGBM
        # parameter (it belongs to sklearn tree models).
        clf = lgb.LGBMClassifier(  
            boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
            max_depth=2, n_estimators=800, objective='binary',
            subsample=0.7, colsample_bytree=0.7, subsample_freq=1,  
            learning_rate=0.05, min_child_weight=50,
                  random_state=None,n_jobs=cpu_count()-1,)  
        # NOTE(review): early_stopping_rounds in fit() was removed in
        # lightgbm>=4; migrate to callbacks=[lgb.early_stopping(100)] there.
        clf.fit(train_x, train_y,eval_set=[(train_x, train_y),(test_x,test_y)],
                    eval_metric='auc',early_stopping_rounds=100)  
        return clf,clf.best_score_[ 'valid_1']['auc']  
    # Train the selector model on the PSI-filtered features.
    model,auc = LGB_test(x,y,val_x,val_y)                      
      
    # Rank features by split importance from the fitted booster.
    feature = pd.DataFrame(  
                {'name' : model.booster_.feature_name(),  
                'importance' : model.feature_importances_  
              }).sort_values(by = ['importance'],ascending = False) 
    # Keep only features used in more than 5 splits.
    feature_lst2 = list(feature[feature.importance>5].name)
    
    
    #%% Final logistic regression on the importance-filtered feature set.
    train = dff_train[feature_lst2].copy()  
    train_y = df_train['bad_ind'].copy()  
    val = dff_val[feature_lst2].copy()  
    val_y = df_test['bad_ind'].copy()  
    lgb_lm = LogisticRegression(C = 0.3,class_weight='balanced',solver='liblinear')
    lgb_lm.fit(train, train_y)  
      
    y_pred_lgb_lm_train = lgb_lm.predict_proba(train)[:, 1]  
    fpr_lgb_lm_train, tpr_lgb_lm_train, _ = roc_curve(train_y, y_pred_lgb_lm_train)
      
    y_pred_lgb_lm = lgb_lm.predict_proba(val)[:, 1]  
    fpr_lgb_lm, tpr_lgb_lm, _ = roc_curve(val_y, y_pred_lgb_lm)  
      
    # Plot train vs. test ROC curves for the final model.
    plt.figure(1)  
    plt.plot([0, 1], [0, 1], 'k--')  
    plt.plot(fpr_lgb_lm_train, tpr_lgb_lm_train, label='LGB + LR train')  
    plt.plot(fpr_lgb_lm, tpr_lgb_lm, label='LGB + LR test')  
    plt.xlabel('False positive rate')  
    plt.ylabel('True positive rate')  
    plt.title('ROC curve')  
    plt.legend(loc='best')  
    plt.show()  
    print('LGB+LR train ks:',abs(fpr_lgb_lm_train - tpr_lgb_lm_train).max(),
          'LGB+LR AUC:', metrics.auc(fpr_lgb_lm_train, tpr_lgb_lm_train))  
    print('LGB+LR test ks:',abs(fpr_lgb_lm - tpr_lgb_lm).max(),'LGB+LR AUC:', 
          metrics.auc(fpr_lgb_lm, tpr_lgb_lm))  
    
    # Reference output from the original run:
    '''
    LGB+LR train ks: 0.4687230745337274 LGB+LR AUC: 0.8045813389226749
    LGB+LR test ks: 0.44510149222090417 LGB+LR AUC: 0.7841449970149346
    '''
  • 相关阅读:
    pip源配置
    Linux:supervisor命令的使用
    uWSGI+Nginx+Flask在Linux下的部署
    MongoDB的使用[转]
    用python获取服务器硬件信息[转]
    python:virtualenv的使用
    Django笔记:常见故障排除
    常用资源网站链接
    Scrapy笔记:持久化,Feed exports的使用
    Scrapy笔记:使用代理ip
  • 原文地址:https://www.cnblogs.com/cgmcoding/p/15380026.html
Copyright © 2020-2023  润新知