The idea is to derive new variables with a tree model and then fuse them with a logistic regression model. In LightGBM (LGBM) feature derivation, an LGBM model is first trained on the raw data; for each sample, the position of the leaf node it falls into in each tree is marked as 1. With N trees there are N such positions, so every sample is mapped to a 1×N vector of leaf indices (N being the number of trees), which is then one-hot encoded. The derived variables are screened by PSI and by feature importance, and the surviving ones are fed into a logistic regression model. The model does improve, but it still falls short of using the ensemble model directly: however much you tune it, this is where the ceiling of the logistic regression algorithm lies.
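To make the leaf-position encoding concrete, here is a minimal, self-contained sketch. The leaf matrix is made up for illustration (3 samples, 3 trees of 2 leaves each) and is unrelated to the Acard data used below:

import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Hypothetical leaf positions: row i, column j is the index of the leaf
# that sample i falls into in tree j.
leaf = np.array([[0, 1, 1],
                 [1, 0, 1],
                 [0, 0, 0]])

enc = OneHotEncoder()
onehot = enc.fit_transform(leaf).toarray()
print(onehot.shape)  # (3, 6): each tree with 2 leaves contributes 2 columns
print(onehot[0])     # [1. 0. 0. 1. 0. 1.] -- sample 0 fell into leaves (0, 1, 1)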
For the PSI computation you can use your own implementation. As for choosing the threshold, one option is to take the minimum of the corresponding metric over the original features (i.e., before the tree-derived variables were added) as the cutoff.
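For reference, the PSI of a feature over its value bins is

PSI = Σ_i (p_dev,i − p_val,i) × ln(p_dev,i / p_val,i)

where p_dev,i and p_val,i are the shares of samples falling into bin i in the development (training) and validation sets. The var_PSI function in the code below implements exactly this for the two bins {0, 1} of the one-hot features. As a common rule of thumb, PSI below 0.1 is read as stable and above 0.25 as a significant shift; the code instead uses a quantile-based cutoff, keeping the most stable 60% of features.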
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 24 15:57:09 2019

@author: zixing.mei
"""
import math
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import roc_curve
from matplotlib import pyplot as plt

data = pd.read_csv('xxx/Acard.txt')
df_train = data[data.obs_mth != '2018-11-30'].reset_index().copy()
df_test = data[data.obs_mth == '2018-11-30'].reset_index().copy()
NUMERIC_COLS = ['person_info', 'finance_info', 'credit_info', 'act_info']

#%% Train the LGBM model on the training set, then map each sample to its
# leaf positions to obtain a higher-dimensional feature representation.
lgb_train = lgb.Dataset(df_train[NUMERIC_COLS], df_train['bad_ind'],
                        free_raw_data=False)
params = {
    'num_boost_round': 50,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'num_leaves': 2,
    'metric': 'auc',
    'max_depth': 1,
    'feature_fraction': 1,
    'bagging_fraction': 1,
}
model = lgb.train(params, lgb_train)
# pred_leaf=True returns, for each sample, the index of the leaf it falls
# into in every tree: a matrix of shape (n_samples, n_trees).
leaf = model.predict(df_train[NUMERIC_COLS], pred_leaf=True)
# Generate the cross features; ignore any leaf unseen during fitting.
lgb_enc = OneHotEncoder(handle_unknown='ignore')
lgb_enc.fit(leaf)
# Concatenate with the original features. With 50 trees of 2 leaves each,
# one-hot encoding yields 100 leaf features on top of the 4 original ones.
data_leaf = np.hstack((lgb_enc.transform(leaf).toarray(),
                       df_train[NUMERIC_COLS]))

#%% Map the test set with the same model and the encoder fitted on the
# training leaves (refitting the encoder on the test leaves, as one might
# be tempted to do, would misalign the columns between the two sets).
leaf_test = model.predict(df_test[NUMERIC_COLS], pred_leaf=True)
data_leaf_test = np.hstack((lgb_enc.transform(leaf_test).toarray(),
                            df_test[NUMERIC_COLS]))

#%% Fit a logistic regression on the new, expanded features
train = data_leaf.copy()
train_y = df_train['bad_ind'].copy()
val = data_leaf_test.copy()
val_y = df_test['bad_ind'].copy()

lgb_lm = LogisticRegression(penalty='l2', C=0.2,
                            class_weight='balanced', solver='liblinear')
lgb_lm.fit(train, train_y)

y_pred_lgb_lm_train = lgb_lm.predict_proba(train)[:, 1]
fpr_lgb_lm_train, tpr_lgb_lm_train, _ = roc_curve(train_y, y_pred_lgb_lm_train)
y_pred_lgb_lm = lgb_lm.predict_proba(val)[:, 1]
fpr_lgb_lm, tpr_lgb_lm, _ = roc_curve(val_y, y_pred_lgb_lm)

plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_lgb_lm_train, tpr_lgb_lm_train, label='LGB + LR train')
plt.plot(fpr_lgb_lm, tpr_lgb_lm, label='LGB + LR test')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()

print('LGB+LR train ks:', abs(fpr_lgb_lm_train - tpr_lgb_lm_train).max(),
      'LGB+LR AUC:', metrics.auc(fpr_lgb_lm_train, tpr_lgb_lm_train))
print('LGB+LR test ks:', abs(fpr_lgb_lm - tpr_lgb_lm).max(),
      'LGB+LR AUC:', metrics.auc(fpr_lgb_lm, tpr_lgb_lm))
'''
LGB+LR train ks: 0.4812287054151174  LGB+LR AUC: 0.8116320314831054
LGB+LR test ks: 0.4441149787927866   LGB+LR AUC: 0.7776214991730668
'''
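Note that KS is obtained directly from the ROC arrays: KS = max over thresholds of |TPR − FPR|, the largest vertical gap between the ROC curve and the diagonal, which is exactly what abs(fpr_lgb_lm - tpr_lgb_lm).max() computes from the outputs of roc_curve.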
#%% Name the generated features; the names are needed for the feature
# selection steps below.
dff_train = pd.DataFrame(train)
dff_train.columns = ['ft' + str(x) for x in range(train.shape[1])]
dff_val = pd.DataFrame(val)
dff_val.columns = ['ft' + str(x) for x in range(val.shape[1])]

#%% Build the frequency tables that feed the PSI computation: for each
# (binary) feature, count how many samples take the value 0 and the value 1.
def make_psi_data(dff_train):
    dftot = pd.DataFrame()
    for col in dff_train.columns:
        zero = sum(dff_train[col] == 0)
        one = sum(dff_train[col] == 1)
        ftdf = pd.DataFrame(np.array([zero, one]))
        ftdf.columns = [col]
        if len(dftot) == 0:
            dftot = ftdf.copy()
        else:
            dftot[col] = ftdf[col].copy()
    return dftot

psi_data_train = make_psi_data(dff_train)
psi_data_val = make_psi_data(dff_val)

def var_PSI(dev_data, val_data):
    dev_cnt, val_cnt = sum(dev_data), sum(val_data)
    if dev_cnt * val_cnt == 0:
        return 0
    PSI = 0
    for i in range(len(dev_data)):
        # A small epsilon on both ratios guards against division by zero
        # and log(0) when a bin is empty.
        dev_ratio = dev_data[i] / dev_cnt + 1e-10
        val_ratio = val_data[i] / val_cnt + 1e-10
        psi = (dev_ratio - val_ratio) * math.log(dev_ratio / val_ratio)
        PSI += psi
    return PSI

psi_dct = {}
for col in dff_train.columns:
    psi_dct[col] = var_PSI(psi_data_train[col], psi_data_val[col])

f = sorted(psi_dct.items(), key=lambda x: x[1])
psi_df = pd.DataFrame(f)
psi_df.columns = ['feature', 'PSI']
# Keep the 60% of features with the lowest PSI, i.e., the most stable ones.
feature_lst = list(psi_df[psi_df['PSI'] < psi_df['PSI'].quantile(0.6)]['feature'])

train = dff_train[feature_lst].copy()
train_y = df_train['bad_ind'].copy()
val = dff_val[feature_lst].copy()
val_y = df_test['bad_ind'].copy()

lgb_lm = LogisticRegression(C=0.3, class_weight='balanced', solver='liblinear')
lgb_lm.fit(train, train_y)

y_pred_lgb_lm_train = lgb_lm.predict_proba(train)[:, 1]
fpr_lgb_lm_train, tpr_lgb_lm_train, _ = roc_curve(train_y, y_pred_lgb_lm_train)
y_pred_lgb_lm = lgb_lm.predict_proba(val)[:, 1]
fpr_lgb_lm, tpr_lgb_lm, _ = roc_curve(val_y, y_pred_lgb_lm)

plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_lgb_lm_train, tpr_lgb_lm_train, label='LGB + LR train')
plt.plot(fpr_lgb_lm, tpr_lgb_lm, label='LGB + LR test')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()

print('LGB+LR train ks:', abs(fpr_lgb_lm_train - tpr_lgb_lm_train).max(),
      'LGB+LR AUC:', metrics.auc(fpr_lgb_lm_train, tpr_lgb_lm_train))
print('LGB+LR test ks:', abs(fpr_lgb_lm - tpr_lgb_lm).max(),
      'LGB+LR AUC:', metrics.auc(fpr_lgb_lm, tpr_lgb_lm))
'''
LGB+LR train ks: 0.47632382032329534  LGB+LR AUC: 0.807277659727129
LGB+LR test ks: 0.4463346827179526    LGB+LR AUC: 0.7794119538226763
'''

#%% Select features by feature importance
x = train
y = train_y
val_x = val

# Train an LGBM model and return it together with its validation AUC.
def LGB_test(train_x, train_y, test_x, test_y):
    from multiprocessing import cpu_count
    clf = lgb.LGBMClassifier(
        boosting_type='gbdt', num_leaves=31, reg_alpha=0.0, reg_lambda=1,
        max_depth=2, n_estimators=800, objective='binary',
        subsample=0.7, colsample_bytree=0.7, subsample_freq=1,
        learning_rate=0.05, min_child_weight=50,
        random_state=None, n_jobs=cpu_count() - 1)
    clf.fit(train_x, train_y,
            eval_set=[(train_x, train_y), (test_x, test_y)],
            eval_metric='auc', early_stopping_rounds=100)
    return clf, clf.best_score_['valid_1']['auc']

model, auc = LGB_test(x, y, val_x, val_y)

# Collect each feature's contribution to the model in `feature`
feature = pd.DataFrame({
    'name': model.booster_.feature_name(),
    'importance': model.feature_importances_,
}).sort_values(by=['importance'], ascending=False)
feature_lst2 = list(feature[feature.importance > 5].name)

#%% Retrain on the importance-selected features
train = dff_train[feature_lst2].copy()
train_y = df_train['bad_ind'].copy()
val = dff_val[feature_lst2].copy()
val_y = df_test['bad_ind'].copy()

lgb_lm = LogisticRegression(C=0.3, class_weight='balanced', solver='liblinear')
lgb_lm.fit(train, train_y)

y_pred_lgb_lm_train = lgb_lm.predict_proba(train)[:, 1]
fpr_lgb_lm_train, tpr_lgb_lm_train, _ = roc_curve(train_y, y_pred_lgb_lm_train)
y_pred_lgb_lm = lgb_lm.predict_proba(val)[:, 1]
fpr_lgb_lm, tpr_lgb_lm, _ = roc_curve(val_y, y_pred_lgb_lm)

plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_lgb_lm_train, tpr_lgb_lm_train, label='LGB + LR train')
plt.plot(fpr_lgb_lm, tpr_lgb_lm, label='LGB + LR test')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()

print('LGB+LR train ks:', abs(fpr_lgb_lm_train - tpr_lgb_lm_train).max(),
      'LGB+LR AUC:', metrics.auc(fpr_lgb_lm_train, tpr_lgb_lm_train))
print('LGB+LR test ks:', abs(fpr_lgb_lm - tpr_lgb_lm).max(),
      'LGB+LR AUC:', metrics.auc(fpr_lgb_lm, tpr_lgb_lm))
'''
LGB+LR train ks: 0.4687230745337274   LGB+LR AUC: 0.8045813389226749
LGB+LR test ks: 0.44510149222090417   LGB+LR AUC: 0.7841449970149346
'''
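Across the three runs, train KS falls (0.4812 → 0.4763 → 0.4687) while test AUC climbs (0.7776 → 0.7794 → 0.7841): each screening pass gives up a little in-sample fit for slightly better generalization, though, as noted at the start, still not enough to beat using the ensemble model directly.

One compatibility note on LGB_test: recent LightGBM releases (4.x) removed early_stopping_rounds from the scikit-learn fit() signature. If the call above errors on that argument, the callback-based form is the equivalent (a sketch, assuming LightGBM ≥ 3.3 where lgb.early_stopping is available):

clf.fit(train_x, train_y,
        eval_set=[(train_x, train_y), (test_x, test_y)],
        eval_metric='auc',
        callbacks=[lgb.early_stopping(stopping_rounds=100)])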