针对银行客户流失预测,主要流程分为:特征预处理、特征选择,分类模型选择与训练。主要工作如下:
1:特征预处理与选择
对性别进行哑变量处理;
对"是否有****信息"等布尔型字段,将布尔值转换为0/1表示;
画出年龄直方图可以看出大致呈正态分布,对年龄分段处理后缺失值采用插补方式;
由于"资产当前总额、存储类资产当前总额、本币存储当前总金额"三者数值相等,"月日均余额、存储类资产月日均余额、本币存储月日均余额"三者数值相等,故每组仅保留一项特征,分别删除其余两项,以消除冗余;
针对*NUM,*DUR,*AMT,*BAL字段分别进行特征提取(SelectKBest)达到降维效果;
最后整合数据,特征标准化处理最终为44个特征(StandardScaler)。
2:分类模型选择与训练
数据集划分:采用两种方式——K折交叉验证(StratifiedKFold),以及用train_test_split按比例自主切分数据集
模型选择:分别采用决策树、提升树(GBDT/XGBoost)、SVM(libsvm)、神经网络(多层感知器算法)训练模型
3:对应python主要代码:
-
decisiontree.py
# decisiontree.py
# Train a decision-tree classifier on the standardized feature matrix and
# report accuracy / recall / precision / F1 on a 40% held-out test set.
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# NOTE(review): StS (standardized features) and y (churn labels) are assumed
# to be produced by the preprocessing step described above — confirm they are
# in scope before this snippet runs.
X_train, X_test, y_train, y_test = train_test_split(
    StS, y, test_size=0.4, random_state=0)

clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
pre_labels = clf.predict(X_test)

print('accuracy score:', accuracy_score(y_test, pre_labels, normalize=True))
print('recall score:', recall_score(y_test, pre_labels))
print('precision score:', precision_score(y_test, pre_labels))
print('f1 score:', f1_score(y_test, pre_labels))
- XGBoost.py
import xgboost as xgb from sklearn.preprocessing import StandardScaler #记录程序运行时间 import time start_time = time.time() from xgboost.sklearn import XGBClassifier from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,classification_report,roc_auc_score bankChurn = pd.read_csv('D:/work/lost data and dictionary/test/bankChurn.csv')#原始数据 bankChurn_data = pd.read_csv('D:/work/lost data and dictionary/test/bankChurn_data.csv')#预处理数据 Y_train=bankChurn['CHUR0_CUST_I0D']#标签 StS=StandardScaler().fit_transform(bankChurn_data) X_train,X_test,y_train,y_test=train_test_split(StS,Y_train,test_size=0.4,random_state=None) print(X_train.shape, X_test.shape) #模型参数设置 xlf = xgb.XGBClassifier(max_depth=10, learning_rate=0.1, n_estimators=10, silent=True, objective='binary:logistic', nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0, subsample=0.85, colsample_bytree=0.7, colsample_bylevel=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,#这个值是因为类别十分不平衡。 seed=1440) xlf.fit(X_train, y_train, eval_metric='error', verbose = True, eval_set = [(X_test, y_test)],early_stopping_rounds=100) # 计算 auc 分数、预测 preds = xlf.predict(X_test) pre_pro = xlf.predict_proba(X_test)[:,1] print('accuracy score:',accuracy_score(y_test,preds ,normalize=True)) print('classification report:',classification_report(y_test,preds )) print('precision score:',precision_score(y_test,preds )) print('roc_auc_score:%f' % roc_auc_score(y_test,pre_pro)) #输出运行时长 cost_time = time.time()-start_time print("xgboost success!",' ',"cost time:",cost_time,"(s)......")
-
libsvm.py
# libsvm.py
# Train/evaluate an RBF-kernel SVM with the LIBSVM python bindings, using
# 4-fold stratified cross-validation on the libsvm-format churn data.
import os

# Fixed: the original path string 'C:libsvm-2.81python' had lost its
# separators — reconstructed as the LIBSVM install directory. TODO confirm.
os.chdir(r'C:\libsvm-2.81\python')

import numpy as np  # was missing in the original: np is used below
from svmutil import *
from sklearn.model_selection import StratifiedKFold  # was missing in the original
from sklearn.metrics import accuracy_score, classification_report

y, x = svm_read_problem('bankchurnLibsvm.txt')  # data already in libsvm format
x = np.array(x)
y = np.array(y)

stratified_folder = StratifiedKFold(n_splits=4, random_state=0, shuffle=True)
for train_index, test_index in stratified_folder.split(x, y):
    print('shuffled train index:', train_index)
    print('shuffled test index:', test_index)
    print('shuffled x_train:', x[train_index])
    print('shuffled x_test:', x[test_index])
    print('shuffled y_train:', y[train_index])
    print('shuffled y_test:', y[test_index])
    print('.......')
    # svmutil expects plain python lists, not numpy arrays.
    y_train = list(y[train_index])
    y_test = list(y[test_index])
    x_train = list(x[train_index])
    x_test = list(x[test_index])
    m = svm_train(y_train, x_train, '-c 4 -g 2')  # C=4, gamma=2 (RBF)
    p_label, p_acc, p_val = svm_predict(y_test, x_test, m)
    print('accuracy score:', accuracy_score(y_test, p_label, normalize=True))
    print('classification report:', classification_report(y_test, p_label))
-
BPtest
# BPtest.py
# Train a multi-layer perceptron (BP network) on the preprocessed churn data
# and report train/test scores plus the full set of classification metrics.
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
# Deduplicated: accuracy_score was imported twice and cross_val_score was unused.
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, classification_report, roc_auc_score)

bankChurn = pd.read_csv('D:/work/lost data and dictionary/test/bankChurn.csv')
X_data = pd.read_csv('D:/work/lost data and dictionary/test/bankChurn_data.csv')
X_data = X_data.values[:, :]
Y_label = bankChurn['CHUR0_CUST_I0D']
Y_label = Y_label.values[:]

# Append the label column to the features so both are shuffled together.
data = np.hstack((X_data, Y_label.reshape(Y_label.size, 1)))
np.random.shuffle(data)  # shuffle the rows in place
X = data[:, :-1]
Y = data[:, -1]
train_x = X[:-8620]
test_x = X[-8620:]
train_y = Y[:-8620]
test_y = Y[-8620:]  # roughly a 50/50 train/test split

# Multi-layer perceptron classifier (BP algorithm).
classifier = MLPClassifier(hidden_layer_sizes=(30,), activation='logistic',
                           max_iter=1000)
clf = classifier.fit(train_x, train_y)
train_score = classifier.score(train_x, train_y)
test_score = classifier.score(test_x, test_y)
print('train_score:', train_score)
print('test_score:', test_score)

# Additional classification metrics (AUC uses the positive-class probability).
pre_labels = clf.predict(test_x)
pre_pro = clf.predict_proba(test_x)[:, 1]
print('accuracy score:', accuracy_score(test_y, pre_labels, normalize=True))
print('recall score:', recall_score(test_y, pre_labels))
print('classification report:', classification_report(test_y, pre_labels))
print('precision score:', precision_score(test_y, pre_labels))
print('f1 score:', f1_score(test_y, pre_labels))
print('roc_auc_score:%f' % roc_auc_score(test_y, pre_pro))
各模型评估结果对比:

| 指标 | DT | XGBoost | Libsvm | BP |
| --- | --- | --- | --- | --- |
| Accuracy | 0.856 | 0.91 | 0.894 | 0.90 |
| Precision | 0.86 | 0.89 | 0.84 | 0.88 |
| Recall | 0.86 | 0.91 | 0.89 | 0.90 |
| F1 score | 0.86 | 0.89 | 0.85 | 0.87 |