Credit card fraud detection model
- Challenge: the data is highly imbalanced. There are two common remedies: oversampling and undersampling.
- Undersampling: subsample the majority class so that both classes end up with the same (small) number of samples.
- Oversampling: generate synthetic samples for the minority class so that both classes end up with the same (large) number of samples.
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings('ignore')
# Load the dataset
data = pd.read_csv('./data/creditcard.csv')
print(data.head())
# Inspect the shape of the data
print(data.shape)
# Count the positive and negative samples to check whether the classes are balanced
count_classes = data['Class'].value_counts().sort_index()
print(count_classes)
count_classes.plot(kind='bar')
plt.title('Fraud class histogram')
plt.xlabel('Class')
plt.ylabel('Frequency')
plt.show()
# The classes are imbalanced; there are two remedies: oversampling and undersampling.
# Undersampling: shrink class 0 until it is as scarce as class 1, giving a balanced set.
# Oversampling: generate synthetic class-1 samples until they are as numerous as class 0.
# The Amount column is on a much larger scale than the other features, so it needs
# preprocessing to keep the feature distributions comparable:
# either Min-Max normalization or Z-score standardization (see the note after the preprocessing below).
from sklearn.preprocessing import StandardScaler
# fit_transform learns the scaling parameters and applies the transformation in one step
data['normAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
data = data.drop(['Time', 'Amount'], axis=1)
print(data.head())
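# Note: the script standardizes Amount with Z-score above. The Min-Max alternative
# mentioned in the comment would be a one-line swap; it is shown commented out here,
# since the raw 'Amount' column has already been dropped at this point:
# from sklearn.preprocessing import MinMaxScaler
# data['normAmount'] = MinMaxScaler().fit_transform(data['Amount'].values.reshape(-1, 1))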
# Start with undersampling: make both classes equally small
X = data.iloc[:, data.columns != 'Class']
y = data.iloc[:, data.columns == 'Class']
# Count the fraud samples and record their indices so the rows can be retrieved later
number_record_fraud = len(data[data.Class == 1])
fraud_indices = np.array(data[data.Class == 1].index)
# Randomly choose the same number of indices from the normal transactions
norm_indices = data[data.Class == 0].index
random_normal_indices = np.random.choice(norm_indices, number_record_fraud, replace=False)
random_normal_indices = np.array(random_normal_indices)
# Concatenate the two sets of indices into one
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])
# Select the undersampled rows by those indices
under_sample_data = data.iloc[under_sample_indices, :]
X_undersample = under_sample_data.iloc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.iloc[:, under_sample_data.columns == 'Class']
# Print the class ratio after undersampling
print("Percentage of normal transactions: ", len(under_sample_data[under_sample_data.Class == 0])/len(under_sample_data))
print("Percentage of fraud transactions: ", len(under_sample_data[under_sample_data.Class == 1])/len(under_sample_data))
print("Total number of transactions in resampled data: ", len(under_sample_data))
# Preprocessing done: undersampling has turned the imbalanced data into a balanced set
from sklearn.model_selection import train_test_split
# First split the full data into a training set and a test set (train - test).
# Cross-validation then splits the training set several times into training and
# validation folds, so every part of the training data gets used for training and
# each round validates on a different fold. Any single split can make the model look
# better or worse than it really is, so we average the scores across folds; this
# matters when choosing hyperparameters. (A compact one-call illustration appears
# after the undersampled split below.)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print("Number transactions train dataset: ", len(X_train))
print("Number transactions test dataset: ", len(X_test))
print("Total number of transctions: ", len(X_test)+ len(X_train))
# Split the undersampled data the same way
X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(
    X_undersample, y_undersample, test_size=0.3, random_state=0
)
print("**************")
print("Number transactions train dataset: ", len(X_train_undersample))
print("Number transactions test dataset: ", len(X_test_undersample))
print("Total number of transctions: ", len(X_test_undersample)+ len(X_train_undersample))
# Build the model
# Recall = TP / (TP + FN): the recall (a.k.a. sensitivity), i.e. the fraction of true fraud cases we catch
from sklearn.linear_model import LogisticRegression
# KFold performs the cross-validation splits (its first argument sets the number of folds)
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix, recall_score, classification_report
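# A tiny worked example of the recall formula above (made-up labels, for illustration only):
# with y_true = [1, 1, 1, 0] and y_pred = [1, 0, 1, 0] there are TP = 2 and FN = 1,
# so Recall = 2 / (2 + 1) ≈ 0.67, which recall_score confirms:
print(recall_score([1, 1, 1, 0], [1, 0, 1, 0]))  # 0.6666...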
def printing_Kfold_score(x_train_data, y_train_data):
    # Try several values of C, the inverse regularization strength. The L2 penalty keeps
    # the weights small and the model more stable: we minimize Loss + (1/C) * ||w||^2,
    # so a smaller C means a stronger penalty.
    c_param_range = [0.01, 0.1, 1, 10, 100]
    results_table = pd.DataFrame(index=range(len(c_param_range)), columns=['C_parameter', 'Mean recall score'])
    results_table['C_parameter'] = c_param_range
    # the k-fold loop
    j = 0
    for c_param in c_param_range:
        print('------------------------------------')
        print("C parameter: ", c_param)
        print('------------------------------------')
        print('')
        recall_accs = []
        # cross-validation
        iteration = 1
        fold = KFold(5, shuffle=False)
        for train_index, test_index in fold.split(x_train_data):
            # logistic regression with an L2 penalty of strength 1/C
            lr = LogisticRegression(C=c_param, penalty='l2', max_iter=1000)
            # fit on the training folds
            lr.fit(x_train_data.iloc[train_index, :], y_train_data.iloc[train_index, :].values.ravel())
            # predict on the held-out validation fold
            y_pred_undersample = lr.predict(x_train_data.iloc[test_index, :])
            # compute recall on the validation fold
            recall_acc = recall_score(y_train_data.iloc[test_index, :].values, y_pred_undersample)
            recall_accs.append(recall_acc)
            print('Iteration ', iteration, ": recall score = ", recall_acc)
            iteration += 1
        # the mean of the recall scores across folds
        results_table.loc[j, 'Mean recall score'] = np.mean(recall_accs)
        j += 1
        print('')
        print("Mean recall score ", np.mean(recall_accs))
        print('')
    # cast to float so that .max() and the comparison below behave as expected
    results_table['Mean recall score'] = results_table['Mean recall score'].astype('float64')
    best_c = results_table.loc[results_table['Mean recall score'] == results_table['Mean recall score'].max(), 'C_parameter'].values[0]
    # Finally, report the best C found by cross-validation
    print('************************************************')
    print("Best model to choose from cross validation is with C parameter = ", best_c)
    print('************************************************')
    return best_c
# best_c = printing_Kfold_score(X_train_undersample, y_train_undersample)
# print(best_c)
#
# # Confusion matrix: predicted values vs. true values.
# # The undersampled model's recall is acceptable, but it misclassifies too many
# # normal transactions as fraud (a high false-positive count).
#
# lr = LogisticRegression(C=best_c, penalty='l2')
#
# # Train on the full (imbalanced) training set
# lr.fit(X_train, y_train.values.ravel())
#
# # Predict on the full test set
# y_pred_undersample = lr.predict(X_test.values)
#
# # Compute the confusion matrix
# cnf_matrix = confusion_matrix(y_test, y_pred_undersample)
# np.set_printoptions(precision=2)
#
# print("Recall metric in the testing dataset: ", cnf_matrix[1, 1]/(cnf_matrix[1, 0] + cnf_matrix[1, 1]))
#
# # Plot the confusion matrix
# class_names = [0, 1]
# plt.figure()
# plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
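# The commented-out block above calls plot_confusion_matrix, a helper that is not defined
# anywhere in this script. A minimal sketch of such a helper, assuming the signature used
# above (confusion matrix, class labels, title), could look like this:
import itertools

def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    """Render a confusion matrix as a colored grid with the count written in each cell."""
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)
    # write each count onto the grid, switching to white text on dark cells
    thresh = cm.max() / 2.0
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], 'd'),
                 horizontalalignment='center',
                 color='white' if cm[i, j] > thresh else 'black')
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()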
# Oversampling strategy: SMOTE
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
credit_data = pd.read_csv('./data/creditcard.csv')
columns = credit_data.columns
# The label is the last column, 'Class'; drop it to get the feature columns
features_columns = columns.delete(len(columns)-1)
features = credit_data[features_columns]
labels = credit_data['Class']
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.2, random_state=0
)
# Generate synthetic samples on the training set only; the test set must stay untouched
oversampler = SMOTE(random_state=0)
os_features, os_labels = oversampler.fit_resample(features_train, labels_train)
print(len(os_labels[os_labels == 1]))
os_features = pd.DataFrame(os_features)
os_labels = pd.DataFrame(os_labels)
best_c = printing_Kfold_score(os_features, os_labels)
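# To close the loop, a sketch of one possible next step (an assumption about the intended
# follow-up, not part of the original script): refit logistic regression with the best C
# found on the oversampled data, then measure recall on the untouched test set.
lr = LogisticRegression(C=best_c, penalty='l2', max_iter=1000)
lr.fit(os_features, os_labels.values.ravel())
y_pred = lr.predict(features_test.values)
cnf_matrix = confusion_matrix(labels_test, y_pred)
print("Recall on the held-out test set: ", cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1]))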