    • 难点: 数据不均衡,有两种解决方案,一种是过采样和一种是下采样
    • 过采样是通过对少的数据进行数据生成,使得两种数据数量相同
    • 下采样是对多的数据进行筛选,使得两种数据数量相同
    import pandas as pd
    import matplotlib.pyplot as plt
    import numpy as np
    import warnings
    # Load the credit-card transactions data set.
    data = pd.read_csv('./data/creditcard.csv')
    # Count the rows of each 'Class' value to see whether the two classes are
    # balanced. Series.value_counts replaces the deprecated top-level
    # pd.value_counts and yields the same counts.
    count_classes = data['Class'].value_counts(sort=True).sort_index()
    # Draw the counts so that the title below labels an actual chart instead
    # of an empty figure.
    count_classes.plot(kind='bar')
    plt.title('Fraud class histogram')
    # Two remedies for the class imbalance are explored in this script:
    #   * under-sampling: keep only as many rows of class 0 as there are of class 1;
    #   * over-sampling: synthesise extra class-1 rows until they match class 0.
    # 'Amount' takes much larger values than the other features, so it is
    # rescaled to bring the feature distributions closer together
    # (either min-max normalisation or z-score standardisation would do).
    from sklearn.preprocessing import StandardScaler
    # fit_transform learns the scaling from the column and applies it in one
    # step; the reshape turns the 1-D column into the 2-D input it expects.
    data['normAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))
    data = data.drop(['Time', 'Amount'], axis=1)
    # Under-sampling first: shrink class 0 down to the size of class 1.
    X = data.loc[:, data.columns != 'Class']
    y = data.loc[:, data.columns == 'Class']
    # Collect the index labels of the fraud rows (Class == 1) and count them.
    fraud_indices = np.array(data[data.Class == 1].index)
    number_record_fraud = len(fraud_indices)
    # Randomly draw, without replacement, the same number of normal rows.
    norm_indices = data[data.Class == 0].index
    random_normal_indices = np.random.choice(norm_indices, number_record_fraud, replace=False)
    # Merge both sets of labels into one selection.
    under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])
    # The values above are index *labels*, so select with .loc; positional
    # .iloc would only pick the right rows while the index is 0..n-1.
    under_sample_data = data.loc[under_sample_indices, :]
    X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
    y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'Class']
    # Report the class proportions of the balanced sample.
    print("Precentage of normal transcation: ", len(under_sample_data[under_sample_data.Class == 0])/len(under_sample_data))
    print("Precentage of fraud transcation: ", len(under_sample_data[under_sample_data.Class == 1])/len(under_sample_data))
    print("Total number of transactions in resample data: ", len(under_sample_data))
    # Pre-processing is done: under-sampling produced a balanced data set.
    from sklearn.model_selection import train_test_split
    # Split into a training set and a held-out test set first.
    # Cross-validation later re-splits the training set several times into
    # training and validation parts, so every training row is used for fitting
    # at some point; a single split may score too high or too low, so the
    # scores of the repeated runs are averaged when comparing parameters.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    print("Number transactions train dataset: ", len(X_train))
    print("Number transactions test dataset: ", len(X_test))
    print("Total number of transctions: ", len(X_test)+ len(X_train))
    # Same 70/30 split for the under-sampled data.
    X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(
        X_undersample, y_undersample, test_size=0.3, random_state=0
    )
    print("Number transactions train dataset: ", len(X_train_undersample))
    print("Number transactions test dataset: ", len(X_test_undersample))
    print("Total number of transctions: ", len(X_test_undersample)+ len(X_train_undersample))
    # 建立模型
    # Recall = TP/(TP + FN) , 召回率和查全率
    from sklearn.linear_model import LogisticRegression
    # KFold 是做交叉验证,做几次交叉验证,
    from sklearn.model_selection import KFold, cross_val_score
    from sklearn.metrics import confusion_matrix, recall_score, classification_report
    def printing_Kfold_score(x_train_data, y_train_data):
        """Pick the logistic-regression ``C`` with the best cross-validated recall.

        For every candidate ``C`` an L2-penalised ``LogisticRegression`` is
        fitted on each training split of an unshuffled 5-fold ``KFold`` and
        scored with ``recall_score`` on the matching held-out split. The
        per-fold recall, the mean recall per ``C`` and the final choice are
        printed along the way.

        Args:
            x_train_data: Feature ``DataFrame``; rows are selected by position.
            y_train_data: Labels aligned row-for-row with ``x_train_data``
                (a single-column ``DataFrame`` in this script).

        Returns:
            The candidate ``C`` with the highest mean recall. On a tie the
            earliest candidate in the list wins.
        """
        # Candidate values for C, scikit-learn's inverse regularisation
        # strength: smaller values penalise large weights more strongly.
        c_param_range = [0.01, 0.1, 1, 10, 100]
        # Unshuffled splitter, so every C is evaluated on the same five folds.
        fold = KFold(5, shuffle=False)
        # Flatten the labels once; fit and recall_score both want 1-D arrays.
        labels = y_train_data.values.ravel()

        mean_recalls = []
        for c_param in c_param_range:
            print("C parameter: ", c_param)
            recall_accs = []
            for iteration, (train_index, test_index) in enumerate(fold.split(x_train_data), start=1):
                lr = LogisticRegression(C=c_param, penalty='l2')
                lr.fit(x_train_data.iloc[train_index, :], labels[train_index])
                # Predict on the same kind of object used for fitting.
                y_pred = lr.predict(x_train_data.iloc[test_index, :])
                recall_acc = recall_score(labels[test_index], y_pred)
                # Keep the fold score; without this the mean below is NaN.
                recall_accs.append(recall_acc)
                print('Iteration ', iteration, ": recall score = ", recall_acc)
            # Average recall over the folds for this C.
            mean_recall = np.mean(recall_accs)
            mean_recalls.append(mean_recall)
            print("Mean recall score ", mean_recall)

        results_table = pd.DataFrame(
            {'C_parameter': c_param_range, 'Mean recall score': mean_recalls}
        )
        # idxmax returns the first row holding the maximum.
        best_c = results_table.loc[results_table['Mean recall score'].idxmax(), 'C_parameter']
        print("Best model to choose from cross validation is with C parameter = ", best_c )
        return best_c
    # best_c = printing_Kfold_score(X_train_undersample, y_train_undersample)
    # print(best_c)
    # # 混淆矩阵  预测值 和 真实值
    # # 下采样的recall值是可以的, 但是误杀太多了
    # lr = LogisticRegression(C=best_c, penalty='l2')
    # # 进行训练
    # lr.fit(X_train, y_train.values.ravel())
    # # 进行预测
    # y_pred_undersample = lr.predict(X_test.values)
    # # 计算混淆矩阵
    # cnf_matrix = confusion_matrix(y_test, y_pred_undersample)
    # np.set_printoptions(precision=2)
    # print("Recall metrix in the testing dataset: ", cnf_matrix[1, 1]/(cnf_matrix[1, 0] + cnf_matrix[1, 1]))
    # # 画出混淆矩阵
    # class_names = [0, 1]
    # plt.figure()
    # plot_confusion_matrix(cnf_matrix, classes = class_names, title = 'Confusion matrix')
    # 过采样的策略
    import pandas as pd
    from imblearn.over_sampling import SMOTE
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import confusion_matrix
    from sklearn.model_selection import train_test_split
    # Reload the raw data set for the over-sampling experiment.
    credit_data = pd.read_csv('./data/creditcard.csv')
    columns = credit_data.columns
    # The label 'Class' is expected to be the last column; dropping the last
    # column name leaves the feature columns.
    features_columns = columns.delete(len(columns)-1)
    features = credit_data[features_columns]
    labels = credit_data['Class']
    features_train, features_test, labels_train, labels_test = train_test_split(
        features, labels, test_size=0.2, random_state=0
    )
    # Synthesise samples for the training split only; the test split must
    # stay untouched.
    oversampler = SMOTE(random_state=0)
    # fit_resample is the current imbalanced-learn name for the removed
    # fit_sample method.
    os_features, os_labels = oversampler.fit_resample(features_train, labels_train)
    # Number of class-1 rows after resampling.
    print(len(os_labels[os_labels == 1]))
    # printing_Kfold_score selects rows with .iloc, so pass DataFrames.
    os_features = pd.DataFrame(os_features)
    os_labels = pd.DataFrame(os_labels)
    best_c = printing_Kfold_score(os_features, os_labels)
