Python3入门人工智能 掌握机器学习 深度学习 提升实战能力6:模型评价与优化

    过拟合和欠拟合












    数据分离与混淆矩阵














    模型优化















    实战准备





    实战一










    1 #generate new data 建立新数据
    2 x_2_range = np.linspace(40,90,300).reshape(-1,1)#最小值40,最大值90,产生300个点;转成300行一列的数组
    3 x_2_range = poly2.transform(x_2_range)
    4 y_2_range_predict = lr2.predict(x_2_range)
    6 x_5_range = np.linspace(40,90,300).reshape(-1,1)#最小值40,最大值90,产生300个点;转成300行一列的数组
    7 x_5_range = poly5.transform(x_5_range)
    8 y_5_range_predict = lr5.predict(x_5_range)


    实战二

    1 #load the data
    2 import pandas as pd
    3 import numpy as np
    4 data = pd.read_csv('data_class_raw.csv')
    5 data.head()

    1 #define x and y
    2 x = data.drop(['y'],axis=1)
    3 y = data.loc[:,'y']
    4 print(x.shape,y.shape)

     1 #visualize the data
     2 %matplotlib inline
     3 from matplotlib import pyplot as plt
     4 fig1 = plt.figure(figsize=(5,5))
     5 bad = plt.scatter(x.loc[:,'x1'][y==0],x.loc[:,'x2'][y==0])
     6 good = plt.scatter(x.loc[:,'x1'][y==1],x.loc[:,'x2'][y==1])
     7 plt.legend((good,bad),('good','bad'))
     8 plt.title('raw data')
     9 plt.xlabel('x1')
    10 plt.ylabel('x2')
    11 plt.show()

    1 #anomaly detection 异常点检测
    2 from sklearn.covariance import EllipticEnvelope
    3 ad_model = EllipticEnvelope(contamination=0.02)
    4 ad_model.fit(x[y==0])
    5 y_predict_bad = ad_model.predict(x[y==0])
    6 print(y_predict_bad)
    [ 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 -1]
     1 #visualize the data
     2 %matplotlib inline
     3 from matplotlib import pyplot as plt
     4 fig1 = plt.figure(figsize=(5,5))
     5 bad = plt.scatter(x.loc[:,'x1'][y==0],x.loc[:,'x2'][y==0])
     6 good = plt.scatter(x.loc[:,'x1'][y==1],x.loc[:,'x2'][y==1])
     7 plt.scatter(x.loc[:,'x1'][y==0][y_predict_bad==-1],x.loc[:,'x2'][y==0][y_predict_bad==-1],marker='x',s=150)
     8 plt.legend((good,bad),('good','bad'))
     9 plt.title('raw data')
    10 plt.xlabel('x1')
    11 plt.ylabel('x2')
    12 plt.show()

    1 data = pd.read_csv('data_class_processed.csv')
    2 data.head()
    3 #define x and y
    4 x = data.drop(['y'],axis=1)
    5 y = data.loc[:,'y']
     1 #pca
     2 from sklearn.preprocessing import StandardScaler
     3 from sklearn.decomposition import PCA
     4 x_norm = StandardScaler().fit_transform(x)#标准化处理数据
     5 pca = PCA(n_components=2)
     6 x_reduced = pca.fit_transform(x_norm)
     7 var_ratio = pca.explained_variance_ratio_
     8 print(var_ratio)
     9 fig4 = plt.figure(figsize=(5,5))
    10 plt.bar([1,2],var_ratio)
    11 plt.show()

    1 #train and test split:random_state=4,test_size=0.4 数据分离
    2 from sklearn.model_selection import train_test_split
    3 x_train, x_test, y_train, y_test = train_test_split(x,y,random_state=4,test_size=0.4)
    4 print(x_train.shape,x_test.shape,x.shape)
    (21, 2) (14, 2) (35, 2)
     1 #knn model
     2 from sklearn.neighbors import KNeighborsClassifier
     3 knn_10 = KNeighborsClassifier(n_neighbors=10)
     4 knn_10.fit(x_train,y_train)
     5 y_train_predict = knn_10.predict(x_train)
     6 y_test_predict = knn_10.predict(x_test)
     7 #calculate the accuracy
     8 from sklearn.metrics import accuracy_score
     9 accuracy_train = accuracy_score(y_train,y_train_predict)
    10 accuracy_test = accuracy_score(y_test,y_test_predict)
    11 print('training accuracy:',accuracy_train)
    12 print('testing accuracy:',accuracy_test)
    training accuracy: 0.9047619047619048
    testing accuracy: 0.6428571428571429

     1 #visualize the knn result and boundary
     2 xx, yy = np.meshgrid(np.arange(0,10,0.05),np.arange(0,10,0.05))
     3 print(yy.shape)

    (200, 200) 

    1 x_range = np.c_[xx.ravel(),yy.ravel()]
    2 print(x_range.shape)
    (40000, 2)
    1 y_range_predict = knn_10.predict(x_range) 
     1 fig4 = plt.figure(figsize=(5,5))
     2 knn_bad = plt.scatter(x_range[:,0][y_range_predict==0],x_range[:,1][y_range_predict==0])
     3 knn_good = plt.scatter(x_range[:,0][y_range_predict==1],x_range[:,1][y_range_predict==1])
     5 bad = plt.scatter(x.loc[:,'x1'][y==0],x.loc[:,'x2'][y==0])
     6 good = plt.scatter(x.loc[:,'x1'][y==1],x.loc[:,'x2'][y==1])
     7 plt.legend((good,bad,knn_good,knn_bad),('good','bad','knn_good','knn_bad'))
     8 plt.title('raw data')
     9 plt.xlabel('x1')
    10 plt.ylabel('x2')
    11 plt.show()

      1 from sklearn.metrics import confusion_matrix
      2 cm = confusion_matrix(y_test,y_test_predict)
      3 print(cm)

    [[4 2]
     [3 5]]
     1 TP = cm[1,1]
     2 TN = cm[0,0]
     3 FP = cm[0,1]
     4 FN = cm[1,0]
     5 print(TP,TN,FP,FN)
    5 4 2 3
     1 accuracy =(TP+TN)/(TP+TN+FP+FN)#准确率:整体样本中正确样本数的比例
     2 recall = TP/(TP+FN)#Sensitivity 灵敏度(召回率):正样本中,预测正确的比例
     3 specificity = TN/(TN+FP)#特异度:负样本中,预测正确的比例
     4 precision = TP/(TP+FP)#精确率:预测结果为正样本中,预测正确的比例
     5 f1 = 2*precision*recall/(precision + recall)#F1 Score:综合Precision和Recall的一个判断指标
     6 print('准确率:',accuracy)
     7 print('灵敏度:',recall)
     8 print('特异度:',specificity)
     9 print('精确率:',precision)
    10 print('F1 Score:',f1)
    准确率: 0.6428571428571429
    灵敏度: 0.625
    特异度: 0.6666666666666666
    精确率: 0.7142857142857143
    F1 Score: 0.6666666666666666
     1 #try different k and calculate the accuracy for each
     2 n = [i for i in range(1,21)]
     3 accuracy_train = []
     4 accuracy_test = []
     5 for i in n:
     6     knn = KNeighborsClassifier(n_neighbors=i)
     7     knn.fit(x_train,y_train)
     8     y_train_predict = knn.predict(x_train)
     9     y_test_predict = knn.predict(x_test)
    10     accuracy_train_i = accuracy_score(y_train,y_train_predict)
    11     accuracy_test_i = accuracy_score(y_test,y_test_predict)
    12     accuracy_train.append(accuracy_train_i)
    13     accuracy_test.append(accuracy_test_i)
    14 print(accuracy_train,accuracy_test)
    [1.0, 1.0, 1.0, 1.0, 1.0, 0.9523809523809523, 0.9523809523809523, 0.9523809523809523, 0.9047619047619048, 0.9047619047619048, 0.9047619047619048, 0.9523809523809523, 0.9047619047619048, 0.9047619047619048, 0.9523809523809523, 0.9047619047619048, 0.9047619047619048, 0.5714285714285714, 0.5714285714285714, 0.5714285714285714]
    [0.5714285714285714, 0.5, 0.5, 0.5714285714285714, 0.7142857142857143, 0.5714285714285714, 0.5714285714285714, 0.5714285714285714, 0.6428571428571429, 0.6428571428571429, 0.6428571428571429, 0.5714285714285714, 0.6428571428571429, 0.6428571428571429, 0.5714285714285714, 0.5714285714285714, 0.5714285714285714, 0.42857142857142855, 0.42857142857142855, 0.42857142857142855]
     1 fig5 = plt.figure(figsize=(12,5))
     2 plt.subplot(121)
     3 plt.plot(n,accuracy_train,marker='o')
     4 plt.title('training accuracy vs n_neighbors')
     5 plt.xlabel('n_neighbors')
     6 plt.ylabel('accuracy')
     8 plt.subplot(122)
     9 plt.plot(n,accuracy_test,marker='o')
    10 plt.title('testing accuracy vs n_neighbors')
    11 plt.xlabel('n_neighbors')
    12 plt.ylabel('accuracy')
    13 plt.show()

