import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


def f(x, theta=None):
    """Return both X2 roots of the second-order decision boundary at ``x``.

    The boundary is the set of points where

        theta0 + theta1*X1 + theta2*X2 + theta3*X1**2
               + theta4*X2**2 + theta5*X1*X2 == 0

    For a fixed ``X1 = x`` this is the quadratic ``a*X2**2 + b*X2 + c = 0``
    with ``a = theta4``, ``b = theta5*x + theta2`` and
    ``c = theta0 + theta1*x + theta3*x**2``.

    Args:
        x: X1 value(s); a scalar, NumPy array or pandas Series.
        theta: optional sequence ``(theta0, ..., theta5)``. When omitted the
            six coefficients are read from module-level names
            ``theta0`` .. ``theta5`` (the original calling convention).

    Returns:
        A tuple ``(root_plus, root_minus)`` using ``+sqrt`` and ``-sqrt`` of
        the discriminant. Entries are NaN where the discriminant is negative,
        i.e. where the boundary has no real X2 at that ``x``.

    Raises:
        NameError: if ``theta`` is omitted and the module-level coefficients
            are not defined.
    """
    if theta is None:
        # Legacy call style ``f(x)``: the original read theta0..theta5 from
        # module globals, so keep honouring them when they exist.
        module_vars = globals()
        try:
            theta = tuple(module_vars['theta%d' % i] for i in range(6))
        except KeyError as missing:
            raise NameError('name %s is not defined' % missing) from None

    t0, t1, t2, t3, t4, t5 = theta
    a = t4  # the division by 2*a below requires theta4 != 0
    # The linear term must use the argument x, not the global sorted sample
    # vector, otherwise every call returns a whole Series.
    b = t5 * x + t2
    c = t0 + t1 * x + t3 * x * x

    # A negative discriminant yields NaN on purpose (no real root there), so
    # silence NumPy's "invalid value" warning for this one operation.
    with np.errstate(invalid='ignore'):
        root = np.sqrt(b * b - 4 * a * c)
    return (-b + root) / (2 * a), (-b - root) / (2 * a)


def _label_axes():
    """Apply the title and axis labels shared by every figure."""
    plt.title('example1-example2')  # set the title
    plt.xlabel('example1')  # set the x-axis label
    plt.ylabel('example2')  # set the y-axis label


def _show_samples(data, mask, curves=()):
    """Show a figure of passed/failed samples plus optional boundary curves.

    Args:
        data: DataFrame with 'example1' and 'example2' columns.
        mask: boolean Series, True for the rows drawn as "passed".
        curves: iterable of ``(xs, ys)`` pairs drawn with ``plt.plot``.
    """
    plt.figure()
    x1 = data.loc[:, 'example1']
    x2 = data.loc[:, 'example2']
    passed = plt.scatter(x1[mask], x2[mask])
    failed = plt.scatter(x1[~mask], x2[~mask])
    for xs, ys in curves:
        plt.plot(xs, ys)
    _label_axes()
    plt.legend((passed, failed), ('passed', 'failed'))
    plt.show()  # display the figure


def main():
    """Fit a logistic regression on second-order features and plot its boundary."""
    # load the data
    # NOTE(review): the path is empty in the original script; fill in the
    # location of the CSV that has 'example1', 'example2' and 'pass' columns.
    data = pd.read_csv('')
    data.head()  # notebook leftover: displays nothing when run as a script

    # First look: all samples, without labels.
    plt.figure()
    plt.scatter(data.loc[:, 'example1'], data.loc[:, 'example2'])
    _label_axes()
    plt.show()  # display the figure

    # Second look: samples marked as passed / failed.
    mask = data.loc[:, 'pass'] == 1
    _show_samples(data, mask)

    # define the target and the second-order feature matrix
    y = data.loc[:, 'pass']
    y.head()  # notebook leftover: displays nothing when run as a script
    X1 = data.loc[:, 'example1']
    X2 = data.loc[:, 'example2']
    X_new = pd.DataFrame({
        'X1': X1,
        'X2': X2,
        'X1_2': X1 * X1,
        'X2_2': X2 * X2,
        'X1_X2': X1 * X2,
    })
    print(X_new)

    # train the model on the new features and report training accuracy
    LR2 = LogisticRegression()
    LR2.fit(X_new, y)
    y2_predict = LR2.predict(X_new)  # predict on the training data
    accuracy2 = accuracy_score(y, y2_predict)
    print(accuracy2)

    X1_new = X1.sort_values()  # ascending, so the curve is drawn left to right
    # Coefficient order follows the X_new columns: X1, X2, X1_2, X2_2, X1_X2.
    # intercept_ is a length-1 array; take the scalar so roots stay 1-D.
    theta = (LR2.intercept_[0], *LR2.coef_[0][:5])

    # Using only the "+sqrt" root separates just half of the data: the second
    # solution for X2 is ignored. The next figure adds the missing curve.
    X2_new_boundary1, X2_new_boundary2 = f(X1_new, theta)
    _show_samples(data, mask, [(X1_new, X2_new_boundary1)])

    # The correct approach: draw both roots.
    print(list(X2_new_boundary1), list(X2_new_boundary2))
    _show_samples(
        data,
        mask,
        [(X1_new, X2_new_boundary1), (X1_new, X2_new_boundary2)],
    )

    # Both roots are now drawn, but the two curves still do not join: the
    # sample X1 values are sparse, so there are gaps between them. Evaluate
    # the boundary on a dense X1 grid instead (this must iterate X1_range,
    # not X1_new, or the x and y lengths passed to plt.plot disagree).
    X1_range = -0.9 + np.arange(19000) / 10000
    X2_new_boundary1, X2_new_boundary2 = f(X1_range, theta)
    _show_samples(
        data,
        mask,
        [(X1_range, X2_new_boundary1), (X1_range, X2_new_boundary2)],
    )


if __name__ == '__main__':
    main()