logistic 回归（线性和非线性）

一：线性logistic 回归

代码如下：

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.optimize as opt
import seaborn as sns

# Load the data set; the file has no header row, so column names are supplied.
path = 'ex2data1.txt'
data = pd.read_csv(
    path,
    header=None,
    names=['Exam 1', 'Exam 2', 'Admitted'],
)

# Split the rows by label: 'Admitted' == 1 is positive, 'Admitted' == 0 is negative.
positive = data[data['Admitted'] == 1]
negative = data[data['Admitted'] == 0]

# Disabled exploratory scatter plot of the positive/negative samples.
# The code below sits inside a bare triple-quoted string, so it is never executed;
# remove the surrounding quotes to run it.
'''
#查看分布
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(positive['Exam 1'], positive['Exam 2'], s=60, c='b', marker='o', label='Admitted')
ax.scatter(negative['Exam 1'], negative['Exam 2'], s=50, c='r', marker='x', label='UnAdmitted')
ax.legend()
ax.set_xlabel('Exam 1 Score')
ax.set_ylabel('Exam 2 Score')
plt.show()
'''

# Sigmoid (logistic) function.
def sigmoid(h):
    """Return the element-wise logistic function ``1 / (1 + exp(-h))``.

    ``h`` is handed to ``np.exp``, so anything NumPy can exponentiate
    (scalar, array, matrix) is accepted and the result has the matching shape.
    """
    exp_neg = np.exp(-h)
    return 1 / (1 + exp_neg)


# Disabled sanity check that plots sigmoid over the integers -10..10.
# The code below sits inside a bare triple-quoted string, so it is never executed.
'''
#测试sigmoid函数
nums = np.arange(-10, 11, step=1)
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(nums, sigmoid(nums), 'k')
plt.show()
'''

# Logistic-regression cost (mean cross-entropy).
def cost(theta, X, y):
    """Return the mean cross-entropy loss of ``theta`` on ``(X, y)``.

    All three inputs are converted with ``np.matrix``; the product
    ``X * theta.T`` therefore requires ``theta`` to become a row whose length
    equals the number of columns of ``X``.  The summed loss is divided by
    ``len(X)`` (the number of rows).
    """
    theta_row = np.matrix(theta)
    features = np.matrix(X)
    labels = np.matrix(y)

    # Hypothesis values are computed once and reused by both loss terms.
    hypothesis = sigmoid(features * theta_row.T)
    positive_term = np.multiply(-labels, np.log(hypothesis))
    negative_term = np.multiply(1 - labels, np.log(1 - hypothesis))
    return np.sum(positive_term - negative_term) / len(features)

# Prepend a column of ones so that theta[0] multiplies a constant term.
data.insert(0, 'ones', 1)

cols = data.shape[1]

# Every column except the last is a feature; the last column is the label.
X = data.iloc[:, 0:cols-1]
y = data.iloc[:, cols-1:cols]

X = np.array(X.values)
y = np.array(y.values)
# One parameter per column of X (a 1-D vector), sized from the data rather
# than hard-coded so it stays correct if the number of features changes.
theta = np.zeros(X.shape[1])


# Gradient of the cost with respect to theta (a vector, not a scalar).
def gradient(theta, X, y):
    """Return the gradient of the logistic cost at ``theta``.

    The inputs are converted with ``np.matrix``.  The result is the
    ``1 x n`` matrix ``(sigmoid(X * theta.T) - y).T * X / len(X)``, where
    ``n`` is the number of columns of ``X``.
    """
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)

    # Residual between predicted probability and label, one row per sample.
    error = sigmoid(X * theta.T) - y

    return error.T.dot(X) / len(X)

# Let SciPy's TNC optimizer search for the theta that minimizes the cost;
# element 0 of the returned tuple is used below as the fitted theta.
result = opt.fmin_tnc(
    func=cost,
    x0=theta,
    fprime=gradient,
    args=(X, y),
)

# print(cost(result[0], X, y))

# Evaluate the fitted theta: predict labels for the rows of X.
def predict(theta, X):
    """Return a list of 0/1 predictions, one per row of ``X``.

    A row is labelled 1 when ``sigmoid(X * theta.T)`` is strictly greater
    than 0.5 and 0 otherwise.
    """
    weights = np.matrix(theta)
    samples = np.matrix(X)

    probability = sigmoid(samples * weights.T)
    above_threshold = np.asarray(probability).ravel() > 0.5
    return above_threshold.astype(int).tolist()


theta_min = result[0]
predictions = predict(theta_min, X)

# A prediction counts as correct when it equals the 0/1 label of the same row.
correct = [1 if a == b else 0 for a, b in zip(predictions, np.ravel(y))]
# Percentage of correct predictions.  The previous `sum % len` was a modulo
# and only looked right when there were exactly 100 samples.
accuracy = 100 * sum(correct) / len(correct)
print('accuracy = {0}%'.format(accuracy))  # original author reports 89% on the training set


# Plot the samples together with the fitted decision boundary.
# Dividing theta by theta[2] makes the 'Exam 2' coefficient equal to 1, so the
# boundary theta0 + theta1 * exam1 + theta2 * exam2 = 0 becomes
# exam2 = -(theta0' + theta1' * exam1).
theta_temp = theta_min
theta_temp = theta_temp / theta_temp[2]

x = np.arange(130, step=0.1)
# NOTE: this rebinds the module-level name `y` (previously the label array).
y = -(theta_temp[0] + theta_temp[1] * x)
# Scatter the original samples, coloured by label.
sns.set(context='notebook', style='ticks', font_scale=1.5)
# x/y are passed by keyword and the figure size via `height`: current seaborn
# releases reject positional x/y and no longer accept the old `size` argument.
sns.lmplot(x='Exam 1', y='Exam 2', hue='Admitted', data=data,
           height=6,
           fit_reg=False,
           scatter_kws={"s": 25}
           )
# Draw the decision boundary on top of the scatter plot.
plt.plot(x, y, 'grey')
plt.xlim(0, 130)
plt.ylim(0, 130)
plt.title('Decision Boundary')
plt.show()

二：非线性logistic 回归（正则化）

代码如下：

import pandas as pd
import numpy as np
import scipy.optimize as opt
import matplotlib.pyplot as plt


# Load the second data set; the file has no header row, so names are supplied.
path = 'ex2data2.txt'
data = pd.read_csv(
    path,
    header=None,
    names=['Test 1', 'Test 2', 'Accepted'],
)

# Split the rows by label: 'Accepted' == 1 is positive, 'Accepted' == 0 is negative.
positive = data[data['Accepted'] == 1]
negative = data[data['Accepted'] == 0]

# Disabled exploratory scatter plot of the accepted/unaccepted samples.
# The code below sits inside a bare triple-quoted string, so it is never executed.
'''
#显示原始数据的分布
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(positive['Test 1'], positive['Test 2'], s=50, c='b', marker='o', label='Accepted')
ax.scatter(negative['Test 1'], negative['Test 2'], s=50, c='r', marker='x', label='Unaccepted')
ax.legend() #显示右上角的Accepted 和 Unaccepted标签
ax.set_xlabel('Test 1 Score')
ax.set_ylabel('Test 2 Score')
plt.show()
'''
# `degree` is an exclusive upper bound: total powers 1 .. degree - 1 are generated.
degree = 5
x1 = data['Test 1']
x2 = data['Test 2']
# Insert a column of ones at position 3, i.e. right after the label column.
data.insert(3, 'Ones', 1)

# Build every monomial x1**(i-j) * x2**j with total power i = 1 .. degree - 1.
# j must run up to and including i; stopping at i - 1 would drop the pure
# x2**i terms (including the plain 'Test 2' feature itself).
for i in range(1, degree):
    for j in range(0, i + 1):
        data['F' + str(i) + str(j)] = np.power(x1, i-j) * np.power(x2, j)

# The raw inputs are now represented by the generated columns; drop them.
data.drop('Test 1', axis=1, inplace=True)
data.drop('Test 2', axis=1, inplace=True)


# Sigmoid (logistic) function.
def sigmoid(h):
    """Return the element-wise logistic function ``1 / (1 + exp(-h))``.

    ``h`` is handed to ``np.exp``, so anything NumPy can exponentiate
    (scalar, array, matrix) is accepted and the result has the matching shape.
    """
    exp_neg = np.exp(-h)
    return 1 / (1 + exp_neg)


def cost(theta, X, y, learnRate):
    """Return the regularized mean cross-entropy loss of ``theta`` on ``(X, y)``.

    The inputs are converted with ``np.matrix``.  ``learnRate`` scales the
    penalty term ``learnRate / (2 * len(X)) * sum(theta[1:] ** 2)``; the first
    parameter is excluded from the penalty.
    """
    theta_row = np.matrix(theta)
    features = np.matrix(X)
    labels = np.matrix(y)
    n_samples = len(features)

    # Hypothesis values are computed once and reused by both loss terms.
    hypothesis = sigmoid(features * theta_row.T)
    positive_term = np.multiply(-labels, np.log(hypothesis))
    negative_term = np.multiply(1 - labels, np.log(1 - hypothesis))

    # Squared magnitude of every parameter except the first.
    penalty = (learnRate / (2 * n_samples)) * np.sum(np.power(theta_row[:, 1:], 2))
    return np.sum(positive_term - negative_term) / n_samples + penalty


# Passed to cost/gradientReg as `learnRate`, where it scales the penalty term.
learnRate = 1
cols = data.shape[1]

# Column 0 holds the label; every remaining column is a feature.
y = np.array(data.iloc[:, 0:1])
X = np.array(data.iloc[:, 1:cols])

# Start the optimizer from an all-zero vector with one entry per feature column.
theta = np.zeros(X.shape[1])


# Predict labels for the rows of X with a given theta.
def predict(theta, X):
    """Return a list of 0/1 predictions, one per row of ``X``.

    A row is labelled 1 when ``sigmoid(X * theta.T)`` is strictly greater
    than 0.5 and 0 otherwise.
    """
    weights = np.matrix(theta)
    samples = np.matrix(X)

    probability = sigmoid(samples * weights.T)
    above_threshold = np.asarray(probability).ravel() > 0.5
    return above_threshold.astype(int).tolist()


def gradientReg(theta, X, y, learnRate):
    """Return the gradient of the regularized logistic cost at ``theta``.

    The inputs are converted with ``np.matrix``.  The result is a ``1 x n``
    matrix: the unregularized gradient ``(sigmoid(X * theta.T) - y).T * X / m``
    plus ``(learnRate / m) * theta`` for every parameter except the first,
    matching ``cost``, whose penalty also skips the first parameter.
    ``m`` is ``len(X)``.
    """
    theta = np.matrix(theta)
    X = np.matrix(X)
    y = np.matrix(y)

    n_samples = len(X)

    # Data term of the gradient, one entry per parameter.
    error = sigmoid(X * theta.T) - y
    grad = (error.T * X) / n_samples

    # Penalty term.  The multiplication creates a new matrix, so zeroing the
    # first entry does not modify `theta`.  The previous code indexed theta
    # with a leaked global loop variable and adjusted the whole row.
    penalty = (learnRate / n_samples) * theta
    penalty[0, 0] = 0

    return grad + penalty

# Minimize the regularized cost with SciPy's TNC optimizer.
result = opt.fmin_tnc(func=cost, x0=theta, fprime=gradientReg, args=(X, y, learnRate))
print(result)

# Element 0 of the optimizer result is used as the fitted parameter vector.
theta_min = np.matrix(result[0])
predictions = predict(theta_min, X)
# A prediction counts as correct when it equals the 0/1 label of the same row.
correct = [1 if a == b else 0 for a, b in zip(predictions, np.ravel(y))]
# Percentage of correct predictions.  The previous `sum % len` was a modulo,
# which is not a percentage unless there are exactly 100 samples.
accuracy = 100 * sum(correct) / len(correct)

print('accuracy = {0}%'.format(accuracy))

相关阅读:
Win10 iot 配置防火墙限制应用部署
 未能加载文件或程序集“********”或它的某一个依赖项。试图加载格式不正确的程序。
IIS 支持 m3u8
UWP WebView 禁用缩放
 Code First
关于永恒之蓝和 MS17-010 补丁
 《罗辑思维成大事者不纠结》读书笔记
 <王川自选集第一卷电子书 >读书笔记
 <王二的经济学故事>读书笔记
 <以交易为生>读书笔记
原文地址：https://www.cnblogs.com/qiang-wei/p/9839458.html