Linear Regression
公式推导
线性函数
[y = omega_1x_1+omega_2x_2 + cdots+omega_ix_i+b
]
可以用下面的方式利用矩阵在表示:
[y=left[
matrix{
omega_1 &&
omega_2 &&
cdots &&
omega_i &&
b
}
ight]left[
matrix{
x_1 \
x_2 \
cdots \
x_i \
1
}
ight]=omega^TX+b
]
对于提供的m
组数据,每个数据有(x_i)和(y_i)两个参数,分别表示输入和输出数据。所以上述的公式中的y
应是预测值,表示为:
[hat{y}=h(X_i)=omega^TX_i+b
]
下面引入损失函数的概念
[egin{align}
J( heta)=frac{1}{2m}sum_{i=0}^{m}(h_ heta(X_i)-y_i) \
J( heta)=frac{1}{2m}sum_{i=0}^{m}(omega^TX_i+b-y_i)
end{align}
]
针对m组数据,计算每组数据的预测值和真实值的差的平方和,再除以2m
,即为损失函数。
通过python代码,将(y= heta x)的参数在0~2
之间的变化的图像画出:
import matplotlib.pyplot as plt
import numpy as np
x = np.arange(0, 10, 0.01)
y = x
# 损失函数
def calCost(theta):
h = [i * theta for i in x]
cost = 0
for i in range(len(h)):
cost += (y[i]-h[i])*(y[i]-h[i])
return cost
thetas = np.arange(0, 2, 0.001)
costs = list(map(calCost, thetas))
plt.plot(thetas, costs)
plt.show()
为了让我们计算出的参数是的线性函数与数据集相拟合,我们可以得出当:
[egin{align}
frac{partial J( heta)}{partial omega^T}=0 \
frac{partial J( heta)}{partial b}=0
end{align}
]
当偏导数为0时,即此时的损失函数计算获得最小值,误差最小,w和b得到最优解,下面是上述方程的展开式:
[egin{align}
frac{partial J(omega,b)}{partialomega}
& = frac{1}{2m}sum_{i=1}^{m}2(omega^TX_i+b-y_i)X_i \
& = frac{1}{m}sum_{i=1}^m(omega^TX_i^2+bX_i-X_iy_i)\
& = frac{omega^T}{m}sum_{i=1}^{m}X_i^2-frac{1}{m}sum_{i=1}^{m}(y_i-b)X_i=0 \
frac{partial J(omega,b)}{partial b}
& = frac{1}{2m}sum_{i=1}^{m}2(omega^TX_i+b-y_i) \
& = frac{1}{m}sum_{i=1}^m(omega^TX_i+b-y_i)\
& = b=frac{1}{m}sum_{i=1}^{m}(y_i-omega X_i) = 0
end{align}
]
解得$$和b等于:
[egin{align}
omega=frac{sum_{i=1}^{m}y_i(X_i-ar{x})}{sum_{i=1}^{m}X_i^2-frac{1}{m}(sum_{i=1}^{m})^2} \
end{align}
]
[b=frac{1}{m}sum_{i=1}^{m}(y_i-omega^TX_i)
]
如果使用梯度下降法
那么每次更新的公式是:
[left[
matrix{
omega \
b
}
ight]_{new}=left[
matrix{
omega \
b
}
ight]_{old}-alphaleft[
matrix{
frac{partial J(omega,b)}{partialomega} \
frac{partial J(omega,b)}{partial b}
}
ight]
]
新参数根据原参数减去学习率乘上原参数对各项求偏导的值,进行更新。
Pytorch从0开始实现
生成随机数据
import torch
import numpy as np
import random
num_inputs = 1 # 参数个数
num_examples = 1000 # 生成数据个数
true_w = [2] # w
true_b = 6.7 # b
# 输入,1000行1列
features = torch.randn(num_examples, num_inputs, dtype=torch.float32)
# 输出:w1x1+w2x2+b+随机数(噪声)
labels = true_w[0] * features[:, 0] + true_b
labels += torch.tensor(np.random.normal(0, 0.01, size=labels.size()), dtype=torch.float32)
# 显示数据
from d2l import *
set_figsize()
plt.scatter(features[:, 0].numpy(), labels.numpy(), 1)
plt.show()
# 小批量数据生成
def data_iter(batch_size, features, labels):
num_examples = len(features) # 数据的总大小
# 生成下标列表
indices = list(range(num_examples))
# 打乱下标列表
random.shuffle(indices)
# 从0到数据总大小,每次增加batch_size个
for i in range(0, num_examples, batch_size):
# 取出数据的第i个到第i+batch_size个
# 最后一次可能不足一个batch,所以要取min
j = torch.LongTensor(indices[i: min(i + batch_size, num_examples)])
# 把下标从0到j-1的数据返回
yield features.index_select(0, j), labels.index_select(0, j)
# 生成w是均值为0,标准差为0.01的正态随机数
w = torch.tensor(np.random.normal(0, 0.01, (num_inputs, 1)), dtype=torch.float32)
# b初始化为0
b = torch.zeros(1, dtype=torch.float32)
# 允许求梯度
w.requires_grad_(requires_grad=True)
b.requires_grad_(requires_grad=True)
# y=wx+b
def linreg(X, w, b):
return torch.mm(X, w) + b
# 损失函数
def squared_loss(y_hat, y):
return (y_hat - y.view(y_hat.size())) ** 2 / 2
# SGD
def sgd(params, lr, batch_size):
for param in params:
param.data -= lr * param.grad / batch_size
lr = 0.03 # 学习率
num_epochs = 5 # 学习次数
net = linreg # 网络模型
loss = squared_loss # 损失函数
batch_size = 10 # 批量大小
# 训练模型一共需要num_epochs个迭代周期
for epoch in range(num_epochs):
# 在每一个迭代周期中,会使用训练数据集中所有样本一次(假设样本数能够被批量大小整除)
# X和y分别是小批量样本的特征和标签
for X, y in data_iter(batch_size, features, labels):
# l是有关小批量X和y的损失
l = loss(net(X, w, b), y).sum()
# 反向传播小批量的损失对模型参数计算梯度
l.backward()
# 使用小批量随机梯度下降迭代模型参数
sgd([w, b], lr, batch_size)
# 梯度清零,很重要
w.grad.data.zero_()
b.grad.data.zero_()
# 计算目前的w和b和正确的w和b的差值
train_l = loss(net(features, w, b), labels)
# 打印差值和
print('epoch %d, loss %f' % (epoch + 1, train_l.mean().item()))
print(true_w, '
', w)
print(true_b, '
', b)
打印结果
epoch 1, loss 0.048183
epoch 2, loss 0.000156
epoch 3, loss 0.000053
epoch 4, loss 0.000053
epoch 5, loss 0.000053
[2]
tensor([[2.0003]], requires_grad=True)
6.7
tensor([6.6993], requires_grad=True)
Pytorch 简洁实现
import torch
import numpy as np
from torch import nn
from torch.nn import init
import torch.optim as optim
# 生成数据
num_inputs = 1
num_examples = 1000
true_w = [2]
true_b = 4.2
features = torch.tensor(np.random.normal(0, 1, (num_examples, num_inputs)), dtype=torch.float)
labels = true_w[0] * features[:, 0] + true_b
labels += torch.tensor(np.random.normal(0, 0.01, size=labels.size()), dtype=torch.float)
import torch.utils.data as Data
# 批量大小
batch_size = 10
# 将训练数据的特征和标签组合
dataset = Data.TensorDataset(features, labels)
# 随机读取小批量
data_iter = Data.DataLoader(dataset, batch_size, shuffle=True)
net = nn.Sequential(
nn.Linear(num_inputs, 1)
)
# 初始化模型参数
# 修改w,正态分布,均值为0,标准差为1
init.normal_(net[0].weight, mean=0, std=0.01)
# 修改b
init.constant_(net[0].bias, val=0)
# 损失函数
loss = nn.MSELoss()
# SGD
optimizer = optim.SGD(net.parameters(), lr=0.03)
# 训练
num_epochs = 5
for epoch in range(1, num_epochs + 1):
for X, y in data_iter:
output = net(X)
l = loss(output, y.view(-1, 1))
optimizer.zero_grad() # 梯度清零,等价于net.zero_grad()
l.backward()
optimizer.step()
print('epoch %d, loss: %f' % (epoch, l.item()))
# 查看结果
dense = net[0]
print(true_w)
print(dense.weight)
print(true_b)
print(dense.bias)
输出值
epoch 1, loss: 0.000351
epoch 2, loss: 0.000137
epoch 3, loss: 0.000202
epoch 4, loss: 0.000056
epoch 5, loss: 0.000073
[2]
Parameter containing:
tensor([[2.0008]], requires_grad=True)
4.2
Parameter containing:
tensor([4.1992], requires_grad=True)