Code copied from the linked tutorial (戳我)
Thanks also to maomao9173 for the detailed study notes
PART1
from __future__ import print_function
import torch
x = torch.empty(5, 3)
# empty -> uninitialized memory, arbitrary values
x = torch.rand(5, 3)
# rand -> uniform random values in [0, 1)
x = torch.randn(5, 3)
# randn -> samples from the standard normal distribution N(0, 1)
x = torch.zeros(5, 3, dtype = torch.long)
# zeros -> all zeros
x = torch.tensor([5.5, 3])
# construct a tensor directly from data
x = x.new_ones(5, 3, dtype = torch.double)
# new_ones: returns a 5x3 tensor of ones; unspecified attributes (dtype, device) are inherited from x
x = torch.randn_like(x, dtype = torch.float)
# randn_like: same shape as x, filled with standard-normal random values
print(x.size())
# print the tensor's shape (a torch.Size, which behaves like a tuple)
y = torch.rand(5, 3)
print(x + y)
print(torch.add(x, y))
# element-wise addition; the two statements above give the same result
result = torch.empty(5, 3)
torch.add(x, y, out = result)
# store the result of x + y in result
y.add_(x)
y.copy_(x)
# add x to y in place / copy x into y
# convention: functions ending in "_" modify the tensor in place
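# A minimal sketch of the in-place convention (t and u are illustrative names,
# not from the tutorial): add_ mutates its tensor, add returns a new one.
t = torch.zeros(2, 2)
t.add_(1)          # t is now all ones
u = t.add(1)       # u is all twos, t is unchanged
print(t, u)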
print(x[:, 1])
# print column index 1 (the second column) for every row
x = torch.randn(4, 4)
y = x.view(16)
z = x.view(-1, 8)
# view reshapes the tensor (same underlying data, new shape)
# the first dimension of z is inferred automatically from the -1
x = torch.randn(1)
print(x)
print(x.item())
# item() extracts the Python number held by a one-element tensor
a = torch.ones(5)
b = a.numpy()
# tensor <-> numpy: effortless
a.add_(1)
print(a)
print(b)
# A CPU torch tensor and the NumPy array converted from it share the
# same underlying memory, so modifying one modifies the other.
# (A GPU tensor has to be copied to the CPU before converting to NumPy.)
import numpy as np
a = np.ones(5)
b = torch.from_numpy(a)
np.add(a, 1, out = a)
print(a)
print(b)
# All the tensors on the CPU except a CharTensor
# support converting to Numpy and back
if torch.cuda.is_available():
    device = torch.device("cuda")
    y = torch.ones_like(x, device = device)
    # ones_like: returns a tensor of ones with the same shape and dtype as the input
    print(x)
    print(y)
    # this creates a tensor directly on the GPU
    x = x.to(device)
    z = x + y
    print(z)
    print(z.to("cpu", torch.double))
    # .to can also change the dtype at the same time!
PART2
import torch
import numpy as np
'''
torch.Tensor is the central class of the autograd package.
If [.requires_grad] is set to True, autograd tracks every
operation on the tensor; once the computation is done,
calling [.backward] runs backpropagation and computes the
gradients automatically. The result is accumulated into
the [.grad] attribute.
'''
# Use [.detach()] to stop a tensor from tracking history.
# You can also wrap code in [with torch.no_grad():]
# to avoid tracking history and allocating memory for it.
# .grad_fn: the Function that created this tensor
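# A minimal sketch of both ways to stop tracking (w is an illustrative name):
w = torch.ones(2, requires_grad = True)
print(w.detach().requires_grad)    # False: the detached copy has no history
with torch.no_grad():
    print((w * 2).requires_grad)   # False: ops inside the block are not tracked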
x = torch.ones(2, 2, requires_grad = True)
print(x)
y = x + 2
print(y)
# y was created by an operation rather than directly by the user,
# so it has a grad_fn
print(y.grad_fn)
z = y * y * 3
out = z.mean()
print(z, out)
# mean() returns the average of all the elements
a = torch.randn(2, 2)
a = ((a * 3) / (a - 1))
print(a.requires_grad)
a.requires_grad_(True)
print(a.requires_grad)
b = (a * a).sum()
print(b.grad_fn)
# .requires_grad_(...) changes the requires_grad flag in place
# requires_grad defaults to False
out.backward()  # run backpropagation
print(x.grad)
# out = (1/4) * sum(3 * (x + 2)^2), so d(out)/dx = 1.5 * (x + 2) = 4.5 per element
# Calling backward() on a non-scalar tensor needs an explicit gradient argument:
z2 = x * x
z2.backward(torch.ones_like(z2))
print(x.grad)   # gradients accumulate on top of the previous 4.5
x = torch.randn(3, requires_grad = True)
y = x * 2
while y.data.norm() < 1000:
    y = y * 2
# .data.norm() returns the L2 norm: the square root of the sum of squared elements
print(y)
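# A sketch of the step the tutorial usually takes next: y is not a scalar, so
# backward() needs a gradient vector (a vector-Jacobian product). The values
# of v below are illustrative.
v = torch.tensor([0.1, 1.0, 0.0001], dtype = torch.float)
y.backward(v)
print(x.grad)   # each entry is v[i] * 2^k, where k is how many times y was doubled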
# ones : all ones
# zeros: all zeros
PART3
import torch
import torch.nn as nn
import torch.nn.functional as F
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # Whoa, I don't get this part yet!
        # 1 input image channel, 6 output channels, 3x3 square convolution
        # kernel
        self.conv1 = nn.Conv2d(1, 6, 3)
        self.conv2 = nn.Conv2d(6, 16, 3)
        # input channels / output channels / kernel size
        # in PyTorch, layers use Kaiming (He) initialization by default
        self.fc1 = nn.Linear(16 * 6 * 6, 120)
        # input feature size / output feature size
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)
        # an affine operation: y = Wx + b
    def forward(self, x):
        # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # if the size is a square you can only specify a single number
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = x.view(-1, self.num_flat_features(x))
        # num_flat_features flattens everything except the batch dimension
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    def num_flat_features(self, x):
        size = x.size()[1:]
        # whoa, another one I hadn't seen! (all dimensions except the batch dimension)
        num_features = 1
        for s in size:
            num_features *= s
        return num_features
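# A small sketch of what num_flat_features computes (the tensor t2 is illustrative):
t2 = torch.zeros(1, 16, 6, 6)
print(t2.size()[1:])                 # torch.Size([16, 6, 6])
print(torch.flatten(t2, 1).shape)    # torch.Size([1, 576]), same as view(-1, 576) in forward()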
net = Net()
print(net)
params = list(net.parameters())
# print(params)
# print(params.size())
print(params[0].size())
input = torch.randn(1, 1, 32, 32)
out = net(input)
print(out)
net.zero_grad()
out.backward(torch.randn(1, 10))
output = net(input)
target = torch.randn(10)
# target has shape [10]; we reshape it to [1, 10] to match the network output
target = target.view(1, -1)
# print(target.size())
criterion = nn.MSELoss()
loss = criterion(output, target)
# print(loss)
# print(loss.grad_fn)
# print(loss.grad_fn.next_functions[0][0])
# print(loss.grad_fn.next_functions[0][0].next_functions[0][0])
net.zero_grad()
# zero the gradients first, otherwise they accumulate across iterations
print('conv1.bias.grad before backward:')
print(net.conv1.bias.grad)
loss.backward()
print('conv1.bias.grad after backward:')
print(net.conv1.bias.grad)
learning_rate = 0.01
for f in net.parameters():
    f.data.sub_(f.grad.data * learning_rate)
# implements weight = weight - learning_rate * gradient
import torch.optim as optim
# this package implements SGD / Nesterov-SGD / Adam / RMSProp, etc.
optimizer = optim.SGD(net.parameters(), lr = 0.01)
# the net's parameters / the learning rate
# in your training loop:
optimizer.zero_grad() # zero the gradient buffers
output = net(input)
loss = criterion(output, target)
loss.backward()
optimizer.step() # Do the update
# note: call zero_grad() at the start of every iteration
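# A small sketch of why zero_grad() matters (p is an illustrative tensor, not part
# of the net): backward() adds into .grad, so skipping zero_grad() accumulates.
p = torch.ones(1, requires_grad = True)
(p * 3).backward()
(p * 3).backward()
print(p.grad)   # tensor([6.]) instead of tensor([3.]) because the two passes accumulated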
PART4
# What about data?
# Generally, when you have to deal with image, text, audio
# or video data, you can use standard python packages that
# load data into a numpy array. Then you can convert this
# array into a torch.*Tensor.
# For images, packages such as Pillow, OpenCV are useful
# For audio, packages such as scipy and librosa
# For text, either raw Python or Cython based loading, or
# NLTK and SpaCy are useful
# CIFAR-10 and CIFAR-100 are labeled image datasets
import torch
import torchvision
import torchvision.transforms as transforms
# The torchvision datasets yield images with values in [0, 1].
# We convert them to tensors whose values lie in [-1, 1].
# Note: if running on Windows and a BrokenPipeError is raised,
# try setting num_workers of torch.utils.data.DataLoader() to 0.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        # convert the PIL image to a tensor
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        # the first (0.5, 0.5, 0.5) are the per-channel (RGB) means
        # the second (0.5, 0.5, 0.5) are the per-channel standard deviations
        # image = (image - mean) / std
        # this maps the values from [0, 1] onto [-1, 1]
    ]
)
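# A quick check of the normalization formula with illustrative pixel values:
px = torch.tensor([0.0, 0.5, 1.0])
print((px - 0.5) / 0.5)   # tensor([-1., 0., 1.]): the [0, 1] range maps onto [-1, 1]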
trainset = torchvision.datasets.CIFAR10(
    root = './data',
    train = True,           # training split
    download = True,
    transform = transform
)
trainloader = torch.utils.data.DataLoader(
    trainset,               # data source
    batch_size = 4,         # samples per batch: default = 1
    shuffle = True,         # reshuffle the data every epoch
    # num_workers = 1       # number of loader subprocesses
)
testset = torchvision.datasets.CIFAR10(
    root = './data',
    train = False,          # test split
    download = True,
    transform = transform
)
testloader = torch.utils.data.DataLoader(
    testset,
    batch_size = 4,
    shuffle = False,        # do not shuffle the test data
    # num_workers = 1
)
classes = ('plane', 'car', 'bird', 'cat', 'deer',
           'dog', 'frog', 'horse', 'ship', 'truck')
import matplotlib.pyplot as plt
import numpy as np
# functions to show an image
def imshow(img):
    img = img / 2 + 0.5     # unnormalize
    npimg = img.numpy()
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
    # transpose permutes the axes: the tensor's (C, H, W) layout
    # becomes the (H, W, C) layout that matplotlib expects
    plt.show()
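# A small sketch of the axis permutation above (the array shape is illustrative):
demo = np.zeros((3, 32, 32))                  # tensor image layout: (C, H, W)
print(np.transpose(demo, (1, 2, 0)).shape)    # (32, 32, 3), i.e. (H, W, C) for matplotlib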
# get some random training images
'''
dataiter = iter(trainloader)
images, labels = next(dataiter)
# dataiter walks through the dataset batch by batch
# since shuffle is True, each run draws the batches in a random order
# show images
imshow(torchvision.utils.make_grid(images))
# make_grid: arranges the batch of images into a single grid image
# print labels
print(' '.join(classes[labels[j]] for j in range(4)))
# .join: concatenates the strings, separated by ' '
'''
import torch.nn as nn
import torch.nn.functional as F
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        # self.conv1 = nn.Conv2d(3, 6, 5)
        self.conv1 = nn.Conv2d(3, 100, 5)
        self.pool = nn.MaxPool2d(2, 2)
        # self.conv2 = nn.Conv2d(6, 16, 5)
        self.conv2 = nn.Conv2d(100, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)
    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
net = Net()
import torch.optim as optim
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr = 0.001, momentum = 0.9)
# network parameters / learning rate / momentum factor
path = './cifar_net.pth'
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net.to(device)
# this recursively converts all modules (their parameters and buffers) to GPU tensors
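# Note: .to(device) moves the module's parameters and buffers; input and label
# tensors still have to be moved separately (see inputs.to(device) in the loop below).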
if input() == "Train":
    for epoch in range(10):   # loop over the dataset multiple times
        running_loss = 0.0
        for i, data in enumerate(trainloader, 0):
            # enumerate(sequence, start = 0)
            # i is the batch index, data is the fetched batch
            inputs, labels = data
            inputs = inputs.to(device)
            labels = labels.to(device)
            # the network is small, so the GPU speed-up is not dramatic
            # zero the parameter gradients first
            optimizer.zero_grad()
            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            # print statistics
            running_loss += loss.item()
            if i % 2000 == 1999:   # print every 2000 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 2000))
                running_loss = 0.0
    print('Finished Training')
    torch.save(net.state_dict(), path)
    # net.state_dict() -> the model's current parameter state
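# Note: if the checkpoint was saved from a GPU model and later loaded on a CPU-only
# machine, torch.load(path, map_location = "cpu") can be used (not exercised here).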
dataiter = iter(testloader)
images, labels = next(dataiter)
# print images
imshow(torchvision.utils.make_grid(images))
print('GroundTruth: ', ' '.join(classes[labels[j]] for j in range(4)))
net = Net()
net.load_state_dict(torch.load(path))
# load back in our saved model
correct = 0
total = 0
with torch.no_grad():
    # do not compute gradients for the evaluation below
    for data in testloader:
        images, labels = data
        outputs = net(images)
        _, predicted = torch.max(outputs.data, 1)
        # first argument: the output tensor (raw class scores here)
        # second argument: the dimension to reduce; 0 = max of each column, 1 = max of each row
        # returns two tensors: the row-wise maxima and the indices of those maxima
        # (see the small torch.max sketch after the accuracy printout below)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        # tensor.sum() returns the sum of all elements
        # tensor.item() converts a one-element tensor into a Python number
print('Accuracy of the network on the 10000 test images: %d %%' % (
    100 * correct / total))
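# A small sketch of torch.max along a dimension (the logits tensor is illustrative):
demo_logits = torch.tensor([[0.1, 2.0, -1.0], [3.0, 0.0, 0.5]])
values, indices = torch.max(demo_logits, 1)   # row-wise maximum
print(values)    # tensor([2., 3.])
print(indices)   # tensor([1, 0]): the predicted class index for each row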
class_correct = list(0. for i in range(10))
class_total = list(0. for i in range(10))
# list(generator) builds a list from a generator expression
# 0. is a float zero, and "for i in range(10)" produces ten of them,
# so each line above creates a list of ten zeros
with torch.no_grad():
    for data in testloader:
        images, labels = data
        outputs = net(images)
        _, predicted = torch.max(outputs.data, 1)
        c = (predicted == labels).squeeze()
        # squeeze removes size-1 dimensions, leaving a 1-D tensor of booleans
        for i in range(4):
            # batch_size is 4, so every batch drawn from testloader holds four samples
            label = labels[i]
            class_correct[label] += c[i].item()
            # True counts as 1, False as 0
            class_total[label] += 1
for i in range(10):
    print("Accuracy of %s : %2d %%" % (
        classes[i], 100 * class_correct[i] / class_total[i]))
# Side note: frustratingly, simply making the network deeper does not noticeably improve accuracy, and can even keep the loss from converging.