如题所述:
深度学习框架MindSpore由华为公司研发。出于性能设计方面的考虑,MindSpore的一些使用方式与TensorFlow、PyTorch有所不同,其中一点就是:在进行单步计算或者非持续数据流计算的场景下,MindSpore的编程方式有其自身的特点。
参考前文:
https://www.cnblogs.com/devilmaycry812839668/p/14971668.html
我们给出代码:
#!/usr/bin/env python
# encoding: UTF-8
"""LeNet-5 on MNIST with MindSpore.

Defines the dataset pipeline, the network, the loss/optimizer/checkpoint
objects and the train/eval helper functions used by the driver code.
"""

# Command-line hyper-parameter handling.
import argparse
import os

# Execution context configuration.
from mindspore import context

# Dataset loading and preprocessing.
import mindspore.dataset as ds
import mindspore.dataset.transforms.c_transforms as C
import mindspore.dataset.vision.c_transforms as CV
from mindspore.dataset.vision import Inter
from mindspore import dtype as mstype

# Network building blocks.
import mindspore.nn as nn
from mindspore.common.initializer import Normal

# Saving model parameters during training.
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig

# Training / evaluation utilities.
from mindspore.nn import Accuracy
from mindspore.train.callback import LossMonitor
from mindspore import Model

parser = argparse.ArgumentParser(description='MindSpore LeNet Example')
parser.add_argument('--device_target', type=str, default="GPU", choices=['Ascend', 'GPU', 'CPU'])
# parse_known_args() is used so that unrecognised command-line options are ignored.
args = parser.parse_known_args()[0]

# Configure the MindSpore execution context (graph mode on the chosen device).
context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)


def create_dataset(data_path, batch_size=32, repeat_size=1, num_parallel_workers=1):
    """Build the MNIST input pipeline.

    Args:
        data_path: Directory handed to ``ds.MnistDataset``.
        batch_size: Number of samples per batch; incomplete trailing batches
            are dropped.
        repeat_size: Value passed to ``repeat`` after batching.
        num_parallel_workers: Worker count used for every ``map`` step.

    Returns:
        The dataset object after the map, shuffle, batch and repeat steps.
    """
    mnist_ds = ds.MnistDataset(data_path)

    resize_height, resize_width = 32, 32
    # First rescale: pixel * (1 / 255) + 0.
    rescale = 1.0 / 255.0
    shift = 0.0
    # Second rescale: x * (1 / 0.3081) - 0.1307 / 0.3081, i.e. (x - 0.1307) / 0.3081.
    # NOTE(review): 0.1307 / 0.3081 are presumably the MNIST mean / std - confirm.
    rescale_nml = 1 / 0.3081
    shift_nml = -1 * 0.1307 / 0.3081

    # Operations that will be mapped over the dataset columns.
    resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR)
    rescale_nml_op = CV.Rescale(rescale_nml, shift_nml)
    rescale_op = CV.Rescale(rescale, shift)
    hwc2chw_op = CV.HWC2CHW()
    type_cast_op = C.TypeCast(mstype.int32)

    # (column, operation) pairs, applied strictly in this order.
    map_steps = (
        ("label", type_cast_op),
        ("image", resize_op),
        ("image", rescale_op),
        ("image", rescale_nml_op),
        ("image", hwc2chw_op),
    )
    for column, operation in map_steps:
        mnist_ds = mnist_ds.map(operations=operation, input_columns=column,
                                num_parallel_workers=num_parallel_workers)

    # Shuffle, batch and repeat.
    buffer_size = 10000
    mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size)
    mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True)
    mnist_ds = mnist_ds.repeat(repeat_size)

    return mnist_ds


class LeNet5(nn.Cell):
    """LeNet-style network: two conv/ReLU/max-pool stages followed by three dense layers.

    Args:
        num_class: Output size of the last dense layer.
        num_channel: Number of input channels of the first convolution.
    """

    def __init__(self, num_class=10, num_channel=1):
        super().__init__()
        # Layers used by construct().
        self.conv1 = nn.Conv2d(num_channel, 6, 5, pad_mode='valid')
        self.conv2 = nn.Conv2d(6, 16, 5, pad_mode='valid')
        # fc1 expects 16 * 5 * 5 flattened features.
        # NOTE(review): this presumably corresponds to the 32x32 images produced
        # by create_dataset - confirm if the input size is changed.
        self.fc1 = nn.Dense(16 * 5 * 5, 120, weight_init=Normal(0.02))
        self.fc2 = nn.Dense(120, 84, weight_init=Normal(0.02))
        self.fc3 = nn.Dense(84, num_class, weight_init=Normal(0.02))
        self.relu = nn.ReLU()
        self.max_pool2d = nn.MaxPool2d(kernel_size=2, stride=2)
        self.flatten = nn.Flatten()

    def construct(self, x):
        """Forward pass: return the output of the last dense layer for input ``x``."""
        x = self.max_pool2d(self.relu(self.conv1(x)))
        x = self.max_pool2d(self.relu(self.conv2(x)))
        x = self.flatten(x)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x


# Instantiate the network.
net = LeNet5()
# Loss function.
net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
# Optimizer.
net_opt = nn.Momentum(net.trainable_params(), learning_rate=0.01, momentum=0.9)
# Checkpoint policy: save every 125 steps, keep at most 15 checkpoint files.
config_ck = CheckpointConfig(save_checkpoint_steps=125, keep_checkpoint_max=15)
# Checkpoint callback using that policy.
ckpoint = ModelCheckpoint(prefix="checkpoint_lenet", config=config_ck)


def train_net(args, model, epoch_size, data_path, repeat_size, ckpoint_cb, sink_mode,
              batch_size=32, print_interval=125):
    """Train ``model`` on the dataset found under ``<data_path>/train``.

    Args:
        args: Unused; kept so existing callers keep working.
        model: Object whose ``train`` method is called.
        epoch_size: Number of epochs passed to ``model.train``.
        data_path: Root directory that contains the ``train`` sub-directory.
        repeat_size: Repeat count forwarded to ``create_dataset``.
        ckpoint_cb: Checkpoint callback passed to ``model.train``.
        sink_mode: Value forwarded as ``dataset_sink_mode``.
        batch_size: Batch size forwarded to ``create_dataset`` (default 32,
            the value that was previously hard-coded).
        print_interval: Argument given to ``LossMonitor`` (default 125, the
            value that was previously hard-coded).

    NOTE: the accompanying write-up reports that, on MindSpore 1.1, calling
    this function again on the same ``model`` after a call with
    ``sink_mode=True`` fails with a TypeError.
    """
    ds_train = create_dataset(os.path.join(data_path, "train"), batch_size, repeat_size)
    model.train(epoch_size, ds_train,
                callbacks=[ckpoint_cb, LossMonitor(print_interval)],
                dataset_sink_mode=sink_mode)


def test_net(network, model, data_path):
    """Evaluate ``model`` on the dataset under ``<data_path>/test`` and print the result.

    Args:
        network: Unused; kept so existing callers keep working.
        model: Object whose ``eval`` method is called (with sink mode off).
        data_path: Root directory that contains the ``test`` sub-directory.
    """
    ds_eval = create_dataset(os.path.join(data_path, "test"))
    acc = model.eval(ds_eval, dataset_sink_mode=False)
    print("{}".format(acc))
# Driver: train twice on the same Model, then evaluate.
mnist_path = "./datasets/MNIST_Data"
train_epoch = dataset_size = 1  # dataset_size is passed to train_net as repeat_size
model = Model(net, loss_fn=net_loss, optimizer=net_opt, metrics={"Accuracy": Accuracy()})

# First training pass: dataset sink mode off.
train_net(args, model, epoch_size=train_epoch, data_path=mnist_path,
          repeat_size=dataset_size, ckpoint_cb=ckpoint, sink_mode=False)

# Placeholder: unrelated work is assumed to happen here between the two
# training passes, e.g. a call such as fun().

print('*' * 50)

# Second training pass: dataset sink mode on.
train_net(args, model, epoch_size=train_epoch, data_path=mnist_path,
          repeat_size=dataset_size, ckpoint_cb=ckpoint, sink_mode=True)

test_net(network=net, model=model, data_path=mnist_path)
运行结果:
WARNING: 'ControlDepend' is deprecated from version 1.1 and will be removed in a future version, use 'Depend' instead.
[WARNING] ME(19995:139935191224448,MainProcess):2021-07-09-02:26:46.308.564 [mindspore/ops/operations/array_ops.py:2302] WARN_DEPRECATED: The usage of Pack is deprecated. Please use Stack.
epoch: 1 step: 125, loss is 2.295223
epoch: 1 step: 250, loss is 2.2996147
epoch: 1 step: 375, loss is 2.3157573
epoch: 1 step: 500, loss is 2.305396
epoch: 1 step: 625, loss is 2.2769482
epoch: 1 step: 750, loss is 2.3012204
epoch: 1 step: 875, loss is 2.2151213
epoch: 1 step: 1000, loss is 0.53947586
epoch: 1 step: 1125, loss is 0.04369175
epoch: 1 step: 1250, loss is 0.25334337
epoch: 1 step: 1375, loss is 0.37018073
epoch: 1 step: 1500, loss is 0.10670306
epoch: 1 step: 1625, loss is 0.17298605
epoch: 1 step: 1750, loss is 0.018228231
epoch: 1 step: 1875, loss is 0.099561594
**************************************************
epoch: 1 step: 1875, loss is 0.13135403
{'Accuracy': 0.9771634615384616}
从代码上我们可以知道,第一个train_net训练了一个epoch的数据,然后假设我们有其他的一些事情要做,做完其他事情后再回来继续进行训练。这种写两次train_net的方式确实可以实现本文标题所述的场景,但是这里面有个比较重要的参数,就是 model.train 中的 dataset_sink_mode:这两次 train_net 调用中传入的 dataset_sink_mode 参数取False还是True,会导致程序的运行结果不同(有的组合可以正常运行,有的组合会报错)。
这里对 dataset_sink_mode 这个参数的含义不做完整解释,不过简单的理解可以把它当做是否将数据集中的数据先缓存一部分到运行设备端(这里我们假设运行端为NVIDIA显卡GPU)。
===========================================================
经过实验发现,以下的dataset_sink_mode设置是 可以 正常运行的:
# First training pass: dataset sink mode off.
train_net(args, model, epoch_size=train_epoch, data_path=mnist_path,
          repeat_size=dataset_size, ckpoint_cb=ckpoint, sink_mode=False)

# Placeholder: unrelated work is assumed to happen here between the two
# training passes, e.g. a call such as fun().

print('*' * 50)

# Second training pass: dataset sink mode off.
train_net(args, model, epoch_size=train_epoch, data_path=mnist_path,
          repeat_size=dataset_size, ckpoint_cb=ckpoint, sink_mode=False)
# First training pass: dataset sink mode off.
train_net(args, model, epoch_size=train_epoch, data_path=mnist_path,
          repeat_size=dataset_size, ckpoint_cb=ckpoint, sink_mode=False)

# Placeholder: unrelated work is assumed to happen here between the two
# training passes, e.g. a call such as fun().

print('*' * 50)

# Second training pass: dataset sink mode on.
train_net(args, model, epoch_size=train_epoch, data_path=mnist_path,
          repeat_size=dataset_size, ckpoint_cb=ckpoint, sink_mode=True)
经过实验发现,以下的dataset_sink_mode设置是不可以正常运行的:
# First training pass: dataset sink mode on.
train_net(args, model, epoch_size=train_epoch, data_path=mnist_path,
          repeat_size=dataset_size, ckpoint_cb=ckpoint, sink_mode=True)

# Placeholder: unrelated work is assumed to happen here between the two
# training passes, e.g. a call such as fun().

print('*' * 50)

# Second training pass: dataset sink mode off.
# NOTE: this write-up reports that this True-then-False combination does not
# run successfully on MindSpore 1.1.
train_net(args, model, epoch_size=train_epoch, data_path=mnist_path,
          repeat_size=dataset_size, ckpoint_cb=ckpoint, sink_mode=False)
# First training pass: dataset sink mode on.
train_net(args, model, epoch_size=train_epoch, data_path=mnist_path,
          repeat_size=dataset_size, ckpoint_cb=ckpoint, sink_mode=True)

# Placeholder: unrelated work is assumed to happen here between the two
# training passes, e.g. a call such as fun().

print('*' * 50)

# Second training pass: dataset sink mode on.
# NOTE: this write-up reports that this True-then-True combination does not
# run successfully on MindSpore 1.1.
train_net(args, model, epoch_size=train_epoch, data_path=mnist_path,
          repeat_size=dataset_size, ckpoint_cb=ckpoint, sink_mode=True)
针对下面设置:
# First training pass: dataset sink mode on.
train_net(args, model, epoch_size=train_epoch, data_path=mnist_path,
          repeat_size=dataset_size, ckpoint_cb=ckpoint, sink_mode=True)

# Placeholder: unrelated work is assumed to happen here between the two
# training passes, e.g. a call such as fun().

print('*' * 50)

# Second training pass: dataset sink mode on. This is the call that raises
# the TypeError shown in the error output that follows.
train_net(args, model, epoch_size=train_epoch, data_path=mnist_path,
          repeat_size=dataset_size, ckpoint_cb=ckpoint, sink_mode=True)
我们给出报错信息:
WARNING: 'ControlDepend' is deprecated from version 1.1 and will be removed in a future version, use 'Depend' instead. [WARNING] ME(20966:140380477292672,MainProcess):2021-07-09-02:31:41.656.62 [mindspore/ops/operations/array_ops.py:2302] WARN_DEPRECATED: The usage of Pack is deprecated. Please use Stack. epoch: 1 step: 1875, loss is 0.0969878 ************************************************** [ERROR] ANALYZER(20966,python):2021-07-09-02:31:45.010.377 [mindspore/ccsrc/pipeline/jit/static_analysis/evaluator.cc:74] Eval] Function construct, The number of parameters of this function is 0, but the number of provided arguments is 2. NodeInfo: In file /usr/local/python-3.7.5/lib/python3.7/site-packages/mindspore/train/dataset_helper.py(85) def construct(self): Traceback (most recent call last): File "/tmp/pycharm_project_753/le_net2.py", line 149, in <module> train_net(args, model, train_epoch, mnist_path, dataset_size, ckpoint, True) File "/tmp/pycharm_project_753/le_net2.py", line 126, in train_net model.train(epoch_size, ds_train, callbacks=[ckpoint_cb, LossMonitor(125)], dataset_sink_mode=sink_mode) File "/usr/local/python-3.7.5/lib/python3.7/site-packages/mindspore/train/model.py", line 592, in train sink_size=sink_size) File "/usr/local/python-3.7.5/lib/python3.7/site-packages/mindspore/train/model.py", line 391, in _train self._train_dataset_sink_process(epoch, train_dataset, list_callback, cb_params, sink_size) File "/usr/local/python-3.7.5/lib/python3.7/site-packages/mindspore/train/model.py", line 452, in _train_dataset_sink_process outputs = self._train_network(*inputs) File "/usr/local/python-3.7.5/lib/python3.7/site-packages/mindspore/nn/cell.py", line 322, in __call__ out = self.compile_and_run(*inputs) File "/usr/local/python-3.7.5/lib/python3.7/site-packages/mindspore/nn/cell.py", line 578, in compile_and_run self.compile(*inputs) File "/usr/local/python-3.7.5/lib/python3.7/site-packages/mindspore/nn/cell.py", line 565, in compile _executor.compile(self, 
*inputs, phase=self.phase, auto_parallel_mode=self._auto_parallel_mode) File "/usr/local/python-3.7.5/lib/python3.7/site-packages/mindspore/common/api.py", line 505, in compile result = self._executor.compile(obj, args_list, phase, use_vm) TypeError: mindspore/ccsrc/pipeline/jit/static_analysis/evaluator.cc:74 Eval] Function construct, The number of parameters of this function is 0, but the number of provided arguments is 2. NodeInfo: In file /usr/local/python-3.7.5/lib/python3.7/site-packages/mindspore/train/dataset_helper.py(85) def construct(self): The function call stack (See file 'analyze_fail.dat' for details): # 0 In file /usr/local/python-3.7.5/lib/python3.7/site-packages/mindspore/train/dataset_helper.py(87) return self.network(*outputs) ^ 进程已结束,退出代码为 1
可以看到,即使是同一个模型对同一个数据集进行两次持续数据流训练,第一次训练时 dataset_sink_mode 也只能设置为False,也就是说第一次的训练不能在设备GPU上缓存数据,否则第二次进行持续数据流训练的时候就会报错。
============================================================
本文实验环境为 MindSpore1.1 docker版本
宿主机:Ubuntu18.04系统
CPU:I7-8700
GPU:1060ti NVIDIA显卡