• pytorch中tensorboardX进行可视化


    环境依赖:

    pytorch   0.4以上

    tensorboardX:   pip install tensorboardX,以及 pip install tensorflow(安装后才有可视化所需的 tensorboard 命令行工具)

    在项目代码中加入tensorboardX的记录代码,生成文件并返回到浏览器中显示可视化结果。

    官方示例:

    默认设置是在当前工作目录(即运行脚本的目录)下生成一个runs文件夹,里面存储summary的信息。

    在runs的同级目录下命令行中输入:

    tensorboard --logdir runs            (注意:命令是 tensorboard,而不是 tensorboardX)

    终端会输出一个本地网址,将其复制到浏览器中打开,即可可视化loss、acc、lr等数据的变化过程。

    举例说明pytorch中设置summary的方式:

      1 import argparse
      2 import os
      3 import numpy as np
      4 from tqdm import tqdm
      5 
      6 from mypath import Path
      7 from dataloaders import make_data_loader
      8 from modeling.sync_batchnorm.replicate import patch_replication_callback
      9 from modeling.deeplab import *
     10 from modeling.psp_net import *
     11 from utils.loss import SegmentationLosses
     12 from utils.calculate_weights import calculate_weigths_labels
     13 from utils.lr_scheduler import LR_Scheduler
     14 from utils.saver import Saver
     15 from utils.summaries import TensorboardSummary
     16 from utils.metrics import Evaluator
     17 from utils.misc import CrossEntropyLoss2d
     18 
     class Trainer(object):
         """Run training and validation epochs and log them through tensorboardX.

         ``__init__`` builds the saver, summary writer, data loaders, network,
         optimizer, loss, evaluator and LR scheduler from the parsed arguments;
         ``training`` and ``validation`` each process one epoch and write
         scalars (plus sample images during training) via ``self.writer``.
         """

         def __init__(self, args):
             """Build every component needed for training from ``args``.

             Args:
                 args: parsed command-line namespace. Attributes read directly
                     here: ``workers``, ``cuda``, ``gpu_ids``, ``lr``,
                     ``momentum``, ``weight_decay``, ``use_balanced_weights``,
                     ``dataset``, ``loss_type``, ``lr_scheduler``, ``epochs``,
                     ``resume`` and ``ft``. ``start_epoch`` is overwritten when
                     resuming or fine-tuning. ``args`` is also handed to
                     ``Saver`` and ``make_data_loader``.

             Raises:
                 RuntimeError: if ``args.resume`` is set but is not a file.
             """
             self.args = args

             # Define Saver
             self.saver = Saver(args)
             self.saver.save_experiment_config()
             # Define Tensorboard Summary (tensorboardX on the PyTorch side),
             # created from the saver's experiment directory.
             self.summary = TensorboardSummary(self.saver.experiment_dir)
             self.writer = self.summary.create_summary()

             # Define Dataloader; adapt this loader when switching datasets.
             kwargs = {'num_workers': args.workers, 'pin_memory': True}
             self.train_loader, self.val_loader, self.test_loader, self.nclass = make_data_loader(args, **kwargs)

             # Define network; only the number of classes needs adapting.
             # Move it to the GPU only when CUDA is in use so CPU-only runs work,
             # and do so before the optimizer is built.
             model = PSPNet(num_classes=self.nclass)
             if args.cuda:
                 model = model.cuda()
             # DeepLab v3+ setup from the original code, kept for reference:
             # model = DeepLab(num_classes=self.nclass,
             #                 backbone=args.backbone,
             #                 output_stride=args.out_stride,
             #                 sync_bn=args.sync_bn,
             #                 freeze_bn=args.freeze_bn)
             # train_params = [{'params': model.get_1x_lr_params(), 'lr': args.lr},
             #                 {'params': model.get_10x_lr_params(), 'lr': args.lr * 10}]
             # optimizer = torch.optim.SGD(train_params, momentum=args.momentum,
             #                             weight_decay=args.weight_decay, nesterov=args.nesterov)

             # PSPNet optimizer: parameters whose name ends in 'bias' train at
             # twice the base LR without an explicit weight decay; everything
             # else uses the base LR. The base LR must come from args.lr.
             bias_params = [param for name, param in model.named_parameters() if name[-4:] == 'bias']
             other_params = [param for name, param in model.named_parameters() if name[-4:] != 'bias']
             # NOTE(review): nesterov is hard-coded to True here, so the
             # --nesterov flag has no effect on this optimizer.
             optimizer = torch.optim.SGD([
                 {'params': bias_params, 'lr': 2 * args.lr},
                 {'params': other_params, 'lr': args.lr, 'weight_decay': args.weight_decay},
             ], momentum=args.momentum, nesterov=True)

             # Define Criterion (the loss is defined in utils/loss.py and used
             # through self.criterion); optionally with class-balanced weights.
             if args.use_balanced_weights:
                 classes_weights_path = os.path.join(Path.db_root_dir(args.dataset), args.dataset+'_classes_weights.npy')
                 if os.path.isfile(classes_weights_path):
                     weight = np.load(classes_weights_path)
                 else:
                     weight = calculate_weigths_labels(args.dataset, self.train_loader, self.nclass)
                 weight = torch.from_numpy(weight.astype(np.float32))
             else:
                 weight = None
             self.criterion = SegmentationLosses(weight=weight, cuda=args.cuda).build_loss(mode=args.loss_type)
             self.model, self.optimizer = model, optimizer

             # Define Evaluator
             self.evaluator = Evaluator(self.nclass)
             # Define lr scheduler
             self.scheduler = LR_Scheduler(args.lr_scheduler, args.lr,
                                           args.epochs, len(self.train_loader))

             # Using cuda: wrap for multi-GPU execution.
             if args.cuda:
                 self.model = torch.nn.DataParallel(self.model, device_ids=self.args.gpu_ids)
                 patch_replication_callback(self.model)
                 self.model = self.model.cuda()

             # Resuming checkpoint
             self.best_pred = 0.0
             if args.resume is not None:
                 if not os.path.isfile(args.resume):
                     raise RuntimeError("=> no checkpoint found at '{}'" .format(args.resume))
                 # NOTE: torch.load unpickles the file; only resume from
                 # checkpoints you trust. On CPU-only runs tensors are remapped
                 # to the CPU so GPU-saved checkpoints still load.
                 checkpoint = torch.load(args.resume, map_location=None if args.cuda else 'cpu')
                 args.start_epoch = checkpoint['epoch']
                 self._unwrapped_model().load_state_dict(checkpoint['state_dict'])
                 if not args.ft:
                     self.optimizer.load_state_dict(checkpoint['optimizer'])
                 self.best_pred = checkpoint['best_pred']
                 print("=> loaded checkpoint '{}' (epoch {})"
                       .format(args.resume, checkpoint['epoch']))

             # Clear start epoch if fine-tuning
             if args.ft:
                 args.start_epoch = 0

         def _unwrapped_model(self):
             """Return the bare network, stripping a ``DataParallel`` wrapper if present."""
             if isinstance(self.model, torch.nn.DataParallel):
                 return self.model.module
             return self.model

         @staticmethod
         def _image_log_interval(num_batches, images_per_epoch=10):
             """Return the number of iterations between two image summaries.

             The result is never below 1, so an epoch with fewer than
             ``images_per_epoch`` batches cannot cause a modulo-by-zero.
             """
             return max(num_batches // images_per_epoch, 1)

         def training(self, epoch):
             """Train for one epoch and log losses and sample predictions.

             Args:
                 epoch: epoch index; combined with the batch index to form the
                     global step used for the per-iteration summaries.
             """
             train_loss = 0.0
             num_images = 0
             self.model.train()
             tbar = tqdm(self.train_loader)
             num_img_tr = len(self.train_loader)
             log_interval = self._image_log_interval(num_img_tr)
             # The reference PSPNet loop trains on main_loss + 0.4 * aux_loss;
             # here only the main output contributes to the loss.
             for i, sample in enumerate(tbar):
                 image, target = sample['image'], sample['label']
                 if self.args.cuda:
                     image, target = image.cuda(), target.cuda()
                 self.scheduler(self.optimizer, i, epoch, self.best_pred)

                 self.optimizer.zero_grad()
                 # outputs is the main prediction (one channel per class, checked
                 # below); aux is the second output and is not used here.
                 outputs, aux = self.model(image)
                 assert outputs.size()[2:] == target.size()[1:]
                 assert outputs.size()[1] == self.nclass
                 loss = self.criterion(outputs, target)
                 loss.backward()
                 self.optimizer.step()

                 loss_value = loss.item()
                 train_loss += loss_value
                 num_images += image.shape[0]
                 global_step = i + num_img_tr * epoch
                 tbar.set_description('Train loss: %.3f' % (train_loss / (i + 1)))
                 self.writer.add_scalar('train/total_loss_iter', loss_value, global_step)

                 # Show roughly 10 * 3 inference results each epoch
                 if i % log_interval == 0:
                     self.summary.visualize_image(self.writer, self.args.dataset, image, target, outputs, global_step)

             # The epoch scalar is the summed (not averaged) batch loss.
             self.writer.add_scalar('train/total_loss_epoch', train_loss, epoch)
             print('[Epoch: %d, numImages: %5d]' % (epoch, num_images))
             print('Loss: %.3f' % train_loss)

             if self.args.no_val:
                 # Without validation there is no "best" model: save a
                 # checkpoint every epoch instead.
                 is_best = False
                 self.saver.save_checkpoint({
                     'epoch': epoch + 1,
                     'state_dict': self._unwrapped_model().state_dict(),
                     'optimizer': self.optimizer.state_dict(),
                     'best_pred': self.best_pred,
                 }, is_best)

         def validation(self, epoch):
             """Evaluate on the validation loader and checkpoint on a new best mIoU.

             Args:
                 epoch: epoch index used as the step of the logged scalars.
             """
             self.model.eval()
             self.evaluator.reset()
             tbar = tqdm(self.val_loader, desc='\r')
             test_loss = 0.0
             num_images = 0
             for i, sample in enumerate(tbar):
                 image, target = sample['image'], sample['label']
                 if self.args.cuda:
                     image, target = image.cuda(), target.cuda()
                 # NOTE(review): assumes the network returns a single tensor in
                 # eval mode (training unpacks two outputs) - confirm for PSPNet.
                 with torch.no_grad():
                     output = self.model(image)
                 loss = self.criterion(output, target)
                 test_loss += loss.item()
                 num_images += image.shape[0]
                 tbar.set_description('Test loss: %.3f' % (test_loss / (i + 1)))
                 # Class prediction = arg-max over axis 1 of the network output.
                 pred = np.argmax(output.detach().cpu().numpy(), axis=1)
                 target = target.cpu().numpy()
                 # Add batch sample into evaluator
                 self.evaluator.add_batch(target, pred)

             # Fast test during the training
             Acc = self.evaluator.Pixel_Accuracy()
             Acc_class = self.evaluator.Pixel_Accuracy_Class()
             mIoU = self.evaluator.Mean_Intersection_over_Union()
             FWIoU = self.evaluator.Frequency_Weighted_Intersection_over_Union()
             self.writer.add_scalar('val/total_loss_epoch', test_loss, epoch)
             self.writer.add_scalar('val/mIoU', mIoU, epoch)
             self.writer.add_scalar('val/Acc', Acc, epoch)
             self.writer.add_scalar('val/Acc_class', Acc_class, epoch)
             self.writer.add_scalar('val/fwIoU', FWIoU, epoch)
             print('Validation:')
             print('[Epoch: %d, numImages: %5d]' % (epoch, num_images))
             print("Acc:{}, Acc_class:{}, mIoU:{}, fwIoU: {}".format(Acc, Acc_class, mIoU, FWIoU))
             print('Loss: %.3f' % test_loss)

             # Keep the checkpoint only when mIoU improves on the best so far.
             new_pred = mIoU
             if new_pred > self.best_pred:
                 is_best = True
                 self.best_pred = new_pred
                 self.saver.save_checkpoint({
                     'epoch': epoch + 1,
                     'state_dict': self._unwrapped_model().state_dict(),
                     'optimizer': self.optimizer.state_dict(),
                     'best_pred': self.best_pred,
                 }, is_best)
    219 
    def _str2bool(value):
        """Parse a command-line boolean such as 'true'/'false', 'yes'/'no', '1'/'0'.

        ``type=bool`` would turn every non-empty string (including 'False')
        into True, so boolean options are parsed explicitly instead.

        Raises:
            argparse.ArgumentTypeError: if ``value`` is not a recognised boolean.
        """
        if isinstance(value, bool):
            return value
        lowered = value.strip().lower()
        if lowered in ('true', 't', 'yes', 'y', '1'):
            return True
        if lowered in ('false', 'f', 'no', 'n', '0', ''):
            return False
        raise argparse.ArgumentTypeError('boolean value expected, got %r' % (value,))


    def build_parser():
        """Create the argument parser holding all training hyper-parameters."""
        parser = argparse.ArgumentParser(description="PyTorch DeeplabV3Plus Training")
        # Backbone (feature-extraction network) settings
        parser.add_argument('--backbone', type=str, default='resnet',
                            choices=['resnet', 'xception', 'drn', 'mobilenet'],
                            help='backbone name (default: resnet)')
        parser.add_argument('--out-stride', type=int, default=16,
                            help='network output stride (default: 16)')
        parser.add_argument('--dataset', type=str, default='pascal',
                            choices=['pascal', 'coco', 'cityscapes'],
                            help='dataset name (default: pascal)')
        parser.add_argument('--use-sbd', action='store_true', default=False,
                            help='whether to use SBD dataset (default: False)')
        parser.add_argument('--workers', type=int, default=4,
                            metavar='N', help='dataloader threads')
        parser.add_argument('--base-size', type=int, default=513,
                            help='base image size')
        # Lower this when CUDA memory is insufficient (the original value was 513)
        parser.add_argument('--crop-size', type=int, default=256,
                            help='crop image size')
        parser.add_argument('--sync-bn', type=_str2bool, default=None,
                            help='whether to use sync bn (default: auto)')
        parser.add_argument('--freeze-bn', type=_str2bool, default=False,
                            help='whether to freeze bn parameters (default: False)')
        parser.add_argument('--loss-type', type=str, default='ce',
                            choices=['ce', 'focal'],
                            help='loss func type (default: ce)')
        # training hyper params
        parser.add_argument('--epochs', type=int, default=None, metavar='N',
                            help='number of epochs to train (default: auto)')
        parser.add_argument('--start_epoch', type=int, default=0,
                            metavar='N', help='start epochs (default:0)')
        parser.add_argument('--batch-size', type=int, default=None,
                            metavar='N',
                            help='input batch size for training (default: auto)')
        parser.add_argument('--test-batch-size', type=int, default=None,
                            metavar='N',
                            help='input batch size for testing (default: auto)')
        parser.add_argument('--use-balanced-weights', action='store_true', default=False,
                            help='whether to use balanced weights (default: False)')
        # optimizer params
        parser.add_argument('--lr', type=float, default=None, metavar='LR',
                            help='learning rate (default: auto)')
        parser.add_argument('--lr-scheduler', type=str, default='poly',
                            choices=['poly', 'step', 'cos'],
                            help='lr scheduler mode: (default: poly)')
        parser.add_argument('--momentum', type=float, default=0.9,
                            metavar='M', help='momentum (default: 0.9)')
        parser.add_argument('--weight-decay', type=float, default=5e-4,
                            metavar='M', help='w-decay (default: 5e-4)')
        parser.add_argument('--nesterov', action='store_true', default=False,
                            help='whether use nesterov (default: False)')
        # cuda, seed and logging
        parser.add_argument('--no-cuda', action='store_true', default=False,
                            help='disables CUDA training')
        parser.add_argument('--gpu-ids', type=str, default='0',
                            help='use which gpu to train, must be a '
                                 'comma-separated list of integers only (default=0)')
        parser.add_argument('--seed', type=int, default=1, metavar='S',
                            help='random seed (default: 1)')
        # checking point
        parser.add_argument('--resume', type=str, default=None,
                            help='put the path to resuming file if needed')
        parser.add_argument('--checkname', type=str, default=None,
                            help='set the checkpoint name')
        # finetuning pre-trained models
        parser.add_argument('--ft', action='store_true', default=False,
                            help='finetuning on a different dataset')
        # evaluation option
        parser.add_argument('--eval-interval', type=int, default=1,
                            help='evaluation interval (default: 1)')
        parser.add_argument('--no-val', action='store_true', default=False,
                            help='skip validation during training')
        return parser


    def main():
        """Parse the command line, fill in automatic defaults and run training.

        Raises:
            ValueError: if CUDA is used and ``--gpu-ids`` is not a
                comma-separated list of integers.
        """
        args = build_parser().parse_args()
        args.cuda = not args.no_cuda and torch.cuda.is_available()
        if args.cuda:
            try:
                args.gpu_ids = [int(s) for s in args.gpu_ids.split(',')]
            except ValueError as err:
                raise ValueError('Argument --gpu_ids must be a comma-separated list of integers only') from err

        # Without CUDA, gpu_ids stays the raw string; its length would count
        # characters, so treat CPU runs as a single device.
        num_devices = len(args.gpu_ids) if args.cuda else 1

        if args.sync_bn is None:
            args.sync_bn = num_devices > 1

        # Default epochs, batch_size and lr
        if args.epochs is None:
            epoches = {
                'coco': 30,
                'cityscapes': 200,
                'pascal': 50,
            }
            args.epochs = epoches[args.dataset.lower()]

        if args.batch_size is None:
            args.batch_size = 2 * num_devices

        if args.test_batch_size is None:
            args.test_batch_size = args.batch_size

        if args.lr is None:
            lrs = {
                'coco': 0.1,
                'cityscapes': 0.01,
                'pascal': 0.007,
            }
            # Scale the per-dataset base LR linearly with the batch size.
            args.lr = lrs[args.dataset.lower()] / (2 * num_devices) * args.batch_size

        if args.checkname is None:
            args.checkname = 'deeplab-'+str(args.backbone)
        print(args)
        torch.manual_seed(args.seed)
        trainer = Trainer(args)
        print('Starting Epoch:', trainer.args.start_epoch)
        print('Total Epoches:', trainer.args.epochs)
        try:
            for epoch in range(trainer.args.start_epoch, trainer.args.epochs):
                trainer.training(epoch)
                # Validate on the last epoch of every eval_interval-sized window.
                if not trainer.args.no_val and epoch % args.eval_interval == (args.eval_interval - 1):
                    trainer.validation(epoch)
        finally:
            # Close the summary writer even if training is interrupted.
            trainer.writer.close()
    349 
    # Start training only when this file is executed as a script, not on import.
    if __name__ == "__main__":
        main()
  • 相关阅读:
    体验一下:AndroidX
    Android研发技术的进阶之路
    App 冷启动与热启动及启动白屏优化
    Android Q 正式命名为 Android 10
    Android开发学习路线的七个阶段和步骤
    安卓旅途之——开发数独(一)
    项目总结
    小组互评与自评
    典型用户与场景
    第二个Sprint计划
  • 原文地址:https://www.cnblogs.com/ywheunji/p/10712620.html
Copyright © 2020-2023  润新知