• Pytorch加载变长度序列数据










    想要使用pytorch 框架中的 Dataset 和 Dataloader 类,将变长序列整合为batch数据 (主要是对长短不一的序列进行补齐),通过自定义collate_fn函数,实现对变长数据的处理。


    Dataset 主要负责读取单条数据,建立索引方式。
    Dataloader 负责将数据聚合为batch。

    测试环境: python 3.6 ,pytorch 1.2.0


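    举例:每个样本保存为一个 JSON 文件(如 1.json),包含 'feature'(变长序列)和 'label' 两个字段,格式形如:{"feature": [17, 14, 16, 18, 14, 16], "label": [0, 0, ..., 1, ..., 0]}。读取该数据的 Dataset 定义如下: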



     1 import os
     2 import numpy as np
     3 import torch
     4 from torch.utils.data import Dataset
     5 from tqdm import tqdm
     class time_series_dataset(Dataset):
         """Dataset of variable-length sequences stored one sample per JSON file.

         Every ``<name>.json`` file found in ``data_root`` is one sample. The
         file is read lazily in ``__getitem__`` and must contain the keys
         ``'feature'`` and ``'label'``.
         """

         def __init__(self, data_root):
             """
             :param data_root: directory that contains the ``*.json`` samples
             """
             self.data_root = data_root
             # Match on the real suffix (so ``a.json.bak`` is ignored) and strip
             # only the extension (so ``a.v2.json`` keeps its full stem).
             # Sorting makes the index -> sample mapping reproducible.
             self.data = sorted(
                 os.path.splitext(name)[0]
                 for name in os.listdir(data_root)
                 if name.endswith('.json')
             )

         def __len__(self):
             """Return the number of JSON samples found in ``data_root``."""
             return len(self.data)

         def __getitem__(self, index):
             """Load sample ``index`` from disk.

             :return: dict with ``'feature'`` and ``'label'`` tensors, the file
                      stem as ``'id'`` and the feature length as ``'length'``
             """
             import json

             prefix = self.data[index]
             # os.path.join works whether or not data_root ends with a separator.
             path = os.path.join(self.data_root, prefix + '.json')
             with open(path, 'r', encoding='utf-8') as f:
                 data_dic = json.load(f)
             feature = torch.from_numpy(np.array(data_dic['feature']))
             label = torch.from_numpy(np.array(data_dic['label']))
             return {
                 'feature': feature,
                 'label': label,
                 'id': prefix,
                 'length': len(data_dic['feature']),
             }



    1 dataset = time_series_dataset("./data/") # "./data/" is the directory where the dataset files are stored

    举例: dataset[0]

    1 {'feature': tensor([17, 14, 16, 18, 14, 16], dtype=torch.int32),
    2  'label': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
    3          0], dtype=torch.int32),
    4  'id': '2',
    5  'length': 6}



     1 from torch.nn.utils.rnn import pad_sequence
     def collate_func(batch_dic):
         """Merge a list of sample dicts into one zero-padded batch.

         :param batch_dic: list of samples; each is a dict with the keys
                           ``'feature'``, ``'label'``, ``'id'`` and ``'length'``
         :return: dict with
                  ``'feature'`` / ``'label'`` - tensors padded with zeros
                  (batch first),
                  ``'id'`` - list of the sample ids in batch order,
                  ``'mask'`` - tensor of shape (batch, longest ``'length'``)
                  holding 1 at real positions and 0 at padded ones
         """
         lengths = [dic['length'] for dic in batch_dic]
         # One row per sample, as wide as the longest sequence in this batch.
         mask_batch = torch.zeros((len(batch_dic), max(lengths)))
         for row, length in enumerate(lengths):
             mask_batch[row, :length] = 1
         return {
             'feature': pad_sequence([dic['feature'] for dic in batch_dic], batch_first=True),
             'label': pad_sequence([dic['label'] for dic in batch_dic], batch_first=True),
             'id': [dic['id'] for dic in batch_dic],
             'mask': mask_batch,
         }


    说明: mask 字段用以存储变长序列的实际长度,补零的部分记为0,实际序列对应位置记为1。返回数据的格式及包含的字段,根据自己的需求进行定义。


    # Index-based loop from collate_func: gathers feature/label/id per sample and fills the mask row.
    1     for i in range(len(batch_dic)):
    2         dic=batch_dic[i]
    3         fea_batch.append(dic['feature'])
    4         label_batch.append(dic['label'])
    5         id_batch.append(dic['id'])
    6         mask_batch[i,:dic['length']]=1
    # map()-based alternative: builds the same three lists, but these lines do not fill mask_batch.
    1     fea_batch = list(map(lambda x: x['feature'], batch_dic))
    2     label_batch = list(map(lambda x: x['label'], batch_dic))
    3     id_batch = list(map(lambda x: x['id'], batch_dic))


    # Build batches of 3 shuffled samples; collate_func pads each batch and adds the mask.
    1 train_loader = DataLoader(dataset, batch_size=3, num_workers=1, shuffle=True,collate_fn=collate_func)


     1 import os
     2 import numpy as np
     3 import torch
     4 from torch.utils.data import Dataset
     5 from torch.utils.data import DataLoader
     6 from tqdm import tqdm
     class time_series_dataset(Dataset):
         """Dataset of variable-length sequences stored one sample per JSON file.

         Every ``<name>.json`` file found in ``data_root`` is one sample. The
         file is read lazily in ``__getitem__`` and must contain the keys
         ``'feature'`` and ``'label'``.
         """

         def __init__(self, data_root):
             """
             :param data_root: directory that contains the ``*.json`` samples
             """
             self.data_root = data_root
             # Match on the real suffix (so ``a.json.bak`` is ignored) and strip
             # only the extension (so ``a.v2.json`` keeps its full stem).
             # Sorting makes the index -> sample mapping reproducible.
             self.data = sorted(
                 os.path.splitext(name)[0]
                 for name in os.listdir(data_root)
                 if name.endswith('.json')
             )

         def __len__(self):
             """Return the number of JSON samples found in ``data_root``."""
             return len(self.data)

         def __getitem__(self, index):
             """Load sample ``index`` from disk.

             :return: dict with ``'feature'`` and ``'label'`` tensors, the file
                      stem as ``'id'`` and the feature length as ``'length'``
             """
             import json

             prefix = self.data[index]
             # os.path.join works whether or not data_root ends with a separator.
             path = os.path.join(self.data_root, prefix + '.json')
             with open(path, 'r', encoding='utf-8') as f:
                 data_dic = json.load(f)
             feature = torch.from_numpy(np.array(data_dic['feature']))
             label = torch.from_numpy(np.array(data_dic['label']))
             return {
                 'feature': feature,
                 'label': label,
                 'id': prefix,
                 'length': len(data_dic['feature']),
             }
    def collate_func(batch_dic):
        """Merge a list of sample dicts into one zero-padded batch.

        :param batch_dic: list of samples; each is a dict with the keys
                          ``'feature'``, ``'label'``, ``'id'`` and ``'length'``
        :return: dict with
                 ``'feature'`` / ``'label'`` - tensors padded with zeros
                 (batch first),
                 ``'id'`` - list of the sample ids in batch order,
                 ``'mask'`` - tensor of shape (batch, longest ``'length'``)
                 holding 1 at real positions and 0 at padded ones
        """
        # This script does not import pad_sequence at module level, so the
        # import has to be live here; commented out, the calls below raise
        # NameError.
        from torch.nn.utils.rnn import pad_sequence

        lengths = [dic['length'] for dic in batch_dic]
        # One row per sample, as wide as the longest sequence in this batch.
        mask_batch = torch.zeros((len(batch_dic), max(lengths)))
        for row, length in enumerate(lengths):
            mask_batch[row, :length] = 1
        return {
            'feature': pad_sequence([dic['feature'] for dic in batch_dic], batch_first=True),
            'label': pad_sequence([dic['label'] for dic in batch_dic], batch_first=True),
            'id': [dic['id'] for dic in batch_dic],
            'mask': mask_batch,
        }
    if __name__ == "__main__":
        # Smoke test: build the loader and pull a single padded batch.
        dataset = time_series_dataset("./data/")
        batch_size = 3
        train_loader = DataLoader(dataset, batch_size=batch_size, num_workers=4,
                                  shuffle=True, collate_fn=collate_func)
        # len(train_loader) is the exact number of batches. The hand-computed
        # int(N / batch_size) + 1 is one too many whenever N is a multiple of
        # batch_size, which makes the progress bar never reach 100%.
        for batch_idx, batch in tqdm(enumerate(train_loader), total=len(train_loader)):
            inputs, labels, masks, ids = batch['feature'], batch['label'], batch['mask'], batch['id']
            break



     1 from torch.nn.utils.rnn import pack_sequence
     2 from torch.utils.data import DataLoader
     def my_collate(batch):
         """Pack the sequences of a batch and collect its targets.

         ``batch`` is a list of ``(sequence, target)`` pairs. The sequences are
         packed with ``pack_sequence(..., enforce_sorted=False)``, so they do
         not have to arrive sorted by length. The targets are returned as a
         plain list in the original batch order.

         Returns ``[packed_sequences, targets]``.
         """
         sequences = []
         labels = []
         for sample in batch:
             sequences.append(sample[0])
             labels.append(sample[1])
         packed = pack_sequence(sequences, enforce_sorted=False)
         return [packed, labels]
    11 # ...
    12 # later in your code, when you define your DataLoader - use the custom collate function
    # (dataset, batch_size and shuffle are assumed to be defined earlier in the script)
    13 loader = DataLoader(dataset,
    14                       batch_size,
    15                       shuffle,
    16                       collate_fn=my_collate, # use custom collate function here
    17                       pin_memory=True)



     I wrote a simple code that maybe someone here can re-use. I wanted to make something that pads a generic dim, and I don’t use an RNN of any type so PackedSequence was a bit of overkill for me. It’s simple, but it works for me.

     def pad_tensor(vec, pad, dim):
         """Zero-pad ``vec`` at the end of dimension ``dim`` up to size ``pad``.

         args:
             vec - tensor to pad
             pad - the size to pad to
             dim - dimension to pad

         return:
             a new tensor padded to 'pad' in dimension 'dim', with the same
             dtype and device as 'vec'
         """
         pad_size = list(vec.shape)
         pad_size[dim] = pad - vec.size(dim)
         # new_zeros inherits vec's dtype and device. A bare torch.zeros is
         # always float32 on CPU, so the cat either fails or silently promotes
         # integer inputs to float.
         return torch.cat([vec, vec.new_zeros(pad_size)], dim=dim)
    class PadCollate:
        """
        a variant of collate_fn that pads according to the longest sequence in
        a batch of sequences
        """

        def __init__(self, dim=0):
            """
            args:
                dim - the dimension to be padded (dimension of time in sequences)
            """
            self.dim = dim

        def pad_collate(self, batch):
            """
            args:
                batch - list of (tensor, label)

            return:
                xs - a tensor of all examples in 'batch' after padding
                ys - a LongTensor of all labels in batch
            """
            # Python 3 has no tuple-parameter lambdas and map() returns a
            # one-shot iterator, so concrete lists are built instead.
            # find longest sequence
            max_len = max(x.shape[self.dim] for x, _ in batch)
            # pad according to max_len, then stack along a new batch dimension
            xs = torch.stack(
                [pad_tensor(x, pad=max_len, dim=self.dim) for x, _ in batch],
                dim=0,
            )
            ys = torch.LongTensor([y for _, y in batch])
            return xs, ys

        def __call__(self, batch):
            """Make the instance usable directly as ``collate_fn``."""
            return self.pad_collate(batch)

    to be used with the data loader:

    # PadCollate(dim=0) pads along dimension 0; the `...` stands for the remaining DataLoader arguments.
    1 train_loader = DataLoader(ds, ..., collate_fn=PadCollate(dim=0))


    If you are going to pack your padded sequences later, you can also immediately sort the batches from longest sequence to shortest:


     def sort_batch(batch, targets, lengths):
         """
         Reorder a minibatch so that the longest sequences come first.

         ``lengths`` is sorted in descending order and the same permutation is
         applied to ``batch`` and ``targets``. Returns the reordered sequences,
         the reordered targets and the sorted lengths, a layout suitable for
         pack_padded_sequence(...).
         """
         sorted_lengths, order = lengths.sort(dim=0, descending=True)
         return batch[order], targets[order], sorted_lengths
    def pad_and_sort_batch(DataLoaderBatch):
        """
        Collate a list of (sequence, target, length) tuples.

        Every sequence is copied into a zero-filled NumPy row as wide as the
        longest length in the batch. The padded sequences, the targets
        (reshaped to one column) and the lengths are then handed to
        sort_batch, which orders them from longest to shortest.
        """
        columns = list(zip(*DataLoaderBatch))
        seqs, targs, lengths = columns[0], columns[1], columns[2]

        # One zero row per sample; only the first `length` entries are filled.
        padded_seqs = np.zeros((len(DataLoaderBatch), max(lengths)))
        for row, (seq, length) in enumerate(zip(seqs, lengths)):
            padded_seqs[row, :length] = seq[:length]

        return sort_batch(
            torch.tensor(padded_seqs),
            torch.tensor(targs).view(-1, 1),
            torch.tensor(lengths),
        )


    # Return sample idx as a (sequence, target tensor, sequence length) triple, the tuple layout pad_and_sort_batch expects.
    1 def __getitem__(self, idx):
    2         return self.sequences[idx], torch.tensor(self.targets[idx]), self.sequence_lengths[idx]

    使用时将pad_and_sort collator传到 DataLoader:

    # pad_and_sort_batch is the collate_fn here; `Data` is presumably an alias for torch.utils.data - confirm against the script's imports.
    1 train_gen = Data.DataLoader(train_data, batch_size=128, shuffle=True, collate_fn=pad_and_sort_batch)


     def collate_fn_padd(batch):
         """
         Pads a batch of variable-length sequences.

         note: it converts things ToTensor manually here since the ToTensor
         transform assumes it takes in images rather than arbitrary tensors.

         Relies on a module-level ``device`` defined elsewhere in the script.

         return:
             batch   - zero-padded tensor, time-major (max_len, batch_size, ...)
             lengths - tensor with the original length of every sequence
             mask    - bool tensor shaped like 'batch'; True at real time
                       steps, False at padded ones
         """
         # get sequence lengths (taken before padding)
         lengths = torch.tensor([t.shape[0] for t in batch]).to(device)
         # pad
         batch = [torch.Tensor(t).to(device) for t in batch]
         batch = torch.nn.utils.rnn.pad_sequence(batch)
         # compute mask from the lengths rather than from `batch != 0`, so that
         # genuine zero values inside a sequence are not mistaken for padding
         steps = torch.arange(batch.shape[0], device=lengths.device)
         valid = steps.unsqueeze(1) < lengths.unsqueeze(0)  # (max_len, batch_size)
         # append singleton dims so the mask broadcasts over feature dims
         valid = valid.reshape(tuple(valid.shape) + (1,) * (batch.dim() - 2))
         mask = valid.expand_as(batch).contiguous().to(device)
         return batch, lengths, mask




  • 相关阅读:
    Fatal Error: TXK Install Service oracle.apps.fnd.txk.config.ProcessStateException: OUI process failed : Exit=255 See log for details
    here was insufficient free space available after evicting expired cache entries
    Python Decorator 和函数式编程
  • 原文地址:https://www.cnblogs.com/jiangkejie/p/13141781.html
Copyright © 2020-2023  润新知