    1. 图像,Nx3xHeight x Width
    2. BBs,NxMx4
    3. 类型,NxMx1
      因此,可以将BBs和类型合并成一个 NxMx5 的数组。Pytorch默认的数据布局是 batchsize x nChanns x H x W。

    在目标检测中,一般将图像进行缩放,使其尺寸满足一定要求,具体可以参考之前的博客。也就是要实现一个Resizer()的类进行变换。此外,通常要对图像进行标准化处理,以及水平翻转等变换。因此,在实现Dataset时要实现的变换有三个: Resizer()、Normilizer() 和 Augmenter()。

    Python中图像数据读入一般都是 H x W x nChanns 的numpy数组。常规的做法是使用Dataset中的transform对数据进行转换,输出torch类型的数组。

    由于CoCo数据集中图像的尺寸不一致,不能直接获得Nx3xHeight x Width类型的数组,因此要重写DataLoader中的collate_fn,将一个minibatch中的图像尺寸调整一致。如果想要按照图像被缩放比例进行采样,就要重写DataLoader中的batch_sampler
    batch_sampler与DataLoader中的batch_size、shuffle、sampler和drop_last参数是不兼容的,即在DataLoader中使用了batch_sampler,就不能再设置batch_size、shuffle、sampler和drop_last参数。



    1. python中图像的读入的通常是numpy的uint8数组,需要转换成float类型,并除以255以使最大值为1.0;
    2. coco数据中有80个类型,但是给的标签值最大为90,说明并不连续,需要设置新的标签,新的标签要从0到79,一定从0开始
    3. coco数据集中有些图片的BBs标签高宽小于1,标注的问题,要注意舍去。


    class SimpleCoCoDataset(Dataset):
        """Minimal COCO detection dataset yielding ``{'img', 'ann'}`` samples.

        ``img`` is the image as a float32 array scaled by 1/255 and ``ann`` is
        a ``num_anns x 5`` float array of ``(x1, y1, x2, y2, new_index)`` rows,
        where ``new_index`` is the contiguous 0-based class index built by
        :meth:`load_classes`.

        Args:
            rootdir: Dataset root. Annotations are read from
                ``<rootdir>/annotations/instances_<set_name>.json`` and images
                from ``<rootdir>/images/<set_name>/``.
            set_name: Name of the split, e.g. ``'val2017'``.
            transform: Optional callable applied to each sample dict.
        """

        def __init__(self, rootdir, set_name='val2017', transform=None):
            self.rootdir, self.set_name = rootdir, set_name
            self.transform = transform
            self.coco = COCO(os.path.join(self.rootdir, 'annotations', 'instances_'
                                          + self.set_name + '.json'))
            self.image_ids = self.coco.getImgIds()
            # load_anns() looks up coco_labels_inverse, so the label maps must
            # exist before the first sample is requested.
            self.load_classes()

        def load_classes(self):
            """Build the mappings between COCO category ids and 0-based indices."""
            categories = self.coco.loadCats(self.coco.getCatIds())
            categories.sort(key=lambda x: x['id'])
            # COCO category ids do not start at 0 and are not contiguous, so
            # build a new contiguous index starting from 0.
            # classes:             {name:       new_index}
            # coco_labels:         {new_index:  coco_index}
            # coco_labels_inverse: {coco_index: new_index}
            self.classes, self.coco_labels, self.coco_labels_inverse = {}, {}, {}
            for c in categories:
                self.coco_labels[len(self.classes)] = c['id']
                self.coco_labels_inverse[c['id']] = len(self.classes)
                self.classes[c['name']] = len(self.classes)
            # labels:              {new_index:  name}
            self.labels = {v: k for k, v in self.classes.items()}

        def __len__(self):
            return len(self.image_ids)

        def __getitem__(self, index):
            """Return the (optionally transformed) sample for ``index``."""
            sample = {'img': self.load_image(index), 'ann': self.load_anns(index)}
            if self.transform:
                sample = self.transform(sample)
            return sample

        def load_image(self, index):
            """Read image ``index`` from disk as float32 scaled by 1/255."""
            image_info = self.coco.loadImgs(self.image_ids[index])[0]
            # NOTE(review): the file-name argument was truncated in the original
            # source; 'file_name' is the standard COCO image-record key.
            imgpath = os.path.join(self.rootdir, 'images', self.set_name,
                                   image_info['file_name'])
            img = skimage.io.imread(imgpath)
            if img.ndim == 2:
                # Grayscale image: replicate the single channel so downstream
                # code can always rely on an H x W x 3 layout.
                img = np.stack([img] * 3, axis=-1)
            # Assumes imread returned uint8 data, so 255 maps to 1.0.
            return img.astype(np.float32) / 255.0

        def load_anns(self, index):
            """Return the boxes of image ``index`` as a ``num_anns x 5`` array.

            Each row is ``(x1, y1, x2, y2, new_index)``. Crowd annotations are
            excluded, and boxes whose width or height is below 1 are dropped.
            An image without usable boxes yields an empty ``0 x 5`` array.
            """
            annotation_ids = self.coco.getAnnIds(self.image_ids[index], iscrowd=False)
            # Skip images without annotations.
            if len(annotation_ids) == 0:
                return np.zeros((0, 5))
            rows = []
            for a in self.coco.loadAnns(annotation_ids):
                x1, y1, width, height = a['bbox'][:4]
                # Skip degenerate boxes (width or height < 1).
                if width < 1 or height < 1:
                    continue
                # (x1, y1, width, height) --> (x1, y1, x2, y2)
                rows.append([x1, y1, x1 + width, y1 + height,
                             self.coco_labels_inverse[a['category_id']]])
            if not rows:
                return np.zeros((0, 5))
            return np.array(rows, dtype=np.float64)

        def image_aspect_ratio(self, index):
            """Return width / height of image ``index`` from the COCO metadata."""
            image = self.coco.loadImgs(self.image_ids[index])[0]
            return float(image['width']) / float(image['height'])


    实现了两种transform类型: Resizer() 和 Normilizer()。数据的均值为[0.485, 0.456, 0.406],标准差为[0.229, 0.224, 0.225]。利用数组广播机制可以很容易写出Normilizer():

    class Normilizer(object):
        """Standardize an H x W x 3 image with fixed per-channel mean and std.

        The statistics are stored with shape ``(1, 1, 3)`` so that they
        broadcast over the two spatial axes of a channels-last image.
        """

        def __init__(self):
            self.mean = np.array([[[0.485, 0.456, 0.406]]], dtype=np.float32)
            self.std  = np.array([[[0.229, 0.224, 0.225]]], dtype=np.float32)

        def __call__(self, sample):
            """Return a new sample with a normalized image.

            Args:
                sample: Dict with ``'img'`` (numpy array, channels last) and
                    ``'ann'`` (annotation array).

            Returns:
                Dict with the float32 normalized ``'img'`` and the unchanged
                ``'ann'`` object.
            """
            image, anns = sample['img'], sample['ann']
            # The original return dict was truncated; annotations are not
            # affected by pixel normalization, so they are passed through.
            return {'img': (image.astype(np.float32) - self.mean) / self.std,
                    'ann': anns}


    class Resizer():
        """Rescale an image and its boxes, then zero-pad to a multiple of ``pad_N``."""

        def __call__(self, sample, targetSize=608, maxSize=1024, pad_N=32):
            """Resize ``sample`` and convert it to torch tensors.

            The scale maps the shorter image side to ``targetSize`` unless that
            would push the longer side past ``maxSize``, in which case the
            longer side is mapped to ``maxSize`` instead.

            Args:
                sample: Dict with ``'img'`` (H x W x C numpy array) and ``'ann'``
                    (``num_anns x 5`` numpy array whose first four columns are
                    box coordinates).
                targetSize: Desired length of the shorter side.
                maxSize: Upper bound for the longer side.
                pad_N: Height and width are padded up to a multiple of this.

            Returns:
                Dict with ``'img'`` (float32 tensor, H' x W' x C), ``'ann'``
                (tensor with scaled boxes) and ``'scale'`` (the factor applied).
            """
            image, anns = sample['img'], sample['ann']
            rows, cols = image.shape[:2]
            smaller_size, larger_size = min(rows, cols), max(rows, cols)
            scale = targetSize / smaller_size
            if larger_size * scale > maxSize:
                scale = maxSize / larger_size
            # NOTE(review): the output-shape argument was truncated in the
            # original source; rounding the scaled size is the reconstruction.
            image = skimage.transform.resize(
                image.astype(np.float64),
                (int(round(rows * scale)), int(round(cols * scale))))
            rows, cols, cns = image.shape[:3]
            # Pad the rescaled image so both sides are multiples of pad_N.
            # (-x) % N is 0 when x is already a multiple, which avoids adding
            # a whole extra block of padding in that case.
            pad_w, pad_h = (-cols) % pad_N, (-rows) % pad_N
            new_image = np.zeros((rows + pad_h, cols + pad_w, cns), dtype=np.float32)
            new_image[:rows, :cols, :] = image.astype(np.float32)
            # Scale a copy so the caller's annotation array is left untouched.
            scaled_anns = anns.copy()
            scaled_anns[:, :4] *= scale
            return {'img': torch.from_numpy(new_image),
                    'ann': torch.from_numpy(scaled_anns),
                    'scale': scale}


    batch_sampler 提供了从Dataset中进行采样的方法,我们按照原始图像的宽高比进行排序后采样。这个类要继承torch.utils.data.Sampler类,并实现 __len__() 和 __iter__() 两个方法。


    class AspectRatioBasedSampler(Sampler):
        """Batch sampler that groups images of similar aspect ratio.

        Indices are sorted by ``dataset.image_aspect_ratio`` and cut into
        consecutive batches of ``batch_size``. Intended to be passed to
        ``DataLoader`` as ``batch_sampler``.

        Args:
            dataset: Object providing ``__len__`` and ``image_aspect_ratio(i)``.
            batch_size: Number of indices per batch.
            drop_last: If true, a trailing incomplete batch is discarded.
                Otherwise it is filled up by wrapping around to the start of
                the sorted order, so every batch has exactly ``batch_size``
                indices.
        """

        def __init__(self, dataset, batch_size, drop_last):
            self.dataset    = dataset
            self.batch_size = batch_size
            self.drop_last  = drop_last
            self.groups     = self.group_images()

        def group_images(self):
            """Return the list of index batches, honouring ``drop_last``."""
            order = list(range(len(self.dataset)))
            order.sort(key=lambda x: self.dataset.image_aspect_ratio(x))
            total = len(order)
            if self.drop_last:
                # Keep only the full batches so that the number of yielded
                # batches agrees with __len__.
                total -= total % self.batch_size
            return [[order[x % len(order)] for x in range(i, i + self.batch_size)]
                    for i in range(0, total, self.batch_size)]

        def __iter__(self):
            for group in self.groups:
                yield group

        def __len__(self):
            if self.drop_last:
                return len(self.dataset) // self.batch_size
            return (len(self.dataset) + self.batch_size - 1) // self.batch_size

    通过batch_sampler采样得到的样本数据,其图像尺寸可能不完全一致,这时就需要用到collate_fn参数指定被采样样本图片尺寸的调整方式。通常的做法是,获得这组样本中图片尺寸的最大值 $Width_{max}$ 和 $Height_{max}$,然后将该组样本中所有图像的尺寸调整为 $Height_{max} \times Width_{max}$。填充后的图像数据为: $BatchSize \times Height_{max} \times Width_{max} \times 3$,最后经permute调整为 $BatchSize \times 3 \times Height_{max} \times Width_{max}$ 返回。

    此外,每个样本中的BBs的数量也可能不同,设BBs数量最大值为 $Ann_{max}$,也要将标签和类型尺寸调整相同,对于BBs数量小于 $Ann_{max}$ 的样本,补充-1。最终返回标签数据为: $BatchSize \times Ann_{max} \times 5$

    def collater(data):
        """Collate variable-sized samples into one padded batch.

        Images are zero-padded at the bottom/right to the largest height and
        width in the batch and returned channels-first. Annotation arrays are
        padded with rows of -1 up to the largest box count in the batch (at
        least one row, so the result is never empty along that axis).

        Args:
            data: List of sample dicts with ``'img'`` (H x W x 3 tensor),
                ``'ann'`` (``num_anns x 5`` tensor; the legacy key ``'annot'``
                is accepted as a fallback) and ``'scale'``.

        Returns:
            Dict with ``'img'`` (``B x 3 x H_max x W_max`` float tensor),
            ``'ann'`` (``B x max(Ann_max, 1) x 5`` float tensor) and
            ``'scale'`` (list of per-sample scales). ``'annot'`` is kept as an
            alias of ``'ann'`` for callers that still use the old key.
        """
        imgs = [s['img'] for s in data]
        # The dataset and transforms in this file emit 'ann'; the original
        # code read 'annot', which raised KeyError on those samples.
        annots = [s['ann'] if 'ann' in s else s['annot'] for s in data]
        scales = [s['scale'] for s in data]
        batch_size = len(imgs)

        # Images are channels-last here: shape[0] is height, shape[1] is width.
        max_height = max(int(img.shape[0]) for img in imgs)
        max_width = max(int(img.shape[1]) for img in imgs)
        padded_imgs = torch.zeros(batch_size, max_height, max_width, 3)
        for i, img in enumerate(imgs):
            padded_imgs[i, :int(img.shape[0]), :int(img.shape[1]), :] = img

        max_num_annots = max(annot.shape[0] for annot in annots)
        # -1 marks padding rows. Previously a missing `else:` caused the
        # filled tensor to be overwritten by an all -1 placeholder.
        annot_padded = torch.full((batch_size, max(max_num_annots, 1), 5), -1.0)
        for idx, annot in enumerate(annots):
            if annot.shape[0] > 0:
                annot_padded[idx, :annot.shape[0], :] = annot

        # B x H x W x 3  -->  B x 3 x H x W
        padded_imgs = padded_imgs.permute(0, 3, 1, 2)
        return {'img': padded_imgs, 'ann': annot_padded,
                'annot': annot_padded, 'scale': scales}



    {'img': torch.tensor((batch_size, 3, height, width)), 'ann': torch.tensor((batch_size, num_ann, 5)), 'scale': [scalar, ...] }


    def my_coco_show(samples, labels, wait=True):
        """Display every annotated image of a collated batch with its boxes.

        Args:
            samples: Batch dict with ``'img'`` (``B x 3 x H x W`` tensor of
                normalized images), ``'ann'`` (``B x N x 5`` tensor, padded
                with -1 rows) and ``'scale'``.
            labels: Mapping from class index to class name.
                NOTE(review): the lookup line was missing from the original
                source; ``labels[int(idx)]`` is the reconstruction.
            wait: If true, block on ``cv2.waitKey(0)`` after all windows are
                created so that they are actually drawn.
        """
        image, anns = samples['img'].numpy(), samples['ann'].numpy()
        std = np.array([[[0.229, 0.224, 0.225]]])
        mean = np.array([[[0.485, 0.456, 0.406]]])
        imgIdx = 1
        for img, ann in zip(image, anns):
            # Drop the -1 padding rows.
            ann = ann[ann[:, 4] != -1]
            if ann.shape[0] == 0:
                # Nothing to draw for this image (reconstructed: the body of
                # this `if` was missing in the original source).
                continue
            # Map class indices to class names.
            classes = [labels[int(idx)] for idx in ann[:, 4]]
            # Undo the normalization. cv2 drawing calls modify the array in
            # place and need a C-contiguous buffer, which the transposed
            # view is not guaranteed to produce.
            img = np.transpose(img, (1, 2, 0))
            img = np.ascontiguousarray(img * std + mean)
            for idx in range(ann.shape[0]):
                p1 = (int(round(ann[idx, 0])), int(round(ann[idx, 1])))
                p2 = (int(round(ann[idx, 2])), int(round(ann[idx, 3])))
                cv2.rectangle(img, p1, p2, (255, 0, 0), 2)
                # image, text, origin, font, font scale, colour, thickness, line type
                cv2.putText(img, classes[idx], (p2[0] - 40, p2[1] - 10), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2, 8)
            winName = str(imgIdx)
            cv2.namedWindow(winName, cv2.WINDOW_AUTOSIZE)
            cv2.moveWindow(winName, 10, 10)
            # The channel order is reversed for display (RGB -> BGR).
            cv2.imshow(winName, np.ascontiguousarray(img[:, :, ::-1]))
            imgIdx += 1
        if wait:
            # imshow only renders once the HighGUI event loop runs.
            cv2.waitKey(0)
