https://zhuanlan.zhihu.com/p/36095768
我的推断:第二、第三阶段应该不是先对所有anchor做bounding box regression,再选取当前条件下的所有roi;而是在第一阶段选取512个roi,然后把这些在第一阶段匹配好的roi送到第二、第三阶段。
# DecodeBBox layer that produces the "proposals_2nd" blob.
# Bottoms, in the order the layer implementation reads them:
#   bottom[0] "bbox_pred"       - box regression output to decode
#   bottom[1] "rois"            - prior boxes the regression is applied to
#   bottom[2] "match_gt_boxes"  - optional; its last channel is compared
#                                 against gt_iou_thr to drop boxes in TRAIN
layer {
  name: "proposals_2nd"
  type: "DecodeBBox"
  bottom: "bbox_pred"
  bottom: "rois"
  bottom: "match_gt_boxes"
  top: "proposals_2nd"
  # Per-coordinate mean/std used to de-normalize the regression output
  # (value * std + mean). Exactly four of each are required when both are given.
  bbox_reg_param {
    bbox_mean: 0
    bbox_mean: 0
    bbox_mean: 0
    bbox_mean: 0
    bbox_std: 0.1
    bbox_std: 0.1
    bbox_std: 0.2
    bbox_std: 0.2
  }
  # One entry per bottom. NOTE(review): presumably disables gradient
  # propagation to all three inputs - confirm against Caffe's
  # LayerParameter.propagate_down.
  propagate_down: 0
  propagate_down: 0
  propagate_down: 0
}
这段代码就证明了这个想法:rois来自于第一阶段proposal_info,这些rois也是在第一阶段做roi-pooling用来训练的。
个人感觉cascade的模型就是4张图提取512个roi进行训练,然后经过第一阶段训练后,把这512个roi经过回归精修,再去除回归后x1大于x2或y1大于y2的框,以及与gt的iou达到阈值gt_iou_thr(0.95)的框,这样roi可能就没有512个了。把这些输入给第二阶段的proposal_info_2nd,让这个层再去决定训练样本,这样很可能训练的数据不足512个,并且3个阶段其实都是训练的同一批roi,也就是说第一阶段进去的那些roi,后面几个阶段实际上也在训练他们,而不是新出来的框。DecodeBbox层的输入是bbox_pred、rois和match_gt_boxes。首先明确一点,rpn网络会输出很多proposals,ProposalTarget层将这些proposals和gt算iou,确定正负样本并按1:3的比例选取,然后输出rois,rois就是从rpn的预提取框中选出来、具体拿来训练的那部分。DecodeBbox层就是将这些原本的rois回归成更精准的框,也就是在原始rois的坐标上加上经过训练得到的回归值,这个是通过DecodeBBoxesWithPrior函数实现的。DecodeBbox层大致分为3个步骤:1.回归得到更精准的rois;2.去掉回归后x1大于x2或y1大于y2的框;3.去掉与gt的iou达到阈值(0.95)的框。注意:步骤2、3只在训练阶段执行;步骤3中的iou在代码里直接取自match_gt_boxes每行的最后一个值,而不是用回归后的框重新计算。
这部分(DecodeBBox层)的实现代码如下:
#include <cfloat> #include <vector> #include "caffe/util/bbox_util.hpp" #include "caffe/layers/decode_bbox_layer.hpp" namespace caffe { template <typename Dtype> void DecodeBBoxLayer<Dtype>::LayerSetUp( const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { // bbox mean and std BBoxRegParameter bbox_reg_param = this->layer_param_.bbox_reg_param(); bbox_mean_.Reshape(4,1,1,1); bbox_std_.Reshape(4,1,1,1); if (bbox_reg_param.bbox_mean_size() > 0 && bbox_reg_param.bbox_std_size() > 0) { int num_means = this->layer_param_.bbox_reg_param().bbox_mean_size(); int num_stds = this->layer_param_.bbox_reg_param().bbox_std_size(); CHECK_EQ(num_means,4); CHECK_EQ(num_stds,4); for (int i = 0; i < 4; i++) { bbox_mean_.mutable_cpu_data()[i] = bbox_reg_param.bbox_mean(i); bbox_std_.mutable_cpu_data()[i] = bbox_reg_param.bbox_std(i); CHECK_GT(bbox_std_.mutable_cpu_data()[i],0); } } else { caffe_set(bbox_mean_.count(), Dtype(0), bbox_mean_.mutable_cpu_data()); caffe_set(bbox_std_.count(), Dtype(1), bbox_std_.mutable_cpu_data()); } } template <typename Dtype> void DecodeBBoxLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { // bottom: bbox_blob, prior_blob, (match_gt_boxes) CHECK_EQ(bottom[0]->num(),bottom[1]->num()); if (bottom.size()>=3) { CHECK_EQ(bottom[0]->num(),bottom[2]->num()); CHECK(this->phase_ == TRAIN); } CHECK_EQ(bottom[0]->channels(),8); CHECK_EQ(bottom[1]->channels(),5); bbox_pred_.ReshapeLike(*bottom[0]); top[0]->ReshapeLike(*bottom[1]); } template <typename Dtype> void DecodeBBoxLayer<Dtype>::Forward_cpu( const vector<Blob<Dtype>*>& bottom, const vector<Blob<Dtype>*>& top) { const int num = bottom[0]->num(); const int bbox_dim = bottom[0]->channels(); const int prior_dim = bottom[1]->channels(); //decode prior box [img_id x1 y1 x2 y2] const Dtype* prior_data = bottom[1]->cpu_data(); vector<BBox> prior_bboxes; for (int i = 0; i < num; i++) { BBox bbox; bbox.xmin = prior_data[i*prior_dim + 1]; bbox.ymin = 
prior_data[i*prior_dim + 2]; bbox.xmax = prior_data[i*prior_dim + 3]; bbox.ymax = prior_data[i*prior_dim + 4]; prior_bboxes.push_back(bbox); } // decode bbox predictions const Dtype* bbox_data = bottom[0]->cpu_data(); Dtype* bbox_pred_data = bbox_pred_.mutable_cpu_data(); DecodeBBoxesWithPrior(bbox_data,prior_bboxes,bbox_dim,bbox_mean_.cpu_data(), bbox_std_.cpu_data(),bbox_pred_data); vector<bool> valid_bbox_flags(num,true); // screen out mal-boxes if (this->phase_ == TRAIN) { for (int i = 0; i < num; i++) { const int base_index = i*bbox_dim+4; if (bbox_pred_data[base_index] > bbox_pred_data[base_index+2] || bbox_pred_data[base_index+1] > bbox_pred_data[base_index+3]) { valid_bbox_flags[i] = false; } } } // screen out high IoU boxes, to remove redundant gt boxes if (bottom.size()==3 && this->phase_ == TRAIN) { const Dtype* match_gt_boxes = bottom[2]->cpu_data(); const int gt_dim = bottom[2]->channels(); const float gt_iou_thr = this->layer_param_.decode_bbox_param().gt_iou_thr(); for (int i = 0; i < num; i++) { const float overlap = match_gt_boxes[i*gt_dim+gt_dim-1]; if (overlap >= gt_iou_thr) { valid_bbox_flags[i] = false; } } } vector<int> valid_bbox_ids; for (int i = 0; i < num; i++) { if (valid_bbox_flags[i]) { valid_bbox_ids.push_back(i); } } const int keep_num = valid_bbox_ids.size(); CHECK_GT(keep_num,0); top[0]->Reshape(keep_num, prior_dim, 1, 1); Dtype* decoded_bbox_data = top[0]->mutable_cpu_data(); for (int i = 0; i < keep_num; i++) { const int keep_id = valid_bbox_ids[i]; const int base_index = keep_id*bbox_dim+4; decoded_bbox_data[i*prior_dim] = prior_data[keep_id*prior_dim]; decoded_bbox_data[i*prior_dim+1] = bbox_pred_data[base_index]; decoded_bbox_data[i*prior_dim+2] = bbox_pred_data[base_index+1]; decoded_bbox_data[i*prior_dim+3] = bbox_pred_data[base_index+2]; decoded_bbox_data[i*prior_dim+4] = bbox_pred_data[base_index+3]; } } INSTANTIATE_CLASS(DecodeBBoxLayer); REGISTER_LAYER_CLASS(DecodeBBox); } // namespace caffe
DecodeBBoxesWithPrior函数在bbox_util.cpp里实现,完成的功能就是用bounding box regression的结果对输入的prior_bbox(其实就是Faster R-CNN中输入的region proposal)进行修正,得到更精确的框坐标,然后存储在pred_data中。
template <typename Dtype> void DecodeBBoxesWithPrior(const Dtype* bbox_data, const vector<BBox> prior_bboxes, const int bbox_dim, const Dtype* means, const Dtype* stds, Dtype* pred_data) { const int num = prior_bboxes.size(); const int cls_num = bbox_dim/4; for (int i = 0; i < num; i++) { Dtype pw, ph, cx, cy; pw = prior_bboxes[i].xmax-prior_bboxes[i].xmin+1; ph = prior_bboxes[i].ymax-prior_bboxes[i].ymin+1; cx = 0.5*(prior_bboxes[i].xmax+prior_bboxes[i].xmin); cy = 0.5*(prior_bboxes[i].ymax+prior_bboxes[i].ymin); for (int c = 0; c < cls_num; c++) { Dtype bx, by, bw, bh; // bbox de-normalization bx = bbox_data[i*bbox_dim+4*c]*stds[0]+means[0]; by = bbox_data[i*bbox_dim+4*c+1]*stds[1]+means[1]; bw = bbox_data[i*bbox_dim+4*c+2]*stds[2]+means[2]; bh = bbox_data[i*bbox_dim+4*c+3]*stds[3]+means[3]; Dtype tx, ty, tw, th; tx = bx*pw+cx; ty = by*ph+cy; tw = pw*exp(bw); th = ph*exp(bh); tx -= (tw-1)/2; ty -= (th-1)/2; pred_data[i*bbox_dim+4*c] = tx; pred_data[i*bbox_dim+4*c+1] = ty; pred_data[i*bbox_dim+4*c+2] = tx+tw-1; pred_data[i*bbox_dim+4*c+3] = ty+th-1; } } }