Keras Faster R-CNN Source Code Walkthrough (Part 4): ROI
1. Training the Classifier with ROIs
The last piece is ROI generation. Inside each training epoch loop, the core code looks like this:
# Generator output: yield np.copy(x_img), [np.copy(y_rpn_cls), np.copy(y_rpn_regr)], img_data_aug
# The RPN is trained on the image plus the best-matching anchors, not directly on the GT boxes
X, Y, img_data = next(data_gen_train)
loss_rpn = model_rpn.train_on_batch(X, Y)  # returns a scalar training loss, or a list of scalars if the model has multiple outputs and/or metrics
write_log(callback, ['rpn_cls_loss', 'rpn_reg_loss'], loss_rpn, train_step)
P_rpn = model_rpn.predict_on_batch(X)  # the RPN predictions, i.e. [objectness scores, box regressions]
# Select ROIs: the boxes on the feature map most likely to contain an object,
# after dropping invalid boxes and suppressing heavily overlapping ones; shape (num_boxes, 4)
R = roi_helpers.rpn_to_roi(P_rpn[0], P_rpn[1], C, K.image_dim_ordering(), use_regr=True, overlap_thresh=0.7, max_boxes=300)
# note: calc_iou converts from (x1,y1,x2,y2) to (x,y,w,h) format
# Inputs: img_data carries the image annotations, R the candidate ROIs
# Outputs:
# X2: the ROIs whose best IoU with a ground-truth box reaches C.classifier_min_overlap (named X2 to distinguish it from X above)
# Y1: one-hot class labels, shape (1, num_rois, num_classes), e.g. (1, xxx, 21)
# Y2: [np.array(y_class_regr_label), np.array(y_class_regr_coords)], the per-class regression masks and targets
# IoUs: for debugging only, not used
X2, Y1, Y2, IouS = roi_helpers.calc_iou(R, img_data, C, class_mapping)
if X2 is None:
    rpn_accuracy_rpn_monitor.append(0)
    rpn_accuracy_for_epoch.append(0)
    continue
# sample positive/negative ROIs
neg_samples = np.where(Y1[0, :, -1] == 1)  # background: the last element of the one-hot label marks the 'bg' class
pos_samples = np.where(Y1[0, :, -1] == 0)  # foreground
if len(neg_samples) > 0:
    neg_samples = neg_samples[0]
else:
    neg_samples = []
if len(pos_samples) > 0:
    pos_samples = pos_samples[0]
else:
    pos_samples = []
rpn_accuracy_rpn_monitor.append(len(pos_samples))
rpn_accuracy_for_epoch.append(len(pos_samples))
if C.num_rois > 1:
    # take up to num_rois//2 positive samples and fill the rest with negatives,
    # for a total of C.num_rois ROIs per image
    if len(pos_samples) < C.num_rois//2:
        selected_pos_samples = pos_samples.tolist()
    else:
        selected_pos_samples = np.random.choice(pos_samples, C.num_rois//2, replace=False).tolist()
    try:
        selected_neg_samples = np.random.choice(neg_samples, C.num_rois - len(selected_pos_samples), replace=False).tolist()
    except:
        # not enough negatives to sample without replacement, so sample with replacement
        selected_neg_samples = np.random.choice(neg_samples, C.num_rois - len(selected_pos_samples), replace=True).tolist()
    sel_samples = selected_pos_samples + selected_neg_samples
else:
    # in the extreme case where num_rois = 1, we pick a random pos or neg sample
    selected_pos_samples = pos_samples.tolist()
    selected_neg_samples = neg_samples.tolist()
    if np.random.randint(0, 2):
        sel_samples = random.choice(neg_samples)
    else:
        sel_samples = random.choice(pos_samples)
# X produces the shared feature maps; X2 holds the selected ROIs
loss_class = model_classifier.train_on_batch([X, X2[:, sel_samples, :]], [Y1[:, sel_samples, :], Y2[:, sel_samples, :]])
write_log(callback, ['detection_cls_loss', 'detection_reg_loss', 'detection_acc'], loss_class, train_step)
train_step += 1
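The positive/negative split above hinges entirely on the last column of the one-hot labels produced by calc_iou. Below is a minimal, self-contained sketch of that sampling step in isolation; num_classes, num_rois and the fake labels are illustrative values, not the repo's configuration:

import numpy as np

num_classes = 4          # 3 object classes + background (illustrative)
num_rois = 8             # how many ROIs to feed the classifier per image (illustrative)

# fake one-hot labels for 20 candidate ROIs; last column == 1 means background
Y1 = np.zeros((1, 20, num_classes))
Y1[0, :15, -1] = 1                      # 15 background ROIs
Y1[0, 15:, 0] = 1                       # 5 foreground ROIs of class 0

pos_samples = np.where(Y1[0, :, -1] == 0)[0]
neg_samples = np.where(Y1[0, :, -1] == 1)[0]

# keep at most half the batch positive, pad the rest with negatives
n_pos = min(len(pos_samples), num_rois // 2)
selected_pos = np.random.choice(pos_samples, n_pos, replace=False).tolist()
selected_neg = np.random.choice(neg_samples, num_rois - n_pos,
                                replace=len(neg_samples) < num_rois - n_pos).tolist()
sel_samples = selected_pos + selected_neg
print(sel_samples)   # e.g. [16, 19, 18, 15, 3, 7, 0, 11]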
The training loop above relies on two key helpers, rpn_to_roi and calc_iou; both are annotated below. See also http://geyao1995.com/Faster_rcnn%E4%BB%A3%E7%A0%81%E7%AC%94%E8%AE%B0_test_2_roi_helpers/ for another walkthrough.
2. rpn_to_roi
2.1 rpn_to_roi, code and comments:
# Inputs:
# rpn_layer: the RPN objectness prediction, shape (1, 37, 37, 9) with the default 9 anchors (one sigmoid score per anchor per position)
# regr_layer: the RPN box-regression prediction, shape (1, 37, 37, 36), i.e. 4 regression values per anchor
# Output:
# result -> the valid boxes most likely to contain an object, with heavily overlapping boxes removed; shape (num_boxes, 4)
def rpn_to_roi(rpn_layer, regr_layer, C, dim_ordering, use_regr=True, max_boxes=300, overlap_thresh=0.9):
    regr_layer = regr_layer / C.std_scaling  # the regression targets were multiplied by std_scaling when generated, so divide it back out
    anchor_sizes = C.anchor_box_scales    # [128, 256, 512]
    anchor_ratios = C.anchor_box_ratios   # [[1, 1], [1, 2], [2, 1]]
    assert rpn_layer.shape[0] == 1
    if dim_ordering == 'th':
        (rows, cols) = rpn_layer.shape[2:]
    elif dim_ordering == 'tf':
        (rows, cols) = rpn_layer.shape[1:3]
    curr_layer = 0  # index of the current anchor (size, ratio) combination
    if dim_ordering == 'tf':
        A = np.zeros((4, rpn_layer.shape[1], rpn_layer.shape[2], rpn_layer.shape[3]))  # A has shape (4, rows, cols, num_anchors), num_anchors = 9 here
    elif dim_ordering == 'th':
        A = np.zeros((4, rpn_layer.shape[2], rpn_layer.shape[3], rpn_layer.shape[1]))
    for anchor_size in anchor_sizes:        # [128, 256, 512]; take 128 as an example
        for anchor_ratio in anchor_ratios:  # [1:1], [1:2], [2:1]; take 1:2 as an example
            anchor_x = (anchor_size * anchor_ratio[0]) / C.rpn_stride  # anchor width on the feature map, e.g. (128*1)/16 = 8
            anchor_y = (anchor_size * anchor_ratio[1]) / C.rpn_stride  # anchor height on the feature map, e.g. (128*2)/16 = 16
            if dim_ordering == 'th':
                regr = regr_layer[0, 4 * curr_layer:4 * curr_layer + 4, :, :]
            else:
                regr = regr_layer[0, :, :, 4 * curr_layer:4 * curr_layer + 4]  # regression values for this anchor: 9 anchors * 4 values = 36 channels; regr has shape (rows, cols, 4)
                regr = np.transpose(regr, (2, 0, 1))  # (rows, cols, 4) -> (4, rows, cols)
            X, Y = np.meshgrid(np.arange(cols), np.arange(rows))  # grid of feature-map coordinates; X and Y each have shape (rows, cols), one entry per cell
            # the following operations apply this anchor to all rows*cols grid positions at once
            A[0, :, :, curr_layer] = X - anchor_x/2  # x1: left edge of the anchor centred on each grid point
            A[1, :, :, curr_layer] = Y - anchor_y/2  # y1: top edge of the anchor centred on each grid point
            A[2, :, :, curr_layer] = anchor_x        # anchor width
            A[3, :, :, curr_layer] = anchor_y        # anchor height
            if use_regr:  # apply the predicted regression to the anchor
                A[:, :, :, curr_layer] = apply_regr_np(A[:, :, :, curr_layer], regr)
            # after regression, clamp width and height to at least 1
            A[2, :, :, curr_layer] = np.maximum(1, A[2, :, :, curr_layer])
            A[3, :, :, curr_layer] = np.maximum(1, A[3, :, :, curr_layer])
            # convert (x1, y1, w, h) to (x1, y1, x2, y2): x2 = x1 + w, y2 = y1 + h
            A[2, :, :, curr_layer] += A[0, :, :, curr_layer]
            A[3, :, :, curr_layer] += A[1, :, :, curr_layer]
            # clip the boxes so they stay within the feature map
            A[0, :, :, curr_layer] = np.maximum(0, A[0, :, :, curr_layer])
            A[1, :, :, curr_layer] = np.maximum(0, A[1, :, :, curr_layer])
            A[2, :, :, curr_layer] = np.minimum(cols-1, A[2, :, :, curr_layer])
            A[3, :, :, curr_layer] = np.minimum(rows-1, A[3, :, :, curr_layer])
            curr_layer += 1
    all_boxes = np.reshape(A.transpose((0, 3, 1, 2)), (4, -1)).transpose((1, 0))  # A: (4, rows, cols, 9) -> (4, 9, rows, cols) -> (4, 9*rows*cols) -> (9*rows*cols, 4)
    all_probs = rpn_layer.transpose((0, 3, 1, 2)).reshape((-1))  # (1, rows, cols, 9) -> (1, 9, rows, cols) -> (9*rows*cols,)
    x1 = all_boxes[:, 0]
    y1 = all_boxes[:, 1]
    x2 = all_boxes[:, 2]
    y2 = all_boxes[:, 3]
    idxs = np.where((x1 - x2 >= 0) | (y1 - y2 >= 0))  # indices of degenerate boxes (x1 >= x2 or y1 >= y2)
    all_boxes = np.delete(all_boxes, idxs, 0)  # drop those rows; all_boxes keeps shape (num_valid, 4)
    all_probs = np.delete(all_probs, idxs, 0)  # drop the corresponding objectness scores
    result = non_max_suppression_fast(all_boxes, all_probs, overlap_thresh=overlap_thresh, max_boxes=max_boxes)[0]
    # result: the valid boxes most likely to contain an object, with heavily overlapping boxes suppressed; shape (num_boxes, 4)
    return result
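To make the vectorised anchor bookkeeping above concrete, here is a small stand-alone sketch of the same idea for a single anchor shape on a tiny feature map; the sizes are illustrative only and no regression is applied:

import numpy as np

rows, cols = 5, 5           # tiny feature map (illustrative)
anchor_w, anchor_h = 8, 16  # anchor size already divided by rpn_stride, as in rpn_to_roi

X, Y = np.meshgrid(np.arange(cols), np.arange(rows))  # one (x, y) pair per feature-map cell

x1 = X - anchor_w / 2       # left edge of the anchor centred on each cell
y1 = Y - anchor_h / 2       # top edge
x2 = x1 + anchor_w          # convert (x1, y1, w, h) -> (x1, y1, x2, y2)
y2 = y1 + anchor_h

# clip to the feature map, exactly like the np.maximum/np.minimum calls in rpn_to_roi
x1, y1 = np.maximum(0, x1), np.maximum(0, y1)
x2, y2 = np.minimum(cols - 1, x2), np.minimum(rows - 1, y2)

boxes = np.stack([x1, y1, x2, y2], axis=-1).reshape(-1, 4)
print(boxes.shape)   # (25, 4): one box per grid cell for this single anchor shape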
2.2 apply_regr_np
# Inputs:
# X: (4, 37, 37), the anchor coordinates (x, y, w, h) on the feature map
# T: (4, 37, 37), the corresponding regression values predicted by the RPN (not the targets computed when the anchors were generated)
# Output: the transformed anchors; each anchor has been shifted and scaled so it should lie closer to a GT box than the default anchor does
def apply_regr_np(X, T):
    try:
        x = X[0, :, :]
        y = X[1, :, :]
        w = X[2, :, :]
        h = X[3, :, :]
        tx = T[0, :, :]
        ty = T[1, :, :]
        tw = T[2, :, :]
        th = T[3, :, :]
        cx = x + w/2.        # anchor centre
        cy = y + h/2.
        cx1 = tx * w + cx    # shifted centre
        cy1 = ty * h + cy
        w1 = np.exp(tw.astype(np.float64)) * w  # scaled width
        h1 = np.exp(th.astype(np.float64)) * h  # scaled height
        x1 = cx1 - w1/2.     # back to the top-left corner
        y1 = cy1 - h1/2.
        x1 = np.round(x1)
        y1 = np.round(y1)
        w1 = np.round(w1)
        h1 = np.round(h1)
        return np.stack([x1, y1, w1, h1])
    except Exception as e:
        print(e)
        return X
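As a quick numeric check of the decoding formulas, here is the same arithmetic for a single anchor; all input numbers are made up for illustration:

import numpy as np

# one anchor in (x, y, w, h) form and one set of predicted (tx, ty, tw, th)
x, y, w, h = 10.0, 10.0, 8.0, 16.0
tx, ty, tw, th = 0.25, -0.125, np.log(1.5), np.log(0.5)

cx, cy = x + w / 2, y + h / 2            # centre of the anchor: (14, 18)
cx1, cy1 = tx * w + cx, ty * h + cy      # shifted centre: (16, 16)
w1, h1 = np.exp(tw) * w, np.exp(th) * h  # scaled size: (12, 8)
x1, y1 = cx1 - w1 / 2, cy1 - h1 / 2      # new top-left corner: (10, 12)

print(x1, y1, w1, h1)   # 10.0 12.0 12.0 8.0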
2.3 non_max_suppression_fast
# Inputs:
# boxes: all valid boxes
# probs: the predicted objectness probability of each box
# Outputs:
# boxes: the boxes most likely to contain an object, with heavily overlapping boxes suppressed
# probs: the corresponding probabilities
def non_max_suppression_fast(boxes, probs, overlap_thresh=0.9, max_boxes=300):
    # code used from here: http://www.pyimagesearch.com/2015/02/16/faster-non-maximum-suppression-python/
    # if there are no boxes, return an empty list
    if len(boxes) == 0:
        return []
    # grab the coordinates of the bounding boxes; x1, y1, x2, y2 are arrays covering all valid boxes
    x1 = boxes[:, 0]
    y1 = boxes[:, 1]
    x2 = boxes[:, 2]
    y2 = boxes[:, 3]
    # assert_array_less checks that the shapes match and every element of the first array
    # is strictly smaller than the corresponding element of the second,
    # i.e. every box must satisfy x1 < x2 and y1 < y2
    np.testing.assert_array_less(x1, x2)
    np.testing.assert_array_less(y1, y2)
    # if the bounding boxes are integers, convert them to floats --
    # this is important since we'll be doing a bunch of divisions
    if boxes.dtype.kind == "i":
        boxes = boxes.astype("float")
    # initialize the list of picked indexes
    pick = []
    # calculate the area of every box
    area = (x2 - x1) * (y2 - y1)
    # sort the bounding boxes by objectness probability; argsort returns the indices that
    # would sort the array in ascending order, so the highest-scoring box ends up last
    idxs = np.argsort(probs)
    # start picking from the highest-probability box and keep looping
    # while some indexes still remain in the indexes list
    while len(idxs) > 0:
        # grab the last index in the indexes list (the highest remaining probability)
        # and add its index value to the list of picked indexes
        last = len(idxs) - 1
        i = idxs[last]
        pick.append(i)
        # find the intersection of the picked box with every remaining box;
        # np.maximum/np.minimum compare element-wise, so these are arrays over all remaining boxes
        xx1_int = np.maximum(x1[i], x1[idxs[:last]])  # left edge of each intersection
        yy1_int = np.maximum(y1[i], y1[idxs[:last]])  # top edge
        xx2_int = np.minimum(x2[i], x2[idxs[:last]])  # right edge
        yy2_int = np.minimum(y2[i], y2[idxs[:last]])  # bottom edge
        ww_int = np.maximum(0, xx2_int - xx1_int)     # intersection widths (array)
        hh_int = np.maximum(0, yy2_int - yy1_int)     # intersection heights (array)
        area_int = ww_int * hh_int                    # intersection areas (array)
        # find the union
        area_union = area[i] + area[idxs[:last]] - area_int
        # compute the ratio of overlap (IoU)
        overlap = area_int / (area_union + 1e-6)
        # delete the picked index and every remaining box whose overlap with it exceeds the threshold
        idxs = np.delete(idxs, np.concatenate(([last], np.where(overlap > overlap_thresh)[0])))
        # keep at most max_boxes boxes; otherwise the loop ends once every box has been processed
        if len(pick) >= max_boxes:
            break
    # return only the bounding boxes that were picked, using the integer data type
    boxes = boxes[pick].astype("int")
    probs = probs[pick]
    return boxes, probs
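A minimal usage sketch with three hand-made boxes, two of which overlap heavily; the import path assumes the helpers live in keras_frcnn.roi_helpers and may differ in your fork:

import numpy as np
from keras_frcnn.roi_helpers import non_max_suppression_fast  # adjust the import to your fork

boxes = np.array([[0, 0, 10, 10],     # box A
                  [0, 1, 10, 11],     # box B, A shifted one cell down (IoU with A ~ 0.82)
                  [20, 20, 30, 30]],  # box C, far away from A and B
                 dtype=float)
probs = np.array([0.9, 0.8, 0.7])

kept_boxes, kept_probs = non_max_suppression_fast(boxes, probs, overlap_thresh=0.7, max_boxes=300)
print(kept_boxes)   # A and C survive; B is suppressed because its IoU with A exceeds 0.7
print(kept_probs)   # [0.9, 0.7]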
3. calc_iou
# Inputs:
# img_data: the image annotations
# R: candidate ROIs, shape (num_rois, 4); the coordinates are on the feature map, ordered (x1, y1, x2, y2)
# Outputs:
# X: the ROIs whose best IoU with a ground-truth box reaches C.classifier_min_overlap
# Y1: one-hot class labels, shape (1, num_rois, num_classes), e.g. (1, xxx, 21)
# Y2: [np.array(y_class_regr_label), np.array(y_class_regr_coords)], the per-class regression masks and targets
# IoUs: for debugging only, not used
def calc_iou(R, img_data, C, class_mapping):
    '''
    all_img_data[0] = {'width': 500, 'height': 500,
    'bboxes': [{'y2': 500, 'y1': 27, 'x2': 183, 'x1': 20, 'class': 'person', 'difficult': False},
    {'y2': 500, 'y1': 2, 'x2': 249, 'x1': 112, 'class': 'person', 'difficult': False},
    {'y2': 490, 'y1': 233, 'x2': 376, 'x1': 246, 'class': 'person', 'difficult': False},
    {'y2': 468, 'y1': 319, 'x2': 356, 'x1': 231, 'class': 'chair', 'difficult': False},
    {'y2': 450, 'y1': 314, 'x2': 58, 'x1': 1, 'class': 'chair', 'difficult': True}], 'imageset': 'test',
    'filepath': './datasets/VOC2007/JPEGImages/000910.jpg'}
    '''
    bboxes = img_data['bboxes']
    (width, height) = (img_data['width'], img_data['height'])
    # get image dimensions for resizing
    (resized_width, resized_height) = data_generators.get_new_img_size(width, height, C.im_size)
    gta = np.zeros((len(bboxes), 4))  # GT box coordinates, e.g. (5, 4); stored as (x1, x2, y1, y2)
    for bbox_num, bbox in enumerate(bboxes):
        # get the GT box coordinates, resize them to account for image resizing,
        # then map them onto the feature map by dividing by rpn_stride
        gta[bbox_num, 0] = int(round(bbox['x1'] * (resized_width / float(width)) / C.rpn_stride))  # e.g. vgg: 600/16 = 37
        gta[bbox_num, 1] = int(round(bbox['x2'] * (resized_width / float(width)) / C.rpn_stride))
        gta[bbox_num, 2] = int(round(bbox['y1'] * (resized_height / float(height)) / C.rpn_stride))
        gta[bbox_num, 3] = int(round(bbox['y2'] * (resized_height / float(height)) / C.rpn_stride))
    x_roi = []             # ROIs whose best IoU reaches classifier_min_overlap
    y_class_num = []       # their one-hot class labels
    y_class_regr_coords = []
    y_class_regr_label = []
    IoUs = []  # for debugging only
    for ix in range(R.shape[0]):  # iterate over every candidate ROI
        (x1, y1, x2, y2) = R[ix, :]
        x1 = int(round(x1))
        y1 = int(round(y1))
        x2 = int(round(x2))
        y2 = int(round(y2))
        best_iou = 0.0
        best_bbox = -1
        # compute the IoU of this ROI with every GT box on the feature map and keep the best match
        for bbox_num in range(len(bboxes)):
            curr_iou = data_generators.iou([gta[bbox_num, 0], gta[bbox_num, 2], gta[bbox_num, 1], gta[bbox_num, 3]], [x1, y1, x2, y2])
            if curr_iou > best_iou:
                best_iou = curr_iou
                best_bbox = bbox_num
        if best_iou < C.classifier_min_overlap:  # below the minimum overlap the ROI is ignored completely
            continue
        else:
            w = x2 - x1
            h = y2 - y1
            x_roi.append([x1, y1, w, h])  # store the ROI in (x, y, w, h) format
            IoUs.append(best_iou)
            if C.classifier_min_overlap <= best_iou < C.classifier_max_overlap:
                # hard negative example: the overlap lies between the min and max thresholds, so label it as background
                cls_name = 'bg'
            elif C.classifier_max_overlap <= best_iou:
                # the overlap reaches classifier_max_overlap: take the class of the matched GT box
                cls_name = bboxes[best_bbox]['class']
                cxg = (gta[best_bbox, 0] + gta[best_bbox, 1]) / 2.0
                cyg = (gta[best_bbox, 2] + gta[best_bbox, 3]) / 2.0
                cx = x1 + w / 2.0
                cy = y1 + h / 2.0
                # regression targets from the ROI to the matched GT box
                tx = (cxg - cx) / float(w)
                ty = (cyg - cy) / float(h)
                tw = np.log((gta[best_bbox, 1] - gta[best_bbox, 0]) / float(w))
                th = np.log((gta[best_bbox, 3] - gta[best_bbox, 2]) / float(h))
            else:
                print('roi = {}'.format(best_iou))
                raise RuntimeError
        class_num = class_mapping[cls_name]
        class_label = len(class_mapping) * [0]  # e.g. 3*[0] -> [0, 0, 0]; one slot per class, e.g. 21 slots
        class_label[class_num] = 1
        y_class_num.append(copy.deepcopy(class_label))
        coords = [0] * 4 * (len(class_mapping) - 1)  # regression targets, 4 slots per foreground class
        labels = [0] * 4 * (len(class_mapping) - 1)  # mask marking which 4 slots are active
        if cls_name != 'bg':
            label_pos = 4 * class_num  # each class owns a block of 4 coordinates
            sx, sy, sw, sh = C.classifier_regr_std
            # scale the targets by classifier_regr_std (e.g. [8.0, 8.0, 4.0, 4.0]) so the regression loss
            # is on a comparable scale to the classification loss; the predictions are divided by the same factors at test time
            coords[label_pos:4+label_pos] = [sx*tx, sy*ty, sw*tw, sh*th]
            labels[label_pos:4+label_pos] = [1, 1, 1, 1]  # mark this class's block as active
            y_class_regr_coords.append(copy.deepcopy(coords))
            y_class_regr_label.append(copy.deepcopy(labels))
        else:
            y_class_regr_coords.append(copy.deepcopy(coords))
            y_class_regr_label.append(copy.deepcopy(labels))
    if len(x_roi) == 0:
        return None, None, None, None
    X = np.array(x_roi)
    Y1 = np.array(y_class_num)  # (num_rois, num_classes), e.g. (xxx, 21)
    Y2 = np.concatenate([np.array(y_class_regr_label), np.array(y_class_regr_coords)], axis=1)
    # expand_dims adds the batch dimension before returning
    return np.expand_dims(X, axis=0), np.expand_dims(Y1, axis=0), np.expand_dims(Y2, axis=0), IoUs
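A small worked example of the target encoding above, for a hypothetical 3-class problem (two object classes plus 'bg'); the class mapping, box coordinates and std values are made up for illustration:

import numpy as np

class_mapping = {'person': 0, 'car': 1, 'bg': 2}   # hypothetical mapping
classifier_regr_std = [8.0, 8.0, 4.0, 4.0]

# one ROI (x1, y1, x2, y2) and its matched GT box, both on the feature map
x1, y1, x2, y2 = 4, 4, 12, 10
gx1, gx2, gy1, gy2 = 5, 13, 4, 12        # stored as (x1, x2, y1, y2), like gta in calc_iou

w, h = x2 - x1, y2 - y1                  # 8, 6
cx, cy = x1 + w / 2.0, y1 + h / 2.0      # 8, 7
cxg, cyg = (gx1 + gx2) / 2.0, (gy1 + gy2) / 2.0   # 9, 8

tx = (cxg - cx) / w                      # 0.125
ty = (cyg - cy) / h                      # ~0.167
tw = np.log((gx2 - gx1) / float(w))      # log(1.0) = 0
th = np.log((gy2 - gy1) / float(h))      # log(8/6) ~ 0.288

# place the scaled targets into this class's 4-slot block ('person' -> slots 0..3)
num_fg = len(class_mapping) - 1
coords = [0.0] * 4 * num_fg
labels = [0] * 4 * num_fg
pos = 4 * class_mapping['person']
sx, sy, sw, sh = classifier_regr_std
coords[pos:pos + 4] = [sx * tx, sy * ty, sw * tw, sh * th]
labels[pos:pos + 4] = [1, 1, 1, 1]

print(labels)   # [1, 1, 1, 1, 0, 0, 0, 0]
print(coords)   # [1.0, 1.333..., 0.0, 1.15...] followed by four zeros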