。因此这里使用其他的距离度量公式。聚类的目的是让anchor boxes和临近的ground truth有更大的IOU值,这和anchor box的尺寸没有直接关系。自定义的距离度量公式: $d(\text{box}, \text{centroid}) = 1 - \mathrm{IOU}(\text{box}, \text{centroid})$
到聚类中心的距离越小越好,但IOU值是越大越好,所以使用 1 - IOU,这样就保证距离越小,IOU值越大。
代码实现主要参考 AlexeyAB/darknet 中的 scripts/gen_anchors.py,这里根据yolov2、yolov3的版本不同进行了部分修改。yolov2的配置文件yolov2.cfg需要的anchors是相对特征图的,值很小,基本都小于13;yolov3的配置文件yolov3.cfg需要的9个anchors(3个尺度,每个尺度3个)是相对于原图来说的,相对都比较大。另外,输入图片的大小(32的倍数)对输出也是有影响的。
yolov2.cfg中[region] anchors = 0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828
yolov3.cfg中[yolo] anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
from os import listdir
from os.path import isfile, join
import argparse
# import cv2
import numpy as np
import sys
import os
import shutil
import random
import math


def IOU(x, centroids):
    """Return the IoU of one ground-truth box against every anchor.

    Only widths and heights are compared: each pair of boxes is treated as
    if it were aligned on the same corner, so the intersection is simply
    ``min(w, c_w) * min(h, c_h)``.

    :param x: (w, h) of a single ground-truth box.
    :param centroids: the k anchor sizes, array-like of shape (k, 2).
    :return: numpy array of shape (k,) holding the IoU with each anchor.
    """
    w, h = x
    anchors = np.asarray(centroids, dtype=float).reshape(-1, 2)
    intersection = np.minimum(w, anchors[:, 0]) * np.minimum(h, anchors[:, 1])
    union = w * h + anchors[:, 0] * anchors[:, 1] - intersection
    return intersection / union


def avg_IOU(X, centroids):
    """Return the mean, over all boxes, of the best IoU any anchor achieves.

    :param X: ground-truth box sizes, array-like of shape (n, 2).
    :param centroids: the k anchor sizes, array-like of shape (k, 2).
    :return: average of ``max(IOU(box, centroids))`` over all boxes.
    :raises ValueError: if ``X`` contains no boxes.
    """
    boxes = np.asarray(X, dtype=float)
    count = boxes.shape[0]
    if count == 0:
        raise ValueError("avg_IOU needs at least one ground-truth box")
    return sum(np.max(IOU(box, centroids)) for box in boxes) / count


def write_anchors_to_file(centroids, X, anchor_file, input_shape, yolo_version):
    """Scale the anchors for the chosen YOLO version and write them to a file.

    The file gets two lines: the anchors sorted by width, formatted as
    ``w,h, w,h, ...``, followed by the average IoU of the unscaled anchors.

    :param centroids: the k anchor sizes, array-like of shape (k, 2).
    :param X: ground-truth box sizes, array-like of shape (n, 2).
    :param anchor_file: output path for the anchors and the average IoU.
    :param input_shape: network input size (a multiple of 32, e.g. 416).
    :param yolo_version: 'yolov2' (anchors are multiplied by input_shape/32,
        i.e. expressed in feature-map cells) or 'yolov3' (anchors are
        multiplied by input_shape, i.e. expressed in input pixels).
    """
    # Validate the version before touching the filesystem so that a typo does
    # not leave an empty anchor file behind.
    if yolo_version == 'yolov2':
        # The network downsamples by 32; adjust the divisor if the
        # architecture uses a different total stride.
        scale = input_shape / 32.
    elif yolo_version == 'yolov3':
        scale = input_shape
    else:
        print("the yolo version is not right!")
        sys.exit(-1)

    anchors = np.asarray(centroids, dtype=float) * scale
    print(anchors.shape)

    anchors = anchors[np.argsort(anchors[:, 0])]
    print('Anchors = ', anchors)

    with open(anchor_file, 'w') as f:
        # No trailing comma after the last anchor.
        f.write(', '.join('%0.2f,%0.2f' % (w, h) for w, h in anchors) + '\n')
        f.write('%f\n' % (avg_IOU(X, centroids)))
    print()


def kmeans(X, centroids, eps, anchor_file, input_shape, yolo_version):
    """Cluster box sizes with k-means using ``1 - IoU`` as the distance.

    Iterates until the box-to-anchor assignment stops changing, then writes
    the result with :func:`write_anchors_to_file`.

    :param X: ground-truth box sizes, array of shape (n, 2).
    :param centroids: initial anchor sizes, array of shape (k, 2). A float
        ndarray passed here is updated in place.
    :param eps: unused; kept so existing callers keep working.
    :param anchor_file: output path handed to ``write_anchors_to_file``.
    :param input_shape: network input size handed to ``write_anchors_to_file``.
    :param yolo_version: 'yolov2' or 'yolov3'.
    :raises ValueError: if there are no boxes or no centroids.
    """
    X = np.asarray(X, dtype=float)
    centroids = np.asarray(centroids, dtype=float)
    N = X.shape[0]  # number of ground-truth boxes
    print("centroids.shape", centroids)
    k, dim = centroids.shape  # number of anchors and (w, h) dimensions
    if N == 0 or k == 0:
        raise ValueError("kmeans needs at least one box and one centroid")

    prev_assignments = np.full(N, -1)  # no box is assigned yet
    iteration = 0
    old_D = np.zeros((N, k))

    while True:
        iteration += 1
        # D[i, j] is the distance (1 - IoU) of box i to anchor j.
        D = np.array([1 - IOU(box, centroids) for box in X])
        print("iter {}: dists = {}".format(iteration, np.sum(np.abs(old_D - D))))

        # Assign every box to its closest anchor.
        assignments = np.argmin(D, axis=1)

        if (assignments == prev_assignments).all():
            # Assignments are stable: the clustering has converged.
            print("Centroids = ", centroids)
            write_anchors_to_file(centroids, X, anchor_file, input_shape, yolo_version)
            return

        # Move each anchor to the mean size of its members. An empty cluster
        # keeps its previous anchor instead of dividing by zero (the former
        # "+ 1" in the divisor avoided the crash but shrank every anchor).
        for j in range(k):
            members = X[assignments == j]
            if len(members) > 0:
                centroids[j] = members.mean(axis=0)

        prev_assignments = assignments.copy()
        old_D = D.copy()


def _load_annotation_dims(filelist):
    """Collect the (w, h) of every labelled box referenced by ``filelist``."""
    with open(filelist) as list_file:
        image_paths = [line.strip() for line in list_file]

    dims = []
    for image_path in image_paths:
        if not image_path:
            continue
        # The label file path is derived from the image path.
        label_path = image_path.replace('JPEGImages', 'labels')
        label_path = label_path.replace('.jpg', '.txt')
        label_path = label_path.replace('.png', '.txt')
        print(label_path)
        with open(label_path) as label_file:
            for row in label_file:
                fields = row.split()
                if not fields:
                    continue
                if len(fields) < 5:
                    raise ValueError(
                        "malformed label row in %s: %r" % (label_path, row))
                # Fields 3 and 4 are read as width and height; presumably the
                # rows are darknet labels "class x y w h" - verify for your data.
                dims.append((float(fields[3]), float(fields[4])))
    return np.array(dims, dtype=float).reshape(-1, 2)


def _pick_initial_centroids(boxes, num_clusters):
    """Pick ``num_clusters`` random boxes as seeds, distinct when possible."""
    total = boxes.shape[0]
    if num_clusters <= total:
        indices = random.sample(range(total), num_clusters)
    else:
        indices = [random.randrange(total) for _ in range(num_clusters)]
    return boxes[indices]


def main(argv):
    """Parse the command line and write anchor files for the requested k.

    :param argv: full argument vector including the program name
        (normally ``sys.argv``).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-filelist', default=r'train.txt',
                        help='path to filelist\n')
    parser.add_argument('-output_dir', default=r'output/', type=str,
                        help='Output anchor directory\n')
    parser.add_argument('-num_clusters', default=0, type=int,
                        help='number of clusters\n')
    # yolov2 anchors are relative to the feature map (small values) while
    # yolov3 anchors are relative to the input image (large values), so the
    # two versions produce different outputs.
    parser.add_argument('-yolo_version', default='yolov2', type=str,
                        help='yolov2 or yolov3\n')
    parser.add_argument('-yolo_input_shape', default=416, type=int,
                        help='input images shape,multiples of 32. etc. 416*416\n')
    args = parser.parse_args(argv[1:] if argv else None)

    if args.num_clusters < 0:
        parser.error('-num_clusters must be >= 0')

    os.makedirs(args.output_dir, exist_ok=True)

    annotation_dims = _load_annotation_dims(args.filelist)
    if annotation_dims.shape[0] == 0:
        sys.exit('no ground-truth boxes found via %s' % args.filelist)

    eps = 0.005

    # 0 means "try every k from 1 through 10".
    if args.num_clusters == 0:
        cluster_counts = range(1, 11)
    else:
        cluster_counts = [args.num_clusters]

    for num_clusters in cluster_counts:
        anchor_file = join(args.output_dir, 'anchors%d.txt' % (num_clusters))
        centroids = _pick_initial_centroids(annotation_dims, num_clusters)
        kmeans(annotation_dims, centroids, eps, anchor_file,
               args.yolo_input_shape, args.yolo_version)
        print('centroids.shape', centroids.shape)


if __name__ == "__main__":
    main(sys.argv)
(2)很重要的一点: 获取anchors后,给anchor打标。
具体的代码可以看下面: (YOLO V3中)
def preprocess_true_boxes(true_boxes, true_labels, input_shape, anchors, num_classes):
    """Encode ground-truth boxes into the three YOLOv3 training targets.

    Every box with positive width is matched to the anchor whose shape gives
    the highest IoU (both centred on the origin) and written into the grid
    cell containing the box centre, on the scale that owns that anchor.

    Parameters
    ----------
    true_boxes : array-like of shape [T, 4]
        One row per box: x_min, y_min, x_max, y_max in input-image pixels.
        The argument is not modified.
    true_labels : sequence of length T
        Integer class id of each box.
    input_shape : sequence of two ints
        (height, width) of the network input, e.g. [416, 416].
    anchors : array-like of shape [9, 2]
        Anchor (width, height) pairs.
    num_classes : int
        Number of classes, e.g. 80 for COCO.

    Returns
    -------
    (y_true_13, y_true_26, y_true_52) : tuple of float32 arrays
        Shapes [H/32, W/32, 3, 5+num_classes], [H/16, W/16, 3, 5+num_classes]
        and [H/8, W/8, 3, 5+num_classes]. The last axis holds
        center_x, center_y, width, height (pixels), objectness, one-hot class.
    """
    input_shape = np.array(input_shape, dtype=np.int32)
    anchors = np.asarray(anchors, dtype=float)
    num_layers = len(anchors) // 3
    anchor_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]] if num_layers == 3 else [[3, 4, 5], [1, 2, 3]]
    grid_sizes = [input_shape // 32, input_shape // 16, input_shape // 8]

    # Work on a private float copy so the caller's array is left untouched.
    corners = np.array(true_boxes, dtype=float).reshape(-1, 4)
    box_centers = (corners[:, 0:2] + corners[:, 2:4]) / 2  # (x, y) of each centre
    box_sizes = corners[:, 2:4] - corners[:, 0:2]          # (w, h) of each box
    encoded = np.concatenate([box_centers, box_sizes], axis=-1)

    y_true = [
        np.zeros(shape=[grid[0], grid[1], 3, 5 + num_classes], dtype=np.float32)
        for grid in grid_sizes
    ]

    anchors_max = anchors / 2.
    anchors_min = -anchors_max

    # Skip zero-width rows (e.g. padding) but remember the original row index
    # of every kept box so boxes and labels stay aligned.
    valid_rows = np.flatnonzero(box_sizes[:, 0] > 0)
    wh = np.expand_dims(box_sizes[valid_rows], -2)  # [V, 1, 2] broadcasts against [9, 2]

    # Centre boxes and anchors on the origin and compute their IoU.
    boxes_max = wh / 2.
    boxes_min = -boxes_max
    intersect_mins = np.maximum(boxes_min, anchors_min)
    intersect_maxs = np.minimum(boxes_max, anchors_max)
    intersect_wh = np.maximum(intersect_maxs - intersect_mins, 0.)
    intersect_area = intersect_wh[..., 0] * intersect_wh[..., 1]
    box_area = wh[..., 0] * wh[..., 1]
    anchor_area = anchors[:, 0] * anchors[:, 1]
    iou = intersect_area / (box_area + anchor_area - intersect_area)

    # Best anchor for each kept box.
    best_anchor = np.argmax(iou, axis=-1)

    image_height, image_width = input_shape
    for t, n in zip(valid_rows, best_anchor):
        for l in range(num_layers):
            if n not in anchor_mask[l]:
                continue
            # Normalise the centre by the image size, then scale to the grid:
            # the row index comes from y / height, the column from x / width.
            i = int(np.floor(encoded[t, 1] / image_height * grid_sizes[l][0]))
            j = int(np.floor(encoded[t, 0] / image_width * grid_sizes[l][1]))
            k = anchor_mask[l].index(n)
            c = int(true_labels[t])

            y_true[l][i, j, k, 0:4] = encoded[t, 0:4]
            y_true[l][i, j, k, 4] = 1
            y_true[l][i, j, k, 5 + c] = 1

    return y_true[0], y_true[1], y_true[2]
def preprocess_true_boxes(true_boxes, anchors, image_size):
    """Encode ground-truth boxes into YOLOv2 training targets.

    Parameters
    ----------
    true_boxes : array of shape [num_boxes, 5]
        One row per box: [x, y, w, h, class]. x, y are the box centre and
        w, h its size, all divided by the image resolution (range [0, 1]).
    anchors : array-like of shape [num_anchors, 2]
        Anchor [w, h] pairs expressed in grid cells, e.g.
        [1.08, 1.19], [3.42, 4.41], [6.63, 11.38], [9.42, 5.11], [16.62, 10.52].
    image_size : (height, width)
        Input image size; both must be multiples of 32 (e.g. 416 x 416).

    Returns
    -------
    detectors_mask : float32 array, shape [H/32, W/32, num_anchors, 1]
        Indexed by [grid row, grid column, anchor]. 1 where that anchor in
        that cell is responsible for a ground-truth box, 0 elsewhere.
    matching_true_boxes : float32 array, shape [H/32, W/32, num_anchors, 5]
        For every responsible anchor: [x, y, w, h, class], where x, y are the
        centre offsets inside the grid cell, w, h are
        log(box size / anchor size) and class is the class index.
    """
    height, width = image_size
    anchors = np.asarray(anchors, dtype=float).reshape(-1, 2)
    num_anchors = len(anchors)

    assert height % 32 == 0, '输入的图片的高度必须是32的倍数,不然会报错。'
    assert width % 32 == 0, '输入的图片的宽度必须是32的倍数,不然会报错。'

    # Split the image into grid cells (the network downsamples by 32).
    conv_height = height // 32
    conv_width = width // 32

    true_boxes = np.asarray(true_boxes, dtype=float)
    num_box_params = true_boxes.shape[1]

    # Both outputs start zero-filled.
    detectors_mask = np.zeros(
        (conv_height, conv_width, num_anchors, 1), dtype=np.float32)
    matching_true_boxes = np.zeros(
        (conv_height, conv_width, num_anchors, num_box_params),
        dtype=np.float32)

    grid_scale = np.array([conv_width, conv_height, conv_width, conv_height])
    anchor_areas = anchors[:, 0] * anchors[:, 1]

    for box in true_boxes:
        # Class index, taken as a scalar so the target row below stays a
        # flat 5-element vector.
        box_class = box[4]
        # Convert x, y, w, h from image fractions to grid-cell units.
        box = box[0:4] * grid_scale
        i = int(np.floor(box[1]))  # grid row (y direction)
        j = int(np.floor(box[0]))  # grid column (x direction)

        if num_anchors == 0:
            continue

        # IoU of the box against every anchor, all centred on the origin, so
        # the overlap is just the smaller width times the smaller height.
        intersect_wh = np.maximum(np.minimum(box[2:4], anchors), 0.)
        intersect_area = intersect_wh[:, 0] * intersect_wh[:, 1]
        box_area = box[2] * box[3]
        ious = intersect_area / (box_area + anchor_areas - intersect_area)

        # First anchor with the highest IoU.
        best_anchor = int(np.argmax(ious))
        if ious[best_anchor] > 0:
            detectors_mask[i, j, best_anchor] = 1
            adjusted_box = np.array(
                [
                    # Centre offset inside the cell: [0, 0] is the cell's
                    # top-left corner, [1, 1] its bottom-right corner.
                    box[0] - j,
                    box[1] - i,
                    # Log of the box-to-anchor size ratio.
                    np.log(box[2] / anchors[best_anchor][0]),
                    np.log(box[3] / anchors[best_anchor][1]),
                    # Class index of the object.
                    box_class,
                ],
                dtype=np.float32)
            matching_true_boxes[i, j, best_anchor] = adjusted_box

    return detectors_mask, matching_true_boxes
每个anchor的预测的维度为 (4+1+num_class)。也就是说每个anchor的预测包括xywh、confidence、class。这些原始输出并不是最终的预测框,要得到最终的预测结果,还需要进行一些转换。
(1)对于预测的bbox的中心,需要先用sigmoid压缩到0-1之间,再加上所在grid cell在x和y方向上的偏移(即该cell左上角的坐标 $c_x, c_y$):$b_x = \sigma(t_x) + c_x$,$b_y = \sigma(t_y) + c_y$。这一点,和yolo v1是一致的。
(2)对于预测的bbox的宽高,这个和faster RCNN一样,是相对于anchor宽高的一个放缩:$b_w = p_w e^{t_w}$,$b_h = p_h e^{t_h}$,其中 $p_w, p_h$ 是anchor的宽高,exp(t_w)和exp(t_h)分别对应了宽高的放缩因子。
以上得到的结果,实际上还不是最终的预测结果,以上得到的center_x,center_y,w,h都是在Grid这个尺度上做的,所以要乘上 步长32 就可以得到在原图尺度上的预测结果。 在yolov3中,步长有:32、16、8。
(1) https://zhuanlan.zhihu.com/p/40659490
(2)YOLOv2、v3使用K-means聚类计算anchor boxes的具体方法: http://www.pianshen.com/article/513490257/
