diff --git a/retinaface/Makefile b/RetinaFace/Makefile
similarity index 100%
rename from retinaface/Makefile
rename to RetinaFace/Makefile
diff --git a/retinaface/README.md b/RetinaFace/README.md
similarity index 100%
rename from retinaface/README.md
rename to RetinaFace/README.md
diff --git a/retinaface/rcnn/cython/__init__.py b/RetinaFace/rcnn/PY_OP/__init__.py
old mode 100644
new mode 100755
similarity index 100%
rename from retinaface/rcnn/cython/__init__.py
rename to RetinaFace/rcnn/PY_OP/__init__.py
diff --git a/RetinaFace/rcnn/PY_OP/rpn_fpn_ohem3.py b/RetinaFace/rcnn/PY_OP/rpn_fpn_ohem3.py
new file mode 100644
index 0000000..ad040a3
--- /dev/null
+++ b/RetinaFace/rcnn/PY_OP/rpn_fpn_ohem3.py
@@ -0,0 +1,158 @@
+
+from __future__ import print_function
+import sys
+import mxnet as mx
+import numpy as np
+from distutils.util import strtobool
+from ..config import config, generate_config
+
+
+# Running statistics shared by all operator instances in this process:
+# STAT[0] counts processed samples, STAT[stride] holds
+# [anchor budget, kept foreground, kept foreground with fg score >= bg score].
+STAT = {0:0}
+# Number of processed samples between two statistics reports.
+STEP = 28800
+
+class RPNFPNOHEM3Operator(mx.operator.CustomOp):
+    """Custom op that sub-samples RPN anchor labels for one feature stride.
+
+    Per image, foreground anchors (label > 0) are capped at
+    RPN_FG_FRACTION * RPN_BATCH_SIZE and background anchors (label == 0)
+    at what is left of RPN_BATCH_SIZE; every dropped anchor is relabelled
+    -1. Mode 0 keeps the hardest anchors by score (OHEM), mode >= 1 drops
+    the surplus at random.
+    """
+
+    def __init__(self, stride=0, network='', dataset='', prefix=''):
+        super(RPNFPNOHEM3Operator, self).__init__()
+        self.stride = int(stride)
+        self.prefix = prefix
+        generate_config(network, dataset)
+        # Only the 'pbox' network uses random sampling; all others mine.
+        self.mode = 2 if network=='pbox' else 0
+        # (Re)start the per-stride counters; STAT is mutated, never rebound.
+        for k in config.RPN_FEAT_STRIDE:
+            STAT[k] = [0,0,0]
+
+    def forward(self, is_train, req, in_data, out_data, aux):
+        """Write labels_ohem, anchor_weight and valid_count to out_data.
+
+        in_data[0] is the class score and in_data[1] the raw labels; the
+        sampling below edits the NumPy copy of the labels in place.
+        """
+        cls_score = in_data[0].asnumpy() #BS, 2, ANCHORS
+        labels_raw = in_data[1].asnumpy() # BS, ANCHORS
+
+        A = config.NUM_ANCHORS
+        anchor_weight = np.zeros( (labels_raw.shape[0], labels_raw.shape[1],1), dtype=np.float32 )
+        valid_count = np.zeros( (labels_raw.shape[0],1), dtype=np.float32 )
+
+        # range instead of xrange: xrange raises NameError on Python 3.
+        for ibatch in range(labels_raw.shape[0]):
+            _anchor_weight = np.zeros( (labels_raw.shape[1],1), dtype=np.float32)
+            # Row view: writes below also update labels_raw.
+            labels = labels_raw[ibatch]
+            # Foreground margin: fg score minus bg score per anchor.
+            fg_score = cls_score[ibatch,1,:] - cls_score[ibatch,0,:]
+
+            fg_inds = np.where(labels>0)[0]
+            num_fg = int(config.TRAIN.RPN_FG_FRACTION * config.TRAIN.RPN_BATCH_SIZE)
+            if len(fg_inds) > num_fg:
+                if self.mode>=1:
+                    disable_inds = np.random.choice(fg_inds, size=(len(fg_inds) - num_fg), replace=False)
+                    labels[disable_inds] = -1
+                else:
+                    # Keep the num_fg positives with the lowest margin.
+                    pos_ohem_scores = fg_score[fg_inds]
+                    order_pos_ohem_scores = pos_ohem_scores.ravel().argsort()
+                    sampled_inds = fg_inds[order_pos_ohem_scores[:num_fg]]
+                    labels[fg_inds] = -1
+                    labels[sampled_inds] = 1
+
+            n_fg = np.sum(labels>0)
+            fg_inds = np.where(labels>0)[0]
+            num_bg = config.TRAIN.RPN_BATCH_SIZE - n_fg
+
+            bg_inds = np.where(labels == 0)[0]
+            if num_bg==0:
+                labels[bg_inds] = -1
+            elif len(bg_inds) > num_bg:
+                if self.mode>=1:
+                    disable_inds = np.random.choice(bg_inds, size=(len(bg_inds) - num_bg), replace=False)
+                    labels[disable_inds] = -1
+                else:
+                    # Keep the num_bg negatives with the highest margin.
+                    neg_ohem_scores = fg_score[bg_inds]
+                    order_neg_ohem_scores = neg_ohem_scores.ravel().argsort()[::-1]
+                    sampled_inds = bg_inds[order_neg_ohem_scores[:num_bg]]
+                    labels[bg_inds] = -1
+                    labels[sampled_inds] = 0
+
+            if n_fg>0:
+                # Reorder labels from (A, positions) to (positions, A) before
+                # locating the foreground rows of the weight column.
+                order0_labels = labels.reshape( (1, A, -1) ).transpose( (0, 2, 1) ).reshape( (-1,) )
+                bbox_fg_inds = np.where(order0_labels>0)[0]
+                _anchor_weight[bbox_fg_inds,:] = 1.0
+            anchor_weight[ibatch] = _anchor_weight
+            valid_count[ibatch][0] = n_fg
+
+            if self.prefix=='face':
+                STAT[0]+=1
+                STAT[self.stride][0] += config.TRAIN.RPN_BATCH_SIZE
+                STAT[self.stride][1] += n_fg
+                STAT[self.stride][2] += np.sum(fg_score[fg_inds]>=0)
+                if STAT[0]%STEP==0:
+                    _str = ['STAT']
+                    STAT[0] = 0
+                    for k in config.RPN_FEAT_STRIDE:
+                        # A stride without foreground would divide by zero.
+                        acc = float(STAT[k][2])/STAT[k][1] if STAT[k][1]>0 else 0.0
+                        _str.append("%d: (%d, %d, %.4f)"%(k, STAT[k][1], STAT[k][2], acc))
+                        STAT[k] = [0,0,0]
+                    _str = ' | '.join(_str)
+                    print(_str, file=sys.stderr)
+
+        for ind, val in enumerate([labels_raw, anchor_weight, valid_count]):
+            val = mx.nd.array(val)
+            self.assign(out_data[ind], req[ind], val)
+
+    def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
+        """Assign a zero gradient to every input; sampling is not differentiated."""
+        for i in range(len(in_grad)):
+            self.assign(in_grad[i], req[i], 0)
+
+
+@mx.operator.register('rpn_fpn_ohem3')
+class RPNFPNOHEM3Prop(mx.operator.CustomOpProp):
+    """Operator properties registered under the name 'rpn_fpn_ohem3'."""
+
+    def __init__(self, stride=0, network='', dataset='', prefix=''):
+        super(RPNFPNOHEM3Prop, self).__init__(need_top_grad=False)
+        self.stride = stride
+        self.network=network
+        self.dataset=dataset
+        self.prefix = prefix
+
+    def list_arguments(self):
+        return ['cls_score', 'labels']
+
+    def list_outputs(self):
+        return ['labels_ohem', 'anchor_weight', 'valid_count']
+
+    def infer_shape(self, in_shape):
+        """Return input shapes unchanged plus the three output shapes."""
+        labels_shape = in_shape[1]
+        anchor_weight_shape = [labels_shape[0], labels_shape[1], 1]
+
+        return in_shape, \
+               [labels_shape, anchor_weight_shape, [labels_shape[0], 1]]
+
+    def create_operator(self, ctx, shapes, dtypes):
+        return RPNFPNOHEM3Operator(self.stride, self.network, self.dataset, self.prefix)
+
+    def declare_backward_dependency(self, out_grad, in_data, out_data):
+        return []
+
+
diff --git a/retinaface/rcnn/processing/__init__.py b/RetinaFace/rcnn/__init__.py
similarity index 100%
rename from retinaface/rcnn/processing/__init__.py
rename to RetinaFace/rcnn/__init__.py
diff --git a/RetinaFace/rcnn/config.py b/RetinaFace/rcnn/config.py
new file mode 100644
index 0000000..88f195f
--- /dev/null
+++ b/RetinaFace/rcnn/config.py
@@ -0,0 +1,302 @@
+import numpy as np
+from easydict import EasyDict as edict
+
+config = edict()
+
+# network related params
+config.PIXEL_MEANS = np.array([103.939, 116.779, 123.68])
+config.PIXEL_STDS = np.array([1.0, 1.0, 1.0])
+config.PIXEL_SCALE = 1.0
+config.IMAGE_STRIDE = 0
+
+# dataset related params
+config.NUM_CLASSES = 2
+config.PRE_SCALES = [(1200, 1600)] # first is scale (the shorter side); second is max size
+config.SCALES = [(640, 640)] # first is scale (the shorter side); second is max size
+#config.SCALES = [(800, 800)] # first is scale (the shorter side); second is max size
+config.ORIGIN_SCALE = False
+
+_ratio = (1.,)
+
+RAC_SSH = {
+    '32': {'SCALES': (32,16), 'BASE_SIZE': 16, 'RATIOS': _ratio, 'ALLOWED_BORDER': 9999},
+    '16': {'SCALES': (8,4), 'BASE_SIZE': 16, 'RATIOS': _ratio, 'ALLOWED_BORDER': 9999},
+    '8': {'SCALES': (2,1), 'BASE_SIZE': 16, 'RATIOS': _ratio, 'ALLOWED_BORDER': 9999},
+}
+
+_ratio = (1.,1.5)
+RAC_SSH2 = {
+    '32': {'SCALES': (32,16), 'BASE_SIZE': 16, 'RATIOS': _ratio,
'ALLOWED_BORDER': 9999},
+    '16': {'SCALES': (8,4), 'BASE_SIZE': 16, 'RATIOS': _ratio, 'ALLOWED_BORDER': 9999},
+    '8': {'SCALES': (2,1), 'BASE_SIZE': 16, 'RATIOS': _ratio, 'ALLOWED_BORDER': 9999},
+}
+
+_ratio = (1.,1.5)
+RAC_SSH3 = {
+    '32': {'SCALES': (32,16), 'BASE_SIZE': 16, 'RATIOS': _ratio, 'ALLOWED_BORDER': 9999},
+    '16': {'SCALES': (8,4), 'BASE_SIZE': 16, 'RATIOS': _ratio, 'ALLOWED_BORDER': 9999},
+    '8': {'SCALES': (2,1), 'BASE_SIZE': 16, 'RATIOS': _ratio, 'ALLOWED_BORDER': 9999},
+    '4': {'SCALES': (2,1), 'BASE_SIZE': 16, 'RATIOS': _ratio, 'ALLOWED_BORDER': 9999},
+}
+
+RAC_RETINA = {}
+_ratios = (1.0,)
+_ass = 2.0**(1.0/3)
+_basescale = 1.0
+for _stride in [4, 8, 16, 32, 64]:  # one anchor entry per stride, keyed by str(stride)
+    key = str(_stride)
+    value = {'BASE_SIZE': 16, 'RATIOS': _ratios, 'ALLOWED_BORDER': 9999}
+    scales = []
+    for _ in range(3):
+        scales.append(_basescale)
+        _basescale *= _ass  # not reset per stride, so the scale keeps growing across strides
+    value['SCALES'] = tuple(scales)
+    RAC_RETINA[key] = value
+
+
+config.RPN_ANCHOR_CFG = RAC_SSH #default
+
+config.NET_MODE = 2
+config.HEAD_MODULE = 'SSH'
+#config.HEAD_MODULE = 'RF'
+config.LR_MODE = 0
+config.LANDMARK_LR_MULT = 2.0
+config.HEAD_FILTER_NUM = 256
+config.CONTEXT_FILTER_RATIO = 1
+config.max_feat_channel = 9999
+
+config.USE_CROP = True
+config.USE_DCN = 0
+config.FACE_LANDMARK = True
+config.USE_OCCLUSION = False
+config.USE_BLUR = False
+config.MORE_SMALL_BOX = True
+
+config.LAYER_FIX = False
+
+config.HEAD_BOX = False
+config.DENSE_ANCHOR = False
+config.USE_MAXOUT = 0
+config.SHARE_WEIGHT_BBOX = False
+config.SHARE_WEIGHT_LANDMARK = False
+
+config.RANDOM_FEAT_STRIDE = False
+config.NUM_CPU = 4
+config.MIXUP = 0.0
+config.USE_3D = False
+
+#config.BBOX_MASK_THRESH = 0
+config.COLOR_MODE = 2
+config.COLOR_JITTERING = 0.125
+#config.COLOR_JITTERING = 0
+#config.COLOR_JITTERING = 0.2
+
+
+config.TRAIN = edict()
+
+config.TRAIN.IMAGE_ALIGN = 0
+config.TRAIN.MIN_BOX_SIZE = 0
+config.BBOX_MASK_THRESH = config.TRAIN.MIN_BOX_SIZE
+# R-CNN and RPN
+# size of images for each device, 2 for rcnn, 1 for rpn and e2e
+config.TRAIN.BATCH_IMAGES = 8
+# e2e changes behavior of anchor loader and metric
+config.TRAIN.END2END = True
+# group images with similar aspect ratio
+config.TRAIN.ASPECT_GROUPING = False
+
+# RPN anchor loader
+# rpn anchors batch size
+config.TRAIN.RPN_ENABLE_OHEM = 2
+config.TRAIN.RPN_BATCH_SIZE = 256
+# rpn anchors sampling params
+config.TRAIN.RPN_FG_FRACTION = 0.25
+config.TRAIN.RPN_POSITIVE_OVERLAP = 0.5
+config.TRAIN.RPN_NEGATIVE_OVERLAP = 0.3
+config.TRAIN.RPN_CLOBBER_POSITIVES = False
+config.TRAIN.RPN_FORCE_POSITIVE = False
+# rpn bounding box regression params
+#config.TRAIN.RPN_BBOX_WEIGHTS = (1.0, 1.0, 1.0, 1.0)
+#config.TRAIN.RPN_POSITIVE_WEIGHT = -1.0
+#config.TRAIN.RPN_LANDMARK_WEIGHTS = (1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0)
+#config.TRAIN.RPN_INVALID_LANDMARK_WEIGHTS = (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
+
+# used for end2end training
+# RPN proposal
+#config.TRAIN.CXX_PROPOSAL = True
+#config.TRAIN.RPN_NMS_THRESH = 0.7
+#config.TRAIN.RPN_PRE_NMS_TOP_N = 12000
+#config.TRAIN.RPN_POST_NMS_TOP_N = 2000
+#config.TRAIN.RPN_MIN_SIZE = config.RPN_FEAT_STRIDE
+#config.TRAIN.BBOX_NORMALIZATION_PRECOMPUTED = True
+#config.TRAIN.BBOX_MEANS = (0.0, 0.0, 0.0, 0.0)
+#config.TRAIN.BBOX_STDS = (0.1, 0.1, 0.2, 0.2)
+
+config.TEST = edict()
+
+# R-CNN testing
+# use rpn to generate proposal
+config.TEST.HAS_RPN = False
+# size of images for each device
+config.TEST.BATCH_IMAGES = 1
+
+# RPN proposal
+config.TEST.CXX_PROPOSAL = True
+config.TEST.RPN_NMS_THRESH = 0.3
+config.TEST.RPN_PRE_NMS_TOP_N = 1000
+config.TEST.RPN_POST_NMS_TOP_N = 3000
+#config.TEST.RPN_MIN_SIZE = config.RPN_FEAT_STRIDE
+#config.TEST.RPN_MIN_SIZE = [0,0,0]
+
+# RCNN nms
+config.TEST.NMS = 0.3
+
+config.TEST.SCORE_THRESH = 0.05
+config.TEST.IOU_THRESH = 0.5
+
+
+# network settings
+network = edict()
+
+network.ssh = edict()
+
+network.mnet = edict()
+#network.mnet.pretrained = 'model/mnasnet'
+#network.mnet.pretrained = 'model/mobilenetv2_0_5'
+#network.mnet.pretrained = 'model/mobilenet_0_5'
+#network.mnet.MULTIPLIER = 0.5
+#network.mnet.pretrained = 'model/mobilenet_0_25'
+#network.mnet.pretrained_epoch = 0
+#network.mnet.PIXEL_MEANS = np.array([0.406, 0.456, 0.485])
+#network.mnet.PIXEL_STDS = np.array([0.225, 0.224, 0.229])
+#network.mnet.PIXEL_SCALE = 255.0
+network.mnet.FIXED_PARAMS = ['^stage1', '^.*upsampling']
+network.mnet.BATCH_IMAGES = 16
+network.mnet.HEAD_FILTER_NUM = 64
+network.mnet.CONTEXT_FILTER_RATIO = 1
+
+network.mnet.PIXEL_MEANS = np.array([0.0, 0.0, 0.0])
+network.mnet.PIXEL_STDS = np.array([1.0, 1.0, 1.0])
+network.mnet.PIXEL_SCALE = 1.0
+#network.mnet.pretrained = 'model/mobilenetfd_0_25' #78
+#network.mnet.pretrained = 'model/mobilenetfd2' #75
+network.mnet.pretrained = 'model/mobilenet025fd0' #78
+#network.mnet.pretrained = 'model/mobilenet025fd1' #75
+#network.mnet.pretrained = 'model/mobilenet025fd2' #
+network.mnet.pretrained_epoch = 0
+network.mnet.max_feat_channel = 8888
+network.mnet.COLOR_MODE = 1
+network.mnet.USE_CROP = True
+network.mnet.RPN_ANCHOR_CFG = RAC_SSH
+network.mnet.LAYER_FIX = True
+network.mnet.LANDMARK_LR_MULT = 2.5
+
+
+network.resnet = edict()
+#network.resnet.pretrained = 'model/ResNet50_v1d'
+#network.resnet.pretrained = 'model/resnet-50'
+network.resnet.pretrained = 'model/resnet-152'
+#network.resnet.pretrained = 'model/senet154'
+#network.resnet.pretrained = 'model/densenet161'
+network.resnet.pretrained_epoch = 0
+#network.mnet.PIXEL_MEANS = np.array([103.939, 116.779, 123.68])
+#network.mnet.PIXEL_STDS = np.array([57.375, 57.12, 58.393])
+#network.resnet.PIXEL_MEANS = np.array([0.406, 0.456, 0.485])
+#network.resnet.PIXEL_STDS = np.array([0.225, 0.224, 0.229])
+#network.resnet.PIXEL_SCALE = 255.0
+network.resnet.lr_step = '1,2,3,4,5,55,68,80'
+network.resnet.lr = 0.001
+network.resnet.PIXEL_MEANS = np.array([0.0, 0.0, 0.0])
+network.resnet.PIXEL_STDS = np.array([1.0, 1.0, 1.0])
+network.resnet.PIXEL_SCALE = 1.0
+network.resnet.FIXED_PARAMS = ['^stage1', '^.*upsampling']
+network.resnet.BATCH_IMAGES = 8
+network.resnet.HEAD_FILTER_NUM = 256
+network.resnet.CONTEXT_FILTER_RATIO = 1
+network.resnet.USE_DCN = 2
+network.resnet.RPN_BATCH_SIZE = 256
+network.resnet.RPN_ANCHOR_CFG = RAC_RETINA
+
+network.resnet.USE_DCN = 0  # overrides the USE_DCN = 2 assigned above
+network.resnet.pretrained = 'model/resnet-50'  # overrides 'model/resnet-152' assigned above
+network.resnet.RPN_ANCHOR_CFG = RAC_SSH  # overrides RAC_RETINA assigned above
+
+
+# dataset settings
+dataset = edict()
+
+dataset.widerface = edict()
+dataset.widerface.dataset = 'widerface'
+dataset.widerface.image_set = 'train'
+dataset.widerface.test_image_set = 'val'
+dataset.widerface.root_path = 'data'
+dataset.widerface.dataset_path = 'data/widerface'
+dataset.widerface.NUM_CLASSES = 2
+
+dataset.retinaface = edict()
+dataset.retinaface.dataset = 'retinaface'
+dataset.retinaface.image_set = 'train'
+dataset.retinaface.test_image_set = 'val'
+dataset.retinaface.root_path = 'data'
+dataset.retinaface.dataset_path = 'data/retinaface'
+dataset.retinaface.NUM_CLASSES = 2
+
+# default settings
+default = edict()
+
+config.FIXED_PARAMS = ['^conv1', '^conv2', '^conv3', '^.*upsampling']
+#config.FIXED_PARAMS = ['^.*upsampling']
+#config.FIXED_PARAMS = ['^conv1', '^conv2', '^conv3']
+#config.FIXED_PARAMS = ['^conv0', '^stage1', 'gamma', 'beta'] #for resnet
+
+# default network
+default.network = 'resnet'
+default.pretrained = 'model/resnet-152'
+#default.network = 'resnetssh'
+default.pretrained_epoch = 0
+# default dataset
+default.dataset = 'retinaface'
+default.image_set = 'train'
+default.test_image_set = 'val'
+default.root_path = 'data'
+default.dataset_path = 'data/retinaface'
+# default training
+default.frequent = 20
+default.kvstore = 'device'
+# default e2e
+default.prefix = 'model/retinaface'
+default.end_epoch = 10000
+default.lr_step = '55,68,80'
+default.lr = 0.01
+
+def generate_config(_network, _dataset):  # overlay the chosen network/dataset presets onto config/default
+    for k, v in network[_network].items():
+        if k in config:  # known top-level key: override it
+            config[k] = v
+        elif k in default:  # otherwise fall back to the default.* table
+            default[k] = v
+        if k in config.TRAIN:  # checked independently of the branches above
+            config.TRAIN[k] = v
+    for k, v in dataset[_dataset].items():  # same overlay rules for the dataset preset
+        if k in config:
+            config[k] = v
+        elif k in default:
+            default[k] = v
+        if k in config.TRAIN:
+            config.TRAIN[k] = v
+    config.network = _network
+    config.dataset = _dataset
+    config.RPN_FEAT_STRIDE = []
+    num_anchors = []
+    for k in config.RPN_ANCHOR_CFG:  # keys are stride strings such as '32'
+        config.RPN_FEAT_STRIDE.append( int(k) )
+        _num_anchors = len(config.RPN_ANCHOR_CFG[k]['SCALES'])*len(config.RPN_ANCHOR_CFG[k]['RATIOS'])
+        if config.DENSE_ANCHOR:
+            _num_anchors *= 2
+        config.RPN_ANCHOR_CFG[k]['NUM_ANCHORS'] = _num_anchors  # written back into the selected RAC_* dict
+        num_anchors.append(_num_anchors)
+    config.RPN_FEAT_STRIDE = sorted(config.RPN_FEAT_STRIDE, reverse=True)  # largest stride first
+    for j in range(1,len(num_anchors)):
+        assert num_anchors[0]==num_anchors[j]  # every stride must yield the same anchor count
+    config.NUM_ANCHORS = num_anchors[0]
+
diff --git a/RetinaFace/rcnn/core/__init__.py b/RetinaFace/rcnn/core/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/RetinaFace/rcnn/core/callback.py b/RetinaFace/rcnn/core/callback.py
new file mode 100644
index 0000000..317d5cd
--- /dev/null
+++ b/RetinaFace/rcnn/core/callback.py
@@ -0,0 +1,13 @@
+import mxnet as mx
+
+
+def do_checkpoint(prefix, means, stds):  # builds a callback that saves a checkpoint numbered iter_no + 1
+    def _callback(iter_no, sym, arg, aux):
+        if 'bbox_pred_weight' in arg:  # add temporary *_test params: weight scaled by stds, bias scaled and shifted by means
+            arg['bbox_pred_weight_test'] = (arg['bbox_pred_weight'].T * mx.nd.array(stds)).T
+            arg['bbox_pred_bias_test'] = arg['bbox_pred_bias'] * mx.nd.array(stds) + mx.nd.array(means)
+        mx.model.save_checkpoint(prefix, iter_no + 1, sym, arg, aux)
+        if 'bbox_pred_weight' in arg:  # remove the temporary entries again once saved
+            arg.pop('bbox_pred_weight_test')
+            arg.pop('bbox_pred_bias_test')
+    return _callback
diff --git a/RetinaFace/rcnn/core/loader.py b/RetinaFace/rcnn/core/loader.py
new file mode 100644
index 0000000..600420e
--- /dev/null
+++ b/RetinaFace/rcnn/core/loader.py
@@ -0,0 +1,463 @@
+from __future__ import print_function
+import sys
+import mxnet as mx
+import numpy as np
+import random
+import datetime
+import multiprocessing
+import cv2
+from mxnet.executor_manager import _split_input_slice
+
+from
rcnn.config import config
+from rcnn.io.image import tensor_vstack
+from rcnn.io.rpn import get_rpn_testbatch, get_rpn_batch, assign_anchor_fpn, get_crop_batch, AA
+
+
+class CropLoader(mx.io.DataIter):
+    def __init__(self, feat_sym, roidb, batch_size=1, shuffle=False, ctx=None, work_load_list=None,
+                 aspect_grouping=False):
+        """
+        Serve batches from get_crop_batch together with per-stride anchor labels
+        :param feat_sym: per-stride symbols; infer_shape is called on each
+        :param roidb: sequence of image records, indexed and passed to get_crop_batch
+        :param batch_size: images per batch; get_batch asserts a full batch
+        :param shuffle: reshuffle the sample order on every reset()
+        :param ctx: list of contexts, defaults to [mx.cpu()]
+        :param work_load_list: per-context work load, defaults to 1 each
+        :param aspect_grouping: stored on the instance, not read in this class
+        :return: CropLoader
+        """
+        super(CropLoader, self).__init__()
+
+        # save parameters as properties
+        self.feat_sym = feat_sym
+        self.roidb = roidb
+        self.batch_size = batch_size
+        self.shuffle = shuffle
+        self.ctx = ctx
+        if self.ctx is None:
+            self.ctx = [mx.cpu()]
+        self.work_load_list = work_load_list
+        #self.feat_stride = feat_stride
+        #self.anchor_scales = anchor_scales
+        #self.anchor_ratios = anchor_ratios
+        #self.allowed_border = allowed_border
+        self.aspect_grouping = aspect_grouping
+        self.feat_stride = config.RPN_FEAT_STRIDE
+
+        # infer properties from roidb
+        self.size = len(roidb)
+        self.index = np.arange(self.size)
+
+        # decide data and label names
+        #self.data_name = ['data']
+        #self.label_name = []
+        #self.label_name.append('label')
+        #self.label_name.append('bbox_target')
+        #self.label_name.append('bbox_weight')
+
+        self.data_name = ['data']
+        #self.label_name = ['label', 'bbox_target', 'bbox_weight']
+        self.label_name = []
+        prefixes = ['face']
+        if config.HEAD_BOX:
+            prefixes.append('head')
+        names = []
+        for prefix in prefixes:
+            names += [prefix+'_label', prefix+'_bbox_target', prefix+'_bbox_weight']
+            if prefix=='face' and config.FACE_LANDMARK:
+                names += [prefix+'_landmark_target', prefix+'_landmark_weight']
+        #names = ['label', 'bbox_weight']
+        for stride in self.feat_stride:  # label names look like '<prefix>_<kind>_stride<N>'
+            for n in names:
+                k = "%s_stride%d"%(n,stride)
+                self.label_name.append(k)
+        # status variable for synchronization between get_data and get_label
+        self.cur = 0
+        self.batch = None
+        self.data = None
+        self.label = None
+        # infer shape
+        feat_shape_list = []
+        _data_shape = [('data', (1, 3, max([v[1] for v in config.SCALES]), max([v[1] for v in config.SCALES])))]
+        _data_shape = dict(_data_shape)
+        for i in range(len(self.feat_stride)):
+            _, feat_shape, _ = self.feat_sym[i].infer_shape(**_data_shape)
+            feat_shape = [int(i) for i in feat_shape[0]]
+            feat_shape_list.append(feat_shape)
+        self.aa = AA(feat_shape_list)  # AA (from rcnn.io.rpn) receives the per-stride feature shapes once
+
+        self._debug = False
+        self._debug_id = 0
+        self._times = [0.0, 0.0, 0.0, 0.0]
+
+        # get first batch to fill in provide_data and provide_label
+        self.reset()
+        self.get_batch()
+
+    @property
+    def provide_data(self):
+        return [(k, v.shape) for k, v in zip(self.data_name, self.data)]
+
+    @property
+    def provide_label(self):
+        return [(k, v.shape) for k, v in zip(self.label_name, self.label)]
+
+    def reset(self):
+        self.cur = 0
+        if self.shuffle:
+            np.random.shuffle(self.index)
+
+    def iter_next(self):
+        return self.cur + self.batch_size <= self.size
+
+    def next(self):
+        if self.iter_next():
+            self.get_batch()
+            self.cur += self.batch_size
+            return mx.io.DataBatch(data=self.data, label=self.label,
+                                   pad=self.getpad(), index=self.getindex(),
+                                   provide_data=self.provide_data, provide_label=self.provide_label)
+        else:
+            raise StopIteration
+
+    def getindex(self):
+        return self.cur // self.batch_size  # floor division keeps the batch index an int on Python 3
+
+    def getpad(self):
+        if self.cur + self.batch_size > self.size:
+            return self.cur + self.batch_size - self.size
+        else:
+            return 0
+
+    def infer_shape(self, max_data_shape=None, max_label_shape=None):
+        """ Return maximum data and label shape for single gpu """
+        if max_data_shape is None:
+            max_data_shape = []
+        if max_label_shape is None:
+            max_label_shape = []
+        max_shapes = dict(max_data_shape + max_label_shape)
+        input_batch_size = max_shapes['data'][0]
+        dummy_boxes = np.zeros((0, 5))
+        dummy_info = [ [max_shapes['data'][2], max_shapes['data'][3], 1.0] ]
+        dummy_label = {'gt_boxes' : dummy_boxes}
+        dummy_blur = np.zeros((0,))
+        dummy_label['gt_blur'] = dummy_blur
+
+
+        label_dict = {}
+        if config.HEAD_BOX:
+            head_label_dict = self.aa.assign_anchor_fpn(dummy_label, dummy_info, False, prefix='head')
+            label_dict.update(head_label_dict)
+
+        if config.FACE_LANDMARK:
+            dummy_landmarks = np.zeros( (0,5,3) )
+            dummy_label['gt_landmarks'] = dummy_landmarks
+        face_label_dict = self.aa.assign_anchor_fpn(dummy_label, dummy_info, config.FACE_LANDMARK, prefix='face')
+        label_dict.update(face_label_dict)
+
+        label_list = []
+        for k in self.label_name:
+            label_list.append(label_dict[k])
+        label_shape = [(k, tuple([input_batch_size] + list(v.shape[1:]))) for k, v in zip(self.label_name, label_list)]
+        return max_data_shape, label_shape
+
+    def get_batch(self):
+        # slice roidb
+        cur_from = self.cur
+        cur_to = min(cur_from + self.batch_size, self.size)
+        assert cur_to==cur_from+self.batch_size
+        roidb = [self.roidb[self.index[i]] for i in range(cur_from, cur_to)]
+
+        # decide multi device slice
+        work_load_list = self.work_load_list
+        ctx = self.ctx
+        if work_load_list is None:
+            work_load_list = [1] * len(ctx)
+        assert isinstance(work_load_list, list) and len(work_load_list) == len(ctx), \
+            "Invalid settings for work load. "
+        slices = _split_input_slice(self.batch_size, work_load_list)
+
+        # get testing data for multigpu
+        data_list = []
+        label_list = []
+        for islice in slices:
+            iroidb = [roidb[i] for i in range(islice.start, islice.stop)]
+            data, label = get_crop_batch(iroidb)
+            data_list += data
+            label_list += label
+            #data_list.append(data)
+            #label_list.append(label)
+
+        # pad data first and then assign anchor (read label)
+        #data_tensor = tensor_vstack([batch['data'] for batch in data_list])
+        #for i_card in range(len(data_list)):
+        #    data_list[i_card]['data'] = data_tensor[
+        #        i_card * config.TRAIN.BATCH_IMAGES:(1 + i_card) * config.TRAIN.BATCH_IMAGES]
+
+        #iiddxx = 0
+        select_stride = 0
+        if config.RANDOM_FEAT_STRIDE:
+            select_stride = random.choice(config.RPN_FEAT_STRIDE)
+
+        for data, label in zip(data_list, label_list):
+            data_shape = {k: v.shape for k, v in data.items()}
+            del data_shape['im_info']
+            feat_shape_list = []  # NOTE(review): built per sample but never read below (self.aa is used) - candidate for removal
+            for s in range(len(self.feat_stride)):
+                _, feat_shape, _ = self.feat_sym[s].infer_shape(**data_shape)
+                feat_shape = [int(i) for i in feat_shape[0]]
+                feat_shape_list.append(feat_shape)
+            im_info = data['im_info']
+            gt_boxes = label['gt_boxes']
+            gt_label = {'gt_boxes':gt_boxes}
+            if config.USE_BLUR:
+                gt_blur = label['gt_blur']
+                gt_label['gt_blur'] = gt_blur
+            if self._debug:
+                img = data['data'].copy()[0].transpose( (1,2,0) )[:,:,::-1].copy()
+                print('DEBUG SHAPE', data['data'].shape, label['gt_boxes'].shape)
+
+                box = label['gt_boxes'].copy()[0][0:4].astype(int)  # builtin int: the np.int alias is gone in NumPy >= 1.24
+                cv2.rectangle(img, (box[0], box[1]), (box[2], box[3]), (0, 255, 0), 2)
+                filename = './debugout/%d.png' % (self._debug_id)
+                print('debug write', filename)
+                cv2.imwrite(filename, img)
+                self._debug_id+=1
+                #print('DEBUG', img.shape, bbox.shape)
+            label_dict = {}
+            if config.HEAD_BOX:
+                head_label_dict = self.aa.assign_anchor_fpn(gt_label, im_info, False, prefix='head', select_stride = select_stride)
+                label_dict.update(head_label_dict)
+            if config.FACE_LANDMARK:
+                gt_landmarks = label['gt_landmarks']
+                gt_label['gt_landmarks'] = gt_landmarks
+            #ta = datetime.datetime.now()
+            #face_label_dict = assign_anchor_fpn(feat_shape_list, gt_label, im_info, config.FACE_LANDMARK, prefix='face', select_stride = select_stride)
+            face_label_dict = self.aa.assign_anchor_fpn(gt_label, im_info, config.FACE_LANDMARK, prefix='face', select_stride = select_stride)
+            #tb = datetime.datetime.now()
+            #self._times[0] += (tb-ta).total_seconds()
+            label_dict.update(face_label_dict)
+            #print('im_info', im_info.shape)
+            #print(gt_boxes.shape)
+            for k in self.label_name:
+                label[k] = label_dict[k]
+
+        all_data = dict()
+        for key in self.data_name:
+            all_data[key] = tensor_vstack([batch[key] for batch in data_list])
+
+        all_label = dict()
+        for key in self.label_name:
+            pad = 0 if key.startswith('bbox_') else -1  # NOTE(review): names start with 'face_'/'head_', so this never matches and pad is always -1 - confirm intent
+            #print('label vstack', key, pad, len(label_list), file=sys.stderr)
+            all_label[key] = tensor_vstack([batch[key] for batch in label_list], pad=pad)
+
+        self.data = [mx.nd.array(all_data[key]) for key in self.data_name]
+        self.label = [mx.nd.array(all_label[key]) for key in self.label_name]
+        #print(self._times)
+
+class CropLoader2(mx.io.DataIter):
+    def __init__(self, feat_sym, roidb, batch_size=1, shuffle=False, ctx=None, work_load_list=None,
+                 aspect_grouping=False):
+        """
+        This Iter will provide roi data to Fast R-CNN network
+        :param feat_sym: to infer shape of assign_output
+        :param roidb: must be preprocessed
+        :param batch_size: must divide BATCH_SIZE(128)
+        :param shuffle: bool
+        :param ctx: list of contexts
+        :param work_load_list: list of work load
+        :param aspect_grouping: group images with similar aspects
+        :return: AnchorLoader
+        """
+        super(CropLoader2, self).__init__()
+
+        # save parameters as properties
+        self.feat_sym = feat_sym
+        self.roidb = roidb
+        self.batch_size = batch_size
+        self.shuffle = shuffle
+        self.ctx = ctx
+        if self.ctx is None:
+            self.ctx = [mx.cpu()]
+        self.work_load_list = work_load_list
+        #self.feat_stride = feat_stride
+
#self.anchor_scales = anchor_scales + #self.anchor_ratios = anchor_ratios + #self.allowed_border = allowed_border + self.aspect_grouping = aspect_grouping + self.feat_stride = config.RPN_FEAT_STRIDE + + # infer properties from roidb + self.size = len(roidb) + + # decide data and label names + #self.data_name = ['data'] + #self.label_name = [] + #self.label_name.append('label') + #self.label_name.append('bbox_target') + #self.label_name.append('bbox_weight') + + self.data_name = ['data'] + #self.label_name = ['label', 'bbox_target', 'bbox_weight'] + self.label_name = [] + prefixes = ['face'] + if config.HEAD_BOX: + prefixes.append('head') + names = [] + for prefix in prefixes: + names += [prefix+'_label', prefix+'_bbox_target', prefix+'_bbox_weight'] + if prefix=='face' and config.FACE_LANDMARK: + names += [prefix+'_landmark_target', prefix+'_landmark_weight'] + #names = ['label', 'bbox_weight'] + for stride in self.feat_stride: + for n in names: + k = "%s_stride%d"%(n,stride) + self.label_name.append(k) + # status variable for synchronization between get_data and get_label + self.cur = 0 + self.batch = None + self.data = None + self.label = None + + # get first batch to fill in provide_data and provide_label + self.reset() + self.q_in = [multiprocessing.Queue(1024) for i in range(config.NUM_CPU)] + #self.q_in = multiprocessing.Queue(1024) + self.q_out = multiprocessing.Queue(1024) + self.start() + self.get_batch() + + @property + def provide_data(self): + return [(k, v.shape) for k, v in zip(self.data_name, self.data)] + + @property + def provide_label(self): + return [(k, v.shape) for k, v in zip(self.label_name, self.label)] + + def reset(self): + pass + + @staticmethod + def input_worker(q_in, roidb, batch_size): + index = np.arange(len(roidb)) + np.random.shuffle(index) + cur_from = 0 + while True: + cur_to = cur_from + batch_size + if cur_to>len(roidb): + np.random.shuffle(index) + cur_from = 0 + continue + _roidb = [roidb[index[i]] for i in range(cur_from, 
cur_to)] + istart = index[cur_from] + q_in[istart%len(q_in)].put(_roidb) + cur_from = cur_to + + @staticmethod + def gen_worker(q_in, q_out): + while True: + deq = q_in.get() + if deq is None: + break + _roidb = deq + data, label = get_crop_batch(_roidb) + print('generated') + q_out.put( (data, label) ) + + def start(self): + input_process = multiprocessing.Process(target=CropLoader2.input_worker, args=(self.q_in, self.roidb, self.batch_size)) + #gen_process = multiprocessing.Process(target=gen_worker, args=(q_in, q_out)) + gen_process = [multiprocessing.Process(target=CropLoader2.gen_worker, args=(self.q_in[i], self.q_out)) \ + for i in range(config.NUM_CPU)] + input_process.start() + for p in gen_process: + p.start() + + + def next(self): + self.get_batch() + return mx.io.DataBatch(data=self.data, label=self.label, + provide_data=self.provide_data, provide_label=self.provide_label) + + def infer_shape(self, max_data_shape=None, max_label_shape=None): + """ Return maximum data and label shape for single gpu """ + if max_data_shape is None: + max_data_shape = [] + if max_label_shape is None: + max_label_shape = [] + max_shapes = dict(max_data_shape + max_label_shape) + input_batch_size = max_shapes['data'][0] + dummy_boxes = np.zeros((0, 5)) + dummy_info = [ [max_shapes['data'][2], max_shapes['data'][3], 1.0] ] + dummy_label = {'gt_boxes' : dummy_boxes} + + # infer shape + feat_shape_list = [] + for i in range(len(self.feat_stride)): + _, feat_shape, _ = self.feat_sym[i].infer_shape(**max_shapes) + feat_shape = [int(i) for i in feat_shape[0]] + feat_shape_list.append(feat_shape) + + label_dict = {} + if config.HEAD_BOX: + head_label_dict = assign_anchor_fpn(feat_shape_list, dummy_label, dummy_info, False, prefix='head') + label_dict.update(head_label_dict) + + if config.FACE_LANDMARK: + dummy_landmarks = np.zeros( (0,11) ) + dummy_label['gt_landmarks'] = dummy_landmarks + face_label_dict = assign_anchor_fpn(feat_shape_list, dummy_label, dummy_info, 
config.FACE_LANDMARK, prefix='face') + label_dict.update(face_label_dict) + + label_list = [] + for k in self.label_name: + label_list.append(label_dict[k]) + label_shape = [(k, tuple([input_batch_size] + list(v.shape[1:]))) for k, v in zip(self.label_name, label_list)] + return max_data_shape, label_shape + + def get_batch(self): + deq = self.q_out.get() + print('q_out got') + data_list, label_list = deq + + for data, label in zip(data_list, label_list): + data_shape = {k: v.shape for k, v in data.items()} + del data_shape['im_info'] + feat_shape_list = [] + for s in range(len(self.feat_stride)): + _, feat_shape, _ = self.feat_sym[s].infer_shape(**data_shape) + feat_shape = [int(i) for i in feat_shape[0]] + feat_shape_list.append(feat_shape) + #for k in self.label_name: + # label[k] = [0 for i in range(config.TRAIN.BATCH_IMAGES)] + im_info = data['im_info'] + gt_boxes = label['gt_boxes'] + gt_label = {'gt_boxes':gt_boxes} + label_dict = {} + head_label_dict = assign_anchor_fpn(feat_shape_list, gt_label, im_info, False, prefix='head') + label_dict.update(head_label_dict) + if config.FACE_LANDMARK: + gt_landmarks = label['gt_landmarks'] + gt_label['gt_landmarks'] = gt_landmarks + face_label_dict = assign_anchor_fpn(feat_shape_list, gt_label, im_info, config.FACE_LANDMARK, prefix='face') + label_dict.update(face_label_dict) + #print('im_info', im_info.shape) + #print(gt_boxes.shape) + for k in self.label_name: + label[k] = label_dict[k] + + all_data = dict() + for key in self.data_name: + all_data[key] = tensor_vstack([batch[key] for batch in data_list]) + + all_label = dict() + for key in self.label_name: + pad = 0 if key.startswith('bbox_') else -1 + #print('label vstack', key, pad, len(label_list), file=sys.stderr) + all_label[key] = tensor_vstack([batch[key] for batch in label_list], pad=pad) + self.data = [mx.nd.array(all_data[key]) for key in self.data_name] + self.label = [mx.nd.array(all_label[key]) for key in self.label_name] + diff --git 
def get_rpn_names():
    """Return the default ``(pred, label)`` name lists of the RPN outputs."""
    pred = ['rpn_cls_prob', 'rpn_bbox_loss', 'rpn_label', 'rpn_bbox_weight']
    label = ['rpn_label', 'rpn_bbox_target', 'rpn_bbox_weight']
    return pred, label


class RPNAccMetric(mx.metric.EvalMetric):
    """Classification accuracy over all, background and foreground anchors.

    Reports three values named ``name``, ``name_BG`` and ``name_FG``.
    Anchors labelled -1 are ignored.
    """

    def __init__(self, pred_idx=-1, label_idx=-1, name='RPNAcc'):
        """
        :param pred_idx: index of the class probabilities in ``preds``
        :param label_idx: index of the labels in ``preds``; both indices
            must be >= 0 to be used, otherwise the default names apply
        :param name: base name of the three reported values
        """
        super(RPNAccMetric, self).__init__(name)
        self.pred, self.label = get_rpn_names()
        self.name = [name, name + '_BG', name + '_FG']
        self.pred_idx = pred_idx
        self.label_idx = label_idx
        self.STAT = [0, 0, 0]
        # The counters were initialised while ``name`` was still a single
        # string (presumably by the base constructor - confirm against the
        # installed MXNet); rebuild them as per-name lists so update()/get()
        # work without an external reset().
        self.reset()

    def reset(self):
        """Clear the internal statistics to initial state."""
        if isinstance(self.name, str):
            self.num_inst = 0
            self.sum_metric = 0.0
        else:
            self.num_inst = [0] * len(self.name)
            self.sum_metric = [0.0] * len(self.name)

    def get(self):
        """Return ``(name, value)``, or lists of both when ``name`` is a list.

        A value is NaN when its instance count is zero.
        """
        if isinstance(self.name, str):
            if self.num_inst == 0:
                return (self.name, float('nan'))
            return (self.name, self.sum_metric / self.num_inst)
        names = ['%s' % n for n in self.name]
        values = [x / y if y != 0 else float('nan')
                  for x, y in zip(self.sum_metric, self.num_inst)]
        return (names, values)

    def update(self, labels, preds):
        """Accumulate accuracy counts for one batch.

        :param labels: list of label arrays (used with the default names)
        :param preds: list of network outputs
        """
        if self.pred_idx >= 0 and self.label_idx >= 0:
            # the labels are read from the network outputs in this mode
            pred = preds[self.pred_idx]
            label = preds[self.label_idx]
        else:
            pred = preds[self.pred.index('rpn_cls_prob')]
            label = labels[self.label.index('rpn_label')]

        # pred (b, c, p) or (b, c, h, w)
        pred_label = mx.ndarray.argmax_channel(pred).asnumpy().astype('int32').reshape(-1,)
        # label (b, p)
        label = label.asnumpy().astype('int32').reshape(-1,)

        def count(mask):
            """Return (correct, total) over the anchors selected by ``mask``."""
            keep_inds = np.where(mask)[0]
            return np.sum(pred_label[keep_inds] == label[keep_inds]), len(keep_inds)

        # all anchors that are not ignored
        correct, total = count(label != -1)
        if isinstance(self.name, str):
            self.sum_metric += correct
            self.num_inst += total
            return

        self.sum_metric[0] += correct
        self.num_inst[0] += total
        # background (1) and foreground (2) anchors separately
        for slot, cls_id in ((1, 0), (2, 1)):
            correct, total = count(label == cls_id)
            self.sum_metric[slot] += correct
            self.num_inst[slot] += total


class RPNLogLossMetric(mx.metric.EvalMetric):
    """Mean negative log-likelihood of the true class over non-ignored anchors."""

    def __init__(self, pred_idx=-1, label_idx=-1):
        """
        :param pred_idx: index of the class probabilities in ``preds``
        :param label_idx: index of the labels in ``preds``; both indices
            must be >= 0 to be used, otherwise the default names apply
        """
        super(RPNLogLossMetric, self).__init__('RPNLogLoss')
        self.pred, self.label = get_rpn_names()
        self.pred_idx = pred_idx
        self.label_idx = label_idx

    def update(self, labels, preds):
        """Accumulate the summed log loss and the anchor count for one batch."""
        if self.pred_idx >= 0 and self.label_idx >= 0:
            pred = preds[self.pred_idx]
            label = preds[self.label_idx]
        else:
            pred = preds[self.pred.index('rpn_cls_prob')]
            label = labels[self.label.index('rpn_label')]

        # label (b, p)
        label = label.asnumpy().astype('int32').reshape((-1))
        # pred (b, c, p) or (b, c, h, w) --> (b, p, c) --> (b*p, c)
        pred = pred.asnumpy().reshape((pred.shape[0], pred.shape[1], -1)).transpose((0, 2, 1))
        pred = pred.reshape((label.shape[0], -1))

        # filter with keep_inds
        keep_inds = np.where(label != -1)[0]
        label = label[keep_inds]
        cls = pred[keep_inds, label]

        # small epsilon keeps log() finite for zero probabilities
        cls += 1e-14
        cls_loss = np.sum(-1 * np.log(cls))
        self.sum_metric += cls_loss
        self.num_inst += label.shape[0]


class RPNL1LossMetric(mx.metric.EvalMetric):
    """Regression loss averaged over the anchors with positive weight."""

    def __init__(self, loss_idx=-1, weight_idx=-1, name='RPNL1Loss'):
        """
        :param loss_idx: index of the loss array in ``preds``
        :param weight_idx: index of the weight array in ``preds``; both
            indices must be >= 0 to be used, otherwise the default names apply
        :param name: metric name
        """
        super(RPNL1LossMetric, self).__init__(name)
        self.pred, self.label = get_rpn_names()
        self.loss_idx = loss_idx
        self.weight_idx = weight_idx
        self.name = name

    def update(self, labels, preds):
        """Accumulate the summed loss and the normalising count for one batch."""
        if self.loss_idx >= 0 and self.weight_idx >= 0:
            bbox_loss = preds[self.loss_idx].asnumpy()
            bbox_weight = preds[self.weight_idx].asnumpy()
        else:
            bbox_loss = preds[self.pred.index('rpn_bbox_loss')].asnumpy()
            bbox_weight = labels[self.label.index('rpn_bbox_weight')].asnumpy()

        # calculate num_inst (average on those fg anchors)
        # NOTE(review): the inner '/' is integer division on Python 2 and
        # true division on Python 3 - confirm which one is intended.
        num_inst = np.sum(bbox_weight > 0) / (bbox_weight.shape[1] / config.NUM_ANCHORS)

        self.sum_metric += np.sum(bbox_loss)
        self.num_inst += num_inst
class MutableModule(BaseModule):
    """A mutable module is a module that supports variable input data.

    All work is delegated to an inner ``Module`` (``self._curr_module``).
    When a batch arrives whose shapes differ from the ones currently bound,
    a new inner module is bound to the new shapes, sharing the previous one.

    Parameters
    ----------
    symbol : Symbol
    data_names : list of str
    label_names : list of str
    logger : Logger
    context : Context or list of Context
    work_load_list : list of number
    max_data_shapes : list of (name, shape) tuple, designating inputs whose shape vary
    max_label_shapes : list of (name, shape) tuple, designating inputs whose shape vary
    fixed_param_prefix : list of str, indicating fixed parameters
    """
    def __init__(self, symbol, data_names, label_names,
                 logger=logging, context=ctx.cpu(), work_load_list=None,
                 max_data_shapes=None, max_label_shapes=None, fixed_param_prefix=None):
        super(MutableModule, self).__init__(logger=logger)
        self._symbol = symbol
        self._data_names = data_names
        self._label_names = label_names
        self._context = context
        self._work_load_list = work_load_list

        self._curr_module = None
        self._max_data_shapes = max_data_shapes
        self._max_label_shapes = max_label_shapes
        self._fixed_param_prefix = fixed_param_prefix

        # An argument is fixed when any prefix occurs in its name; each name
        # is listed once even if several prefixes match it.
        if fixed_param_prefix is None:
            self._fixed_param_names = []
        else:
            self._fixed_param_names = [
                name for name in self._symbol.list_arguments()
                if any(prefix in name for prefix in fixed_param_prefix)]

    def _new_module(self):
        """Build an unbound inner ``Module`` with this module's settings."""
        return Module(self._symbol, self._data_names, self._label_names, logger=self.logger,
                      context=self._context, work_load_list=self._work_load_list,
                      fixed_param_names=self._fixed_param_names)

    def _reset_bind(self):
        """Forget the current binding."""
        self.binded = False
        self._curr_module = None

    @property
    def data_names(self):
        return self._data_names

    @property
    def output_names(self):
        return self._symbol.list_outputs()

    @property
    def data_shapes(self):
        assert self.binded
        return self._curr_module.data_shapes

    @property
    def label_shapes(self):
        assert self.binded
        return self._curr_module.label_shapes

    @property
    def output_shapes(self):
        assert self.binded
        return self._curr_module.output_shapes

    def get_params(self):
        """Return the parameters of the inner module."""
        assert self.binded and self.params_initialized
        return self._curr_module.get_params()

    def init_params(self, initializer=Uniform(0.01), arg_params=None, aux_params=None,
                    allow_missing=False, force_init=False, allow_extra=False):
        """Initialise the inner module's parameters (no-op if already done)."""
        if self.params_initialized and not force_init:
            return
        assert self.binded, 'call bind before initializing the parameters'
        self._curr_module.init_params(initializer=initializer, arg_params=arg_params,
                                      aux_params=aux_params, allow_missing=allow_missing,
                                      force_init=force_init, allow_extra=allow_extra)
        self.params_initialized = True

    def bind(self, data_shapes, label_shapes=None, for_training=True,
             inputs_need_grad=False, force_rebind=False, shared_module=None):
        """Bind an inner module, using the configured maximum shapes where given.

        The bound state is only recorded after the inner bind succeeded, so
        a failing bind does not leave this object claiming to be bound.
        """
        # in case we already initialized params, keep it
        if self.params_initialized:
            arg_params, aux_params = self.get_params()

        # force rebinding is typically used when one want to switch from
        # training to prediction phase.
        if force_rebind:
            self._reset_bind()

        if self.binded:
            self.logger.warning('Already binded, ignoring bind()')
            return

        assert shared_module is None, 'shared_module for MutableModule is not supported'

        max_shapes_dict = dict()
        if self._max_data_shapes is not None:
            max_shapes_dict.update(dict(self._max_data_shapes))
        if self._max_label_shapes is not None:
            max_shapes_dict.update(dict(self._max_label_shapes))

        # a configured maximum shape overrides the shape passed by the caller
        max_data_shapes = [(name, max_shapes_dict.get(name, shape))
                           for name, shape in data_shapes]
        max_label_shapes = None
        if label_shapes is not None:
            # an empty label list is passed on as None
            max_label_shapes = [(name, max_shapes_dict.get(name, shape))
                                for name, shape in label_shapes] or None

        module = self._new_module()
        module.bind(max_data_shapes, max_label_shapes, for_training, inputs_need_grad,
                    force_rebind=False, shared_module=None)
        self._curr_module = module
        self.for_training = for_training
        self.inputs_need_grad = inputs_need_grad
        self.binded = True

        # copy back saved params, if already initialized
        if self.params_initialized:
            self.set_params(arg_params, aux_params)

    def init_optimizer(self, kvstore='local', optimizer='sgd',
                       optimizer_params=(('learning_rate', 0.01),), force_init=False):
        """Initialise the optimizer of the inner module (no-op if already done)."""
        assert self.binded and self.params_initialized
        if self.optimizer_initialized and not force_init:
            self.logger.warning('optimizer already initialized, ignoring.')
            return

        self._curr_module.init_optimizer(kvstore, optimizer, optimizer_params,
                                         force_init=force_init)
        self.optimizer_initialized = True

    def forward(self, data_batch, is_train=None):
        """Run forward, first rebinding if the batch shapes differ from the bound ones."""
        assert self.binded and self.params_initialized

        # get current_shapes
        if self._curr_module.label_shapes is not None:
            current_shapes = dict(self._curr_module.data_shapes + self._curr_module.label_shapes)
        else:
            current_shapes = dict(self._curr_module.data_shapes)

        # get input_shapes
        if data_batch.provide_label is not None:
            input_shapes = dict(data_batch.provide_data + data_batch.provide_label)
        else:
            input_shapes = dict(data_batch.provide_data)

        # decide if shape changed
        shape_changed = any(v != input_shapes[k] for k, v in current_shapes.items())

        if shape_changed:
            module = self._new_module()
            module.bind(data_batch.provide_data, data_batch.provide_label, self._curr_module.for_training,
                        self._curr_module.inputs_need_grad, force_rebind=False,
                        shared_module=self._curr_module)
            self._curr_module = module

        self._curr_module.forward(data_batch, is_train=is_train)

    def backward(self, out_grads=None):
        assert self.binded and self.params_initialized
        self._curr_module.backward(out_grads=out_grads)

    def update(self):
        assert self.binded and self.params_initialized and self.optimizer_initialized
        self._curr_module.update()

    def get_outputs(self, merge_multi_context=True):
        assert self.binded and self.params_initialized
        return self._curr_module.get_outputs(merge_multi_context=merge_multi_context)

    def get_input_grads(self, merge_multi_context=True):
        assert self.binded and self.params_initialized and self.inputs_need_grad
        return self._curr_module.get_input_grads(merge_multi_context=merge_multi_context)

    def update_metric(self, eval_metric, labels):
        assert self.binded and self.params_initialized
        self._curr_module.update_metric(eval_metric, labels)

    def install_monitor(self, mon):
        """ Install monitor on all executors """
        assert self.binded
        self._curr_module.install_monitor(mon)
class MutableModule(BaseModule):
    """A mutable module is a module that supports variable input data.

    Backup variant whose ``bind`` additionally accepts ``grad_req``.  All
    work is delegated to an inner ``Module`` (``self._curr_module``); when a
    batch arrives with shapes that differ from the bound ones, a new inner
    module is bound to the new shapes, sharing the previous one.

    Parameters
    ----------
    symbol : Symbol
    data_names : list of str
    label_names : list of str
    logger : Logger
    context : Context or list of Context
    work_load_list : list of number
    max_data_shapes : list of (name, shape) tuple, designating inputs whose shape vary
    max_label_shapes : list of (name, shape) tuple, designating inputs whose shape vary
    fixed_param_prefix : list of str, indicating fixed parameters
    """
    def __init__(self, symbol, data_names, label_names,
                 logger=logging, context=ctx.cpu(), work_load_list=None,
                 max_data_shapes=None, max_label_shapes=None, fixed_param_prefix=None):
        super(MutableModule, self).__init__(logger=logger)
        self._symbol = symbol
        self._data_names = data_names
        self._label_names = label_names
        self._context = context
        self._work_load_list = work_load_list

        self._curr_module = None
        self._max_data_shapes = max_data_shapes
        self._max_label_shapes = max_label_shapes
        self._fixed_param_prefix = fixed_param_prefix
        # gradient requirement of the last bind(); reused when rebinding
        self._grad_req = 'write'

        # An argument is fixed when any prefix occurs in its name; each name
        # is listed once even if several prefixes match it.
        if fixed_param_prefix is None:
            self._fixed_param_names = []
        else:
            self._fixed_param_names = [
                name for name in self._symbol.list_arguments()
                if any(prefix in name for prefix in fixed_param_prefix)]

    def _new_module(self):
        """Build an unbound inner ``Module`` with this module's settings."""
        return Module(self._symbol, self._data_names, self._label_names, logger=self.logger,
                      context=self._context, work_load_list=self._work_load_list,
                      fixed_param_names=self._fixed_param_names)

    def _reset_bind(self):
        """Forget the current binding."""
        self.binded = False
        self._curr_module = None

    @property
    def data_names(self):
        return self._data_names

    @property
    def output_names(self):
        return self._symbol.list_outputs()

    @property
    def data_shapes(self):
        assert self.binded
        return self._curr_module.data_shapes

    @property
    def label_shapes(self):
        assert self.binded
        return self._curr_module.label_shapes

    @property
    def output_shapes(self):
        assert self.binded
        return self._curr_module.output_shapes

    def get_params(self):
        """Return the parameters of the inner module."""
        assert self.binded and self.params_initialized
        return self._curr_module.get_params()

    def init_params(self, initializer=Uniform(0.01), arg_params=None, aux_params=None,
                    allow_missing=False, force_init=False, allow_extra=False):
        """Initialise the inner module's parameters (no-op if already done)."""
        if self.params_initialized and not force_init:
            return
        assert self.binded, 'call bind before initializing the parameters'
        self._curr_module.init_params(initializer=initializer, arg_params=arg_params,
                                      aux_params=aux_params, allow_missing=allow_missing,
                                      force_init=force_init, allow_extra=allow_extra)
        self.params_initialized = True

    def bind(self, data_shapes, label_shapes=None, for_training=True,
             inputs_need_grad=False, force_rebind=False, shared_module=None, grad_req='write'):
        """Bind an inner module, using the configured maximum shapes where given.

        ``grad_req`` is forwarded to the inner ``Module.bind`` and remembered
        so that shape-change rebinds in :meth:`forward` use the same value.
        The bound state is only recorded after the inner bind succeeded.
        """
        # in case we already initialized params, keep it
        if self.params_initialized:
            arg_params, aux_params = self.get_params()

        # force rebinding is typically used when one want to switch from
        # training to prediction phase.
        if force_rebind:
            self._reset_bind()

        if self.binded:
            self.logger.warning('Already binded, ignoring bind()')
            return

        assert shared_module is None, 'shared_module for MutableModule is not supported'

        max_shapes_dict = dict()
        if self._max_data_shapes is not None:
            max_shapes_dict.update(dict(self._max_data_shapes))
        if self._max_label_shapes is not None:
            max_shapes_dict.update(dict(self._max_label_shapes))

        # a configured maximum shape overrides the shape passed by the caller
        max_data_shapes = [(name, max_shapes_dict.get(name, shape))
                           for name, shape in data_shapes]
        max_label_shapes = None
        if label_shapes is not None:
            # an empty label list is passed on as None
            max_label_shapes = [(name, max_shapes_dict.get(name, shape))
                                for name, shape in label_shapes] or None

        module = self._new_module()
        module.bind(max_data_shapes, max_label_shapes, for_training, inputs_need_grad,
                    force_rebind=False, shared_module=None, grad_req=grad_req)
        self._curr_module = module
        self._grad_req = grad_req
        self.for_training = for_training
        self.inputs_need_grad = inputs_need_grad
        self.binded = True

        # copy back saved params, if already initialized
        if self.params_initialized:
            self.set_params(arg_params, aux_params)

    def init_optimizer(self, kvstore='local', optimizer='sgd',
                       optimizer_params=(('learning_rate', 0.01),), force_init=False):
        """Initialise the optimizer of the inner module (no-op if already done)."""
        assert self.binded and self.params_initialized
        if self.optimizer_initialized and not force_init:
            self.logger.warning('optimizer already initialized, ignoring.')
            return

        self._curr_module.init_optimizer(kvstore, optimizer, optimizer_params,
                                         force_init=force_init)
        self.optimizer_initialized = True

    def forward(self, data_batch, is_train=None):
        """Run forward, first rebinding if the batch shapes differ from the bound ones."""
        assert self.binded and self.params_initialized

        # get current_shapes
        if self._curr_module.label_shapes is not None:
            current_shapes = dict(self._curr_module.data_shapes + self._curr_module.label_shapes)
        else:
            current_shapes = dict(self._curr_module.data_shapes)

        # get input_shapes
        if data_batch.provide_label is not None:
            input_shapes = dict(data_batch.provide_data + data_batch.provide_label)
        else:
            input_shapes = dict(data_batch.provide_data)

        # decide if shape changed
        shape_changed = any(v != input_shapes[k] for k, v in current_shapes.items())

        if shape_changed:
            module = self._new_module()
            module.bind(data_batch.provide_data, data_batch.provide_label, self._curr_module.for_training,
                        self._curr_module.inputs_need_grad, force_rebind=False,
                        shared_module=self._curr_module, grad_req=self._grad_req)
            self._curr_module = module

        self._curr_module.forward(data_batch, is_train=is_train)

    def backward(self, out_grads=None):
        assert self.binded and self.params_initialized
        self._curr_module.backward(out_grads=out_grads)

    def update(self):
        assert self.binded and self.params_initialized and self.optimizer_initialized
        self._curr_module.update()

    def get_outputs(self, merge_multi_context=True):
        assert self.binded and self.params_initialized
        return self._curr_module.get_outputs(merge_multi_context=merge_multi_context)

    def get_input_grads(self, merge_multi_context=True):
        assert self.binded and self.params_initialized and self.inputs_need_grad
        return self._curr_module.get_input_grads(merge_multi_context=merge_multi_context)

    def update_metric(self, eval_metric, labels):
        assert self.binded and self.params_initialized
        self._curr_module.update_metric(eval_metric, labels)

    def install_monitor(self, mon):
        """ Install monitor on all executors """
        assert self.binded
        self._curr_module.install_monitor(mon)
def IOU(Reframe, GTframe):
    """Return the intersection-over-union of two ``[x1, y1, x2, y2]`` boxes.

    :param Reframe: first box, indexable with at least four coordinates
    :param GTframe: second box, same layout
    :return: float ratio; ``0.0`` when the boxes do not overlap
    """
    x1, y1 = Reframe[0], Reframe[1]
    width1 = Reframe[2] - Reframe[0]
    height1 = Reframe[3] - Reframe[1]

    x2, y2 = GTframe[0], GTframe[1]
    width2 = GTframe[2] - GTframe[0]
    height2 = GTframe[3] - GTframe[1]

    # overlap extent = sum of both extents minus the extent of their hull
    width = width1 + width2 - (max(x1 + width1, x2 + width2) - min(x1, x2))
    height = height1 + height2 - (max(y1 + height1, y2 + height2) - min(y1, y2))

    if width <= 0 or height <= 0:
        return 0.0
    area = width * height
    return area * 1. / (width1 * height1 + width2 * height2 - area)


class Predictor(object):
    """Thin inference wrapper around an ``mxnet`` ``Module``."""

    def __init__(self, symbol, data_names, label_names,
                 context=None, max_data_shapes=None,
                 provide_data=None, provide_label=None,
                 arg_params=None, aux_params=None):
        """Bind ``symbol`` for inference and load the given parameters.

        :param context: device context; ``None`` means ``mx.cpu()``
        :param max_data_shapes: accepted for compatibility, not used
        """
        if context is None:
            # resolved at call time instead of at definition time
            context = mx.cpu()
        self._mod = Module(symbol, data_names, label_names, context=context)
        self._mod.bind(provide_data, provide_label, for_training=False)
        self._mod.init_params(arg_params=arg_params, aux_params=aux_params)

    def predict(self, data_batch):
        """Run a forward pass and return a dict of output name -> output."""
        self._mod.forward(data_batch)
        return dict(zip(self._mod.output_names, self._mod.get_outputs()))


def im_proposal(predictor, data_batch, data_names, scale):
    """Run the predictor on one batch and return ``(scores, boxes, data_dict)``.

    ``boxes`` are the ``rois_output`` rows without their first column,
    divided by ``scale``; ``scores`` is ``rois_score``.
    """
    data_dict = dict(zip(data_names, data_batch.data))
    output = predictor.predict(data_batch)

    # drop the batch index
    boxes = output['rois_output'].asnumpy()[:, 1:]
    scores = output['rois_score'].asnumpy()

    # transform to original scale
    boxes = boxes / scale

    return scores, boxes, data_dict


def _im_proposal(predictor, data_batch, data_names, scale):
    """Debug variant of :func:`im_proposal` that also prints the raw outputs."""
    data_dict = dict(zip(data_names, data_batch.data))
    output = predictor.predict(data_batch)
    print('output', output)

    # drop the batch index
    boxes = output['rois_output'].asnumpy()[:, 1:]
    scores = output['rois_score'].asnumpy()

    # transform to original scale
    boxes = boxes / scale

    return scores, boxes, data_dict


def generate_proposals(predictor, test_data, imdb, vis=False, thresh=0.):
    """
    Generate detections results using RPN.
    :param predictor: Predictor
    :param test_data: data iterator, must be non-shuffled
    :param imdb: image database
    :param vis: controls visualization
    :param thresh: thresh for valid detections
    :return: list of detected boxes
    """
    assert vis or not test_data.shuffle
    data_names = [k[0] for k in test_data.provide_data]

    t = time.time()
    imdb_boxes = list()
    original_boxes = list()
    for i, (im_info, data_batch) in enumerate(test_data):
        t1 = time.time() - t
        t = time.time()

        scale = im_info[0, 2]
        scores, boxes, data_dict = im_proposal(predictor, data_batch, data_names, scale)
        print(scores.shape, boxes.shape, file=sys.stderr)
        t2 = time.time() - t
        t = time.time()

        # assemble proposals
        dets = np.hstack((boxes, scores))
        original_boxes.append(dets)

        # filter proposals
        keep = np.where(dets[:, 4:] > thresh)[0]
        dets = dets[keep, :]
        imdb_boxes.append(dets)

        if vis:
            vis_all_detection(data_dict['data'].asnumpy(), [dets], ['obj'], scale)

        logger.info('generating %d/%d ' % (i + 1, imdb.num_images) +
                    'proposal %d ' % (dets.shape[0]) +
                    'data %.4fs net %.4fs' % (t1, t2))

    assert len(imdb_boxes) == imdb.num_images, 'calculations not complete'

    # save results
    rpn_folder = os.path.join(imdb.root_path, 'rpn_data')
    if not os.path.isdir(rpn_folder):
        try:
            os.makedirs(rpn_folder)
        except OSError:
            # another process may have created the folder in the meantime
            if not os.path.isdir(rpn_folder):
                raise

    rpn_file = os.path.join(rpn_folder, imdb.name + '_rpn.pkl')
    with open(rpn_file, 'wb') as f:
        pickle.dump(imdb_boxes, f, pickle.HIGHEST_PROTOCOL)

    if thresh > 0:
        # also keep the unfiltered proposals
        full_rpn_file = os.path.join(rpn_folder, imdb.name + '_full_rpn.pkl')
        with open(full_rpn_file, 'wb') as f:
            pickle.dump(original_boxes, f, pickle.HIGHEST_PROTOCOL)

    logger.info('wrote rpn proposals to %s' % rpn_file)
    return imdb_boxes
+ :param predictor: Predictor + :param test_data: data iterator, must be non-shuffled + :param imdb: image database + :param roidb: roidb + :param vis: controls visualization + :return: recall, mAP + """ + assert vis or not test_data.shuffle + data_names = [k[0] for k in test_data.provide_data] + + #bbox_file = os.path.join(rpn_folder, imdb.name + '_bbox.txt') + #bbox_f = open(bbox_file, 'w') + + i = 0 + t = time.time() + output_folder = os.path.join(imdb.root_path, 'output') + if not os.path.exists(output_folder): + os.mkdir(output_folder) + imdb_boxes = list() + original_boxes = list() + gt_overlaps = np.zeros(0) + overall = [0.0, 0.0] + gt_max = np.array( (0.0, 0.0) ) + num_pos = 0 + #apply scale, for SSH + #_, roidb = image.get_image(roidb) + for im_info, data_batch in test_data: + t1 = time.time() - t + t = time.time() + + oscale = im_info[0, 2] + #print('scale', scale, file=sys.stderr) + scale = 1.0 #fix scale=1.0 for SSH face detector + scores, boxes, data_dict = im_proposal(predictor, data_batch, data_names, scale) + #print(scores.shape, boxes.shape, file=sys.stderr) + t2 = time.time() - t + t = time.time() + + # assemble proposals + dets = np.hstack((boxes, scores)) + original_boxes.append(dets) + + # filter proposals + keep = np.where(dets[:, 4:] > config.TEST.SCORE_THRESH)[0] + dets = dets[keep, :] + imdb_boxes.append(dets) + + + logger.info('generating %d/%d ' % (i + 1, imdb.num_images) + + 'proposal %d ' % (dets.shape[0]) + + 'data %.4fs net %.4fs' % (t1, t2)) + + #if dets.shape[0]==0: + # continue + if vis: + vis_all_detection(data_dict['data'].asnumpy(), [dets], ['obj'], scale) + boxes = dets + #max_gt_overlaps = roidb[i]['gt_overlaps'].max(axis=1) + #gt_inds = np.where((roidb[i]['gt_classes'] > 0) & (max_gt_overlaps == 1))[0] + #gt_boxes = roidb[i]['boxes'][gt_inds, :] + gt_boxes = roidb[i]['boxes'].copy() * oscale # as roidb is the original one, need to scale GT for SSH + gt_areas = (gt_boxes[:, 2] - gt_boxes[:, 0] + 1) * (gt_boxes[:, 3] - 
gt_boxes[:, 1] + 1) + num_pos += gt_boxes.shape[0] + + overlaps = bbox_overlaps(boxes.astype(np.float), gt_boxes.astype(np.float)) + #print(im_info, gt_boxes.shape, boxes.shape, overlaps.shape, file=sys.stderr) + + _gt_overlaps = np.zeros((gt_boxes.shape[0])) + # choose whatever is smaller to iterate + + #for j in range(gt_boxes.shape[0]): + # print('gt %d,%d,%d,%d'% (gt_boxes[j][0], gt_boxes[j][1], gt_boxes[j][2]-gt_boxes[j][0], gt_boxes[j][3]-gt_boxes[j][1]), file=sys.stderr) + # gt_max = np.maximum( gt_max, np.array( (gt_boxes[j][2], gt_boxes[j][3]) ) ) + #print('gt max', gt_max, file=sys.stderr) + #for j in range(boxes.shape[0]): + # print('anchor_box %.2f,%.2f,%.2f,%.2f'% (boxes[j][0], boxes[j][1], boxes[j][2]-boxes[j][0], boxes[j][3]-boxes[j][1]), file=sys.stderr) + + #rounds = min(boxes.shape[0], gt_boxes.shape[0]) + #for j in range(rounds): + # # find which proposal maximally covers each gt box + # argmax_overlaps = overlaps.argmax(axis=0) + # print(j, 'argmax_overlaps', argmax_overlaps, file=sys.stderr) + # # get the IoU amount of coverage for each gt box + # max_overlaps = overlaps.max(axis=0) + # print(j, 'max_overlaps', max_overlaps, file=sys.stderr) + # # find which gt box is covered by most IoU + # gt_ind = max_overlaps.argmax() + # gt_ovr = max_overlaps.max() + # assert (gt_ovr >= 0), '%s\n%s\n%s' % (boxes, gt_boxes, overlaps) + # # find the proposal box that covers the best covered gt box + # box_ind = argmax_overlaps[gt_ind] + # print('max box', gt_ind, box_ind, (boxes[box_ind][0], boxes[box_ind][1], boxes[box_ind][2]-boxes[box_ind][0], boxes[box_ind][3]-boxes[box_ind][1], boxes[box_ind][4]), file=sys.stderr) + # # record the IoU coverage of this gt box + # _gt_overlaps[j] = overlaps[box_ind, gt_ind] + # assert (_gt_overlaps[j] == gt_ovr) + # # mark the proposal box and the gt box as used + # overlaps[box_ind, :] = -1 + # overlaps[:, gt_ind] = -1 + + if boxes.shape[0]>0: + _gt_overlaps = overlaps.max(axis=0) + #print('max_overlaps', _gt_overlaps, 
file=sys.stderr) + for j in range(len(_gt_overlaps)): + if _gt_overlaps[j]>config.TEST.IOU_THRESH: + continue + print(j, 'failed', gt_boxes[j], 'max_overlap:', _gt_overlaps[j], file=sys.stderr) + #_idx = np.where(overlaps[:,j]>0.4)[0] + #print(j, _idx, file=sys.stderr) + #print(overlaps[_idx,j], file=sys.stderr) + #for __idx in _idx: + # print(gt_boxes[j], boxes[__idx], overlaps[__idx,j], IOU(gt_boxes[j], boxes[__idx,0:4]), file=sys.stderr) + + # append recorded IoU coverage level + found = (_gt_overlaps > config.TEST.IOU_THRESH).sum() + _recall = found / float(gt_boxes.shape[0]) + print('recall', _recall, gt_boxes.shape[0], boxes.shape[0], gt_areas, file=sys.stderr) + overall[0]+=found + overall[1]+=gt_boxes.shape[0] + #gt_overlaps = np.hstack((gt_overlaps, _gt_overlaps)) + #_recall = (gt_overlaps >= threshold).sum() / float(num_pos) + _recall = float(overall[0])/overall[1] + print('recall_all', _recall, file=sys.stderr) + + + boxes[:,0:4] /= oscale + _vec = roidb[i]['image'].split('/') + out_dir = os.path.join(output_folder, _vec[-2]) + if not os.path.exists(out_dir): + os.mkdir(out_dir) + out_file = os.path.join(out_dir, _vec[-1].replace('jpg', 'txt')) + with open(out_file, 'w') as f: + name = '/'.join(roidb[i]['image'].split('/')[-2:]) + f.write("%s\n"%(name)) + f.write("%d\n"%(boxes.shape[0])) + for b in range(boxes.shape[0]): + box = boxes[b] + f.write("%d %d %d %d %g \n"%(box[0], box[1], box[2]-box[0], box[3]-box[1], box[4])) + i += 1 + + #bbox_f.close() + return + gt_overlaps = np.sort(gt_overlaps) + recalls = np.zeros_like(thresholds) + + # compute recall for each IoU threshold + for i, t in enumerate(thresholds): + recalls[i] = (gt_overlaps >= t).sum() / float(num_pos) + ar = recalls.mean() + + # print results + print('average recall for {}: {:.3f}'.format(area_name, ar)) + for threshold, recall in zip(thresholds, recalls): + print('recall @{:.2f}: {:.3f}'.format(threshold, recall)) + + + + + assert len(imdb_boxes) == imdb.num_images, 'calculations not 
complete' + + # save results + + rpn_file = os.path.join(rpn_folder, imdb.name + '_rpn.pkl') + with open(rpn_file, 'wb') as f: + pickle.dump(imdb_boxes, f, pickle.HIGHEST_PROTOCOL) + + logger.info('wrote rpn proposals to %s' % rpn_file) + return imdb_boxes + +def im_detect(predictor, data_batch, data_names, scale): + output = predictor.predict(data_batch) + + data_dict = dict(zip(data_names, data_batch.data)) + if config.TEST.HAS_RPN: + rois = output['rois_output'].asnumpy()[:, 1:] + else: + rois = data_dict['rois'].asnumpy().reshape((-1, 5))[:, 1:] + im_shape = data_dict['data'].shape + + # save output + scores = output['cls_prob_reshape_output'].asnumpy()[0] + bbox_deltas = output['bbox_pred_reshape_output'].asnumpy()[0] + + # post processing + pred_boxes = bbox_pred(rois, bbox_deltas) + pred_boxes = clip_boxes(pred_boxes, im_shape[-2:]) + + # we used scaled image & roi to train, so it is necessary to transform them back + pred_boxes = pred_boxes / scale + + return scores, pred_boxes, data_dict + + +def pred_eval(predictor, test_data, imdb, vis=False, thresh=1e-3): + """ + wrapper for calculating offline validation for faster data analysis + in this example, all threshold are set by hand + :param predictor: Predictor + :param test_data: data iterator, must be non-shuffle + :param imdb: image database + :param vis: controls visualization + :param thresh: valid detection threshold + :return: + """ + assert vis or not test_data.shuffle + data_names = [k[0] for k in test_data.provide_data] + + nms = py_nms_wrapper(config.TEST.NMS) + + # limit detections to max_per_image over all classes + max_per_image = -1 + + num_images = imdb.num_images + # all detections are collected into: + # all_boxes[cls][image] = N x 5 array of detections in + # (x1, y1, x2, y2, score) + all_boxes = [[[] for _ in range(num_images)] + for _ in range(imdb.num_classes)] + + i = 0 + t = time.time() + for im_info, data_batch in test_data: + t1 = time.time() - t + t = time.time() + + scale = 
im_info[0, 2] + scores, boxes, data_dict = im_detect(predictor, data_batch, data_names, scale) + + t2 = time.time() - t + t = time.time() + + for j in range(1, imdb.num_classes): + indexes = np.where(scores[:, j] > thresh)[0] + cls_scores = scores[indexes, j, np.newaxis] + cls_boxes = boxes[indexes, j * 4:(j + 1) * 4] + cls_dets = np.hstack((cls_boxes, cls_scores)) + keep = nms(cls_dets) + all_boxes[j][i] = cls_dets[keep, :] + + if max_per_image > 0: + image_scores = np.hstack([all_boxes[j][i][:, -1] + for j in range(1, imdb.num_classes)]) + if len(image_scores) > max_per_image: + image_thresh = np.sort(image_scores)[-max_per_image] + for j in range(1, imdb.num_classes): + keep = np.where(all_boxes[j][i][:, -1] >= image_thresh)[0] + all_boxes[j][i] = all_boxes[j][i][keep, :] + + if vis: + boxes_this_image = [[]] + [all_boxes[j][i] for j in range(1, imdb.num_classes)] + vis_all_detection(data_dict['data'].asnumpy(), boxes_this_image, imdb.classes, scale) + + t3 = time.time() - t + t = time.time() + logger.info('testing %d/%d data %.4fs net %.4fs post %.4fs' % (i, imdb.num_images, t1, t2, t3)) + i += 1 + + det_file = os.path.join(imdb.cache_path, imdb.name + '_detections.pkl') + with open(det_file, 'wb') as f: + pickle.dump(all_boxes, f, protocol=pickle.HIGHEST_PROTOCOL) + + imdb.evaluate_detections(all_boxes) + + +def vis_all_detection(im_array, detections, class_names, scale): + """ + visualize all detections in one image + :param im_array: [b=1 c h w] in rgb + :param detections: [ numpy.ndarray([[x1 y1 x2 y2 score]]) for j in classes ] + :param class_names: list of names in imdb + :param scale: visualize the scaled image + :return: + """ + import matplotlib.pyplot as plt + import random + im = image.transform_inverse(im_array, config.PIXEL_MEANS) + plt.imshow(im) + for j, name in enumerate(class_names): + if name == '__background__': + continue + color = (random.random(), random.random(), random.random()) # generate a random color + dets = detections[j] + for det 
in dets: + bbox = det[:4] * scale + score = det[-1] + rect = plt.Rectangle((bbox[0], bbox[1]), + bbox[2] - bbox[0], + bbox[3] - bbox[1], fill=False, + edgecolor=color, linewidth=3.5) + plt.gca().add_patch(rect) + plt.gca().text(bbox[0], bbox[1] - 2, + '{:s} {:.3f}'.format(name, score), + bbox=dict(facecolor=color, alpha=0.5), fontsize=12, color='white') + plt.show() + + +def draw_all_detection(im_array, detections, class_names, scale): + """ + visualize all detections in one image + :param im_array: [b=1 c h w] in rgb + :param detections: [ numpy.ndarray([[x1 y1 x2 y2 score]]) for j in classes ] + :param class_names: list of names in imdb + :param scale: visualize the scaled image + :return: + """ + import cv2 + import random + color_white = (255, 255, 255) + im = image.transform_inverse(im_array, config.PIXEL_MEANS) + # change to bgr + im = cv2.cvtColor(im, cv2.cv.CV_RGB2BGR) + for j, name in enumerate(class_names): + if name == '__background__': + continue + color = (random.randint(0, 256), random.randint(0, 256), random.randint(0, 256)) # generate a random color + dets = detections[j] + for det in dets: + bbox = det[:4] * scale + score = det[-1] + bbox = map(int, bbox) + cv2.rectangle(im, (bbox[0], bbox[1]), (bbox[2], bbox[3]), color=color, thickness=2) + cv2.putText(im, '%s %.3f' % (class_names[j], score), (bbox[0], bbox[1] + 10), + color=color_white, fontFace=cv2.FONT_HERSHEY_COMPLEX, fontScale=0.5) + return im diff --git a/retinaface/rcnn/cython/.gitignore b/RetinaFace/rcnn/cython/.gitignore similarity index 100% rename from retinaface/rcnn/cython/.gitignore rename to RetinaFace/rcnn/cython/.gitignore diff --git a/RetinaFace/rcnn/cython/__init__.py b/RetinaFace/rcnn/cython/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/retinaface/rcnn/cython/anchors.pyx b/RetinaFace/rcnn/cython/anchors.pyx similarity index 100% rename from retinaface/rcnn/cython/anchors.pyx rename to RetinaFace/rcnn/cython/anchors.pyx diff --git 
a/retinaface/rcnn/cython/bbox.pyx b/RetinaFace/rcnn/cython/bbox.pyx similarity index 100% rename from retinaface/rcnn/cython/bbox.pyx rename to RetinaFace/rcnn/cython/bbox.pyx diff --git a/retinaface/rcnn/cython/cpu_nms.pyx b/RetinaFace/rcnn/cython/cpu_nms.pyx similarity index 100% rename from retinaface/rcnn/cython/cpu_nms.pyx rename to RetinaFace/rcnn/cython/cpu_nms.pyx diff --git a/retinaface/rcnn/cython/gpu_nms.hpp b/RetinaFace/rcnn/cython/gpu_nms.hpp similarity index 100% rename from retinaface/rcnn/cython/gpu_nms.hpp rename to RetinaFace/rcnn/cython/gpu_nms.hpp diff --git a/retinaface/rcnn/cython/gpu_nms.pyx b/RetinaFace/rcnn/cython/gpu_nms.pyx similarity index 100% rename from retinaface/rcnn/cython/gpu_nms.pyx rename to RetinaFace/rcnn/cython/gpu_nms.pyx diff --git a/retinaface/rcnn/cython/nms_kernel.cu b/RetinaFace/rcnn/cython/nms_kernel.cu similarity index 100% rename from retinaface/rcnn/cython/nms_kernel.cu rename to RetinaFace/rcnn/cython/nms_kernel.cu diff --git a/retinaface/rcnn/cython/setup.py b/RetinaFace/rcnn/cython/setup.py similarity index 100% rename from retinaface/rcnn/cython/setup.py rename to RetinaFace/rcnn/cython/setup.py diff --git a/retinaface/rcnn/dataset/__init__.py b/RetinaFace/rcnn/dataset/__init__.py similarity index 100% rename from retinaface/rcnn/dataset/__init__.py rename to RetinaFace/rcnn/dataset/__init__.py diff --git a/retinaface/rcnn/dataset/ds_utils.py b/RetinaFace/rcnn/dataset/ds_utils.py similarity index 100% rename from retinaface/rcnn/dataset/ds_utils.py rename to RetinaFace/rcnn/dataset/ds_utils.py diff --git a/retinaface/rcnn/dataset/imdb.py b/RetinaFace/rcnn/dataset/imdb.py similarity index 100% rename from retinaface/rcnn/dataset/imdb.py rename to RetinaFace/rcnn/dataset/imdb.py diff --git a/retinaface/rcnn/dataset/retinaface.py b/RetinaFace/rcnn/dataset/retinaface.py similarity index 100% rename from retinaface/rcnn/dataset/retinaface.py rename to RetinaFace/rcnn/dataset/retinaface.py diff --git 
a/RetinaFace/rcnn/io/__init__.py b/RetinaFace/rcnn/io/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/RetinaFace/rcnn/io/image.py b/RetinaFace/rcnn/io/image.py new file mode 100644 index 0000000..93deccd --- /dev/null +++ b/RetinaFace/rcnn/io/image.py @@ -0,0 +1,808 @@ +from __future__ import print_function +import numpy as np +import cv2 +import os +import math +import sys +import random +from ..config import config + +def brightness_aug(src, x): + alpha = 1.0 + random.uniform(-x, x) + src *= alpha + return src + +def contrast_aug(src, x): + alpha = 1.0 + random.uniform(-x, x) + coef = np.array([[[0.299, 0.587, 0.114]]]) + gray = src * coef + gray = (3.0 * (1.0 - alpha) / gray.size) * np.sum(gray) + src *= alpha + src += gray + return src + +def saturation_aug(src, x): + alpha = 1.0 + random.uniform(-x, x) + coef = np.array([[[0.299, 0.587, 0.114]]]) + gray = src * coef + gray = np.sum(gray, axis=2, keepdims=True) + gray *= (1.0 - alpha) + src *= alpha + src += gray + return src + +def color_aug(img, x): + if config.COLOR_MODE>1: + augs = [brightness_aug, contrast_aug, saturation_aug] + random.shuffle(augs) + else: + augs = [brightness_aug] + for aug in augs: + #print(img.shape) + img = aug(img, x) + #print(img.shape) + return img + +def get_image(roidb, scale=False): + """ + preprocess image and return processed roidb + :param roidb: a list of roidb + :return: list of img as in mxnet format + roidb add new item['im_info'] + 0 --- x (width, second dim of im) + | + y (height, first dim of im) + """ + num_images = len(roidb) + processed_ims = [] + processed_roidb = [] + for i in range(num_images): + roi_rec = roidb[i] + if 'stream' in roi_rec: + im = cv2.imdecode(roi_rec['stream'], cv2.IMREAD_COLOR) + else: + assert os.path.exists(roi_rec['image']), '{} does not exist'.format(roi_rec['image']) + im = cv2.imread(roi_rec['image']) + if roidb[i]['flipped']: + im = im[:, ::-1, :] + new_rec = roi_rec.copy() + if scale: + scale_range = 
config.TRAIN.SCALE_RANGE + im_scale = np.random.uniform(scale_range[0], scale_range[1]) + im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale, interpolation=cv2.INTER_LINEAR) + elif not config.ORIGIN_SCALE: + scale_ind = random.randrange(len(config.SCALES)) + target_size = config.SCALES[scale_ind][0] + max_size = config.SCALES[scale_ind][1] + im, im_scale = resize(im, target_size, max_size, stride=config.IMAGE_STRIDE) + else: + im_scale = 1.0 + im_tensor = transform(im, config.PIXEL_MEANS, config.PIXEL_STDS) + if 'boxes_mask' in roi_rec: + im = im.astype(np.float32) + boxes_mask = roi_rec['boxes_mask'].copy() * im_scale + boxes_mask = boxes_mask.astype(np.int) + for j in xrange(boxes_mask.shape[0]): + m = boxes_mask[j] + im_tensor[:,:,m[1]:m[3],m[0]:m[2]] = 0.0 + #print('find mask', m, file=sys.stderr) + processed_ims.append(im_tensor) + new_rec['boxes'] = roi_rec['boxes'].copy() * im_scale + if config.TRAIN.IMAGE_ALIGN>0: + if im_tensor.shape[2]%config.TRAIN.IMAGE_ALIGN!=0 or im_tensor.shape[3]%config.TRAIN.IMAGE_ALIGN!=0: + new_height = math.ceil(float(im_tensor.shape[2])/config.TRAIN.IMAGE_ALIGN)*config.TRAIN.IMAGE_ALIGN + new_width = math.ceil(float(im_tensor.shape[3])/config.TRAIN.IMAGE_ALIGN)*config.TRAIN.IMAGE_ALIGN + new_im_tensor = np.zeros((1, 3, int(new_height), int(new_width))) + new_im_tensor[:,:,0:im_tensor.shape[2],0:im_tensor.shape[3]] = im_tensor + print(im_tensor.shape, new_im_tensor.shape, file=sys.stderr) + im_tensor = new_im_tensor + #print('boxes', new_rec['boxes'], file=sys.stderr) + im_info = [im_tensor.shape[2], im_tensor.shape[3], im_scale] + new_rec['im_info'] = im_info + processed_roidb.append(new_rec) + return processed_ims, processed_roidb + +TMP_ID = -1 +#bakup method +def __get_crop_image(roidb): + """ + preprocess image and return processed roidb + :param roidb: a list of roidb + :return: list of img as in mxnet format + roidb add new item['im_info'] + 0 --- x (width, second dim of im) + | + y (height, first dim of im) + """ + 
#roidb and each roi_rec can not be changed as it will be reused in next epoch + num_images = len(roidb) + processed_ims = [] + processed_roidb = [] + for i in range(num_images): + roi_rec = roidb[i] + if 'stream' in roi_rec: + im = cv2.imdecode(roi_rec['stream'], cv2.IMREAD_COLOR) + else: + assert os.path.exists(roi_rec['image']), '{} does not exist'.format(roi_rec['image']) + im = cv2.imread(roi_rec['image']) + if roidb[i]['flipped']: + im = im[:, ::-1, :] + if 'boxes_mask' in roi_rec: + #im = im.astype(np.float32) + boxes_mask = roi_rec['boxes_mask'].copy() + boxes_mask = boxes_mask.astype(np.int) + for j in xrange(boxes_mask.shape[0]): + m = boxes_mask[j] + im[m[1]:m[3],m[0]:m[2],:] = 0 + #print('find mask', m, file=sys.stderr) + new_rec = roi_rec.copy() + + + #choose one gt randomly + SIZE = config.SCALES[0][0] + TARGET_BOX_SCALES = np.array([16,32,64,128,256,512]) + assert roi_rec['boxes'].shape[0]>0 + candidates = [] + for i in xrange(roi_rec['boxes'].shape[0]): + box = roi_rec['boxes'][i] + box_size = max(box[2]-box[0], box[3]-box[1]) + if box_sizeim.shape[1] or box[3]>im.shape[0]: + # continue; + candidates.append(i) + assert len(candidates)>0 + box_ind = random.choice(candidates) + box = roi_rec['boxes'][box_ind] + box_size = max(box[2]-box[0], box[3]-box[1]) + dist = np.abs(TARGET_BOX_SCALES - box_size) + nearest = np.argmin(dist) + target_ind = random.randrange(min(len(TARGET_BOX_SCALES), nearest+2)) + target_box_size = TARGET_BOX_SCALES[target_ind] + im_scale = float(target_box_size) / box_size + #min_scale = float(SIZE)/np.min(im.shape[0:2]) + #if im_scale=im.shape[1] or center[1]>=im.shape[0]: + continue + if box_size0 + DEBUG = True + if DEBUG: + global TMP_ID + if TMP_ID<10: + tim = im.copy() + for i in xrange(new_rec['boxes'].shape[0]): + box = new_rec['boxes'][i].copy().astype(np.int) + cv2.rectangle(tim, (box[0], box[1]), (box[2], box[3]), (255, 0, 0), 1) + filename = './trainimages/train%d.png' % TMP_ID + TMP_ID+=1 + cv2.imwrite(filename, tim) + 
+ im_tensor = transform(im, config.PIXEL_MEANS, config.PIXEL_STDS, config.PIXEL_SCALE) + + processed_ims.append(im_tensor) + #print('boxes', new_rec['boxes'], file=sys.stderr) + im_info = [im_tensor.shape[2], im_tensor.shape[3], im_scale] + new_rec['im_info'] = im_info + processed_roidb.append(new_rec) + return processed_ims, processed_roidb + +def expand_bboxes(bboxes, + image_width, + image_height, + expand_left=2., + expand_up=2., + expand_right=2., + expand_down=2.): + """ + Expand bboxes, expand 2 times by defalut. + """ + expand_boxes = [] + for bbox in bboxes: + xmin = bbox[0] + ymin = bbox[1] + xmax = bbox[2] + ymax = bbox[3] + w = xmax - xmin + h = ymax - ymin + ex_xmin = max(xmin - w / expand_left, 0.) + ex_ymin = max(ymin - h / expand_up, 0.) + ex_xmax = min(xmax + w / expand_right, image_width) + ex_ymax = min(ymax + h / expand_down, image_height) + expand_boxes.append([ex_xmin, ex_ymin, ex_xmax, ex_ymax]) + return expand_boxes + +def get_crop_image1(roidb): + """ + preprocess image and return processed roidb + :param roidb: a list of roidb + :return: list of img as in mxnet format + roidb add new item['im_info'] + 0 --- x (width, second dim of im) + | + y (height, first dim of im) + """ + #roidb and each roi_rec can not be changed as it will be reused in next epoch + num_images = len(roidb) + processed_ims = [] + processed_roidb = [] + for i in range(num_images): + roi_rec = roidb[i] + if 'stream' in roi_rec: + im = cv2.imdecode(roi_rec['stream'], cv2.IMREAD_COLOR) + else: + assert os.path.exists(roi_rec['image']), '{} does not exist'.format(roi_rec['image']) + im = cv2.imread(roi_rec['image']) + if roidb[i]['flipped']: + im = im[:, ::-1, :] + if 'boxes_mask' in roi_rec: + #im = im.astype(np.float32) + boxes_mask = roi_rec['boxes_mask'].copy() + boxes_mask = boxes_mask.astype(np.int) + for j in xrange(boxes_mask.shape[0]): + m = boxes_mask[j] + im[m[1]:m[3],m[0]:m[2],:] = 127 + #print('find mask', m, file=sys.stderr) + SIZE = config.SCALES[0][0] + 
PRE_SCALES = [0.3, 0.45, 0.6, 0.8, 1.0] + #PRE_SCALES = [0.3, 0.45, 0.6, 0.8, 1.0, 0.8, 1.0, 0.8, 1.0] + _scale = random.choice(PRE_SCALES) + #_scale = np.random.uniform(PRE_SCALES[0], PRE_SCALES[-1]) + size = int(np.min(im.shape[0:2])*_scale) + #size = int(np.round(_scale*np.min(im.shape[0:2]))) + im_scale = float(SIZE)/size + #origin_im_scale = im_scale + #size = np.round(np.min(im.shape[0:2])*im_scale) + #im_scale *= (float(SIZE)/size) + origin_shape = im.shape + if _scale>10.0: #avoid im.size=SIZE and im.shape[1]>=SIZE + #print('image size', origin_shape, _scale, SIZE, size, im_scale) + + new_rec = roi_rec.copy() + new_rec['boxes'] = roi_rec['boxes'].copy() * im_scale + if config.FACE_LANDMARK: + new_rec['landmarks'] = roi_rec['landmarks'].copy() + new_rec['landmarks'][:,:,0:2] *= im_scale + retry = 0 + LIMIT = 25 + size = SIZE + while retry=im_new.shape[1] or centery>=im_new.shape[0]: + continue + if box_size0 or retry==LIMIT-1: + im = im_new + new_rec['boxes'] = np.array(valid_boxes) + new_rec['gt_classes'] = new_rec['gt_classes'][valid] + if config.FACE_LANDMARK: + new_rec['landmarks'] = np.array(valid_landmarks) + if config.HEAD_BOX: + face_box = new_rec['boxes'] + head_box = expand_bboxes(face_box, image_width=im.shape[1], image_height=im.shape[0]) + new_rec['boxes_head'] = np.array(head_box) + break + + retry+=1 + + if config.COLOR_MODE>0 and config.COLOR_JITTERING>0.0: + im = im.astype(np.float32) + im = color_aug(im, config.COLOR_JITTERING) + + #assert np.all(new_rec['landmarks'][:,10]>0.0) + global TMP_ID + if TMP_ID>=0 and TMP_ID<10: + tim = im.copy().astype(np.uint8) + for i in xrange(new_rec['boxes'].shape[0]): + box = new_rec['boxes'][i].copy().astype(np.int) + cv2.rectangle(tim, (box[0], box[1]), (box[2], box[3]), (255, 0, 0), 1) + print('draw box:', box) + if config.FACE_LANDMARK: + for i in xrange(new_rec['landmarks'].shape[0]): + landmark = new_rec['landmarks'][i].copy() + if landmark[0][2]<0: + print('zero', landmark) + continue + landmark = 
landmark.astype(np.int) + print('draw landmark', landmark) + for k in range(5): + color = (0, 0, 255) + if k==0 or k==3: + color = (0, 255, 0) + pp = (landmark[k][0], landmark[k][1]) + cv2.circle(tim, (pp[0], pp[1]), 1, color, 2) + filename = './trainimages/train%d.png' % TMP_ID + print('write', filename) + cv2.imwrite(filename, tim) + TMP_ID+=1 + + im_tensor = transform(im, config.PIXEL_MEANS, config.PIXEL_STDS, config.PIXEL_SCALE) + + processed_ims.append(im_tensor) + #print('boxes', new_rec['boxes'], file=sys.stderr) + im_info = [im_tensor.shape[2], im_tensor.shape[3], im_scale] + new_rec['im_info'] = np.array(im_info, dtype=np.float32) + processed_roidb.append(new_rec) + return processed_ims, processed_roidb + +def get_crop_image2(roidb): + """ + preprocess image and return processed roidb + :param roidb: a list of roidb + :return: list of img as in mxnet format + roidb add new item['im_info'] + 0 --- x (width, second dim of im) + | + y (height, first dim of im) + """ + #roidb and each roi_rec can not be changed as it will be reused in next epoch + num_images = len(roidb) + processed_ims = [] + processed_roidb = [] + for i in range(num_images): + roi_rec = roidb[i] + if 'stream' in roi_rec: + im = cv2.imdecode(roi_rec['stream'], cv2.IMREAD_COLOR) + else: + assert os.path.exists(roi_rec['image']), '{} does not exist'.format(roi_rec['image']) + im = cv2.imread(roi_rec['image']) + if roidb[i]['flipped']: + im = im[:, ::-1, :] + if 'boxes_mask' in roi_rec: + #im = im.astype(np.float32) + boxes_mask = roi_rec['boxes_mask'].copy() + boxes_mask = boxes_mask.astype(np.int) + for j in xrange(boxes_mask.shape[0]): + m = boxes_mask[j] + im[m[1]:m[3],m[0]:m[2],:] = 0 + #print('find mask', m, file=sys.stderr) + SIZE = config.SCALES[0][0] + scale_array = np.array([16,32,64,128,256,512], dtype=np.float32) + candidates = [] + for i in xrange(roi_rec['boxes'].shape[0]): + box = roi_rec['boxes'][i] + box_size = max(box[2]-box[0], box[3]-box[1]) + if box_sizeim.shape[1] or 
box[3]>im.shape[0]: + # continue; + candidates.append(i) + assert len(candidates)>0 + box_ind = random.choice(candidates) + box = roi_rec['boxes'][box_ind] + width = box[2]-box[0] + height = box[3]-box[1] + wid = width + hei = height + resize_width, resize_height = config.SCALES[0] + image_width = im.shape[0] + image_height = im.shape[1] + area = width*height + range_size = 0 + for scale_ind in range(0, len(scale_array) - 1): + if area > scale_array[scale_ind] ** 2 and area < \ + scale_array[scale_ind + 1] ** 2: + range_size = scale_ind + 1 + break + + if area > scale_array[len(scale_array) - 2]**2: + range_size = len(scale_array) - 2 + scale_choose = 0.0 + if range_size == 0: + rand_idx_size = 0 + else: + # np.random.randint range: [low, high) + rng_rand_size = np.random.randint(0, range_size + 1) + rand_idx_size = rng_rand_size % (range_size + 1) + + if rand_idx_size == range_size: + min_resize_val = scale_array[rand_idx_size] / 2.0 + max_resize_val = min(2.0 * scale_array[rand_idx_size], + 2 * math.sqrt(wid * hei)) + scale_choose = random.uniform(min_resize_val, max_resize_val) + else: + min_resize_val = scale_array[rand_idx_size] / 2.0 + max_resize_val = 2.0 * scale_array[rand_idx_size] + scale_choose = random.uniform(min_resize_val, max_resize_val) + + sample_bbox_size = wid * resize_width / scale_choose + + w_off_orig = 0.0 + h_off_orig = 0.0 + if sample_bbox_size < max(image_height, image_width): + if wid <= sample_bbox_size: + w_off_orig = np.random.uniform(xmin + wid - sample_bbox_size, + xmin) + else: + w_off_orig = np.random.uniform(xmin, + xmin + wid - sample_bbox_size) + + if hei <= sample_bbox_size: + h_off_orig = np.random.uniform(ymin + hei - sample_bbox_size, + ymin) + else: + h_off_orig = np.random.uniform(ymin, + ymin + hei - sample_bbox_size) + + else: + w_off_orig = np.random.uniform(image_width - sample_bbox_size, 0.0) + h_off_orig = np.random.uniform(image_height - sample_bbox_size, 0.0) + + w_off_orig = math.floor(w_off_orig) + h_off_orig = 
math.floor(h_off_orig) + + # Figure out top left coordinates. + w_off = 0.0 + h_off = 0.0 + w_off = float(w_off_orig / image_width) + h_off = float(h_off_orig / image_height) + im_new = im[up:(up+size), left:(left+size), :] + + sampled_bbox = bbox(w_off, h_off, + w_off + float(sample_bbox_size / image_width), + h_off + float(sample_bbox_size / image_height)) + return sampled_bbox + + box_size = max(box[2]-box[0], box[3]-box[1]) + dist = np.abs(TARGET_BOX_SCALES - box_size) + nearest = np.argmin(dist) + target_ind = random.randrange(min(len(TARGET_BOX_SCALES), nearest+2)) + target_box_size = TARGET_BOX_SCALES[target_ind] + im_scale = float(target_box_size) / box_size + PRE_SCALES = [0.3, 0.45, 0.6, 0.8, 1.0] + _scale = random.choice(PRE_SCALES) + #_scale = np.random.uniform(PRE_SCALES[0], PRE_SCALES[-1]) + size = int(np.round(_scale*np.min(im.shape[0:2]))) + im_scale = float(SIZE)/size + #origin_im_scale = im_scale + #size = np.round(np.min(im.shape[0:2])*im_scale) + #im_scale *= (float(SIZE)/size) + origin_shape = im.shape + if _scale>10.0: #avoid im.size=SIZE and im.shape[1]>=SIZE + + new_rec = roi_rec.copy() + new_rec['boxes'] = roi_rec['boxes'].copy() * im_scale + if config.FACE_LANDMARK: + new_rec['landmarks'] = roi_rec['landmarks'].copy() * im_scale + retry = 0 + LIMIT = 25 + size = SIZE + while retry=im_new.shape[1] or centery>=im_new.shape[0]: + continue + if box_size0 or retry==LIMIT-1: + im = im_new + new_rec['boxes'] = np.array(valid_boxes) + new_rec['gt_classes'] = new_rec['gt_classes'][valid] + if config.FACE_LANDMARK: + new_rec['landmarks'] = np.array(valid_landmarks) + break + + retry+=1 + + if config.COLOR_JITTERING>0.0: + im = im.astype(np.float32) + im = color_aug(im, config.COLOR_JITTERING) + + #assert np.all(new_rec['landmarks'][:,10]>0.0) + global TMP_ID + if TMP_ID>=0 and TMP_ID<10: + tim = im.copy().astype(np.uint8) + for i in xrange(new_rec['boxes'].shape[0]): + box = new_rec['boxes'][i].copy().astype(np.int) + cv2.rectangle(tim, (box[0], 
box[1]), (box[2], box[3]), (255, 0, 0), 1) + print('draw box:', box) + if config.FACE_LANDMARK: + for i in xrange(new_rec['landmarks'].shape[0]): + landmark = new_rec['landmarks'][i].copy() + if landmark[10]==0.0: + print('zero', landmark) + continue + landmark = landmark.astype(np.int) + print('draw landmark', landmark) + for k in range(5): + color = (0, 0, 255) + if k==0 or k==3: + color = (0, 255, 0) + pp = (landmark[k*2], landmark[1+k*2]) + cv2.circle(tim, (pp[0], pp[1]), 1, color, 2) + filename = './trainimages/train%d.png' % TMP_ID + print('write', filename) + cv2.imwrite(filename, tim) + TMP_ID+=1 + + im_tensor = transform(im, config.PIXEL_MEANS, config.PIXEL_STDS, config.PIXEL_SCALE) + + processed_ims.append(im_tensor) + #print('boxes', new_rec['boxes'], file=sys.stderr) + im_info = [im_tensor.shape[2], im_tensor.shape[3], im_scale] + new_rec['im_info'] = np.array(im_info, dtype=np.float32) + processed_roidb.append(new_rec) + return processed_ims, processed_roidb + +def do_mixup(im1, roidb1, im2, roidb2): + im = (im1+im2)/2.0 + roidb = {} + #print(roidb1.keys()) + #for k in roidb1: + for k in ['boxes', 'landmarks', 'gt_classes', 'im_info']: + v1 = roidb1[k] + v2 = roidb2[k] + if k!='im_info': + #print('try', k, v1.shape, v2.shape) + if v1.shape[0]>0 and v2.shape[0]>0: + v = np.concatenate( (v1, v2), axis=0 ) + else: + v = v1 + else: + v = v1 + #print(k, v1.shape, v2.shape, v.shape) + roidb[k] = v + return im, roidb + +def get_crop_image(roidb): + ims, roidbs = get_crop_image1(roidb) + if config.MIXUP>0.0 and np.random.random()=i: + j+=1 + im, roidb = do_mixup(im, roidb, ims[j], roidbs[j]) + ims[i] = im + roidbs[i] = roidb + return ims, roidbs + +def resize(im, target_size, max_size, stride=0, min_size=0): + """ + only resize input image to target size and return scale + :param im: BGR image input by opencv + :param target_size: one dimensional size (the short side) + :param max_size: one dimensional max size (the long side) + :param stride: if given, pad 
the image to designated stride + :return: + """ + im_shape = im.shape + im_size_min = np.min(im_shape[0:2]) + im_size_max = np.max(im_shape[0:2]) + im_scale = float(target_size) / float(im_size_min) + # prevent bigger axis from being more than max_size: + if np.round(im_scale * im_size_max) > max_size: + im_scale = float(max_size) / float(im_size_max) + if min_size>0 and np.round(im_scale*im_size_min) ['bbox_targets'] + :return: data, label + """ + num_images = len(roidb) + imgs, roidb = get_image(roidb) + im_array = tensor_vstack(imgs) + + assert config.TRAIN.BATCH_ROIS % config.TRAIN.BATCH_IMAGES == 0, \ + 'BATCHIMAGES {} must divide BATCH_ROIS {}'.format(config.TRAIN.BATCH_IMAGES, config.TRAIN.BATCH_ROIS) + rois_per_image = int(config.TRAIN.BATCH_ROIS / config.TRAIN.BATCH_IMAGES) + fg_rois_per_image = int(round(config.TRAIN.FG_FRACTION * rois_per_image)) + + rois_array = list() + labels_array = list() + bbox_targets_array = list() + bbox_weights_array = list() + + for im_i in range(num_images): + roi_rec = roidb[im_i] + + # infer num_classes from gt_overlaps + num_classes = roi_rec['gt_overlaps'].shape[1] + + # label = class RoI has max overlap with + rois = roi_rec['boxes'] + labels = roi_rec['max_classes'] + overlaps = roi_rec['max_overlaps'] + bbox_targets = roi_rec['bbox_targets'] + + im_rois, labels, bbox_targets, bbox_weights = \ + sample_rois(rois, fg_rois_per_image, rois_per_image, num_classes, + labels, overlaps, bbox_targets) + + # project im_rois + # do not round roi + rois = im_rois + batch_index = im_i * np.ones((rois.shape[0], 1)) + rois_array_this_image = np.hstack((batch_index, rois)) + rois_array.append(rois_array_this_image) + + # add labels + labels_array.append(labels) + bbox_targets_array.append(bbox_targets) + bbox_weights_array.append(bbox_weights) + + rois_array = np.array(rois_array) + labels_array = np.array(labels_array) + bbox_targets_array = np.array(bbox_targets_array) + bbox_weights_array = np.array(bbox_weights_array) + + data = 
{'data': im_array, + 'rois': rois_array} + label = {'label': labels_array, + 'bbox_target': bbox_targets_array, + 'bbox_weight': bbox_weights_array} + + return data, label + + +def sample_rois(rois, fg_rois_per_image, rois_per_image, num_classes, + labels=None, overlaps=None, bbox_targets=None, gt_boxes=None): + """ + generate random sample of ROIs comprising foreground and background examples + :param rois: all_rois [n, 4]; e2e: [n, 5] with batch_index + :param fg_rois_per_image: foreground roi number + :param rois_per_image: total roi number + :param num_classes: number of classes + :param labels: maybe precomputed + :param overlaps: maybe precomputed (max_overlaps) + :param bbox_targets: maybe precomputed + :param gt_boxes: optional for e2e [n, 5] (x1, y1, x2, y2, cls) + :return: (labels, rois, bbox_targets, bbox_weights) + """ + if labels is None: + overlaps = bbox_overlaps(rois[:, 1:].astype(np.float), gt_boxes[:, :4].astype(np.float)) + gt_assignment = overlaps.argmax(axis=1) + overlaps = overlaps.max(axis=1) + labels = gt_boxes[gt_assignment, 4] + + # foreground RoI with FG_THRESH overlap + fg_indexes = np.where(overlaps >= config.TRAIN.FG_THRESH)[0] + # guard against the case when an image has fewer than fg_rois_per_image foreground RoIs + fg_rois_per_this_image = np.minimum(fg_rois_per_image, fg_indexes.size) + # Sample foreground regions without replacement + if len(fg_indexes) > fg_rois_per_this_image: + fg_indexes = npr.choice(fg_indexes, size=fg_rois_per_this_image, replace=False) + + # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI) + bg_indexes = np.where((overlaps < config.TRAIN.BG_THRESH_HI) & (overlaps >= config.TRAIN.BG_THRESH_LO))[0] + # Compute number of background RoIs to take from this image (guarding against there being fewer than desired) + bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image + bg_rois_per_this_image = np.minimum(bg_rois_per_this_image, bg_indexes.size) + # Sample foreground regions without 
replacement + if len(bg_indexes) > bg_rois_per_this_image: + bg_indexes = npr.choice(bg_indexes, size=bg_rois_per_this_image, replace=False) + + # indexes selected + keep_indexes = np.append(fg_indexes, bg_indexes) + neg_idx = np.where(overlaps < config.TRAIN.FG_THRESH)[0] + neg_rois = rois[neg_idx] + # pad more to ensure a fixed minibatch size + while keep_indexes.shape[0] < rois_per_image: + gap = np.minimum(len(neg_rois), rois_per_image - keep_indexes.shape[0]) + gap_indexes = npr.choice(range(len(neg_rois)), size=gap, replace=False) + keep_indexes = np.append(keep_indexes, neg_idx[gap_indexes]) + + # select labels + labels = labels[keep_indexes] + # set labels of bg_rois to be 0 + labels[fg_rois_per_this_image:] = 0 + rois = rois[keep_indexes] + + # load or compute bbox_target + if bbox_targets is not None: + bbox_target_data = bbox_targets[keep_indexes, :] + else: + targets = bbox_transform(rois[:, 1:], gt_boxes[gt_assignment[keep_indexes], :4]) + if config.TRAIN.BBOX_NORMALIZATION_PRECOMPUTED: + targets = ((targets - np.array(config.TRAIN.BBOX_MEANS)) + / np.array(config.TRAIN.BBOX_STDS)) + bbox_target_data = np.hstack((labels[:, np.newaxis], targets)) + + bbox_targets, bbox_weights = \ + expand_bbox_regression_targets(bbox_target_data, num_classes) + + return rois, labels, bbox_targets, bbox_weights + +def get_fpn_rcnn_testbatch(roidb): + """ + return a dict of testbatch + :param roidb: ['image', 'flipped'] + ['boxes'] + :return: data, label, im_info + """ + assert len(roidb) == 1, 'Single batch only' + imgs, roidb = get_image(roidb) + im_array = imgs[0] + im_info = np.array([roidb[0]['im_info']], dtype=np.float32) + + im_rois = roidb[0]['boxes'] + rois = im_rois + + # assign rois + rois_area = np.sqrt((rois[:, 2] - rois[:, 0]) * (rois[:, 3] - rois[:, 1])) + area_threshold = {'P5': 448, 'P4': 224, 'P3': 112} + rois_p5 = rois[area_threshold['P5'] <= rois_area] + rois_p4 = rois[np.logical_and(area_threshold['P4'] <= rois_area, rois_area < 
area_threshold['P5'])] + rois_p3 = rois[np.logical_and(area_threshold['P3'] <= rois_area, rois_area < area_threshold['P4'])] + rois_p2 = rois[np.logical_and(0 < rois_area, rois_area < area_threshold['P3'])] + + # pad a virtual rois if on rois assigned + if rois_p5.size == 0: + rois_p5 = np.array([[12,34,56,78]]) + if rois_p4.size == 0: + rois_p4 = np.array([[12,34,56,78]]) + if rois_p3.size == 0: + rois_p3 = np.array([[12,34,56,78]]) + if rois_p2.size == 0: + rois_p2 = np.array([[12,34,56,78]]) + + p5_batch_index = 0 * np.ones((rois_p5.shape[0], 1)) + rois_p5_array = np.hstack((p5_batch_index, rois_p5))[np.newaxis, :] + + p4_batch_index = 0 * np.ones((rois_p4.shape[0], 1)) + rois_p4_array = np.hstack((p4_batch_index, rois_p4))[np.newaxis, :] + + p3_batch_index = 0 * np.ones((rois_p3.shape[0], 1)) + rois_p3_array = np.hstack((p3_batch_index, rois_p3))[np.newaxis, :] + + p2_batch_index = 0 * np.ones((rois_p2.shape[0], 1)) + rois_p2_array = np.hstack((p2_batch_index, rois_p2))[np.newaxis, :] + + data = {'data': im_array, + 'rois_stride32': rois_p5_array, + 'rois_stride16': rois_p4_array, + 'rois_stride8': rois_p3_array, + 'rois_stride4': rois_p2_array} + label = {} + + return data, label, im_info + +def get_fpn_maskrcnn_batch(roidb): + """ + return a dictionary that contains raw data. 
+ """ + num_images = len(roidb) + imgs, roidb = get_image(roidb, scale=config.TRAIN.SCALE) #TODO + #imgs, roidb = get_image(roidb) + im_array = tensor_vstack(imgs) + + assert config.TRAIN.BATCH_ROIS % config.TRAIN.BATCH_IMAGES == 0, \ + 'BATCHIMAGES {} must divide BATCH_ROIS {}'.format(config.TRAIN.BATCH_IMAGES, config.TRAIN.BATCH_ROIS) + rois_per_image = config.TRAIN.BATCH_ROIS / config.TRAIN.BATCH_IMAGES + fg_rois_per_image = np.round(config.TRAIN.FG_FRACTION * rois_per_image).astype(int) + + rois_on_imgs = dict() + labels_on_imgs = dict() + bbox_targets_on_imgs = dict() + bbox_weights_on_imgs = dict() + mask_targets_on_imgs = dict() + mask_weights_on_imgs = dict() + for s in config.RCNN_FEAT_STRIDE: + rois_on_imgs.update({'stride%s' % s : list()}) + labels_on_imgs.update({'stride%s' % s : list()}) + bbox_targets_on_imgs.update({'stride%s' % s : list()}) + bbox_weights_on_imgs.update({'stride%s' % s : list()}) + mask_targets_on_imgs.update({'stride%s' % s : list()}) + mask_weights_on_imgs.update({'stride%s' % s : list()}) + + # Sample rois + level_related_data_on_imgs = {} + for im_i in range(num_images): + roi_rec = roidb[im_i] + # infer num_classes from gt_overlaps + num_classes = roi_rec['gt_overlaps'].shape[1] + # label = class RoI has max overlap with + rois = roi_rec['boxes'] + labels = roi_rec['max_classes'] + overlaps = roi_rec['max_overlaps'] + bbox_targets = roi_rec['bbox_targets'] + im_info = roi_rec['im_info'] + + mask_targets = roi_rec['mask_targets'] + mask_labels = roi_rec['mask_labels'] + mask_inds = roi_rec['mask_inds'] + + assign_levels = roi_rec['assign_levels'] + + im_rois_on_levels, labels_on_levels, bbox_targets_on_levels, bbox_weights_on_levels, mask_targets_on_levels, mask_weights_on_levels = \ + sample_rois_fpn(rois, assign_levels, fg_rois_per_image, rois_per_image, num_classes, + labels, overlaps, bbox_targets, mask_targets=mask_targets, mask_labels=mask_labels, mask_inds=mask_inds, im_info=im_info) + + 
level_related_data_on_imgs.update({'img_%s' % im_i: {'rois_on_levels': im_rois_on_levels, + 'labels_on_levels': labels_on_levels, + 'bbox_targets_on_levels': bbox_targets_on_levels, + 'bbox_weights_on_levels': bbox_weights_on_levels, + 'mask_targets_on_levels': mask_targets_on_levels, + 'mask_weights_on_levels': mask_weights_on_levels, }}) + + return im_array, level_related_data_on_imgs + + +def sample_rois(rois, fg_rois_per_image, rois_per_image, num_classes, + labels=None, overlaps=None, bbox_targets=None, gt_boxes=None, mask_targets=None, + mask_labels=None, mask_inds=None): + """ + generate random sample of ROIs comprising foreground and background examples + :param rois: all_rois [n, 4]; e2e: [n, 5] with batch_index + :param fg_rois_per_image: foreground roi number + :param rois_per_image: total roi number + :param num_classes: number of classes + :param labels: maybe precomputed + :param overlaps: maybe precomputed (max_overlaps) + :param bbox_targets: maybe precomputed + :param gt_boxes: optional for e2e [n, 5] (x1, y1, x2, y2, cls) + :return: (rois, labels, bbox_targets, bbox_weights) + """ + if labels is None: + if len(gt_boxes) == 0: + gt_boxes = np.zeros((1, 5)) + gt_assignment = np.zeros((len(rois), ), dtype=np.int32) + overlaps = np.zeros((len(rois), )) + labels = np.zeros((len(rois), )) + else: + overlaps = bbox_overlaps(rois[:, 1:].astype(np.float), gt_boxes[:, :4].astype(np.float)) + gt_assignment = overlaps.argmax(axis=1) + overlaps = overlaps.max(axis=1) + labels = gt_boxes[gt_assignment, 4] + + num_rois = rois.shape[0] + # foreground RoI with FG_THRESH overlap + fg_indexes = np.where(overlaps >= config.TRAIN.FG_THRESH)[0] + # guard against the case when an image has fewer than fg_rois_per_image foreground RoIs + fg_rois_per_this_image = np.minimum(fg_rois_per_image, fg_indexes.size) + # Sample foreground regions without replacement + if len(fg_indexes) > fg_rois_per_this_image: + fg_indexes = npr.choice(fg_indexes, size=fg_rois_per_this_image, 
replace=False) + + # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI) + bg_indexes = np.where((overlaps < config.TRAIN.BG_THRESH_HI) & (overlaps >= config.TRAIN.BG_THRESH_LO))[0] + # Compute number of background RoIs to take from this image (guarding against there being fewer than desired) + bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image + bg_rois_per_this_image = np.minimum(bg_rois_per_this_image, bg_indexes.size) + # Sample foreground regions without replacement + if len(bg_indexes) > bg_rois_per_this_image: + bg_indexes = npr.choice(bg_indexes, size=bg_rois_per_this_image, replace=False) + + # indexes selected + keep_indexes = np.append(fg_indexes, bg_indexes) + + neg_idx = np.where(overlaps < config.TRAIN.FG_THRESH)[0] + neg_rois = rois[neg_idx] + + # pad more to ensure a fixed minibatch size + while keep_indexes.shape[0] < rois_per_image: + gap = np.minimum(len(neg_rois), rois_per_image - keep_indexes.shape[0]) + gap_indexes = npr.choice(range(len(neg_rois)), size=gap, replace=False) + keep_indexes = np.append(keep_indexes, neg_idx[gap_indexes]) + + # select labels + labels = labels[keep_indexes] + # set labels of bg_rois to be 0 + labels[fg_rois_per_this_image:] = 0 + rois = rois[keep_indexes] + if mask_targets is not None: + assert mask_labels is not None + assert mask_inds is not None + def _mask_umap(mask_targets, mask_labels, mask_inds): + _mask_targets = np.zeros((num_rois, num_classes, 28, 28), dtype=np.int8) + _mask_weights = np.zeros((num_rois, num_classes, 28, 28), dtype=np.int8) + _mask_targets[mask_inds, mask_labels] = mask_targets + _mask_weights[mask_inds, mask_labels] = 1 + _mask_weights[:, 0] = 0 # set background mask weight to zeros + return _mask_targets, _mask_weights # [num_rois, num_classes, 28, 28] + mask_targets, mask_weights = _mask_umap(mask_targets, mask_labels, mask_inds) + mask_targets = mask_targets[keep_indexes] + mask_weights = mask_weights[keep_indexes] + + # load or compute bbox_target + if 
bbox_targets is not None: + bbox_target_data = bbox_targets[keep_indexes, :] + else: + targets = bbox_transform(rois[:, 1:], gt_boxes[gt_assignment[keep_indexes], :4]) + if config.TRAIN.BBOX_NORMALIZATION_PRECOMPUTED: + targets = ((targets - np.array(config.TRAIN.BBOX_MEANS)) + / np.array(config.TRAIN.BBOX_STDS)) + bbox_target_data = np.hstack((labels[:, np.newaxis], targets)) + + bbox_targets, bbox_weights = \ + expand_bbox_regression_targets(bbox_target_data, num_classes) + + if mask_targets is not None: + return rois, labels, bbox_targets, bbox_weights, mask_targets, mask_weights + else: + return rois, labels, bbox_targets, bbox_weights + +def sample_rois_fpn(rois, assign_levels, fg_rois_per_image, rois_per_image, num_classes, + labels=None, overlaps=None, bbox_targets=None, mask_targets=None, mask_labels=None, mask_inds=None, gt_boxes=None, im_info=None): + """ + generate random sample of ROIs comprising foreground and background examples + :param rois: all_rois [n, 4]; e2e: [n, 5] with batch_index + :param assign_levels: [n] + :param fg_rois_per_image: foreground roi number + :param rois_per_image: total roi number + :param num_classes: number of classes + :param labels: maybe precomputed + :param overlaps: maybe precomputed (max_overlaps) + :param bbox_targets: maybe precomputed + :param gt_boxes: optional for e2e [n, 5] (x1, y1, x2, y2, cls) + :return: (rois, labels, bbox_targets, bbox_weights) + """ + DEBUG = False + if labels is None: + if len(gt_boxes) == 0: + gt_boxes = np.zeros((1, 5)) + gt_assignment = np.zeros((len(rois), ), dtype=np.int32) + overlaps = np.zeros((len(rois), )) + labels = np.zeros((len(rois), )) + else: + overlaps = bbox_overlaps(rois[:, 1:].astype(np.float), gt_boxes[:, :4].astype(np.float)) + gt_assignment = overlaps.argmax(axis=1) + overlaps = overlaps.max(axis=1) + labels = gt_boxes[gt_assignment, 4] + + num_rois = rois.shape[0] + # foreground RoI with FG_THRESH overlap + fg_indexes = np.where(overlaps >= config.TRAIN.FG_THRESH)[0] 
+ # guard against the case when an image has fewer than fg_rois_per_image foreground RoIs + fg_rois_per_this_image = np.minimum(fg_rois_per_image, fg_indexes.size) + + if DEBUG: + print 'fg total num:', len(fg_indexes) + + # Sample foreground regions without replacement + if len(fg_indexes) > fg_rois_per_this_image: + fg_indexes = npr.choice(fg_indexes, size=fg_rois_per_this_image, replace=False) + + # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI) + bg_indexes = np.where((overlaps < config.TRAIN.BG_THRESH_HI) & (overlaps >= config.TRAIN.BG_THRESH_LO))[0] + if DEBUG: + print 'bg total num:', len(bg_indexes) + # Compute number of background RoIs to take from this image (guarding against there being fewer than desired) + bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image + bg_rois_per_this_image = np.minimum(bg_rois_per_this_image, bg_indexes.size) + # Sample foreground regions without replacement + if len(bg_indexes) > bg_rois_per_this_image: + bg_indexes = npr.choice(bg_indexes, size=bg_rois_per_this_image, replace=False) + if DEBUG: + print 'fg num:', len(fg_indexes) + print 'bg num:', len(bg_indexes) + + # bg rois statistics + if DEBUG: + bg_assign = assign_levels[bg_indexes] + bg_rois_on_levels = dict() + for i, s in enumerate(config.RCNN_FEAT_STRIDE): + bg_rois_on_levels.update({'stride%s'%s:len(np.where(bg_assign == s)[0])}) + print bg_rois_on_levels + + # indexes selected + keep_indexes = np.append(fg_indexes, bg_indexes) + + neg_idx = np.where(overlaps < config.TRAIN.FG_THRESH)[0] + neg_rois = rois[neg_idx] + + # pad more to ensure a fixed minibatch size + while keep_indexes.shape[0] < rois_per_image: + gap = np.minimum(len(neg_rois), rois_per_image - keep_indexes.shape[0]) + gap_indexes = npr.choice(range(len(neg_rois)), size=gap, replace=False) + keep_indexes = np.append(keep_indexes, neg_idx[gap_indexes]) + + # select labels + labels = labels[keep_indexes] + # set labels of bg_rois to be 0 + 
labels[fg_rois_per_this_image:] = 0 + rois = rois[keep_indexes] + assign_levels = assign_levels[keep_indexes] + + if mask_targets is not None: + assert mask_labels is not None + assert mask_inds is not None + def _mask_umap(mask_targets, mask_labels, mask_inds): + _mask_targets = np.zeros((num_rois, num_classes, 28, 28), dtype=np.int8) + _mask_weights = np.zeros((num_rois, num_classes, 1, 1), dtype=np.int8) + _mask_targets[mask_inds, mask_labels] = mask_targets + _mask_weights[mask_inds, mask_labels] = 1 + return _mask_targets, _mask_weights # [num_rois, num_classes, 28, 28] + mask_targets, mask_weights = _mask_umap(mask_targets, mask_labels, mask_inds) + mask_targets = mask_targets[keep_indexes] + mask_weights = mask_weights[keep_indexes] + + # load or compute bbox_target + if bbox_targets is not None: + bbox_target_data = bbox_targets[keep_indexes, :] + else: + targets = bbox_transform(rois[:, 1:], gt_boxes[gt_assignment[keep_indexes], :4]) + if config.TRAIN.BBOX_NORMALIZATION_PRECOMPUTED: + targets = ((targets - np.array(config.TRAIN.BBOX_MEANS)) + / np.array(config.TRAIN.BBOX_STDS)) + bbox_target_data = np.hstack((labels[:, np.newaxis], targets)) + + bbox_targets, bbox_weights = \ + expand_bbox_regression_targets(bbox_target_data, num_classes) + + # Assign to levels + rois_on_levels = dict() + labels_on_levels = dict() + bbox_targets_on_levels = dict() + bbox_weights_on_levels = dict() + if mask_targets is not None: + mask_targets_on_levels = dict() + mask_weights_on_levels = dict() + for i, s in enumerate(config.RCNN_FEAT_STRIDE): + index = np.where(assign_levels == s) + _rois = rois[index] + _labels = labels[index] + _bbox_targets = bbox_targets[index] + _bbox_weights = bbox_weights[index] + if mask_targets is not None: + _mask_targets = mask_targets[index] + _mask_weights = mask_weights[index] + + rois_on_levels.update({'stride%s' % s: _rois}) + labels_on_levels.update({'stride%s' % s: _labels}) + bbox_targets_on_levels.update({'stride%s' % s: 
_bbox_targets}) + bbox_weights_on_levels.update({'stride%s' % s: _bbox_weights}) + if mask_targets is not None: + mask_targets_on_levels.update({'stride%s' % s: _mask_targets}) + mask_weights_on_levels.update({'stride%s' % s: _mask_weights}) + + if mask_targets is not None: + return rois_on_levels, labels_on_levels, bbox_targets_on_levels, bbox_weights_on_levels,mask_targets_on_levels,mask_weights_on_levels + else: + return rois_on_levels, labels_on_levels, bbox_targets_on_levels, bbox_weights_on_levels + +def get_rois(rois, rois_per_image, num_classes, + labels=None, overlaps=None, bbox_targets=None, gt_boxes=None): + """ + get top N ROIs, used in online hard example mining + :param rois: all_rois [n, 4]; e2e: [n, 5] with batch_index + :param rois_per_image: total roi number + :param num_classes: number of classes + :param labels: maybe precomputed + :param overlaps: maybe precomputed (max_overlaps) + :param bbox_targets: maybe precomputed + :param gt_boxes: optional for e2e [n, 5] (x1, y1, x2, y2, cls) + :return: (rois, labels, bbox_targets, bbox_weights) + """ + if labels is None: + if len(gt_boxes) == 0: + gt_boxes = np.array([[1, 1, 1, 1, 0]]) + overlaps = bbox_overlaps(rois[:, 1:].astype(np.float), gt_boxes[:, :4].astype(np.float)) + gt_assignment = overlaps.argmax(axis=1) + overlaps = overlaps.max(axis=1) + labels = gt_boxes[gt_assignment, 4] + + # select indices + keep_indexes = np.arange(rois.shape[0]) + if keep_indexes.shape[0] > rois_per_image: + keep_indexes = npr.choice(keep_indexes, size=rois_per_image, replace=False) + + # if not enough, pad until rois_per_image is satisfied + while keep_indexes.shape[0] < rois_per_image: + gap = np.minimum(rois_per_image - keep_indexes.shape[0], len(rois)) + gap_indexes = npr.choice(range(len(rois)), size=gap, replace=False) + keep_indexes = np.append(keep_indexes, gap_indexes) + + # suppress any bg defined by overlap + bg_indexes = np.where((overlaps < config.TRAIN.BG_THRESH_HI) & (overlaps >= 
config.TRAIN.BG_THRESH_LO))[0] + labels[bg_indexes] = 0 + + labels = labels[keep_indexes] + rois = rois[keep_indexes] + + # load or compute bbox_target + if bbox_targets is not None: + bbox_target_data = bbox_targets[keep_indexes, :] + else: + targets = bbox_transform(rois[:, 1:], gt_boxes[gt_assignment[keep_indexes], :4]) + if config.TRAIN.BBOX_NORMALIZATION_PRECOMPUTED: + targets = ((targets - np.array(config.TRAIN.BBOX_MEANS)) + / np.array(config.TRAIN.BBOX_STDS)) + bbox_target_data = np.hstack((labels[:, np.newaxis], targets)) + + bbox_targets, bbox_weights = \ + expand_bbox_regression_targets(bbox_target_data, num_classes) + + return rois, labels, bbox_targets, bbox_weights + diff --git a/RetinaFace/rcnn/io/rpn.py b/RetinaFace/rcnn/io/rpn.py new file mode 100644 index 0000000..cd12a70 --- /dev/null +++ b/RetinaFace/rcnn/io/rpn.py @@ -0,0 +1,758 @@ +""" +RPN: +data = + {'data': [num_images, c, h, w], + 'im_info': [num_images, 4] (optional)} +label = + {'gt_boxes': [num_boxes, 5] (optional), + 'label': [batch_size, 1] <- [batch_size, num_anchors, feat_height, feat_width], + 'bbox_target': [batch_size, num_anchors, feat_height, feat_width], + 'bbox_weight': [batch_size, num_anchors, feat_height, feat_width]} +""" + +from __future__ import print_function +import sys +import logging +import datetime +import numpy as np +import numpy.random as npr + +from ..logger import logger +from ..config import config +from .image import get_image, tensor_vstack, get_crop_image +from ..processing.generate_anchor import generate_anchors, anchors_plane +from ..processing.bbox_transform import bbox_overlaps, bbox_transform, landmark_transform + +STAT = {0:0, 8:0, 16:0, 32:0} + +def get_rpn_testbatch(roidb): + """ + return a dict of testbatch + :param roidb: ['image', 'flipped'] + :return: data, label, im_info + """ + assert len(roidb) == 1, 'Single batch only' + imgs, roidb = get_image(roidb) + im_array = imgs[0] + im_info = np.array([roidb[0]['im_info']], dtype=np.float32) + + 
data = {'data': im_array, + 'im_info': im_info} + label = {} + + return data, label, im_info + +def get_rpn_batch(roidb): + """ + prototype for rpn batch: data, im_info, gt_boxes + :param roidb: ['image', 'flipped'] + ['gt_boxes', 'boxes', 'gt_classes'] + :return: data, label + """ + assert len(roidb) == 1, 'Single batch only' + imgs, roidb = get_image(roidb) + im_array = imgs[0] + im_info = np.array([roidb[0]['im_info']], dtype=np.float32) + + # gt boxes: (x1, y1, x2, y2, cls) + if roidb[0]['gt_classes'].size > 0: + gt_inds = np.where(roidb[0]['gt_classes'] != 0)[0] + gt_boxes = np.empty((roidb[0]['boxes'].shape[0], 5), dtype=np.float32) + gt_boxes[:, 0:4] = roidb[0]['boxes'][gt_inds, :] + gt_boxes[:, 4] = roidb[0]['gt_classes'][gt_inds] + else: + gt_boxes = np.empty((0, 5), dtype=np.float32) + + data = {'data': im_array, + 'im_info': im_info} + label = {'gt_boxes': gt_boxes} + + return data, label + +def get_crop_batch(roidb): + """ + prototype for rpn batch: data, im_info, gt_boxes + :param roidb: ['image', 'flipped'] + ['gt_boxes', 'boxes', 'gt_classes'] + :return: data, label + """ + #assert len(roidb) == 1, 'Single batch only' + data_list = [] + label_list = [] + imgs, roidb = get_crop_image(roidb) + assert len(imgs)==len(roidb) + for i in range(len(imgs)): + im_array = imgs[i] + im_info = np.array([roidb[i]['im_info']], dtype=np.float32) + + # gt boxes: (x1, y1, x2, y2, cls) + if roidb[i]['gt_classes'].size > 0: + gt_inds = np.where(roidb[i]['gt_classes'] != 0)[0] + gt_boxes = np.empty((roidb[i]['boxes'].shape[0], 5), dtype=np.float32) + gt_boxes[:, 0:4] = roidb[i]['boxes'][gt_inds, :] + gt_boxes[:, 4] = roidb[i]['gt_classes'][gt_inds] + if config.USE_BLUR: + gt_blur = roidb[i]['blur'] + if config.FACE_LANDMARK: + #gt_landmarks = np.empty((roidb[i]['landmarks'].shape[0], 11), dtype=np.float32) + gt_landmarks = roidb[i]['landmarks'][gt_inds,:,:] + if config.HEAD_BOX: + gt_boxes_head = np.empty((roidb[i]['boxes_head'].shape[0], 5), dtype=np.float32) + 
gt_boxes_head[:, 0:4] = roidb[i]['boxes_head'][gt_inds, :] + gt_boxes_head[:, 4] = roidb[i]['gt_classes'][gt_inds] + else: + gt_boxes = np.empty((0, 5), dtype=np.float32) + if config.USE_BLUR: + gt_blur = np.empty((0,), dtype=np.float32) + if config.FACE_LANDMARK: + gt_landmarks = np.empty((0, 5, 3), dtype=np.float32) + if config.HEAD_BOX: + gt_boxes_head = np.empty((0, 5), dtype=np.float32) + + data = {'data': im_array, + 'im_info': im_info} + label = {'gt_boxes': gt_boxes} + if config.USE_BLUR: + label['gt_blur'] = gt_blur + if config.FACE_LANDMARK: + label['gt_landmarks'] = gt_landmarks + if config.HEAD_BOX: + label['gt_boxes_head'] = gt_boxes_head + data_list.append(data) + label_list.append(label) + + return data_list, label_list + +def assign_anchor_fpn(feat_shape, gt_label, im_info, landmark=False, prefix='face', select_stride=0): + """ + assign ground truth boxes to anchor positions + :param feat_shape: infer output shape + :param gt_boxes: assign ground truth + :param im_info: filter out anchors overlapped with edges + :return: tuple + labels: of shape (batch_size, 1) <- (batch_size, num_anchors, feat_height, feat_width) + bbox_targets: of shape (batch_size, num_anchors * 4, feat_height, feat_width) + bbox_weights: mark the assigned anchors + """ + def _unmap(data, count, inds, fill=0): + """" unmap a subset inds of data into original data of size count """ + if len(data.shape) == 1: + ret = np.empty((count,), dtype=np.float32) + ret.fill(fill) + ret[inds] = data + else: + ret = np.empty((count,) + data.shape[1:], dtype=np.float32) + ret.fill(fill) + ret[inds, :] = data + return ret + + global STAT + DEBUG = False + + im_info = im_info[0] + gt_boxes = gt_label['gt_boxes'] + # clean up boxes + nonneg = np.where(gt_boxes[:, 4] != -1)[0] + gt_boxes = gt_boxes[nonneg] + if config.USE_BLUR: + gt_blur = gt_label['gt_blur'] + gt_blur = gt_blur[nonneg] + if landmark: + gt_landmarks = gt_label['gt_landmarks'] + gt_landmarks = gt_landmarks[nonneg] + assert 
gt_boxes.shape[0]==gt_landmarks.shape[0] + #scales = np.array(scales, dtype=np.float32) + feat_strides = config.RPN_FEAT_STRIDE + bbox_pred_len = 4 + landmark_pred_len = 10 + if config.USE_BLUR: + gt_boxes[:,4] = gt_blur + bbox_pred_len = 5 + if config.USE_OCCLUSION: + landmark_pred_len = 15 + + anchors_list = [] + anchors_num_list = [] + inds_inside_list = [] + feat_infos = [] + A_list = [] + for i in range(len(feat_strides)): + stride = feat_strides[i] + sstride = str(stride) + base_size = config.RPN_ANCHOR_CFG[sstride]['BASE_SIZE'] + allowed_border = config.RPN_ANCHOR_CFG[sstride]['ALLOWED_BORDER'] + ratios = config.RPN_ANCHOR_CFG[sstride]['RATIOS'] + scales = config.RPN_ANCHOR_CFG[sstride]['SCALES'] + base_anchors = generate_anchors(base_size=base_size, ratios=list(ratios), scales=np.array(scales, dtype=np.float32), stride = stride, dense_anchor = config.DENSE_ANCHOR) + num_anchors = base_anchors.shape[0] + feat_height, feat_width = feat_shape[i][-2:] + feat_stride = feat_strides[i] + feat_infos.append([feat_height, feat_width]) + + A = num_anchors + A_list.append(A) + K = feat_height * feat_width + + all_anchors = anchors_plane(feat_height, feat_width, feat_stride, base_anchors) + all_anchors = all_anchors.reshape((K * A, 4)) + #print('anchor0', stride, all_anchors[0]) + + total_anchors = int(K * A) + anchors_num_list.append(total_anchors) + # only keep anchors inside the image + inds_inside = np.where((all_anchors[:, 0] >= -allowed_border) & + (all_anchors[:, 1] >= -allowed_border) & + (all_anchors[:, 2] < im_info[1] + allowed_border) & + (all_anchors[:, 3] < im_info[0] + allowed_border))[0] + if DEBUG: + print('total_anchors', total_anchors) + print('inds_inside', len(inds_inside)) + + # keep only inside anchors + anchors = all_anchors[inds_inside, :] + #print('AA', anchors.shape, len(inds_inside)) + + anchors_list.append(anchors) + inds_inside_list.append(inds_inside) + + # Concat anchors from each level + anchors = np.concatenate(anchors_list) + for i in 
range(1, len(inds_inside_list)): + inds_inside_list[i] = inds_inside_list[i] + sum(anchors_num_list[:i]) + inds_inside = np.concatenate(inds_inside_list) + total_anchors = sum(anchors_num_list) + #print('total_anchors', anchors.shape[0], len(inds_inside), file=sys.stderr) + + # label: 1 is positive, 0 is negative, -1 is dont care + labels = np.empty((len(inds_inside),), dtype=np.float32) + labels.fill(-1) + #print('BB', anchors.shape, len(inds_inside)) + #print('gt_boxes', gt_boxes.shape, file=sys.stderr) + + if gt_boxes.size > 0: + # overlap between the anchors and the gt boxes + # overlaps (ex, gt) + overlaps = bbox_overlaps(anchors.astype(np.float), gt_boxes.astype(np.float)) + argmax_overlaps = overlaps.argmax(axis=1) + #print('AAA', argmax_overlaps.shape) + max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps] + gt_argmax_overlaps = overlaps.argmax(axis=0) + gt_max_overlaps = overlaps[gt_argmax_overlaps, np.arange(overlaps.shape[1])] + gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0] + + if not config.TRAIN.RPN_CLOBBER_POSITIVES: + # assign bg labels first so that positive labels can clobber them + labels[max_overlaps < config.TRAIN.RPN_NEGATIVE_OVERLAP] = 0 + + # fg label: for each gt, anchor with highest overlap + if config.TRAIN.RPN_FORCE_POSITIVE: + labels[gt_argmax_overlaps] = 1 + + # fg label: above threshold IoU + labels[max_overlaps >= config.TRAIN.RPN_POSITIVE_OVERLAP] = 1 + + if config.TRAIN.RPN_CLOBBER_POSITIVES: + # assign bg labels last so that negative labels can clobber positives + labels[max_overlaps < config.TRAIN.RPN_NEGATIVE_OVERLAP] = 0 + else: + labels[:] = 0 + fg_inds = np.where(labels == 1)[0] + #print('fg count', len(fg_inds)) + + # subsample positive labels if we have too many + if config.TRAIN.RPN_ENABLE_OHEM==0: + fg_inds = np.where(labels == 1)[0] + num_fg = int(config.TRAIN.RPN_FG_FRACTION * config.TRAIN.RPN_BATCH_SIZE) + if len(fg_inds) > num_fg: + disable_inds = npr.choice(fg_inds, size=(len(fg_inds) 
- num_fg), replace=False) + if DEBUG: + disable_inds = fg_inds[:(len(fg_inds) - num_fg)] + labels[disable_inds] = -1 + + # subsample negative labels if we have too many + num_bg = config.TRAIN.RPN_BATCH_SIZE - np.sum(labels == 1) + bg_inds = np.where(labels == 0)[0] + if len(bg_inds) > num_bg: + disable_inds = npr.choice(bg_inds, size=(len(bg_inds) - num_bg), replace=False) + if DEBUG: + disable_inds = bg_inds[:(len(bg_inds) - num_bg)] + labels[disable_inds] = -1 + + #fg_inds = np.where(labels == 1)[0] + #num_fg = len(fg_inds) + #num_bg = num_fg*int(1.0/config.TRAIN.RPN_FG_FRACTION-1) + + #bg_inds = np.where(labels == 0)[0] + #if len(bg_inds) > num_bg: + # disable_inds = npr.choice(bg_inds, size=(len(bg_inds) - num_bg), replace=False) + # if DEBUG: + # disable_inds = bg_inds[:(len(bg_inds) - num_bg)] + # labels[disable_inds] = -1 + else: + fg_inds = np.where(labels == 1)[0] + num_fg = len(fg_inds) + bg_inds = np.where(labels == 0)[0] + num_bg = len(bg_inds) + + #print('anchor stat', num_fg, num_bg) + + + bbox_targets = np.zeros((len(inds_inside), bbox_pred_len), dtype=np.float32) + if gt_boxes.size > 0: + #print('GT', gt_boxes.shape, gt_boxes[argmax_overlaps, :4].shape) + bbox_targets[:,:] = bbox_transform(anchors, gt_boxes[argmax_overlaps, :]) + #bbox_targets[:,4] = gt_blur + + bbox_weights = np.zeros((len(inds_inside), bbox_pred_len), dtype=np.float32) + #bbox_weights[labels == 1, :] = np.array(config.TRAIN.RPN_BBOX_WEIGHTS) + bbox_weights[labels == 1, 0:4] = 1.0 + if bbox_pred_len>4: + bbox_weights[labels == 1, 4:bbox_pred_len] = 0.1 + + if landmark: + landmark_targets = np.zeros((len(inds_inside), landmark_pred_len), dtype=np.float32) + #landmark_weights = np.zeros((len(inds_inside), 10), dtype=np.float32) + landmark_weights = np.zeros((len(inds_inside), landmark_pred_len), dtype=np.float32) + #landmark_weights[labels == 1, :] = np.array(config.TRAIN.RPN_LANDMARK_WEIGHTS) + if landmark_pred_len==10: + landmark_weights[labels == 1, :] = 1.0 + elif 
landmark_pred_len==15: + v = [1.0, 1.0, 0.1] * 5 + assert len(v)==15 + landmark_weights[labels == 1, :] = np.array(v) + else: + assert False + #TODO here + if gt_landmarks.size > 0: + #print('AAA',argmax_overlaps) + a_landmarks = gt_landmarks[argmax_overlaps,:,:] + landmark_targets[:] = landmark_transform(anchors, a_landmarks) + invalid = np.where(a_landmarks[:,0,2]<0.0)[0] + #assert len(invalid)==0 + #landmark_weights[invalid, :] = np.array(config.TRAIN.RPN_INVALID_LANDMARK_WEIGHTS) + landmark_weights[invalid, :] = 0.0 + + #if DEBUG: + # _sums = bbox_targets[labels == 1, :].sum(axis=0) + # _squared_sums = (bbox_targets[labels == 1, :] ** 2).sum(axis=0) + # _counts = np.sum(labels == 1) + # means = _sums / (_counts + 1e-14) + # stds = np.sqrt(_squared_sums / _counts - means ** 2) + # print 'means', means + # print 'stdevs', stds + # map up to original set of anchors + #print(labels.shape, total_anchors, inds_inside.shape, inds_inside[0], inds_inside[-1]) + labels = _unmap(labels, total_anchors, inds_inside, fill=-1) + bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0) + bbox_weights = _unmap(bbox_weights, total_anchors, inds_inside, fill=0) + if landmark: + landmark_targets = _unmap(landmark_targets, total_anchors, inds_inside, fill=0) + landmark_weights = _unmap(landmark_weights, total_anchors, inds_inside, fill=0) + #print('CC', anchors.shape, len(inds_inside)) + + #if DEBUG: + # if gt_boxes.size > 0: + # print 'rpn: max max_overlaps', np.max(max_overlaps) + # print 'rpn: num_positives', np.sum(labels == 1) + # print 'rpn: num_negatives', np.sum(labels == 0) + # _fg_sum = np.sum(labels == 1) + # _bg_sum = np.sum(labels == 0) + # _count = 1 + # print 'rpn: num_positive avg', _fg_sum / _count + # print 'rpn: num_negative avg', _bg_sum / _count + + # resahpe + label_list = list() + bbox_target_list = list() + bbox_weight_list = list() + if landmark: + landmark_target_list = list() + landmark_weight_list = list() + anchors_num_range = [0] + 
anchors_num_list + label = {} + for i in range(len(feat_strides)): + stride = feat_strides[i] + feat_height, feat_width = feat_infos[i] + A = A_list[i] + _label = labels[sum(anchors_num_range[:i+1]):sum(anchors_num_range[:i+1])+anchors_num_range[i+1]] + if select_stride>0 and stride!=select_stride: + #print('set', stride, select_stride) + _label[:] = -1 + #print('_label', _label.shape, select_stride) + #_fg_inds = np.where(_label == 1)[0] + #n_fg = len(_fg_inds) + #STAT[0]+=1 + #STAT[stride]+=n_fg + #if STAT[0]%100==0: + # print('rpn_stat', STAT, file=sys.stderr) + bbox_target = bbox_targets[sum(anchors_num_range[:i+1]):sum(anchors_num_range[:i+1])+anchors_num_range[i+1]] + bbox_weight = bbox_weights[sum(anchors_num_range[:i+1]):sum(anchors_num_range[:i+1])+anchors_num_range[i+1]] + if landmark: + landmark_target = landmark_targets[sum(anchors_num_range[:i+1]):sum(anchors_num_range[:i+1])+anchors_num_range[i+1]] + landmark_weight = landmark_weights[sum(anchors_num_range[:i+1]):sum(anchors_num_range[:i+1])+anchors_num_range[i+1]] + + _label = _label.reshape((1, feat_height, feat_width, A)).transpose(0, 3, 1, 2) + _label = _label.reshape((1, A * feat_height * feat_width)) + bbox_target = bbox_target.reshape((1, feat_height*feat_width, A * bbox_pred_len)).transpose(0, 2, 1) + bbox_weight = bbox_weight.reshape((1, feat_height*feat_width, A * bbox_pred_len)).transpose((0, 2, 1)) + label['%s_label_stride%d'%(prefix, stride)] = _label + label['%s_bbox_target_stride%d'%(prefix,stride)] = bbox_target + label['%s_bbox_weight_stride%d'%(prefix,stride)] = bbox_weight + if landmark: + landmark_target = landmark_target.reshape((1, feat_height*feat_width, A * landmark_pred_len)).transpose(0, 2, 1) + landmark_weight = landmark_weight.reshape((1, feat_height*feat_width, A * landmark_pred_len)).transpose((0, 2, 1)) + label['%s_landmark_target_stride%d'%(prefix,stride)] = landmark_target + label['%s_landmark_weight_stride%d'%(prefix,stride)] = landmark_weight + #print('in_rpn', 
class AA:
    """Pre-computed FPN anchors plus per-image RPN target assignment.

    The constructor builds, once, the anchors of every stride listed in
    ``config.RPN_FEAT_STRIDE`` and remembers which of them lie inside the
    allowed border.  ``assign_anchor_fpn`` then reuses that cache to label the
    anchors against the ground truth of one image.
    """

    def __init__(self, feat_shape):
        """Build and cache the anchors for every feature stride.

        :param feat_shape: one shape per stride (same order as
            ``config.RPN_FEAT_STRIDE``); only the last two entries of each
            shape (height, width) are read.
        """
        self.feat_shape = feat_shape
        feat_strides = config.RPN_FEAT_STRIDE
        anchors_list = []
        anchors_num_list = []
        inds_inside_list = []
        feat_infos = []
        A_list = []
        # Number of anchors contributed by the strides processed so far; used
        # to turn per-stride indices into indices of the concatenated set.
        offset = 0
        for i, stride in enumerate(feat_strides):
            anchor_cfg = config.RPN_ANCHOR_CFG[str(stride)]
            allowed_border = anchor_cfg['ALLOWED_BORDER']
            base_anchors = generate_anchors(
                base_size=anchor_cfg['BASE_SIZE'],
                ratios=list(anchor_cfg['RATIOS']),
                scales=np.array(anchor_cfg['SCALES'], dtype=np.float32),
                stride=stride,
                dense_anchor=config.DENSE_ANCHOR)
            A = base_anchors.shape[0]
            feat_height, feat_width = feat_shape[i][-2:]
            feat_infos.append([feat_height, feat_width])
            A_list.append(A)
            K = feat_height * feat_width

            all_anchors = anchors_plane(feat_height, feat_width, stride, base_anchors)
            all_anchors = all_anchors.reshape((K * A, 4))

            total_anchors = int(K * A)
            anchors_num_list.append(total_anchors)

            # Only keep anchors inside the image (plus the allowed border).
            # NOTE(review): both the x and the y limit come from
            # config.SCALES[0][1], i.e. this presumably assumes square
            # training inputs -- confirm against the data loader.
            limit = config.SCALES[0][1] + allowed_border
            inds_inside = np.where((all_anchors[:, 0] >= -allowed_border) &
                                   (all_anchors[:, 1] >= -allowed_border) &
                                   (all_anchors[:, 2] < limit) &
                                   (all_anchors[:, 3] < limit))[0]

            anchors_list.append(all_anchors[inds_inside, :])
            inds_inside_list.append(inds_inside + offset)
            offset += total_anchors

        self.anchors = np.concatenate(anchors_list)
        self.inds_inside = np.concatenate(inds_inside_list)
        self.anchors_num_list = anchors_num_list
        self.feat_infos = feat_infos
        self.A_list = A_list
        self._times = [0.0, 0.0, 0.0, 0.0]

    @staticmethod
    def _unmap(data, count, inds, fill=0):
        """Scatter ``data`` (one row per entry of ``inds``) into a float32
        array whose first dimension is ``count``; all other rows are ``fill``.
        """
        ret = np.full((count,) + data.shape[1:], fill, dtype=np.float32)
        ret[inds] = data
        return ret

    def assign_anchor_fpn(self, gt_label, im_info, landmark=False, prefix='face', select_stride=0):
        """Assign classification / regression targets to the cached anchors.

        :param gt_label: dict with ``'gt_boxes'`` (rows whose column 4 equals
            -1 are dropped), ``'gt_blur'`` when ``config.USE_BLUR`` is set and
            ``'gt_landmarks'`` when ``landmark`` is true.
        :param im_info: accepted for interface compatibility; not used.
        :param landmark: also produce landmark targets and weights.
        :param prefix: prefix of every key in the returned dict.
        :param select_stride: when > 0, the labels of every other stride are
            set to -1 (ignored).
        :return: dict of float32 arrays.  Per stride:
            ``'<prefix>_label_stride<s>'`` of shape ``(1, A*H*W)`` and
            ``'<prefix>_bbox_target_stride<s>'`` /
            ``'<prefix>_bbox_weight_stride<s>'`` of shape
            ``(1, A*bbox_len, H*W)``; plus the concatenation over strides
            under ``'<prefix>_label'``, ``'<prefix>_bbox_target'`` and
            ``'<prefix>_bbox_weight'``.  With ``landmark`` the analogous
            ``landmark_target`` / ``landmark_weight`` entries are added.
        """
        # NOTE(review): the original read a bare ``DEBUG`` here, which is only
        # a local of __init__; unless the module defines it (not visible from
        # this class) that is a NameError.  Fall back to False when absent.
        debug = globals().get('DEBUG', False)

        gt_boxes = gt_label['gt_boxes']
        # Drop boxes flagged with -1 in column 4 (fancy indexing copies, so
        # the caller's array is never modified below).
        nonneg = np.where(gt_boxes[:, 4] != -1)[0]
        gt_boxes = gt_boxes[nonneg]
        if config.USE_BLUR:
            gt_blur = gt_label['gt_blur'][nonneg]
        if landmark:
            gt_landmarks = gt_label['gt_landmarks'][nonneg]
            assert gt_boxes.shape[0] == gt_landmarks.shape[0]

        feat_strides = config.RPN_FEAT_STRIDE
        bbox_pred_len = 4
        landmark_pred_len = 10
        if config.USE_BLUR:
            # The blur score rides along as a fifth regression target.
            gt_boxes[:, 4] = gt_blur
            bbox_pred_len = 5
        if config.USE_OCCLUSION:
            landmark_pred_len = 15

        anchors = self.anchors
        inds_inside = self.inds_inside
        anchors_num_list = self.anchors_num_list
        feat_infos = self.feat_infos
        A_list = self.A_list
        total_anchors = sum(anchors_num_list)
        num_inside = len(inds_inside)

        # label: 1 is positive, 0 is negative, -1 is dont care
        labels = np.full((num_inside,), -1, dtype=np.float32)

        if gt_boxes.size > 0:
            # overlaps[anchor, gt].  np.float64 replaces the ``np.float``
            # alias that NumPy 1.24 removed (same dtype).
            overlaps = bbox_overlaps(anchors.astype(np.float64), gt_boxes.astype(np.float64))
            argmax_overlaps = overlaps.argmax(axis=1)
            max_overlaps = overlaps[np.arange(num_inside), argmax_overlaps]
            gt_argmax_overlaps = overlaps.argmax(axis=0)
            gt_max_overlaps = overlaps[gt_argmax_overlaps, np.arange(overlaps.shape[1])]
            # Every anchor that ties for the best overlap of some gt box.
            gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]

            if not config.TRAIN.RPN_CLOBBER_POSITIVES:
                # assign bg labels first so that positive labels can clobber them
                labels[max_overlaps < config.TRAIN.RPN_NEGATIVE_OVERLAP] = 0

            # fg label: for each gt, anchor with highest overlap
            if config.TRAIN.RPN_FORCE_POSITIVE:
                labels[gt_argmax_overlaps] = 1

            # fg label: above threshold IoU
            labels[max_overlaps >= config.TRAIN.RPN_POSITIVE_OVERLAP] = 1

            if config.TRAIN.RPN_CLOBBER_POSITIVES:
                # assign bg labels last so that negative labels can clobber positives
                labels[max_overlaps < config.TRAIN.RPN_NEGATIVE_OVERLAP] = 0
        else:
            labels[:] = 0

        # Random subsampling only happens here when OHEM is disabled.
        if config.TRAIN.RPN_ENABLE_OHEM == 0:
            # subsample positive labels if we have too many
            fg_inds = np.where(labels == 1)[0]
            num_fg = int(config.TRAIN.RPN_FG_FRACTION * config.TRAIN.RPN_BATCH_SIZE)
            if len(fg_inds) > num_fg:
                disable_inds = npr.choice(fg_inds, size=(len(fg_inds) - num_fg), replace=False)
                if debug:
                    disable_inds = fg_inds[:(len(fg_inds) - num_fg)]
                labels[disable_inds] = -1

            # subsample negative labels if we have too many
            num_bg = config.TRAIN.RPN_BATCH_SIZE - np.sum(labels == 1)
            bg_inds = np.where(labels == 0)[0]
            if len(bg_inds) > num_bg:
                disable_inds = npr.choice(bg_inds, size=(len(bg_inds) - num_bg), replace=False)
                if debug:
                    disable_inds = bg_inds[:(len(bg_inds) - num_bg)]
                labels[disable_inds] = -1

        bbox_targets = np.zeros((num_inside, bbox_pred_len), dtype=np.float32)
        if gt_boxes.size > 0:
            bbox_targets[:, :] = bbox_transform(anchors, gt_boxes[argmax_overlaps, :])

        bbox_weights = np.zeros((num_inside, bbox_pred_len), dtype=np.float32)
        bbox_weights[labels == 1, 0:4] = 1.0
        if bbox_pred_len > 4:
            bbox_weights[labels == 1, 4:bbox_pred_len] = 0.1

        if landmark:
            landmark_targets = np.zeros((num_inside, landmark_pred_len), dtype=np.float32)
            landmark_weights = np.zeros((num_inside, landmark_pred_len), dtype=np.float32)
            # landmark_pred_len is 10 or 15 by construction above; with 15
            # every third component gets the reduced weight 0.1.
            if landmark_pred_len == 10:
                landmark_weights[labels == 1, :] = 1.0
            else:
                landmark_weights[labels == 1, :] = np.array([1.0, 1.0, 0.1] * 5)
            if gt_landmarks.size > 0:
                a_landmarks = gt_landmarks[argmax_overlaps, :, :]
                landmark_targets[:] = landmark_transform(anchors, a_landmarks)
                # A negative third component on the first landmark switches
                # the landmark loss off for that anchor.
                invalid = np.where(a_landmarks[:, 0, 2] < 0.0)[0]
                landmark_weights[invalid, :] = 0.0

        # map up to original set of anchors
        labels = AA._unmap(labels, total_anchors, inds_inside, fill=-1)
        bbox_targets = AA._unmap(bbox_targets, total_anchors, inds_inside, fill=0)
        bbox_weights = AA._unmap(bbox_weights, total_anchors, inds_inside, fill=0)
        if landmark:
            landmark_targets = AA._unmap(landmark_targets, total_anchors, inds_inside, fill=0)
            landmark_weights = AA._unmap(landmark_weights, total_anchors, inds_inside, fill=0)

        # Split the flat arrays back into per-stride blocks and reshape them.
        label = {}
        label_list = []
        bbox_target_list = []
        bbox_weight_list = []
        landmark_target_list = []
        landmark_weight_list = []
        start = 0
        for i, stride in enumerate(feat_strides):
            feat_height, feat_width = feat_infos[i]
            A = A_list[i]
            end = start + anchors_num_list[i]

            _label = labels[start:end]
            if select_stride > 0 and stride != select_stride:
                _label[:] = -1
            bbox_target = bbox_targets[start:end]
            bbox_weight = bbox_weights[start:end]

            _label = _label.reshape((1, feat_height, feat_width, A)).transpose(0, 3, 1, 2)
            _label = _label.reshape((1, A * feat_height * feat_width))
            bbox_target = bbox_target.reshape((1, feat_height * feat_width, A * bbox_pred_len)).transpose(0, 2, 1)
            bbox_weight = bbox_weight.reshape((1, feat_height * feat_width, A * bbox_pred_len)).transpose((0, 2, 1))
            label['%s_label_stride%d' % (prefix, stride)] = _label
            label['%s_bbox_target_stride%d' % (prefix, stride)] = bbox_target
            label['%s_bbox_weight_stride%d' % (prefix, stride)] = bbox_weight
            label_list.append(_label)
            bbox_target_list.append(bbox_target)
            bbox_weight_list.append(bbox_weight)

            if landmark:
                landmark_target = landmark_targets[start:end]
                landmark_weight = landmark_weights[start:end]
                landmark_target = landmark_target.reshape((1, feat_height * feat_width, A * landmark_pred_len)).transpose(0, 2, 1)
                landmark_weight = landmark_weight.reshape((1, feat_height * feat_width, A * landmark_pred_len)).transpose((0, 2, 1))
                label['%s_landmark_target_stride%d' % (prefix, stride)] = landmark_target
                label['%s_landmark_weight_stride%d' % (prefix, stride)] = landmark_weight
                landmark_target_list.append(landmark_target)
                landmark_weight_list.append(landmark_weight)

            start = end

        label.update({'%s_label' % prefix: np.concatenate(label_list, axis=1),
                      '%s_bbox_target' % prefix: np.concatenate(bbox_target_list, axis=2),
                      '%s_bbox_weight' % prefix: np.concatenate(bbox_weight_list, axis=2)})
        if landmark:
            label['%s_landmark_target' % prefix] = np.concatenate(landmark_target_list, axis=2)
            label['%s_landmark_weight' % prefix] = np.concatenate(landmark_weight_list, axis=2)
        return label
b/RetinaFace/rcnn/pycocotools/_mask.c similarity index 100% rename from retinaface/rcnn/pycocotools/_mask.c rename to RetinaFace/rcnn/pycocotools/_mask.c diff --git a/retinaface/rcnn/pycocotools/_mask.pyx b/RetinaFace/rcnn/pycocotools/_mask.pyx similarity index 100% rename from retinaface/rcnn/pycocotools/_mask.pyx rename to RetinaFace/rcnn/pycocotools/_mask.pyx diff --git a/retinaface/rcnn/pycocotools/coco.py b/RetinaFace/rcnn/pycocotools/coco.py similarity index 100% rename from retinaface/rcnn/pycocotools/coco.py rename to RetinaFace/rcnn/pycocotools/coco.py diff --git a/retinaface/rcnn/pycocotools/cocoeval.py b/RetinaFace/rcnn/pycocotools/cocoeval.py similarity index 100% rename from retinaface/rcnn/pycocotools/cocoeval.py rename to RetinaFace/rcnn/pycocotools/cocoeval.py diff --git a/retinaface/rcnn/pycocotools/mask.py b/RetinaFace/rcnn/pycocotools/mask.py similarity index 100% rename from retinaface/rcnn/pycocotools/mask.py rename to RetinaFace/rcnn/pycocotools/mask.py diff --git a/retinaface/rcnn/pycocotools/maskApi.c b/RetinaFace/rcnn/pycocotools/maskApi.c similarity index 100% rename from retinaface/rcnn/pycocotools/maskApi.c rename to RetinaFace/rcnn/pycocotools/maskApi.c diff --git a/retinaface/rcnn/pycocotools/maskApi.h b/RetinaFace/rcnn/pycocotools/maskApi.h similarity index 100% rename from retinaface/rcnn/pycocotools/maskApi.h rename to RetinaFace/rcnn/pycocotools/maskApi.h diff --git a/retinaface/rcnn/pycocotools/setup.py b/RetinaFace/rcnn/pycocotools/setup.py similarity index 100% rename from retinaface/rcnn/pycocotools/setup.py rename to RetinaFace/rcnn/pycocotools/setup.py diff --git a/RetinaFace/rcnn/symbol/__init__.py b/RetinaFace/rcnn/symbol/__init__.py new file mode 100644 index 0000000..1ed3e17 --- /dev/null +++ b/RetinaFace/rcnn/symbol/__init__.py @@ -0,0 +1,3 @@ +from .symbol_ssh import * +from .symbol_mnet import * +from .symbol_resnet import * diff --git a/RetinaFace/rcnn/symbol/pyramidbox.py b/RetinaFace/rcnn/symbol/pyramidbox.py new 
def conv_bn(input, filter, ksize, stride, padding, act='relu', bias_attr=False):
    """Convolution without activation followed by batch norm.

    ``act`` is applied by the batch-norm layer, not by the convolution.
    """
    conv = fluid.layers.conv2d(
        input=input,
        filter_size=ksize,
        num_filters=filter,
        stride=stride,
        padding=padding,
        act=None,
        bias_attr=bias_attr)
    return fluid.layers.batch_norm(input=conv, act=act)


def conv_block(input, groups, filters, ksizes, strides=None, with_pool=True):
    """Stack ``groups`` ReLU convolutions, optionally followed by 2x2 max-pool.

    ``filters`` and ``ksizes`` must each have ``groups`` entries; ``strides``
    defaults to 1 for every layer.  Padding is ``(ksize - 1) // 2``.

    Returns ``(conv, pool)`` when ``with_pool`` is true, otherwise ``conv``.
    """
    assert len(filters) == groups
    assert len(ksizes) == groups
    strides = [1] * groups if strides is None else strides
    w_attr = ParamAttr(learning_rate=1., initializer=Xavier())
    b_attr = ParamAttr(learning_rate=2., regularizer=L2Decay(0.))
    conv = input
    for i in range(groups):
        conv = fluid.layers.conv2d(
            input=conv,
            num_filters=filters[i],
            filter_size=ksizes[i],
            stride=strides[i],
            padding=(ksizes[i] - 1) // 2,
            param_attr=w_attr,
            bias_attr=b_attr,
            act='relu')
    if not with_pool:
        return conv
    pool = fluid.layers.pool2d(
        input=conv,
        pool_size=2,
        pool_type='max',
        pool_stride=2,
        ceil_mode=True)
    return conv, pool


class PyramidBox(object):
    """Face detector graph built on a VGG backbone (Paddle Fluid).

    With ``sub_network=True`` the low-level FPN, the context module and the
    face/head prediction heads are added (``_pyramidbox``); otherwise a plain
    SSD-style head is built on the backbone (``_vgg_ssd``).

    NOTE: layers are created in a fixed order inside every method; keep that
    order when editing, since Fluid derives default parameter names from it.
    """

    def __init__(self,
                 data_shape,
                 num_classes=None,
                 use_transposed_conv2d=True,
                 is_infer=False,
                 sub_network=False):
        """Build the whole graph in the current default program.

        :param data_shape: shape passed to the ``image`` data layer.
        :param num_classes: stored on the instance; not read by this class.
        :param use_transposed_conv2d: upsample in the low-level FPN with a
            transposed convolution (true) or bilinear resize (false).
        :param is_infer: skip the label inputs and crop upsampled maps.
        :param sub_network: build the full PyramidBox heads instead of the
            plain VGG-SSD head.
        """
        self.data_shape = data_shape
        self.min_sizes = [16., 32., 64., 128., 256., 512.]
        self.steps = [4., 8., 16., 32., 64., 128.]
        self.num_classes = num_classes
        self.use_transposed_conv2d = use_transposed_conv2d
        self.is_infer = is_infer
        self.sub_network = sub_network

        # the base network is VGG with atrous layers
        self._input()
        self._vgg()
        if sub_network:
            self._low_level_fpn()
            self._cpm_module()
            self._pyramidbox()
        else:
            self._vgg_ssd()

    def feeds(self):
        """Return the input variables to feed: image only at inference,
        image plus face boxes, head boxes and labels otherwise."""
        if self.is_infer:
            return [self.image]
        return [self.image, self.face_box, self.head_box, self.gt_label]

    def _input(self):
        """Declare the data layers (label inputs only when training)."""
        self.image = fluid.layers.data(
            name='image', shape=self.data_shape, dtype='float32')
        if not self.is_infer:
            self.face_box = fluid.layers.data(
                name='face_box', shape=[4], dtype='float32', lod_level=1)
            self.head_box = fluid.layers.data(
                name='head_box', shape=[4], dtype='float32', lod_level=1)
            self.gt_label = fluid.layers.data(
                name='gt_label', shape=[1], dtype='int32', lod_level=1)

    def _vgg(self):
        """Build the backbone feature maps conv1 .. conv8."""
        self.conv1, self.pool1 = conv_block(self.image, 2, [64] * 2, [3] * 2)
        self.conv2, self.pool2 = conv_block(self.pool1, 2, [128] * 2, [3] * 2)

        # priorbox min_size is 16
        self.conv3, self.pool3 = conv_block(self.pool2, 3, [256] * 3, [3] * 3)
        # priorbox min_size is 32
        self.conv4, self.pool4 = conv_block(self.pool3, 3, [512] * 3, [3] * 3)
        # priorbox min_size is 64
        self.conv5, self.pool5 = conv_block(self.pool4, 3, [512] * 3, [3] * 3)

        # fc6 and fc7 in paper, priorbox min_size is 128
        self.conv6 = conv_block(
            self.pool5, 2, [1024, 1024], [3, 1], with_pool=False)
        # conv6_1 and conv6_2 in paper, priorbox min_size is 256
        self.conv7 = conv_block(
            self.conv6, 2, [256, 512], [1, 3], [1, 2], with_pool=False)
        # conv7_1 and conv7_2 in paper, priorbox min_size is 512
        self.conv8 = conv_block(
            self.conv7, 2, [128, 256], [1, 3], [1, 2], with_pool=False)

    def _low_level_fpn(self):
        """
        Low-level feature pyramid network.
        """

        def fpn(up_from, up_to):
            # Project ``up_from`` to the channel count of ``up_to``, upsample
            # it 2x and fuse the two maps by element-wise multiplication.
            ch = up_to.shape[1]
            b_attr = ParamAttr(learning_rate=2., regularizer=L2Decay(0.))
            conv1 = fluid.layers.conv2d(
                up_from, ch, 1, act='relu', bias_attr=b_attr)
            if self.use_transposed_conv2d:
                # learning_rate=0 keeps the bilinear kernel fixed.
                w_attr = ParamAttr(
                    learning_rate=0.,
                    regularizer=L2Decay(0.),
                    initializer=Bilinear())
                upsampling = fluid.layers.conv2d_transpose(
                    conv1,
                    ch,
                    output_size=None,
                    filter_size=4,
                    padding=1,
                    stride=2,
                    groups=ch,
                    param_attr=w_attr,
                    bias_attr=False,
                    use_cudnn=True)
            else:
                upsampling = fluid.layers.resize_bilinear(
                    conv1, out_shape=up_to.shape[2:])

            conv2 = fluid.layers.conv2d(
                up_to, ch, 1, act='relu', bias_attr=b_attr)
            if self.is_infer:
                upsampling = fluid.layers.crop(upsampling, shape=conv2)
            # eltwise mul
            conv_fuse = upsampling * conv2
            return conv_fuse

        self.lfpn2_on_conv5 = fpn(self.conv6, self.conv5)
        self.lfpn1_on_conv4 = fpn(self.lfpn2_on_conv5, self.conv4)
        self.lfpn0_on_conv3 = fpn(self.lfpn1_on_conv4, self.conv3)

    def _cpm_module(self):
        """
        Context-sensitive Prediction Module
        """

        def cpm(feat):
            # residual
            branch1 = conv_bn(feat, 1024, 1, 1, 0, None)
            branch2a = conv_bn(feat, 256, 1, 1, 0, act='relu')
            branch2b = conv_bn(branch2a, 256, 3, 1, 1, act='relu')
            branch2c = conv_bn(branch2b, 1024, 1, 1, 0, None)
            residual_sum = branch1 + branch2c
            rescomb = fluid.layers.relu(x=residual_sum)

            # ssh
            b_attr = ParamAttr(learning_rate=2., regularizer=L2Decay(0.))
            ssh_1 = fluid.layers.conv2d(rescomb, 256, 3, 1, 1, bias_attr=b_attr)
            ssh_dimred = fluid.layers.conv2d(
                rescomb, 128, 3, 1, 1, act='relu', bias_attr=b_attr)
            ssh_2 = fluid.layers.conv2d(
                ssh_dimred, 128, 3, 1, 1, bias_attr=b_attr)
            ssh_3a = fluid.layers.conv2d(
                ssh_dimred, 128, 3, 1, 1, act='relu', bias_attr=b_attr)
            ssh_3b = fluid.layers.conv2d(ssh_3a, 128, 3, 1, 1, bias_attr=b_attr)

            ssh_concat = fluid.layers.concat([ssh_1, ssh_2, ssh_3b], axis=1)
            ssh_out = fluid.layers.relu(x=ssh_concat)
            return ssh_out

        self.ssh_conv3 = cpm(self.lfpn0_on_conv3)
        self.ssh_conv4 = cpm(self.lfpn1_on_conv4)
        self.ssh_conv5 = cpm(self.lfpn2_on_conv5)
        self.ssh_conv6 = cpm(self.conv6)
        self.ssh_conv7 = cpm(self.conv7)
        self.ssh_conv8 = cpm(self.conv8)

    def _l2_norm_scale(self, input, init_scale=1.0, channel_shared=False):
        """L2-normalize ``input`` along the channel axis and multiply by a
        learnable scale (one per channel, or a single one when
        ``channel_shared``) initialised to ``init_scale``."""
        from paddle.fluid.layer_helper import LayerHelper
        helper = LayerHelper("Scale")
        l2_norm = fluid.layers.l2_normalize(
            input, axis=1)  # l2 norm along channel
        shape = [1] if channel_shared else [input.shape[1]]
        scale = helper.create_parameter(
            attr=helper.param_attr,
            shape=shape,
            dtype=input.dtype,
            default_initializer=Constant(init_scale))
        out = fluid.layers.elementwise_mul(
            x=l2_norm, y=scale, axis=-1 if channel_shared else 1)
        return out

    @staticmethod
    def _permute_and_reshape(feat, last_dim):
        """Move channels last and flatten to ``[batch, -1, last_dim]``.

        Shared by ``_pyramidbox`` and ``_vgg_ssd`` (previously duplicated as
        a nested function in each).
        """
        trans = fluid.layers.transpose(feat, perm=[0, 2, 3, 1])
        compile_shape = [
            trans.shape[0], np.prod(trans.shape[1:]) // last_dim, last_dim
        ]
        run_shape = fluid.layers.assign(
            np.array([0, -1, last_dim]).astype("int32"))
        return fluid.layers.reshape(
            trans, shape=compile_shape, actual_shape=run_shape)

    def _pyramidbox(self):
        """
        Get prior-boxes and pyramid-box
        """
        self.ssh_conv3_norm = self._l2_norm_scale(
            self.ssh_conv3, init_scale=10.)
        self.ssh_conv4_norm = self._l2_norm_scale(self.ssh_conv4, init_scale=8.)
        self.ssh_conv5_norm = self._l2_norm_scale(self.ssh_conv5, init_scale=5.)

        face_locs, face_confs = [], []
        head_locs, head_confs = [], []
        boxes, variances = [], []
        inputs = [
            self.ssh_conv3_norm, self.ssh_conv4_norm, self.ssh_conv5_norm,
            self.ssh_conv6, self.ssh_conv7, self.ssh_conv8
        ]
        b_attr = ParamAttr(learning_rate=2., regularizer=L2Decay(0.))
        for i, feat in enumerate(inputs):
            # 8 channels: 4 box offsets for the face, 4 for the head.
            mbox_loc = fluid.layers.conv2d(feat, 8, 3, 1, 1, bias_attr=b_attr)
            face_loc, head_loc = fluid.layers.split(
                mbox_loc, num_or_sections=2, dim=1)
            face_loc = self._permute_and_reshape(face_loc, 4)
            head_loc = self._permute_and_reshape(head_loc, 4)

            # 6 channels: 1 + 3 for the face (the 3 are max-reduced to one),
            # 2 for the head.
            mbox_conf = fluid.layers.conv2d(feat, 6, 3, 1, 1, bias_attr=b_attr)
            face_conf1, face_conf3, head_conf = fluid.layers.split(
                mbox_conf, num_or_sections=[1, 3, 2], dim=1)
            face_conf3_maxin = fluid.layers.reduce_max(
                face_conf3, dim=1, keep_dim=True)
            face_conf = fluid.layers.concat(
                [face_conf1, face_conf3_maxin], axis=1)

            face_conf = self._permute_and_reshape(face_conf, 2)
            head_conf = self._permute_and_reshape(head_conf, 2)

            face_locs.append(face_loc)
            face_confs.append(face_conf)

            head_locs.append(head_loc)
            head_confs.append(head_conf)

            box, var = fluid.layers.prior_box(
                feat,
                self.image,
                min_sizes=[self.min_sizes[i]],
                steps=[self.steps[i]] * 2,
                aspect_ratios=[1.],
                clip=False,
                flip=True,
                offset=0.5)
            box = fluid.layers.reshape(box, shape=[-1, 4])
            var = fluid.layers.reshape(var, shape=[-1, 4])

            boxes.append(box)
            variances.append(var)

        self.face_mbox_loc = fluid.layers.concat(face_locs, axis=1)
        self.face_mbox_conf = fluid.layers.concat(face_confs, axis=1)

        self.head_mbox_loc = fluid.layers.concat(head_locs, axis=1)
        self.head_mbox_conf = fluid.layers.concat(head_confs, axis=1)

        self.prior_boxes = fluid.layers.concat(boxes)
        self.box_vars = fluid.layers.concat(variances)

    def _vgg_ssd(self):
        """Build the plain SSD-style face head directly on the backbone.

        Only the face outputs are created here; ``head_mbox_*`` do not exist
        on instances built with ``sub_network=False``.
        """
        self.conv3_norm = self._l2_norm_scale(self.conv3, init_scale=10.)
        self.conv4_norm = self._l2_norm_scale(self.conv4, init_scale=8.)
        self.conv5_norm = self._l2_norm_scale(self.conv5, init_scale=5.)

        locs, confs = [], []
        boxes, variances = [], []
        b_attr = ParamAttr(learning_rate=2., regularizer=L2Decay(0.))

        # conv3: 4 confidence channels, the last 3 are max-reduced to one
        mbox_loc = fluid.layers.conv2d(
            self.conv3_norm, 4, 3, 1, 1, bias_attr=b_attr)
        loc = self._permute_and_reshape(mbox_loc, 4)
        mbox_conf = fluid.layers.conv2d(
            self.conv3_norm, 4, 3, 1, 1, bias_attr=b_attr)
        conf1, conf3 = fluid.layers.split(
            mbox_conf, num_or_sections=[1, 3], dim=1)
        conf3_maxin = fluid.layers.reduce_max(conf3, dim=1, keep_dim=True)
        conf = fluid.layers.concat([conf1, conf3_maxin], axis=1)
        conf = self._permute_and_reshape(conf, 2)
        box, var = fluid.layers.prior_box(
            self.conv3_norm,
            self.image,
            min_sizes=[16.],
            steps=[4, 4],
            aspect_ratios=[1.],
            clip=False,
            flip=True,
            offset=0.5)
        box = fluid.layers.reshape(box, shape=[-1, 4])
        var = fluid.layers.reshape(var, shape=[-1, 4])

        locs.append(loc)
        confs.append(conf)
        boxes.append(box)
        variances.append(var)

        min_sizes = [32., 64., 128., 256., 512.]
        steps = [8., 16., 32., 64., 128.]
        inputs = [
            self.conv4_norm, self.conv5_norm, self.conv6, self.conv7, self.conv8
        ]
        for i, feat in enumerate(inputs):
            mbox_loc = fluid.layers.conv2d(feat, 4, 3, 1, 1, bias_attr=b_attr)
            loc = self._permute_and_reshape(mbox_loc, 4)

            mbox_conf = fluid.layers.conv2d(feat, 2, 3, 1, 1, bias_attr=b_attr)
            conf = self._permute_and_reshape(mbox_conf, 2)
            box, var = fluid.layers.prior_box(
                feat,
                self.image,
                min_sizes=[min_sizes[i]],
                steps=[steps[i]] * 2,
                aspect_ratios=[1.],
                clip=False,
                flip=True,
                offset=0.5)
            box = fluid.layers.reshape(box, shape=[-1, 4])
            var = fluid.layers.reshape(var, shape=[-1, 4])

            locs.append(loc)
            confs.append(conf)
            boxes.append(box)
            variances.append(var)

        self.face_mbox_loc = fluid.layers.concat(locs, axis=1)
        self.face_mbox_conf = fluid.layers.concat(confs, axis=1)
        self.prior_boxes = fluid.layers.concat(boxes)
        self.box_vars = fluid.layers.concat(variances)

    def vgg_ssd_loss(self):
        """Return the summed SSD loss of the face head."""
        loss = fluid.layers.ssd_loss(
            self.face_mbox_loc,
            self.face_mbox_conf,
            self.face_box,
            self.gt_label,
            self.prior_boxes,
            self.box_vars,
            overlap_threshold=0.35,
            neg_overlap=0.35)
        loss = fluid.layers.reduce_sum(loss)
        return loss

    def train(self):
        """Return ``(face_loss, head_loss, total_loss)``, each reduced to a
        sum and marked persistable.

        Reads ``head_mbox_loc`` / ``head_mbox_conf``, which only exist when
        the instance was built with ``sub_network=True``.
        """
        face_loss = fluid.layers.ssd_loss(
            self.face_mbox_loc,
            self.face_mbox_conf,
            self.face_box,
            self.gt_label,
            self.prior_boxes,
            self.box_vars,
            overlap_threshold=0.35,
            neg_overlap=0.35)
        face_loss.persistable = True
        head_loss = fluid.layers.ssd_loss(
            self.head_mbox_loc,
            self.head_mbox_conf,
            self.head_box,
            self.gt_label,
            self.prior_boxes,
            self.box_vars,
            overlap_threshold=0.35,
            neg_overlap=0.35)
        head_loss.persistable = True
        face_loss = fluid.layers.reduce_sum(face_loss)
        face_loss.persistable = True
        head_loss = fluid.layers.reduce_sum(head_loss)
        head_loss.persistable = True
        total_loss = face_loss + head_loss
        total_loss.persistable = True
        return face_loss, head_loss, total_loss

    def infer(self, main_program=None):
        """Clone ``main_program`` (or the default main program) for testing,
        append the detection-output layer for the face head and return
        ``(test_program, face_nmsed_out)``."""
        if main_program is None:
            test_program = fluid.default_main_program().clone(for_test=True)
        else:
            test_program = main_program.clone(for_test=True)
        with fluid.program_guard(test_program):
            face_nmsed_out = fluid.layers.detection_output(
                self.face_mbox_loc,
                self.face_mbox_conf,
                self.prior_boxes,
                self.box_vars,
                nms_threshold=0.3,
                nms_top_k=5000,
                keep_top_k=750,
                score_threshold=0.01)
        return test_program, face_nmsed_out
def conv_only(from_layer, name, num_filter, kernel=(1,1), pad=(0,0), \
    stride=(1,1), bias_wd_mult=0.0, shared_weight=None, shared_bias = None):
    """Plain convolution symbol (no batch norm, no activation).

    When ``shared_weight`` is given, it and ``shared_bias`` are reused as the
    layer parameters; otherwise fresh ``<name>_weight`` / ``<name>_bias``
    variables are created.
    """
    if shared_weight is not None:
        w_sym = shared_weight
        b_sym = shared_bias
        print('reuse shared var in', name)
    else:
        w_sym = mx.symbol.Variable(
            name="{}_weight".format(name),
            init=mx.init.Normal(0.01),
            attr={'__lr_mult__': '1.0'})
        b_sym = mx.symbol.Variable(
            name="{}_bias".format(name),
            init=mx.init.Constant(0.0),
            attr={'__lr_mult__': '2.0', '__wd_mult__': str(bias_wd_mult)})
    return mx.symbol.Convolution(
        data=from_layer, kernel=kernel, pad=pad, stride=stride,
        num_filter=num_filter, name="{}".format(name),
        weight=w_sym, bias=b_sym)


def conv_deformable(net, num_filter, num_group=1, act_type='relu',name=''):
    """3x3 deformable convolution followed by batch norm and, when
    ``act_type`` is non-empty, an activation.

    ``config.USE_DCN == 1`` selects ``DeformableConvolution``; any other
    value selects ``ModulatedDeformableConvolution`` with zero-initialised
    offset/mask parameters.
    """
    if config.USE_DCN != 1:
        print('use dcnv2 at', name)
        lr_mult = 0.1
        offset_w = mx.sym.Variable(name=name+'_conv2_offset_weight', init=mx.init.Zero(), lr_mult=lr_mult)
        offset_b = mx.sym.Variable(name=name+'_conv2_offset_bias', init=mx.init.Zero(), lr_mult=lr_mult)
        # 27 channels: the first 18 are offsets, the remaining 9 the mask.
        offset_and_mask = mx.symbol.Convolution(
            name=name + '_conv2_offset', data=net, num_filter=27,
            pad=(1, 1), kernel=(3, 3), stride=(1,1),
            weight=offset_w, bias=offset_b, lr_mult=lr_mult)
        offsets = mx.sym.slice_axis(offset_and_mask, axis=1, begin=0, end=18)
        mask = mx.sym.slice_axis(offset_and_mask, axis=1, begin=18, end=None)
        mask = 2 * mx.sym.Activation(mask, act_type='sigmoid')
        net = mx.contrib.symbol.ModulatedDeformableConvolution(
            name=name + '_conv2', data=net, offset=offsets, mask=mask,
            num_filter=num_filter, pad=(1, 1), kernel=(3, 3), stride=(1,1),
            num_deformable_group=num_group, no_bias=True)
    else:
        offsets = mx.symbol.Convolution(
            name=name+'_conv_offset', data=net, num_filter=num_group*18,
            pad=(1, 1), kernel=(3, 3), stride=(1, 1))
        net = mx.contrib.symbol.DeformableConvolution(
            name=name+"_conv", data=net, offset=offsets,
            num_filter=num_filter, pad=(1,1), kernel=(3, 3),
            num_deformable_group=num_group, stride=(1, 1), no_bias=False)

    net = mx.sym.BatchNorm(data=net, fix_gamma=False, eps=2e-5, momentum=0.9, name=name + '_bn')
    if len(act_type) == 0:
        return net
    return mx.symbol.Activation(data=net, act_type=act_type, name=name+'_act')


def conv_act_layer_dw(from_layer, name, num_filter, kernel=(1,1), pad=(0,0), \
    stride=(1,1), act_type="relu", bias_wd_mult=0.0):
    """Grouped convolution (``num_group == num_filter``) + batch norm +
    optional activation.  Only kernels whose first dimension is 3 are
    accepted."""
    assert kernel[0]==3
    w_sym = mx.symbol.Variable(
        name="{}_weight".format(name),
        init=mx.init.Normal(0.01),
        attr={'__lr_mult__': '1.0'})
    b_sym = mx.symbol.Variable(
        name="{}_bias".format(name),
        init=mx.init.Constant(0.0),
        attr={'__lr_mult__': '2.0', '__wd_mult__': str(bias_wd_mult)})
    out = mx.symbol.Convolution(
        data=from_layer, kernel=kernel, pad=pad, stride=stride,
        num_filter=num_filter, num_group=num_filter,
        name="{}".format(name), weight=w_sym, bias=b_sym)
    out = mx.sym.BatchNorm(data=out, fix_gamma=False, eps=2e-5, momentum=0.9, name=name + '_bn')
    if len(act_type) == 0:
        return out
    return mx.symbol.Activation(
        data=out, act_type=act_type, name="{}_{}".format(name, act_type))
conv_act_layer(from_layer, name, num_filter, kernel=(1,1), pad=(0,0), \ + stride=(1,1), act_type="relu", bias_wd_mult=0.0, separable=False, filter_in = -1): + + if config.USE_DCN>1 and kernel==(3,3) and pad==(1,1) and stride==(1,1) and not separable: + return conv_deformable(from_layer, num_filter, num_group=1, act_type = act_type, name=name) + + if separable: + assert kernel[0]>1 + assert filter_in>0 + if not separable: + weight = mx.symbol.Variable(name="{}_weight".format(name), + init=mx.init.Normal(0.01), attr={'__lr_mult__': '1.0'}) + bias = mx.symbol.Variable(name="{}_bias".format(name), + init=mx.init.Constant(0.0), attr={'__lr_mult__': '2.0', '__wd_mult__': str(bias_wd_mult)}) + conv = mx.symbol.Convolution(data=from_layer, kernel=kernel, pad=pad, \ + stride=stride, num_filter=num_filter, name="{}".format(name), weight=weight, bias=bias) + conv = mx.sym.BatchNorm(data=conv, fix_gamma=False, eps=2e-5, momentum=0.9, name=name + '_bn') + else: + if filter_in<0: + filter_in = num_filter + conv = mx.symbol.Convolution(data=from_layer, kernel=kernel, pad=pad, \ + stride=stride, num_filter=filter_in, num_group=filter_in, name="{}_sep".format(name)) + conv = mx.sym.BatchNorm(data=conv, fix_gamma=False, eps=2e-5, momentum=0.9, name=name + '_sep_bn') + conv = mx.symbol.Activation(data=conv, act_type='relu', \ + name="{}_sep_bn_relu".format(name)) + conv = mx.symbol.Convolution(data=conv, kernel=(1,1), pad=(0,0), \ + stride=(1,1), num_filter=num_filter, name="{}".format(name)) + conv = mx.sym.BatchNorm(data=conv, fix_gamma=False, eps=2e-5, momentum=0.9, name=name + '_bn') + if len(act_type)>0: + relu = mx.symbol.Activation(data=conv, act_type=act_type, \ + name="{}_{}".format(name, act_type)) + else: + relu = conv + return relu + +def ssh_context_module(body, num_filter, filter_in, name): + conv_dimred = conv_act_layer(body, name+'_conv1', + num_filter, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', separable=False, filter_in = filter_in) + conv5x5 = 
conv_act_layer(conv_dimred, name+'_conv2', + num_filter, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='', separable=False) + conv7x7_1 = conv_act_layer(conv_dimred, name+'_conv3_1', + num_filter, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', separable=False) + conv7x7 = conv_act_layer(conv7x7_1, name+'_conv3_2', + num_filter, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='', separable=False) + return (conv5x5, conv7x7) + + +def ssh_detection_module(body, num_filter, filter_in, name): + assert num_filter%4==0 + conv3x3 = conv_act_layer(body, name+'_conv1', + num_filter//2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='', separable=False, filter_in=filter_in) + #_filter = max(num_filter//4, 16) + _filter = num_filter//4 + conv5x5, conv7x7 = ssh_context_module(body, _filter, filter_in, name+'_context') + ret = mx.sym.concat(*[conv3x3, conv5x5, conv7x7], dim=1, name = name+'_concat') + ret = mx.symbol.Activation(data=ret, act_type='relu', name=name+'_concat_relu') + out_filter = num_filter//2+_filter*2 + if config.USE_DCN>0: + ret = conv_deformable(ret, num_filter = out_filter, name = name+'_concat_dcn') + return ret + +#def retina_context_module(body, kernel, num_filter, filter_in, name): +# conv_dimred = conv_act_layer(body, name+'_conv0', +# num_filter, kernel=(1,1), pad=(0,0), stride=(1, 1), act_type='relu', separable=False, filter_in = filter_in) +# conv1 = conv_act_layer(conv_dimred, name+'_conv1', +# num_filter*6, kernel=(1,1), pad=(0,0), stride=(1, 1), act_type='relu', separable=False, filter_in = filter_in) +# conv2 = conv_act_layer(conv1, name+'_conv2', +# num_filter*6, kernel=kernel, pad=((kernel[0]-1)//2, (kernel[1]-1)//2), stride=(1, 1), act_type='relu', separable=True, filter_in = num_filter*6) +# conv3 = conv_act_layer(conv2, name+'_conv3', +# num_filter, kernel=(1,1), pad=(0,0), stride=(1, 1), act_type='relu', separable=False) +# conv3 = conv3 + conv_dimred +# return conv3 + +def retina_detection_module(body, num_filter, 
filter_in, name): + assert num_filter%4==0 + conv1 = conv_act_layer(body, name+'_conv1', + num_filter//2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', separable=False, filter_in=filter_in) + conv2 = conv_act_layer(conv1, name+'_conv2', + num_filter//2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', separable=False, filter_in=num_filter//2) + conv3 = conv_act_layer(conv2, name+'_conv3', + num_filter//2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', separable=False, filter_in=num_filter//2) + conv4 = conv2 + conv3 + body = mx.sym.concat(*[conv1, conv4], dim=1, name = name+'_concat') + if config.USE_DCN>0: + body = conv_deformable(body, num_filter = num_filter, name = name+'_concat_dcn') + return body + + +def head_module(body, num_filter, filter_in, name): + if config.HEAD_MODULE=='SSH': + return ssh_detection_module(body, num_filter, filter_in, name) + else: + return retina_detection_module(body, num_filter, filter_in, name) + + +def upsampling(data, num_filter, name): + #ret = mx.symbol.Deconvolution(data=data, num_filter=num_filter, kernel=(4,4), stride=(2, 2), pad=(1,1), + # num_group = num_filter, no_bias = True, attr={'__lr_mult__': '0.0', '__wd_mult__': '0.0'}, + # name=name) + #ret = mx.symbol.Deconvolution(data=data, num_filter=num_filter, kernel=(2,2), stride=(2, 2), pad=(0,0), + # num_group = num_filter, no_bias = True, attr={'__lr_mult__': '0.0', '__wd_mult__': '0.0'}, + # name=name) + ret = mx.symbol.UpSampling(data, scale=2, sample_type='nearest', workspace=512, name=name, num_args=1) + return ret + +def get_sym_conv(data, sym): + all_layers = sym.get_internals() + + isize = 640 + _, out_shape, _ = all_layers.infer_shape(data = (1,3,isize,isize)) + last_entry = None + c1 = None + c2 = None + c3 = None + c1_name = None + c2_name = None + c3_name = None + c1_filter = -1 + c2_filter = -1 + c3_filter = -1 + #print(len(all_layers), len(out_shape)) + #print(all_layers.__class__) + outputs = all_layers.list_outputs() + 
#print(outputs.__class__, len(outputs)) + count = len(outputs) + stride2name = {} + stride2layer = {} + stride2shape = {} + for i in range(count): + name = outputs[i] + shape = out_shape[i] + if not name.endswith('_output'): + continue + if len(shape)!=4: + continue + assert isize%shape[2]==0 + if shape[1]>config.max_feat_channel: + break + stride = isize//shape[2] + stride2name[stride] = name + stride2layer[stride] = all_layers[name] + stride2shape[stride] = shape + #print(name, shape) + #if c1 is None and shape[2]==isize//16: + # cname = last_entry[0] + # #print('c1', last_entry) + # c1 = all_layers[cname] + # c1_name = cname + #if c2 is None and shape[2]==isize//32: + # cname = last_entry[0] + # #print('c2', last_entry) + # c2 = all_layers[cname] + # c2_name = cname + #if shape[2]==isize//32: + # c3 = all_layers[name] + # #print('c3', name, shape) + # c3_name = name + + #last_entry = (name, shape) + + F1 = config.HEAD_FILTER_NUM + F2 = F1 + if config.SHARE_WEIGHT_BBOX or config.SHARE_WEIGHT_LANDMARK: + F2 = F1 + strides = sorted(stride2name.keys()) + for stride in strides: + print('stride', stride, stride2name[stride], stride2shape[stride]) + print('F1_F2', F1, F2) + #print('cnames', c1_name, c2_name, c3_name, F1, F2) + _bwm = 1.0 + c0 = stride2layer[4] + c1 = stride2layer[8] + c2 = stride2layer[16] + c3 = stride2layer[32] + c3 = conv_act_layer(c3, 'rf_c3_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + #c3_up = mx.symbol.UpSampling(c3, scale=2, sample_type='nearest', workspace=512, name='ssh_c3_up', num_args=1) + c3_up = upsampling(c3, F2, 'rf_c3_upsampling') + c2_lateral = conv_act_layer(c2, 'rf_c2_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + if config.USE_CROP: + c3_up = mx.symbol.Crop(*[c3_up, c2_lateral]) + c2 = c2_lateral+c3_up + c2 = conv_act_layer(c2, 'rf_c2_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + 
c1_lateral = conv_act_layer(c1, 'rf_c1_red_conv', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + #c2_up = mx.symbol.UpSampling(c2, scale=2, sample_type='nearest', workspace=512, name='ssh_m2_red_up', num_args=1) + c2_up = upsampling(c2, F2, 'rf_c2_upsampling') + #conv4_128 = mx.symbol.Crop(*[conv4_128, conv5_128_up]) + if config.USE_CROP: + c2_up = mx.symbol.Crop(*[c2_up, c1_lateral]) + c1 = c1_lateral+c2_up + c1 = conv_act_layer(c1, 'rf_c1_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + m1 = head_module(c1, F2*config.CONTEXT_FILTER_RATIO, F2, 'rf_c1_det') + m2 = head_module(c2, F1*config.CONTEXT_FILTER_RATIO, F2, 'rf_c2_det') + m3 = head_module(c3, F1*config.CONTEXT_FILTER_RATIO, F2, 'rf_c3_det') + if len(config.RPN_ANCHOR_CFG)==3: + ret = {8: m1, 16:m2, 32: m3} + elif len(config.RPN_ANCHOR_CFG)==1: + ret = {16:m2} + elif len(config.RPN_ANCHOR_CFG)==2: + ret = {8: m1, 16:m2} + elif len(config.RPN_ANCHOR_CFG)==4: + c0_lateral = conv_act_layer(c0, 'rf_c0_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c1_up = upsampling(c1, F2, 'rf_c1_upsampling') + if config.USE_CROP: + c1_up = mx.symbol.Crop(*[c1_up, c0_lateral]) + c0 = c0_lateral+c1_up + c0 = conv_act_layer(c0, 'rf_c0_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + + m0 = head_module(c0, F2*config.CONTEXT_FILTER_RATIO, F2, 'rf_c0_det') + ret = {4: m0, 8: m1, 16:m2, 32: m3} + elif len(config.RPN_ANCHOR_CFG)==5: + c0_lateral = conv_act_layer(c0, 'rf_c0_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c1_up = upsampling(c1, F2, 'rf_c1_upsampling') + if config.USE_CROP: + c1_up = mx.symbol.Crop(*[c1_up, c0_lateral]) + c0 = c0_lateral+c1_up + c0 = conv_act_layer(c0, 'rf_c0_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + + c4 = conv_act_layer(c3, 'rf_c4', 
+ F2, kernel=(3, 3), pad=(1, 1), stride=(2, 2), act_type='relu', bias_wd_mult=_bwm) + m0 = head_module(c0, F2*config.CONTEXT_FILTER_RATIO, F2, 'rf_c0_det') + m4 = head_module(c4, F1*config.CONTEXT_FILTER_RATIO, F2, 'rf_c4_det') + ret = {4: m0, 8: m1, 16:m2, 32: m3, 64: m4} + elif len(config.RPN_ANCHOR_CFG)==6: + c0_lateral = conv_act_layer(c0, 'rf_c0_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c1_up = upsampling(c1, F2, 'rf_c1_upsampling') + if config.USE_CROP: + c1_up = mx.symbol.Crop(*[c1_up, c0_lateral]) + c0 = c0_lateral+c1_up + c0 = conv_act_layer(c0, 'rf_c0_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + + c4 = conv_act_layer(c3, 'rf_c4', + F2, kernel=(3, 3), pad=(1, 1), stride=(2, 2), act_type='relu', bias_wd_mult=_bwm) + c5 = conv_act_layer(c4, 'rf_c5', + F2, kernel=(3, 3), pad=(1, 1), stride=(2, 2), act_type='relu', bias_wd_mult=_bwm) + m0 = head_module(c0, F2*config.CONTEXT_FILTER_RATIO, F2, 'rf_c0_det') + m4 = head_module(c4, F1*config.CONTEXT_FILTER_RATIO, F2, 'rf_c4_det') + m5 = head_module(c5, F1*config.CONTEXT_FILTER_RATIO, F2, 'rf_c5_det') + ret = {4: m0, 8: m1, 16:m2, 32: m3, 64: m4, 128: m5} + + #return {8: m1, 16:m2, 32: m3} + return ret + +def get_out(conv_fpn_feat, prefix, stride, landmark=False, lr_mult=1.0, shared_vars = None): + A = config.NUM_ANCHORS + bbox_pred_len = 4 + landmark_pred_len = 10 + if config.USE_BLUR: + bbox_pred_len = 5 + if config.USE_OCCLUSION: + landmark_pred_len = 15 + ret_group = [] + num_anchors = config.RPN_ANCHOR_CFG[str(stride)]['NUM_ANCHORS'] + label = mx.symbol.Variable(name='%s_label_stride%d'%(prefix,stride)) + bbox_target = mx.symbol.Variable(name='%s_bbox_target_stride%d'%(prefix,stride)) + bbox_weight = mx.symbol.Variable(name='%s_bbox_weight_stride%d'%(prefix,stride)) + if landmark: + landmark_target = mx.symbol.Variable(name='%s_landmark_target_stride%d'%(prefix,stride)) + landmark_weight = 
mx.symbol.Variable(name='%s_landmark_weight_stride%d'%(prefix,stride)) + rpn_relu = conv_fpn_feat[stride] + maxout_stat = 0 + if config.USE_MAXOUT>=1 and stride==config.RPN_FEAT_STRIDE[-1]: + maxout_stat = 1 + if config.USE_MAXOUT>=2 and stride!=config.RPN_FEAT_STRIDE[-1]: + maxout_stat = 2 + + + if maxout_stat==0: + rpn_cls_score = conv_only(rpn_relu, '%s_rpn_cls_score_stride%d'%(prefix, stride), 2*num_anchors, + kernel=(1,1), pad=(0,0), stride=(1, 1), shared_weight = shared_vars[0][0], shared_bias = shared_vars[0][1]) + elif maxout_stat==1: + cls_list = [] + for a in range(num_anchors): + rpn_cls_score_bg = conv_only(rpn_relu, '%s_rpn_cls_score_stride%d_anchor%d_bg'%(prefix,stride,a), 3, + kernel=(1,1), pad=(0,0), stride=(1, 1)) + rpn_cls_score_bg = mx.sym.max(rpn_cls_score_bg, axis=1, keepdims=True) + cls_list.append(rpn_cls_score_bg) + rpn_cls_score_fg = conv_only(rpn_relu, '%s_rpn_cls_score_stride%d_anchor%d_fg'%(prefix,stride,a), 1, + kernel=(1,1), pad=(0,0), stride=(1, 1)) + cls_list.append(rpn_cls_score_fg) + rpn_cls_score = mx.sym.concat(*cls_list, dim=1, name='%s_rpn_cls_score_stride%d'%(prefix,stride)) + else: + cls_list = [] + for a in range(num_anchors): + rpn_cls_score_bg = conv_only(rpn_relu, '%s_rpn_cls_score_stride%d_anchor%d_bg'%(prefix,stride,a), 1, + kernel=(1,1), pad=(0,0), stride=(1, 1)) + cls_list.append(rpn_cls_score_bg) + rpn_cls_score_fg = conv_only(rpn_relu, '%s_rpn_cls_score_stride%d_anchor%d_fg'%(prefix,stride,a), 3, + kernel=(1,1), pad=(0,0), stride=(1, 1)) + rpn_cls_score_fg = mx.sym.max(rpn_cls_score_fg, axis=1, keepdims=True) + cls_list.append(rpn_cls_score_fg) + rpn_cls_score = mx.sym.concat(*cls_list, dim=1, name='%s_rpn_cls_score_stride%d'%(prefix,stride)) + + rpn_bbox_pred = conv_only(rpn_relu, '%s_rpn_bbox_pred_stride%d'%(prefix,stride), bbox_pred_len*num_anchors, + kernel=(1,1), pad=(0,0), stride=(1, 1), shared_weight = shared_vars[1][0], shared_bias = shared_vars[1][1]) + + # prepare rpn data + rpn_cls_score_reshape = 
mx.symbol.Reshape(data=rpn_cls_score, + shape=(0, 2, -1), + name="%s_rpn_cls_score_reshape_stride%s" % (prefix,stride)) + + rpn_bbox_pred_reshape = mx.symbol.Reshape(data=rpn_bbox_pred, + shape=(0, 0, -1), + name="%s_rpn_bbox_pred_reshape_stride%s" % (prefix,stride)) + if landmark: + rpn_landmark_pred = conv_only(rpn_relu, '%s_rpn_landmark_pred_stride%d'%(prefix,stride), landmark_pred_len*num_anchors, + kernel=(1,1), pad=(0,0), stride=(1, 1), shared_weight = shared_vars[2][0], shared_bias = shared_vars[2][1]) + rpn_landmark_pred_reshape = mx.symbol.Reshape(data=rpn_landmark_pred, + shape=(0, 0, -1), + name="%s_rpn_landmark_pred_reshape_stride%s" % (prefix,stride)) + + if config.TRAIN.RPN_ENABLE_OHEM>=2: + label, anchor_weight, valid_count = mx.sym.Custom(op_type='rpn_fpn_ohem3', stride=int(stride), network=config.network, dataset=config.dataset, prefix=prefix, cls_score=rpn_cls_score_reshape, labels = label) + + _bbox_weight = mx.sym.tile(anchor_weight, (1,1,bbox_pred_len)) + _bbox_weight = _bbox_weight.reshape((0, -1, A * bbox_pred_len)).transpose((0,2,1)) + bbox_weight = mx.sym.elemwise_mul(bbox_weight, _bbox_weight, name='%s_bbox_weight_mul_stride%s'%(prefix,stride)) + + if landmark: + _landmark_weight = mx.sym.tile(anchor_weight, (1,1,landmark_pred_len)) + _landmark_weight = _landmark_weight.reshape((0, -1, A * landmark_pred_len)).transpose((0,2,1)) + landmark_weight = mx.sym.elemwise_mul(landmark_weight, _landmark_weight, name='%s_landmark_weight_mul_stride%s'%(prefix,stride)) + #if not config.FACE_LANDMARK: + # label, bbox_weight = mx.sym.Custom(op_type='rpn_fpn_ohem', stride=int(stride), cls_score=rpn_cls_score_reshape, bbox_weight = bbox_weight , labels = label) + #else: + # label, bbox_weight, landmark_weight = mx.sym.Custom(op_type='rpn_fpn_ohem2', stride=int(stride), cls_score=rpn_cls_score_reshape, bbox_weight = bbox_weight, landmark_weight=landmark_weight, labels = label) + #cls loss + rpn_cls_prob = mx.symbol.SoftmaxOutput(data=rpn_cls_score_reshape, 
+ label=label, + multi_output=True, + normalization='valid', use_ignore=True, ignore_label=-1, + grad_scale = lr_mult, + name='%s_rpn_cls_prob_stride%d'%(prefix,stride)) + ret_group.append(rpn_cls_prob) + ret_group.append(mx.sym.BlockGrad(label)) + + valid_count = mx.symbol.sum(valid_count) + valid_count = valid_count + 0.001 #avoid zero + + #bbox loss + bbox_diff = rpn_bbox_pred_reshape-bbox_target + bbox_diff = bbox_diff * bbox_weight + rpn_bbox_loss_ = mx.symbol.smooth_l1(name='%s_rpn_bbox_loss_stride%d_'%(prefix,stride), scalar=3.0, data=bbox_diff) + if config.LR_MODE==0: + rpn_bbox_loss = mx.sym.MakeLoss(name='%s_rpn_bbox_loss_stride%d'%(prefix,stride), data=rpn_bbox_loss_, grad_scale=1.0*lr_mult / (config.TRAIN.RPN_BATCH_SIZE)) + else: + rpn_bbox_loss_ = mx.symbol.broadcast_div(rpn_bbox_loss_, valid_count) + rpn_bbox_loss = mx.sym.MakeLoss(name='%s_rpn_bbox_loss_stride%d'%(prefix,stride), data=rpn_bbox_loss_, grad_scale=0.25*lr_mult) + ret_group.append(rpn_bbox_loss) + ret_group.append(mx.sym.BlockGrad(bbox_weight)) + + #landmark loss + if landmark: + landmark_diff = rpn_landmark_pred_reshape-landmark_target + landmark_diff = landmark_diff * landmark_weight + rpn_landmark_loss_ = mx.symbol.smooth_l1(name='%s_rpn_landmark_loss_stride%d_'%(prefix,stride), scalar=3.0, data=landmark_diff) + if config.LR_MODE==0: + rpn_landmark_loss = mx.sym.MakeLoss(name='%s_rpn_landmark_loss_stride%d'%(prefix,stride), data=rpn_landmark_loss_, grad_scale=0.4*config.LANDMARK_LR_MULT*lr_mult / (config.TRAIN.RPN_BATCH_SIZE)) + else: + rpn_landmark_loss_ = mx.symbol.broadcast_div(rpn_landmark_loss_, valid_count) + rpn_landmark_loss = mx.sym.MakeLoss(name='%s_rpn_landmark_loss_stride%d'%(prefix,stride), data=rpn_landmark_loss_, grad_scale=0.1*config.LANDMARK_LR_MULT*lr_mult) + ret_group.append(rpn_landmark_loss) + ret_group.append(mx.sym.BlockGrad(landmark_weight)) + if config.USE_3D: + from rcnn.PY_OP import rpn_3d_mesh + pass + return ret_group + +def get_sym_train(sym): + data = 
mx.symbol.Variable(name="data") + + # shared convolutional layers + conv_fpn_feat = get_sym_conv(data, sym) + ret_group = [] + shared_vars = [] + if config.SHARE_WEIGHT_BBOX: + assert config.USE_MAXOUT==0 + _name = 'face_rpn_cls_score_share' + shared_weight = mx.symbol.Variable(name="{}_weight".format(_name), + init=mx.init.Normal(0.01), attr={'__lr_mult__': '1.0'}) + shared_bias = mx.symbol.Variable(name="{}_bias".format(_name), + init=mx.init.Constant(0.0), attr={'__lr_mult__': '2.0', '__wd_mult__': str(0.0)}) + shared_vars.append( [shared_weight, shared_bias] ) + _name = 'face_rpn_bbox_pred_share' + shared_weight = mx.symbol.Variable(name="{}_weight".format(_name), + init=mx.init.Normal(0.01), attr={'__lr_mult__': '1.0'}) + shared_bias = mx.symbol.Variable(name="{}_bias".format(_name), + init=mx.init.Constant(0.0), attr={'__lr_mult__': '2.0', '__wd_mult__': str(0.0)}) + shared_vars.append( [shared_weight, shared_bias] ) + else: + shared_vars.append( [None, None] ) + shared_vars.append( [None, None] ) + if config.SHARE_WEIGHT_LANDMARK: + _name = 'face_rpn_landmark_pred_share' + shared_weight = mx.symbol.Variable(name="{}_weight".format(_name), + init=mx.init.Normal(0.01), attr={'__lr_mult__': '1.0'}) + shared_bias = mx.symbol.Variable(name="{}_bias".format(_name), + init=mx.init.Constant(0.0), attr={'__lr_mult__': '2.0', '__wd_mult__': str(0.0)}) + shared_vars.append( [shared_weight, shared_bias] ) + else: + shared_vars.append( [None, None] ) + + for stride in config.RPN_FEAT_STRIDE: + ret = get_out(conv_fpn_feat, 'face', stride, config.FACE_LANDMARK, lr_mult=1.0, shared_vars = shared_vars) + ret_group += ret + if config.HEAD_BOX: + assert not config.SHARE_WEIGHT_BBOX and not config.SHARE_WEIGHT_LANDMARK + shared_vars = [ [None, None], [None, None], [None, None] ] + ret = get_out(conv_fpn_feat, 'head', stride, False, lr_mult=0.5, shared_vars = shared_vars) + ret_group += ret + + return mx.sym.Group(ret_group) + + diff --git 
a/RetinaFace/rcnn/symbol/symbol_common.py.bak b/RetinaFace/rcnn/symbol/symbol_common.py.bak new file mode 100644 index 0000000..30f36d5 --- /dev/null +++ b/RetinaFace/rcnn/symbol/symbol_common.py.bak @@ -0,0 +1,649 @@ +import mxnet as mx +import mxnet.ndarray as nd +import mxnet.gluon as gluon +import mxnet.gluon.nn as nn +import mxnet.autograd as ag +import numpy as np +from rcnn.config import config +from rcnn.PY_OP import rpn_fpn_ohem, rpn_fpn_ohem2, rpn_fpn_ohem3 + +def conv_only(from_layer, name, num_filter, kernel=(1,1), pad=(0,0), \ + stride=(1,1), bias_wd_mult=0.0, shared_weight=None, shared_bias = None): + if shared_weight is None: + weight = mx.symbol.Variable(name="{}_weight".format(name), + init=mx.init.Normal(0.01), attr={'__lr_mult__': '1.0'}) + bias = mx.symbol.Variable(name="{}_bias".format(name), + init=mx.init.Constant(0.0), attr={'__lr_mult__': '2.0', '__wd_mult__': str(bias_wd_mult)}) + else: + weight = shared_weight + bias = shared_bias + print('reuse shared var in', name) + conv = mx.symbol.Convolution(data=from_layer, kernel=kernel, pad=pad, \ + stride=stride, num_filter=num_filter, name="{}".format(name), weight = weight, bias=bias) + return conv + +def conv_deformable(net, num_filter, num_group=1, act_type='relu',name=''): + if config.USE_DCN==1: + f = num_group*18 + conv_offset = mx.symbol.Convolution(name=name+'_conv_offset', data = net, + num_filter=f, pad=(1, 1), kernel=(3, 3), stride=(1, 1)) + net = mx.contrib.symbol.DeformableConvolution(name=name+"_conv", data=net, offset=conv_offset, + num_filter=num_filter, pad=(1,1), kernel=(3, 3), num_deformable_group=num_group, stride=(1, 1), no_bias=False) + else: + print('use dcnv2 at', name) + lr_mult = 0.1 + weight_var = mx.sym.Variable(name=name+'_conv2_offset_weight', init=mx.init.Zero(), lr_mult=lr_mult) + bias_var = mx.sym.Variable(name=name+'_conv2_offset_bias', init=mx.init.Zero(), lr_mult=lr_mult) + conv2_offset = mx.symbol.Convolution(name=name + '_conv2_offset', data=net, 
num_filter=27, + pad=(1, 1), kernel=(3, 3), stride=(1,1), weight=weight_var, bias=bias_var, lr_mult=lr_mult) + conv2_offset_t = mx.sym.slice_axis(conv2_offset, axis=1, begin=0, end=18) + conv2_mask = mx.sym.slice_axis(conv2_offset, axis=1, begin=18, end=None) + conv2_mask = 2 * mx.sym.Activation(conv2_mask, act_type='sigmoid') + + conv2 = mx.contrib.symbol.ModulatedDeformableConvolution(name=name + '_conv2', data=net, offset=conv2_offset_t, mask=conv2_mask, + num_filter=num_filter, pad=(1, 1), kernel=(3, 3), stride=(1,1), + num_deformable_group=num_group, no_bias=True) + net = conv2 + net = mx.sym.BatchNorm(data=net, fix_gamma=False, eps=2e-5, momentum=0.9, name=name + '_bn') + if len(act_type)>0: + net = mx.symbol.Activation(data=net, act_type=act_type, name=name+'_act') + return net + +def conv_act_layer_dw(from_layer, name, num_filter, kernel=(1,1), pad=(0,0), \ + stride=(1,1), act_type="relu", bias_wd_mult=0.0): + assert kernel[0]==3 + weight = mx.symbol.Variable(name="{}_weight".format(name), + init=mx.init.Normal(0.01), attr={'__lr_mult__': '1.0'}) + bias = mx.symbol.Variable(name="{}_bias".format(name), + init=mx.init.Constant(0.0), attr={'__lr_mult__': '2.0', '__wd_mult__': str(bias_wd_mult)}) + conv = mx.symbol.Convolution(data=from_layer, kernel=kernel, pad=pad, \ + stride=stride, num_filter=num_filter, num_group=num_filter, name="{}".format(name), weight=weight, bias=bias) + conv = mx.sym.BatchNorm(data=conv, fix_gamma=False, eps=2e-5, momentum=0.9, name=name + '_bn') + if len(act_type)>0: + relu = mx.symbol.Activation(data=conv, act_type=act_type, \ + name="{}_{}".format(name, act_type)) + else: + relu = conv + return relu + +def conv_act_layer(from_layer, name, num_filter, kernel=(1,1), pad=(0,0), \ + stride=(1,1), act_type="relu", bias_wd_mult=0.0, separable=False, filter_in = -1): + + if config.USE_DCN>1 and kernel==(3,3) and pad==(1,1) and stride==(1,1) and not separable: + return conv_deformable(from_layer, num_filter, num_group=1, act_type = 
act_type, name=name) + + if separable: + assert kernel[0]>1 + assert filter_in>0 + if not separable: + weight = mx.symbol.Variable(name="{}_weight".format(name), + init=mx.init.Normal(0.01), attr={'__lr_mult__': '1.0'}) + bias = mx.symbol.Variable(name="{}_bias".format(name), + init=mx.init.Constant(0.0), attr={'__lr_mult__': '2.0', '__wd_mult__': str(bias_wd_mult)}) + conv = mx.symbol.Convolution(data=from_layer, kernel=kernel, pad=pad, \ + stride=stride, num_filter=num_filter, name="{}".format(name), weight=weight, bias=bias) + conv = mx.sym.BatchNorm(data=conv, fix_gamma=False, eps=2e-5, momentum=0.9, name=name + '_bn') + else: + if filter_in<0: + filter_in = num_filter + conv = mx.symbol.Convolution(data=from_layer, kernel=kernel, pad=pad, \ + stride=stride, num_filter=filter_in, num_group=filter_in, name="{}_sep".format(name)) + conv = mx.sym.BatchNorm(data=conv, fix_gamma=False, eps=2e-5, momentum=0.9, name=name + '_sep_bn') + conv = mx.symbol.Activation(data=conv, act_type='relu', \ + name="{}_sep_bn_relu".format(name)) + conv = mx.symbol.Convolution(data=conv, kernel=(1,1), pad=(0,0), \ + stride=(1,1), num_filter=num_filter, name="{}".format(name)) + conv = mx.sym.BatchNorm(data=conv, fix_gamma=False, eps=2e-5, momentum=0.9, name=name + '_bn') + if len(act_type)>0: + relu = mx.symbol.Activation(data=conv, act_type=act_type, \ + name="{}_{}".format(name, act_type)) + else: + relu = conv + return relu + +def ssh_context_module(body, num_filter, filter_in, name): + conv_dimred = conv_act_layer(body, name+'_conv1', + num_filter, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', separable=False, filter_in = filter_in) + conv5x5 = conv_act_layer(conv_dimred, name+'_conv2', + num_filter, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='', separable=False) + conv7x7_1 = conv_act_layer(conv_dimred, name+'_conv3_1', + num_filter, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', separable=False) + conv7x7 = conv_act_layer(conv7x7_1, 
name+'_conv3_2', + num_filter, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='', separable=False) + return (conv5x5, conv7x7) + + +def ssh_detection_module(body, num_filter, filter_in, name): + conv3x3 = conv_act_layer(body, name+'_conv1', + num_filter, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='', separable=False, filter_in=filter_in) + conv5x5, conv7x7 = ssh_context_module(body, num_filter//2, filter_in, name+'_context') + ret = mx.sym.concat(*[conv3x3, conv5x5, conv7x7], dim=1, name = name+'_concat') + ret = mx.symbol.Activation(data=ret, act_type='relu', name=name+'_concat_relu') + if config.USE_DCN>0: + ret = conv_deformable(ret, num_filter = num_filter*2, name = name+'_concat_dcn') + return ret + +def insight_context_module(body, kernel, num_filter, filter_in, name): + conv_dimred = conv_act_layer(body, name+'_conv0', + num_filter, kernel=(1,1), pad=(0,0), stride=(1, 1), act_type='relu', separable=False, filter_in = filter_in) + conv1 = conv_act_layer(conv_dimred, name+'_conv1', + num_filter*6, kernel=(1,1), pad=(0,0), stride=(1, 1), act_type='relu', separable=False, filter_in = filter_in) + conv2 = conv_act_layer(conv1, name+'_conv2', + num_filter*6, kernel=kernel, pad=((kernel[0]-1)//2, (kernel[1]-1)//2), stride=(1, 1), act_type='relu', separable=True, filter_in = num_filter*6) + conv3 = conv_act_layer(conv2, name+'_conv3', + num_filter, kernel=(1,1), pad=(0,0), stride=(1, 1), act_type='relu', separable=False) + conv3 = conv3 + conv_dimred + return conv3 + +def insight_detection_module(body, num_filter, filter_in, name): + conv3x3 = insight_context_module(body, (3,3), num_filter//2, filter_in, name+'_context3x3') + conv5x5 = insight_context_module(body, (5,5), num_filter//2, filter_in, name+'_context5x5') + ret = mx.sym.concat(*[conv3x3, conv5x5], dim=1, name = name+'_concat') + if config.USE_DCN: + ret = conv_deformable(ret, num_filter = num_filter*2, name = name+'_concat_dcn') + return ret + +def upsampling(data, num_filter, name): + #ret = 
mx.symbol.Deconvolution(data=data, num_filter=num_filter, kernel=(4,4), stride=(2, 2), pad=(1,1), + # num_group = num_filter, no_bias = True, attr={'__lr_mult__': '0.0', '__wd_mult__': '0.0'}, + # name=name) + #ret = mx.symbol.Deconvolution(data=data, num_filter=num_filter, kernel=(2,2), stride=(2, 2), pad=(0,0), + # num_group = num_filter, no_bias = True, attr={'__lr_mult__': '0.0', '__wd_mult__': '0.0'}, + # name=name) + ret = mx.symbol.UpSampling(data, scale=2, sample_type='nearest', workspace=512, name=name, num_args=1) + return ret + +def get_sym_conv(data, sym): + mm = config.MULTIPLIER + all_layers = sym.get_internals() + #print(all_layers) + ##c1 = all_layers['mobilenetv20_features_linearbottleneck6_relu60_relu6_output'] #96 + #c1 = all_layers['mobilenetv20_features_linearbottleneck5_elemwise_add0_output'] # 16 + ##c2 = all_layers['mobilenetv20_features_linearbottleneck13_relu60_relu6_output'] + #c2 = all_layers['mobilenetv20_features_linearbottleneck12_elemwise_add0_output'] # 48 + ##c3 = all_layers['mobilenetv20_features_linearbottleneck16_batchnorm2_fwd_output'] # 160 + #c3 = all_layers['mobilenetv20_features_linearbottleneck13_batchnorm2_fwd_output'] # 80 + #c1_filter = int(32*mm) + #c2_filter = int(96*mm) + #c3_filter = int(160*mm) + + #c1 = all_layers['mobilenet0_relu10_fwd_output'] + #c2 = all_layers['mobilenet0_relu22_fwd_output'] + #c3 = all_layers['mobilenet0_relu26_fwd_output'] + + #c1 = all_layers['conv_6_relu_output'] + #c2 = all_layers['conv_12_relu_output'] + #c3 = all_layers['conv_14_relu_output'] + #c1_filter = int(256*mm) + #c2_filter = int(512*mm) + #c3_filter = int(1024*mm) + + isize = 640 + _, out_shape, _ = all_layers.infer_shape(data = (1,3,isize,isize)) + last_entry = None + c1 = None + c2 = None + c3 = None + c1_name = None + c2_name = None + c3_name = None + c1_filter = -1 + c2_filter = -1 + c3_filter = -1 + #print(len(all_layers), len(out_shape)) + #print(all_layers.__class__) + outputs = all_layers.list_outputs() + 
#print(outputs.__class__, len(outputs)) + count = len(outputs) + stride2name = {} + stride2layer = {} + for i in range(count): + name = outputs[i] + shape = out_shape[i] + if not name.endswith('_output'): + continue + if len(shape)!=4: + continue + assert isize%shape[2]==0 + stride = isize//shape[2] + stride2name[stride] = name + stride2layer[stride] = all_layers[name] + #print(name, shape) + #if c1 is None and shape[2]==isize//16: + # cname = last_entry[0] + # #print('c1', last_entry) + # c1 = all_layers[cname] + # c1_name = cname + #if c2 is None and shape[2]==isize//32: + # cname = last_entry[0] + # #print('c2', last_entry) + # c2 = all_layers[cname] + # c2_name = cname + #if shape[2]==isize//32: + # c3 = all_layers[name] + # #print('c3', name, shape) + # c3_name = name + + #last_entry = (name, shape) + + #F1 = int(256*mm) + #F2 = int(128*mm) + F1 = int(config.HEAD_FILTER_NUM*mm) + F2 = F1 + if config.SHARE_WEIGHT_BBOX or config.SHARE_WEIGHT_LANDMARK: + F2 = F1 + print('stride2name', stride2name, F1, F2) + #print('cnames', c1_name, c2_name, c3_name, F1, F2) + _bwm = 1.0 + if config.NET_MODE==0: + c1_lateral = conv_act_layer(c1, 'rf_c1_red_conv', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c2_lateral = conv_act_layer(c2, 'rf_c2_red_conv', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + #conv5_128_up = mx.symbol.Deconvolution(data=conv5_128, num_filter=F2, kernel=(4,4), stride=(2, 2), pad=(1,1), + # num_group = F2, no_bias = True, attr={'__lr_mult__': '0.0', '__wd_mult__': '0.0'}, + # name='ssh_m2_red_upsampling') + #c2_up = mx.symbol.UpSampling(c2_lateral, scale=2, sample_type='nearest', workspace=512, name='ssh_m2_red_up', num_args=1) + c2_up = upsampling(c2_lateral, F2, 'rf_c2_red_upsampling') + #conv4_128 = mx.symbol.Crop(*[conv4_128, conv5_128_up]) + c2_up = mx.symbol.Crop(*[c2_up, c1_lateral]) + + c1 = c1_lateral+c2_up + + c1 = conv_act_layer(c1, 'rf_c1_conv', + F2, kernel=(3, 
3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + m1 = ssh_detection_module(c1, F2, F2, 'rf_c1_det') + m2 = ssh_detection_module(c2, F1, c2_filter, 'rf_c2_det') + m3 = ssh_detection_module(c3, F1, c3_filter, 'rf_c3_det') + elif config.NET_MODE==1: + c3_lateral = conv_act_layer(c3, 'ssh_c3_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + #c3_up = mx.symbol.UpSampling(c3_lateral, scale=2, sample_type='nearest', workspace=512, name='ssh_c3_up', num_args=1) + c3_up = upsampling(c3_lateral, F2, 'ssh_c3_upsampling') + c2_lateral = conv_act_layer(c2, 'ssh_c2_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c3_up = mx.symbol.Crop(*[c3_up, c2_lateral]) + c2 = c2_lateral+c3_up + c2 = conv_act_layer(c2, 'ssh_c2_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c1_lateral = conv_act_layer(c1, 'ssh_m1_red_conv', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + #c2_up = mx.symbol.UpSampling(c2, scale=2, sample_type='nearest', workspace=512, name='ssh_m2_red_up', num_args=1) + c2_up = upsampling(c2, F2, 'ssh_c2_upsampling') + #conv4_128 = mx.symbol.Crop(*[conv4_128, conv5_128_up]) + c2_up = mx.symbol.Crop(*[c2_up, c1_lateral]) + + c1 = c1_lateral+c2_up + + c1 = conv_act_layer(c1, 'ssh_m1_conv', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + m1 = ssh_detection_module(c1, F2, F2, 'ssh_m1_det') + m2 = ssh_detection_module(c2, F1, c2_filter, 'ssh_m2_det') + m3 = ssh_detection_module(c3, F1, c3_filter, 'ssh_m3_det') + elif config.NET_MODE==2: + c0 = stride2layer[4] + c1 = stride2layer[8] + c2 = stride2layer[16] + c3 = stride2layer[32] + c3 = conv_act_layer(c3, 'rf_c3_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + #c3_up = mx.symbol.UpSampling(c3, scale=2, sample_type='nearest', workspace=512, 
name='ssh_c3_up', num_args=1) + c3_up = upsampling(c3, F2, 'rf_c3_upsampling') + c2_lateral = conv_act_layer(c2, 'rf_c2_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c3_up = mx.symbol.Crop(*[c3_up, c2_lateral]) + c2 = c2_lateral+c3_up + c2 = conv_act_layer(c2, 'rf_c2_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c1_lateral = conv_act_layer(c1, 'rf_c1_red_conv', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + #c2_up = mx.symbol.UpSampling(c2, scale=2, sample_type='nearest', workspace=512, name='ssh_m2_red_up', num_args=1) + c2_up = upsampling(c2, F2, 'rf_c2_upsampling') + #conv4_128 = mx.symbol.Crop(*[conv4_128, conv5_128_up]) + c2_up = mx.symbol.Crop(*[c2_up, c1_lateral]) + c1 = c1_lateral+c2_up + c1 = conv_act_layer(c1, 'rf_c1_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + m1 = ssh_detection_module(c1, F2*config.CONTEXT_FILTER_RATIO//2, F2, 'rf_c1_det') #output *2 filters + m2 = ssh_detection_module(c2, F1*config.CONTEXT_FILTER_RATIO//2, F2, 'rf_c2_det') # output *2 filters + m3 = ssh_detection_module(c3, F1*config.CONTEXT_FILTER_RATIO//2, F2, 'rf_c3_det') + if len(config.RPN_ANCHOR_CFG)==3: + ret = {8: m1, 16:m2, 32: m3} + elif len(config.RPN_ANCHOR_CFG)==1: + ret = {16:m2} + elif len(config.RPN_ANCHOR_CFG)==2: + ret = {8: m1, 16:m2} + elif len(config.RPN_ANCHOR_CFG)==5: + c0_lateral = conv_act_layer(c0, 'rf_c0_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c1_up = upsampling(c1, F2, 'rf_c1_upsampling') + c1_up = mx.symbol.Crop(*[c1_up, c0_lateral]) + c0 = c0_lateral+c1_up + c0 = conv_act_layer(c0, 'rf_c0_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + + c4 = conv_act_layer(c3, 'rf_c4', + F2, kernel=(3, 3), pad=(1, 1), stride=(2, 2), act_type='relu', bias_wd_mult=_bwm) + m0 = 
ssh_detection_module(c0, F2*config.CONTEXT_FILTER_RATIO//2, F2, 'rf_c0_det') #output *2 filters + m4 = ssh_detection_module(c4, F1*config.CONTEXT_FILTER_RATIO//2, F2, 'rf_c4_det') # output *2 filters + ret = {4: m0, 8: m1, 16:m2, 32: m3, 64: m4} + elif len(config.RPN_ANCHOR_CFG)==6: + c0_lateral = conv_act_layer(c0, 'rf_c0_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c1_up = upsampling(c1, F2, 'rf_c1_upsampling') + c1_up = mx.symbol.Crop(*[c1_up, c0_lateral]) + c0 = c0_lateral+c1_up + c0 = conv_act_layer(c0, 'rf_c0_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + + c4 = conv_act_layer(c3, 'rf_c4', + F2, kernel=(3, 3), pad=(1, 1), stride=(2, 2), act_type='relu', bias_wd_mult=_bwm) + c5 = conv_act_layer(c4, 'rf_c5', + F2, kernel=(3, 3), pad=(1, 1), stride=(2, 2), act_type='relu', bias_wd_mult=_bwm) + m0 = ssh_detection_module(c0, F2*config.CONTEXT_FILTER_RATIO//2, F2, 'rf_c0_det') #output *2 filters + m4 = ssh_detection_module(c4, F1*config.CONTEXT_FILTER_RATIO//2, F2, 'rf_c4_det') # output *2 filters + m5 = ssh_detection_module(c5, F1*config.CONTEXT_FILTER_RATIO//2, F2, 'rf_c5_det') + ret = {4: m0, 8: m1, 16:m2, 32: m3, 64: m4, 128: m5} + + elif config.NET_MODE==3: + assert len(config.RPN_ANCHOR_CFG)==6 + c0 = stride2layer[4] + c1 = stride2layer[8] + c2 = stride2layer[16] + c3 = stride2layer[32] + c3 = conv_act_layer(c3, 'ssh_c3_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c4 = conv_act_layer(c3, 'ssh_c4', + F2, kernel=(3, 3), pad=(1, 1), stride=(2, 2), act_type='relu', bias_wd_mult=_bwm) + c5 = conv_act_layer(c4, 'ssh_c5', + F2, kernel=(3, 3), pad=(1, 1), stride=(2, 2), act_type='relu', bias_wd_mult=_bwm) + c5_up = upsampling(c5, F2, 'ssh_c5_upsampling') + c4_lateral = c4 + c5_up = mx.symbol.Crop(*[c5_up, c4_lateral]) + c4 = c4_lateral+c5_up + c4 = conv_act_layer(c4, 'ssh_c4_aggr', + F2, kernel=(3, 3), pad=(1, 1), 
stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c4_up = upsampling(c4, F2, 'ssh_c4_upsampling') + c4_up = mx.symbol.Crop(*[c4_up, c3]) + c3 = c3+c4_up + c3 = conv_act_layer(c3, 'ssh_c3_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + + #c3_up = mx.symbol.UpSampling(c3, scale=2, sample_type='nearest', workspace=512, name='ssh_c3_up', num_args=1) + c3_up = upsampling(c3, F2, 'ssh_c3_upsampling') + c2_lateral = conv_act_layer(c2, 'ssh_c2_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c3_up = mx.symbol.Crop(*[c3_up, c2_lateral]) + c2 = c2_lateral+c3_up + c2 = conv_act_layer(c2, 'ssh_c2_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c1_lateral = conv_act_layer(c1, 'ssh_m1_red_conv', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + #c2_up = mx.symbol.UpSampling(c2, scale=2, sample_type='nearest', workspace=512, name='ssh_m2_red_up', num_args=1) + c2_up = upsampling(c2, F2, 'ssh_c2_upsampling') + #conv4_128 = mx.symbol.Crop(*[conv4_128, conv5_128_up]) + c2_up = mx.symbol.Crop(*[c2_up, c1_lateral]) + c1 = c1_lateral+c2_up + c1 = conv_act_layer(c1, 'ssh_c1_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + m1 = ssh_detection_module(c1, F2*config.CONTEXT_FILTER_RATIO//2, F2, 'ssh_m1_det') #output *2 filters + m2 = ssh_detection_module(c2, F1*config.CONTEXT_FILTER_RATIO//2, F2, 'ssh_m2_det') # output *2 filters + m3 = ssh_detection_module(c3, F1*config.CONTEXT_FILTER_RATIO//2, F2, 'ssh_m3_det') + c0_lateral = conv_act_layer(c0, 'ssh_c0_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c1_up = upsampling(c1, F2, 'ssh_c1_upsampling') + c1_up = mx.symbol.Crop(*[c1_up, c0_lateral]) + c0 = c0_lateral+c1_up + c0 = conv_act_layer(c0, 'ssh_c0_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', 
bias_wd_mult=_bwm) + + m0 = ssh_detection_module(c0, F2*config.CONTEXT_FILTER_RATIO//2, F2, 'ssh_m0_det') #output *2 filters + m4 = ssh_detection_module(c4, F1*config.CONTEXT_FILTER_RATIO//2, F2, 'ssh_m4_det') # output *2 filters + m5 = ssh_detection_module(c5, F1*config.CONTEXT_FILTER_RATIO//2, F2, 'ssh_m5_det') + ret = {4: m0, 8: m1, 16:m2, 32: m3, 64: m4, 128: m5} + elif config.NET_MODE==4: + c3 = conv_act_layer(c3, 'ssh_c3_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + #c3_up = mx.symbol.UpSampling(c3, scale=2, sample_type='nearest', workspace=512, name='ssh_c3_up', num_args=1) + c3_up = upsampling(c3, F2, 'ssh_c3_upsampling') + c2_lateral = conv_act_layer(c2, 'ssh_c2_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c3_up = mx.symbol.Crop(*[c3_up, c2_lateral]) + c2 = c2_lateral+c3_up + c2 = conv_act_layer(c2, 'ssh_c2_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c1_lateral = conv_act_layer(c1, 'ssh_m1_red_conv', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + #c2_up = mx.symbol.UpSampling(c2, scale=2, sample_type='nearest', workspace=512, name='ssh_m2_red_up', num_args=1) + c2_up = upsampling(c2, F2, 'ssh_c2_upsampling') + #conv4_128 = mx.symbol.Crop(*[conv4_128, conv5_128_up]) + c2_up = mx.symbol.Crop(*[c2_up, c1_lateral]) + c1 = c1_lateral+c2_up + c1 = conv_act_layer(c1, 'ssh_c1_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + + m1 = ssh_detection_module(c1, F2//2, F2, 'ssh_m1_det') + m2 = ssh_detection_module(c2, F1//2, c2_filter, 'ssh_m2_det') + m3 = ssh_detection_module(c3, F1//2, c3_filter, 'ssh_m3_det') + elif config.NET_MODE==5: + c3 = conv_act_layer_dw(c3, 'ssh_c3_lateral_m', + F2, kernel=(3,3), pad=(1,1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c3 = conv_act_layer(c3, 'ssh_c3_lateral', + F2, kernel=(1, 1), pad=(0, 0), 
stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + #c3_up = mx.symbol.UpSampling(c3, scale=2, sample_type='nearest', workspace=512, name='ssh_c3_up', num_args=1) + c3_up = upsampling(c3, F2, 'ssh_c3_upsampling') + c2 = conv_act_layer_dw(c2, 'ssh_c2_lateral_m', + F2, kernel=(3,3), pad=(1,1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c2_lateral = conv_act_layer(c2, 'ssh_c2_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c3_up = mx.symbol.Crop(*[c3_up, c2_lateral]) + c2 = c2_lateral+c3_up + c2 = conv_act_layer(c2, 'ssh_c2_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c1 = conv_act_layer_dw(c1, 'ssh_c1_lateral_m', + F2, kernel=(3,3), pad=(1,1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c1_lateral = conv_act_layer(c1, 'ssh_m1_red_conv', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + #c2_up = mx.symbol.UpSampling(c2, scale=2, sample_type='nearest', workspace=512, name='ssh_m2_red_up', num_args=1) + c2_up = upsampling(c2, F2, 'ssh_c2_upsampling') + #conv4_128 = mx.symbol.Crop(*[conv4_128, conv5_128_up]) + c2_up = mx.symbol.Crop(*[c2_up, c1_lateral]) + c1 = c1_lateral+c2_up + c1 = conv_act_layer(c1, 'ssh_c1_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + + m1 = ssh_detection_module(c1, F2, F2, 'ssh_m1_det') + m2 = ssh_detection_module(c2, F1, c2_filter, 'ssh_m2_det') + m3 = ssh_detection_module(c3, F1, c3_filter, 'ssh_m3_det') + elif config.NET_MODE==6: + c3 = conv_act_layer(c3, 'ssh_c3_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + #c3_up = mx.symbol.UpSampling(c3, scale=2, sample_type='nearest', workspace=512, name='ssh_c3_up', num_args=1) + c3_up = upsampling(c3, F2, 'ssh_c3_upsampling') + c2_lateral = conv_act_layer(c2, 'ssh_c2_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + 
c3_up = mx.symbol.Crop(*[c3_up, c2_lateral]) + c2 = c2_lateral+c3_up + c2 = conv_act_layer(c2, 'ssh_c2_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c1_lateral = conv_act_layer(c1, 'ssh_m1_red_conv', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + #c2_up = mx.symbol.UpSampling(c2, scale=2, sample_type='nearest', workspace=512, name='ssh_m2_red_up', num_args=1) + c2_up = upsampling(c2, F2, 'ssh_c2_upsampling') + #conv4_128 = mx.symbol.Crop(*[conv4_128, conv5_128_up]) + c2_up = mx.symbol.Crop(*[c2_up, c1_lateral]) + c1 = c1_lateral+c2_up + c1 = conv_act_layer(c1, 'ssh_c1_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + + m1 = insight_detection_module(c1, F2, F2, 'ssh_m1_det') + m2 = insight_detection_module(c2, F1, F2, 'ssh_m2_det') + m3 = insight_detection_module(c3, F1, F2, 'ssh_m3_det') + + #return {8: m1, 16:m2, 32: m3} + return ret + +def get_out(conv_fpn_feat, prefix, stride, landmark=False, lr_mult=1.0, shared_vars = None): + A = config.NUM_ANCHORS + bbox_pred_len = 4 + landmark_pred_len = 10 + if config.USE_BLUR: + bbox_pred_len = 5 + if config.USE_OCCLUSION: + landmark_pred_len = 15 + ret_group = [] + num_anchors = config.RPN_ANCHOR_CFG[str(stride)]['NUM_ANCHORS'] + label = mx.symbol.Variable(name='%s_label_stride%d'%(prefix,stride)) + bbox_target = mx.symbol.Variable(name='%s_bbox_target_stride%d'%(prefix,stride)) + bbox_weight = mx.symbol.Variable(name='%s_bbox_weight_stride%d'%(prefix,stride)) + if landmark: + landmark_target = mx.symbol.Variable(name='%s_landmark_target_stride%d'%(prefix,stride)) + landmark_weight = mx.symbol.Variable(name='%s_landmark_weight_stride%d'%(prefix,stride)) + rpn_relu = conv_fpn_feat[stride] + maxout_stat = 0 + if config.USE_MAXOUT>=1 and stride==config.RPN_FEAT_STRIDE[-1]: + maxout_stat = 1 + if config.USE_MAXOUT>=2 and stride!=config.RPN_FEAT_STRIDE[-1]: + maxout_stat = 2 + + + if maxout_stat==0: 
+ rpn_cls_score = conv_only(rpn_relu, '%s_rpn_cls_score_stride%d'%(prefix, stride), 2*num_anchors, + kernel=(1,1), pad=(0,0), stride=(1, 1), shared_weight = shared_vars[0][0], shared_bias = shared_vars[0][1]) + elif maxout_stat==1: + cls_list = [] + for a in range(num_anchors): + rpn_cls_score_bg = conv_only(rpn_relu, '%s_rpn_cls_score_stride%d_anchor%d_bg'%(prefix,stride,a), 3, + kernel=(1,1), pad=(0,0), stride=(1, 1)) + rpn_cls_score_bg = mx.sym.max(rpn_cls_score_bg, axis=1, keepdims=True) + cls_list.append(rpn_cls_score_bg) + rpn_cls_score_fg = conv_only(rpn_relu, '%s_rpn_cls_score_stride%d_anchor%d_fg'%(prefix,stride,a), 1, + kernel=(1,1), pad=(0,0), stride=(1, 1)) + cls_list.append(rpn_cls_score_fg) + rpn_cls_score = mx.sym.concat(*cls_list, dim=1, name='%s_rpn_cls_score_stride%d'%(prefix,stride)) + else: + cls_list = [] + for a in range(num_anchors): + rpn_cls_score_bg = conv_only(rpn_relu, '%s_rpn_cls_score_stride%d_anchor%d_bg'%(prefix,stride,a), 1, + kernel=(1,1), pad=(0,0), stride=(1, 1)) + cls_list.append(rpn_cls_score_bg) + rpn_cls_score_fg = conv_only(rpn_relu, '%s_rpn_cls_score_stride%d_anchor%d_fg'%(prefix,stride,a), 3, + kernel=(1,1), pad=(0,0), stride=(1, 1)) + rpn_cls_score_fg = mx.sym.max(rpn_cls_score_fg, axis=1, keepdims=True) + cls_list.append(rpn_cls_score_fg) + rpn_cls_score = mx.sym.concat(*cls_list, dim=1, name='%s_rpn_cls_score_stride%d'%(prefix,stride)) + + rpn_bbox_pred = conv_only(rpn_relu, '%s_rpn_bbox_pred_stride%d'%(prefix,stride), bbox_pred_len*num_anchors, + kernel=(1,1), pad=(0,0), stride=(1, 1), shared_weight = shared_vars[1][0], shared_bias = shared_vars[1][1]) + + # prepare rpn data + rpn_cls_score_reshape = mx.symbol.Reshape(data=rpn_cls_score, + shape=(0, 2, -1), + name="%s_rpn_cls_score_reshape_stride%s" % (prefix,stride)) + + rpn_bbox_pred_reshape = mx.symbol.Reshape(data=rpn_bbox_pred, + shape=(0, 0, -1), + name="%s_rpn_bbox_pred_reshape_stride%s" % (prefix,stride)) + if landmark: + rpn_landmark_pred = conv_only(rpn_relu, 
'%s_rpn_landmark_pred_stride%d'%(prefix,stride), landmark_pred_len*num_anchors, + kernel=(1,1), pad=(0,0), stride=(1, 1), shared_weight = shared_vars[2][0], shared_bias = shared_vars[2][1]) + rpn_landmark_pred_reshape = mx.symbol.Reshape(data=rpn_landmark_pred, + shape=(0, 0, -1), + name="%s_rpn_landmark_pred_reshape_stride%s" % (prefix,stride)) + + if config.TRAIN.RPN_ENABLE_OHEM>=2: + label, anchor_weight, valid_count = mx.sym.Custom(op_type='rpn_fpn_ohem3', stride=int(stride), network=config.network, dataset=config.dataset, prefix=prefix, cls_score=rpn_cls_score_reshape, labels = label) + + _bbox_weight = mx.sym.tile(anchor_weight, (1,1,bbox_pred_len)) + _bbox_weight = _bbox_weight.reshape((0, -1, A * bbox_pred_len)).transpose((0,2,1)) + bbox_weight = mx.sym.elemwise_mul(bbox_weight, _bbox_weight, name='%s_bbox_weight_mul_stride%s'%(prefix,stride)) + + if landmark: + _landmark_weight = mx.sym.tile(anchor_weight, (1,1,landmark_pred_len)) + _landmark_weight = _landmark_weight.reshape((0, -1, A * landmark_pred_len)).transpose((0,2,1)) + landmark_weight = mx.sym.elemwise_mul(landmark_weight, _landmark_weight, name='%s_landmark_weight_mul_stride%s'%(prefix,stride)) + #if not config.FACE_LANDMARK: + # label, bbox_weight = mx.sym.Custom(op_type='rpn_fpn_ohem', stride=int(stride), cls_score=rpn_cls_score_reshape, bbox_weight = bbox_weight , labels = label) + #else: + # label, bbox_weight, landmark_weight = mx.sym.Custom(op_type='rpn_fpn_ohem2', stride=int(stride), cls_score=rpn_cls_score_reshape, bbox_weight = bbox_weight, landmark_weight=landmark_weight, labels = label) + #cls loss + rpn_cls_prob = mx.symbol.SoftmaxOutput(data=rpn_cls_score_reshape, + label=label, + multi_output=True, + normalization='valid', use_ignore=True, ignore_label=-1, + grad_scale = lr_mult, + name='%s_rpn_cls_prob_stride%d'%(prefix,stride)) + ret_group.append(rpn_cls_prob) + ret_group.append(mx.sym.BlockGrad(label)) + + valid_count = mx.symbol.mean(valid_count) + valid_count = valid_count + 
0.001 #avoid zero + + #bbox loss + bbox_diff = rpn_bbox_pred_reshape-bbox_target + bbox_diff = bbox_diff * bbox_weight + rpn_bbox_loss_ = mx.symbol.smooth_l1(name='%s_rpn_bbox_loss_stride%d_'%(prefix,stride), scalar=3.0, data=bbox_diff) + rpn_bbox_loss = mx.sym.MakeLoss(name='%s_rpn_bbox_loss_stride%d'%(prefix,stride), data=rpn_bbox_loss_, grad_scale=1.0*lr_mult / (config.TRAIN.RPN_BATCH_SIZE)) + #rpn_bbox_loss_ = mx.symbol.broadcast_div(rpn_bbox_loss_, valid_count) + #rpn_bbox_loss = mx.sym.MakeLoss(name='%s_rpn_bbox_loss_stride%d'%(prefix,stride), data=rpn_bbox_loss_, grad_scale=1.0*lr_mult / (config.TRAIN.BATCH_IMAGES*16)) + ret_group.append(rpn_bbox_loss) + ret_group.append(mx.sym.BlockGrad(bbox_weight)) + + #landmark loss + if landmark: + landmark_diff = rpn_landmark_pred_reshape-landmark_target + landmark_diff = landmark_diff * landmark_weight + rpn_landmark_loss_ = mx.symbol.smooth_l1(name='%s_rpn_landmark_loss_stride%d_'%(prefix,stride), scalar=3.0, data=landmark_diff) + rpn_landmark_loss = mx.sym.MakeLoss(name='%s_rpn_landmark_loss_stride%d'%(prefix,stride), data=rpn_landmark_loss_, grad_scale=0.5*lr_mult / (config.TRAIN.RPN_BATCH_SIZE)) + #rpn_landmark_loss_ = mx.symbol.broadcast_div(rpn_landmark_loss_, valid_count) + #rpn_landmark_loss = mx.sym.MakeLoss(name='%s_rpn_landmark_loss_stride%d'%(prefix,stride), data=rpn_landmark_loss_, grad_scale=1.0*lr_mult / (config.TRAIN.BATCH_IMAGES*40)) + ret_group.append(rpn_landmark_loss) + ret_group.append(mx.sym.BlockGrad(landmark_weight)) + return ret_group + +def get_sym_train(sym): + data = mx.symbol.Variable(name="data") + + # shared convolutional layers + conv_fpn_feat = get_sym_conv(data, sym) + ret_group = [] + shared_vars = [] + if config.SHARE_WEIGHT_BBOX: + assert config.USE_MAXOUT==0 + _name = 'face_rpn_cls_score_share' + shared_weight = mx.symbol.Variable(name="{}_weight".format(_name), + init=mx.init.Normal(0.01), attr={'__lr_mult__': '1.0'}) + shared_bias = 
mx.symbol.Variable(name="{}_bias".format(_name), + init=mx.init.Constant(0.0), attr={'__lr_mult__': '2.0', '__wd_mult__': str(0.0)}) + shared_vars.append( [shared_weight, shared_bias] ) + _name = 'face_rpn_bbox_pred_share' + shared_weight = mx.symbol.Variable(name="{}_weight".format(_name), + init=mx.init.Normal(0.01), attr={'__lr_mult__': '1.0'}) + shared_bias = mx.symbol.Variable(name="{}_bias".format(_name), + init=mx.init.Constant(0.0), attr={'__lr_mult__': '2.0', '__wd_mult__': str(0.0)}) + shared_vars.append( [shared_weight, shared_bias] ) + else: + shared_vars.append( [None, None] ) + shared_vars.append( [None, None] ) + if config.SHARE_WEIGHT_LANDMARK: + _name = 'face_rpn_landmark_pred_share' + shared_weight = mx.symbol.Variable(name="{}_weight".format(_name), + init=mx.init.Normal(0.01), attr={'__lr_mult__': '1.0'}) + shared_bias = mx.symbol.Variable(name="{}_bias".format(_name), + init=mx.init.Constant(0.0), attr={'__lr_mult__': '2.0', '__wd_mult__': str(0.0)}) + shared_vars.append( [shared_weight, shared_bias] ) + else: + shared_vars.append( [None, None] ) + + for stride in config.RPN_FEAT_STRIDE: + ret = get_out(conv_fpn_feat, 'face', stride, config.FACE_LANDMARK, lr_mult=1.0, shared_vars = shared_vars) + ret_group += ret + if config.HEAD_BOX: + assert not config.SHARE_WEIGHT_BBOX and not config.SHARE_WEIGHT_LANDMARK + shared_vars = [ [None, None], [None, None], [None, None] ] + ret = get_out(conv_fpn_feat, 'head', stride, False, lr_mult=0.5, shared_vars = shared_vars) + ret_group += ret + + return mx.sym.Group(ret_group) + + diff --git a/RetinaFace/rcnn/symbol/symbol_mnet.py b/RetinaFace/rcnn/symbol/symbol_mnet.py new file mode 100644 index 0000000..ee45168 --- /dev/null +++ b/RetinaFace/rcnn/symbol/symbol_mnet.py @@ -0,0 +1,492 @@ +import mxnet as mx +import mxnet.ndarray as nd +import mxnet.gluon as gluon +import mxnet.gluon.nn as nn +import mxnet.autograd as ag +import numpy as np +from rcnn.config import config +from rcnn.PY_OP import rpn_fpn_ohem3 
+from symbol_common import get_sym_train + + +def conv_only(from_layer, name, num_filter, kernel=(1,1), pad=(0,0), \ + stride=(1,1), bias_wd_mult=0.0, shared_weight=None, shared_bias = None): + if shared_weight is None: + weight = mx.symbol.Variable(name="{}_weight".format(name), + init=mx.init.Normal(0.01), attr={'__lr_mult__': '1.0'}) + bias = mx.symbol.Variable(name="{}_bias".format(name), + init=mx.init.Constant(0.0), attr={'__lr_mult__': '2.0', '__wd_mult__': str(bias_wd_mult)}) + else: + weight = shared_weight + bias = shared_bias + print('reuse shared var in', name) + conv = mx.symbol.Convolution(data=from_layer, kernel=kernel, pad=pad, \ + stride=stride, num_filter=num_filter, name="{}".format(name), weight = weight, bias=bias) + return conv + +def conv_act_layer_dw(from_layer, name, num_filter, kernel=(1,1), pad=(0,0), \ + stride=(1,1), act_type="relu", bias_wd_mult=0.0): + assert kernel[0]==3 + weight = mx.symbol.Variable(name="{}_weight".format(name), + init=mx.init.Normal(0.01), attr={'__lr_mult__': '1.0'}) + bias = mx.symbol.Variable(name="{}_bias".format(name), + init=mx.init.Constant(0.0), attr={'__lr_mult__': '2.0', '__wd_mult__': str(bias_wd_mult)}) + conv = mx.symbol.Convolution(data=from_layer, kernel=kernel, pad=pad, \ + stride=stride, num_filter=num_filter, num_group=num_filter, name="{}".format(name), weight=weight, bias=bias) + conv = mx.sym.BatchNorm(data=conv, fix_gamma=False, eps=2e-5, momentum=0.9, name=name + '_bn') + if len(act_type)>0: + relu = mx.symbol.Activation(data=conv, act_type=act_type, \ + name="{}_{}".format(name, act_type)) + else: + relu = conv + return relu + +def conv_act_layer(from_layer, name, num_filter, kernel=(1,1), pad=(0,0), \ + stride=(1,1), act_type="relu", bias_wd_mult=0.0, separable=False, filter_in = -1): + + separable = False + if separable: + assert kernel[0]==3 + if not separable: + weight = mx.symbol.Variable(name="{}_weight".format(name), + init=mx.init.Normal(0.01), attr={'__lr_mult__': '1.0'}) + bias = 
mx.symbol.Variable(name="{}_bias".format(name), + init=mx.init.Constant(0.0), attr={'__lr_mult__': '2.0', '__wd_mult__': str(bias_wd_mult)}) + conv = mx.symbol.Convolution(data=from_layer, kernel=kernel, pad=pad, \ + stride=stride, num_filter=num_filter, name="{}".format(name), weight=weight, bias=bias) + conv = mx.sym.BatchNorm(data=conv, fix_gamma=False, eps=2e-5, momentum=0.9, name=name + '_bn') + else: + if filter_in<0: + filter_in = num_filter + conv = mx.symbol.Convolution(data=from_layer, kernel=kernel, pad=pad, \ + stride=stride, num_filter=filter_in, num_group=filter_in, name="{}_sep".format(name)) + conv = mx.sym.BatchNorm(data=conv, fix_gamma=False, eps=2e-5, momentum=0.9, name=name + '_sep_bn') + conv = mx.symbol.Activation(data=conv, act_type='relu', \ + name="{}_sep_bn_relu".format(name)) + conv = mx.symbol.Convolution(data=conv, kernel=(1,1), pad=(0,0), \ + stride=(1,1), num_filter=num_filter, name="{}".format(name)) + conv = mx.sym.BatchNorm(data=conv, fix_gamma=False, eps=2e-5, momentum=0.9, name=name + '_bn') + if len(act_type)>0: + relu = mx.symbol.Activation(data=conv, act_type=act_type, \ + name="{}_{}".format(name, act_type)) + else: + relu = conv + return relu + +def ssh_context_module(body, num_filter, filter_in, name): + conv_dimred = conv_act_layer(body, name+'_conv1', + num_filter, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', separable=True, filter_in = filter_in) + conv5x5 = conv_act_layer(conv_dimred, name+'_conv2', + num_filter, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='', separable=True) + conv7x7_1 = conv_act_layer(conv_dimred, name+'_conv3_1', + num_filter, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', separable=True) + conv7x7 = conv_act_layer(conv7x7_1, name+'_conv3_2', + num_filter, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='', separable=True) + return (conv5x5, conv7x7) + +def ssh_detection_module(body, num_filter, filter_in, name): + conv3x3 = conv_act_layer(body, name+'_conv1', + 
num_filter, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='', separable=True, filter_in=filter_in) + conv5x5, conv7x7 = ssh_context_module(body, num_filter//2, filter_in, name+'_context') + ret = mx.sym.concat(*[conv3x3, conv5x5, conv7x7], dim=1, name = name+'_concat') + ret = mx.symbol.Activation(data=ret, act_type='relu', name=name+'_concat_relu') + return ret + + +def upsampling(data, num_filter, name): + #ret = mx.symbol.Deconvolution(data=data, num_filter=num_filter, kernel=(4,4), stride=(2, 2), pad=(1,1), + # num_group = num_filter, no_bias = True, attr={'__lr_mult__': '0.0', '__wd_mult__': '0.0'}, + # name=name) + #ret = mx.symbol.Deconvolution(data=data, num_filter=num_filter, kernel=(2,2), stride=(2, 2), pad=(0,0), + # num_group = num_filter, no_bias = True, attr={'__lr_mult__': '0.0', '__wd_mult__': '0.0'}, + # name=name) + ret = mx.symbol.UpSampling(data, scale=2, sample_type='nearest', workspace=512, name=name, num_args=1) + return ret + +def get_mnet_conv(data, sym): + mm = config.MULTIPLIER + all_layers = sym.get_internals() + #print(all_layers) + ##c1 = all_layers['mobilenetv20_features_linearbottleneck6_relu60_relu6_output'] #96 + #c1 = all_layers['mobilenetv20_features_linearbottleneck5_elemwise_add0_output'] # 16 + ##c2 = all_layers['mobilenetv20_features_linearbottleneck13_relu60_relu6_output'] + #c2 = all_layers['mobilenetv20_features_linearbottleneck12_elemwise_add0_output'] # 48 + ##c3 = all_layers['mobilenetv20_features_linearbottleneck16_batchnorm2_fwd_output'] # 160 + #c3 = all_layers['mobilenetv20_features_linearbottleneck13_batchnorm2_fwd_output'] # 80 + #c1_filter = int(32*mm) + #c2_filter = int(96*mm) + #c3_filter = int(160*mm) + + #c1 = all_layers['mobilenet0_relu10_fwd_output'] + #c2 = all_layers['mobilenet0_relu22_fwd_output'] + #c3 = all_layers['mobilenet0_relu26_fwd_output'] + + #c1 = all_layers['conv_6_relu_output'] + #c2 = all_layers['conv_12_relu_output'] + #c3 = all_layers['conv_14_relu_output'] + #c1_filter = int(256*mm) 
+ #c2_filter = int(512*mm) + #c3_filter = int(1024*mm) + + isize = 640 + _, out_shape, _ = all_layers.infer_shape(data = (1,3,isize,isize)) + last_entry = None + c1 = None + c2 = None + c3 = None + c1_name = None + c2_name = None + c3_name = None + c1_filter = -1 + c2_filter = -1 + c3_filter = -1 + #print(len(all_layers), len(out_shape)) + #print(all_layers.__class__) + outputs = all_layers.list_outputs() + #print(outputs.__class__, len(outputs)) + count = len(outputs) + for i in range(count): + name = outputs[i] + shape = out_shape[i] + if not name.endswith('_output'): + continue + if len(shape)!=4: + continue + #print(name, shape) + if c1 is None and shape[2]==isize//16: + cname = last_entry[0] + #print('c1', last_entry) + c1 = all_layers[cname] + c1_name = cname + if c2 is None and shape[2]==isize//32: + cname = last_entry[0] + #print('c2', last_entry) + c2 = all_layers[cname] + c2_name = cname + if shape[2]==isize//32: + c3 = all_layers[name] + #print('c3', name, shape) + c3_name = name + + last_entry = (name, shape) + print('cnames', c1_name, c2_name, c3_name) + + F1 = int(256*mm) + F2 = int(128*mm) + if config.SHARE_WEIGHT_BBOX or config.SHARE_WEIGHT_LANDMARK: + F2 = F1 + _bwm = 1.0 + if config.NET_MODE==0: + c1_lateral = conv_act_layer(c1, 'ssh_m1_red_conv', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c2_lateral = conv_act_layer(c2, 'ssh_m2_red_conv', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + #conv5_128_up = mx.symbol.Deconvolution(data=conv5_128, num_filter=F2, kernel=(4,4), stride=(2, 2), pad=(1,1), + # num_group = F2, no_bias = True, attr={'__lr_mult__': '0.0', '__wd_mult__': '0.0'}, + # name='ssh_m2_red_upsampling') + #c2_up = mx.symbol.UpSampling(c2_lateral, scale=2, sample_type='nearest', workspace=512, name='ssh_m2_red_up', num_args=1) + c2_up = upsampling(c2_lateral, F2, 'ssh_m2_red_upsampling') + #conv4_128 = mx.symbol.Crop(*[conv4_128, conv5_128_up]) + c2_up = 
mx.symbol.Crop(*[c2_up, c1_lateral]) + + c1 = c1_lateral+c2_up + + c1 = conv_act_layer(c1, 'ssh_m1_conv', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + m1 = ssh_detection_module(c1, F2, F2, 'ssh_m1_det') + m2 = ssh_detection_module(c2, F1, c2_filter, 'ssh_m2_det') + m3 = ssh_detection_module(c3, F1, c3_filter, 'ssh_m3_det') + elif config.NET_MODE==1: + c3_lateral = conv_act_layer(c3, 'ssh_c3_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + #c3_up = mx.symbol.UpSampling(c3_lateral, scale=2, sample_type='nearest', workspace=512, name='ssh_c3_up', num_args=1) + c3_up = upsampling(c3_lateral, F2, 'ssh_c3_upsampling') + c2_lateral = conv_act_layer(c2, 'ssh_c2_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c3_up = mx.symbol.Crop(*[c3_up, c2_lateral]) + c2 = c2_lateral+c3_up + c2 = conv_act_layer(c2, 'ssh_c2_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c1_lateral = conv_act_layer(c1, 'ssh_m1_red_conv', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + #c2_up = mx.symbol.UpSampling(c2, scale=2, sample_type='nearest', workspace=512, name='ssh_m2_red_up', num_args=1) + c2_up = upsampling(c2, F2, 'ssh_c2_upsampling') + #conv4_128 = mx.symbol.Crop(*[conv4_128, conv5_128_up]) + c2_up = mx.symbol.Crop(*[c2_up, c1_lateral]) + + c1 = c1_lateral+c2_up + + c1 = conv_act_layer(c1, 'ssh_m1_conv', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + m1 = ssh_detection_module(c1, F2, F2, 'ssh_m1_det') + m2 = ssh_detection_module(c2, F1, c2_filter, 'ssh_m2_det') + m3 = ssh_detection_module(c3, F1, c3_filter, 'ssh_m3_det') + elif config.NET_MODE==2: + c3 = conv_act_layer(c3, 'ssh_c3_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + #c3_up = mx.symbol.UpSampling(c3, scale=2, 
sample_type='nearest', workspace=512, name='ssh_c3_up', num_args=1) + c3_up = upsampling(c3, F2, 'ssh_c3_upsampling') + c2_lateral = conv_act_layer(c2, 'ssh_c2_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c3_up = mx.symbol.Crop(*[c3_up, c2_lateral]) + c2 = c2_lateral+c3_up + c2 = conv_act_layer(c2, 'ssh_c2_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c1_lateral = conv_act_layer(c1, 'ssh_m1_red_conv', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + #c2_up = mx.symbol.UpSampling(c2, scale=2, sample_type='nearest', workspace=512, name='ssh_m2_red_up', num_args=1) + c2_up = upsampling(c2, F2, 'ssh_c2_upsampling') + #conv4_128 = mx.symbol.Crop(*[conv4_128, conv5_128_up]) + c2_up = mx.symbol.Crop(*[c2_up, c1_lateral]) + c1 = c1_lateral+c2_up + c1 = conv_act_layer(c1, 'ssh_c1_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + + m1 = ssh_detection_module(c1, F2, F2, 'ssh_m1_det') + m2 = ssh_detection_module(c2, F1, c2_filter, 'ssh_m2_det') + m3 = ssh_detection_module(c3, F1, c3_filter, 'ssh_m3_det') + elif config.NET_MODE==3: + #c3 = conv_act_layer(c3, 'ssh_c3_lateral', + # F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c3 = ssh_detection_module(c3, F2//2, c3_filter, 'ssh_c3_lateral') + #c3_up = mx.symbol.UpSampling(c3, scale=2, sample_type='nearest', workspace=512, name='ssh_c3_up', num_args=1) + c3_up = upsampling(c3, F2, 'ssh_c3_upsampling') + #c2_lateral = conv_act_layer(c2, 'ssh_c2_lateral', + # F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c2_lateral = ssh_detection_module(c2, F2//2, c2_filter, 'ssh_c2_lateral') + c3_up = mx.symbol.Crop(*[c3_up, c2_lateral]) + c2 = c2_lateral+c3_up + c2 = conv_act_layer(c2, 'ssh_c2_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + #c1_lateral 
= conv_act_layer(c1, 'ssh_m1_red_conv', + # F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c1_lateral = ssh_detection_module(c1, F2//2, c1_filter, 'ssh_c1_lateral') + #c2_up = mx.symbol.UpSampling(c2, scale=2, sample_type='nearest', workspace=512, name='ssh_m2_red_up', num_args=1) + c2_up = upsampling(c2, F2, 'ssh_c2_upsampling') + #conv4_128 = mx.symbol.Crop(*[conv4_128, conv5_128_up]) + c2_up = mx.symbol.Crop(*[c2_up, c1_lateral]) + c1 = c1_lateral+c2_up + c1 = conv_act_layer(c1, 'ssh_c1_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + + m1 = ssh_detection_module(c1, F2, F2, 'ssh_m1_det') + m2 = ssh_detection_module(c2, F1, c2_filter, 'ssh_m2_det') + m3 = ssh_detection_module(c3, F1, c3_filter, 'ssh_m3_det') + elif config.NET_MODE==4: + c3 = conv_act_layer(c3, 'ssh_c3_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + #c3_up = mx.symbol.UpSampling(c3, scale=2, sample_type='nearest', workspace=512, name='ssh_c3_up', num_args=1) + c3_up = upsampling(c3, F2, 'ssh_c3_upsampling') + c2_lateral = conv_act_layer(c2, 'ssh_c2_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c3_up = mx.symbol.Crop(*[c3_up, c2_lateral]) + c2 = c2_lateral+c3_up + c2 = conv_act_layer(c2, 'ssh_c2_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c1_lateral = conv_act_layer(c1, 'ssh_m1_red_conv', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + #c2_up = mx.symbol.UpSampling(c2, scale=2, sample_type='nearest', workspace=512, name='ssh_m2_red_up', num_args=1) + c2_up = upsampling(c2, F2, 'ssh_c2_upsampling') + #conv4_128 = mx.symbol.Crop(*[conv4_128, conv5_128_up]) + c2_up = mx.symbol.Crop(*[c2_up, c1_lateral]) + c1 = c1_lateral+c2_up + c1 = conv_act_layer(c1, 'ssh_c1_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', 
bias_wd_mult=_bwm) + + m1 = ssh_detection_module(c1, F2//2, F2, 'ssh_m1_det') + m2 = ssh_detection_module(c2, F1//2, c2_filter, 'ssh_m2_det') + m3 = ssh_detection_module(c3, F1//2, c3_filter, 'ssh_m3_det') + elif config.NET_MODE==5: + c3 = conv_act_layer_dw(c3, 'ssh_c3_lateral_m', + F2, kernel=(3,3), pad=(1,1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c3 = conv_act_layer(c3, 'ssh_c3_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + #c3_up = mx.symbol.UpSampling(c3, scale=2, sample_type='nearest', workspace=512, name='ssh_c3_up', num_args=1) + c3_up = upsampling(c3, F2, 'ssh_c3_upsampling') + c2 = conv_act_layer_dw(c2, 'ssh_c2_lateral_m', + F2, kernel=(3,3), pad=(1,1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c2_lateral = conv_act_layer(c2, 'ssh_c2_lateral', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c3_up = mx.symbol.Crop(*[c3_up, c2_lateral]) + c2 = c2_lateral+c3_up + c2 = conv_act_layer(c2, 'ssh_c2_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c1 = conv_act_layer_dw(c1, 'ssh_c1_lateral_m', + F2, kernel=(3,3), pad=(1,1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + c1_lateral = conv_act_layer(c1, 'ssh_m1_red_conv', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + #c2_up = mx.symbol.UpSampling(c2, scale=2, sample_type='nearest', workspace=512, name='ssh_m2_red_up', num_args=1) + c2_up = upsampling(c2, F2, 'ssh_c2_upsampling') + #conv4_128 = mx.symbol.Crop(*[conv4_128, conv5_128_up]) + c2_up = mx.symbol.Crop(*[c2_up, c1_lateral]) + c1 = c1_lateral+c2_up + c1 = conv_act_layer(c1, 'ssh_c1_aggr', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + + m1 = ssh_detection_module(c1, F2, F2, 'ssh_m1_det') + m2 = ssh_detection_module(c2, F1, c2_filter, 'ssh_m2_det') + m3 = ssh_detection_module(c3, F1, c3_filter, 'ssh_m3_det') + + return {8: 
m1, 16:m2, 32: m3} + +def get_out(conv_fpn_feat, prefix, stride, landmark=False, lr_mult=1.0, shared_vars = None): + A = config.NUM_ANCHORS + bbox_pred_len = 4 + landmark_pred_len = 10 + if config.USE_BLUR: + bbox_pred_len = 5 + if config.USE_OCCLUSION: + landmark_pred_len = 15 + ret_group = [] + num_anchors = config.RPN_ANCHOR_CFG[str(stride)]['NUM_ANCHORS'] + label = mx.symbol.Variable(name='%s_label_stride%d'%(prefix,stride)) + bbox_target = mx.symbol.Variable(name='%s_bbox_target_stride%d'%(prefix,stride)) + bbox_weight = mx.symbol.Variable(name='%s_bbox_weight_stride%d'%(prefix,stride)) + if landmark: + landmark_target = mx.symbol.Variable(name='%s_landmark_target_stride%d'%(prefix,stride)) + landmark_weight = mx.symbol.Variable(name='%s_landmark_weight_stride%d'%(prefix,stride)) + rpn_relu = conv_fpn_feat[stride] + maxout_stat = 0 + if config.USE_MAXOUT>=1 and stride==config.RPN_FEAT_STRIDE[-1]: + maxout_stat = 1 + if config.USE_MAXOUT>=2 and stride!=config.RPN_FEAT_STRIDE[-1]: + maxout_stat = 2 + + + if maxout_stat==0: + rpn_cls_score = conv_only(rpn_relu, '%s_rpn_cls_score_stride%d'%(prefix, stride), 2*num_anchors, + kernel=(1,1), pad=(0,0), stride=(1, 1), shared_weight = shared_vars[0][0], shared_bias = shared_vars[0][1]) + elif maxout_stat==1: + cls_list = [] + for a in range(num_anchors): + rpn_cls_score_bg = conv_only(rpn_relu, '%s_rpn_cls_score_stride%d_anchor%d_bg'%(prefix,stride,a), 3, + kernel=(1,1), pad=(0,0), stride=(1, 1)) + rpn_cls_score_bg = mx.sym.max(rpn_cls_score_bg, axis=1, keepdims=True) + cls_list.append(rpn_cls_score_bg) + rpn_cls_score_fg = conv_only(rpn_relu, '%s_rpn_cls_score_stride%d_anchor%d_fg'%(prefix,stride,a), 1, + kernel=(1,1), pad=(0,0), stride=(1, 1)) + cls_list.append(rpn_cls_score_fg) + rpn_cls_score = mx.sym.concat(*cls_list, dim=1, name='%s_rpn_cls_score_stride%d'%(prefix,stride)) + else: + cls_list = [] + for a in range(num_anchors): + rpn_cls_score_bg = conv_only(rpn_relu, 
'%s_rpn_cls_score_stride%d_anchor%d_bg'%(prefix,stride,a), 1, + kernel=(1,1), pad=(0,0), stride=(1, 1)) + cls_list.append(rpn_cls_score_bg) + rpn_cls_score_fg = conv_only(rpn_relu, '%s_rpn_cls_score_stride%d_anchor%d_fg'%(prefix,stride,a), 3, + kernel=(1,1), pad=(0,0), stride=(1, 1)) + rpn_cls_score_fg = mx.sym.max(rpn_cls_score_fg, axis=1, keepdims=True) + cls_list.append(rpn_cls_score_fg) + rpn_cls_score = mx.sym.concat(*cls_list, dim=1, name='%s_rpn_cls_score_stride%d'%(prefix,stride)) + + rpn_bbox_pred = conv_only(rpn_relu, '%s_rpn_bbox_pred_stride%d'%(prefix,stride), bbox_pred_len*num_anchors, + kernel=(1,1), pad=(0,0), stride=(1, 1), shared_weight = shared_vars[1][0], shared_bias = shared_vars[1][1]) + + # prepare rpn data + if not config.FBN: + rpn_cls_score_reshape = mx.symbol.Reshape(data=rpn_cls_score, + shape=(0, 2, -1), + name="%s_rpn_cls_score_reshape_stride%s" % (prefix,stride)) + else: + rpn_cls_score_reshape = mx.symbol.Reshape(data=rpn_cls_score, + shape=(0, 2, -1), + name="%s_rpn_cls_score_reshape_stride%s_pre" % (prefix,stride)) + rpn_cls_score_reshape = mx.symbol.BatchNorm(rpn_cls_score_reshape, fix_gamma=True, eps=2e-5, name="%s_rpn_cls_score_reshape_stride%s"%(prefix, stride)) + + rpn_bbox_pred_reshape = mx.symbol.Reshape(data=rpn_bbox_pred, + shape=(0, 0, -1), + name="%s_rpn_bbox_pred_reshape_stride%s" % (prefix,stride)) + if landmark: + rpn_landmark_pred = conv_only(rpn_relu, '%s_rpn_landmark_pred_stride%d'%(prefix,stride), landmark_pred_len*num_anchors, + kernel=(1,1), pad=(0,0), stride=(1, 1), shared_weight = shared_vars[2][0], shared_bias = shared_vars[2][1]) + rpn_landmark_pred_reshape = mx.symbol.Reshape(data=rpn_landmark_pred, + shape=(0, 0, -1), + name="%s_rpn_landmark_pred_reshape_stride%s" % (prefix,stride)) + + if config.TRAIN.RPN_ENABLE_OHEM>=2: + label, anchor_weight = mx.sym.Custom(op_type='rpn_fpn_ohem3', stride=int(stride), network=config.network, dataset=config.dataset, prefix=prefix, cls_score=rpn_cls_score_reshape, labels = 
label) + + _bbox_weight = mx.sym.tile(anchor_weight, (1,1,bbox_pred_len)) + _bbox_weight = _bbox_weight.reshape((0, -1, A * bbox_pred_len)).transpose((0,2,1)) + bbox_weight = mx.sym.elemwise_mul(bbox_weight, _bbox_weight, name='%s_bbox_weight_mul_stride%s'%(prefix,stride)) + + if landmark: + _landmark_weight = mx.sym.tile(anchor_weight, (1,1,landmark_pred_len)) + _landmark_weight = _landmark_weight.reshape((0, -1, A * landmark_pred_len)).transpose((0,2,1)) + landmark_weight = mx.sym.elemwise_mul(landmark_weight, _landmark_weight, name='%s_landmark_weight_mul_stride%s'%(prefix,stride)) + #if not config.FACE_LANDMARK: + # label, bbox_weight = mx.sym.Custom(op_type='rpn_fpn_ohem', stride=int(stride), cls_score=rpn_cls_score_reshape, bbox_weight = bbox_weight , labels = label) + #else: + # label, bbox_weight, landmark_weight = mx.sym.Custom(op_type='rpn_fpn_ohem2', stride=int(stride), cls_score=rpn_cls_score_reshape, bbox_weight = bbox_weight, landmark_weight=landmark_weight, labels = label) + #cls loss + rpn_cls_prob = mx.symbol.SoftmaxOutput(data=rpn_cls_score_reshape, + label=label, + multi_output=True, + normalization='valid', use_ignore=True, ignore_label=-1, + grad_scale = lr_mult, + name='%s_rpn_cls_prob_stride%d'%(prefix,stride)) + ret_group.append(rpn_cls_prob) + ret_group.append(mx.sym.BlockGrad(label)) + + #bbox loss + bbox_diff = rpn_bbox_pred_reshape-bbox_target + bbox_diff = bbox_diff * bbox_weight + rpn_bbox_loss_ = mx.symbol.smooth_l1(name='%s_rpn_bbox_loss_stride%d_'%(prefix,stride), scalar=3.0, data=bbox_diff) + rpn_bbox_loss = mx.sym.MakeLoss(name='%s_rpn_bbox_loss_stride%d'%(prefix,stride), data=rpn_bbox_loss_, grad_scale=1.0*lr_mult / (config.TRAIN.RPN_BATCH_SIZE)) + ret_group.append(rpn_bbox_loss) + ret_group.append(mx.sym.BlockGrad(bbox_weight)) + + #landmark loss + if landmark: + landmark_diff = rpn_landmark_pred_reshape-landmark_target + landmark_diff = landmark_diff * landmark_weight + rpn_landmark_loss_ = 
mx.symbol.smooth_l1(name='%s_rpn_landmark_loss_stride%d_'%(prefix,stride), scalar=3.0, data=landmark_diff) + rpn_landmark_loss = mx.sym.MakeLoss(name='%s_rpn_landmark_loss_stride%d'%(prefix,stride), data=rpn_landmark_loss_, grad_scale=0.5*lr_mult / (config.TRAIN.RPN_BATCH_SIZE)) + ret_group.append(rpn_landmark_loss) + ret_group.append(mx.sym.BlockGrad(landmark_weight)) + return ret_group + +def get_mnet_train(sym): + return get_sym_train(sym) + #data = mx.symbol.Variable(name="data") + ## shared convolutional layers + #conv_fpn_feat = get_mnet_conv(data, sym) + #ret_group = [] + #shared_vars = [] + #if config.SHARE_WEIGHT_BBOX: + # assert config.USE_MAXOUT==0 + # _name = 'face_rpn_cls_score_share' + # shared_weight = mx.symbol.Variable(name="{}_weight".format(_name), + # init=mx.init.Normal(0.01), attr={'__lr_mult__': '1.0'}) + # shared_bias = mx.symbol.Variable(name="{}_bias".format(_name), + # init=mx.init.Constant(0.0), attr={'__lr_mult__': '2.0', '__wd_mult__': str(0.0)}) + # shared_vars.append( [shared_weight, shared_bias] ) + # _name = 'face_rpn_bbox_pred_share' + # shared_weight = mx.symbol.Variable(name="{}_weight".format(_name), + # init=mx.init.Normal(0.01), attr={'__lr_mult__': '1.0'}) + # shared_bias = mx.symbol.Variable(name="{}_bias".format(_name), + # init=mx.init.Constant(0.0), attr={'__lr_mult__': '2.0', '__wd_mult__': str(0.0)}) + # shared_vars.append( [shared_weight, shared_bias] ) + #else: + # shared_vars.append( [None, None] ) + # shared_vars.append( [None, None] ) + #if config.SHARE_WEIGHT_LANDMARK: + # _name = 'face_rpn_landmark_pred_share' + # shared_weight = mx.symbol.Variable(name="{}_weight".format(_name), + # init=mx.init.Normal(0.01), attr={'__lr_mult__': '1.0'}) + # shared_bias = mx.symbol.Variable(name="{}_bias".format(_name), + # init=mx.init.Constant(0.0), attr={'__lr_mult__': '2.0', '__wd_mult__': str(0.0)}) + # shared_vars.append( [shared_weight, shared_bias] ) + #else: + # shared_vars.append( [None, None] ) + + #for stride in 
config.RPN_FEAT_STRIDE: + # ret = get_out(conv_fpn_feat, 'face', stride, config.FACE_LANDMARK, lr_mult=1.0, shared_vars = shared_vars) + # ret_group += ret + # if config.HEAD_BOX: + # ret = get_out(conv_fpn_feat, 'head', stride, False, lr_mult=0.5) + # ret_group += ret + + #return mx.sym.Group(ret_group) + + diff --git a/RetinaFace/rcnn/symbol/symbol_mnet.py.bak b/RetinaFace/rcnn/symbol/symbol_mnet.py.bak new file mode 100644 index 0000000..899804f --- /dev/null +++ b/RetinaFace/rcnn/symbol/symbol_mnet.py.bak @@ -0,0 +1,362 @@ +import mxnet as mx +import mxnet.ndarray as nd +import mxnet.gluon as gluon +import mxnet.gluon.nn as nn +import mxnet.autograd as ag +import numpy as np +from rcnn.config import config +from rcnn.PY_OP import rpn_fpn_ohem, rpn_fpn_ohem2, rpn_fpn_ohem3 + +USE_DCN = False +MM = 1.0 + +def ConvBlock(channels, kernel_size, strides, **kwargs): + out = nn.HybridSequential(**kwargs) + with out.name_scope(): + out.add( + nn.Conv2D(channels, kernel_size, strides=strides, padding=1, use_bias=False), + nn.BatchNorm(scale=True), + nn.Activation('relu') + ) + return out + +def Conv1x1(channels, is_linear=False, **kwargs): + out = nn.HybridSequential(**kwargs) + with out.name_scope(): + out.add( + nn.Conv2D(channels, 1, padding=0, use_bias=False), + nn.BatchNorm(scale=True) + ) + if not is_linear: + out.add(nn.Activation('relu')) + return out + +def DWise(channels, strides, kernel_size=3, **kwargs): + out = nn.HybridSequential(**kwargs) + with out.name_scope(): + out.add( + nn.Conv2D(channels, kernel_size, strides=strides, padding=kernel_size // 2, groups=channels, use_bias=False), + nn.BatchNorm(scale=True), + nn.Activation('relu') + ) + return out + +class SepCONV(nn.HybridBlock): + def __init__(self, inp, output, kernel_size, depth_multiplier=1, with_bn=True, **kwargs): + super(SepCONV, self).__init__(**kwargs) + with self.name_scope(): + self.net = nn.HybridSequential() + cn = int(inp*depth_multiplier) + + if output is None: + self.net.add( + 
nn.Conv2D(in_channels=inp, channels=cn, groups=inp, kernel_size=kernel_size, strides=(1,1), padding=kernel_size // 2 + , use_bias=not with_bn) + ) + else: + self.net.add( + nn.Conv2D(in_channels=inp, channels=cn, groups=inp, kernel_size=kernel_size, strides=(1,1), padding=kernel_size // 2 + , use_bias=False), + nn.BatchNorm(), + nn.Activation('relu'), + nn.Conv2D(in_channels=cn, channels=output, kernel_size=(1,1), strides=(1,1) + , use_bias=not with_bn) + ) + + self.with_bn = with_bn + self.act = nn.Activation('relu') + if with_bn: + self.bn = nn.BatchNorm() + def hybrid_forward(self, F ,x): + x = self.net(x) + if self.with_bn: + x = self.bn(x) + if self.act is not None: + x = self.act(x) + return x + +class ExpandedConv(nn.HybridBlock): + def __init__(self, inp, oup, t, strides, kernel=3, same_shape=True, **kwargs): + super(ExpandedConv, self).__init__(**kwargs) + + self.same_shape = same_shape + self.strides = strides + with self.name_scope(): + self.bottleneck = nn.HybridSequential() + self.bottleneck.add( + Conv1x1(inp*t, prefix="expand_"), + DWise(inp*t, self.strides, kernel, prefix="dwise_"), + Conv1x1(oup, is_linear=True, prefix="linear_") + ) + def hybrid_forward(self, F, x): + out = self.bottleneck(x) + if self.strides == 1 and self.same_shape: + out = F.elemwise_add(out, x) + return out + +def ExpandedConvSequence(t, k, inp, oup, repeats, first_strides, **kwargs): + seq = nn.HybridSequential(**kwargs) + with seq.name_scope(): + seq.add(ExpandedConv(inp, oup, t, first_strides, k, same_shape=False)) + curr_inp = oup + for i in range(1, repeats): + seq.add(ExpandedConv(curr_inp, oup, t, 1)) + curr_inp = oup + return seq + +class Mnasnet(nn.HybridBlock): + def __init__(self, multiplier=1.0, **kwargs): + super(Mnasnet, self).__init__(**kwargs) + mm = multiplier + + self.first_oup = 32 + self.interverted_residual_setting = [ + # t, c, n, s, k + [3, int(24*mm), 3, 2, 3, "stage2_"], # -> 56x56 + [3, int(40*mm), 3, 2, 5, "stage3_"], # -> 28x28 + [6, int(80*mm), 3, 
2, 5, "stage4_1_"], # -> 14x14 + [6, int(96*mm), 2, 1, 3, "stage4_2_"], # -> 14x14 + [6, int(192*mm), 4, 2, 5, "stage5_1_"], # -> 7x7 + [6, int(320*mm), 1, 1, 3, "stage5_2_"], # -> 7x7 + ] + self.last_channels = 1280 + + with self.name_scope(): + self.features = nn.HybridSequential() + self.features.add(ConvBlock(self.first_oup, 3, 2, prefix="stage1_conv0_")) + self.features.add(SepCONV(self.first_oup, 16, 3, prefix="stage1_sepconv0_")) + inp = 16 + for i, (t, c, n, s, k, prefix) in enumerate(self.interverted_residual_setting): + oup = c + self.features.add(ExpandedConvSequence(t, k, inp, oup, n, s, prefix=prefix)) + inp = oup + + self.features.add(Conv1x1(self.last_channels, prefix="stage5_3_")) + def hybrid_forward(self, F, x): + x = self.features(x) + return x + +def conv_act_layer(from_layer, name, num_filter, kernel=(1,1), pad=(0,0), \ + stride=(1,1), act_type="relu", bias_wd_mult=0.0, dcn=False): + + weight = mx.symbol.Variable(name="{}_weight".format(name), + init=mx.init.Normal(0.01), attr={'__lr_mult__': '1.0'}) + bias = mx.symbol.Variable(name="{}_bias".format(name), + init=mx.init.Constant(0.0), attr={'__lr_mult__': '2.0', '__wd_mult__': str(bias_wd_mult)}) + if not dcn: + conv = mx.symbol.Convolution(data=from_layer, kernel=kernel, pad=pad, \ + stride=stride, num_filter=num_filter, name="{}".format(name), weight = weight, bias=bias) + else: + assert kernel[0]==3 and kernel[1]==3 + num_group = 1 + f = num_group*18 + offset_weight = mx.symbol.Variable(name="{}_offset_weight".format(name), + init=mx.init.Constant(0.0), attr={'__lr_mult__': '1.0'}) + offset_bias = mx.symbol.Variable(name="{}_offset_bias".format(name), + init=mx.init.Constant(0.0), attr={'__lr_mult__': '2.0', '__wd_mult__': str(bias_wd_mult)}) + conv_offset = mx.symbol.Convolution(name=name+'_offset', data = from_layer, weight=offset_weight, bias=offset_bias, + num_filter=f, pad=(1, 1), kernel=(3, 3), stride=(1, 1)) + conv = mx.contrib.symbol.DeformableConvolution(name=name, data=from_layer, 
offset=conv_offset, weight=weight, bias=bias, + num_filter=num_filter, pad=(1,1), kernel=(3, 3), num_deformable_group=num_group, stride=(1, 1), no_bias=False) + if len(act_type)>0: + relu = mx.symbol.Activation(data=conv, act_type=act_type, \ + name="{}_{}".format(name, act_type)) + else: + relu = conv + return relu + +def ssh_context_module(body, num_filters, name): + conv_dimred = conv_act_layer(body, name+'_conv1', + num_filters, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', dcn=False) + conv5x5 = conv_act_layer(conv_dimred, name+'_conv2', + num_filters, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='', dcn=USE_DCN) + conv7x7_1 = conv_act_layer(conv_dimred, name+'_conv3_1', + num_filters, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', dcn=False) + conv7x7 = conv_act_layer(conv7x7_1, name+'_conv3_2', + num_filters, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='', dcn=USE_DCN) + return (conv5x5, conv7x7) + +def ssh_detection_module(body, num_filters, name): + conv3x3 = conv_act_layer(body, name+'_conv1', + num_filters, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='', dcn=USE_DCN) + conv5x5, conv7x7 = ssh_context_module(body, num_filters//2, name+'_context') + ret = mx.sym.concat(*[conv3x3, conv5x5, conv7x7], dim=1, name = name+'_concat') + ret = mx.symbol.Activation(data=ret, act_type='relu', name=name+'_concat_relu') + return ret + +def conv_bn(input, filter, ksize, stride, padding, act_type='relu', name=''): + conv = mx.symbol.Convolution(data=input, kernel=(ksize,ksize), pad=(padding,padding), \ + stride=(stride,stride), num_filter=filter, name=name+"_conv") + ret = mx.sym.BatchNorm(data=conv, fix_gamma=False, eps=2e-5, momentum=0.9, name=name + '_bn') + if act_type is not None: + ret = mx.symbol.Activation(data=ret, act_type=act_type, \ + name="{}_{}".format(name, act_type)) + return ret + +def cpm(input, name): + # residual + branch1 = conv_bn(input, 1024, 1, 1, 0, act_type=None, name=name+"_branch1") + branch2a = 
conv_bn(input, 256, 1, 1, 0, act_type='relu', name=name+"_branch2a") + branch2b = conv_bn(branch2a, 256, 3, 1, 1, act_type='relu', name=name+"_branch2b") + branch2c = conv_bn(branch2b, 1024, 1, 1, 0, act_type=None, name=name+"_branch2c") + sum = branch1 + branch2c + rescomb = mx.symbol.Activation(data=sum, act_type='relu', name="%s_relu2"%(name)) + + ssh_out = ssh_detection_module(rescomb, 256, name=name+"_ssh") + return ssh_out + +def get_mnet_conv(data): + mm = MM + net = Mnasnet(mm, prefix="") + body = net(data) + + all_layers = body.get_internals() + #print(all_layers) + c1 = all_layers['stage3_expandedconv2_elemwise_add0_output'] + c2 = all_layers['stage4_2_expandedconv1_elemwise_add0_output'] + #c3 = all_layers['stage5_3_relu0_fwd_output'] + c3 = all_layers['stage5_2_expandedconv0_linear_batchnorm0_fwd_output'] + + F1 = int(256*mm) + F2 = int(128*mm) + _bwm = 1.0 + conv4_128 = conv_act_layer(c1, 'ssh_m1_red_conv', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + conv5_128 = conv_act_layer(c2, 'ssh_m2_red_conv', + F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + conv5_128_up = mx.symbol.Deconvolution(data=conv5_128, num_filter=F2, kernel=(4,4), stride=(2, 2), pad=(1,1), + num_group = F2, no_bias = True, attr={'__lr_mult__': '0.0', '__wd_mult__': '0.0'}, + name='ssh_m2_red_upsampling') + #conv5_128_up = mx.symbol.UpSampling(conv5_128, scale=2, sample_type='nearest', workspace=512, name='ssh_m2_red_up', num_args=1) + conv4_128 = mx.symbol.Crop(*[conv4_128, conv5_128_up]) + #conv5_128_up = mx.symbol.Crop(*[conv5_128_up, conv4_128]) + + conv_sum = conv4_128+conv5_128_up + #conv_sum = conv_1x1 + + m1_conv = conv_act_layer(conv_sum, 'ssh_m1_conv', + F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm) + m1 = ssh_detection_module(m1_conv, F2, 'ssh_m1_det') + m2 = ssh_detection_module(c2, F1, 'ssh_m2_det') + m3 = ssh_detection_module(c3, F1, 'ssh_m3_det') + return {8: 
m1, 16:m2, 32: m3} + +def get_out(conv_fpn_feat, prefix, stride, landmark=False, lr_mult=1.0): + A = config.NUM_ANCHORS + ret_group = [] + num_anchors = config.RPN_ANCHOR_CFG[str(stride)]['NUM_ANCHORS'] + label = mx.symbol.Variable(name='%s_label_stride%d'%(prefix,stride)) + bbox_target = mx.symbol.Variable(name='%s_bbox_target_stride%d'%(prefix,stride)) + bbox_weight = mx.symbol.Variable(name='%s_bbox_weight_stride%d'%(prefix,stride)) + if landmark: + landmark_target = mx.symbol.Variable(name='%s_landmark_target_stride%d'%(prefix,stride)) + landmark_weight = mx.symbol.Variable(name='%s_landmark_weight_stride%d'%(prefix,stride)) + rpn_relu = conv_fpn_feat[stride] + maxout_stat = 0 + if config.USE_MAXOUT>=1 and stride==config.RPN_FEAT_STRIDE[-1]: + maxout_stat = 1 + if config.USE_MAXOUT>=2 and stride!=config.RPN_FEAT_STRIDE[-1]: + maxout_stat = 2 + + if maxout_stat==0: + rpn_cls_score = conv_act_layer(rpn_relu, '%s_rpn_cls_score_stride%d'%(prefix, stride), 2*num_anchors, + kernel=(1,1), pad=(0,0), stride=(1, 1), act_type='') + elif maxout_stat==1: + cls_list = [] + for a in range(num_anchors): + rpn_cls_score_bg = conv_act_layer(rpn_relu, '%s_rpn_cls_score_stride%d_anchor%d_bg'%(prefix,stride,a), 3, + kernel=(1,1), pad=(0,0), stride=(1, 1), act_type='') + rpn_cls_score_bg = mx.sym.max(rpn_cls_score_bg, axis=1, keepdims=True) + cls_list.append(rpn_cls_score_bg) + rpn_cls_score_fg = conv_act_layer(rpn_relu, '%s_rpn_cls_score_stride%d_anchor%d_fg'%(prefix,stride,a), 1, + kernel=(1,1), pad=(0,0), stride=(1, 1), act_type='') + cls_list.append(rpn_cls_score_fg) + rpn_cls_score = mx.sym.concat(*cls_list, dim=1, name='%s_rpn_cls_score_stride%d'%(prefix,stride)) + else: + cls_list = [] + for a in range(num_anchors): + rpn_cls_score_bg = conv_act_layer(rpn_relu, '%s_rpn_cls_score_stride%d_anchor%d_bg'%(prefix,stride,a), 1, + kernel=(1,1), pad=(0,0), stride=(1, 1), act_type='') + cls_list.append(rpn_cls_score_bg) + rpn_cls_score_fg = conv_act_layer(rpn_relu, 
'%s_rpn_cls_score_stride%d_anchor%d_fg'%(prefix,stride,a), 3, + kernel=(1,1), pad=(0,0), stride=(1, 1), act_type='') + rpn_cls_score_fg = mx.sym.max(rpn_cls_score_fg, axis=1, keepdims=True) + cls_list.append(rpn_cls_score_fg) + rpn_cls_score = mx.sym.concat(*cls_list, dim=1, name='%s_rpn_cls_score_stride%d'%(prefix,stride)) + + rpn_bbox_pred = conv_act_layer(rpn_relu, '%s_rpn_bbox_pred_stride%d'%(prefix,stride), 4*num_anchors, + kernel=(1,1), pad=(0,0), stride=(1, 1), act_type='') + + # prepare rpn data + rpn_cls_score_reshape = mx.symbol.Reshape(data=rpn_cls_score, + shape=(0, 2, -1), + name="%s_rpn_cls_score_reshape_stride%s" % (prefix,stride)) + rpn_bbox_pred_reshape = mx.symbol.Reshape(data=rpn_bbox_pred, + shape=(0, 0, -1), + name="%s_rpn_bbox_pred_reshape_stride%s" % (prefix,stride)) + if landmark: + rpn_landmark_pred = conv_act_layer(rpn_relu, '%s_rpn_landmark_pred_stride%d'%(prefix,stride), 10*num_anchors, + kernel=(1,1), pad=(0,0), stride=(1, 1), act_type='') + rpn_landmark_pred_reshape = mx.symbol.Reshape(data=rpn_landmark_pred, + shape=(0, 0, -1), + name="%s_rpn_landmark_pred_reshape_stride%s" % (prefix,stride)) + + if config.TRAIN.RPN_ENABLE_OHEM>=2: + label, anchor_weight = mx.sym.Custom(op_type='rpn_fpn_ohem3', stride=int(stride), network=config.network, dataset=config.dataset, prefix=prefix, cls_score=rpn_cls_score_reshape, labels = label) + + _bbox_weight = mx.sym.tile(anchor_weight, (1,1,4)) + _bbox_weight = _bbox_weight.reshape((0, -1, A * 4)).transpose((0,2,1)) + bbox_weight = mx.sym.elemwise_mul(bbox_weight, _bbox_weight, name='%s_bbox_weight_mul_stride%s'%(prefix,stride)) + + if landmark: + _landmark_weight = mx.sym.tile(anchor_weight, (1,1,10)) + _landmark_weight = _landmark_weight.reshape((0, -1, A * 10)).transpose((0,2,1)) + landmark_weight = mx.sym.elemwise_mul(landmark_weight, _landmark_weight, name='%s_landmark_weight_mul_stride%s'%(prefix,stride)) + #if not config.FACE_LANDMARK: + # label, bbox_weight = 
mx.sym.Custom(op_type='rpn_fpn_ohem', stride=int(stride), cls_score=rpn_cls_score_reshape, bbox_weight = bbox_weight , labels = label) + #else: + # label, bbox_weight, landmark_weight = mx.sym.Custom(op_type='rpn_fpn_ohem2', stride=int(stride), cls_score=rpn_cls_score_reshape, bbox_weight = bbox_weight, landmark_weight=landmark_weight, labels = label) + #cls loss + rpn_cls_prob = mx.symbol.SoftmaxOutput(data=rpn_cls_score_reshape, + label=label, + multi_output=True, + normalization='valid', use_ignore=True, ignore_label=-1, + grad_scale = lr_mult, + name='%s_rpn_cls_prob_stride%d'%(prefix,stride)) + ret_group.append(rpn_cls_prob) + ret_group.append(mx.sym.BlockGrad(label)) + + #bbox loss + bbox_diff = rpn_bbox_pred_reshape-bbox_target + bbox_diff = bbox_diff * bbox_weight + rpn_bbox_loss_ = mx.symbol.smooth_l1(name='%s_rpn_bbox_loss_stride%d_'%(prefix,stride), scalar=3.0, data=bbox_diff) + rpn_bbox_loss = mx.sym.MakeLoss(name='%s_rpn_bbox_loss_stride%d'%(prefix,stride), data=rpn_bbox_loss_, grad_scale=1.0*lr_mult / (config.TRAIN.RPN_BATCH_SIZE)) + ret_group.append(rpn_bbox_loss) + ret_group.append(mx.sym.BlockGrad(bbox_weight)) + + #landmark loss + if landmark: + landmark_diff = rpn_landmark_pred_reshape-landmark_target + landmark_diff = landmark_diff * landmark_weight + rpn_landmark_loss_ = mx.symbol.smooth_l1(name='%s_rpn_landmark_loss_stride%d_'%(prefix,stride), scalar=3.0, data=landmark_diff) + rpn_landmark_loss = mx.sym.MakeLoss(name='%s_rpn_landmark_loss_stride%d'%(prefix,stride), data=rpn_landmark_loss_, grad_scale=0.5*lr_mult / (config.TRAIN.RPN_BATCH_SIZE)) + ret_group.append(rpn_landmark_loss) + ret_group.append(mx.sym.BlockGrad(landmark_weight)) + return ret_group + +def get_mnet_train(): + data = mx.symbol.Variable(name="data") + + # shared convolutional layers + conv_fpn_feat = get_mnet_conv(data) + ret_group = [] + for stride in config.RPN_FEAT_STRIDE: + ret = get_out(conv_fpn_feat, 'face', stride, config.FACE_LANDMARK, lr_mult=1.0) + ret_group += ret 
+ if config.HEAD_BOX: + ret = get_out(conv_fpn_feat, 'head', stride, False, lr_mult=1.0) + ret_group += ret + + return mx.sym.Group(ret_group) + + diff --git a/RetinaFace/rcnn/symbol/symbol_resnet.py b/RetinaFace/rcnn/symbol/symbol_resnet.py new file mode 100644 index 0000000..e435095 --- /dev/null +++ b/RetinaFace/rcnn/symbol/symbol_resnet.py @@ -0,0 +1,423 @@ +import mxnet as mx +import mxnet.ndarray as nd +import mxnet.gluon as gluon +import mxnet.gluon.nn as nn +import mxnet.autograd as ag +import numpy as np +from rcnn.config import config +from rcnn.PY_OP import rpn_fpn_ohem3 +from symbol_common import get_sym_train + +def conv_only(from_layer, name, num_filter, kernel=(1,1), pad=(0,0), \ + stride=(1,1), bias_wd_mult=0.0): + weight = mx.symbol.Variable(name="{}_weight".format(name), + init=mx.init.Normal(0.01), attr={'__lr_mult__': '1.0'}) + bias = mx.symbol.Variable(name="{}_bias".format(name), + init=mx.init.Constant(0.0), attr={'__lr_mult__': '2.0', '__wd_mult__': str(bias_wd_mult)}) + conv = mx.symbol.Convolution(data=from_layer, kernel=kernel, pad=pad, \ + stride=stride, num_filter=num_filter, name="{}".format(name), weight = weight, bias=bias) + return conv + +def conv_act_layer_dw(from_layer, name, num_filter, kernel=(1,1), pad=(0,0), \ + stride=(1,1), act_type="relu", bias_wd_mult=0.0): + assert kernel[0]==3 + weight = mx.symbol.Variable(name="{}_weight".format(name), + init=mx.init.Normal(0.01), attr={'__lr_mult__': '1.0'}) + bias = mx.symbol.Variable(name="{}_bias".format(name), + init=mx.init.Constant(0.0), attr={'__lr_mult__': '2.0', '__wd_mult__': str(bias_wd_mult)}) + conv = mx.symbol.Convolution(data=from_layer, kernel=kernel, pad=pad, \ + stride=stride, num_filter=num_filter, num_group=num_filter, name="{}".format(name), weight=weight, bias=bias) + conv = mx.sym.BatchNorm(data=conv, fix_gamma=False, eps=2e-5, momentum=0.9, name=name + '_bn') + if len(act_type)>0: + relu = mx.symbol.Activation(data=conv, act_type=act_type, \ + 
name="{}_{}".format(name, act_type)) + else: + relu = conv + return relu + +def conv_act_layer(from_layer, name, num_filter, kernel=(1,1), pad=(0,0), \ + stride=(1,1), act_type="relu", bias_wd_mult=0.0, separable=False, filter_in = -1): + + separable = False + if separable: + assert kernel[0]==3 + if not separable: + weight = mx.symbol.Variable(name="{}_weight".format(name), + init=mx.init.Normal(0.01), attr={'__lr_mult__': '1.0'}) + bias = mx.symbol.Variable(name="{}_bias".format(name), + init=mx.init.Constant(0.0), attr={'__lr_mult__': '2.0', '__wd_mult__': str(bias_wd_mult)}) + conv = mx.symbol.Convolution(data=from_layer, kernel=kernel, pad=pad, \ + stride=stride, num_filter=num_filter, name="{}".format(name), weight=weight, bias=bias) + conv = mx.sym.BatchNorm(data=conv, fix_gamma=False, eps=2e-5, momentum=0.9, name=name + '_bn') + else: + if filter_in<0: + filter_in = num_filter + conv = mx.symbol.Convolution(data=from_layer, kernel=kernel, pad=pad, \ + stride=stride, num_filter=filter_in, num_group=filter_in, name="{}_sep".format(name)) + conv = mx.sym.BatchNorm(data=conv, fix_gamma=False, eps=2e-5, momentum=0.9, name=name + '_sep_bn') + conv = mx.symbol.Activation(data=conv, act_type='relu', \ + name="{}_sep_bn_relu".format(name)) + conv = mx.symbol.Convolution(data=conv, kernel=(1,1), pad=(0,0), \ + stride=(1,1), num_filter=num_filter, name="{}".format(name)) + conv = mx.sym.BatchNorm(data=conv, fix_gamma=False, eps=2e-5, momentum=0.9, name=name + '_bn') + if len(act_type)>0: + relu = mx.symbol.Activation(data=conv, act_type=act_type, \ + name="{}_{}".format(name, act_type)) + else: + relu = conv + return relu + +def ssh_context_module(body, num_filter, filter_in, name): + conv_dimred = conv_act_layer(body, name+'_conv1', + num_filter, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', separable=True, filter_in = filter_in) + conv5x5 = conv_act_layer(conv_dimred, name+'_conv2', + num_filter, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='', 
def get_resnet_conv(data, sym):
    """Attach the SSH/FPN detection heads to a backbone symbol.

    The backbone's internal outputs are scanned (with a 640x640 probe input)
    to pick three feature levels:

    * ``c1`` - the last 4-D ``*_output`` seen before the first one whose
      height is 640/16 (i.e. the end of the stride-8 stage),
    * ``c2`` - the last 4-D ``*_output`` seen before the first one whose
      height is 640/32 (end of the stride-16 stage),
    * ``c3`` - the last 4-D ``*_output`` whose height is 640/32.

    ``config.NET_MODE`` (0..5) then selects how the levels are fused top-down
    before the detection modules are applied. Layer names and the order in
    which ops are created are kept exactly as before so existing checkpoints
    still load.

    :param data: unused; kept for interface compatibility
    :param sym: backbone symbol
    :return: ``(feats, head_feats)`` where ``feats`` is ``{8: m1, 16: m2, 32: m3}``.
        ``head_feats`` is ``{8: n1, 16: n2, 32: n3}`` for ``NET_MODE == 2`` (the
        only mode that builds the extra modules) and an empty dict otherwise.
        Previously every other mode crashed with NameError on return.
    :raises ValueError: if a feature level cannot be located in ``sym`` or
        ``config.NET_MODE`` is not one of 0..5
    """
    all_layers = sym.get_internals()
    isize = 640
    _, out_shape, _ = all_layers.infer_shape(data = (1,3,isize,isize))
    last_entry = None
    c1 = None
    c2 = None
    c3 = None
    outputs = all_layers.list_outputs()
    for i, name in enumerate(outputs):
        shape = out_shape[i]
        if not name.endswith('_output'):
            continue
        if len(shape)!=4:
            continue
        print(name, shape)
        if c1 is None and shape[2]==isize//16:
            if last_entry is None:
                raise ValueError('no layer precedes the first stride-16 output %s' % name)
            print('c1', last_entry)
            c1 = all_layers[last_entry[0]]
        if c2 is None and shape[2]==isize//32:
            if last_entry is None:
                raise ValueError('no layer precedes the first stride-32 output %s' % name)
            print('c2', last_entry)
            c2 = all_layers[last_entry[0]]
        if shape[2]==isize//32:
            # Keep overwriting: the final value is the last stride-32 layer.
            c3 = all_layers[name]
            print('c3', name, shape)

        last_entry = (name, shape)

    if c1 is None or c2 is None or c3 is None:
        raise ValueError('could not locate stride 8/16/32 feature maps in the backbone symbol')

    # -1 is what the detection modules have always been given as "input width".
    c1_filter = -1
    c2_filter = -1
    c3_filter = -1

    F1 = 256
    F2 = 256
    _bwm = 1.0

    def _lateral(feat, lname):
        # 1x1 conv bringing a level to F2 channels.
        return conv_act_layer(feat, lname,
            F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm)

    def _aggregate(feat, lname):
        # 3x3 conv smoothing a fused level.
        return conv_act_layer(feat, lname,
            F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm)

    def _depthwise(feat, lname):
        # 3x3 depthwise conv used by NET_MODE 5 in front of each lateral.
        return conv_act_layer_dw(feat, lname,
            F2, kernel=(3,3), pad=(1,1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm)

    def _upsample(feat, lname):
        return mx.symbol.UpSampling(feat, scale=2, sample_type='nearest', workspace=512, name=lname, num_args=1)

    def _crop_add(upsampled, lateral):
        # Crop the upsampled map to the lateral's size, then sum (lateral first,
        # matching the original operand order).
        cropped = mx.symbol.Crop(*[upsampled, lateral])
        return lateral + cropped

    mode = config.NET_MODE
    head_feats = {}
    det_filters = (F2, F1, F1)
    if mode==0:
        c1_lateral = _lateral(c1, 'ssh_m1_red_conv')
        c2_lateral = _lateral(c2, 'ssh_m2_red_conv')
        c2_up = _upsample(c2_lateral, 'ssh_m2_red_up')
        c1 = _aggregate(_crop_add(c2_up, c1_lateral), 'ssh_m1_conv')
    elif mode==1:
        c3_lateral = _lateral(c3, 'ssh_c3_lateral')
        c3_up = _upsample(c3_lateral, 'ssh_c3_up')
        c2_lateral = _lateral(c2, 'ssh_c2_lateral')
        c2 = _aggregate(_crop_add(c3_up, c2_lateral), 'ssh_c2_aggr')
        c1_lateral = _lateral(c1, 'ssh_m1_red_conv')
        c2_up = _upsample(c2, 'ssh_m2_red_up')
        c1 = _aggregate(_crop_add(c2_up, c1_lateral), 'ssh_m1_conv')
    elif mode==2 or mode==4:
        if mode==2:
            # Extra detection modules on the raw backbone levels.
            n1 = ssh_detection_module(c1, F2, F2, 'ssh_n1_det')
            n2 = ssh_detection_module(c2, F1, c2_filter, 'ssh_n2_det')
            n3 = ssh_detection_module(c3, F1, c3_filter, 'ssh_n3_det')
            head_feats = {8: n1, 16:n2, 32: n3}
        else:
            det_filters = (F2//2, F1//2, F1//2)
        c3 = _lateral(c3, 'ssh_c3_lateral')
        c3_up = _upsample(c3, 'ssh_c3_up')
        c2_lateral = _lateral(c2, 'ssh_c2_lateral')
        c2 = _aggregate(_crop_add(c3_up, c2_lateral), 'ssh_c2_aggr')
        c1_lateral = _lateral(c1, 'ssh_m1_red_conv')
        c2_up = _upsample(c2, 'ssh_m2_red_up')
        c1 = _aggregate(_crop_add(c2_up, c1_lateral), 'ssh_c1_aggr')
    elif mode==3:
        # Laterals are full detection modules instead of 1x1 convs.
        c3 = ssh_detection_module(c3, F2//2, c3_filter, 'ssh_c3_lateral')
        c3_up = _upsample(c3, 'ssh_c3_up')
        c2_lateral = ssh_detection_module(c2, F2//2, c2_filter, 'ssh_c2_lateral')
        c2 = _aggregate(_crop_add(c3_up, c2_lateral), 'ssh_c2_aggr')
        c1_lateral = ssh_detection_module(c1, F2//2, c1_filter, 'ssh_c1_lateral')
        c2_up = _upsample(c2, 'ssh_m2_red_up')
        c1 = _aggregate(_crop_add(c2_up, c1_lateral), 'ssh_c1_aggr')
    elif mode==5:
        c3 = _depthwise(c3, 'ssh_c3_lateral_m')
        c3 = _lateral(c3, 'ssh_c3_lateral')
        c3_up = _upsample(c3, 'ssh_c3_up')
        c2 = _depthwise(c2, 'ssh_c2_lateral_m')
        c2_lateral = _lateral(c2, 'ssh_c2_lateral')
        c2 = _aggregate(_crop_add(c3_up, c2_lateral), 'ssh_c2_aggr')
        c1 = _depthwise(c1, 'ssh_c1_lateral_m')
        c1_lateral = _lateral(c1, 'ssh_m1_red_conv')
        c2_up = _upsample(c2, 'ssh_m2_red_up')
        c1 = _aggregate(_crop_add(c2_up, c1_lateral), 'ssh_c1_aggr')
    else:
        raise ValueError('unsupported config.NET_MODE: %r' % (mode,))

    m1 = ssh_detection_module(c1, det_filters[0], F2, 'ssh_m1_det')
    m2 = ssh_detection_module(c2, det_filters[1], c2_filter, 'ssh_m2_det')
    m3 = ssh_detection_module(c3, det_filters[2], c3_filter, 'ssh_m3_det')

    return {8: m1, 16:m2, 32: m3}, head_feats
config.USE_MAXOUT>=2 and stride!=config.RPN_FEAT_STRIDE[-1]: + maxout_stat = 2 + + if maxout_stat==0: + rpn_cls_score = conv_only(rpn_relu, '%s_rpn_cls_score_stride%d'%(prefix, stride), 2*num_anchors, + kernel=(1,1), pad=(0,0), stride=(1, 1)) + elif maxout_stat==1: + cls_list = [] + for a in range(num_anchors): + rpn_cls_score_bg = conv_only(rpn_relu, '%s_rpn_cls_score_stride%d_anchor%d_bg'%(prefix,stride,a), 3, + kernel=(1,1), pad=(0,0), stride=(1, 1)) + rpn_cls_score_bg = mx.sym.max(rpn_cls_score_bg, axis=1, keepdims=True) + cls_list.append(rpn_cls_score_bg) + rpn_cls_score_fg = conv_only(rpn_relu, '%s_rpn_cls_score_stride%d_anchor%d_fg'%(prefix,stride,a), 1, + kernel=(1,1), pad=(0,0), stride=(1, 1)) + cls_list.append(rpn_cls_score_fg) + rpn_cls_score = mx.sym.concat(*cls_list, dim=1, name='%s_rpn_cls_score_stride%d'%(prefix,stride)) + else: + cls_list = [] + for a in range(num_anchors): + rpn_cls_score_bg = conv_only(rpn_relu, '%s_rpn_cls_score_stride%d_anchor%d_bg'%(prefix,stride,a), 1, + kernel=(1,1), pad=(0,0), stride=(1, 1)) + cls_list.append(rpn_cls_score_bg) + rpn_cls_score_fg = conv_only(rpn_relu, '%s_rpn_cls_score_stride%d_anchor%d_fg'%(prefix,stride,a), 3, + kernel=(1,1), pad=(0,0), stride=(1, 1)) + rpn_cls_score_fg = mx.sym.max(rpn_cls_score_fg, axis=1, keepdims=True) + cls_list.append(rpn_cls_score_fg) + rpn_cls_score = mx.sym.concat(*cls_list, dim=1, name='%s_rpn_cls_score_stride%d'%(prefix,stride)) + + rpn_bbox_pred = conv_only(rpn_relu, '%s_rpn_bbox_pred_stride%d'%(prefix,stride), bbox_pred_len*num_anchors, + kernel=(1,1), pad=(0,0), stride=(1, 1)) + + # prepare rpn data + if not config.FBN: + rpn_cls_score_reshape = mx.symbol.Reshape(data=rpn_cls_score, + shape=(0, 2, -1), + name="%s_rpn_cls_score_reshape_stride%s" % (prefix,stride)) + else: + rpn_cls_score_reshape = mx.symbol.Reshape(data=rpn_cls_score, + shape=(0, 2, -1), + name="%s_rpn_cls_score_reshape_stride%s_pre" % (prefix,stride)) + rpn_cls_score_reshape = 
mx.symbol.BatchNorm(rpn_cls_score_reshape, fix_gamma=True, eps=2e-5, name="%s_rpn_cls_score_reshape_stride%s"%(prefix, stride)) + + rpn_bbox_pred_reshape = mx.symbol.Reshape(data=rpn_bbox_pred, + shape=(0, 0, -1), + name="%s_rpn_bbox_pred_reshape_stride%s" % (prefix,stride)) + if landmark: + rpn_landmark_pred = conv_only(rpn_relu, '%s_rpn_landmark_pred_stride%d'%(prefix,stride), landmark_pred_len*num_anchors, + kernel=(1,1), pad=(0,0), stride=(1, 1)) + rpn_landmark_pred_reshape = mx.symbol.Reshape(data=rpn_landmark_pred, + shape=(0, 0, -1), + name="%s_rpn_landmark_pred_reshape_stride%s" % (prefix,stride)) + + if config.TRAIN.RPN_ENABLE_OHEM>=2: + label, anchor_weight = mx.sym.Custom(op_type='rpn_fpn_ohem3', stride=int(stride), network=config.network, dataset=config.dataset, prefix=prefix, cls_score=rpn_cls_score_reshape, labels = label) + + _bbox_weight = mx.sym.tile(anchor_weight, (1,1,bbox_pred_len)) + _bbox_weight = _bbox_weight.reshape((0, -1, A * bbox_pred_len)).transpose((0,2,1)) + bbox_weight = mx.sym.elemwise_mul(bbox_weight, _bbox_weight, name='%s_bbox_weight_mul_stride%s'%(prefix,stride)) + + if landmark: + _landmark_weight = mx.sym.tile(anchor_weight, (1,1,landmark_pred_len)) + _landmark_weight = _landmark_weight.reshape((0, -1, A * landmark_pred_len)).transpose((0,2,1)) + landmark_weight = mx.sym.elemwise_mul(landmark_weight, _landmark_weight, name='%s_landmark_weight_mul_stride%s'%(prefix,stride)) + #if not config.FACE_LANDMARK: + # label, bbox_weight = mx.sym.Custom(op_type='rpn_fpn_ohem', stride=int(stride), cls_score=rpn_cls_score_reshape, bbox_weight = bbox_weight , labels = label) + #else: + # label, bbox_weight, landmark_weight = mx.sym.Custom(op_type='rpn_fpn_ohem2', stride=int(stride), cls_score=rpn_cls_score_reshape, bbox_weight = bbox_weight, landmark_weight=landmark_weight, labels = label) + #cls loss + rpn_cls_prob = mx.symbol.SoftmaxOutput(data=rpn_cls_score_reshape, + label=label, + multi_output=True, + normalization='valid', 
def get_resnet_train(sym):
    """Return the training symbol for the given backbone symbol.

    Thin wrapper: forwards ``sym`` unchanged to ``get_sym_train`` (imported
    from ``symbol_common``) and returns its result.

    :param sym: backbone symbol
    :return: whatever ``get_sym_train(sym)`` returns
    """
    return get_sym_train(sym)
    # Everything below is the previous inline implementation; it is commented
    # out and sits after the return, so it never runs.
    #data = mx.symbol.Variable(name="data")
    ## shared convolutional layers
    #conv_fpn_feat, conv_fpn_feat2 = get_resnet_conv(data, sym)
    #ret_group = []
    #for stride in config.RPN_FEAT_STRIDE:
    #  ret = get_out(conv_fpn_feat, 'face', stride, config.FACE_LANDMARK, lr_mult=1.0)
    #  ret_group += ret
    #  if config.HEAD_BOX:
    #    ret = get_out(conv_fpn_feat2, 'head', stride, False, lr_mult=1.0)
    #    ret_group += ret

    #return mx.sym.Group(ret_group)
def conv_act_layer(from_layer, name, num_filter, kernel=(1,1), pad=(0,0), \
    stride=(1,1), act_type="relu", bias_wd_mult=0.0, dcn=False):
    """Convolution (plain or deformable) followed by an optional activation.

    :param from_layer: input symbol
    :param name: base layer name (``<name>_weight``, ``<name>_bias``, ...)
    :param num_filter: number of output channels
    :param kernel: kernel size tuple; must be (3, 3) when ``dcn`` is set
    :param pad: padding tuple (plain convolution only)
    :param stride: stride tuple (plain convolution only)
    :param act_type: activation name; an empty string skips the activation
    :param bias_wd_mult: weight-decay multiplier applied to the biases
    :param dcn: build a deformable convolution instead of a plain one; that
        path always uses a 3x3 kernel with pad 1 / stride 1, regardless of the
        ``pad`` and ``stride`` arguments
    :return: output symbol
    """
    bias_attr = {'__lr_mult__': '2.0', '__wd_mult__': str(bias_wd_mult)}
    conv_weight = mx.symbol.Variable(
        name="{}_weight".format(name),
        init=mx.init.Normal(0.01), attr={'__lr_mult__': '1.0'})
    conv_bias = mx.symbol.Variable(
        name="{}_bias".format(name),
        init=mx.init.Constant(0.0), attr=bias_attr)

    if dcn:
        assert kernel[0]==3 and kernel[1]==3
        deform_groups = 1
        # 2 offsets (x, y) for each of the 9 kernel taps, per deformable group.
        offset_channels = deform_groups*18
        offset_weight = mx.symbol.Variable(
            name="{}_offset_weight".format(name),
            init=mx.init.Constant(0.0), attr={'__lr_mult__': '1.0'})
        offset_bias = mx.symbol.Variable(
            name="{}_offset_bias".format(name),
            init=mx.init.Constant(0.0),
            attr={'__lr_mult__': '2.0', '__wd_mult__': str(bias_wd_mult)})
        offsets = mx.symbol.Convolution(
            name=name+'_offset', data=from_layer,
            weight=offset_weight, bias=offset_bias,
            num_filter=offset_channels, pad=(1, 1), kernel=(3, 3), stride=(1, 1))
        out = mx.contrib.symbol.DeformableConvolution(
            name=name, data=from_layer, offset=offsets,
            weight=conv_weight, bias=conv_bias,
            num_filter=num_filter, pad=(1,1), kernel=(3, 3),
            num_deformable_group=deform_groups, stride=(1, 1), no_bias=False)
    else:
        out = mx.symbol.Convolution(
            data=from_layer, kernel=kernel, pad=pad, stride=stride,
            num_filter=num_filter, name="{}".format(name),
            weight=conv_weight, bias=conv_bias)

    if len(act_type) == 0:
        return out
    return mx.symbol.Activation(
        data=out, act_type=act_type, name="{}_{}".format(name, act_type))
def cpm(input, name):
    """Residual bottleneck block followed by an SSH detection module.

    A 1x1 projection shortcut (1024 channels) is added to a 1x1 -> 3x3 -> 1x1
    bottleneck (256 -> 256 -> 1024), passed through ReLU and fed to
    ``ssh_detection_module`` with 256 filters.

    :param input: input symbol
    :param name: name prefix for every layer created here
    :return: output symbol of the detection module
    """
    shortcut = conv_bn(input, 1024, 1, 1, 0, act_type=None, name=name+"_branch1")

    # (name suffix, channels, kernel size, padding, activation)
    bottleneck_plan = (
        ("_branch2a", 256, 1, 0, 'relu'),
        ("_branch2b", 256, 3, 1, 'relu'),
        ("_branch2c", 1024, 1, 0, None),
    )
    residual = input
    for suffix, channels, ksize, padding, activation in bottleneck_plan:
        residual = conv_bn(residual, channels, ksize, 1, padding,
                           act_type=activation, name=name+suffix)

    combined = shortcut + residual
    rescomb = mx.symbol.Activation(data=combined, act_type='relu', name="%s_relu2"%(name))
    return ssh_detection_module(rescomb, 256, name=name+"_ssh")
def get_ssh_conv(data):
    """
    Shared convolutional layers: a VGG-16 style stack plus the SSH heads.

    The stride-8 head works on conv4_3 fused with an upsampled conv5_3, the
    stride-16 head on conv5_3 and the stride-32 head on a max-pooled conv5_3.
    ``config.SSH_MODE`` <= 5 uses ``ssh_detection_module``; larger values use
    ``cpm``.

    :param data: Symbol
    :return: dict mapping stride (8, 16, 32) to the detection feature Symbol
    """
    # (conv layer names, channels, name of the max-pool closing the group)
    vgg_plan = (
        (('conv1_1', 'conv1_2'), 64, 'pool1'),
        (('conv2_1', 'conv2_2'), 128, 'pool2'),
        (('conv3_1', 'conv3_2', 'conv3_3'), 256, 'pool3'),
        (('conv4_1', 'conv4_2', 'conv4_3'), 512, 'pool4'),
        (('conv5_1', 'conv5_2', 'conv5_3'), 512, None),
    )
    group_tail = {}
    feat = data
    for layer_names, channels, pool_name in vgg_plan:
        for layer_name in layer_names:
            feat = conv_act_layer(feat, layer_name,
                channels, kernel=(3,3), pad=(1,1), stride=(1, 1), act_type='relu')
        # Remember the last activation of the group before it gets pooled.
        group_tail[layer_names[-1]] = feat
        if pool_name is not None:
            feat = mx.symbol.Pooling(
                data=feat, pool_type="max", kernel=(2, 2), stride=(2, 2), name=pool_name)
    relu4_3 = group_tail['conv4_3']
    relu5_3 = group_tail['conv5_3']
    m3_pool = mx.sym.Pooling(data=relu5_3, kernel=(2, 2), stride=(2,2), pad=(0,0), pool_type='max')

    use_ssh_module = config.SSH_MODE<=5
    F1 = 256
    # Only the SSH-module variants other than mode 1 use the narrow 128 reduction.
    if use_ssh_module and config.SSH_MODE!=1:
        F2 = 128
    else:
        F2 = 256
    _bwm = 1.0

    conv4_128 = conv_act_layer(relu4_3, 'ssh_m1_red_conv',
        F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm)
    conv5_128 = conv_act_layer(relu5_3, 'ssh_m2_red_conv',
        F2, kernel=(1, 1), pad=(0, 0), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm)
    # Per-channel 2x deconvolution with lr_mult / wd_mult of 0.
    conv5_128_up = mx.symbol.Deconvolution(data=conv5_128, num_filter=F2, kernel=(4,4), stride=(2, 2), pad=(1,1),
        num_group = F2, no_bias = True, attr={'__lr_mult__': '0.0', '__wd_mult__': '0.0'},
        name='ssh_m2_red_upsampling')
    conv4_128 = mx.symbol.Crop(*[conv4_128, conv5_128_up])

    conv_sum = conv4_128+conv5_128_up
    m1_conv = conv_act_layer(conv_sum, 'ssh_m1_conv',
        F2, kernel=(3, 3), pad=(1, 1), stride=(1, 1), act_type='relu', bias_wd_mult=_bwm)

    if use_ssh_module:
        m1 = ssh_detection_module(m1_conv, F2, 'ssh_m1_det')
        m2 = ssh_detection_module(relu5_3, F1, 'ssh_m2_det')
        m3 = ssh_detection_module(m3_pool, F1, 'ssh_m3_det')
    else:
        m1 = cpm(m1_conv, 'ssh_m1_det')
        m2 = cpm(relu5_3, 'ssh_m2_det')
        m3 = cpm(m3_pool, 'ssh_m3_det')
    return {8: m1, 16:m2, 32: m3}
rpn_cls_score_bg = mx.sym.max(rpn_cls_score_bg, axis=1, keepdims=True) + cls_list.append(rpn_cls_score_bg) + rpn_cls_score_fg = conv_act_layer(rpn_relu, '%s_rpn_cls_score_stride%d_anchor%d_fg'%(prefix,stride,a), 1, + kernel=(1,1), pad=(0,0), stride=(1, 1), act_type='') + cls_list.append(rpn_cls_score_fg) + rpn_cls_score = mx.sym.concat(*cls_list, dim=1, name='%s_rpn_cls_score_stride%d'%(prefix,stride)) + else: + cls_list = [] + for a in range(num_anchors): + rpn_cls_score_bg = conv_act_layer(rpn_relu, '%s_rpn_cls_score_stride%d_anchor%d_bg'%(prefix,stride,a), 1, + kernel=(1,1), pad=(0,0), stride=(1, 1), act_type='') + cls_list.append(rpn_cls_score_bg) + rpn_cls_score_fg = conv_act_layer(rpn_relu, '%s_rpn_cls_score_stride%d_anchor%d_fg'%(prefix,stride,a), 3, + kernel=(1,1), pad=(0,0), stride=(1, 1), act_type='') + rpn_cls_score_fg = mx.sym.max(rpn_cls_score_fg, axis=1, keepdims=True) + cls_list.append(rpn_cls_score_fg) + rpn_cls_score = mx.sym.concat(*cls_list, dim=1, name='%s_rpn_cls_score_stride%d'%(prefix,stride)) + + rpn_bbox_pred = conv_act_layer(rpn_relu, '%s_rpn_bbox_pred_stride%d'%(prefix,stride), 4*num_anchors, + kernel=(1,1), pad=(0,0), stride=(1, 1), act_type='') + + # prepare rpn data + rpn_cls_score_reshape = mx.symbol.Reshape(data=rpn_cls_score, + shape=(0, 2, -1), + name="%s_rpn_cls_score_reshape_stride%s" % (prefix,stride)) + rpn_bbox_pred_reshape = mx.symbol.Reshape(data=rpn_bbox_pred, + shape=(0, 0, -1), + name="%s_rpn_bbox_pred_reshape_stride%s" % (prefix,stride)) + if landmark: + rpn_landmark_pred = conv_act_layer(rpn_relu, '%s_rpn_landmark_pred_stride%d'%(prefix,stride), 10*num_anchors, + kernel=(1,1), pad=(0,0), stride=(1, 1), act_type='') + rpn_landmark_pred_reshape = mx.symbol.Reshape(data=rpn_landmark_pred, + shape=(0, 0, -1), + name="%s_rpn_landmark_pred_reshape_stride%s" % (prefix,stride)) + + if config.TRAIN.RPN_ENABLE_OHEM>=2: + label, anchor_weight = mx.sym.Custom(op_type='rpn_fpn_ohem3', stride=int(stride), network=config.network, 
dataset=config.dataset, prefix=prefix, cls_score=rpn_cls_score_reshape, labels = label) + + _bbox_weight = mx.sym.tile(anchor_weight, (1,1,4)) + _bbox_weight = _bbox_weight.reshape((0, -1, A * 4)).transpose((0,2,1)) + bbox_weight = mx.sym.elemwise_mul(bbox_weight, _bbox_weight, name='%s_bbox_weight_mul_stride%s'%(prefix,stride)) + + if landmark: + _landmark_weight = mx.sym.tile(anchor_weight, (1,1,10)) + _landmark_weight = _landmark_weight.reshape((0, -1, A * 10)).transpose((0,2,1)) + landmark_weight = mx.sym.elemwise_mul(landmark_weight, _landmark_weight, name='%s_landmark_weight_mul_stride%s'%(prefix,stride)) + #if not config.FACE_LANDMARK: + # label, bbox_weight = mx.sym.Custom(op_type='rpn_fpn_ohem', stride=int(stride), cls_score=rpn_cls_score_reshape, bbox_weight = bbox_weight , labels = label) + #else: + # label, bbox_weight, landmark_weight = mx.sym.Custom(op_type='rpn_fpn_ohem2', stride=int(stride), cls_score=rpn_cls_score_reshape, bbox_weight = bbox_weight, landmark_weight=landmark_weight, labels = label) + #cls loss + rpn_cls_prob = mx.symbol.SoftmaxOutput(data=rpn_cls_score_reshape, + label=label, + multi_output=True, + normalization='valid', use_ignore=True, ignore_label=-1, + grad_scale = lr_mult, + name='%s_rpn_cls_prob_stride%d'%(prefix,stride)) + ret_group.append(rpn_cls_prob) + ret_group.append(mx.sym.BlockGrad(label)) + + #bbox loss + bbox_diff = rpn_bbox_pred_reshape-bbox_target + bbox_diff = bbox_diff * bbox_weight + rpn_bbox_loss_ = mx.symbol.smooth_l1(name='%s_rpn_bbox_loss_stride%d_'%(prefix,stride), scalar=3.0, data=bbox_diff) + rpn_bbox_loss = mx.sym.MakeLoss(name='%s_rpn_bbox_loss_stride%d'%(prefix,stride), data=rpn_bbox_loss_, grad_scale=1.0*lr_mult / (config.TRAIN.RPN_BATCH_SIZE)) + ret_group.append(rpn_bbox_loss) + ret_group.append(mx.sym.BlockGrad(bbox_weight)) + + #landmark loss + if landmark: + landmark_diff = rpn_landmark_pred_reshape-landmark_target + landmark_diff = landmark_diff * landmark_weight + rpn_landmark_loss_ = 
mx.symbol.smooth_l1(name='%s_rpn_landmark_loss_stride%d_'%(prefix,stride), scalar=3.0, data=landmark_diff) + rpn_landmark_loss = mx.sym.MakeLoss(name='%s_rpn_landmark_loss_stride%d'%(prefix,stride), data=rpn_landmark_loss_, grad_scale=0.5*lr_mult / (config.TRAIN.RPN_BATCH_SIZE)) + ret_group.append(rpn_landmark_loss) + ret_group.append(mx.sym.BlockGrad(landmark_weight)) + return ret_group + +def get_ssh_train(): + """ + Region Proposal Network with VGG + :return: Symbol + """ + data = mx.symbol.Variable(name="data") + + # shared convolutional layers + conv_fpn_feat = get_ssh_conv(data) + ret_group = [] + for stride in config.RPN_FEAT_STRIDE: + ret = get_out(conv_fpn_feat, 'face', stride, config.FACE_LANDMARK, lr_mult=1.0) + ret_group += ret + if config.HEAD_BOX: + ret = get_out(conv_fpn_feat, 'head', stride, False, lr_mult=1.0) + ret_group += ret + + return mx.sym.Group(ret_group) + + diff --git a/RetinaFace/rcnn/tools/__init__.py b/RetinaFace/rcnn/tools/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/RetinaFace/rcnn/tools/demo_images.py b/RetinaFace/rcnn/tools/demo_images.py new file mode 100644 index 0000000..23d3c9c --- /dev/null +++ b/RetinaFace/rcnn/tools/demo_images.py @@ -0,0 +1,307 @@ +import argparse +from ..config import default, generate_config +from ..symbol import symbol_insightext +from ..utils.load_model import load_param +from ..core.module import MutableModule +from rcnn.processing.bbox_transform import nonlinear_pred, clip_boxes +from rcnn.processing.nms import py_nms_wrapper +from rcnn.processing.nms import processing_nms_wrapper +bbox_pred = nonlinear_pred + +import numpy as np +import os +from scipy import io +import cv2 +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt + +def demo_maskrcnn(network, ctx, prefix, epoch, + vis= True, has_rpn = True, thresh = 0.001): + + assert has_rpn,"Only has_rpn==True has been supported." 
+ sym = eval('get_' + network + '_mask_test')(num_classes=config.NUM_CLASSES) + arg_params, aux_params = load_param(prefix, epoch, convert=True, ctx=ctx, process=True) + + max_image_shape = (1,3,1024,1024) + max_data_shapes = [("data",max_image_shape),("im_info",(1,3))] + mod = MutableModule(symbol = sym, data_names = ["data","im_info"], label_names= None, + max_data_shapes = max_data_shapes, + context=ctx) + mod.bind(data_shapes = max_data_shapes, label_shapes = None, for_training=False) + mod.init_params(arg_params=arg_params, aux_params=aux_params) + + class OneDataBatch(): + def __init__(self,img): + im_info = mx.nd.array([[img.shape[0],img.shape[1],1.0]]) + img = np.transpose(img,(2,0,1)) + img = img[np.newaxis,(2,1,0)] + self.data = [mx.nd.array(img),im_info] + self.label = None + self.provide_label = None + self.provide_data = [("data",(1,3,img.shape[2],img.shape[3])),("im_info",(1,3))] + + #img_ori = cv2.imread(img_path) + #batch = OneDataBatch(img_ori) + #mod.forward(batch, False) + #results = mod.get_outputs() + #output = dict(zip(mod.output_names, results)) + #rois = output['rois_output'].asnumpy()[:, 1:] + + + #scores = output['cls_prob_reshape_output'].asnumpy()[0] + #bbox_deltas = output['bbox_pred_reshape_output'].asnumpy()[0] + #mask_output = output['mask_prob_output'].asnumpy() + + #pred_boxes = bbox_pred(rois, bbox_deltas) + #pred_boxes = clip_boxes(pred_boxes, [img_ori.shape[0],img_ori.shape[1]]) + + #nms = py_nms_wrapper(config.TEST.NMS) + + #boxes= pred_boxes + + #CLASSES = ('__background__', 'person', 'rider', 'car', 'truck', 'bus', 'train', 'mcycle', 'bicycle') + #CLASSES = ('__background__', 'text') + #all_boxes = [[[] for _ in xrange(1)] + # for _ in xrange(len(CLASSES))] + #all_masks = [[[] for _ in xrange(1)] + # for _ in xrange(len(CLASSES))] + #label = np.argmax(scores, axis=1) + #label = label[:, np.newaxis] + + #for cls in CLASSES: + # cls_ind = CLASSES.index(cls) + # cls_boxes = boxes[:, 4 * cls_ind:4 * (cls_ind + 1)] + # cls_masks = 
mask_output[:, cls_ind, :, :] + # cls_scores = scores[:, cls_ind, np.newaxis] + # #print cls_scores.shape, label.shape + # keep = np.where((cls_scores >= thresh) & (label == cls_ind))[0] + # cls_masks = cls_masks[keep, :, :] + # dets = np.hstack((cls_boxes, cls_scores)).astype(np.float32)[keep, :] + # keep = nms(dets) + # #print dets.shape, cls_masks.shape + # all_boxes[cls_ind] = dets[keep, :] + # all_masks[cls_ind] = cls_masks[keep, :, :] + + #boxes_this_image = [[]] + [all_boxes[j] for j in range(1, len(CLASSES))] + #masks_this_image = [[]] + [all_masks[j] for j in range(1, len(CLASSES))] + + + #import copy + #import random + # class_names = CLASSES + #color_white = (255, 255, 255) + #scale = 1.0 + #im = copy.copy(img_ori) + + #for j, name in enumerate(class_names): + # if name == '__background__': + # continue + # color = (random.randint(0, 256), random.randint(0, 256), random.randint(0, 256)) # generate a random color + # dets = boxes_this_image[j] + # masks = masks_this_image[j] + # for i in range(len(dets)): + # bbox = dets[i, :4] * scale + # if bbox[2] == bbox[0] or bbox[3] == bbox[1] or bbox[0] == bbox[1] or bbox[2] == bbox[3] : + # continue + # score = dets[i, -1] + # bbox = map(int, bbox) + # cv2.rectangle(im, (bbox[0], bbox[1]), (bbox[2], bbox[3]), color=color, thickness=2) + # cv2.putText(im, '%s %.3f' % (class_names[j], score), (bbox[0], bbox[1] + 10), + # color=color_white, fontFace=cv2.FONT_HERSHEY_COMPLEX, fontScale=0.5) + # mask = masks[i, :, :] + # mask = cv2.resize(mask, (bbox[2] - bbox[0], (bbox[3] - bbox[1])), interpolation=cv2.INTER_LINEAR) + #3 + + # mask[mask > 0.5] = 1 + # mask[mask <= 0.5] = 0 + # mask_color = random.randint(0, 255) + # c = random.randint(0, 2) + # target = im[bbox[1]: bbox[3], bbox[0]: bbox[2], c] + mask_color * mask + # target[target >= 255] = 255 + # im[bbox[1]: bbox[3], bbox[0]: bbox[2], c] = target + #im = im[:,:,(2,1,0)] + #cv2.imwrite("figures/test_result.jpg",im) + #plt.imshow(im) + #fig1 = plt.gcf() + 
#plt.savefig("figures/test_result.jpg") + #if vis: + #plt.show() + #else: + imglist_file = os.path.join(default.dataset_path, 'imglists', 'test.lst') + assert os.path.exists(imglist_file), 'Path does not exist: {}'.format(imglist_file) + imgfiles_list = [] + with open(imglist_file, 'r') as f: + for line in f: + file_list = dict() + label = line.strip().split('\t') + #file_list['img_id'] = label[0] + file_list['img_path'] = label[1] + #file_list['ins_seg_path'] = label[2].replace('labelTrainIds', 'instanceIds') + imgfiles_list.append(file_list) + + #assert len(imgfiles_list) == self.num_images, 'number of boxes matrix must match number of images' + roidb = [] + index = 0 + for im in range(len(imgfiles_list)): + #print '===============================', im, '=====================================' + #roi_rec = dict() + #img_path = os.path.join(self.data_path, imgfiles_list[im]['img_path']) + index = im + 1; + img_path = os.path.join(default.dataset_path, 'ch4_test_images','img_' + str(index) + '.jpg') + #size = cv2.imread(roi_rec['image']).shape + #roi_rec['height'] = size[0] + #roi_rec['width'] = size[1] + #img_path = os.path.join(img_path, 'img_' + index + '.jpg') + + + img_ori = cv2.imread(img_path) + #img_ori = cv2.resize(img_ori, (, 28), interpolation=cv2.INTER_NEAREST) + batch = OneDataBatch(img_ori) + mod.forward(batch, False) + results = mod.get_outputs() + output = dict(zip(mod.output_names, results)) + rois = output['rois_output'].asnumpy()[:, 1:] + + + scores = output['cls_prob_reshape_output'].asnumpy()[0] + bbox_deltas = output['bbox_pred_reshape_output'].asnumpy()[0] + mask_output = output['mask_prob_output'].asnumpy() + + pred_boxes = bbox_pred(rois, bbox_deltas) + pred_boxes = clip_boxes(pred_boxes, [img_ori.shape[0],img_ori.shape[1]]) + + #nms = py_nms_wrapper(config.TEST.NMS) + nms = processing_nms_wrapper(config.TEST.NMS, 0.8) + boxes= pred_boxes + + CLASSES = ('__background__', 'text') + + all_boxes = [[[] for _ in xrange(1)] + for _ in 
xrange(len(CLASSES))] + all_masks = [[[] for _ in xrange(1)] + for _ in xrange(len(CLASSES))] + label = np.argmax(scores, axis=1) + label = label[:, np.newaxis] + + for cls in CLASSES: + cls_ind = CLASSES.index(cls) + cls_boxes = boxes[:, 4 * cls_ind:4 * (cls_ind + 1)] + cls_masks = mask_output[:, cls_ind, :, :] + cls_scores = scores[:, cls_ind, np.newaxis] + #print cls_scores.shape, label.shape + keep = np.where((cls_scores >= thresh) & (label == cls_ind))[0] + cls_masks = cls_masks[keep, :, :] + dets = np.hstack((cls_boxes, cls_scores)).astype(np.float32)[keep, :] + keep = nms(dets) + #print dets.shape, cls_masks.shape + all_boxes[cls_ind] = dets[keep, :] + all_masks[cls_ind] = cls_masks[keep, :, :] + + boxes_this_image = [[]] + [all_boxes[j] for j in range(1, len(CLASSES))] + masks_this_image = [[]] + [all_masks[j] for j in range(1, len(CLASSES))] + + + import copy + import random + class_names = CLASSES + color_white = (255, 255, 255) + scale = 1.0 + im = copy.copy(img_ori) + num_boxes = 0 + + for j, name in enumerate(class_names): + if name == '__background__': + continue + color = (random.randint(0, 256), random.randint(0, 256), random.randint(0, 256)) # generate a random color + dets = boxes_this_image[j] + masks = masks_this_image[j] + for i in range(len(dets)): + #num_boxes += 1 + bbox = dets[i, :4] * scale + #if bbox[2] == bbox[0] or bbox[3] == bbox[1] or bbox[0] == bbox[1] or bbox[2] == bbox[3] : + if bbox[2] == bbox[0] or bbox[3] == bbox[1] : + continue + num_boxes += 1 + score = dets[i, -1] + bbox = map(int, bbox) + cv2.rectangle(im, (bbox[0], bbox[1]), (bbox[2], bbox[3]), color=color, thickness=2) + cv2.putText(im, '%s %.3f' % (class_names[j], score), (bbox[0], bbox[1] + 10), + color=color_white, fontFace=cv2.FONT_HERSHEY_COMPLEX, fontScale=0.5) + mask = masks[i, :, :] + mask = cv2.resize(mask, (bbox[2] - bbox[0], (bbox[3] - bbox[1])), interpolation=cv2.INTER_LINEAR) + mask[mask > 0.5] = 1 + mask[mask <= 0.5] = 0 + + px = np.where(mask == 1) + x_min = 
np.min(px[1]) + y_min = np.min(px[0]) + x_max = np.max(px[1]) + y_max = np.max(px[0]) + #if x_max - x_min <= 1 or y_max - y_min <= 1: + # continue + im_binary = np.zeros(im[:,:,0].shape) + im_binary[bbox[1]: bbox[3], bbox[0]: bbox[2]] = im_binary[bbox[1]: bbox[3], bbox[0]: bbox[2]] + mask + mask_color = random.randint(0, 255) + c = random.randint(0, 2) + target = im[bbox[1]: bbox[3], bbox[0]: bbox[2], c] + mask_color * mask + target[target >= 255] = 255 + im[bbox[1]: bbox[3], bbox[0]: bbox[2], c] = target + #cv2.imwrite("figures/test_result.jpg",im) + inst_dir = os.path.join(default.dataset_path, 'test_mat') + if not os.path.exists(inst_dir): + os.makedirs(inst_dir) + inst_path = os.path.join(inst_dir,'result_{}_{}.mat'.format(index,num_boxes)) + io.savemat(inst_path, {'Segmentation': im_binary}) + numbox = open('data/boxnum.txt','a') + numbox.write(str(num_boxes)+'\n') + numbox.close() + img_dir = os.path.join(default.dataset_path, 'test_result_img') + if not os.path.exists(img_dir): + os.makedirs(img_dir) + img_path = os.path.join(img_dir,'result_{}.jpg'.format(index)) + cv2.imwrite(img_path,im) + + #im = im[:,:,(2,1,0)] + #plt.imshow(im) + #if vis: + # plt.show() + #else: + # plt.savefig("figures/test_result.jpg") +def parse_args(): + parser = argparse.ArgumentParser(description='Test a Fast R-CNN network') + # general + parser.add_argument('--network', help='network name', default=default.network, type=str) + parser.add_argument('--dataset', help='dataset name', default=default.dataset, type=str) + args, rest = parser.parse_known_args() + generate_config(args.network, args.dataset) + # testing + parser.add_argument('--prefix', help='model to test with', default=default.rcnn_prefix, type=str) + parser.add_argument('--epoch', help='model to test with', default=default.rcnn_epoch, type=int) + parser.add_argument('--gpu', help='GPU device to test with', default=0, type=int) + # rcnn + parser.add_argument('--vis', help='turn on visualization', action='store_true') + 
parser.add_argument('--thresh', help='valid detection threshold', default=1e-3, type=float) + parser.add_argument('--image_name', help='image file path',type=str) + + + args = parser.parse_args() + + return args + + +def main(): + args = parse_args() + ctx = mx.gpu(args.gpu) + print args + demo_maskrcnn(network = args.network, + ctx = ctx, + prefix = args.prefix, + epoch = args.epoch, + img_path = args.image_name, + vis= args.vis, + has_rpn = True, + thresh = args.thresh) + +if __name__ == '__main__': + main() diff --git a/RetinaFace/rcnn/tools/demo_single_image.py b/RetinaFace/rcnn/tools/demo_single_image.py new file mode 100644 index 0000000..2031daa --- /dev/null +++ b/RetinaFace/rcnn/tools/demo_single_image.py @@ -0,0 +1,164 @@ +import argparse +from ..config import default, generate_config +from ..symbol import * +from ..utils.load_model import load_param +from ..core.module import MutableModule +from rcnn.processing.bbox_transform import nonlinear_pred, clip_boxes +from rcnn.processing.nms import py_nms_wrapper +import mxnet as mx +#from rcnn.processing.nms import processing_nms_wrapper +bbox_pred = nonlinear_pred + +import numpy as np +import cv2 +import matplotlib.pyplot as plt + +def demo_maskrcnn(network, ctx, prefix, epoch,img_path, + vis= True, has_rpn = True, thresh = 0.001): + + assert has_rpn,"Only has_rpn==True has been supported." 
+ #sym = eval('get_' + network + '_mask_test')(num_classes=config.NUM_CLASSES, num_anchors=config.NUM_ANCHORS) + sym = eval('get_' + network + '_mask_test')(num_classes=config.NUM_CLASSES) + arg_params, aux_params = load_param(prefix, epoch, convert=True, ctx=ctx, process=True) + for k,v in arg_params.iteritems(): + print(k, v.shape) + + max_image_shape = (1,3,1024,1024) + max_data_shapes = [("data",max_image_shape),("im_info",(1,3))] + mod = MutableModule(symbol = sym, data_names = ["data","im_info"], label_names= None, + max_data_shapes = max_data_shapes, + context=ctx) + mod.bind(data_shapes = max_data_shapes, label_shapes = None, for_training=False) + mod.init_params(arg_params=arg_params, aux_params=aux_params) + + class OneDataBatch(): + def __init__(self,img): + im_info = mx.nd.array([[img.shape[0],img.shape[1],1.0]]) + img = np.transpose(img,(2,0,1)) + img = img[np.newaxis,(2,1,0)] + self.data = [mx.nd.array(img),im_info] + self.label = None + self.provide_label = None + self.provide_data = [("data",(1,3,img.shape[2],img.shape[3])),("im_info",(1,3))] + + img_ori = cv2.imread(img_path) + batch = OneDataBatch(img_ori) + mod.forward(batch, False) + results = mod.get_outputs() + output = dict(zip(mod.output_names, results)) + rois = output['rois_output'].asnumpy()[:, 1:] + + + scores = output['cls_prob_reshape_output'].asnumpy()[0] + bbox_deltas = output['bbox_pred_reshape_output'].asnumpy()[0] + mask_output = output['mask_prob_output'].asnumpy() + + pred_boxes = bbox_pred(rois, bbox_deltas) + pred_boxes = clip_boxes(pred_boxes, [img_ori.shape[0],img_ori.shape[1]]) + + nms = py_nms_wrapper(config.TEST.NMS) + #nms = processing_nms_wrapper(config.TEST.NMS, 0.7) + boxes= pred_boxes + + CLASSES = ('__background__', 'text') + + all_boxes = [[[] for _ in xrange(1)] + for _ in xrange(len(CLASSES))] + all_masks = [[[] for _ in xrange(1)] + for _ in xrange(len(CLASSES))] + label = np.argmax(scores, axis=1) + label = label[:, np.newaxis] + + for cls in CLASSES: + cls_ind 
= CLASSES.index(cls) + cls_boxes = boxes[:, 4 * cls_ind:4 * (cls_ind + 1)] + cls_masks = mask_output[:, cls_ind, :, :] + cls_scores = scores[:, cls_ind, np.newaxis] + #print cls_scores.shape, label.shape + keep = np.where((cls_scores >= thresh) & (label == cls_ind))[0] + cls_masks = cls_masks[keep, :, :] + dets = np.hstack((cls_boxes, cls_scores)).astype(np.float32)[keep, :] + keep = nms(dets) + #print dets.shape, cls_masks.shape + all_boxes[cls_ind] = dets[keep, :] + all_masks[cls_ind] = cls_masks[keep, :, :] + + boxes_this_image = [[]] + [all_boxes[j] for j in range(1, len(CLASSES))] + masks_this_image = [[]] + [all_masks[j] for j in range(1, len(CLASSES))] + + + import copy + import random + class_names = CLASSES + color_white = (255, 255, 255) + scale = 1.0 + im = copy.copy(img_ori) + + for j, name in enumerate(class_names): + if name == '__background__': + continue + color = (random.randint(0, 256), random.randint(0, 256), random.randint(0, 256)) # generate a random color + dets = boxes_this_image[j] + masks = masks_this_image[j] + for i in range(len(dets)): + bbox = dets[i, :4] * scale + if bbox[2] == bbox[0] or bbox[3] == bbox[1] or bbox[0] == bbox[1] or bbox[2] == bbox[3] : + continue + score = dets[i, -1] + bbox = map(int, bbox) + cv2.rectangle(im, (bbox[0], bbox[1]), (bbox[2], bbox[3]), color=color, thickness=2) + cv2.putText(im, '%s %.3f' % (class_names[j], score), (bbox[0], bbox[1] + 10), + color=color_white, fontFace=cv2.FONT_HERSHEY_COMPLEX, fontScale=0.5) + mask = masks[i, :, :] + mask = cv2.resize(mask, (bbox[2] - bbox[0], (bbox[3] - bbox[1])), interpolation=cv2.INTER_LINEAR) + mask[mask > 0.5] = 1 + mask[mask <= 0.5] = 0 + mask_color = random.randint(0, 255) + c = random.randint(0, 2) + target = im[bbox[1]: bbox[3], bbox[0]: bbox[2], c] + mask_color * mask + target[target >= 255] = 255 + im[bbox[1]: bbox[3], bbox[0]: bbox[2], c] = target + ##im = im[:,:,(2,1,0)] + ##plt.imshow(im) + cv2.imwrite("figures/test_result.jpg",im) + #if vis: + # 
plt.show() + # else: + # plt.savefig("figures/test_result.jpg") +def parse_args(): + parser = argparse.ArgumentParser(description='Test a Fast R-CNN network') + # general + parser.add_argument('--network', help='network name', default=default.network, type=str) + parser.add_argument('--dataset', help='dataset name', default=default.dataset, type=str) + args, rest = parser.parse_known_args() + generate_config(args.network, args.dataset) + # testing + parser.add_argument('--prefix', help='model to test with', default=default.rcnn_prefix, type=str) + parser.add_argument('--epoch', help='model to test with', default=default.rcnn_epoch, type=int) + parser.add_argument('--gpu', help='GPU device to test with', default=0, type=int) + # rcnn + parser.add_argument('--vis', help='turn on visualization', action='store_true') + parser.add_argument('--thresh', help='valid detection threshold', default=1e-3, type=float) + parser.add_argument('--image_name', help='image file path',type=str) + + + args = parser.parse_args() + + return args + + +def main(): + args = parse_args() + ctx = mx.gpu(args.gpu) + print args + demo_maskrcnn(network = args.network, + ctx = ctx, + prefix = args.prefix, + epoch = args.epoch, + img_path = args.image_name, + vis= args.vis, + has_rpn = True, + thresh = args.thresh) + +if __name__ == '__main__': + main() diff --git a/RetinaFace/rcnn/tools/reeval.py b/RetinaFace/rcnn/tools/reeval.py new file mode 100644 index 0000000..1bc9c5b --- /dev/null +++ b/RetinaFace/rcnn/tools/reeval.py @@ -0,0 +1,50 @@ +import argparse +try: + import cPickle as pickle +except ImportError: + import pickle +import os +import mxnet as mx + +from ..logger import logger +from ..config import config, default, generate_config +from ..dataset import * + + +def reeval(args): + # load imdb + imdb = eval(args.dataset)(args.image_set, args.root_path, args.dataset_path) + + # load detection results + cache_file = os.path.join(imdb.cache_path, imdb.name, 'detections.pkl') + with 
open(cache_file) as f: + detections = pickle.load(f) + + # eval + imdb.evaluate_detections(detections) + + +def parse_args(): + parser = argparse.ArgumentParser(description='imdb test') + # general + parser.add_argument('--network', help='network name', default=default.network, type=str) + parser.add_argument('--dataset', help='dataset name', default=default.dataset, type=str) + args, rest = parser.parse_known_args() + generate_config(args.network, args.dataset) + parser.add_argument('--image_set', help='image_set name', default=default.image_set, type=str) + parser.add_argument('--root_path', help='output data folder', default=default.root_path, type=str) + parser.add_argument('--dataset_path', help='dataset path', default=default.dataset_path, type=str) + # other + parser.add_argument('--no_shuffle', help='disable random shuffle', action='store_true') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + logger.info('Called with argument: %s' % args) + reeval(args) + + +if __name__ == '__main__': + main() diff --git a/RetinaFace/rcnn/tools/test_rcnn.py b/RetinaFace/rcnn/tools/test_rcnn.py new file mode 100644 index 0000000..5e53106 --- /dev/null +++ b/RetinaFace/rcnn/tools/test_rcnn.py @@ -0,0 +1,109 @@ +import argparse +import pprint +import mxnet as mx + +from ..logger import logger +from ..config import config, default, generate_config +from ..symbol import * +from ..dataset import * +from ..core.loader import TestLoader +from ..core.tester import Predictor, pred_eval +from ..utils.load_model import load_param + + +def test_rcnn(network, dataset, image_set, root_path, dataset_path, + ctx, prefix, epoch, + vis, shuffle, has_rpn, proposal, thresh): + # set config + if has_rpn: + config.TEST.HAS_RPN = True + + # print config + logger.info(pprint.pformat(config)) + + # load symbol and testing data + if has_rpn: + sym = eval('get_' + network + '_test')(num_classes=config.NUM_CLASSES, num_anchors=config.NUM_ANCHORS) + imdb = 
eval(dataset)(image_set, root_path, dataset_path) + roidb = imdb.gt_roidb() + else: + sym = eval('get_' + network + '_rcnn_test')(num_classes=config.NUM_CLASSES) + imdb = eval(dataset)(image_set, root_path, dataset_path) + gt_roidb = imdb.gt_roidb() + roidb = eval('imdb.' + proposal + '_roidb')(gt_roidb) + + # get test data iter + test_data = TestLoader(roidb, batch_size=1, shuffle=shuffle, has_rpn=has_rpn) + + # load model + arg_params, aux_params = load_param(prefix, epoch, convert=True, ctx=ctx, process=True) + + # infer shape + data_shape_dict = dict(test_data.provide_data) + arg_shape, _, aux_shape = sym.infer_shape(**data_shape_dict) + arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape)) + aux_shape_dict = dict(zip(sym.list_auxiliary_states(), aux_shape)) + + # check parameters + for k in sym.list_arguments(): + if k in data_shape_dict or 'label' in k: + continue + assert k in arg_params, k + ' not initialized' + assert arg_params[k].shape == arg_shape_dict[k], \ + 'shape inconsistent for ' + k + ' inferred ' + str(arg_shape_dict[k]) + ' provided ' + str(arg_params[k].shape) + for k in sym.list_auxiliary_states(): + assert k in aux_params, k + ' not initialized' + assert aux_params[k].shape == aux_shape_dict[k], \ + 'shape inconsistent for ' + k + ' inferred ' + str(aux_shape_dict[k]) + ' provided ' + str(aux_params[k].shape) + + # decide maximum shape + data_names = [k[0] for k in test_data.provide_data] + label_names = None + max_data_shape = [('data', (1, 3, max([v[0] for v in config.SCALES]), max([v[1] for v in config.SCALES])))] + if not has_rpn: + max_data_shape.append(('rois', (1, config.TEST.PROPOSAL_POST_NMS_TOP_N + 30, 5))) + + # create predictor + predictor = Predictor(sym, data_names, label_names, + context=ctx, max_data_shapes=max_data_shape, + provide_data=test_data.provide_data, provide_label=test_data.provide_label, + arg_params=arg_params, aux_params=aux_params) + + # start detection + pred_eval(predictor, test_data, imdb, vis=vis, 
thresh=thresh) + + +def parse_args(): + parser = argparse.ArgumentParser(description='Test a Fast R-CNN network') + # general + parser.add_argument('--network', help='network name', default=default.network, type=str) + parser.add_argument('--dataset', help='dataset name', default=default.dataset, type=str) + args, rest = parser.parse_known_args() + generate_config(args.network, args.dataset) + parser.add_argument('--image_set', help='image_set name', default=default.test_image_set, type=str) + parser.add_argument('--root_path', help='output data folder', default=default.root_path, type=str) + parser.add_argument('--dataset_path', help='dataset path', default=default.dataset_path, type=str) + # testing + parser.add_argument('--prefix', help='model to test with', default=default.rcnn_prefix, type=str) + parser.add_argument('--epoch', help='model to test with', default=default.rcnn_epoch, type=int) + parser.add_argument('--gpu', help='GPU device to test with', default=0, type=int) + # rcnn + parser.add_argument('--vis', help='turn on visualization', action='store_true') + parser.add_argument('--thresh', help='valid detection threshold', default=1e-3, type=float) + parser.add_argument('--shuffle', help='shuffle data on visualization', action='store_true') + parser.add_argument('--has_rpn', help='generate proposals on the fly', action='store_true') + parser.add_argument('--proposal', help='can be ss for selective search or rpn', default='rpn', type=str) + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + logger.info('Called with argument: %s' % args) + ctx = mx.gpu(args.gpu) + test_rcnn(args.network, args.dataset, args.image_set, args.root_path, args.dataset_path, + ctx, args.prefix, args.epoch, + args.vis, args.shuffle, args.has_rpn, args.proposal, args.thresh) + +if __name__ == '__main__': + main() diff --git a/RetinaFace/rcnn/tools/test_rpn.py b/RetinaFace/rcnn/tools/test_rpn.py new file mode 100644 index 0000000..5622227 --- /dev/null 
+++ b/RetinaFace/rcnn/tools/test_rpn.py @@ -0,0 +1,102 @@ +import argparse +import pprint +import mxnet as mx + +from ..logger import logger +from ..config import config, default, generate_config +from ..symbol import * +from ..dataset import * +from ..core.loader import TestLoader +from ..core.tester import Predictor, generate_proposals, test_proposals +from ..utils.load_model import load_param + + +def test_rpn(network, dataset, image_set, root_path, dataset_path, + ctx, prefix, epoch, + vis, shuffle, thresh, test_output=False): + # rpn generate proposal config + config.TEST.HAS_RPN = True + + # print config + logger.info(pprint.pformat(config)) + + # load symbol + sym = eval('get_' + network + '_rpn_test')() + + # load dataset and prepare imdb for training + imdb = eval(dataset)(image_set, root_path, dataset_path) + roidb = imdb.gt_roidb() + test_data = TestLoader(roidb, batch_size=1, shuffle=shuffle, has_rpn=True, withlabel=True) + + # load model + arg_params, aux_params = load_param(prefix, epoch, convert=True, ctx=ctx) + + # infer shape + data_shape_dict = dict(test_data.provide_data) + arg_shape, _, aux_shape = sym.infer_shape(**data_shape_dict) + arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape)) + aux_shape_dict = dict(zip(sym.list_auxiliary_states(), aux_shape)) + + # check parameters + for k in sym.list_arguments(): + if k in data_shape_dict or 'label' in k: + continue + assert k in arg_params, k + ' not initialized' + assert arg_params[k].shape == arg_shape_dict[k], \ + 'shape inconsistent for ' + k + ' inferred ' + str(arg_shape_dict[k]) + ' provided ' + str(arg_params[k].shape) + for k in sym.list_auxiliary_states(): + assert k in aux_params, k + ' not initialized' + assert aux_params[k].shape == aux_shape_dict[k], \ + 'shape inconsistent for ' + k + ' inferred ' + str(aux_shape_dict[k]) + ' provided ' + str(aux_params[k].shape) + + # decide maximum shape + data_names = [k[0] for k in test_data.provide_data] + label_names = None if 
test_data.provide_label is None else [k[0] for k in test_data.provide_label] + max_data_shape = [('data', (1, 3, max([v[1] for v in config.SCALES]), max([v[1] for v in config.SCALES])))] + + # create predictor + predictor = Predictor(sym, data_names, label_names, + context=ctx, max_data_shapes=max_data_shape, + provide_data=test_data.provide_data, provide_label=test_data.provide_label, + arg_params=arg_params, aux_params=aux_params) + + # start testing + if not test_output: + imdb_boxes = generate_proposals(predictor, test_data, imdb, vis=vis, thresh=thresh) + imdb.evaluate_recall(roidb, candidate_boxes=imdb_boxes) + else: + test_proposals(predictor, test_data, imdb, roidb, vis=vis) + + +def parse_args(): + parser = argparse.ArgumentParser(description='Test a Region Proposal Network') + # general + parser.add_argument('--network', help='network name', default=default.network, type=str) + parser.add_argument('--dataset', help='dataset name', default=default.dataset, type=str) + args, rest = parser.parse_known_args() + generate_config(args.network, args.dataset) + parser.add_argument('--image_set', help='image_set name', default=default.test_image_set, type=str) + parser.add_argument('--root_path', help='output data folder', default=default.root_path, type=str) + parser.add_argument('--dataset_path', help='dataset path', default=default.dataset_path, type=str) + # testing + parser.add_argument('--prefix', help='model to test with', default=default.rpn_prefix, type=str) + parser.add_argument('--epoch', help='model to test with', default=default.rpn_epoch, type=int) + # rpn + parser.add_argument('--gpu', help='GPU device to test with', default=0, type=int) + parser.add_argument('--vis', help='turn on visualization', action='store_true') + parser.add_argument('--thresh', help='rpn proposal threshold', default=0, type=float) + parser.add_argument('--shuffle', help='shuffle data on visualization', action='store_true') + args = parser.parse_args() + return args + + +def 
main(): + args = parse_args() + logger.info('Called with argument: %s' % args) + ctx = mx.gpu(args.gpu) + test_rpn(args.network, args.dataset, args.image_set, args.root_path, args.dataset_path, + ctx, args.prefix, args.epoch, + args.vis, args.shuffle, args.thresh) + +if __name__ == '__main__': + main() diff --git a/RetinaFace/rcnn/tools/train_maskrcnn.py b/RetinaFace/rcnn/tools/train_maskrcnn.py new file mode 100755 index 0000000..435ba9b --- /dev/null +++ b/RetinaFace/rcnn/tools/train_maskrcnn.py @@ -0,0 +1,210 @@ +import argparse +import logging +import pprint +import mxnet as mx +import numpy as np +import os.path as osp +import cPickle as pkl + +from ..config import config, default, generate_config +from ..symbol import * +from ..core import callback, metric +from ..core.loader import MaskROIIter +from ..core.module import MutableModule +from ..processing.bbox_regression import add_bbox_regression_targets, add_mask_targets +from ..processing.assign_levels import add_assign_targets +from ..utils.load_data import load_proposal_roidb, merge_roidb #, filter_roidb +from ..utils.load_model import load_param + +def train_maskrcnn(network, dataset, image_set, root_path, dataset_path, + frequent, kvstore, work_load_list, no_flip, no_shuffle, resume, + ctx, pretrained, epoch, prefix, begin_epoch, end_epoch, + train_shared, lr, lr_step, proposal, maskrcnn_stage=None): + # set up logger + logging.basicConfig() + logger = logging.getLogger() + logger.setLevel(logging.INFO) + + # load symbol + config.TRAIN.BATCH_IMAGES = 1 + config.TRAIN.BATCH_ROIS = 256 + sym = eval('get_' + network + '_maskrcnn')(num_classes=config.NUM_CLASSES) + + # setup multi-gpu + batch_size = len(ctx) + input_batch_size = config.TRAIN.BATCH_IMAGES * batch_size + + # print config + pprint.pprint(config) + + USE_CACHE = True + + if USE_CACHE: + roidb_file = root_path + '/cache/' + dataset + '_roidb_with_mask.pkl' + mean_file = root_path + '/cache/' + dataset + '_roidb_mean.pkl' + std_file = root_path + 
'/cache/' + dataset + '_roidb_std.pkl' + if maskrcnn_stage is not None: + roidb_file = root_path + '/cache/' + dataset + '_roidb_with_mask_' + maskrcnn_stage + '.pkl' + mean_file = root_path + '/cache/' + dataset + '_roidb_mean_' + maskrcnn_stage + '.pkl' + std_file = root_path + '/cache/' + dataset + '_roidb_std_' + maskrcnn_stage + '.pkl' + + if USE_CACHE and osp.exists(roidb_file) and osp.exists(mean_file) and osp.exists(std_file): + print 'Load ' + roidb_file + with open(roidb_file, 'r') as f: + roidb = pkl.load(f) + print 'Load ' + mean_file + with open(mean_file, 'r') as f: + means = pkl.load(f) + print 'Load ' + std_file + with open(std_file, 'r') as f: + stds = pkl.load(f) + else: + # load dataset and prepare imdb for training + image_sets = [iset for iset in image_set.split('+')] + roidbs = [load_proposal_roidb(dataset, image_set, root_path, dataset_path, + proposal=proposal, append_gt=True, flip=not no_flip) + for image_set in image_sets] + roidb = merge_roidb(roidbs) + + def filter_roidb(roidb): + """ remove roidb entries without usable rois """ + + def is_valid(entry): + """ valid images have at least 1 fg or bg roi """ + overlaps = entry['max_overlaps'] + fg_inds = np.where(overlaps >= config.TRAIN.FG_THRESH)[0] + bg_inds = np.where((overlaps < config.TRAIN.BG_THRESH_HI) & (overlaps >= config.TRAIN.BG_THRESH_LO))[0] + valid = len(fg_inds) > 0 and len(bg_inds) > 0 + return valid + + num = len(roidb) + filtered_roidb = [entry for entry in roidb if is_valid(entry)] + num_after = len(filtered_roidb) + print 'filtered %d roidb entries: %d -> %d' % (num - num_after, num, num_after) + + return filtered_roidb + + roidb = filter_roidb(roidb) + means, stds = add_bbox_regression_targets(roidb) + add_assign_targets(roidb) + add_mask_targets(roidb) + if USE_CACHE: + for file, obj in zip([roidb_file, mean_file, std_file], [roidb, means, stds]): + with open(file, 'w') as f: + pkl.dump(obj, f, -1) + + # load training data + train_data = MaskROIIter(roidb, 
batch_size=input_batch_size, shuffle=not no_shuffle, + ctx=ctx, work_load_list=work_load_list, aspect_grouping=config.TRAIN.ASPECT_GROUPING) + + # infer max shape + max_data_shape = [('data', (input_batch_size, 3, max([v[0] for v in config.SCALES]), max([v[1] for v in config.SCALES])))] + max_label_shape = [] + for s in config.RCNN_FEAT_STRIDE: + max_data_shape.append(('rois_stride%s' % s, (input_batch_size, config.TRAIN.BATCH_ROIS, 5))) + max_label_shape.append(('label_stride%s' % s, (input_batch_size, config.TRAIN.BATCH_ROIS))) + max_label_shape.append(('bbox_target_stride%s' % s, (input_batch_size, config.TRAIN.BATCH_ROIS*config.NUM_CLASSES*4))) + max_label_shape.append(('bbox_weight_stride%s' % s, (input_batch_size, config.TRAIN.BATCH_ROIS*config.NUM_CLASSES*4))) + max_label_shape.append(('mask_target_stride%s' % s, (input_batch_size, config.TRAIN.BATCH_ROIS, config.NUM_CLASSES, 28, 28))) + max_label_shape.append(('mask_weight_stride%s' % s, (input_batch_size, config.TRAIN.BATCH_ROIS, config.NUM_CLASSES, 1, 1))) + # infer shape + data_shape_dict = dict(train_data.provide_data + train_data.provide_label) + + arg_shape, out_shape, aux_shape = sym.infer_shape(**data_shape_dict) + arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape)) + out_shape_dict = zip(sym.list_outputs(), out_shape) + aux_shape_dict = dict(zip(sym.list_auxiliary_states(), aux_shape)) + print 'output shape' + pprint.pprint(out_shape_dict) + + # load and initialize params + if resume: + arg_params, aux_params = load_param(prefix, begin_epoch, convert=True) + else: + arg_params, aux_params = load_param(pretrained, epoch, convert=True) + init_bbox_pred = mx.init.Normal(sigma=0.001) + init_internal = mx.init.Normal(sigma=0.01) + init = mx.init.Xavier(factor_type="in", rnd_type='gaussian', magnitude=2) + for k in sym.list_arguments(): + if k in data_shape_dict: + continue + if k not in arg_params: + print 'init', k + arg_params[k] = mx.nd.zeros(shape=arg_shape_dict[k]) + init_internal(k, 
arg_params[k]) + if k in ['rcnn_fc_bbox_weight', 'bbox_pred_weight']: + init_bbox_pred(k, arg_params[k]) + if k.endswith('bias'): + arg_params[k] = mx.nd.zeros(shape=arg_shape_dict[k]) + if 'ctx_red_weight' in k: + ctx_shape = np.array(arg_shape_dict[k]) + ctx_shape[1] /= 2 + arg_params[k][:] = np.concatenate((np.eye(ctx_shape[1]).reshape(ctx_shape), np.zeros(ctx_shape)), axis=1) + + for k in sym.list_auxiliary_states(): + if k not in aux_params: + print 'init', k + aux_params[k] = mx.nd.zeros(shape=aux_shape_dict[k]) + init(k, aux_params[k]) + + # check parameter shapes + for k in sym.list_arguments(): + if k in data_shape_dict: + continue + assert k in arg_params, k + ' not initialized' + assert arg_params[k].shape == arg_shape_dict[k], \ + 'shape inconsistent for ' + k + ' inferred ' + str(arg_shape_dict[k]) + ' provided ' + str(arg_params[k].shape) + for k in sym.list_auxiliary_states(): + assert k in aux_params, k + ' not initialized' + assert aux_params[k].shape == aux_shape_dict[k], \ + 'shape inconsistent for ' + k + ' inferred ' + str(aux_shape_dict[k]) + ' provided ' + str(aux_params[k].shape) + + # prepare training + # create solver + data_names = [k[0] for k in train_data.provide_data] + label_names = [k[0] for k in train_data.provide_label] + if train_shared: + fixed_param_prefix = config.FIXED_PARAMS_SHARED + else: + fixed_param_prefix = config.FIXED_PARAMS + mod = MutableModule(sym, data_names=data_names, label_names=label_names, + logger=logger, context=ctx, work_load_list=work_load_list, + max_data_shapes=max_data_shape, max_label_shapes=max_label_shape, + fixed_param_prefix=fixed_param_prefix) + + # decide training params + # metric + eval_metric = metric.RCNNAccMetric() + cls_metric = metric.RCNNLogLossMetric() + bbox_metric = metric.RCNNL1LossMetric() + mask_acc_metric = metric.MaskAccMetric() + mask_log_metric = metric.MaskLogLossMetric() + eval_metrics = mx.metric.CompositeEvalMetric() + for child_metric in [eval_metric, cls_metric, 
bbox_metric, mask_acc_metric, mask_log_metric]: + eval_metrics.add(child_metric) + # callback + batch_end_callback = mx.callback.Speedometer(train_data.batch_size, frequent=frequent) + epoch_end_callback = callback.do_checkpoint(prefix, means, stds) + # decide learning rate + base_lr = lr + lr_factor = 0.1 + lr_epoch = [int(epoch) for epoch in lr_step.split(',')] + lr_epoch_diff = [epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch] + lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff))) + lr_iters = [int(epoch * len(roidb) / batch_size) for epoch in lr_epoch_diff] + print 'lr', lr, 'lr_epoch_diff', lr_epoch_diff, 'lr_iters', lr_iters + lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(lr_iters, lr_factor) + # optimizer + optimizer_params = {'momentum': 0.9, + 'wd': 0.0001, + 'learning_rate': lr, + 'lr_scheduler': lr_scheduler, + 'rescale_grad': (1.0 / batch_size), + 'clip_gradient': 5} + + # train + mod.fit(train_data, eval_metric=eval_metrics, epoch_end_callback=epoch_end_callback, + batch_end_callback=batch_end_callback, kvstore=kvstore, + optimizer='sgd', optimizer_params=optimizer_params, + arg_params=arg_params, aux_params=aux_params, begin_epoch=begin_epoch, num_epoch=end_epoch) + diff --git a/RetinaFace/rcnn/tools/train_rcnn.py b/RetinaFace/rcnn/tools/train_rcnn.py new file mode 100644 index 0000000..4d2f797 --- /dev/null +++ b/RetinaFace/rcnn/tools/train_rcnn.py @@ -0,0 +1,172 @@ +import argparse +import pprint +import mxnet as mx + +from ..logger import logger +from ..config import config, default, generate_config +from ..symbol import * +from ..core import callback, metric +from ..core.loader import ROIIter +from ..core.module import MutableModule +from ..processing.bbox_regression import add_bbox_regression_targets +from ..utils.load_data import load_proposal_roidb, merge_roidb, filter_roidb +from ..utils.load_model import load_param + + +def train_rcnn(network, dataset, image_set, root_path, dataset_path, + frequent, 
kvstore, work_load_list, no_flip, no_shuffle, resume, + ctx, pretrained, epoch, prefix, begin_epoch, end_epoch, + train_shared, lr, lr_step, proposal): + # set up config + config.TRAIN.BATCH_IMAGES = 2 + config.TRAIN.BATCH_ROIS = 128 + if proposal == 'ss': + config.TRAIN.BG_THRESH_LO = 0.1 # reproduce Fast R-CNN + + # load symbol + sym = eval('get_' + network + '_rcnn')(num_classes=config.NUM_CLASSES) + + # setup multi-gpu + batch_size = len(ctx) + input_batch_size = config.TRAIN.BATCH_IMAGES * batch_size + + # print config + logger.info(pprint.pformat(config)) + + # load dataset and prepare imdb for training + image_sets = [iset for iset in image_set.split('+')] + roidbs = [load_proposal_roidb(dataset, image_set, root_path, dataset_path, + proposal=proposal, append_gt=True, flip=not no_flip) + for image_set in image_sets] + roidb = merge_roidb(roidbs) + roidb = filter_roidb(roidb) + means, stds = add_bbox_regression_targets(roidb) + + # load training data + train_data = ROIIter(roidb, batch_size=input_batch_size, shuffle=not no_shuffle, + ctx=ctx, work_load_list=work_load_list, aspect_grouping=config.TRAIN.ASPECT_GROUPING) + + # infer max shape + max_data_shape = [('data', (input_batch_size, 3, max([v[0] for v in config.SCALES]), max([v[1] for v in config.SCALES])))] + logger.info('providing maximum shape %s' % max_data_shape) + + # infer shape + data_shape_dict = dict(train_data.provide_data + train_data.provide_label) + arg_shape, out_shape, aux_shape = sym.infer_shape(**data_shape_dict) + arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape)) + out_shape_dict = dict(zip(sym.list_outputs(), out_shape)) + aux_shape_dict = dict(zip(sym.list_auxiliary_states(), aux_shape)) + logger.info('output shape %s' % pprint.pformat(out_shape_dict)) + + # load and initialize params + if resume: + arg_params, aux_params = load_param(prefix, begin_epoch, convert=True) + else: + arg_params, aux_params = load_param(pretrained, epoch, convert=True) + 
arg_params['cls_score_weight'] = mx.random.normal(0, 0.01, shape=arg_shape_dict['cls_score_weight']) + arg_params['cls_score_bias'] = mx.nd.zeros(shape=arg_shape_dict['cls_score_bias']) + arg_params['bbox_pred_weight'] = mx.random.normal(0, 0.001, shape=arg_shape_dict['bbox_pred_weight']) + arg_params['bbox_pred_bias'] = mx.nd.zeros(shape=arg_shape_dict['bbox_pred_bias']) + + # check parameter shapes + for k in sym.list_arguments(): + if k in data_shape_dict: + continue + assert k in arg_params, k + ' not initialized' + assert arg_params[k].shape == arg_shape_dict[k], \ + 'shape inconsistent for ' + k + ' inferred ' + str(arg_shape_dict[k]) + ' provided ' + str(arg_params[k].shape) + for k in sym.list_auxiliary_states(): + assert k in aux_params, k + ' not initialized' + assert aux_params[k].shape == aux_shape_dict[k], \ + 'shape inconsistent for ' + k + ' inferred ' + str(aux_shape_dict[k]) + ' provided ' + str(aux_params[k].shape) + + # prepare training + # create solver + data_names = [k[0] for k in train_data.provide_data] + label_names = [k[0] for k in train_data.provide_label] + if train_shared: + fixed_param_prefix = config.FIXED_PARAMS_SHARED + else: + fixed_param_prefix = config.FIXED_PARAMS + mod = MutableModule(sym, data_names=data_names, label_names=label_names, + logger=logger, context=ctx, work_load_list=work_load_list, + max_data_shapes=max_data_shape, fixed_param_prefix=fixed_param_prefix) + + # decide training params + # metric + eval_metric = metric.RCNNAccMetric() + cls_metric = metric.RCNNLogLossMetric() + bbox_metric = metric.RCNNL1LossMetric() + eval_metrics = mx.metric.CompositeEvalMetric() + for child_metric in [eval_metric, cls_metric, bbox_metric]: + eval_metrics.add(child_metric) + # callback + batch_end_callback = mx.callback.Speedometer(train_data.batch_size, frequent=frequent, auto_reset=False) + epoch_end_callback = callback.do_checkpoint(prefix, means, stds) + # decide learning rate + base_lr = lr + lr_factor = 0.1 + lr_epoch = 
[int(epoch) for epoch in lr_step.split(',')] + lr_epoch_diff = [epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch] + lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff))) + lr_iters = [int(epoch * len(roidb) / batch_size) for epoch in lr_epoch_diff] + logger.info('lr %f lr_epoch_diff %s lr_iters %s' % (lr, lr_epoch_diff, lr_iters)) + lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(lr_iters, lr_factor) + # optimizer + optimizer_params = {'momentum': 0.9, + 'wd': 0.0005, + 'learning_rate': lr, + 'lr_scheduler': lr_scheduler, + 'rescale_grad': (1.0 / batch_size), + 'clip_gradient': 5} + + # train + mod.fit(train_data, eval_metric=eval_metrics, epoch_end_callback=epoch_end_callback, + batch_end_callback=batch_end_callback, kvstore=kvstore, + optimizer='sgd', optimizer_params=optimizer_params, + arg_params=arg_params, aux_params=aux_params, begin_epoch=begin_epoch, num_epoch=end_epoch) + + +def parse_args(): + parser = argparse.ArgumentParser(description='Train a Fast R-CNN Network') + # general + parser.add_argument('--network', help='network name', default=default.network, type=str) + parser.add_argument('--dataset', help='dataset name', default=default.dataset, type=str) + args, rest = parser.parse_known_args() + generate_config(args.network, args.dataset) + parser.add_argument('--image_set', help='image_set name', default=default.image_set, type=str) + parser.add_argument('--root_path', help='output data folder', default=default.root_path, type=str) + parser.add_argument('--dataset_path', help='dataset path', default=default.dataset_path, type=str) + # training + parser.add_argument('--frequent', help='frequency of logging', default=default.frequent, type=int) + parser.add_argument('--kvstore', help='the kv-store type', default=default.kvstore, type=str) + parser.add_argument('--work_load_list', help='work load for different devices', default=None, type=list) + parser.add_argument('--no_flip', help='disable flip images', 
action='store_true') + parser.add_argument('--no_shuffle', help='disable random shuffle', action='store_true') + parser.add_argument('--resume', help='continue training', action='store_true') + # rcnn + parser.add_argument('--gpus', help='GPU device to train with', default='0', type=str) + parser.add_argument('--pretrained', help='pretrained model prefix', default=default.pretrained, type=str) + parser.add_argument('--pretrained_epoch', help='pretrained model epoch', default=default.pretrained_epoch, type=int) + parser.add_argument('--prefix', help='new model prefix', default=default.rcnn_prefix, type=str) + parser.add_argument('--begin_epoch', help='begin epoch of training', default=0, type=int) + parser.add_argument('--end_epoch', help='end epoch of training', default=default.rcnn_epoch, type=int) + parser.add_argument('--lr', help='base learning rate', default=default.rcnn_lr, type=float) + parser.add_argument('--lr_step', help='learning rate steps (in epoch)', default=default.rcnn_lr_step, type=str) + parser.add_argument('--train_shared', help='second round train shared params', action='store_true') + parser.add_argument('--proposal', help='can be ss for selective search or rpn', default='rpn', type=str) + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + logger.info('Called with argument: %s' % args) + ctx = [mx.gpu(int(i)) for i in args.gpus.split(',')] + train_rcnn(args.network, args.dataset, args.image_set, args.root_path, args.dataset_path, + args.frequent, args.kvstore, args.work_load_list, args.no_flip, args.no_shuffle, args.resume, + ctx, args.pretrained, args.pretrained_epoch, args.prefix, args.begin_epoch, args.end_epoch, + train_shared=args.train_shared, lr=args.lr, lr_step=args.lr_step, proposal=args.proposal) + +if __name__ == '__main__': + main() diff --git a/RetinaFace/rcnn/tools/train_rpn.py b/RetinaFace/rcnn/tools/train_rpn.py new file mode 100644 index 0000000..5a9e296 --- /dev/null +++ 
b/RetinaFace/rcnn/tools/train_rpn.py @@ -0,0 +1,195 @@ +import argparse +import logging +import pprint +import mxnet as mx + +from ..config import config, default, generate_config +from ..symbol import * +from ..core import callback, metric +from ..core.loader import AnchorLoaderFPN +from ..core.module import MutableModule +from ..utils.load_data import load_gt_roidb, merge_roidb, filter_roidb +from ..utils.load_model import load_param + + +def train_rpn(network, dataset, image_set, root_path, dataset_path, + frequent, kvstore, work_load_list, no_flip, no_shuffle, resume, + ctx, pretrained, epoch, prefix, begin_epoch, end_epoch, + train_shared, lr, lr_step): + # set up logger + logging.basicConfig() + logger = logging.getLogger() + logger.setLevel(logging.INFO) + + # setup config + assert config.TRAIN.BATCH_IMAGES==1 + + # load symbol + sym = eval('get_' + network + '_rpn')() + feat_sym = [] + for stride in config.RPN_FEAT_STRIDE: + feat_sym.append(sym.get_internals()['rpn_cls_score_stride%s_output' % stride]) + + + # setup multi-gpu + batch_size = len(ctx) + input_batch_size = config.TRAIN.BATCH_IMAGES * batch_size + + # print config + pprint.pprint(config) + + # load dataset and prepare imdb for training + image_sets = [iset for iset in image_set.split('+')] + roidbs = [load_gt_roidb(dataset, image_set, root_path, dataset_path, + flip=not no_flip) + for image_set in image_sets] + roidb = merge_roidb(roidbs) + roidb = filter_roidb(roidb) + + # load training data + #train_data = AnchorLoaderFPN(feat_sym, roidb, batch_size=input_batch_size, shuffle=not no_shuffle, + # ctx=ctx, work_load_list=work_load_list, + # feat_stride=config.RPN_FEAT_STRIDE, anchor_scales=config.ANCHOR_SCALES, + # anchor_ratios=config.ANCHOR_RATIOS, aspect_grouping=config.TRAIN.ASPECT_GROUPING, + # allowed_border=9999) + train_data = AnchorLoaderFPN(feat_sym, roidb, batch_size=input_batch_size, shuffle=not no_shuffle, + ctx=ctx, work_load_list=work_load_list) + + # infer max shape + 
max_data_shape = [('data', (input_batch_size, 3, max([v[0] for v in config.SCALES]), max([v[1] for v in config.SCALES])))] + max_data_shape, max_label_shape = train_data.infer_shape(max_data_shape) + print 'providing maximum shape', max_data_shape, max_label_shape + + # infer shape + data_shape_dict = dict(train_data.provide_data + train_data.provide_label) + arg_shape, out_shape, aux_shape = sym.infer_shape(**data_shape_dict) + arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape)) + out_shape_dict = zip(sym.list_outputs(), out_shape) + aux_shape_dict = dict(zip(sym.list_auxiliary_states(), aux_shape)) + print 'output shape' + pprint.pprint(out_shape_dict) + + # load and initialize params + if resume: + arg_params, aux_params = load_param(prefix, begin_epoch, convert=True) + else: + arg_params, aux_params = load_param(pretrained, epoch, convert=True) + init = mx.init.Xavier(factor_type="in", rnd_type='gaussian', magnitude=2) + init_internal = mx.init.Normal(sigma=0.01) + for k in sym.list_arguments(): + if k in data_shape_dict: + continue + if k not in arg_params: + print 'init', k + arg_params[k] = mx.nd.zeros(shape=arg_shape_dict[k]) + if not k.endswith('bias'): + init_internal(k, arg_params[k]) + + for k in sym.list_auxiliary_states(): + if k not in aux_params: + print 'init', k + aux_params[k] = mx.nd.zeros(shape=aux_shape_dict[k]) + init(k, aux_params[k]) + + # check parameter shapes + for k in sym.list_arguments(): + if k in data_shape_dict: + continue + assert k in arg_params, k + ' not initialized' + assert arg_params[k].shape == arg_shape_dict[k], \ + 'shape inconsistent for ' + k + ' inferred ' + str(arg_shape_dict[k]) + ' provided ' + str(arg_params[k].shape) + for k in sym.list_auxiliary_states(): + assert k in aux_params, k + ' not initialized' + assert aux_params[k].shape == aux_shape_dict[k], \ + 'shape inconsistent for ' + k + ' inferred ' + str(aux_shape_dict[k]) + ' provided ' + str(aux_params[k].shape) + + # create solver + data_names = 
[k[0] for k in train_data.provide_data] + label_names = [k[0] for k in train_data.provide_label] + if train_shared: + fixed_param_prefix = config.FIXED_PARAMS_SHARED + else: + fixed_param_prefix = config.FIXED_PARAMS + mod = MutableModule(sym, data_names=data_names, label_names=label_names, + logger=logger, context=ctx, work_load_list=work_load_list, + max_data_shapes=max_data_shape, max_label_shapes=max_label_shape, + fixed_param_prefix=fixed_param_prefix) + + # decide training params + # metric + eval_metric = metric.RPNAccMetric() + cls_metric = metric.RPNLogLossMetric() + bbox_metric = metric.RPNL1LossMetric() + eval_metrics = mx.metric.CompositeEvalMetric() + for child_metric in [eval_metric,cls_metric,bbox_metric]: + eval_metrics.add(child_metric) + # callback + batch_end_callback = [] + batch_end_callback.append(mx.callback.Speedometer(train_data.batch_size, frequent=frequent)) + epoch_end_callback = mx.callback.do_checkpoint(prefix) + # decide learning rate + base_lr = lr + lr_factor = 0.1 + lr_epoch = [int(epoch) for epoch in lr_step.split(',')] + lr_epoch_diff = [epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch] + lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff))) + lr_iters = [int(epoch * len(roidb) / batch_size) for epoch in lr_epoch_diff] + print 'lr', lr, 'lr_epoch_diff', lr_epoch_diff, 'lr_iters', lr_iters + lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(lr_iters, lr_factor) + # optimizer + optimizer_params = {'momentum': 0.9, + 'wd': 0.0001, + 'learning_rate': lr, + 'lr_scheduler': lr_scheduler, + 'rescale_grad': (1.0 / batch_size), + 'clip_gradient': 5} + + # train + mod.fit(train_data, eval_metric=eval_metrics, epoch_end_callback=epoch_end_callback, + batch_end_callback=batch_end_callback, kvstore=kvstore, + optimizer='sgd', optimizer_params=optimizer_params, + arg_params=arg_params, aux_params=aux_params, begin_epoch=begin_epoch, num_epoch=end_epoch) + + +def parse_args(): + parser = 
argparse.ArgumentParser(description='Train a Region Proposal Network') + # general + parser.add_argument('--network', help='network name', default=default.network, type=str) + parser.add_argument('--dataset', help='dataset name', default=default.dataset, type=str) + args, rest = parser.parse_known_args() + generate_config(args.network, args.dataset) + parser.add_argument('--image_set', help='image_set name', default=default.image_set, type=str) + parser.add_argument('--root_path', help='output data folder', default=default.root_path, type=str) + parser.add_argument('--dataset_path', help='dataset path', default=default.dataset_path, type=str) + # training + parser.add_argument('--frequent', help='frequency of logging', default=default.frequent, type=int) + parser.add_argument('--kvstore', help='the kv-store type', default=default.kvstore, type=str) + parser.add_argument('--work_load_list', help='work load for different devices', default=None, type=list) + parser.add_argument('--no_flip', help='disable flip images', action='store_true') + parser.add_argument('--no_shuffle', help='disable random shuffle', action='store_true') + parser.add_argument('--resume', help='continue training', action='store_true') + # rpn + parser.add_argument('--gpus', help='GPU device to train with', default='0', type=str) + parser.add_argument('--pretrained', help='pretrained model prefix', default=default.pretrained, type=str) + parser.add_argument('--pretrained_epoch', help='pretrained model epoch', default=default.pretrained_epoch, type=int) + parser.add_argument('--prefix', help='new model prefix', default=default.rpn_prefix, type=str) + parser.add_argument('--begin_epoch', help='begin epoch of training', default=0, type=int) + parser.add_argument('--end_epoch', help='end epoch of training', default=default.rpn_epoch, type=int) + parser.add_argument('--lr', help='base learning rate', default=default.rpn_lr, type=float) + parser.add_argument('--lr_step', help='learning rate steps (in 
epoch)', default=default.rpn_lr_step, type=str) + parser.add_argument('--train_shared', help='second round train shared params', action='store_true') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + print 'Called with argument:', args + ctx = [mx.gpu(int(i)) for i in args.gpus.split(',')] + train_rpn(args.network, args.dataset, args.image_set, args.root_path, args.dataset_path, + args.frequent, args.kvstore, args.work_load_list, args.no_flip, args.no_shuffle, args.resume, + ctx, args.pretrained, args.pretrained_epoch, args.prefix, args.begin_epoch, args.end_epoch, + train_shared=args.train_shared, lr=args.lr, lr_step=args.lr_step) + +if __name__ == '__main__': + main() diff --git a/RetinaFace/rcnn/utils/__init__.py b/RetinaFace/rcnn/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/RetinaFace/rcnn/utils/combine_model.py b/RetinaFace/rcnn/utils/combine_model.py new file mode 100644 index 0000000..824efef --- /dev/null +++ b/RetinaFace/rcnn/utils/combine_model.py @@ -0,0 +1,22 @@ +from .load_model import load_checkpoint +from .save_model import save_checkpoint + + +def combine_model(prefix1, epoch1, prefix2, epoch2, prefix_out, epoch_out): + args1, auxs1 = load_checkpoint(prefix1, epoch1) + args2, auxs2 = load_checkpoint(prefix2, epoch2) + arg_names = args1.keys() + args2.keys() + aux_names = auxs1.keys() + auxs2.keys() + args = dict() + for arg in arg_names: + if arg in args1: + args[arg] = args1[arg] + else: + args[arg] = args2[arg] + auxs = dict() + for aux in aux_names: + if aux in auxs1: + auxs[aux] = auxs1[aux] + else: + auxs[aux] = auxs2[aux] + save_checkpoint(prefix_out, epoch_out, args, auxs) diff --git a/RetinaFace/rcnn/utils/load_data.py b/RetinaFace/rcnn/utils/load_data.py new file mode 100644 index 0000000..180814c --- /dev/null +++ b/RetinaFace/rcnn/utils/load_data.py @@ -0,0 +1,55 @@ +import numpy as np +from ..logger import logger +from ..config import config +from ..dataset import * + + +def 
load_gt_roidb(dataset_name, image_set_name, root_path, dataset_path, + flip=False): + """ load ground truth roidb """ + imdb = eval(dataset_name)(image_set_name, root_path, dataset_path) + roidb = imdb.gt_roidb() + print('roidb size', len(roidb)) + if flip: + roidb = imdb.append_flipped_images(roidb) + print('flipped roidb size', len(roidb)) + return roidb + + +def load_proposal_roidb(dataset_name, image_set_name, root_path, dataset_path, + proposal='rpn', append_gt=True, flip=False): + """ load proposal roidb (append_gt when training) """ + imdb = eval(dataset_name)(image_set_name, root_path, dataset_path) + gt_roidb = imdb.gt_roidb() + roidb = eval('imdb.' + proposal + '_roidb')(gt_roidb, append_gt) + if flip: + roidb = imdb.append_flipped_images(roidb) + return roidb + + +def merge_roidb(roidbs): + """ roidb are list, concat them together """ + roidb = roidbs[0] + for r in roidbs[1:]: + roidb.extend(r) + return roidb + + +def filter_roidb(roidb): + """ remove roidb entries without usable rois """ + + def is_valid(entry): + """ valid images have at least 1 fg or bg roi """ + overlaps = entry['max_overlaps'] + fg_inds = np.where(overlaps >= config.TRAIN.FG_THRESH)[0] + bg_inds = np.where((overlaps < config.TRAIN.BG_THRESH_HI) & (overlaps >= config.TRAIN.BG_THRESH_LO))[0] + valid = len(fg_inds) > 0 or len(bg_inds) > 0 + #valid = len(fg_inds) > 0 + return valid + + num = len(roidb) + filtered_roidb = [entry for entry in roidb if is_valid(entry)] + num_after = len(filtered_roidb) + logger.info('load data: filtered %d roidb entries: %d -> %d' % (num - num_after, num, num_after)) + + return filtered_roidb diff --git a/RetinaFace/rcnn/utils/load_model.py b/RetinaFace/rcnn/utils/load_model.py new file mode 100644 index 0000000..6f83548 --- /dev/null +++ b/RetinaFace/rcnn/utils/load_model.py @@ -0,0 +1,59 @@ +import mxnet as mx + + +def load_checkpoint(prefix, epoch): + """ + Load model checkpoint from file. + :param prefix: Prefix of model name. 
+ :param epoch: Epoch number of model we would like to load. + :return: (arg_params, aux_params) + arg_params : dict of str to NDArray + Model parameter, dict of name to NDArray of net's weights. + aux_params : dict of str to NDArray + Model parameter, dict of name to NDArray of net's auxiliary states. + """ + save_dict = mx.nd.load('%s-%04d.params' % (prefix, epoch)) + arg_params = {} + aux_params = {} + for k, v in save_dict.items(): + tp, name = k.split(':', 1) + if tp == 'arg': + arg_params[name] = v + if tp == 'aux': + aux_params[name] = v + return arg_params, aux_params + + +def convert_context(params, ctx): + """ + :param params: dict of str to NDArray + :param ctx: the context to convert to + :return: dict of str of NDArray with context ctx + """ + new_params = dict() + for k, v in params.items(): + new_params[k] = v.as_in_context(ctx) + return new_params + + +def load_param(prefix, epoch, convert=False, ctx=None, process=False): + """ + wrapper for load checkpoint + :param prefix: Prefix of model name. + :param epoch: Epoch number of model we would like to load. + :param convert: reference model should be converted to GPU NDArray first + :param ctx: if convert then ctx must be designated. 
+ :param process: model should drop any test + :return: (arg_params, aux_params) + """ + arg_params, aux_params = load_checkpoint(prefix, epoch) + if convert: + if ctx is None: + ctx = mx.cpu() + arg_params = convert_context(arg_params, ctx) + aux_params = convert_context(aux_params, ctx) + if process: + tests = [k for k in arg_params.keys() if '_test' in k] + for test in tests: + arg_params[test.replace('_test', '')] = arg_params.pop(test) + return arg_params, aux_params diff --git a/RetinaFace/rcnn/utils/save_model.py b/RetinaFace/rcnn/utils/save_model.py new file mode 100644 index 0000000..1c98869 --- /dev/null +++ b/RetinaFace/rcnn/utils/save_model.py @@ -0,0 +1,18 @@ +import mxnet as mx + + +def save_checkpoint(prefix, epoch, arg_params, aux_params): + """Checkpoint the model data into file. + :param prefix: Prefix of model name. + :param epoch: The epoch number of the model. + :param arg_params: dict of str to NDArray + Model parameter, dict of name to NDArray of net's weights. + :param aux_params: dict of str to NDArray + Model parameter, dict of name to NDArray of net's auxiliary states. + :return: None + prefix-epoch.params will be saved for parameters. 
+ """ + save_dict = {('arg:%s' % k) : v for k, v in arg_params.items()} + save_dict.update({('aux:%s' % k) : v for k, v in aux_params.items()}) + param_name = '%s-%04d.params' % (prefix, epoch) + mx.nd.save(param_name, save_dict) diff --git a/retinaface/retinaface.py b/RetinaFace/retinaface.py similarity index 100% rename from retinaface/retinaface.py rename to RetinaFace/retinaface.py diff --git a/retinaface/test_widerface.py b/RetinaFace/test_widerface.py similarity index 98% rename from retinaface/test_widerface.py rename to RetinaFace/test_widerface.py index 1e87125..1572e64 100644 --- a/retinaface/test_widerface.py +++ b/RetinaFace/test_widerface.py @@ -185,10 +185,9 @@ def test(args): def main(): global args args = parse_args() - if args.mode==0: - args.pyramid = False - args.bbox_vote = False - else: + args.pyramid = False + args.bbox_vote = False + if args.mode==1: args.pyramid = True args.bbox_vote = True logger.info('Called with argument: %s' % args) diff --git a/RetinaFace/train.py b/RetinaFace/train.py new file mode 100644 index 0000000..e316e53 --- /dev/null +++ b/RetinaFace/train.py @@ -0,0 +1,329 @@ +from __future__ import print_function +import sys +import argparse +import os +import pprint +import re +import mxnet as mx +import numpy as np +from mxnet.module import Module +import mxnet.optimizer as optimizer + +from rcnn.logger import logger +from rcnn.config import config, default, generate_config +from rcnn.symbol import * +from rcnn.core import callback, metric +from rcnn.core.loader import CropLoader, CropLoader2 +from rcnn.core.module import MutableModule +from rcnn.utils.load_data import load_gt_roidb, merge_roidb, filter_roidb +from rcnn.utils.load_model import load_param + + +def get_fixed_params(symbol, fixed_param): + if not config.LAYER_FIX: + return [] + fixed_param_names = [] + #for name in symbol.list_arguments(): + # for f in fixed_param: + # if re.match(f, name): + # fixed_param_names.append(name) + #pre = 
'mobilenetv20_features_linearbottleneck' + idx = 0 + for name in symbol.list_arguments(): + #print(idx, name) + if idx<7 and name!='data': + fixed_param_names.append(name) + #elif name.startswith('stage1_'): + # fixed_param_names.append(name) + if name.find('upsampling')>=0: + fixed_param_names.append(name) + + idx+=1 + return fixed_param_names + +def train_net(args, ctx, pretrained, epoch, prefix, begin_epoch, end_epoch, + lr=0.001, lr_step='5'): + # setup config + #init_config() + #print(config) + # setup multi-gpu + + input_batch_size = config.TRAIN.BATCH_IMAGES * len(ctx) + + # print config + logger.info(pprint.pformat(config)) + + # load dataset and prepare imdb for training + image_sets = [iset for iset in args.image_set.split('+')] + roidbs = [load_gt_roidb(args.dataset, image_set, args.root_path, args.dataset_path, + flip=not args.no_flip) + for image_set in image_sets] + #roidb = merge_roidb(roidbs) + #roidb = filter_roidb(roidb) + roidb = roidbs[0] + + # load symbol + #sym = eval('get_' + args.network + '_train')(num_classes=config.NUM_CLASSES, num_anchors=config.NUM_ANCHORS) + #feat_sym = sym.get_internals()['rpn_cls_score_output'] + #train_data = AnchorLoader(feat_sym, roidb, batch_size=input_batch_size, shuffle=not args.no_shuffle, + # ctx=ctx, work_load_list=args.work_load_list, + # feat_stride=config.RPN_FEAT_STRIDE, anchor_scales=config.ANCHOR_SCALES, + # anchor_ratios=config.ANCHOR_RATIOS, aspect_grouping=config.TRAIN.ASPECT_GROUPING) + + # load and initialize params + sym = None + if len(pretrained)==0: + arg_params = {} + aux_params = {} + else: + logger.info('loading %s,%d'%(pretrained, epoch)) + sym, arg_params, aux_params = mx.model.load_checkpoint(pretrained, epoch) + #arg_params, aux_params = load_param(pretrained, epoch, convert=True) + #for k in ['rpn_conv_3x3', 'rpn_cls_score', 'rpn_bbox_pred', 'cls_score', 'bbox_pred']: + # _k = k+"_weight" + # if _k in arg_shape_dict: + # v = 0.001 if _k.startswith('bbox_') else 0.01 + # arg_params[_k] 
= mx.random.normal(0, v, shape=arg_shape_dict[_k]) + # print('init %s with normal %.5f'%(_k,v)) + # _k = k+"_bias" + # if _k in arg_shape_dict: + # arg_params[_k] = mx.nd.zeros(shape=arg_shape_dict[_k]) + # print('init %s with zero'%(_k)) + + sym = eval('get_' + args.network + '_train')(sym) + #print(sym.get_internals()) + feat_sym = [] + for stride in config.RPN_FEAT_STRIDE: + feat_sym.append(sym.get_internals()['face_rpn_cls_score_stride%s_output' % stride]) + + + + train_data = CropLoader(feat_sym, roidb, batch_size=input_batch_size, shuffle=not args.no_shuffle, + ctx=ctx, work_load_list=args.work_load_list) + + + # infer max shape + max_data_shape = [('data', (1, 3, max([v[1] for v in config.SCALES]), max([v[1] for v in config.SCALES])))] + #max_data_shape = [('data', (1, 3, max([v[1] for v in config.SCALES]), max([v[1] for v in config.SCALES])))] + max_data_shape, max_label_shape = train_data.infer_shape(max_data_shape) + max_data_shape.append(('gt_boxes', (1, roidb[0]['max_num_boxes'], 5))) + logger.info('providing maximum shape %s %s' % (max_data_shape, max_label_shape)) + + # infer shape + data_shape_dict = dict(train_data.provide_data + train_data.provide_label) + arg_shape, out_shape, aux_shape = sym.infer_shape(**data_shape_dict) + arg_shape_dict = dict(zip(sym.list_arguments(), arg_shape)) + out_shape_dict = dict(zip(sym.list_outputs(), out_shape)) + aux_shape_dict = dict(zip(sym.list_auxiliary_states(), aux_shape)) + logger.info('output shape %s' % pprint.pformat(out_shape_dict)) + + + for k,v in arg_shape_dict.iteritems(): + if k.find('upsampling')>=0: + print('initializing upsampling_weight', k) + arg_params[k] = mx.nd.zeros(shape=v) + init = mx.init.Initializer() + init._init_bilinear(k, arg_params[k]) + #print(args[k]) + + # check parameter shapes + #for k in sym.list_arguments(): + # if k in data_shape_dict: + # continue + # assert k in arg_params, k + ' not initialized' + # assert arg_params[k].shape == arg_shape_dict[k], \ + # 'shape 
inconsistent for ' + k + ' inferred ' + str(arg_shape_dict[k]) + ' provided ' + str(arg_params[k].shape) + #for k in sym.list_auxiliary_states(): + # assert k in aux_params, k + ' not initialized' + # assert aux_params[k].shape == aux_shape_dict[k], \ + # 'shape inconsistent for ' + k + ' inferred ' + str(aux_shape_dict[k]) + ' provided ' + str(aux_params[k].shape) + + fixed_param_prefix = config.FIXED_PARAMS + # create solver + data_names = [k[0] for k in train_data.provide_data] + label_names = [k[0] for k in train_data.provide_label] + fixed_param_names = get_fixed_params(sym, fixed_param_prefix) + print('fixed', fixed_param_names, file=sys.stderr) + mod = Module(sym, data_names=data_names, label_names=label_names, + logger=logger, context=ctx, work_load_list=args.work_load_list, + fixed_param_names=fixed_param_names) + + # metric + eval_metrics = mx.metric.CompositeEvalMetric() + mid=0 + for m in range(len(config.RPN_FEAT_STRIDE)): + stride = config.RPN_FEAT_STRIDE[m] + #mid = m*MSTEP + _metric = metric.RPNAccMetric(pred_idx=mid, label_idx=mid+1, name='RPNAcc_s%s'%stride) + eval_metrics.add(_metric) + mid+=2 + #_metric = metric.RPNLogLossMetric(pred_idx=mid, label_idx=mid+1) + #eval_metrics.add(_metric) + + _metric = metric.RPNL1LossMetric(loss_idx=mid, weight_idx=mid+1, name='RPNL1Loss_s%s'%stride) + eval_metrics.add(_metric) + mid+=2 + if config.FACE_LANDMARK: + _metric = metric.RPNL1LossMetric(loss_idx=mid, weight_idx=mid+1, name='RPNLandMarkL1Loss_s%s'%stride) + eval_metrics.add(_metric) + mid+=2 + if config.HEAD_BOX: + _metric = metric.RPNAccMetric(pred_idx=mid, label_idx=mid+1, name='RPNAcc_head_s%s'%stride) + eval_metrics.add(_metric) + mid+=2 + #_metric = metric.RPNLogLossMetric(pred_idx=mid, label_idx=mid+1) + #eval_metrics.add(_metric) + + _metric = metric.RPNL1LossMetric(loss_idx=mid, weight_idx=mid+1, name='RPNL1Loss_head_s%s'%stride) + eval_metrics.add(_metric) + mid+=2 + + # callback + #means = np.tile(np.array(config.TRAIN.BBOX_MEANS), 
config.NUM_CLASSES) + #stds = np.tile(np.array(config.TRAIN.BBOX_STDS), config.NUM_CLASSES) + #epoch_end_callback = callback.do_checkpoint(prefix, means, stds) + epoch_end_callback = None + # decide learning rate + #base_lr = lr + #lr_factor = 0.1 + #lr = base_lr * (lr_factor ** (len(lr_epoch) - len(lr_epoch_diff))) + + lr_epoch = [int(epoch) for epoch in lr_step.split(',')] + lr_epoch_diff = [epoch - begin_epoch for epoch in lr_epoch if epoch > begin_epoch] + lr_iters = [int(epoch * len(roidb) / input_batch_size) for epoch in lr_epoch_diff] + + lr_steps = [] + if len(lr_iters)==5: + factors = [0.5, 0.5, 0.4, 0.1, 0.1] + for i in range(5): + lr_steps.append( (lr_iters[i], factors[i]) ) + elif len(lr_iters)==8: #warmup + for li in lr_iters[0:5]: + lr_steps.append( (li, 1.5849) ) + for li in lr_iters[5:]: + lr_steps.append( (li, 0.1) ) + else: + for li in lr_iters: + lr_steps.append( (li, 0.1) ) + #lr_steps = [ (20,0.1), (40, 0.1) ] #XXX + + end_epoch = 10000 + logger.info('lr %f lr_epoch_diff %s lr_steps %s' % (lr, lr_epoch_diff, lr_steps)) + # optimizer + opt = optimizer.SGD(learning_rate=lr, momentum=0.9, wd=0.0005, rescale_grad=1.0/len(ctx), clip_gradient=None) + initializer=mx.init.Xavier() + #initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="out", magnitude=2) #resnet style + + train_data = mx.io.PrefetchingIter(train_data) + + _cb = mx.callback.Speedometer(train_data.batch_size, frequent=args.frequent, auto_reset=False) + global_step = [0] + + def save_model(epoch): + arg, aux = mod.get_params() + all_layers = mod.symbol.get_internals() + outs = [] + for stride in config.RPN_FEAT_STRIDE: + num_anchors = config.RPN_ANCHOR_CFG[str(stride)]['NUM_ANCHORS'] + _name = 'face_rpn_cls_score_stride%d_output' % stride + rpn_cls_score = all_layers[_name] + + + # prepare rpn data + rpn_cls_score_reshape = mx.symbol.Reshape(data=rpn_cls_score, + shape=(0, 2, -1, 0), + name="face_rpn_cls_score_reshape_stride%d" % stride) + + rpn_cls_prob = 
mx.symbol.SoftmaxActivation(data=rpn_cls_score_reshape, + mode="channel", + name="face_rpn_cls_prob_stride%d" % stride) + rpn_cls_prob_reshape = mx.symbol.Reshape(data=rpn_cls_prob, + shape=(0, 2 * num_anchors, -1, 0), + name='face_rpn_cls_prob_reshape_stride%d' % stride) + _name = 'face_rpn_bbox_pred_stride%d_output' % stride + rpn_bbox_pred = all_layers[_name] + outs.append(rpn_cls_prob_reshape) + outs.append(rpn_bbox_pred) + if config.FACE_LANDMARK: + _name = 'face_rpn_landmark_pred_stride%d_output' % stride + rpn_landmark_pred = all_layers[_name] + outs.append(rpn_landmark_pred) + _sym = mx.sym.Group(outs) + mx.model.save_checkpoint(prefix, epoch, _sym, arg, aux) + + def _batch_callback(param): + #global global_step + _cb(param) + global_step[0]+=1 + mbatch = global_step[0] + for step in lr_steps: + if mbatch==step[0]: + opt.lr *= step[1] + print('lr change to', opt.lr,' in batch', mbatch, file=sys.stderr) + break + + if mbatch==lr_steps[-1][0]: + print('saving final checkpoint', mbatch, file=sys.stderr) + save_model(0) + #arg, aux = mod.get_params() + #mx.model.save_checkpoint(prefix, 99, mod.symbol, arg, aux) + sys.exit(0) + + # train + mod.fit(train_data, eval_metric=eval_metrics, epoch_end_callback=epoch_end_callback, + batch_end_callback=_batch_callback, kvstore=args.kvstore, + optimizer=opt, + initializer = initializer, + allow_missing=True, + arg_params=arg_params, aux_params=aux_params, begin_epoch=begin_epoch, num_epoch=end_epoch) + + +def parse_args(): + parser = argparse.ArgumentParser(description='Train RetinaFace') + # general + parser.add_argument('--network', help='network name', default=default.network, type=str) + parser.add_argument('--dataset', help='dataset name', default=default.dataset, type=str) + args, rest = parser.parse_known_args() + generate_config(args.network, args.dataset) + parser.add_argument('--image_set', help='image_set name', default=default.image_set, type=str) + parser.add_argument('--root_path', help='output data folder', 
default=default.root_path, type=str) + parser.add_argument('--dataset_path', help='dataset path', default=default.dataset_path, type=str) + # training + parser.add_argument('--frequent', help='frequency of logging', default=default.frequent, type=int) + parser.add_argument('--kvstore', help='the kv-store type', default=default.kvstore, type=str) + parser.add_argument('--work_load_list', help='work load for different devices', default=None, type=list) + parser.add_argument('--no_flip', help='disable flip images', action='store_true') + parser.add_argument('--no_shuffle', help='disable random shuffle', action='store_true') + # e2e + #parser.add_argument('--gpus', help='GPU device to train with', default='0,1,2,3', type=str) + parser.add_argument('--pretrained', help='pretrained model prefix', default=default.pretrained, type=str) + parser.add_argument('--pretrained_epoch', help='pretrained model epoch', default=default.pretrained_epoch, type=int) + parser.add_argument('--prefix', help='new model prefix', default=default.prefix, type=str) + parser.add_argument('--begin_epoch', help='begin epoch of training, use with resume', default=0, type=int) + parser.add_argument('--end_epoch', help='end epoch of training', default=default.end_epoch, type=int) + parser.add_argument('--lr', help='base learning rate', default=default.lr, type=float) + parser.add_argument('--lr_step', help='learning rate steps (in epoch)', default=default.lr_step, type=str) + parser.add_argument('--no_ohem', help='disable online hard mining', action='store_true') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + logger.info('Called with argument: %s' % args) + #ctx = [mx.gpu(int(i)) for i in args.gpus.split(',')] + ctx = [] + cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip() + if len(cvd)>0: + for i in xrange(len(cvd.split(','))): + ctx.append(mx.gpu(i)) + if len(ctx)==0: + ctx = [mx.cpu()] + print('use cpu') + else: + print('gpu num:', len(ctx)) + train_net(args, 
ctx, args.pretrained, args.pretrained_epoch, args.prefix, args.begin_epoch, args.end_epoch, + lr=args.lr, lr_step=args.lr_step) + +if __name__ == '__main__': + main() diff --git a/retinaface/test.py b/retinaface/test.py deleted file mode 100644 index d7dc978..0000000 --- a/retinaface/test.py +++ /dev/null @@ -1,57 +0,0 @@ -import cv2 -import sys -import numpy as np -import datetime -import os -import glob -from retinaface import RetinaFace - -thresh = 0.8 -scales = [480, 640] - -count = 10 - -gpuid = 0 -detector = RetinaFace('./model/R50', 0, gpuid, 'net3') - -img = cv2.imread(sys.argv[1]) -print(img.shape) -im_shape = img.shape -target_size = scales[0] -max_size = scales[1] -im_size_min = np.min(im_shape[0:2]) -im_size_max = np.max(im_shape[0:2]) -#im_scale = 1.0 -#if im_size_min>target_size or im_size_max>max_size: -im_scale = float(target_size) / float(im_size_min) -# prevent bigger axis from being more than max_size: -if np.round(im_scale * im_size_max) > max_size: - im_scale = float(max_size) / float(im_size_max) - -print('im_scale', im_scale) - -for c in range(count): - faces, landmarks = detector.detect(img, thresh, scales=[im_scale]) - print(c, faces.shape, landmarks.shape) - -if faces is not None: - print('find', faces.shape[0], 'faces') - for i in range(faces.shape[0]): - #print('score', faces[i][4]) - box = faces[i].astype(np.int) - #color = (255,0,0) - color = (0,0,255) - cv2.rectangle(img, (box[0], box[1]), (box[2], box[3]), color, 2) - if landmarks is not None: - landmark5 = landmarks[i].astype(np.int) - #print(landmark.shape) - for l in range(landmark5.shape[0]): - color = (0,0,255) - if l==0 or l==3: - color = (0,255,0) - cv2.circle(img, (landmark5[l][0], landmark5[l][1]), 1, color, 2) - - filename = './detector_test.jpg' - print('writing', filename) - cv2.imwrite(filename, img) -