def to_rgb(img):
    """Expand a 2-D grayscale array into a 3-channel uint8 image.

    :param img: Two-dimensional array; unpacking ``img.shape`` fails with
        ``ValueError`` for any other rank.
    :return: New ``numpy.ndarray`` of shape ``img.shape + (3,)`` and dtype
        ``uint8`` whose three channels are all copies of ``img``.
    """
    rows, cols = img.shape
    rgb = np.empty((rows, cols, 3), dtype=np.uint8)
    # Broadcast the single plane across the trailing channel axis.
    rgb[...] = img[:, :, np.newaxis]
    return rgb
Area*1./(Area1+Area2-Area) + return ratio + + +def main(args): + output_dir = os.path.expanduser(args.output_dir) + if not os.path.exists(output_dir): + os.makedirs(output_dir) + datamap = {} + pp = 0 + datasize = 0 + verr = 0 + for line in open(args.input_dir+"_clean_list.txt", 'r'): + pp+=1 + if pp%10000==0: + print('loading list', pp) + line = line.strip()[2:] + if not line.startswith('m.'): + continue + vec = line.split('/') + assert len(vec)==2 + #print(line) + person = vec[0] + img = vec[1] + try: + img_id = int(img.split('.')[0]) + except ValueError: + #print('value error', line) + verr+=1 + continue + if not person in datamap: + labelid = len(datamap) + datamap[person] = [labelid, {img_id : 1}] + else: + datamap[person][1][img_id] = 1 + datasize+=1 + + print('dataset size', args.name, datasize) + print('dataset err', verr) + + print('Creating networks and loading parameters') + + with tf.Graph().as_default(): + #gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpu_memory_fraction) + #sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options, log_device_placement=False)) + sess = tf.Session() + with sess.as_default(): + pnet, rnet, onet = detect_face.create_mtcnn(sess, None) + + minsize = 100 # minimum size of face + threshold = [ 0.6, 0.7, 0.7 ] # three steps's threshold + factor = 0.709 # scale factor + + print(minsize) + print(threshold) + print(factor) + + # Add a random key to the filename to allow alignment using multiple processes + #random_key = np.random.randint(0, high=99999) + #bounding_boxes_filename = os.path.join(output_dir, 'bounding_boxes_%05d.txt' % random_key) + output_filename = os.path.join(output_dir, 'faceinsight_align_%s.lst' % args.name) + + with open(output_filename, "w") as text_file: + nrof_images_total = 0 + nrof_successfully_aligned = 0 + nrof_changed = 0 + nrof_iou3 = 0 + nrof_force = 0 + for line in open(args.input_dir, 'r'): + vec = line.strip().split() + person = vec[0] + img_id = int(vec[1]) + v = 
datamap.get(person, None) + if v is None: + continue + if not img_id in v[1]: + continue + labelid = v[0] + img_str = base64.b64decode(vec[-1]) + nparr = np.fromstring(img_str, np.uint8) + img = cv2.imdecode(nparr, cv2.CV_LOAD_IMAGE_COLOR) + img = img[...,::-1] #to rgb + if nrof_images_total%100==0: + print("Processing %d, (%d)" % (nrof_images_total, nrof_successfully_aligned)) + nrof_images_total += 1 + target_dir = os.path.join(output_dir, person) + if not os.path.exists(target_dir): + os.makedirs(target_dir) + target_path = os.path.join(target_dir, "%d.jpg"%img_id) + _minsize = minsize + fimage = edict() + fimage.bbox = None + fimage.image_path = target_path + fimage.classname = str(labelid) + if fimage.bbox is not None: + _bb = fimage.bbox + _minsize = min( [_bb[2]-_bb[0], _bb[3]-_bb[1], img.shape[0]//2, img.shape[1]//2] ) + + bounding_boxes, points = detect_face.detect_face(img, _minsize, pnet, rnet, onet, threshold, factor) + bindex = -1 + nrof_faces = bounding_boxes.shape[0] + if fimage.bbox is None and nrof_faces>0: + det = bounding_boxes[:,0:4] + img_size = np.asarray(img.shape)[0:2] + bindex = 0 + if nrof_faces>1: + bounding_box_size = (det[:,2]-det[:,0])*(det[:,3]-det[:,1]) + img_center = img_size / 2 + offsets = np.vstack([ (det[:,0]+det[:,2])/2-img_center[1], (det[:,1]+det[:,3])/2-img_center[0] ]) + offset_dist_squared = np.sum(np.power(offsets,2.0),0) + bindex = np.argmax(bounding_box_size-offset_dist_squared*2.0) # some extra weight on the centering + if fimage.bbox is not None: + if nrof_faces>0: + assert(bounding_boxes.shape[0]==points.shape[1]) + det = bounding_boxes[:,0:4] + img_size = np.asarray(img.shape)[0:2] + index2 = [0.0, 0] + for i in xrange(det.shape[0]): + _det = det[i] + iou = IOU(fimage.bbox, _det) + if iou>index2[0]: + index2[0] = iou + index2[1] = i + if index2[0]>-0.3: + bindex = index2[1] + nrof_iou3+=1 + if bindex<0: + bounding_boxes, points = detect_face.detect_face_force(img, fimage.bbox, pnet, rnet, onet) + bindex = 0 + 
def parse_arguments(argv):
    """Parse the command line of the celeb alignment script.

    ``--input-dir`` and ``--output-dir`` are mandatory: ``main`` opens the
    former and creates the latter unconditionally, so a missing value is now
    reported as a usage error (exit status 2) instead of failing later with a
    traceback on ``None``.

    :param argv: Argument strings, normally ``sys.argv[1:]``.
    :return: ``argparse.Namespace`` with ``input_dir``, ``name`` and
        ``output_dir``.
    """
    parser = argparse.ArgumentParser()

    # Despite the option name, main() opens this path as a file and also
    # reads the companion list "<path>_clean_list.txt".
    parser.add_argument('--input-dir', type=str, required=True,
                        help='Path of the input data file (one record per line, '
                             'base64-encoded image in the last column); the file '
                             '"<input-dir>_clean_list.txt" is read as well.')
    parser.add_argument('--name', type=str, default='celeb',
                        help='Dataset name, used in the output list file name '
                             'faceinsight_align_<name>.lst.')
    parser.add_argument('--output-dir', type=str, required=True,
                        help='Directory with aligned face thumbnails.')
    return parser.parse_args(argv)
2016 David Sandberg +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
def _center_crop(img, prealigned_scale, image_size):
    """Rescale ``img`` by ``prealigned_scale`` and cut out its centre square.

    Floor division is used for the slice bounds: the module enables true
    division, and float slice bounds are rejected by NumPy. Both axes are
    centred on half the *width* of the rescaled image, exactly as the original
    code did (presumably the inputs are square -- confirm before relying on
    this for other aspect ratios). For an odd ``image_size`` the crop is one
    pixel smaller than requested.
    """
    scaled = misc.imresize(img, prealigned_scale, interp='bilinear')
    sz1 = scaled.shape[1] // 2
    sz2 = image_size // 2
    return scaled[(sz1 - sz2):(sz1 + sz2), (sz1 - sz2):(sz1 + sz2), :]


def _find_prealigned(image_path_without_ext):
    """Return the first existing ``.jpg``/``.png`` variant of the path, or ``None``."""
    for ext in ('jpg', 'png'):
        candidate = image_path_without_ext + '.' + ext
        if os.path.exists(candidate):
            return candidate
    return None


def main(args):
    """Align every image of a dataset with dlib and write PNG thumbnails.

    For each class returned by ``facenet.get_dataset(args.input_dir)`` a
    sub-directory is created under ``args.output_dir`` and every image that
    does not already have an output file is processed:

    * with ``args.use_center_crop`` the image is rescaled and centre-cropped;
    * otherwise ``AlignDlib.align`` is used with the outer-eyes-and-nose
      landmarks;
    * if alignment returns ``None`` and ``args.prealigned_dir`` is set, the
      image of the same class and file name from that directory is
      centre-cropped instead.

    Unreadable images are reported on stdout and skipped. Three counters are
    printed at the end.

    :param args: Namespace produced by ``parse_arguments``.
    """
    align = align_dlib.AlignDlib(os.path.expanduser(args.dlib_face_predictor))
    landmarkIndices = align_dlib.AlignDlib.OUTER_EYES_AND_NOSE
    output_dir = os.path.expanduser(args.output_dir)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    # Store some git revision info in a text file in the log directory
    src_path, _ = os.path.split(os.path.realpath(__file__))
    facenet.store_revision_info(src_path, output_dir, ' '.join(sys.argv))
    dataset = facenet.get_dataset(args.input_dir)
    random.shuffle(dataset)
    # Scale the image such that the face fills the frame when cropped to crop_size
    scale = float(args.face_size) / args.image_size
    nrof_images_total = 0
    nrof_prealigned_images = 0
    nrof_successfully_aligned = 0
    for cls in dataset:
        output_class_dir = os.path.join(output_dir, cls.name)
        if not os.path.exists(output_class_dir):
            os.makedirs(output_class_dir)
        random.shuffle(cls.image_paths)
        for image_path in cls.image_paths:
            nrof_images_total += 1
            filename = os.path.splitext(os.path.split(image_path)[1])[0]
            output_filename = os.path.join(output_class_dir, filename + '.png')
            if os.path.exists(output_filename):
                # Already produced by an earlier (or concurrent) run.
                continue

            try:
                img = misc.imread(image_path)
            except (IOError, ValueError, IndexError) as e:
                print('{}: {}'.format(image_path, e))
                continue

            if img.ndim == 2:
                img = facenet.to_rgb(img)
            if args.use_center_crop:
                aligned = _center_crop(img, args.prealigned_scale, args.image_size)
            else:
                aligned = align.align(args.image_size, img, landmarkIndices=landmarkIndices,
                                      skipMulti=False, scale=scale)

            if aligned is not None:
                print(image_path)
                nrof_successfully_aligned += 1
                misc.imsave(output_filename, aligned)
                continue

            if not args.prealigned_dir:
                print('Unable to align "%s"' % image_path)
                continue

            # Face detection failed. Use center crop from pre-aligned dataset
            class_name = os.path.split(output_class_dir)[1]
            prealigned_path = _find_prealigned(
                os.path.join(os.path.expanduser(args.prealigned_dir), class_name, filename))
            if prealigned_path is None:
                # Explicit instead of letting imread('') fail with an opaque error.
                print('Unable to align "%s"' % image_path)
                continue
            try:
                img = misc.imread(prealigned_path)
            except (IOError, ValueError, IndexError) as e:
                print('{}: {}'.format(prealigned_path, e))
                continue
            cropped = _center_crop(img, args.prealigned_scale, args.image_size)
            print(prealigned_path)
            nrof_prealigned_images += 1
            misc.imsave(output_filename, cropped)

    print('Total number of images: %d' % nrof_images_total)
    print('Number of successfully aligned images: %d' % nrof_successfully_aligned)
    print('Number of pre-aligned images: %d' % nrof_prealigned_images)
original image after scaling the image using prealigned_scale.', action='store_true') + parser.add_argument('--prealigned_dir', type=str, + help='Replace image with a pre-aligned version when face detection fails.', default='') + parser.add_argument('--prealigned_scale', type=float, + help='The amount of scaling to apply to prealigned images before taking the center crop.', default=0.87) + return parser.parse_args(argv) + +if __name__ == '__main__': + main(parse_arguments(sys.argv[1:])) diff --git a/src/align/align_dataset_mtcnn.py b/src/align/align_dataset_mtcnn.py new file mode 100644 index 0000000..d2a3eea --- /dev/null +++ b/src/align/align_dataset_mtcnn.py @@ -0,0 +1,143 @@ +"""Performs face alignment and stores face thumbnails in the output directory.""" +# MIT License +# +# Copyright (c) 2016 David Sandberg +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from scipy import misc +import sys +import os +import argparse +import tensorflow as tf +import numpy as np +import facenet +import align.detect_face +import random +from time import sleep + +def main(args): + sleep(random.random()) + output_dir = os.path.expanduser(args.output_dir) + if not os.path.exists(output_dir): + os.makedirs(output_dir) + # Store some git revision info in a text file in the log directory + src_path,_ = os.path.split(os.path.realpath(__file__)) + facenet.store_revision_info(src_path, output_dir, ' '.join(sys.argv)) + dataset = facenet.get_dataset(args.input_dir) + + print('Creating networks and loading parameters') + + with tf.Graph().as_default(): + gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpu_memory_fraction) + sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options, log_device_placement=False)) + with sess.as_default(): + pnet, rnet, onet = align.detect_face.create_mtcnn(sess, None) + + minsize = 20 # minimum size of face + threshold = [ 0.6, 0.7, 0.7 ] # three steps's threshold + factor = 0.709 # scale factor + + # Add a random key to the filename to allow alignment using multiple processes + random_key = np.random.randint(0, high=99999) + bounding_boxes_filename = os.path.join(output_dir, 'bounding_boxes_%05d.txt' % random_key) + + with open(bounding_boxes_filename, "w") as text_file: + nrof_images_total = 0 + nrof_successfully_aligned = 0 + if args.random_order: + random.shuffle(dataset) + for cls in dataset: + output_class_dir = os.path.join(output_dir, cls.name) + if not os.path.exists(output_class_dir): + os.makedirs(output_class_dir) + if args.random_order: + random.shuffle(cls.image_paths) + for image_path in cls.image_paths: + nrof_images_total += 1 + filename = os.path.splitext(os.path.split(image_path)[1])[0] + output_filename = os.path.join(output_class_dir, filename+'.png') + 
def parse_arguments(argv):
    """Build the MTCNN alignment command-line parser and parse ``argv``.

    Two positionals (``input_dir``, ``output_dir``) are followed by the
    optional ``--image_size``, ``--margin``, ``--random_order`` and
    ``--gpu_memory_fraction`` switches.

    :param argv: Argument strings, normally ``sys.argv[1:]``.
    :return: The populated ``argparse.Namespace``.
    """
    # (name, keyword arguments) in registration order.
    option_table = (
        ('input_dir', dict(type=str, help='Directory with unaligned images.')),
        ('output_dir', dict(type=str, help='Directory with aligned face thumbnails.')),
        ('--image_size', dict(type=int, default=182,
                              help='Image size (height, width) in pixels.')),
        ('--margin', dict(type=int, default=44,
                          help='Margin for the crop around the bounding box (height, width) in pixels.')),
        ('--random_order', dict(action='store_true',
                                help='Shuffles the order of images to enable alignment using multiple processes.')),
        ('--gpu_memory_fraction', dict(type=float, default=1.0,
                                       help='Upper bound on the amount of GPU memory that will be used by the process.')),
    )

    cli = argparse.ArgumentParser()
    for flag, options in option_table:
        cli.add_argument(flag, **options)
    return cli.parse_args(argv)
+# https://github.com/cmusatyalab/openface/blob/master/openface/align_dlib.py + +import cv2 +import dlib +import numpy as np + +TEMPLATE = np.float32([ + (0.0792396913815, 0.339223741112), (0.0829219487236, 0.456955367943), + (0.0967927109165, 0.575648016728), (0.122141515615, 0.691921601066), + (0.168687863544, 0.800341263616), (0.239789390707, 0.895732504778), + (0.325662452515, 0.977068762493), (0.422318282013, 1.04329000149), + (0.531777802068, 1.06080371126), (0.641296298053, 1.03981924107), + (0.738105872266, 0.972268833998), (0.824444363295, 0.889624082279), + (0.894792677532, 0.792494155836), (0.939395486253, 0.681546643421), + (0.96111933829, 0.562238253072), (0.970579841181, 0.441758925744), + (0.971193274221, 0.322118743967), (0.163846223133, 0.249151738053), + (0.21780354657, 0.204255863861), (0.291299351124, 0.192367318323), + (0.367460241458, 0.203582210627), (0.4392945113, 0.233135599851), + (0.586445962425, 0.228141644834), (0.660152671635, 0.195923841854), + (0.737466449096, 0.182360984545), (0.813236546239, 0.192828009114), + (0.8707571886, 0.235293377042), (0.51534533827, 0.31863546193), + (0.516221448289, 0.396200446263), (0.517118861835, 0.473797687758), + (0.51816430343, 0.553157797772), (0.433701156035, 0.604054457668), + (0.475501237769, 0.62076344024), (0.520712933176, 0.634268222208), + (0.565874114041, 0.618796581487), (0.607054002672, 0.60157671656), + (0.252418718401, 0.331052263829), (0.298663015648, 0.302646354002), + (0.355749724218, 0.303020650651), (0.403718978315, 0.33867711083), + (0.352507175597, 0.349987615384), (0.296791759886, 0.350478978225), + (0.631326076346, 0.334136672344), (0.679073381078, 0.29645404267), + (0.73597236153, 0.294721285802), (0.782865376271, 0.321305281656), + (0.740312274764, 0.341849376713), (0.68499850091, 0.343734332172), + (0.353167761422, 0.746189164237), (0.414587777921, 0.719053835073), + (0.477677654595, 0.706835892494), (0.522732900812, 0.717092275768), + (0.569832064287, 0.705414478982), 
class AlignDlib:
    """
    Align faces using dlib's facial landmark estimation.

    Alignment preprocesses faces for input into a neural network: each face
    is resized to a common size (such as 96x96) and transformed so that
    selected landmarks (such as the eyes and nose) land at the same
    location in every image.

    Normalized landmarks:

    .. image:: ../images/dlib-landmark-mean.png
    """

    #: Landmark indices corresponding to the inner eyes and bottom lip.
    INNER_EYES_AND_BOTTOM_LIP = [39, 42, 57]

    #: Landmark indices corresponding to the outer eyes and nose.
    OUTER_EYES_AND_NOSE = [36, 45, 33]

    def __init__(self, facePredictor):
        """
        Create the dlib face detector and landmark predictor.

        :param facePredictor: Path handed to ``dlib.shape_predictor``.
        :type facePredictor: str
        """
        assert facePredictor is not None

        #pylint: disable=no-member
        self.detector = dlib.get_frontal_face_detector()
        self.predictor = dlib.shape_predictor(facePredictor)

    def getAllFaceBoundingBoxes(self, rgbImg):
        """
        Run the face detector on an image.

        :param rgbImg: RGB image to process. Shape: (height, width, 3)
        :type rgbImg: numpy.ndarray
        :return: Whatever the detector returns for the image, or an empty
                 list when the detector raises.
        :rtype: dlib.rectangles
        """
        assert rgbImg is not None

        try:
            detections = self.detector(rgbImg, 1)
        except Exception as err:  #pylint: disable=broad-except
            # In rare cases, exceptions are thrown; report and carry on.
            print("Warning: {}".format(err))
            detections = []
        return detections

    def getLargestFaceBoundingBox(self, rgbImg, skipMulti=False):
        """
        Pick the detection with the largest area.

        :param rgbImg: RGB image to process. Shape: (height, width, 3)
        :type rgbImg: numpy.ndarray
        :param skipMulti: When true, give up unless exactly one face is found.
        :type skipMulti: bool
        :return: The largest face bounding box, or None when no face is
                 found (or several are found and ``skipMulti`` is set).
        :rtype: dlib.rectangle
        """
        assert rgbImg is not None

        def area(rect):
            return rect.width() * rect.height()

        candidates = self.getAllFaceBoundingBoxes(rgbImg)
        count = len(candidates)
        if count == 0 or (skipMulti and count != 1):
            return None
        return max(candidates, key=area)

    def findLandmarks(self, rgbImg, bb):
        """
        Locate the landmarks of the face inside ``bb``.

        :param rgbImg: RGB image to process. Shape: (height, width, 3)
        :type rgbImg: numpy.ndarray
        :param bb: Bounding box around the face to find landmarks for.
        :type bb: dlib.rectangle
        :return: Detected landmark locations.
        :rtype: list of (x,y) tuples
        """
        assert rgbImg is not None
        assert bb is not None

        shape = self.predictor(rgbImg, bb)
        coords = []
        for part in shape.parts():
            coords.append((part.x, part.y))
        return coords

    #pylint: disable=dangerous-default-value
    def align(self, imgDim, rgbImg, bb=None,
              landmarks=None, landmarkIndices=INNER_EYES_AND_BOTTOM_LIP,
              skipMulti=False, scale=1.0):
        r"""align(imgDim, rgbImg, bb=None, landmarks=None, landmarkIndices=INNER_EYES_AND_BOTTOM_LIP)

        Warp a face so that the chosen landmarks match the template.

        :param imgDim: The edge length in pixels of the square the image is resized to.
        :type imgDim: int
        :param rgbImg: RGB image to process. Shape: (height, width, 3)
        :type rgbImg: numpy.ndarray
        :param bb: Bounding box around the face to align.
                   Defaults to the largest detected face.
        :type bb: dlib.rectangle
        :param landmarks: Detected landmark locations.
                          Computed on ``bb`` when not provided.
        :type landmarks: list of (x,y) tuples
        :param landmarkIndices: The indices to transform to.
        :type landmarkIndices: list of ints
        :param skipMulti: Skip image if more than one face detected.
        :type skipMulti: bool
        :param scale: Scale image before cropping to the size given by imgDim.
        :type scale: float
        :return: The aligned RGB image, shape (imgDim, imgDim, 3), or None
                 when no bounding box was given and none could be detected.
        :rtype: numpy.ndarray
        """
        assert imgDim is not None
        assert rgbImg is not None
        assert landmarkIndices is not None

        if bb is None:
            bb = self.getLargestFaceBoundingBox(rgbImg, skipMulti)
        if bb is None:
            return None

        if landmarks is None:
            landmarks = self.findLandmarks(rgbImg, bb)

        selected = np.array(landmarkIndices)
        sourcePoints = np.float32(landmarks)[selected]
        # Template points scaled to the output square and centred in it.
        targetPoints = imgDim * MINMAX_TEMPLATE[selected]*scale + imgDim*(1-scale)/2

        #pylint: disable=maybe-no-member
        affine = cv2.getAffineTransform(sourcePoints, targetPoints)
        return cv2.warpAffine(rgbImg, affine, (imgDim, imgDim))
def IOU(Reframe, GTframe):
    """Return the intersection-over-union of two boxes.

    Both boxes are indexable as ``(x_min, y_min, x_max, y_max)``.

    :param Reframe: First box.
    :param GTframe: Second box.
    :return: ``0`` when the boxes do not overlap with positive area,
        otherwise ``intersection / union`` as a float.
    """
    def overlap(start_a, len_a, start_b, len_b):
        # Sum of both extents minus the extent of their common hull.
        hull_end = max(start_a + len_a, start_b + len_b)
        hull_start = min(start_a, start_b)
        return len_a + len_b - (hull_end - hull_start)

    left_a, top_a = Reframe[0], Reframe[1]
    w_a = Reframe[2] - Reframe[0]
    h_a = Reframe[3] - Reframe[1]

    left_b, top_b = GTframe[0], GTframe[1]
    w_b = GTframe[2] - GTframe[0]
    h_b = GTframe[3] - GTframe[1]

    inter_w = overlap(left_a, w_a, left_b, w_b)
    inter_h = overlap(top_a, h_a, top_b, h_b)
    if inter_w <= 0 or inter_h <= 0:
        return 0

    inter_area = inter_w * inter_h
    area_a = w_a * h_a
    area_b = w_b * h_b
    return inter_area * 1. / (area_a + area_b - inter_area)
print('Creating networks and loading parameters') + + with tf.Graph().as_default(): + #gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpu_memory_fraction) + #sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options, log_device_placement=False)) + sess = tf.Session() + with sess.as_default(): + pnet, rnet, onet = detect_face.create_mtcnn(sess, None) + + minsize = 100 # minimum size of face + threshold = [ 0.6, 0.7, 0.7 ] # three steps's threshold + factor = 0.709 # scale factor + image_size = [112,96] + src = np.array([ + [30.2946, 51.6963], + [65.5318, 51.5014], + [48.0252, 71.7366], + [33.5493, 92.3655], + [62.7299, 92.2041] ], dtype=np.float32 ) + + # Add a random key to the filename to allow alignment using multiple processes + #random_key = np.random.randint(0, high=99999) + #bounding_boxes_filename = os.path.join(output_dir, 'bounding_boxes_%05d.txt' % random_key) + #output_filename = os.path.join(output_dir, 'faceinsight_align_%s.lst' % args.name) + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + + output_filename = os.path.join(args.output_dir, 'lst') + + + with open(output_filename, "w") as text_file: + nrof_images_total = 0 + nrof = np.zeros( (5,), dtype=np.int32) + for fimage in dataset: + if nrof_images_total%100==0: + print("Processing %d, (%s)" % (nrof_images_total, nrof)) + nrof_images_total += 1 + #if nrof_images_total<950000: + # continue + image_path = fimage.image_path + if not os.path.exists(image_path): + print('image not found (%s)'%image_path) + continue + filename = os.path.splitext(os.path.split(image_path)[1])[0] + #print(image_path) + try: + img = misc.imread(image_path) + except (IOError, ValueError, IndexError) as e: + errorMessage = '{}: {}'.format(image_path, e) + print(errorMessage) + else: + if img.ndim<2: + print('Unable to align "%s", img dim error' % image_path) + #text_file.write('%s\n' % (output_filename)) + continue + if img.ndim == 2: + img = to_rgb(img) + img = img[:,:,0:3] + 
_paths = fimage.image_path.split('/') + a,b,c = _paths[-3], _paths[-2], _paths[-1] + target_dir = os.path.join(args.output_dir, a, b) + if not os.path.exists(target_dir): + os.makedirs(target_dir) + target_file = os.path.join(target_dir, c) + warped = None + if fimage.landmark is not None: + dst = fimage.landmark.astype(np.float32) + + tform = trans.SimilarityTransform() + tform.estimate(dst, src[0:3,:]*1.5+image_size[0]*0.25) + M = tform.params[0:2,:] + warped0 = cv2.warpAffine(img,M,(image_size[1]*2,image_size[0]*2), borderValue = 0.0) + _minsize = image_size[0] + bounding_boxes, points = detect_face.detect_face(warped0, _minsize, pnet, rnet, onet, threshold, factor) + if bounding_boxes.shape[0]>0: + bindex = 0 + det = bounding_boxes[bindex,0:4] + #points need to be transpose, points = points.reshape( (5,2) ).transpose() + dst = points[:, bindex].reshape( (2,5) ).T + tform = trans.SimilarityTransform() + tform.estimate(dst, src) + M = tform.params[0:2,:] + warped = cv2.warpAffine(warped0,M,(image_size[1],image_size[0]), borderValue = 0.0) + nrof[0]+=1 + #assert fimage.bbox is not None + if warped is None and fimage.bbox is not None: + _minsize = img.shape[0]//4 + bounding_boxes, points = detect_face.detect_face(img, _minsize, pnet, rnet, onet, threshold, factor) + if bounding_boxes.shape[0]>0: + det = bounding_boxes[:,0:4] + bindex = -1 + index2 = [0.0, 0] + for i in xrange(det.shape[0]): + _det = det[i] + iou = IOU(fimage.bbox, _det) + if iou>index2[0]: + index2[0] = iou + index2[1] = i + if index2[0]>0.3: + bindex = index2[1] + if bindex>=0: + dst = points[:, bindex].reshape( (2,5) ).T + tform = trans.SimilarityTransform() + tform.estimate(dst, src) + M = tform.params[0:2,:] + warped = cv2.warpAffine(img,M,(image_size[1],image_size[0]), borderValue = 0.0) + nrof[1]+=1 + #print('1',target_file,index2[0]) + if warped is None and fimage.bbox is not None: + bb = fimage.bbox + #croped = img[bb[1]:bb[3],bb[0]:bb[2],:] + bounding_boxes, points = 
def parse_arguments(argv):
    """Parse the command-line options of the alignment script.

    Args:
        argv: list of argument strings, without the program name.

    Returns:
        argparse.Namespace with ``input_dir`` and ``output_dir``; an
        attribute is None when its flag is not supplied.
    """
    # (flag, help text) pairs, registered in this order.
    text_options = (
        ('--input-dir', 'Directory with unaligned images.'),
        ('--output-dir', 'Directory with aligned face thumbnails.'),
    )
    cli = argparse.ArgumentParser()
    for flag, description in text_options:
        cli.add_argument(flag, type=str, help=description)
    return cli.parse_args(argv)
def IOU(Reframe, GTframe):
    """Return the intersection-over-union of two axis-aligned boxes.

    Both arguments are indexable as (x_min, y_min, x_max, y_max).

    Returns 0 when the boxes do not overlap (or only touch), otherwise the
    ratio intersection / (area1 + area2 - intersection) as a float.
    """
    ax, ay = Reframe[0], Reframe[1]
    aw = Reframe[2] - Reframe[0]
    ah = Reframe[3] - Reframe[1]

    bx, by = GTframe[0], GTframe[1]
    bw = GTframe[2] - GTframe[0]
    bh = GTframe[3] - GTframe[1]

    # Overlap along an axis = sum of both extents minus the extent of the
    # span that encloses both boxes; non-positive means no overlap.
    overlap_w = aw + bw - (max(ax + aw, bx + bw) - min(ax, bx))
    overlap_h = ah + bh - (max(ay + ah, by + bh) - min(ay, by))

    if overlap_w <= 0 or overlap_h <= 0:
        return 0

    shared = overlap_w * overlap_h
    return shared * 1. / (aw * ah + bw * bh - shared)
sess.as_default(): + pnet, rnet, onet = detect_face.create_mtcnn(sess, None) + + minsize = 100 # minimum size of face + threshold = [ 0.6, 0.7, 0.7 ] # three steps's threshold + factor = 0.709 # scale factor + if args.name=='lfw' or args.name=='webface' or args.name=='vgg': + minsize = 20 + threshold = [0.6,0.7,0.9] + factor = 0.85 + + print(minsize) + print(threshold) + print(factor) + + # Add a random key to the filename to allow alignment using multiple processes + #random_key = np.random.randint(0, high=99999) + #bounding_boxes_filename = os.path.join(output_dir, 'bounding_boxes_%05d.txt' % random_key) + output_filename = os.path.join(output_dir, 'faceinsight_align_%s.lst' % args.name) + + with open(output_filename, "w") as text_file: + nrof_images_total = 0 + nrof_successfully_aligned = 0 + nrof_changed = 0 + nrof_iou3 = 0 + nrof_force = 0 + for fimage in dataset: + if nrof_images_total%100==0: + print("Processing %d, (%d)" % (nrof_images_total, nrof_successfully_aligned)) + nrof_images_total += 1 + image_path = fimage.image_path + if not os.path.exists(image_path): + print('image not found (%s)'%image_path) + continue + filename = os.path.splitext(os.path.split(image_path)[1])[0] + #print(image_path) + try: + img = misc.imread(image_path) + except (IOError, ValueError, IndexError) as e: + errorMessage = '{}: {}'.format(image_path, e) + print(errorMessage) + else: + if img.ndim<2: + print('Unable to align "%s", img dim error' % image_path) + #text_file.write('%s\n' % (output_filename)) + continue + if img.ndim == 2: + img = to_rgb(img) + img = img[:,:,0:3] + _minsize = minsize + if fimage.bbox is not None: + _bb = fimage.bbox + _minsize = min( [_bb[2]-_bb[0], _bb[3]-_bb[1], img.shape[0]//2, img.shape[1]//2] ) + + bounding_boxes, points = detect_face.detect_face(img, _minsize, pnet, rnet, onet, threshold, factor) + bindex = -1 + nrof_faces = bounding_boxes.shape[0] + if fimage.bbox is None and nrof_faces>0: + det = bounding_boxes[:,0:4] + img_size = 
np.asarray(img.shape)[0:2] + bindex = 0 + if nrof_faces>1: + bounding_box_size = (det[:,2]-det[:,0])*(det[:,3]-det[:,1]) + img_center = img_size / 2 + offsets = np.vstack([ (det[:,0]+det[:,2])/2-img_center[1], (det[:,1]+det[:,3])/2-img_center[0] ]) + offset_dist_squared = np.sum(np.power(offsets,2.0),0) + bindex = np.argmax(bounding_box_size-offset_dist_squared*2.0) # some extra weight on the centering + if fimage.bbox is not None: + if nrof_faces>0: + assert(bounding_boxes.shape[0]==points.shape[1]) + det = bounding_boxes[:,0:4] + img_size = np.asarray(img.shape)[0:2] + index2 = [0.0, 0] + for i in xrange(det.shape[0]): + _det = det[i] + iou = IOU(fimage.bbox, _det) + if iou>index2[0]: + index2[0] = iou + index2[1] = i + if index2[0]>-0.3: + bindex = index2[1] + nrof_iou3+=1 + if bindex<0: + bounding_boxes, points = detect_face.detect_face_force(img, fimage.bbox, pnet, rnet, onet) + bindex = 0 + nrof_force+=1 + #if bindex<0: + # _img = img[fimage.bbox[1]:fimage.bbox[3], fimage.bbox[0]:fimage.bbox[2],:] + # woffset = fimage.bbox[0] + # hoffset = fimage.bbox[1] + # _minsize = min( [_img.shape[0]//3, _img.shape[1]//3] ) + # bounding_boxes, points = detect_face.detect_face(_img, _minsize, pnet, rnet, onet, [0.6,0.7,0.01], factor) + # nrof_faces = bounding_boxes.shape[0] + # print(nrof_faces) + # if nrof_faces>0: + # #print(points.shape) + # #assert(nrof_faces>0) + # bounding_boxes[:,0]+=woffset + # bounding_boxes[:,2]+=woffset + # bounding_boxes[:,1]+=hoffset + # bounding_boxes[:,3]+=hoffset + # points[0:5,:] += woffset + # points[5:10,:] += hoffset + # bindex = 0 + # score = bounding_boxes[bindex,4] + # print(score) + # if score<=0.0: + # bindex = -1 + # else: + # nrof_force+=1 + #if bindex<0: + # _bb = fimage.bbox + # _minsize = min( [_bb[2]-_bb[0], _bb[3]-_bb[1], img.shape[0]//2, img.shape[1]//2] ) + # bounding_boxes, points = detect_face.detect_face(img, _minsize, pnet, rnet, onet, [0.6,0.7,0.1], factor) + # nrof_faces = bounding_boxes.shape[0] + # 
print(nrof_faces) + # if nrof_faces>0: + # bindex = 0 + #if fimage.bbox is not None and bounding_boxes.shape[0]==0: + # bounding_boxes, points = detect_face.detect_face(img, _minsize, pnet, rnet, onet, [0.6,0.7,0.3], factor) + + + #print(bounding_boxes.shape, points.shape) + #print(nrof_faces, points.shape) + + if bindex>=0: + + det = bounding_boxes[:,0:4] + det = det[bindex,:] + points = points[:, bindex] + #points need to be transpose, points = points.reshape( (5,2) ).transpose() + det = np.squeeze(det) + #bb = np.zeros(4, dtype=np.int32) + #bb[0] = np.maximum(det[0]-args.margin/2, 0) + #bb[1] = np.maximum(det[1]-args.margin/2, 0) + #bb[2] = np.minimum(det[2]+args.margin/2, img_size[1]) + #bb[3] = np.minimum(det[3]+args.margin/2, img_size[0]) + bb = det + #print(points.shape) + points = list(points.flatten()) + assert(len(points)==10) + #cropped = img[bb[1]:bb[3],bb[0]:bb[2],:] + #scaled = misc.imresize(cropped, (args.image_size, args.image_size), interp='bilinear') + #misc.imsave(output_filename, scaled) + nrof_successfully_aligned += 1 + oline = '%d\t%s\t%d\t%d\t%d\t%d\t%d\t' % (0,fimage.image_path, int(fimage.classname), bb[0], bb[1], bb[2], bb[3]) + oline += '\t'.join([str(x) for x in points]) + text_file.write("%s\n"%oline) + else: + print('Unable to align "%s", no face detected' % image_path) + if args.force>0: + if fimage.bbox is None: + oline = '%d\t%s\t%d\n' % (0,fimage.image_path, int(fimage.classname)) + else: + bb = fimage.bbox + oline = '%d\t%s\t%d\t%d\t%d\t%d\t%d\n' % (0,fimage.image_path, int(fimage.classname), bb[0], bb[1], bb[2], bb[3]) + text_file.write(oline) + #text_file.write('%s\n' % (output_filename)) + + print('Total number of images: %d' % nrof_images_total) + print('Number of successfully aligned images: %d' % nrof_successfully_aligned) + print('Number of changed: %d' % nrof_changed) + print('Number of iou3: %d' % nrof_iou3) + print('Number of force: %d' % nrof_force) + +def parse_arguments(argv): + parser = argparse.ArgumentParser() + + 
def parse_arguments(argv):
    """Parse the command-line options of the landmark-listing script.

    Args:
        argv: list of argument strings, without the program name.

    Returns:
        argparse.Namespace with ``input_dir``, ``name``, ``output_dir``
        (each None when omitted) and ``force`` (int, 1 when omitted).
    """
    # Declared as data so the option set can be read at a glance; the
    # registration order is the order shown by --help.
    option_table = [
        ('--input-dir', dict(type=str, help='Directory with unaligned images.')),
        ('--name', dict(type=str, help='dataset name, can be facescrub, megaface, webface, celeb.')),
        ('--output-dir', dict(type=str, help='Directory with aligned face thumbnails.')),
        ('--force', dict(type=int, help='force to output if no faces detected.', default=1)),
    ]
    parser = argparse.ArgumentParser()
    for flag, settings in option_table:
        parser.add_argument(flag, **settings)
    return parser.parse_args(argv)
def main(args):
    '''Detect, align and save every image of the LFW dataset.

    The dataset is obtained from face_image.get_dataset('lfw', args.input_dir).
    For each image that exists and can be read, MTCNN face detection is run,
    one detection is selected, and the image is passed together with that
    detection's box and landmarks (or None for both when nothing was
    detected) to face_preprocess.preprocess.  The result is written to
    <args.output_dir>/<parent directory name>/<file name>, and one line
    "1<TAB>target_file<TAB>class" is appended to <args.output_dir>/lst.

    args: parsed command-line namespace; this function reads input_dir,
          output_dir and image_size.
    '''
    #facenet.store_revision_info(src_path, output_dir, ' '.join(sys.argv))
    dataset = face_image.get_dataset('lfw', args.input_dir)
    print('dataset size', 'lfw', len(dataset))

    print('Creating networks and loading parameters')

    with tf.Graph().as_default():
        #gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.gpu_memory_fraction)
        #sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options, log_device_placement=False))
        sess = tf.Session()
        with sess.as_default():
            # The three MTCNN stage networks used by detect_face.detect_face below.
            pnet, rnet, onet = detect_face.create_mtcnn(sess, None)

    # Detector settings, passed unchanged to detect_face.detect_face.
    minsize = 20
    threshold = [0.6,0.7,0.9]
    factor = 0.85

    # Add a random key to the filename to allow alignment using multiple processes
    #random_key = np.random.randint(0, high=99999)
    #bounding_boxes_filename = os.path.join(output_dir, 'bounding_boxes_%05d.txt' % random_key)
    #output_filename = os.path.join(output_dir, 'faceinsight_align_%s.lst' % args.name)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Index file listing every image written by this run.
    output_filename = os.path.join(args.output_dir, 'lst')


    with open(output_filename, "w") as text_file:
        nrof_images_total = 0
        # Counters printed in the progress line: [0] images with at least one
        # detected face, [1] images with none.  Slots 2-4 are not touched here.
        nrof = np.zeros( (5,), dtype=np.int32)
        for fimage in dataset:
            # Progress report every 100 dataset entries.
            if nrof_images_total%100==0:
                print("Processing %d, (%s)" % (nrof_images_total, nrof))
            nrof_images_total += 1
            #if nrof_images_total<950000:
            #  continue
            image_path = fimage.image_path
            if not os.path.exists(image_path):
                print('image not found (%s)'%image_path)
                continue
            # NOTE(review): filename is computed but not used in this function.
            filename = os.path.splitext(os.path.split(image_path)[1])[0]
            #print(image_path)
            try:
                img = misc.imread(image_path)
            except (IOError, ValueError, IndexError) as e:
                # Unreadable image: report it and move on to the next entry
                # (all remaining work for this image lives in the else branch).
                errorMessage = '{}: {}'.format(image_path, e)
                print(errorMessage)
            else:
                if img.ndim<2:
                    print('Unable to align "%s", img dim error' % image_path)
                    #text_file.write('%s\n' % (output_filename))
                    continue
                # 2-D (single channel) input is expanded to three channels.
                if img.ndim == 2:
                    img = to_rgb(img)
                # Keep only the first three channels (drops e.g. a fourth channel).
                img = img[:,:,0:3]
                # The output mirrors the last directory level of the input path.
                # NOTE(review): splitting on '/' assumes POSIX-style paths.
                _paths = fimage.image_path.split('/')
                a,b = _paths[-2], _paths[-1]
                target_dir = os.path.join(args.output_dir, a)
                if not os.path.exists(target_dir):
                    os.makedirs(target_dir)
                target_file = os.path.join(target_dir, b)
                _minsize = minsize
                # Stay None when no face is detected; preprocess then receives
                # neither a box nor landmarks.
                _bbox = None
                _landmark = None
                bounding_boxes, points = detect_face.detect_face(img, _minsize, pnet, rnet, onet, threshold, factor)
                nrof_faces = bounding_boxes.shape[0]
                if nrof_faces>0:
                    det = bounding_boxes[:,0:4]
                    img_size = np.asarray(img.shape)[0:2]
                    bindex = 0
                    if nrof_faces>1:
                        # Several detections: score each by box area minus twice
                        # the squared distance between box centre and image
                        # centre, and keep the highest-scoring one.
                        bounding_box_size = (det[:,2]-det[:,0])*(det[:,3]-det[:,1])
                        img_center = img_size / 2
                        offsets = np.vstack([ (det[:,0]+det[:,2])/2-img_center[1], (det[:,1]+det[:,3])/2-img_center[0] ])
                        offset_dist_squared = np.sum(np.power(offsets,2.0),0)
                        bindex = np.argmax(bounding_box_size-offset_dist_squared*2.0) # some extra weight on the centering
                    _bbox = bounding_boxes[bindex, 0:4]
                    # Column bindex of points is reshaped to (2,5) and transposed,
                    # giving a (5,2) array -- presumably one (x, y) row per
                    # landmark; confirm against detect_face.detect_face.
                    _landmark = points[:, bindex].reshape( (2,5) ).T
                    nrof[0]+=1
                else:
                    nrof[1]+=1
                warped = face_preprocess.preprocess(img, bbox=_bbox, landmark = _landmark, image_size=args.image_size)
                # Reverse the last (channel) axis before handing the image to
                # cv2.imwrite -- presumably RGB -> BGR, as the name suggests.
                bgr = warped[...,::-1]
                #print(bgr.shape)
                cv2.imwrite(target_file, bgr)
                oline = '%d\t%s\t%d\n' % (1,target_file, int(fimage.classname))
                text_file.write(oline)
def to_rgb(img):
    """Expand a 2-D single-channel image into a 3-channel uint8 image.

    Args:
        img: 2-D array; unpacking ``img.shape`` raises ValueError for any
            other number of dimensions.

    Returns:
        New uint8 array of shape (rows, cols, 3) whose three channels each
        hold a copy of ``img`` (values are cast to uint8 on assignment).
    """
    rows, cols = img.shape
    rgb = np.empty((rows, cols, 3), dtype=np.uint8)
    for channel in range(3):
        rgb[:, :, channel] = img
    return rgb
log_device_placement=False)) + sess = tf.Session() + with sess.as_default(): + pnet, rnet, onet = detect_face.create_mtcnn(sess, None) + + minsize = 100 # minimum size of face + threshold = [ 0.6, 0.7, 0.7 ] # three steps's threshold + factor = 0.709 # scale factor + image_size = [112,96] + src = np.array([ + [30.2946, 51.6963], + [65.5318, 51.5014], + [48.0252, 71.7366], + [33.5493, 92.3655], + [62.7299, 92.2041] ], dtype=np.float32 ) + + # Add a random key to the filename to allow alignment using multiple processes + #random_key = np.random.randint(0, high=99999) + #bounding_boxes_filename = os.path.join(output_dir, 'bounding_boxes_%05d.txt' % random_key) + #output_filename = os.path.join(output_dir, 'faceinsight_align_%s.lst' % args.name) + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + + output_filename = os.path.join(args.output_dir, 'lst') + + + with open(output_filename, "w") as text_file: + nrof_images_total = 0 + nrof = np.zeros( (5,), dtype=np.int32) + for fimage in dataset: + if nrof_images_total%100==0: + print("Processing %d, (%s)" % (nrof_images_total, nrof)) + nrof_images_total += 1 + #if nrof_images_total<950000: + # continue + image_path = fimage.image_path + if not os.path.exists(image_path): + print('image not found (%s)'%image_path) + continue + filename = os.path.splitext(os.path.split(image_path)[1])[0] + #print(image_path) + try: + img = misc.imread(image_path) + except (IOError, ValueError, IndexError) as e: + errorMessage = '{}: {}'.format(image_path, e) + print(errorMessage) + else: + if img.ndim<2: + print('Unable to align "%s", img dim error' % image_path) + #text_file.write('%s\n' % (output_filename)) + continue + if img.ndim == 2: + img = to_rgb(img) + img = img[:,:,0:3] + _paths = fimage.image_path.split('/') + a,b,c = _paths[-3], _paths[-2], _paths[-1] + target_dir = os.path.join(args.output_dir, a, b) + if not os.path.exists(target_dir): + os.makedirs(target_dir) + target_file = os.path.join(target_dir, c) 
+ warped = None + if fimage.landmark is not None: + dst = fimage.landmark.astype(np.float32) + + tform = trans.SimilarityTransform() + tform.estimate(dst, src[0:3,:]*1.5+image_size[0]*0.25) + M = tform.params[0:2,:] + warped0 = cv2.warpAffine(img,M,(image_size[1]*2,image_size[0]*2), borderValue = 0.0) + _minsize = image_size[0] + bounding_boxes, points = detect_face.detect_face(warped0, _minsize, pnet, rnet, onet, threshold, factor) + if bounding_boxes.shape[0]>0: + bindex = 0 + det = bounding_boxes[bindex,0:4] + #points need to be transpose, points = points.reshape( (5,2) ).transpose() + dst = points[:, bindex].reshape( (2,5) ).T + tform = trans.SimilarityTransform() + tform.estimate(dst, src) + M = tform.params[0:2,:] + warped = cv2.warpAffine(warped0,M,(image_size[1],image_size[0]), borderValue = 0.0) + nrof[0]+=1 + #assert fimage.bbox is not None + if warped is None and fimage.bbox is not None: + _minsize = img.shape[0]//4 + bounding_boxes, points = detect_face.detect_face(img, _minsize, pnet, rnet, onet, threshold, factor) + if bounding_boxes.shape[0]>0: + det = bounding_boxes[:,0:4] + bindex = -1 + index2 = [0.0, 0] + for i in xrange(det.shape[0]): + _det = det[i] + iou = IOU(fimage.bbox, _det) + if iou>index2[0]: + index2[0] = iou + index2[1] = i + if index2[0]>0.3: + bindex = index2[1] + if bindex>=0: + dst = points[:, bindex].reshape( (2,5) ).T + tform = trans.SimilarityTransform() + tform.estimate(dst, src) + M = tform.params[0:2,:] + warped = cv2.warpAffine(img,M,(image_size[1],image_size[0]), borderValue = 0.0) + nrof[1]+=1 + #print('1',target_file,index2[0]) + if warped is None and fimage.bbox is not None: + bb = fimage.bbox + #croped = img[bb[1]:bb[3],bb[0]:bb[2],:] + bounding_boxes, points = detect_face.detect_face_force(img, bb, pnet, rnet, onet) + assert bounding_boxes.shape[0]==1 + _box = bounding_boxes[0] + if _box[4]>=0.3: + dst = points[:, 0].reshape( (2,5) ).T + tform = trans.SimilarityTransform() + tform.estimate(dst, src) + M = 
def parse_arguments(argv):
    """Parse the command-line options of the MegaFace alignment script.

    Args:
        argv: list of argument strings, without the program name.

    Returns:
        argparse.Namespace with ``input_dir``, ``name`` and ``output_dir``;
        an attribute is None when its flag is not supplied.
    """
    arg_parser = argparse.ArgumentParser()
    register = arg_parser.add_argument

    register('--input-dir', help='Directory with unaligned images.', type=str)
    register('--name', help='dataset name, can be facescrub, megaface, webface, celeb.', type=str)
    register('--output-dir', help='Directory with aligned face thumbnails.', type=str)

    namespace = arg_parser.parse_args(argv)
    return namespace
0000000..85d5bf0 Binary files /dev/null and b/src/align/det2.npy differ diff --git a/src/align/det3.npy b/src/align/det3.npy new file mode 100644 index 0000000..90d5ba9 Binary files /dev/null and b/src/align/det3.npy differ diff --git a/src/align/detect_face.py b/src/align/detect_face.py new file mode 100644 index 0000000..47af5f5 --- /dev/null +++ b/src/align/detect_face.py @@ -0,0 +1,848 @@ +""" Tensorflow implementation of the face detection / alignment algorithm found at +https://github.com/kpzhang93/MTCNN_face_detection_alignment +""" +# MIT License +# +# Copyright (c) 2016 David Sandberg +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
def load(self, data_path, session, ignore_missing=False):
    '''Load network weights.

    data_path: The path to the numpy-serialized network weights
    session: The current TensorFlow session
    ignore_missing: If true, serialized weights for missing layers are ignored.

    Raises ValueError when assigning a serialized parameter fails (for
    example no variable of that name exists in the scope) and
    ignore_missing is false.
    '''
    # The file holds a dict {op_name: {param_name: value}} wrapped in an
    # object array, hence .item().  NumPy >= 1.16.3 defaults allow_pickle to
    # False and would refuse to load it, so pickling is enabled explicitly.
    # NOTE: unpickling can execute arbitrary code -- only load weight files
    # from a trusted source.
    data_dict = np.load(data_path, encoding='latin1', allow_pickle=True).item() #pylint: disable=no-member

    for op_name, op_params in iteritems(data_dict):
        # reuse=True: the variables must already have been created by setup().
        with tf.variable_scope(op_name, reuse=True):
            for param_name, data in iteritems(op_params):
                try:
                    var = tf.get_variable(param_name)
                    session.run(var.assign(data))
                except ValueError:
                    if not ignore_missing:
                        raise
@layer
def conv(self,
         inp,
         k_h,
         k_w,
         c_o,
         s_h,
         s_w,
         name,
         relu=True,
         padding='SAME',
         group=1,
         biased=True):
    '''Add a 2-D convolution layer.

    inp: input tensor; its last dimension is taken as the channel count
    k_h, k_w: kernel height and width
    c_o: number of output channels
    s_h, s_w: strides along height and width
    name: variable scope holding the 'weights' (and 'biases') variables
    relu: if true, apply ReLU to the result
    padding: 'SAME' or 'VALID'
    group: must divide both the input and the output channel count
    biased: if true, add a per-output-channel bias
    '''
    self.validate_padding(padding)
    in_channels = int(inp.get_shape()[-1])
    # Both channel counts have to be divisible by the group count.
    assert in_channels % group == 0
    assert c_o % group == 0
    strides = [1, s_h, s_w, 1]
    with tf.variable_scope(name) as scope:
        filters = self.make_var('weights', shape=[k_h, k_w, in_channels // group, c_o])
        # Plain (ungrouped) convolution of the input with the kernel.
        result = tf.nn.conv2d(inp, filters, strides, padding=padding)
        if biased:
            bias_terms = self.make_var('biases', [c_o])
            result = tf.nn.bias_add(result, bias_terms)
        if relu:
            result = tf.nn.relu(result, name=scope.name)
        return result
Vectorize it first. + dim = 1 + for d in input_shape[1:].as_list(): + dim *= int(d) + feed_in = tf.reshape(inp, [-1, dim]) + else: + feed_in, dim = (inp, input_shape[-1].value) + weights = self.make_var('weights', shape=[dim, num_out]) + biases = self.make_var('biases', [num_out]) + op = tf.nn.relu_layer if relu else tf.nn.xw_plus_b + fc = op(feed_in, weights, biases, name=name) + return fc + + + """ + Multi dimensional softmax, + refer to https://github.com/tensorflow/tensorflow/issues/210 + compute softmax along the dimension of target + the native softmax only supports batch_size x dimension + """ + @layer + def softmax(self, target, axis, name=None): + max_axis = tf.reduce_max(target, axis, keep_dims=True) + target_exp = tf.exp(target-max_axis) + normalize = tf.reduce_sum(target_exp, axis, keep_dims=True) + softmax = tf.div(target_exp, normalize, name) + return softmax + +class PNet(Network): + def setup(self): + (self.feed('data') #pylint: disable=no-value-for-parameter, no-member + .conv(3, 3, 10, 1, 1, padding='VALID', relu=False, name='conv1') + .prelu(name='PReLU1') + .max_pool(2, 2, 2, 2, name='pool1') + .conv(3, 3, 16, 1, 1, padding='VALID', relu=False, name='conv2') + .prelu(name='PReLU2') + .conv(3, 3, 32, 1, 1, padding='VALID', relu=False, name='conv3') + .prelu(name='PReLU3') + .conv(1, 1, 2, 1, 1, relu=False, name='conv4-1') + .softmax(3,name='prob1')) + + (self.feed('PReLU3') #pylint: disable=no-value-for-parameter + .conv(1, 1, 4, 1, 1, relu=False, name='conv4-2')) + +class RNet(Network): + def setup(self): + (self.feed('data') #pylint: disable=no-value-for-parameter, no-member + .conv(3, 3, 28, 1, 1, padding='VALID', relu=False, name='conv1') + .prelu(name='prelu1') + .max_pool(3, 3, 2, 2, name='pool1') + .conv(3, 3, 48, 1, 1, padding='VALID', relu=False, name='conv2') + .prelu(name='prelu2') + .max_pool(3, 3, 2, 2, padding='VALID', name='pool2') + .conv(2, 2, 64, 1, 1, padding='VALID', relu=False, name='conv3') + .prelu(name='prelu3') + .fc(128, 
relu=False, name='conv4') + .prelu(name='prelu4') + .fc(2, relu=False, name='conv5-1') + .softmax(1,name='prob1')) + + (self.feed('prelu4') #pylint: disable=no-value-for-parameter + .fc(4, relu=False, name='conv5-2')) + +class ONet(Network): + def setup(self): + (self.feed('data') #pylint: disable=no-value-for-parameter, no-member + .conv(3, 3, 32, 1, 1, padding='VALID', relu=False, name='conv1') + .prelu(name='prelu1') + .max_pool(3, 3, 2, 2, name='pool1') + .conv(3, 3, 64, 1, 1, padding='VALID', relu=False, name='conv2') + .prelu(name='prelu2') + .max_pool(3, 3, 2, 2, padding='VALID', name='pool2') + .conv(3, 3, 64, 1, 1, padding='VALID', relu=False, name='conv3') + .prelu(name='prelu3') + .max_pool(2, 2, 2, 2, name='pool3') + .conv(2, 2, 128, 1, 1, padding='VALID', relu=False, name='conv4') + .prelu(name='prelu4') + .fc(256, relu=False, name='conv5') + .prelu(name='prelu5') + .fc(2, relu=False, name='conv6-1') + .softmax(1, name='prob1')) + + (self.feed('prelu5') #pylint: disable=no-value-for-parameter + .fc(4, relu=False, name='conv6-2')) + + (self.feed('prelu5') #pylint: disable=no-value-for-parameter + .fc(10, relu=False, name='conv6-3')) + +def create_mtcnn(sess, model_path): + if not model_path: + model_path,_ = os.path.split(os.path.realpath(__file__)) + + with tf.variable_scope('pnet'): + data = tf.placeholder(tf.float32, (None,None,None,3), 'input') + pnet = PNet({'data':data}) + pnet.load(os.path.join(model_path, 'det1.npy'), sess) + with tf.variable_scope('rnet'): + data = tf.placeholder(tf.float32, (None,24,24,3), 'input') + rnet = RNet({'data':data}) + rnet.load(os.path.join(model_path, 'det2.npy'), sess) + with tf.variable_scope('onet'): + data = tf.placeholder(tf.float32, (None,48,48,3), 'input') + onet = ONet({'data':data}) + onet.load(os.path.join(model_path, 'det3.npy'), sess) + + pnet_fun = lambda img : sess.run(('pnet/conv4-2/BiasAdd:0', 'pnet/prob1:0'), feed_dict={'pnet/input:0':img}) + rnet_fun = lambda img : 
sess.run(('rnet/conv5-2/conv5-2:0', 'rnet/prob1:0'), feed_dict={'rnet/input:0':img}) + onet_fun = lambda img : sess.run(('onet/conv6-2/conv6-2:0', 'onet/conv6-3/conv6-3:0', 'onet/prob1:0'), feed_dict={'onet/input:0':img}) + return pnet_fun, rnet_fun, onet_fun + +def detect_face(img, minsize, pnet, rnet, onet, threshold, factor): + # im: input image + # minsize: minimum of faces' size + # pnet, rnet, onet: caffemodel + # threshold: threshold=[th1 th2 th3], th1-3 are three steps's threshold + # fastresize: resize img from last scale (using in high-resolution images) if fastresize==true + factor_count=0 + total_boxes=np.empty((0,9)) + points=[] + h=img.shape[0] + w=img.shape[1] + minl=np.amin([h, w]) + m=12.0/minsize + minl=minl*m + # creat scale pyramid + scales=[] + while minl>=12: + scales += [m*np.power(factor, factor_count)] + minl = minl*factor + factor_count += 1 + + # first stage + for j in range(len(scales)): + scale=scales[j] + hs=int(np.ceil(h*scale)) + ws=int(np.ceil(w*scale)) + im_data = imresample(img, (hs, ws)) + im_data = (im_data-127.5)*0.0078125 + img_x = np.expand_dims(im_data, 0) + img_y = np.transpose(img_x, (0,2,1,3)) + out = pnet(img_y) + out0 = np.transpose(out[0], (0,2,1,3)) + out1 = np.transpose(out[1], (0,2,1,3)) + + boxes, _ = generateBoundingBox(out1[0,:,:,1].copy(), out0[0,:,:,:].copy(), scale, threshold[0]) + + # inter-scale nms + pick = nms(boxes.copy(), 0.5, 'Union') + if boxes.size>0 and pick.size>0: + boxes = boxes[pick,:] + total_boxes = np.append(total_boxes, boxes, axis=0) + + numbox = total_boxes.shape[0] + if numbox>0: + pick = nms(total_boxes.copy(), 0.7, 'Union') + total_boxes = total_boxes[pick,:] + regw = total_boxes[:,2]-total_boxes[:,0] + regh = total_boxes[:,3]-total_boxes[:,1] + qq1 = total_boxes[:,0]+total_boxes[:,5]*regw + qq2 = total_boxes[:,1]+total_boxes[:,6]*regh + qq3 = total_boxes[:,2]+total_boxes[:,7]*regw + qq4 = total_boxes[:,3]+total_boxes[:,8]*regh + total_boxes = np.transpose(np.vstack([qq1, qq2, qq3, qq4, 
total_boxes[:,4]])) + total_boxes = rerec(total_boxes.copy()) + total_boxes[:,0:4] = np.fix(total_boxes[:,0:4]).astype(np.int32) + dy,edy, dx, edx, y, ey, x, ex, tmpw, tmph = pad(total_boxes.copy(), w, h) + + numbox = total_boxes.shape[0] + if numbox>0: + # second stage + tempimg = np.zeros((24,24,3,numbox)) + for k in range(0,numbox): + tmp = np.zeros((int(tmph[k]),int(tmpw[k]),3)) + tmp[dy[k]-1:edy[k],dx[k]-1:edx[k],:] = img[y[k]-1:ey[k],x[k]-1:ex[k],:] + if tmp.shape[0]>0 and tmp.shape[1]>0 or tmp.shape[0]==0 and tmp.shape[1]==0: + tempimg[:,:,:,k] = imresample(tmp, (24, 24)) + else: + return np.empty() + tempimg = (tempimg-127.5)*0.0078125 + tempimg1 = np.transpose(tempimg, (3,1,0,2)) + out = rnet(tempimg1) + out0 = np.transpose(out[0]) + out1 = np.transpose(out[1]) + score = out1[1,:] + ipass = np.where(score>threshold[1]) + total_boxes = np.hstack([total_boxes[ipass[0],0:4].copy(), np.expand_dims(score[ipass].copy(),1)]) + mv = out0[:,ipass[0]] + if total_boxes.shape[0]>0: + pick = nms(total_boxes, 0.7, 'Union') + total_boxes = total_boxes[pick,:] + total_boxes = bbreg(total_boxes.copy(), np.transpose(mv[:,pick])) + total_boxes = rerec(total_boxes.copy()) + + numbox = total_boxes.shape[0] + if numbox>0: + # third stage + total_boxes = np.fix(total_boxes).astype(np.int32) + dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = pad(total_boxes.copy(), w, h) + tempimg = np.zeros((48,48,3,numbox)) + for k in range(0,numbox): + tmp = np.zeros((int(tmph[k]),int(tmpw[k]),3)) + tmp[dy[k]-1:edy[k],dx[k]-1:edx[k],:] = img[y[k]-1:ey[k],x[k]-1:ex[k],:] + if tmp.shape[0]>0 and tmp.shape[1]>0 or tmp.shape[0]==0 and tmp.shape[1]==0: + tempimg[:,:,:,k] = imresample(tmp, (48, 48)) + else: + return np.empty() + tempimg = (tempimg-127.5)*0.0078125 + tempimg1 = np.transpose(tempimg, (3,1,0,2)) + out = onet(tempimg1) + out0 = np.transpose(out[0]) + out1 = np.transpose(out[1]) + out2 = np.transpose(out[2]) + score = out2[1,:] + points = out1 + ipass = np.where(score>threshold[2]) + points 
= points[:,ipass[0]] + total_boxes = np.hstack([total_boxes[ipass[0],0:4].copy(), np.expand_dims(score[ipass].copy(),1)]) + mv = out0[:,ipass[0]] + + w = total_boxes[:,2]-total_boxes[:,0]+1 + h = total_boxes[:,3]-total_boxes[:,1]+1 + points[0:5,:] = np.tile(w,(5, 1))*points[0:5,:] + np.tile(total_boxes[:,0],(5, 1))-1 + points[5:10,:] = np.tile(h,(5, 1))*points[5:10,:] + np.tile(total_boxes[:,1],(5, 1))-1 + if total_boxes.shape[0]>0: + total_boxes = bbreg(total_boxes.copy(), np.transpose(mv)) + pick = nms(total_boxes.copy(), 0.7, 'Min') + total_boxes = total_boxes[pick,:] + points = points[:,pick] + + return total_boxes, points + +def detect_face_force(img, bbox, pnet, rnet, onet): + total_boxes = np.zeros( (1,5), dtype=np.float32) + total_boxes[0,0:4] = bbox + threshold = [0.0,0.0,0.0] + h=img.shape[0] + w=img.shape[1] + numbox = total_boxes.shape[0] + if numbox>0: + dy,edy, dx, edx, y, ey, x, ex, tmpw, tmph = pad(total_boxes.copy(), w, h) + # second stage + tempimg = np.zeros((24,24,3,numbox)) + for k in range(0,numbox): + tmp = np.zeros((int(tmph[k]),int(tmpw[k]),3)) + tmp[dy[k]-1:edy[k],dx[k]-1:edx[k],:] = img[y[k]-1:ey[k],x[k]-1:ex[k],:] + if tmp.shape[0]>0 and tmp.shape[1]>0 or tmp.shape[0]==0 and tmp.shape[1]==0: + tempimg[:,:,:,k] = imresample(tmp, (24, 24)) + else: + return np.empty() + tempimg = (tempimg-127.5)*0.0078125 + tempimg1 = np.transpose(tempimg, (3,1,0,2)) + out = rnet(tempimg1) + out0 = np.transpose(out[0]) + out1 = np.transpose(out[1]) + score = out1[1,:] + ipass = np.where(score>threshold[1]) + total_boxes = np.hstack([total_boxes[ipass[0],0:4].copy(), np.expand_dims(score[ipass].copy(),1)]) + mv = out0[:,ipass[0]] + if total_boxes.shape[0]>0: + pick = nms(total_boxes, 0.7, 'Union') + total_boxes = total_boxes[pick,:] + total_boxes = bbreg(total_boxes.copy(), np.transpose(mv[:,pick])) + total_boxes = rerec(total_boxes.copy()) + + numbox = total_boxes.shape[0] + if numbox>0: + # third stage + total_boxes = np.fix(total_boxes).astype(np.int32) + 
dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = pad(total_boxes.copy(), w, h) + tempimg = np.zeros((48,48,3,numbox)) + for k in range(0,numbox): + tmp = np.zeros((int(tmph[k]),int(tmpw[k]),3)) + tmp[dy[k]-1:edy[k],dx[k]-1:edx[k],:] = img[y[k]-1:ey[k],x[k]-1:ex[k],:] + if tmp.shape[0]>0 and tmp.shape[1]>0 or tmp.shape[0]==0 and tmp.shape[1]==0: + tempimg[:,:,:,k] = imresample(tmp, (48, 48)) + else: + return np.empty() + tempimg = (tempimg-127.5)*0.0078125 + tempimg1 = np.transpose(tempimg, (3,1,0,2)) + out = onet(tempimg1) + out0 = np.transpose(out[0]) + out1 = np.transpose(out[1]) + out2 = np.transpose(out[2]) + score = out2[1,:] + points = out1 + ipass = np.where(score>threshold[2]) + points = points[:,ipass[0]] + total_boxes = np.hstack([total_boxes[ipass[0],0:4].copy(), np.expand_dims(score[ipass].copy(),1)]) + mv = out0[:,ipass[0]] + + w = total_boxes[:,2]-total_boxes[:,0]+1 + h = total_boxes[:,3]-total_boxes[:,1]+1 + points[0:5,:] = np.tile(w,(5, 1))*points[0:5,:] + np.tile(total_boxes[:,0],(5, 1))-1 + points[5:10,:] = np.tile(h,(5, 1))*points[5:10,:] + np.tile(total_boxes[:,1],(5, 1))-1 + if total_boxes.shape[0]>0: + total_boxes = bbreg(total_boxes.copy(), np.transpose(mv)) + pick = nms(total_boxes.copy(), 0.7, 'Min') + total_boxes = total_boxes[pick,:] + points = points[:,pick] + + return total_boxes, points + +def bulk_detect_face(images, detection_window_size_ratio, pnet, rnet, onet, threshold, factor): + # im: input image + # minsize: minimum of faces' size + # pnet, rnet, onet: caffemodel + # threshold: threshold=[th1 th2 th3], th1-3 are three steps's threshold [0-1] + + all_scales = [None] * len(images) + images_with_boxes = [None] * len(images) + + for i in range(len(images)): + images_with_boxes[i] = {'total_boxes': np.empty((0, 9))} + + # create scale pyramid + for index, img in enumerate(images): + all_scales[index] = [] + h = img.shape[0] + w = img.shape[1] + minsize = int(detection_window_size_ratio * np.minimum(w, h)) + factor_count = 0 + minl = 
np.amin([h, w]) + if minsize <= 12: + minsize = 12 + + m = 12.0 / minsize + minl = minl * m + while minl >= 12: + all_scales[index].append(m * np.power(factor, factor_count)) + minl = minl * factor + factor_count += 1 + + # # # # # # # # # # # # # + # first stage - fast proposal network (pnet) to obtain face candidates + # # # # # # # # # # # # # + + images_obj_per_resolution = {} + + # TODO: use some type of rounding to number module 8 to increase probability that pyramid images will have the same resolution across input images + + for index, scales in enumerate(all_scales): + h = images[index].shape[0] + w = images[index].shape[1] + + for scale in scales: + hs = int(np.ceil(h * scale)) + ws = int(np.ceil(w * scale)) + + if (ws, hs) not in images_obj_per_resolution: + images_obj_per_resolution[(ws, hs)] = [] + + im_data = imresample(images[index], (hs, ws)) + im_data = (im_data - 127.5) * 0.0078125 + img_y = np.transpose(im_data, (1, 0, 2)) # caffe uses different dimensions ordering + images_obj_per_resolution[(ws, hs)].append({'scale': scale, 'image': img_y, 'index': index}) + + for resolution in images_obj_per_resolution: + images_per_resolution = [i['image'] for i in images_obj_per_resolution[resolution]] + outs = pnet(images_per_resolution) + + for index in range(len(outs[0])): + scale = images_obj_per_resolution[resolution][index]['scale'] + image_index = images_obj_per_resolution[resolution][index]['index'] + out0 = np.transpose(outs[0][index], (1, 0, 2)) + out1 = np.transpose(outs[1][index], (1, 0, 2)) + + boxes, _ = generateBoundingBox(out1[:, :, 1].copy(), out0[:, :, :].copy(), scale, threshold[0]) + + # inter-scale nms + pick = nms(boxes.copy(), 0.5, 'Union') + if boxes.size > 0 and pick.size > 0: + boxes = boxes[pick, :] + images_with_boxes[image_index]['total_boxes'] = np.append(images_with_boxes[image_index]['total_boxes'], + boxes, + axis=0) + + for index, image_obj in enumerate(images_with_boxes): + numbox = image_obj['total_boxes'].shape[0] + if 
numbox > 0: + h = images[index].shape[0] + w = images[index].shape[1] + pick = nms(image_obj['total_boxes'].copy(), 0.7, 'Union') + image_obj['total_boxes'] = image_obj['total_boxes'][pick, :] + regw = image_obj['total_boxes'][:, 2] - image_obj['total_boxes'][:, 0] + regh = image_obj['total_boxes'][:, 3] - image_obj['total_boxes'][:, 1] + qq1 = image_obj['total_boxes'][:, 0] + image_obj['total_boxes'][:, 5] * regw + qq2 = image_obj['total_boxes'][:, 1] + image_obj['total_boxes'][:, 6] * regh + qq3 = image_obj['total_boxes'][:, 2] + image_obj['total_boxes'][:, 7] * regw + qq4 = image_obj['total_boxes'][:, 3] + image_obj['total_boxes'][:, 8] * regh + image_obj['total_boxes'] = np.transpose(np.vstack([qq1, qq2, qq3, qq4, image_obj['total_boxes'][:, 4]])) + image_obj['total_boxes'] = rerec(image_obj['total_boxes'].copy()) + image_obj['total_boxes'][:, 0:4] = np.fix(image_obj['total_boxes'][:, 0:4]).astype(np.int32) + dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = pad(image_obj['total_boxes'].copy(), w, h) + + numbox = image_obj['total_boxes'].shape[0] + tempimg = np.zeros((24, 24, 3, numbox)) + + if numbox > 0: + for k in range(0, numbox): + tmp = np.zeros((int(tmph[k]), int(tmpw[k]), 3)) + tmp[dy[k] - 1:edy[k], dx[k] - 1:edx[k], :] = images[index][y[k] - 1:ey[k], x[k] - 1:ex[k], :] + if tmp.shape[0] > 0 and tmp.shape[1] > 0 or tmp.shape[0] == 0 and tmp.shape[1] == 0: + tempimg[:, :, :, k] = imresample(tmp, (24, 24)) + else: + return np.empty() + + tempimg = (tempimg - 127.5) * 0.0078125 + image_obj['rnet_input'] = np.transpose(tempimg, (3, 1, 0, 2)) + + # # # # # # # # # # # # # + # second stage - refinement of face candidates with rnet + # # # # # # # # # # # # # + + bulk_rnet_input = np.empty((0, 24, 24, 3)) + for index, image_obj in enumerate(images_with_boxes): + if 'rnet_input' in image_obj: + bulk_rnet_input = np.append(bulk_rnet_input, image_obj['rnet_input'], axis=0) + + out = rnet(bulk_rnet_input) + out0 = np.transpose(out[0]) + out1 = np.transpose(out[1]) + 
score = out1[1, :] + + i = 0 + for index, image_obj in enumerate(images_with_boxes): + if 'rnet_input' not in image_obj: + continue + + rnet_input_count = image_obj['rnet_input'].shape[0] + score_per_image = score[i:i + rnet_input_count] + out0_per_image = out0[:, i:i + rnet_input_count] + + ipass = np.where(score_per_image > threshold[1]) + image_obj['total_boxes'] = np.hstack([image_obj['total_boxes'][ipass[0], 0:4].copy(), + np.expand_dims(score_per_image[ipass].copy(), 1)]) + + mv = out0_per_image[:, ipass[0]] + + if image_obj['total_boxes'].shape[0] > 0: + h = images[index].shape[0] + w = images[index].shape[1] + pick = nms(image_obj['total_boxes'], 0.7, 'Union') + image_obj['total_boxes'] = image_obj['total_boxes'][pick, :] + image_obj['total_boxes'] = bbreg(image_obj['total_boxes'].copy(), np.transpose(mv[:, pick])) + image_obj['total_boxes'] = rerec(image_obj['total_boxes'].copy()) + + numbox = image_obj['total_boxes'].shape[0] + + if numbox > 0: + tempimg = np.zeros((48, 48, 3, numbox)) + image_obj['total_boxes'] = np.fix(image_obj['total_boxes']).astype(np.int32) + dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph = pad(image_obj['total_boxes'].copy(), w, h) + + for k in range(0, numbox): + tmp = np.zeros((int(tmph[k]), int(tmpw[k]), 3)) + tmp[dy[k] - 1:edy[k], dx[k] - 1:edx[k], :] = images[index][y[k] - 1:ey[k], x[k] - 1:ex[k], :] + if tmp.shape[0] > 0 and tmp.shape[1] > 0 or tmp.shape[0] == 0 and tmp.shape[1] == 0: + tempimg[:, :, :, k] = imresample(tmp, (48, 48)) + else: + return np.empty() + tempimg = (tempimg - 127.5) * 0.0078125 + image_obj['onet_input'] = np.transpose(tempimg, (3, 1, 0, 2)) + + i += rnet_input_count + + # # # # # # # # # # # # # + # third stage - further refinement and facial landmarks positions with onet + # # # # # # # # # # # # # + + bulk_onet_input = np.empty((0, 48, 48, 3)) + for index, image_obj in enumerate(images_with_boxes): + if 'onet_input' in image_obj: + bulk_onet_input = np.append(bulk_onet_input, image_obj['onet_input'], 
axis=0) + + out = onet(bulk_onet_input) + + out0 = np.transpose(out[0]) + out1 = np.transpose(out[1]) + out2 = np.transpose(out[2]) + score = out2[1, :] + points = out1 + + i = 0 + ret = [] + for index, image_obj in enumerate(images_with_boxes): + if 'onet_input' not in image_obj: + ret.append(None) + continue + + onet_input_count = image_obj['onet_input'].shape[0] + + out0_per_image = out0[:, i:i + onet_input_count] + score_per_image = score[i:i + onet_input_count] + points_per_image = points[:, i:i + onet_input_count] + + ipass = np.where(score_per_image > threshold[2]) + points_per_image = points_per_image[:, ipass[0]] + + image_obj['total_boxes'] = np.hstack([image_obj['total_boxes'][ipass[0], 0:4].copy(), + np.expand_dims(score_per_image[ipass].copy(), 1)]) + mv = out0_per_image[:, ipass[0]] + + w = image_obj['total_boxes'][:, 2] - image_obj['total_boxes'][:, 0] + 1 + h = image_obj['total_boxes'][:, 3] - image_obj['total_boxes'][:, 1] + 1 + points_per_image[0:5, :] = np.tile(w, (5, 1)) * points_per_image[0:5, :] + np.tile( + image_obj['total_boxes'][:, 0], (5, 1)) - 1 + points_per_image[5:10, :] = np.tile(h, (5, 1)) * points_per_image[5:10, :] + np.tile( + image_obj['total_boxes'][:, 1], (5, 1)) - 1 + + if image_obj['total_boxes'].shape[0] > 0: + image_obj['total_boxes'] = bbreg(image_obj['total_boxes'].copy(), np.transpose(mv)) + pick = nms(image_obj['total_boxes'].copy(), 0.7, 'Min') + image_obj['total_boxes'] = image_obj['total_boxes'][pick, :] + points_per_image = points_per_image[:, pick] + + ret.append((image_obj['total_boxes'], points_per_image)) + else: + ret.append(None) + + i += onet_input_count + + return ret + + +# function [boundingbox] = bbreg(boundingbox,reg) +def bbreg(boundingbox,reg): + # calibrate bounding boxes + if reg.shape[1]==1: + reg = np.reshape(reg, (reg.shape[2], reg.shape[3])) + + w = boundingbox[:,2]-boundingbox[:,0]+1 + h = boundingbox[:,3]-boundingbox[:,1]+1 + b1 = boundingbox[:,0]+reg[:,0]*w + b2 = boundingbox[:,1]+reg[:,1]*h + 
b3 = boundingbox[:,2]+reg[:,2]*w + b4 = boundingbox[:,3]+reg[:,3]*h + boundingbox[:,0:4] = np.transpose(np.vstack([b1, b2, b3, b4 ])) + return boundingbox + +def generateBoundingBox(imap, reg, scale, t): + # use heatmap to generate bounding boxes + stride=2 + cellsize=12 + + imap = np.transpose(imap) + dx1 = np.transpose(reg[:,:,0]) + dy1 = np.transpose(reg[:,:,1]) + dx2 = np.transpose(reg[:,:,2]) + dy2 = np.transpose(reg[:,:,3]) + y, x = np.where(imap >= t) + if y.shape[0]==1: + dx1 = np.flipud(dx1) + dy1 = np.flipud(dy1) + dx2 = np.flipud(dx2) + dy2 = np.flipud(dy2) + score = imap[(y,x)] + reg = np.transpose(np.vstack([ dx1[(y,x)], dy1[(y,x)], dx2[(y,x)], dy2[(y,x)] ])) + if reg.size==0: + reg = np.empty((0,3)) + bb = np.transpose(np.vstack([y,x])) + q1 = np.fix((stride*bb+1)/scale) + q2 = np.fix((stride*bb+cellsize-1+1)/scale) + boundingbox = np.hstack([q1, q2, np.expand_dims(score,1), reg]) + return boundingbox, reg + +# function pick = nms(boxes,threshold,type) +def nms(boxes, threshold, method): + if boxes.size==0: + return np.empty((0,3)) + x1 = boxes[:,0] + y1 = boxes[:,1] + x2 = boxes[:,2] + y2 = boxes[:,3] + s = boxes[:,4] + area = (x2-x1+1) * (y2-y1+1) + I = np.argsort(s) + pick = np.zeros_like(s, dtype=np.int16) + counter = 0 + while I.size>0: + i = I[-1] + pick[counter] = i + counter += 1 + idx = I[0:-1] + xx1 = np.maximum(x1[i], x1[idx]) + yy1 = np.maximum(y1[i], y1[idx]) + xx2 = np.minimum(x2[i], x2[idx]) + yy2 = np.minimum(y2[i], y2[idx]) + w = np.maximum(0.0, xx2-xx1+1) + h = np.maximum(0.0, yy2-yy1+1) + inter = w * h + if method is 'Min': + o = inter / np.minimum(area[i], area[idx]) + else: + o = inter / (area[i] + area[idx] - inter) + I = I[np.where(o<=threshold)] + pick = pick[0:counter] + return pick + +# function [dy edy dx edx y ey x ex tmpw tmph] = pad(total_boxes,w,h) +def pad(total_boxes, w, h): + # compute the padding coordinates (pad the bounding boxes to square) + tmpw = (total_boxes[:,2]-total_boxes[:,0]+1).astype(np.int32) + tmph = 
(total_boxes[:,3]-total_boxes[:,1]+1).astype(np.int32) + numbox = total_boxes.shape[0] + + dx = np.ones((numbox), dtype=np.int32) + dy = np.ones((numbox), dtype=np.int32) + edx = tmpw.copy().astype(np.int32) + edy = tmph.copy().astype(np.int32) + + x = total_boxes[:,0].copy().astype(np.int32) + y = total_boxes[:,1].copy().astype(np.int32) + ex = total_boxes[:,2].copy().astype(np.int32) + ey = total_boxes[:,3].copy().astype(np.int32) + + tmp = np.where(ex>w) + edx.flat[tmp] = np.expand_dims(-ex[tmp]+w+tmpw[tmp],1) + ex[tmp] = w + + tmp = np.where(ey>h) + edy.flat[tmp] = np.expand_dims(-ey[tmp]+h+tmph[tmp],1) + ey[tmp] = h + + tmp = np.where(x<1) + dx.flat[tmp] = np.expand_dims(2-x[tmp],1) + x[tmp] = 1 + + tmp = np.where(y<1) + dy.flat[tmp] = np.expand_dims(2-y[tmp],1) + y[tmp] = 1 + + return dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph + +# function [bboxA] = rerec(bboxA) +def rerec(bboxA): + # convert bboxA to square + h = bboxA[:,3]-bboxA[:,1] + w = bboxA[:,2]-bboxA[:,0] + l = np.maximum(w, h) + bboxA[:,0] = bboxA[:,0]+w*0.5-l*0.5 + bboxA[:,1] = bboxA[:,1]+h*0.5-l*0.5 + bboxA[:,2:4] = bboxA[:,0:2] + np.transpose(np.tile(l,(2,1))) + return bboxA + +def imresample(img, sz): + im_data = cv2.resize(img, (sz[1], sz[0]), interpolation=cv2.INTER_AREA) #@UndefinedVariable + return im_data + + # This method is kept for debugging purpose +# h=img.shape[0] +# w=img.shape[1] +# hs, ws = sz +# dx = float(w) / ws +# dy = float(h) / hs +# im_data = np.zeros((hs,ws,3)) +# for a1 in range(0,hs): +# for a2 in range(0,ws): +# for a3 in range(0,3): +# im_data[a1,a2,a3] = img[int(floor(a1*dy)),int(floor(a2*dx)),a3] +# return im_data + diff --git a/src/common/face2rec2.py b/src/common/face2rec2.py new file mode 100644 index 0000000..46b6115 --- /dev/null +++ b/src/common/face2rec2.py @@ -0,0 +1,251 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# -*- coding: utf-8 -*- +from __future__ import print_function +import os +import sys + +#curr_path = os.path.abspath(os.path.dirname(__file__)) +#sys.path.append(os.path.join(curr_path, "../python")) +import mxnet as mx +import random +import argparse +import cv2 +import time +import traceback +#from builtins import range +from easydict import EasyDict as edict +import face_preprocess + +try: + import multiprocessing +except ImportError: + multiprocessing = None + + + +def read_list(path_in): + with open(path_in) as fin: + identities = [] + last = [-1, -1] + _id = 1 + while True: + line = fin.readline() + if not line: + break + item = edict() + item.flag = 0 + item.image_path, item.label, item.bbox, item.landmark, item.aligned = face_preprocess.parse_lst_line(line) + item.id = _id + yield item + if item.label!=last[0]: + if last[1]>=0: + identities.append( (last[1], _id) ) + last[0] = item.label + last[1] = _id + _id+=1 + identities.append( (last[1], _id) ) + item = edict() + item.flag = 1 + item.id = 0 + item.label = [float(_id), float(_id+len(identities))] + yield item + for identity in identities: + item = edict() + item.flag = 2 + item.id = _id + _id+=1 + item.label = [float(identity[0]), float(identity[1])] + yield item + + + +def image_encode(args, i, item, q_out): + oitem = 
[item.id] + if item.flag==0: + fullpath = item.image_path + header = mx.recordio.IRHeader(item.flag, item.label, item.id, 0) + #print('write', item.flag, item.id, item.label) + if item.aligned: + with open(fullpath, 'rb') as fin: + img = fin.read() + s = mx.recordio.pack(header, img) + q_out.put((i, s, oitem)) + else: + img = cv2.imread(fullpath, args.color) + assert item.landmark is not None + img = face_preprocess.preprocess(img, bbox = item.bbox, landmark=item.landmark, image_size='112,112') + s = mx.recordio.pack_img(header, img, quality=args.quality, img_fmt=args.encoding) + q_out.put((i, s, oitem)) + else: #flag==1 or 2 + header = mx.recordio.IRHeader(item.flag, item.label, item.id, 0) + #print('write', item.flag, item.id, item.label) + s = mx.recordio.pack(header, '') + q_out.put((i, s, oitem)) + + +def read_worker(args, q_in, q_out): + while True: + deq = q_in.get() + if deq is None: + break + i, item = deq + image_encode(args, i, item, q_out) + +def write_worker(q_out, fname, working_dir): + pre_time = time.time() + count = 0 + fname = os.path.basename(fname) + fname_rec = os.path.splitext(fname)[0] + '.rec' + fname_idx = os.path.splitext(fname)[0] + '.idx' + record = mx.recordio.MXIndexedRecordIO(os.path.join(working_dir, fname_idx), + os.path.join(working_dir, fname_rec), 'w') + buf = {} + more = True + while more: + deq = q_out.get() + if deq is not None: + i, s, item = deq + buf[i] = (s, item) + else: + more = False + while count in buf: + s, item = buf[count] + del buf[count] + if s is not None: + record.write_idx(item[0], s) + + if count % 1000 == 0: + cur_time = time.time() + print('time:', cur_time - pre_time, ' count:', count) + pre_time = cur_time + count += 1 + +def parse_args(): + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description='Create an image list or \ + make a record database by reading from an image list') + parser.add_argument('prefix', help='prefix of input/output lst and rec 
files.') + #parser.add_argument('root', help='path to folder containing images.') + + cgroup = parser.add_argument_group('Options for creating image lists') + cgroup.add_argument('--list', type=bool, default=False, + help='If this is set im2rec will create image list(s) by traversing root folder\ + and output to .lst.\ + Otherwise im2rec will read .lst and create a database at .rec') + cgroup.add_argument('--exts', nargs='+', default=['.jpeg', '.jpg'], + help='list of acceptable image extensions.') + cgroup.add_argument('--chunks', type=int, default=1, help='number of chunks.') + cgroup.add_argument('--train-ratio', type=float, default=1.0, + help='Ratio of images to use for training.') + cgroup.add_argument('--test-ratio', type=float, default=0, + help='Ratio of images to use for testing.') + cgroup.add_argument('--recursive', type=bool, default=False, + help='If true recursively walk through subdirs and assign an unique label\ + to images in each folder. Otherwise only include images in the root folder\ + and give them label 0.') + cgroup.add_argument('--shuffle', type=bool, default=True, help='If this is set as True, \ + im2rec will randomize the image order in .lst') + + rgroup = parser.add_argument_group('Options for creating database') + rgroup.add_argument('--quality', type=int, default=95, + help='JPEG quality for encoding, 1-100; or PNG compression for encoding, 1-9') + rgroup.add_argument('--num-thread', type=int, default=1, + help='number of thread to use for encoding. order of images will be different\ + from the input list if >1. the input list will be modified to match the\ + resulting order.') + rgroup.add_argument('--color', type=int, default=1, choices=[-1, 0, 1], + help='specify the color mode of the loaded image.\ + 1: Loads a color image. Any transparency of image will be neglected. 
It is the default flag.\ + 0: Loads image in grayscale mode.\ + -1:Loads image as such including alpha channel.') + rgroup.add_argument('--encoding', type=str, default='.jpg', choices=['.jpg', '.png'], + help='specify the encoding of the images.') + rgroup.add_argument('--pack-label', type=bool, default=False, + help='Whether to also pack multi dimensional label in the record file') + rgroup.add_argument('--image-size', type=str, default='112,96', choices=['112,96', '112,112'], help='image size, set to 112,96 or 112,112') + args = parser.parse_args() + args.prefix = os.path.abspath(args.prefix) + #args.root = os.path.abspath(args.root) + return args + +if __name__ == '__main__': + args = parse_args() + if args.list: + make_list(args) + else: + if os.path.isdir(args.prefix): + working_dir = args.prefix + else: + working_dir = os.path.dirname(args.prefix) + files = [os.path.join(working_dir, fname) for fname in os.listdir(working_dir) + if os.path.isfile(os.path.join(working_dir, fname))] + count = 0 + for fname in files: + if fname.startswith(args.prefix) and fname.endswith('.lst'): + print('Creating .rec file from', fname, 'in', working_dir) + count += 1 + image_list = read_list(fname) + # -- write_record -- # + if args.num_thread > 1 and multiprocessing is not None: + q_in = [multiprocessing.Queue(1024) for i in range(args.num_thread)] + q_out = multiprocessing.Queue(1024) + read_process = [multiprocessing.Process(target=read_worker, args=(args, q_in[i], q_out)) \ + for i in range(args.num_thread)] + for p in read_process: + p.start() + write_process = multiprocessing.Process(target=write_worker, args=(q_out, fname, working_dir)) + write_process.start() + + for i, item in enumerate(image_list): + q_in[i % len(q_in)].put((i, item)) + for q in q_in: + q.put(None) + for p in read_process: + p.join() + + q_out.put(None) + write_process.join() + else: + print('multiprocessing not available, fall back to single threaded encoding') + try: + import Queue as queue + except 
ImportError: + import queue + q_out = queue.Queue() + fname = os.path.basename(fname) + fname_rec = os.path.splitext(fname)[0] + '.rec' + fname_idx = os.path.splitext(fname)[0] + '.idx' + record = mx.recordio.MXIndexedRecordIO(os.path.join(working_dir, fname_idx), + os.path.join(working_dir, fname_rec), 'w') + cnt = 0 + pre_time = time.time() + for i, item in enumerate(image_list): + image_encode(args, i, item, q_out) + if q_out.empty(): + continue + _, s, _ = q_out.get() + record.write_idx(item[0], s) + if cnt % 1000 == 0: + cur_time = time.time() + print('time:', cur_time - pre_time, ' count:', cnt) + pre_time = cur_time + cnt += 1 + if not count: + print('Did not find and list file with prefix %s'%args.prefix) diff --git a/src/inceptions.py b/src/inceptions.py deleted file mode 100644 index 2f1a8bd..0000000 --- a/src/inceptions.py +++ /dev/null @@ -1,720 +0,0 @@ -# -*- coding:utf-8 -*- -__author__ = 'zhangshuai' -modified_date = '16/7/5' -__modify__ = 'anchengwu' -modified_date = '17/2/22' -__modify2__ = 'weiyangwang' -modified_date = '17/9/20' - - -''' -Inception v4 , suittable for image with around 299 x 299 - -Reference: - Inception-v4, Inception-ResNet and the Impact of Residual Connections on Learning - Christian Szegedy, Sergey Ioffe, Vincent Vanhoucke - arXiv.1602.07261 - - -Inception V3, suitable for images with around 299 x 299 - -Reference: - -Szegedy, Christian, et al. "Rethinking the Inception Architecture for Computer Vision." arXiv preprint arXiv:1512.00567 (2015). - -''' - - -# -------------------------------------------------------- - -# Modified By DeepInsight - -# 0. Make Code Tidier (with exec) -# 1. Scalable Inception V3, V4, -resnetV2 -# 2. Todo: Modified For XCeption, make Conv11 num_group_11 and Other Conv num_group independent. -# 3. Todo: Module Options: Deformable, Attention Along Features/Along Image -# 4. Todo: Adaptive Encoder-Decoder Symbol For Segmenter -# 5. 
Todo: Adaptive Symbol For Detector - -# -------------------------------------------------------- - - -import mxnet as mx -import numpy as np - -######## Inception Common: - -## Todo: Deformable, Attention - -def Conv(data, num_filter, num_group = 1, kernel=(1, 1), stride=(1, 1), pad=(0, 0), \ - act_type="relu", mirror_attr={}, with_act=True, name=None, suffix=''): - - conv = mx.sym.Convolution(data=data, num_filter=num_filter, num_group=num_group, kernel=kernel, stride=stride, pad=pad, no_bias=True, name='%s%s_conv2d' %(name, suffix)) - bn = mx.sym.BatchNorm(data=conv, name='%s%s_batchnorm' %(name, suffix), fix_gamma=True) - if with_act: - act = mx.sym.Activation(data=bn, act_type=act_type, name='%s%s_relu' %(name, suffix)) - return act - else: - return bn - -def get_input_size(lastout=8): - input_size = 2*lastout + 1 # 17 - input_size = 2*input_size + 1 # 35 - input_size = 2*input_size + 1 # 71 - input_size = input_size + 2 # 73 - input_size = 2*input_size + 1 # 147 - input_size = input_size + 2 # 149 - input_size = 2*input_size + 1 # 299 - return input_size - - -######## Inception ResNetv2: Scalable, XCeptionized - -# Todo Scalable and XCeptionized - -''' Fade-away ConvFactory - -def ConvFactory(data, num_filter, kernel, stride=(1, 1), pad=(0, 0), act_type="relu", mirror_attr={}, with_act=True): - conv = mx.symbol.Convolution( - data=data, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad) - bn = mx.symbol.BatchNorm(data=conv) - if with_act: - act = mx.symbol.Activation( - data=bn, act_type=act_type, attr=mirror_attr) - return act - else: - return bn -''' - -def block35_irv2(net, input_num_channels, - basefilter=16, num_group=1 ,num_group_11=1, scale=1.0, - with_act=True, act_type='relu', mirror_attr={}, name=None): - #Conv11 - tower_conv = Conv(net, basefilter*2, num_group=num_group_11, kernel=(1, 1), name=name+'_35b11') - #Conv11-Conv33 - tower_conv1_0 = Conv(net, basefilter*2, num_group=num_group_11, kernel=(1, 1), name=name+'_35b21') - 
tower_conv1_1 = Conv(tower_conv1_0, basefilter*2, num_group=num_group, kernel=(3, 3), pad=(1, 1), name=name+'_35b22') - #Conv11-Conv33-Conv33 - tower_conv2_0 = Conv(net, basefilter*2, num_group=num_group_11,kernel=(1, 1), name=name+'_35b31') - tower_conv2_1 = Conv(tower_conv2_0, basefilter*3, num_group=num_group, kernel=(3, 3), pad=(1, 1), name=name+'_35b32') - tower_conv2_2 = Conv(tower_conv2_1, basefilter*4, num_group=num_group, kernel=(3, 3), pad=(1, 1), name=name+'_35b33') - #Concat - tower_mixed = mx.symbol.Concat(*[tower_conv, tower_conv1_1, tower_conv2_2]) - tower_out = Conv(tower_mixed, input_num_channels, num_group=num_group_11, kernel=(1, 1), with_act=False, name=name+'_35out') - - - net = net + tower_out * scale - if with_act: - act = mx.symbol.Activation( - data=net, act_type=act_type, attr=mirror_attr) - return act - else: - return net - - -def block17_irv2(net, input_num_channels, - basefilter=32, num_group=1 ,num_group_11=1, scale=1.0, - with_act=True, act_type='relu', mirror_attr={}, name=None): - # Conv11 - tower_conv = Conv(net, basefilter*6, num_group=num_group_11, kernel=(1, 1), name=name+'_17b11') - # Conv11-Conv17-Conv71 - tower_conv1_0 = Conv(net, basefilter*6, num_group=num_group_11, kernel=(1, 1), name=name+'_17b21') - tower_conv1_1 = Conv(tower_conv1_0, basefilter*5, num_group=num_group, kernel=(1, 7), pad=(1, 2), name=name+'_17b22') - tower_conv1_2 = Conv(tower_conv1_1, basefilter*6, num_group=num_group, kernel=(7, 1), pad=(2, 1), name=name+'_17b23') - # Concat - tower_mixed = mx.symbol.Concat(*[tower_conv, tower_conv1_2]) - # Conv11 - tower_out = Conv( - tower_mixed, input_num_channels, num_group=num_group_11, kernel=(1, 1), with_act=False, name=name+'_17out') - net = net + tower_out * scale - if with_act: - act = mx.symbol.Activation( - data=net, act_type=act_type, attr=mirror_attr) - return act - else: - return net - - -def block8_irv2(net, input_num_channels, - basefilter=32, num_group=1 ,num_group_11=1, scale=1.0, - with_act=True, 
act_type='relu', mirror_attr={}, name=None): - # Conv11 - tower_conv = Conv(net, basefilter*6, num_group=num_group_11, kernel=(1, 1), name=name+'_8b11') - # Conv11-Conv13-Conv31 - tower_conv1_0 = Conv(net, basefilter*6, num_group=num_group_11, kernel=(1, 1), name=name+'_8b21') - tower_conv1_1 = Conv(tower_conv1_0, basefilter*7, num_group=num_group, kernel=(1, 3), pad=(0, 1), name=name+'_8b22') - tower_conv1_2 = Conv(tower_conv1_1, basefilter*8, num_group=num_group, kernel=(3, 1), pad=(1, 0), name=name+'_8b23') - #Concat - tower_mixed = mx.symbol.Concat(*[tower_conv, tower_conv1_2]) - #Conv11 - tower_out = Conv( - tower_mixed, input_num_channels, num_group=num_group_11, kernel=(1, 1), with_act=False, name=name+'_8out') - - net = net + tower_out * scale - if with_act: - act = mx.symbol.Activation( - data=net, act_type=act_type, attr=mirror_attr) - return act - else: - return net - - -def repeat(inputs, repetitions, layer, name=None, *args, **kwargs): - outputs = inputs - for i in range(repetitions): - outputs = layer(outputs, name=name+'_'+str(i), *args, **kwargs) - return outputs - - -def get_symbol_irv2(num_classes=1000, - basefilter=16, num_group=1 ,num_group_11=1, scale=1.0, - lastout = 8, - units = [10,20,9], - **kwargs): - data = mx.symbol.Variable(name='data') - # Size 299 - # Stem 1 And Downsampling - conv1a_3_3 = Conv(data, - basefilter*2, num_group=num_group, - kernel=(3, 3), stride=(2, 2), name='conv1a') - # Size 149 - conv2a_3_3 = Conv(conv1a_3_3, basefilter*2, num_group=num_group, kernel=(3, 3), name='conv2a') - # Size 147 - conv2b_3_3 = Conv(conv2a_3_3, basefilter*4, num_group=num_group, kernel=(3, 3), pad=(1, 1), name='conv2b') - # Size 147 - maxpool3a_3_3 = mx.symbol.Pooling( - data=conv2b_3_3, kernel=(3, 3), stride=(2, 2), pool_type='max') - # Stem 2 And Downsampling - conv3b_1_1 = Conv(maxpool3a_3_3, basefilter*5, num_group=num_group_11, kernel=(1, 1), name='conv3b') - # 73 - conv4a_3_3 = Conv(conv3b_1_1, basefilter*12, num_group=num_group, 
kernel=(3, 3), name='conv4a') - # 71 - maxpool5a_3_3 = mx.symbol.Pooling( - data=conv4a_3_3, kernel=(3, 3), stride=(2, 2), pool_type='max') - - # Size 35 - # Stem 3 And Downsampling - # Branch31: Conv11 - tower_conv = Conv(maxpool5a_3_3, basefilter*6, num_group=num_group_11, kernel=(1, 1), name='branch31') - # Branch32: Conv11-Conv55 - tower_conv1_0 = Conv(maxpool5a_3_3, basefilter*3, num_group=num_group_11, kernel=(1, 1), name='branch321') - tower_conv1_1 = Conv(tower_conv1_0, basefilter*4, num_group=num_group, kernel=(5, 5), pad=(2, 2), name='branch322') - # Branch33: Conv11-Conv33-Conv33 - tower_conv2_0 = Conv(maxpool5a_3_3, basefilter*4, num_group=num_group_11, kernel=(1, 1), name='branch331') - tower_conv2_1 = Conv(tower_conv2_0, basefilter*6, num_group=num_group, kernel=(3, 3), pad=(1, 1), name='branch332') - tower_conv2_2 = Conv(tower_conv2_1, basefilter*6, num_group=num_group, kernel=(3, 3), pad=(1, 1), name='branch333') - # Branch34: Pool-Conv11 - tower_pool3_0 = mx.symbol.Pooling(data=maxpool5a_3_3, kernel=( - 3, 3), stride=(1, 1), pad=(1, 1), pool_type='avg') - tower_conv3_1 = Conv(tower_pool3_0, basefilter*4, num_group=num_group_11, kernel=(1, 1),name='branch34') - # Concat - tower_5b_out = mx.symbol.Concat( - *[tower_conv, tower_conv1_1, tower_conv2_2, tower_conv3_1]) - - # Repeat 1 - net = repeat(tower_5b_out, units[0], block35_irv2, scale=0.17, input_num_channels=basefilter*20,\ - basefilter=basefilter, num_group=num_group ,num_group_11=num_group_11, name='repeat1') - - # Size 35 - - # Branch 41 - tower_conv = Conv(net, basefilter*24, num_group=num_group, kernel=(3, 3), stride=(2, 2), name='branch41') - # Branch 42 - tower_conv1_0 = Conv(net, basefilter*16, num_group=num_group_11, kernel=(1, 1), name='branch421') - tower_conv1_1 = Conv(tower_conv1_0, basefilter*16, num_group=num_group, kernel=(3, 3), pad=(1, 1), name='branch422') - tower_conv1_2 = Conv(tower_conv1_1, basefilter*24, num_group=num_group, kernel=(3, 3), stride=(2, 2), name='branch423') 
- tower_pool = mx.symbol.Pooling(net, kernel=( - 3, 3), stride=(2, 2), pool_type='max') - - # Concat - net = mx.symbol.Concat(*[tower_conv, tower_conv1_2, tower_pool]) - # Repeat 2 - net = repeat(net, units[1], block17_irv2, scale=0.1, input_num_channels=basefilter*68,\ - basefilter=basefilter*2, num_group=num_group, num_group_11=num_group_11,name='repeat2') - - - # Size 17 - - # Branch51: Conv11-Conv33 - tower_conv = Conv(net, basefilter*16, num_group=num_group_11, kernel=(1, 1) ,name='branch511') - tower_conv0_1 = Conv(tower_conv, basefilter*24, num_group=num_group, kernel=(3, 3), stride=(2, 2) ,name='branch512') - # Branch52: Conv11-Conv33 ? Is this XCeption - tower_conv1 = Conv(net, basefilter*16, num_group=num_group_11, kernel=(1, 1) ,name='branch521') - tower_conv1_1 = Conv(tower_conv1, basefilter*18, num_group=num_group, kernel=(3, 3), stride=(2, 2) ,name='branch522') - # Branch53: Conv11-Conv33-Conv33 - tower_conv2 = Conv(net, basefilter*16, num_group=num_group_11, kernel=(1, 1) ,name='branch531') - tower_conv2_1 = Conv(tower_conv2, basefilter*18, num_group=num_group, kernel=(3, 3), pad=(1, 1) ,name='branch532') - tower_conv2_2 = Conv(tower_conv2_1, basefilter*20, num_group=num_group, kernel=(3, 3), stride=(2, 2) ,name='branch533') - # Pool33 - tower_pool = mx.symbol.Pooling(net, kernel=( - 3, 3), stride=(2, 2), pool_type='max') - net = mx.symbol.Concat( - *[tower_conv0_1, tower_conv1_1, tower_conv2_2, tower_pool]) - - # Size 8 - net = repeat(net, units[2], block8_irv2, scale=0.2, input_num_channels=basefilter*130,\ - basefilter=basefilter*2, num_group=num_group ,num_group_11=num_group_11,name='repeat3') - net = block8_irv2(net, with_act=False, input_num_channels=basefilter*130, - basefilter=basefilter*2, num_group=num_group ,num_group_11=num_group_11,name='block8') - - # Trailing - net = Conv(net, basefilter*96, num_group=num_group_11, kernel=(1, 1), name='trailing') - net = mx.symbol.Pooling(net, kernel=( - 1, 1), global_pool=True, stride=(2, 2), 
pool_type='avg') - net = mx.symbol.Flatten(net) - net = mx.symbol.Dropout(data=net, p=0.2) - net = mx.symbol.FullyConnected(data=net, num_hidden=num_classes) - softmax = mx.symbol.SoftmaxOutput(data=net, name='softmax') - return net, softmax - - - -######## Inception V4: Scalable, XCeptionized - -def Inception_stem_V4(data, basefilter=32, stem_num_group=1, stem_num_group_11=1, name= None): - - # Size 299 - c = Conv(data, basefilter, num_group=stem_num_group, kernel=(3, 3), stride=(2, 2), name='%s_conv1_3*3' %name) - # 149 - c = Conv(c, basefilter, num_group=stem_num_group, kernel=(3, 3), name='%s_conv2_3*3' %name) - # 147 - c = Conv(c, basefilter, num_group=stem_num_group, kernel=(3, 3), pad=(1, 1), name='%s_conv3_3*3' %name) - # 147 - p1 = mx.sym.Pooling(c, kernel=(3, 3), stride=(2, 2), pool_type='max', name='%s_maxpool_1' %name) - # 73 - c2 = Conv(c, basefilter*3, num_group=stem_num_group, kernel=(3, 3), stride=(2, 2), name='%s_conv4_3*3' %name) - concat = mx.sym.Concat(*[p1, c2], name='%s_concat_1' %name) - - c1 = Conv(concat, basefilter*2, num_group=stem_num_group_11, kernel=(1, 1), pad=(0, 0), name='%s_conv5_1*1' %name) - c1 = Conv(c1, basefilter*3, num_group=stem_num_group, kernel=(3, 3), name='%s_conv6_3*3' %name) - - # 71 - - c2 = Conv(concat, basefilter*2, num_group=stem_num_group_11, kernel=(1, 1), pad=(0, 0), name='%s_conv7_1*1' %name) - c2 = Conv(c2, basefilter*2, num_group=stem_num_group, kernel=(7, 1), pad=(3, 0), name='%s_conv8_7*1' %name) - c2 = Conv(c2, basefilter*2, num_group=stem_num_group, kernel=(1, 7), pad=(0, 3), name='%s_conv9_1*7' %name) - c2 = Conv(c2, basefilter*3, num_group=stem_num_group, kernel=(3, 3), pad=(0, 0), name='%s_conv10_3*3' %name) - - concat = mx.sym.Concat(*[c1, c2], name='%s_concat_2' %name) - - c1 = Conv(concat, basefilter*6, num_group=stem_num_group, kernel=(3, 3), stride=(2, 2), name='%s_conv11_3*3' %name) - p1 = mx.sym.Pooling(concat, kernel=(3, 3), stride=(2, 2), pool_type='max', name='%s_maxpool_2' %name) - - # 35 - 
- concat = mx.sym.Concat(*[c1, p1], name='%s_concat_3' %name) - return concat - - -def InceptionA_V4(input, basefilter=32, num_group=1 ,num_group_11=1, name=None): - # Pool33-Conv11 - p1 = mx.sym.Pooling(input, kernel=(3, 3), pad=(1, 1), pool_type='avg', name='%s_avgpool_1' %name) - c1 = Conv(p1, basefilter*3, kernel=(1, 1), num_group=num_group_11, pad=(0, 0), name='%s_conv1_1*1' %name) - # Conv11 - c2 = Conv(input, basefilter*3, kernel=(1, 1), num_group=num_group_11, pad=(0, 0), name='%s_conv2_1*1' %name) - # Conv11-Conv33 - c3 = Conv(input, basefilter*2, kernel=(1, 1), num_group=num_group_11, pad=(0, 0), name='%s_conv3_1*1' %name) - c3 = Conv(c3, basefilter*3, kernel=(3, 3), num_group=num_group, pad=(1, 1), name='%s_conv4_3*3' %name) - # Conv11-Conv33-Conv33 - c4 = Conv(input, basefilter*2, kernel=(1, 1), num_group=num_group_11, pad=(0, 0), name='%s_conv5_1*1' % name) - c4 = Conv(c4, basefilter*3, kernel=(3, 3), num_group=num_group, pad=(1, 1), name='%s_conv6_3*3' % name) - c4 = Conv(c4, basefilter*3, kernel=(3, 3), num_group=num_group, pad=(1, 1), name='%s_conv7_3*3' %name) - - concat = mx.sym.Concat(*[c1, c2, c3, c4], name='%s_concat_1' %name) - return concat - - -def ReductionA_V4(input, basefilter=32, num_group=1, num_group_11=1, name=None): - # Pool33 - p1 = mx.sym.Pooling(input, kernel=(3, 3), stride=(2, 2), pool_type='max', name='%s_maxpool_1' %name) - # Conv33 - c2 = Conv(input, basefilter*12, num_group=num_group, kernel=(3, 3), stride=(2, 2), name='%s_conv1_3*3' %name) - # Conv11-Conv33-Conv33 - c3 = Conv(input, basefilter*6, num_group=num_group_11, kernel=(1, 1), pad=(0, 0), name='%s_conv2_1*1' %name) - c3 = Conv(c3, basefilter*7, num_group=num_group, kernel=(3, 3), pad=(1, 1), name='%s_conv3_3*3' %name) - c3 = Conv(c3, basefilter*8, num_group=num_group, kernel=(3, 3), stride=(2, 2), pad=(0, 0), name='%s_conv4_3*3' %name) - - concat = mx.sym.Concat(*[p1, c2, c3], name='%s_concat_1' %name) - - return concat - -def InceptionB_V4(input, basefilter=32, 
num_group=1, num_group_11=1, name=None): - # Pool33-Conv11 - p1 = mx.sym.Pooling(input, kernel=(3, 3), pad=(1, 1), pool_type='avg', name='%s_avgpool_1' %name) - c1 = Conv(p1, basefilter*4, num_group=num_group_11, kernel=(1, 1), pad=(0, 0), name='%s_conv1_1*1' %name) - # Conv11 - c2 = Conv(input, basefilter*12, num_group=num_group_11, kernel=(1, 1), pad=(0, 0), name='%s_conv2_1*1' %name) - # Conv11-Conv17-Conv71 - c3 = Conv(input, basefilter*6, num_group=num_group_11, kernel=(1, 1), pad=(0, 0), name='%s_conv3_1*1' %name) - c3 = Conv(c3, basefilter*7, num_group=num_group, kernel=(1, 7), pad=(0, 3), name='%s_conv4_1*7' %name) - #paper wrong - c3 = Conv(c3, basefilter*8, num_group=num_group, kernel=(7, 1), pad=(3, 0), name='%s_conv5_1*7' %name) - - # COnv11-Conv17-Conv71-Conv17-Conv71 - c4 = Conv(input, basefilter*6, kernel=(1, 1), pad=(0, 0), name='%s_conv6_1*1' %name) - c4 = Conv(c4, basefilter*6, num_group=num_group, kernel=(1, 7), pad=(0, 3), name='%s_conv7_1*7' %name) - c4 = Conv(c4, basefilter*7, num_group=num_group, kernel=(7, 1), pad=(3, 0), name='%s_conv8_7*1' %name) - c4 = Conv(c4, basefilter*7, num_group=num_group, kernel=(1, 7), pad=(0, 3), name='%s_conv9_1*7' %name) - c4 = Conv(c4, basefilter*8, num_group=num_group, kernel=(7, 1), pad=(3, 0), name='%s_conv10_7*1' %name) - - concat = mx.sym.Concat(*[c1, c2, c3, c4], name='%s_concat_1' %name) - - return concat - -def ReductionB_V4(input, basefilter=64, num_group=1, num_group_11=1, name=None): - # Pool33 - p1 = mx.sym.Pooling(input, kernel=(3, 3), stride=(2, 2), pool_type='max', name='%s_maxpool_1' %name) - # Conv11-Conv33 - c2 = Conv(input, basefilter*3 , num_group=num_group_11, kernel=(1, 1), pad=(0, 0), name='%s_conv1_1*1' %name) - c2 = Conv(c2, basefilter*3, num_group=num_group, kernel=(3, 3), stride=(2, 2), name='%s_conv2_3*3' %name) - # Conv11-Conv17-Conv71-Conv33 - c3 = Conv(input, basefilter*3, num_group=num_group_11, kernel=(1, 1), pad=(0, 0), name='%s_conv3_1*1' %name) - c3 = Conv(c3, basefilter*4, 
num_group=num_group, kernel=(1, 7), pad=(0, 3), name='%s_conv4_1*7' %name) - c3 = Conv(c3, basefilter*5, num_group=num_group, kernel=(7, 1), pad=(3, 0), name='%s_conv5_7*1' %name) - c3 = Conv(c3, basefilter*5, num_group=num_group, kernel=(3, 3), stride=(2, 2), name='%s_conv6_3*3' %name) - - concat = mx.sym.Concat(*[p1, c2, c3], name='%s_concat_1' %name) - - return concat - - -def InceptionC_V4(input, basefilter=64, num_group=1, num_group_11=1, name=None): - # Pool33-Conv11 - p1 = mx.sym.Pooling(input, kernel=(3, 3), pad=(1, 1), pool_type='avg', name='%s_avgpool_1' %name) - c1 = Conv(p1, basefilter*4, num_group=num_group_11, kernel=(1, 1), pad=(0, 0), name='%s_conv1_1*1' %name) - # Conv11 - c2 = Conv(input, basefilter*4, num_group=num_group_11, kernel=(1, 1), pad=(0, 0), name='%s_conv2_1*1' %name) - # Conv11-[Conv13;Conv31] - c3 = Conv(input, basefilter*6, num_group=num_group_11, kernel=(1, 1), pad=(0, 0), name='%s_conv3_1*1' %name) - c3_1 = Conv(c3, basefilter*4, num_group=num_group, kernel=(1, 3), pad=(0, 1), name='%s_conv4_3*1' %name) - c3_2 = Conv(c3, basefilter*4, num_group=num_group, kernel=(3, 1), pad=(1, 0), name='%s_conv5_1*3' %name) - # Conv11-Conv13-Conv31-[Conv13;Conv31] - c4 = Conv(input, basefilter*6, num_group=num_group_11, kernel=(1, 1), pad=(0, 0), name='%s_conv6_1*1' %name) - c4 = Conv(c4, basefilter*7, num_group=num_group, kernel=(1, 3), pad=(0, 1), name='%s_conv7_1*3' %name) - c4 = Conv(c4, basefilter*8, num_group=num_group, kernel=(3, 1), pad=(1, 0), name='%s_conv8_3*1' %name) - c4_1 = Conv(c4, basefilter*4, num_group=num_group, kernel=(3, 1), pad=(1, 0), name='%s_conv9_1*3' %name) - c4_2 = Conv(c4, basefilter*4, num_group=num_group, kernel=(1, 3), pad=(0, 1), name='%s_conv10_3*1' %name) - - concat = mx.sym.Concat(*[c1, c2, c3_1, c3_2, c4_1, c4_2], name='%s_concat' %name) - - return concat - - -def get_symbol_V4(num_classes=1000, \ - units=[4,7,3], basefilter=32, num_group=1, num_group_11=1, \ - lastout=8, - dtype='float32', **kwargs): - data = 
mx.sym.Variable(name="data") - if dtype == 'float32': - data = mx.sym.identity(data=data, name='id') - else: - if dtype == 'float16': - data = mx.sym.Cast(data=data, dtype=np.float16) - x = Inception_stem_V4(data, - basefilter=basefilter, - stem_num_group=num_group, - stem_num_group_11=num_group_11, - name='in_stem') - - #4 * InceptionA By Default - - for i in range(units[0]): - x = InceptionA_V4(x, - basefilter=basefilter, - num_group=num_group, - num_group_11=num_group_11, - name='in%dA' %(i+1)) - - #Reduction A : Size 35-17 - x = ReductionA_V4(x, - basefilter=basefilter, - num_group=num_group, - num_group_11=num_group_11, - name='re1A') - - #7 * InceptionB By Default - - for i in range(units[1]): - x = InceptionB_V4(x, - basefilter=basefilter, - num_group=num_group, - num_group_11=num_group_11, - name='in%dB' %(i+1)) - - #ReductionB : Size 17-8 - x = ReductionB_V4(x, - basefilter=basefilter*2, - num_group=num_group, - num_group_11=num_group_11, - name='re1B') - - #3 * InceptionC By Default - - for i in range(units[2]): - x = InceptionC_V4(x, - basefilter=basefilter*2, - num_group=num_group, - num_group_11=num_group_11, - name='in%dC' %(i+1)) - - #Average Pooling - x = mx.sym.Pooling(x, kernel=(lastout, lastout), pad=(1, 1), pool_type='avg', name='global_avgpool') - - #Dropout - x = mx.sym.Dropout(x, p=0.2) - - flatten = mx.sym.Flatten(x, name='flatten') - fc1 = mx.sym.FullyConnected(flatten, num_hidden=num_classes, name='fc1') - if dtype == 'float16': - fc1 = mx.sym.Cast(data=fc1, dtype=np.float32) - softmax = mx.sym.SoftmaxOutput(fc1, name='softmax') - - return softmax - - - -######## Inception V3: Scalable, XCeptionized - -# First Stage -def Inception7A_V3(data, - basefilter=16, # - num_filters=[], # Length-7 - num_group=1, num_group_11=1, - pool='avg', name=''): - assert len(num_filters)==7 - num_1x1, num_3x3_red, num_3x3_1, num_3x3_2, num_5x5_red, num_5x5, proj = tuple( num_filters ) - # Branch 1 : Conv11 - tower_1x1 = Conv(data, basefilter*num_1x1, 
num_group=num_group_11, name=('%s_conv' % name)) - # Branch 2 : Conv11-Conv55 - tower_5x5 = Conv(data, basefilter*num_5x5_red, num_group=num_group_11, name=('%s_tower' % name), suffix='_conv') - tower_5x5 = Conv(tower_5x5, basefilter*num_5x5, num_group=num_group, kernel=(5, 5), pad=(2, 2), name=('%s_tower' % name), suffix='_conv_1') - # Branch 3 : Conv11-Conv33-Conv33 - tower_3x3 = Conv(data, basefilter*num_3x3_red, num_group=num_group_11, name=('%s_tower_1' % name), suffix='_conv') - tower_3x3 = Conv(tower_3x3, basefilter*num_3x3_1, num_group=num_group, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_1') - tower_3x3 = Conv(tower_3x3, basefilter*num_3x3_2, num_group=num_group, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_2') - # Branch 4: Pool33-Conv11 - pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name))) - cproj = Conv(pooling, basefilter*proj, num_group=num_group_11, name=('%s_tower_2' % name), suffix='_conv') - concat = mx.sym.Concat(*[tower_1x1, tower_5x5, tower_3x3, cproj], name='ch_concat_%s_chconcat' % name) - return concat - - -# First Downsample - -# Field: (x-2)/2, original 38 - -def Inception7B_V3(data, - basefilter=32, # Base=32 - num_filters=[], # Length-4 - num_group=1, num_group_11=1, - pool="max", - name=''): - - assert len(num_filters)==4 - num_3x3, num_d3x3_red, num_d3x3_1, num_d3x3_2 = tuple(num_filters) - - # Branch 1: Conv33 - tower_3x3 = Conv(data, basefilter*num_3x3, num_group=num_group, kernel=(3, 3), pad=(0, 0), stride=(2, 2), name=('%s_conv' % name)) - # Branch 2: Conv11-Conv33-Conv33 - tower_d3x3 = Conv(data, basefilter*num_d3x3_red, num_group=num_group_11, name=('%s_tower' % name), suffix='_conv') - tower_d3x3 = Conv(tower_d3x3, basefilter*num_d3x3_1, num_group=num_group, kernel=(3, 3), pad=(1, 1), stride=(1, 1), name=('%s_tower' % name), suffix='_conv_1') - tower_d3x3 = Conv(tower_d3x3, basefilter*num_d3x3_2, 
num_group=num_group, kernel=(3, 3), pad=(0, 0), stride=(2, 2), name=('%s_tower' % name), suffix='_conv_2') - # Branch 3: Pool33 - pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pad=(0,0), pool_type="max", name=('max_pool_%s_pool' % name)) - concat = mx.sym.Concat(*[tower_3x3, tower_d3x3, pooling], name='ch_concat_%s_chconcat' % name) - return concat - - -# Second Stage - -def Inception7C_V3(data, - basefilter=32, - num_filters=[], # Length-10 - num_group=1, num_group_11=1, - pool = 'avg', - name = ''): - - assert len(num_filters)==10 - num_1x1, num_d7_red, num_d7_1, num_d7_2, num_q7_red, \ - num_q7_1, num_q7_2, num_q7_3, num_q7_4, proj = tuple(num_filters) - - # Branch 1 : Conv11 - tower_1x1 = Conv(data=data, num_filter=basefilter*num_1x1, kernel=(1, 1), name=('%s_conv' % name)) - # Branch 2: Conv11-Conv17-Conv71 - tower_d7 = Conv(data=data, num_filter=basefilter*num_d7_red, name=('%s_tower' % name), suffix='_conv') - tower_d7 = Conv(data=tower_d7, num_filter=basefilter*num_d7_1, num_group=num_group, kernel=(1, 7), pad=(0, 3), name=('%s_tower' % name), suffix='_conv_1') - tower_d7 = Conv(data=tower_d7, num_filter=basefilter*num_d7_2, num_group=num_group, kernel=(7, 1), pad=(3, 0), name=('%s_tower' % name), suffix='_conv_2') - # Branch 3:Conv11-Conv17-Conv71-Conv17-Conv71 - tower_q7 = Conv(data=data, num_filter=basefilter*num_q7_red, num_group=num_group_11, name=('%s_tower_1' % name), suffix='_conv') - tower_q7 = Conv(data=tower_q7, num_filter=basefilter*num_q7_1, num_group=num_group, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_1') - tower_q7 = Conv(data=tower_q7, num_filter=basefilter*num_q7_2, num_group=num_group, kernel=(1, 7), pad=(0, 3), name=('%s_tower_1' % name), suffix='_conv_2') - tower_q7 = Conv(data=tower_q7, num_filter=basefilter*num_q7_3, num_group=num_group, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_3') - tower_q7 = Conv(data=tower_q7, num_filter=basefilter*num_q7_4, 
num_group=num_group, kernel=(1, 7), pad=(0, 3), name=('%s_tower_1' % name), suffix='_conv_4') - # Branch4: Pooling-Conv11 - pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name))) - cproj = Conv(data=pooling, num_filter=basefilter*proj, num_group=num_group_11, kernel=(1, 1), name=('%s_tower_2' % name), suffix='_conv') - # concat - concat = mx.sym.Concat(*[tower_1x1, tower_d7, tower_q7, cproj], name='ch_concat_%s_chconcat' % name) - return concat - - -# Second Downsample - -# Field Change: (x-2)/2, original 18 - -def Inception7D_V3(data, - basefilter=64, - num_filters=[], # Length-6 - num_group=1, num_group_11=1, - pool='max', - name=''): - - assert len(num_filters)==6 - - num_3x3_red, num_3x3,\ - num_d7_3x3_red, num_d7_1, num_d7_2, num_d7_3x3 = tuple(num_filters) - - # Branch 1: Conv11-Conv33 - tower_3x3 = Conv(data=data, num_filter=basefilter*num_3x3_red, num_group=num_group_11, name=('%s_tower' % name), suffix='_conv') - tower_3x3 = Conv(data=tower_3x3, num_filter=basefilter*num_3x3, num_group=num_group, kernel=(3, 3), pad=(0,0), stride=(2, 2), name=('%s_tower' % name), suffix='_conv_1') - # Branch 2: Conv11-Conv17-Conv71-Conv33 - tower_d7_3x3 = Conv(data=data, num_filter=basefilter*num_d7_3x3_red, num_group=num_group_11, name=('%s_tower_1' % name), suffix='_conv') - tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=basefilter*num_d7_1, num_group=num_group, kernel=(1, 7), pad=(0, 3), name=('%s_tower_1' % name), suffix='_conv_1') - tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=basefilter*num_d7_2, num_group=num_group, kernel=(7, 1), pad=(3, 0), name=('%s_tower_1' % name), suffix='_conv_2') - tower_d7_3x3 = Conv(data=tower_d7_3x3, num_filter=basefilter*num_d7_3x3, num_group=num_group, kernel=(3, 3), stride=(2, 2), name=('%s_tower_1' % name), suffix='_conv_3') - # Branch 3: Pool33 - pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(2, 2), pool_type=pool, 
name=('%s_pool_%s_pool' % (pool, name))) - # concat - concat = mx.sym.Concat(*[tower_3x3, tower_d7_3x3, pooling], name='ch_concat_%s_chconcat' % name) - return concat - - -# Doesn't change field - -def Inception7E_V3(data, - basefilter=64, - num_filters=[], # Length-9 - num_group=1, num_group_11=1, - pool='max', - name=''): - - assert len(num_filters)==9 - - num_1x1, num_d3_red, num_d3_1, num_d3_2,\ - num_3x3_d3_red, num_3x3, num_3x3_d3_1, num_3x3_d3_2, proj = tuple(num_filters) - - - # Branch 1: Conv11 - tower_1x1 = Conv(data=data, num_filter=basefilter*num_1x1, num_group=num_group_11, kernel=(1, 1), name=('%s_conv' % name)) - # Branch 2: Conv11-Conv13-Conv31 - tower_d3 = Conv(data=data, num_filter=basefilter*num_d3_red, num_group=num_group_11, name=('%s_tower' % name), suffix='_conv') - tower_d3_a = Conv(data=tower_d3, num_filter=basefilter*num_d3_1, num_group=num_group, kernel=(1, 3), pad=(0, 1), name=('%s_tower' % name), suffix='_mixed_conv') - tower_d3_b = Conv(data=tower_d3, num_filter=basefilter*num_d3_2, num_group=num_group, kernel=(3, 1), pad=(1, 0), name=('%s_tower' % name), suffix='_mixed_conv_1') - # Branch 3: Conv11-Conv33-Conv13-Conv31 - tower_3x3_d3 = Conv(data=data, num_filter=basefilter*num_3x3_d3_red, num_group=num_group_11, name=('%s_tower_1' % name), suffix='_conv') - tower_3x3_d3 = Conv(data=tower_3x3_d3, num_filter=basefilter*num_3x3, num_group=num_group, kernel=(3, 3), pad=(1, 1), name=('%s_tower_1' % name), suffix='_conv_1') - tower_3x3_d3_a = Conv(data=tower_3x3_d3, num_filter=basefilter*num_3x3_d3_1, num_group=num_group, kernel=(1, 3), pad=(0, 1), name=('%s_tower_1' % name), suffix='_mixed_conv') - tower_3x3_d3_b = Conv(data=tower_3x3_d3, num_filter=basefilter*num_3x3_d3_2, num_group=num_group, kernel=(3, 1), pad=(1, 0), name=('%s_tower_1' % name), suffix='_mixed_conv_1') - # Branch 4: Pool33-Conv11 - pooling = mx.sym.Pooling(data=data, kernel=(3, 3), stride=(1, 1), pad=(1, 1), pool_type=pool, name=('%s_pool_%s_pool' % (pool, name))) - 
cproj = Conv(data=pooling, num_filter=basefilter*proj, kernel=(1, 1), num_group=num_group_11, name=('%s_tower_2' % name), suffix='_conv') - # concat - concat = mx.sym.Concat(*[tower_1x1, tower_d3_a, tower_d3_b, tower_3x3_d3_a, tower_3x3_d3_b, cproj], name='ch_concat_%s_chconcat' % name) - return concat - - - -def get_symbol_V3(num_classes=1000, - basefilter=16, num_group=1, num_group_11=1, num_group_stem=1, - lastout = 8, - dtype='float32', **kwargs): - data = mx.sym.Variable(name="data") - if dtype == 'float32': - data = mx.sym.identity(data=data, name='id') - else: - if dtype == 'float16': - data = mx.sym.Cast(data=data, dtype=np.float16) - # Stem Stage 1 - - # 299 - conv = Conv(data, basefilter*2, num_group=num_group_stem, kernel=(3, 3), stride=(2, 2), name="conv") - # 149 - conv_1 = Conv(conv, basefilter*2, num_group=num_group_stem, kernel=(3, 3), name="conv_1") - # 147 - conv_2 = Conv(conv_1, basefilter*4, num_group=num_group_stem, kernel=(3, 3), pad=(1, 1), name="conv_2") - # 147 - pool = mx.sym.Pooling(data=conv_2, kernel=(3, 3), stride=(2, 2), pool_type="max", name="pool") - # 73 - # Stem Stage 2 - conv_3 = Conv(pool, basefilter*5, num_group=num_group_11, kernel=(1, 1), name="conv_3") - conv_4 = Conv(conv_3, basefilter*12, num_group=num_group_stem, kernel=(3, 3), name="conv_4") - # 71 - pool1 = mx.sym.Pooling(data=conv_4, kernel=(3, 3), stride=(2, 2), pool_type="max", name="pool1") - # 35 - # Main Stage 1 - in3a = Inception7A_V3(pool1, - basefilter=basefilter*1, - num_filters=[4,4,6,6,3,4,2], - num_group=num_group, num_group_11=num_group_11, - pool="avg", name="mixed") - in3b = Inception7A_V3(in3a, - basefilter=basefilter*1, - num_filters=[4,4,6,6,3,4,2], - num_group=num_group, num_group_11=num_group_11, - pool="avg", name="mixed_1") - in3c = Inception7A_V3(in3b, - basefilter=basefilter*1, - num_filters=[4,4,6,6,3,4,2], - num_group=num_group, num_group_11=num_group_11, - pool="avg", name="mixed_2") - in3d = Inception7B_V3(in3c, - basefilter=basefilter*2, - 
num_filters=[12,2,3,3], - num_group=num_group, num_group_11=num_group_11, - pool="max", name="mixed_3") - # Main Stage2 - in4a = Inception7C_V3(in3d, - basefilter=basefilter*2, - num_filters=[6,4,4,6,4,4,4,4,6,6], - num_group=num_group, num_group_11=num_group_11, - pool="avg", name="mixed_4") - in4b = Inception7C_V3(in4a, - basefilter=basefilter*2, - num_filters=[6,5,5,6,5,5,5,5,6,6], - num_group=num_group, num_group_11=num_group_11, - pool="avg", name="mixed_5") - in4c = Inception7C_V3(in4b, - basefilter=basefilter*2, - num_filters=[6,5,5,6,5,5,5,5,6,6], - num_group=num_group, num_group_11=num_group_11, - pool="avg", name="mixed_6") - in4d = Inception7C_V3(in4c, - basefilter=basefilter*2, - num_filters=[6,6,6,6,6,6,6,6,6,6], - num_group=num_group, num_group_11=num_group_11, - pool="avg", name="mixed_7") - in4e = Inception7D_V3(in4d, - basefilter=basefilter*4, - num_filters=[3,5,3,3,3,3], - num_group=num_group, num_group_11=num_group_11, - pool="max", name="mixed_8") - # Main Stage3 - in5a = Inception7E_V3(in4e, - basefilter=basefilter*4, - num_filters=[5,6,6,6,7,6,6,6,3], - num_group=num_group, num_group_11=num_group_11, - pool="avg", name="mixed_9") - in5b = Inception7E_V3(in5a, - basefilter=basefilter*4, - num_filters=[5,6,6,6,7,6,6,6,3], - num_group=num_group, num_group_11=num_group_11, - pool="max", name="mixed_10") - # pool - pool = mx.sym.Pooling(data=in5b, kernel=(lastout, lastout), stride=(1, 1), pool_type="avg", name="global_pool") # last=8 - flatten = mx.sym.Flatten(data=pool, name="flatten") - fc1 = mx.sym.FullyConnected(data=flatten, num_hidden=num_classes, name='fc1') - if dtype == 'float16': - fc1 = mx.sym.Cast(data=fc1, dtype=np.float32) - softmax = mx.sym.SoftmaxOutput(data=fc1, name='softmax') - return softmax - diff --git a/src/marginalnet.py b/src/marginalnet.py deleted file mode 100644 index dc44482..0000000 --- a/src/marginalnet.py +++ /dev/null @@ -1,310 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from 
__future__ import print_function -import mxnet as mx -import numpy as np - - - -def _Conv(data, num_filter, kernel, stride, pad, name, no_bias=False, workspace=256): - _weight = mx.symbol.Variable(name+'_weight') - _bias = mx.symbol.Variable(name+'_bias', lr_mult=2.0, wd_mult=0.0) - body = mx.sym.Convolution(data=data, weight = _weight, bias = _bias, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad, no_bias = no_bias, workspace = workspace, name = name) - return body - -def Conv(**kwargs): - name = kwargs.get('name') - _weight = mx.symbol.Variable(name+'_weight') - _bias = mx.symbol.Variable(name+'_bias', lr_mult=2.0, wd_mult=0.0) - body = mx.sym.Convolution(weight = _weight, bias = _bias, **kwargs) - return body - - -def Act(data, name): - body = mx.sym.LeakyReLU(data = data, act_type='prelu', name = name) - return body - -def resnet_unit0(data, num_filter, name, workspace = 256): - bn_mom = 0.9 - body = Conv(data=data, num_filter=num_filter, kernel=(3,3), stride=(1,1), pad=(1, 1), - name=name+"_conv", workspace=workspace) - body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name+'_bn') - body = Act(data=body, name=name+'_relu') - return body - -def resnet_unit1(data, num_filter, name, dim_match=True, workspace = 256): - bn_mom = 0.9 - shortcut = data - body = Conv(data=data, num_filter=num_filter, kernel=(3,3), stride=(1,1), pad=(1, 1), - name=name+"_conv1", workspace=workspace) - body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name+'_bn1') - body = Act(data=body, name=name+'_relu1') - body = Conv(data=body, num_filter=num_filter, kernel=(3,3), stride=(1,1), pad=(1, 1), - name=name+"_conv2", workspace=workspace) - body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name+'_bn2') - if dim_match: - body = body+shortcut - body = Act(data=body, name=name+'_relu2') - return body - -def resnet_unit2(data, num_filter, name, dim_match=True, workspace = 256): - 
bn_mom = 0.9 - shortcut = data - body = mx.sym.BatchNorm(data=data, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name+'_bn1') - body = Act(data=body, name=name+'_relu1') - body = Conv(data=body, num_filter=num_filter, kernel=(3,3), stride=(1,1), pad=(1, 1), - name=name+"_conv1", workspace=workspace) - #body = mx.symbol.Dropout(data=body, p=0.2) - body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name+'_bn2') - body = Act(data=body, name=name+'_relu2') - body = Conv(data=body, num_filter=num_filter, kernel=(3,3), stride=(1,1), pad=(1, 1), - name=name+"_conv2", workspace=workspace) - if dim_match: - body = body+shortcut - return body - -def resnet_unit3(data, num_filter, name, dim_match=True, workspace = 256): - bn_mom = 0.9 - shortcut = data - body = mx.sym.BatchNorm(data=data, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name+'_bn1') - body = Conv(data=body, num_filter=num_filter, kernel=(3,3), stride=(1,1), pad=(1, 1), - name=name+"_conv1", workspace=workspace) - body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name+'_bn2') - body = Act(data=body, name=name+'_relu1') - body = Conv(data=body, num_filter=num_filter, kernel=(3,3), stride=(1,1), pad=(1, 1), - name=name+"_conv2", workspace=workspace) - body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name+'_bn3') - if dim_match: - body = body+shortcut - return body - -def resnet_unit4(data, num_filter, name, dim_match=True, workspace = 256): - bn_mom = 0.9 - shortcut = data - body = mx.sym.BatchNorm(data=data, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name+'_bn1') - body = Act(data=body, name=name+'_relu1') - body = Conv(data=data, num_filter=int(num_filter*0.25), kernel=(1,1), stride=(1,1), pad=(0,0), - name=name+"_conv1", workspace=workspace) - body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name+'_bn2') - body = Act(data=body, name=name+'_relu2') - body = 
Conv(data=body, num_filter=int(num_filter*0.25), kernel=(3,3), stride=(1,1), pad=(1, 1), - name=name+"_conv2", workspace=workspace) - body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name+'_bn3') - body = Act(data=body, name=name+'_relu3') - body = Conv(data=body, num_filter=num_filter, kernel=(1,1), stride=(1,1), pad=(0, 0), - name=name+"_conv3", workspace=workspace) - if dim_match: - body = body+shortcut - return body - -def resnet_unit5(data, num_filter, name, dim_match=True, workspace = 256): - bn_mom = 0.9 - shortcut = data - body = Conv(data=data, num_filter=int(num_filter*0.5), kernel=(1,1), stride=(1,1), pad=(0,0), - name=name+"_conv1", workspace=workspace) - body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name+'_bn1') - body = Act(data=body, name=name+'_relu1') - body = Conv(data=body, num_filter=int(num_filter*0.5), kernel=(3,3), stride=(1,1), pad=(1, 1), num_group=32, - name=name+"_conv2", workspace=workspace) - body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name+'_bn2') - body = Act(data=body, name=name+'_relu2') - body = Conv(data=body, num_filter=num_filter, kernel=(1,1), stride=(1,1), pad=(0,0), - name=name+"_conv3", workspace=workspace) - body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name+'_bn3') - if dim_match: - body = body+shortcut - body = Act(data=body, name=name+'_relu3') - return body - -def resnet_unit6(data, num_filter, name, dim_match=True, workspace = 256): - bn_mom = 0.9 - shortcut = data - body = Conv(data=data, num_filter=num_filter*4, kernel=(1,1), stride=(1,1), pad=(0,0), - name=name+"_conv1", workspace=workspace) - body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name+'_bn1') - body = Act(data=body, name=name+'_relu1') - body = Conv(data=body, num_filter=num_filter*4, kernel=(3,3), stride=(1,1), pad=(1, 1), num_group=32, - name=name+"_conv2", 
workspace=workspace) - body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name+'_bn2') - body = Act(data=body, name=name+'_relu2') - body = Conv(data=body, num_filter=num_filter, kernel=(1,1), stride=(1,1), pad=(0,0), - name=name+"_conv3", workspace=workspace) - body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name+'_bn3') - if dim_match: - body = body+shortcut - body = Act(data=body, name=name+'_relu3') - return body - -def resnet_unit7(data, num_filter, name, dim_match=True, workspace = 256): - #se block - bn_mom = 0.9 - shortcut = data - body = mx.sym.BatchNorm(data=data, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name+'_bn1') - body = Conv(data=body, num_filter=num_filter, kernel=(3,3), stride=(1,1), pad=(1, 1), - name=name+"_conv1", workspace=workspace) - body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name+'_bn2') - body = Act(data=body, name=name+'_relu1') - body = Conv(data=body, num_filter=num_filter, kernel=(3,3), stride=(1,1), pad=(1, 1), - name=name+"_conv2", workspace=workspace) - res = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name+'_bn3') - - body = mx.sym.Pooling(data=res, global_pool=True, kernel=(7, 7), pool_type='avg', name=name+'_se_pool1') - body = Conv(data=body, num_filter=num_filter//16, kernel=(1,1), stride=(1,1), pad=(0,0), - name=name+"_se_conv1", workspace=workspace) - body = Act(data=body, name=name+'_se_relu1') - body = Conv(data=body, num_filter=num_filter, kernel=(1,1), stride=(1,1), pad=(0,0), - name=name+"_se_conv2", workspace=workspace) - body = mx.symbol.Activation(data=body, act_type='sigmoid', name=name+"_se_sigmoid") - body = mx.symbol.broadcast_mul(res, body) - - if dim_match: - body = body+shortcut - return body - -def resnet_unit100(data, num_filter, name, dim_match=True, workspace = 256): - bn_mom = 0.9 - body = mx.sym.BatchNorm(data=data, fix_gamma=False, eps=2e-5, 
momentum=bn_mom, name=name+'_bn1') - body = Conv(data=body, num_filter=num_filter, kernel=(3,3), stride=(1,1), pad=(1, 1), - name=name+"_conv1", workspace=workspace) - body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name+'_bn2') - act = Act(data=body, name=name+'_relu1') - body = Conv(data=act, num_filter=num_filter, kernel=(3,3), stride=(1,1), pad=(1, 1), - name=name+"_conv2", workspace=workspace) - body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name+'_bn3') - if not dim_match: - shortcut = Conv(data=act, num_filter=num_filter, kernel=(1,1), pad=(0,0), name=name+"_shortcut", workspace=workspace) - else: - shortcut = data - body = body+shortcut - return body - -def resnet_unit(rtype, data, num_filter, name, dim_match=True, workspace = 256): - if rtype==1: - return resnet_unit1(data=data, num_filter=num_filter, name=name, dim_match=dim_match, workspace=workspace) - elif rtype==2: - return resnet_unit2(data=data, num_filter=num_filter, name=name, dim_match=dim_match, workspace=workspace) - elif rtype==3: - return resnet_unit3(data=data, num_filter=num_filter, name=name, dim_match=dim_match, workspace=workspace) - elif rtype==4: - return resnet_unit4(data=data, num_filter=num_filter, name=name, dim_match=dim_match, workspace=workspace) - elif rtype==5: - return resnet_unit5(data=data, num_filter=num_filter, name=name, dim_match=dim_match, workspace=workspace) - elif rtype==6: - return resnet_unit6(data=data, num_filter=num_filter, name=name, dim_match=dim_match, workspace=workspace) - elif rtype==7: - return resnet_unit7(data=data, num_filter=num_filter, name=name, dim_match=dim_match, workspace=workspace) - elif rtype==100: - return resnet_unit100(data=data, num_filter=num_filter, name=name, dim_match=dim_match, workspace=workspace) - else: - assert(False) - -def resnet(data, units, filters, rtype, workspace): - body = resnet_unit0(data=data, num_filter=32, name="stage%d_unit%d"%(0, 0)) - for i 
in xrange(len(units)): - f = filters[i] - dim_match = False - if i==0: - dim_match = True - if rtype>=100: - body = resnet_unit(rtype=rtype, data=body, num_filter=f, name="stage%d_unit%d"%(i+1, 0), dim_match=dim_match) # do not connect to last layer, dim not match - else: - body = resnet_unit0(data=body, num_filter=f, name="stage%d_unit%d"%(i+1, 0)) # do not connect to last layer, dim not match - body = mx.sym.Pooling(data=body, kernel=(2, 2), stride=(2,2), pad=(0,0), pool_type='max', name="stage%d_pool"%(i+1)) - for j in xrange(units[i]): - body = resnet_unit(rtype=rtype, data=body, num_filter=f, name="stage%d_unit%d"%(i+1, j+1), dim_match=True) - - return body - -def get_symbol(num_classes, num_layers, conv_workspace=256): - data = mx.symbol.Variable('data') - bn_mom = 0.9 - if num_layers<29: - data = mx.sym.BatchNorm(data=data, fix_gamma=True, eps=2e-5, momentum=bn_mom, name='bn_data') - else: - data = data-127.5 - data = data*0.0078125 - units = [1,2,5,3] # all number of layers = sum(units)*2+len(units)+1 - filter_list = [64, 128, 256, 512] - rtype = 1 - ftype = 1 - if num_layers==27: - rtype = 1 - elif num_layers==28: - rtype = 2 - elif num_layers==29: - rtype = 3 - #use_last_bn = False - #use_dropout = False - elif num_layers==30: - filter_list = [64, 256, 512, 1024] - rtype = 3 - elif num_layers==31: - rtype = 100 - elif num_layers==51: - units = [2,3,15,3] - rtype = 3 - elif num_layers==52: - filter_list = [64, 256, 512, 1024] - units = [2,3,15,3] - rtype = 3 - elif num_layers==53: #se block - units = [2,3,15,3] - rtype = 7 - elif num_layers==74: - units = [2,3,15,3] - rtype = 4 - elif num_layers==75: - units = [2,3,15,3] - rtype = 5 - elif num_layers==76: - filter_list = [16, 32, 64, 128] - units = [2,3,15,3] - rtype = 6 - else: - assert(False) - - body = resnet(data = data, units = units, filters = filter_list, rtype=rtype, workspace = conv_workspace) - _weight = mx.symbol.Variable("fc1_weight") - _bias = mx.symbol.Variable("fc1_bias", lr_mult=2.0, 
wd_mult=0.0) - if ftype==0: - fc1 = mx.sym.FullyConnected(data=body, weight=_weight, bias=_bias, num_hidden=num_classes, name='fc1') - elif ftype==1: - body = mx.symbol.Dropout(data=body, p=0.4) - fc1 = mx.sym.FullyConnected(data=body, weight=_weight, bias=_bias, num_hidden=num_classes, name='pre_fc1') - fc1 = mx.sym.BatchNorm(data=fc1, fix_gamma=True, eps=2e-5, momentum=bn_mom, name='fc1') - else: - body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name='bn1') - body = mx.sym.Activation(data=body, act_type='relu', name='relu1') - body = mx.sym.Pooling(data=body, global_pool=True, kernel=(7, 7), pool_type='avg', name='pool1') - body = mx.sym.Flatten(data=body) - fc1 = mx.sym.FullyConnected(data=body, weight=_weight, bias=_bias, num_hidden=num_classes, name='fc1') - - return fc1 - -def init_weights(sym, data_shape_dict, num_layers): - arg_name = sym.list_arguments() - aux_name = sym.list_auxiliary_states() - arg_shape, aaa, aux_shape = sym.infer_shape(**data_shape_dict) - #print(data_shape_dict) - #print(arg_name) - #print(arg_shape) - arg_params = {} - aux_params = None - #print(aaa) - #print(aux_shape) - arg_shape_dict = dict(zip(arg_name, arg_shape)) - aux_shape_dict = dict(zip(aux_name, aux_shape)) - #print(aux_shape) - #print(aux_params) - #print(arg_shape_dict) - for k,v in arg_shape_dict.iteritems(): - #print('find', k) - if k.endswith('_weight') and k.find('_conv')>=0: - if not k.find('_unit0_')>=0: - arg_params[k] = mx.random.normal(0, 0.01, shape=v) - print('init', k) - if k.endswith('_bias'): - arg_params[k] = mx.nd.zeros(shape=v) - print('init', k) - return arg_params, aux_params - diff --git a/src/resnet.py b/src/resnet.py deleted file mode 100644 index be49860..0000000 --- a/src/resnet.py +++ /dev/null @@ -1,196 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -''' -Adapted from https://github.com/tornadomeet/ResNet/blob/master/symbol_resnet.py -Original author Wei Wu - -Implemented the following paper: - -Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. "Identity Mappings in Deep Residual Networks" -''' -import mxnet as mx -import numpy as np - -def residual_unit(data, num_filter, stride, dim_match, name, bottle_neck=True, bn_mom=0.9, workspace=256, memonger=False): - """Return ResNet Unit symbol for building ResNet - Parameters - ---------- - data : str - Input data - num_filter : int - Number of output channels - bnf : int - Bottle neck channels factor with regard to num_filter - stride : tuple - Stride used in convolution - dim_match : Boolean - True means channel number between input and output is the same, otherwise means differ - name : str - Base name of the operators - workspace : int - Workspace used in convolution operator - """ - if bottle_neck: - # the same as https://github.com/facebook/fb.resnet.torch#notes, a bit difference with origin paper - bn1 = mx.sym.BatchNorm(data=data, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn1') - act1 = mx.sym.Activation(data=bn1, act_type='relu', name=name + '_relu1') - conv1 = mx.sym.Convolution(data=act1, num_filter=int(num_filter*0.25), kernel=(1,1), stride=(1,1), pad=(0,0), 
- no_bias=True, workspace=workspace, name=name + '_conv1') - bn2 = mx.sym.BatchNorm(data=conv1, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn2') - act2 = mx.sym.Activation(data=bn2, act_type='relu', name=name + '_relu2') - conv2 = mx.sym.Convolution(data=act2, num_filter=int(num_filter*0.25), kernel=(3,3), stride=stride, pad=(1,1), - no_bias=True, workspace=workspace, name=name + '_conv2') - bn3 = mx.sym.BatchNorm(data=conv2, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn3') - act3 = mx.sym.Activation(data=bn3, act_type='relu', name=name + '_relu3') - conv3 = mx.sym.Convolution(data=act3, num_filter=num_filter, kernel=(1,1), stride=(1,1), pad=(0,0), no_bias=True, - workspace=workspace, name=name + '_conv3') - if dim_match: - shortcut = data - else: - shortcut = mx.sym.Convolution(data=act1, num_filter=num_filter, kernel=(1,1), stride=stride, no_bias=True, - workspace=workspace, name=name+'_sc') - if memonger: - shortcut._set_attr(mirror_stage='True') - return conv3 + shortcut - else: - bn1 = mx.sym.BatchNorm(data=data, fix_gamma=False, momentum=bn_mom, eps=2e-5, name=name + '_bn1') - act1 = mx.sym.Activation(data=bn1, act_type='relu', name=name + '_relu1') - conv1 = mx.sym.Convolution(data=act1, num_filter=num_filter, kernel=(3,3), stride=stride, pad=(1,1), - no_bias=True, workspace=workspace, name=name + '_conv1') - bn2 = mx.sym.BatchNorm(data=conv1, fix_gamma=False, momentum=bn_mom, eps=2e-5, name=name + '_bn2') - act2 = mx.sym.Activation(data=bn2, act_type='relu', name=name + '_relu2') - conv2 = mx.sym.Convolution(data=act2, num_filter=num_filter, kernel=(3,3), stride=(1,1), pad=(1,1), - no_bias=True, workspace=workspace, name=name + '_conv2') - if dim_match: - shortcut = data - else: - shortcut = mx.sym.Convolution(data=act1, num_filter=num_filter, kernel=(1,1), stride=stride, no_bias=True, - workspace=workspace, name=name+'_sc') - if memonger: - shortcut._set_attr(mirror_stage='True') - return conv2 + shortcut - -def resnet(units, 
num_stages, filter_list, num_classes, image_shape, bottle_neck=True, bn_mom=0.9, workspace=256, dtype='float32', memonger=False): - """Return ResNet symbol of - Parameters - ---------- - units : list - Number of units in each stage - num_stages : int - Number of stage - filter_list : list - Channel size of each stage - num_classes : int - Ouput size of symbol - dataset : str - Dataset type, only cifar10 and imagenet supports - workspace : int - Workspace used in convolution operator - dtype : str - Precision (float32 or float16) - """ - num_unit = len(units) - assert(num_unit == num_stages) - data = mx.sym.Variable(name='data') - if dtype == 'float32': - data = mx.sym.identity(data=data, name='id') - else: - if dtype == 'float16': - data = mx.sym.Cast(data=data, dtype=np.float16) - data = mx.sym.BatchNorm(data=data, fix_gamma=True, eps=2e-5, momentum=bn_mom, name='bn_data') - (nchannel, height, width) = image_shape - if height <= 32: # such as cifar10 - body = mx.sym.Convolution(data=data, num_filter=filter_list[0], kernel=(3, 3), stride=(1,1), pad=(1, 1), - no_bias=True, name="conv0", workspace=workspace) - else: # often expected to be 224 such as imagenet - body = mx.sym.Convolution(data=data, num_filter=filter_list[0], kernel=(7, 7), stride=(2,2), pad=(3, 3), - no_bias=True, name="conv0", workspace=workspace) - body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name='bn0') - body = mx.sym.Activation(data=body, act_type='relu', name='relu0') - body = mx.sym.Pooling(data=body, kernel=(3, 3), stride=(2,2), pad=(1,1), pool_type='max') - - for i in range(num_stages): - body = residual_unit(body, filter_list[i+1], (1 if i==0 else 2, 1 if i==0 else 2), False, - name='stage%d_unit%d' % (i + 1, 1), bottle_neck=bottle_neck, workspace=workspace, - memonger=memonger) - for j in range(units[i]-1): - body = residual_unit(body, filter_list[i+1], (1,1), True, name='stage%d_unit%d' % (i + 1, j + 2), - bottle_neck=bottle_neck, workspace=workspace, 
memonger=memonger) - bn1 = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name='bn1') - relu1 = mx.sym.Activation(data=bn1, act_type='relu', name='relu1') - # Although kernel is not used here when global_pool=True, we should put one - pool1 = mx.sym.Pooling(data=relu1, global_pool=True, kernel=(7, 7), pool_type='avg', name='pool1') - flat = mx.sym.Flatten(data=pool1) - fc1 = mx.sym.FullyConnected(data=flat, num_hidden=num_classes, name='fc1') - if dtype == 'float16': - fc1 = mx.sym.Cast(data=fc1, dtype=np.float32) - return mx.sym.SoftmaxOutput(data=fc1, name='softmax') - -def get_symbol(num_classes, num_layers, image_shape, conv_workspace=256, dtype='float32', **kwargs): - """ - Adapted from https://github.com/tornadomeet/ResNet/blob/master/train_resnet.py - Original author Wei Wu - """ - image_shape = [int(l) for l in image_shape.split(',')] - (nchannel, height, width) = image_shape - if height <= 28: - num_stages = 3 - if (num_layers-2) % 9 == 0 and num_layers >= 164: - per_unit = [(num_layers-2)//9] - filter_list = [16, 64, 128, 256] - bottle_neck = True - elif (num_layers-2) % 6 == 0 and num_layers < 164: - per_unit = [(num_layers-2)//6] - filter_list = [16, 16, 32, 64] - bottle_neck = False - else: - raise ValueError("no experiments done on num_layers {}, you can do it yourself".format(num_layers)) - units = per_unit * num_stages - else: - if num_layers >= 50: - filter_list = [64, 256, 512, 1024, 2048] - bottle_neck = True - else: - filter_list = [64, 64, 128, 256, 512] - bottle_neck = False - num_stages = 4 - if num_layers == 18: - units = [2, 2, 2, 2] - elif num_layers == 34: - units = [3, 4, 6, 3] - elif num_layers == 50: - units = [3, 4, 6, 3] - elif num_layers == 101: - units = [3, 4, 23, 3] - elif num_layers == 152: - units = [3, 8, 36, 3] - elif num_layers == 200: - units = [3, 24, 36, 3] - elif num_layers == 269: - units = [3, 30, 48, 8] - else: - raise ValueError("no experiments done on num_layers {}, you can do it 
yourself".format(num_layers)) - - return resnet(units = units, - num_stages = num_stages, - filter_list = filter_list, - num_classes = num_classes, - image_shape = image_shape, - bottle_neck = bottle_neck, - workspace = conv_workspace, - dtype = dtype) diff --git a/src/resnext.py b/src/resnext.py deleted file mode 100644 index 5974943..0000000 --- a/src/resnext.py +++ /dev/null @@ -1,210 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -''' -Adapted from https://github.com/tornadomeet/ResNet/blob/master/symbol_resnet.py -Original author Wei Wu - -Implemented the following paper: -Saining Xie, Ross Girshick, Piotr Dollar, Zhuowen Tu, Kaiming He. 
"Aggregated Residual Transformations for Deep Neural Network" -''' -import mxnet as mx -import numpy as np - -def residual_unit(data, num_filter, stride, dim_match, name, bottle_neck=True, num_group=32, bn_mom=0.9, workspace=256, memonger=False): - """Return ResNet Unit symbol for building ResNet - Parameters - ---------- - data : str - Input data - num_filter : int - Number of output channels - bnf : int - Bottle neck channels factor with regard to num_filter - stride : tuple - Stride used in convolution - dim_match : Boolean - True means channel number between input and output is the same, otherwise means differ - name : str - Base name of the operators - workspace : int - Workspace used in convolution operator - """ - if bottle_neck: - # the same as https://github.com/facebook/fb.resnet.torch#notes, a bit difference with origin paper - - conv1 = mx.sym.Convolution(data=data, num_filter=int(num_filter*0.5), kernel=(1,1), stride=(1,1), pad=(0,0), - no_bias=True, workspace=workspace, name=name + '_conv1') - bn1 = mx.sym.BatchNorm(data=conv1, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn1') - act1 = mx.sym.Activation(data=bn1, act_type='relu', name=name + '_relu1') - - - conv2 = mx.sym.Convolution(data=act1, num_filter=int(num_filter*0.5), num_group=num_group, kernel=(3,3), stride=stride, pad=(1,1), - no_bias=True, workspace=workspace, name=name + '_conv2') - bn2 = mx.sym.BatchNorm(data=conv2, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn2') - act2 = mx.sym.Activation(data=bn2, act_type='relu', name=name + '_relu2') - - - conv3 = mx.sym.Convolution(data=act2, num_filter=num_filter, kernel=(1,1), stride=(1,1), pad=(0,0), no_bias=True, - workspace=workspace, name=name + '_conv3') - bn3 = mx.sym.BatchNorm(data=conv3, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn3') - - if dim_match: - shortcut = data - else: - shortcut_conv = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=(1,1), stride=stride, no_bias=True, - 
workspace=workspace, name=name+'_sc') - shortcut = mx.sym.BatchNorm(data=shortcut_conv, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_sc_bn') - - if memonger: - shortcut._set_attr(mirror_stage='True') - eltwise = bn3 + shortcut - return mx.sym.Activation(data=eltwise, act_type='relu', name=name + '_relu') - else: - - conv1 = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=(3,3), stride=stride, pad=(1,1), - no_bias=True, workspace=workspace, name=name + '_conv1') - bn1 = mx.sym.BatchNorm(data=conv1, fix_gamma=False, momentum=bn_mom, eps=2e-5, name=name + '_bn1') - act1 = mx.sym.Activation(data=bn1, act_type='relu', name=name + '_relu1') - - - conv2 = mx.sym.Convolution(data=act1, num_filter=num_filter, kernel=(3,3), stride=(1,1), pad=(1,1), - no_bias=True, workspace=workspace, name=name + '_conv2') - bn2 = mx.sym.BatchNorm(data=conv2, fix_gamma=False, momentum=bn_mom, eps=2e-5, name=name + '_bn2') - - if dim_match: - shortcut = data - else: - shortcut_conv = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=(1,1), stride=stride, no_bias=True, - workspace=workspace, name=name+'_sc') - shortcut = mx.sym.BatchNorm(data=shortcut_conv, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_sc_bn') - - if memonger: - shortcut._set_attr(mirror_stage='True') - eltwise = bn2 + shortcut - return mx.sym.Activation(data=eltwise, act_type='relu', name=name + '_relu') - -def resnext(units, num_stages, filter_list, num_classes, num_group, image_shape, bottle_neck=True, bn_mom=0.9, workspace=256, dtype='float32', memonger=False): - """Return ResNeXt symbol of - Parameters - ---------- - units : list - Number of units in each stage - num_stages : int - Number of stage - filter_list : list - Channel size of each stage - num_classes : int - Ouput size of symbol - num_groupes: int - Number of conv groups - dataset : str - Dataset type, only cifar10 and imagenet supports - workspace : int - Workspace used in convolution operator - dtype : str - 
Precision (float32 or float16) - """ - num_unit = len(units) - assert(num_unit == num_stages) - data = mx.sym.Variable(name='data') - if dtype == 'float32': - data = mx.sym.identity(data=data, name='id') - else: - if dtype == 'float16': - data = mx.sym.Cast(data=data, dtype=np.float16) - data = mx.sym.BatchNorm(data=data, fix_gamma=True, eps=2e-5, momentum=bn_mom, name='bn_data') - (nchannel, height, width) = image_shape - if height <= 32: # such as cifar10 - body = mx.sym.Convolution(data=data, num_filter=filter_list[0], kernel=(3, 3), stride=(1,1), pad=(1, 1), - no_bias=True, name="conv0", workspace=workspace) - else: # often expected to be 224 such as imagenet - body = mx.sym.Convolution(data=data, num_filter=filter_list[0], kernel=(7, 7), stride=(2,2), pad=(3, 3), - no_bias=True, name="conv0", workspace=workspace) - body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name='bn0') - body = mx.sym.Activation(data=body, act_type='relu', name='relu0') - body = mx.sym.Pooling(data=body, kernel=(3, 3), stride=(2,2), pad=(1,1), pool_type='max') - - for i in range(num_stages): - body = residual_unit(body, filter_list[i+1], (1 if i==0 else 2, 1 if i==0 else 2), False, - name='stage%d_unit%d' % (i + 1, 1), bottle_neck=bottle_neck, num_group=num_group, - bn_mom=bn_mom, workspace=workspace, memonger=memonger) - for j in range(units[i]-1): - body = residual_unit(body, filter_list[i+1], (1,1), True, name='stage%d_unit%d' % (i + 1, j + 2), - bottle_neck=bottle_neck, num_group=num_group, bn_mom=bn_mom, workspace=workspace, memonger=memonger) - - pool1 = mx.sym.Pooling(data=body, global_pool=True, kernel=(7, 7), pool_type='avg', name='pool1') - flat = mx.sym.Flatten(data=pool1) - fc1 = mx.sym.FullyConnected(data=flat, num_hidden=num_classes, name='fc1') - if dtype == 'float16': - fc1 = mx.sym.Cast(data=fc1, dtype=np.float32) - return mx.sym.SoftmaxOutput(data=fc1, name='softmax') - -def get_symbol(num_classes, num_layers, image_shape, num_group=32, 
conv_workspace=256, dtype='float32', **kwargs): - """ - Adapted from https://github.com/tornadomeet/ResNet/blob/master/train_resnet.py - Original author Wei Wu - """ - image_shape = [int(l) for l in image_shape.split(',')] - (nchannel, height, width) = image_shape - if height <= 32: - num_stages = 3 - if (num_layers-2) % 9 == 0 and num_layers >= 164: - per_unit = [(num_layers-2)//9] - filter_list = [16, 64, 128, 256] - bottle_neck = True - elif (num_layers-2) % 6 == 0 and num_layers < 164: - per_unit = [(num_layers-2)//6] - filter_list = [16, 16, 32, 64] - bottle_neck = False - else: - raise ValueError("no experiments done on num_layers {}, you can do it yourself".format(num_layers)) - units = per_unit * num_stages - else: - if num_layers >= 50: - filter_list = [64, 256, 512, 1024, 2048] - bottle_neck = True - else: - filter_list = [64, 64, 128, 256, 512] - bottle_neck = False - num_stages = 4 - if num_layers == 18: - units = [2, 2, 2, 2] - elif num_layers == 34: - units = [3, 4, 6, 3] - elif num_layers == 50: - units = [3, 4, 6, 3] - elif num_layers == 101: - units = [3, 4, 23, 3] - elif num_layers == 152: - units = [3, 8, 36, 3] - elif num_layers == 200: - units = [3, 24, 36, 3] - elif num_layers == 269: - units = [3, 30, 48, 8] - else: - raise ValueError("no experiments done on num_layers {}, you can do it yourself".format(num_layers)) - - return resnext(units = units, - num_stages = num_stages, - filter_list = filter_list, - num_classes = num_classes, - num_group = num_group, - image_shape = image_shape, - bottle_neck = bottle_neck, - workspace = conv_workspace, - dtype = dtype) diff --git a/src/spherenet.py b/src/spherenet.py deleted file mode 100644 index 43d37cb..0000000 --- a/src/spherenet.py +++ /dev/null @@ -1,102 +0,0 @@ -import mxnet as mx -import numpy as np -import math -from mxnet.base import _Null - -def conv_main(data, units, filters, workspace): - body = data - for i in xrange(len(units)): - f = filters[i] - _weight = 
mx.symbol.Variable("conv%d_%d_weight"%(i+1, 1), lr_mult=1.0) - _bias = mx.symbol.Variable("conv%d_%d_bias"%(i+1, 1), lr_mult=2.0, wd_mult=0.0) - body = mx.sym.Convolution(data=body, weight = _weight, bias = _bias, num_filter=f, kernel=(3, 3), stride=(2,2), pad=(1, 1), - name= "conv%d_%d"%(i+1, 1), workspace=workspace) - - - #body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=0.9, name='bn%d_%d'%(i+1, 1)) - - body = mx.sym.LeakyReLU(data = body, act_type='prelu', name = "relu%d_%d" % (i+1, 1)) - idx = 2 - for j in xrange(units[i]): - _body = mx.sym.Convolution(data=body, no_bias=True, num_filter=f, kernel=(3, 3), stride=(1,1), pad=(1, 1), - name= "conv%d_%d"%(i+1, idx), workspace=workspace) - - #_body = mx.sym.BatchNorm(data=_body, fix_gamma=False, eps=2e-5, momentum=0.9, name='bn%d_%d'%(i+1, idx)) - - _body = mx.sym.LeakyReLU(data = _body, act_type='prelu', name = "relu%d_%d" % (i+1, idx)) - idx+=1 - _body = mx.sym.Convolution(data=_body, no_bias=True, num_filter=f, kernel=(3, 3), stride=(1,1), pad=(1, 1), - name= "conv%d_%d"%(i+1, idx), workspace=workspace) - #_body = mx.sym.BatchNorm(data=_body, fix_gamma=False, eps=2e-5, momentum=0.9, name='bn%d_%d'%(i+1, idx)) - _body = mx.sym.LeakyReLU(data = _body, act_type='prelu', name = "relu%d_%d" % (i+1, idx)) - idx+=1 - body = body+_body - - #body = mx.sym.LeakyReLU(data = body, act_type='prelu', name = "relu%d_%d" % (i+1, idx)) #modify - #idx+=1 - - - return body - -def get_symbol(num_classes, num_layers, conv_workspace=256, **kwargs): - if num_layers==64: - units = [3,8,16,3] - filters = [64,128,256,512] - elif num_layers==20: - units = [1,2,4,1] - filters = [64,128,256,512] - #filters = [64, 256, 512, 1024] - elif num_layers==36: - units = [2,4,8,2] - filters = [64,128,256,512] - #filters = [64, 256, 512, 1024] - elif num_layers==60: - units = [3,8,14,3] - filters = [64,128,256,512] - elif num_layers==104: - units = [3,8,36,3] - filters = [64,128,256,512] - #filters = [64, 256, 512, 1024] - data = 
mx.symbol.Variable('data') - data = data-127.5 - data = data*0.0078125 - body = conv_main(data = data, units = units, filters = filters, workspace = conv_workspace) - #modify begin - - #body = mx.sym.Pooling(data=body, global_pool=True, kernel=(7, 7), pool_type='avg', name='pool1') - #body = mx.sym.Flatten(data=body) - - #modify end - - _weight = mx.symbol.Variable("fc1_weight", lr_mult=1.0) - _bias = mx.symbol.Variable("fc1_bias", lr_mult=2.0, wd_mult=0.0) - fc1 = mx.sym.FullyConnected(data=body, weight=_weight, bias=_bias, num_hidden=num_classes, name='fc1') - return fc1 - -def init_weights(sym, data_shape_dict, num_layers): - arg_name = sym.list_arguments() - aux_name = sym.list_auxiliary_states() - arg_shape, aaa, aux_shape = sym.infer_shape(**data_shape_dict) - #print(data_shape_dict) - #print(arg_name) - #print(arg_shape) - arg_params = {} - aux_params = None - #print(aaa) - #print(aux_shape) - arg_shape_dict = dict(zip(arg_name, arg_shape)) - aux_shape_dict = dict(zip(aux_name, aux_shape)) - #print(aux_shape) - #print(aux_params) - #print(arg_shape_dict) - for k,v in arg_shape_dict.iteritems(): - if k.startswith('conv') and k.endswith('_weight'): - if not k.find('_1_')>=0: - if num_layers<100: - arg_params[k] = mx.random.normal(0, 0.01, shape=v) - print('init', k) - if k.endswith('_bias'): - arg_params[k] = mx.nd.zeros(shape=v) - print('init', k) - return arg_params, aux_params - diff --git a/src/train_softmax.py b/src/train_softmax.py index d74b77c..c45000e 100644 --- a/src/train_softmax.py +++ b/src/train_softmax.py @@ -19,6 +19,7 @@ import argparse import mxnet.optimizer as optimizer #sys.path.append(os.path.join(os.path.dirname(__file__), 'common')) sys.path.append(os.path.join(os.path.dirname(__file__), 'eval')) +sys.path.append(os.path.join(os.path.dirname(__file__), 'symbols')) import spherenet import marginalnet import inceptions diff --git a/src/xception.py b/src/xception.py deleted file mode 100644 index f9d17f3..0000000 --- a/src/xception.py +++ 
/dev/null @@ -1,154 +0,0 @@ -# -*- coding: utf-8 -*- -""" - -Xception network, suitable for images with around 299 x 299 (original version) - -Reference: - -François Chollet. Xception: Deep Learning with Depthwise Separable Convlutions. arXiv preprint. https://arxiv.org/pdf/1610.02357v3.pdf - -I refered one version of MXNet from u1234x1234 https://github.com/u1234x1234/mxnet-xception/blob/master/symbol_xception.py - -Modified by Lin Xiong, Sep-3, 2017 for images 224 x 224 -There are some slightly differences with u1234x1234's version (pooling layer) and original version (no dropout layer). - -In order to accelerate computation, we use smaller parameters than original paper. - -""" - -import mxnet as mx - -def Conv(data, num_filter, kernel=(1, 1), stride=(1, 1), pad=(0, 0), name=None, suffix='', withRelu=False, withBn=True, bn_mom=0.9, workspace=256): - conv = mx.sym.Convolution(data=data, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad, - name='%s%s_conv2d' % (name, suffix), workspace=workspace) - if withBn: - conv = mx.sym.BatchNorm(data=conv, fix_gamma=False, eps=2e-5, momentum=bn_mom, name='%s%s_bn' % (name, suffix)) - if withRelu: - conv = mx.sym.Activation(data=conv, act_type='relu', name='%s%s_relu' % (name, suffix)) - return conv - -def Separable_Conv(data, num_in_channel, num_out_channel, kernel=(3, 3), stride=(1, 1), pad=(1, 1), name=None, suffix='', depth_mult=1, withBn=True, bn_mom=0.9, workspace=256): - # original version of Separable Convolution - # depthwise convolution - #channels = mx.sym.split(data=data, axis=1, num_outputs=num_in_channel) # for new version of mxnet > 0.8 - channels = mx.sym.SliceChannel(data=data, axis=1, num_outputs=num_in_channel) # for old version of mxnet <= 0.8 - depthwise_outs = [mx.sym.Convolution(data=channels[i], num_filter=depth_mult, kernel=kernel, - stride=stride, pad=pad, name=name+'_depthwise_kernel_'+str(i), workspace=workspace) - for i in range(num_in_channel)] - depthwise_out = 
mx.sym.Concat(*depthwise_outs) - # pointwise convolution - pointwise_out = Conv(data=depthwise_out, num_filter=num_out_channel, name=name+'_pointwise_kernel', withBn=False, bn_mom=0.9, workspace=256) - if withBn: - pointwise_out = mx.sym.BatchNorm(data=pointwise_out, fix_gamma=False, eps=2e-5, momentum=bn_mom, name='%s%s_bn' % (name, suffix)) - return pointwise_out - - - -def Circle_Middle(name, data, - num_filter, - bn_mom=0.9, - round=8): - b = data - for i in xrange(round): - residual = b - prefix = name + '_block' + ('_%d' % i) - - b = mx.sym.Activation(data=b, act_type='relu', name=prefix + '_sepconv1_relu') - b = Separable_Conv(data=b, num_in_channel=num_filter, num_out_channel=num_filter, name=prefix + '_sepconv1', withBn=True, bn_mom=bn_mom, workspace=256) - b = mx.sym.Activation(data=b, act_type='relu', name=prefix + '_sepconv2_relu') - b = Separable_Conv(data=b, num_in_channel=num_filter, num_out_channel=num_filter, name=prefix + '_sepconv2', withBn=True, bn_mom=bn_mom, workspace=256) - b = mx.sym.Activation(data=b, act_type='relu', name=prefix + '_sepconv3_relu') - b = Separable_Conv(data=b, num_in_channel=num_filter, num_out_channel=num_filter, name=prefix + '_sepconv3', withBn=True, bn_mom=bn_mom, workspace=256) - - b = b + residual - - return b - - -def get_xception_symbol(num_classes=1000): - # input shape 229*229*3 (old) - # input shape 224*224*3 (new) - - #filter_list=[64, 128, 256, 728, 1024, 1536, 2048] # original version - filter_list=[64, 64, 128, 364, 512, 768, 1024] # smaller one - - # Entry flow - data = mx.sym.Variable('data') - - # block 1 - block1 = Conv(data=data, num_filter=int(filter_list[0]*0.5), kernel=(3, 3), stride=(2, 2), pad=(1, 1), name='Entry_flow_b1_conv1', - withRelu=True, withBn=True, bn_mom=0.9, workspace=256) - block1 = Conv(data=block1, num_filter=filter_list[0], kernel=(3, 3), pad=(1, 1), name='Entry_flow_b1_conv2', - withRelu=True, withBn=True, bn_mom=0.9, workspace=256) - - # block 2 - rs2 = Conv(data=block1, 
num_filter=filter_list[1], stride=(2, 2), name='Entry_flow_b2_conv1', - withBn=True, bn_mom=0.9, workspace=256) - block2 = Separable_Conv(block1, num_in_channel=filter_list[0], num_out_channel=filter_list[1], name='Entry_flow_b2_sepconv1', withBn=True, bn_mom=0.9, workspace=256) - block2 = mx.sym.Activation(data=block2, act_type='relu', name='Entry_flow_b2_sepconv1_relu') - block2 = Separable_Conv(block2, num_in_channel=filter_list[1], num_out_channel=filter_list[1], name='Entry_flow_b2_sepconv2', withBn=True, bn_mom=0.9, workspace=256) - block2 = mx.sym.Pooling(data=block2, kernel=(3, 3), stride=(2, 2), pad=(1, 1), pool_type='max', name='Entry_flow_b2_pool') - block2 = block2 + rs2 - - # block 3 - rs3 = Conv(data=block2, num_filter=filter_list[2], stride=(2, 2), name='Entry_flow_b3_conv1', - withBn=True, bn_mom=0.9, workspace=256) - block3 = mx.sym.Activation(data=block2, act_type='relu', name='Entry_flow_b3_sepconv1_relu') - block3 = Separable_Conv(block3, num_in_channel=filter_list[1], num_out_channel=filter_list[2], name='Entry_flow_b3_sepconv1', withBn=True, bn_mom=0.9, workspace=256) - block3 = mx.sym.Activation(data=block3, act_type='relu', name='Entry_flow_b3_sepconv2_relu') - block3 = Separable_Conv(block3, num_in_channel=filter_list[2], num_out_channel=filter_list[2], name='Entry_flow_b3_sepconv2', withBn=True, bn_mom=0.9, workspace=256) - block3 = mx.sym.Pooling(data=block3, kernel=(3, 3), stride=(2, 2), pad=(1, 1), pool_type='max', name='Entry_flow_b3_pool') - block3 = block3 + rs3 - - # block 4 - rs4 = Conv(data=block3, num_filter=filter_list[3], stride=(2, 2), name='Entry_flow_b4_conv1', - withBn=True, bn_mom=0.9, workspace=256) - block4 = mx.sym.Activation(data=block3, act_type='relu', name='Entry_flow_b4_sepconv1_relu') - block4 = Separable_Conv(block4, num_in_channel=filter_list[2], num_out_channel=filter_list[3], name='Entry_flow_b4_sepconv1', withBn=True, bn_mom=0.9, workspace=256) - block4 = mx.sym.Activation(data=block4, act_type='relu', 
name='Entry_flow_b4_sepconv2_relu') - block4 = Separable_Conv(block4, num_in_channel=filter_list[3], num_out_channel=filter_list[3], name='Entry_flow_b4_sepconv2', withBn=True, bn_mom=0.9, workspace=256) - block4 = mx.sym.Pooling(data=block4, kernel=(3, 3), stride=(2, 2), pad=(1, 1), pool_type='max', name='Entry_flow_b4_pool') - block4 = block4 + rs4 - - # Middle flow - block_m_f = Circle_Middle('Middle_flow', block4, - filter_list[3], - 0.9, - 8) - # Exit flow - rs5 = Conv(data=block_m_f, num_filter=filter_list[4], stride=(2, 2), name='Exit_flow_b5_conv1', - withBn=True, bn_mom=0.9, workspace=256) - block5 = mx.sym.Activation(data=block_m_f, act_type='relu', name='Exit_flow_b5_sepconv1_relu') - block5 = Separable_Conv(block5, num_in_channel=filter_list[3], num_out_channel=filter_list[3], name='Exit_flow_b5_sepconv1', withBn=True, bn_mom=0.9, workspace=256) - block5 = mx.sym.Activation(data=block5, act_type='relu', name='Exit_flow_b5_sepconv2_relu') - block5 = Separable_Conv(block5, num_in_channel=filter_list[3], num_out_channel=filter_list[4], name='Exit_flow_b5_sepconv2', withBn=True, bn_mom=0.9, workspace=256) - block5 = mx.sym.Pooling(data=block5, kernel=(3, 3), stride=(2, 2), pad=(1, 1), pool_type='max', name='Entry_flow_b5_pool') - block5 = block5 + rs5 - - block6 = Separable_Conv(block5, num_in_channel=filter_list[4], num_out_channel=filter_list[5], name='Exit_flow_b6_sepconv1', withBn=True, bn_mom=0.9, workspace=256) - block6 = mx.sym.Activation(data=block6, act_type='relu', name='Exit_flow_b6_sepconv1_relu') - block6 = Separable_Conv(block6, num_in_channel=filter_list[5], num_out_channel=filter_list[6], name='Exit_flow_b6_sepconv2', withBn=True, bn_mom=0.9, workspace=256) - block6 = mx.sym.Activation(data=block6, act_type='relu', name='Exit_flow_b6_sepconv2_relu') - - pool = mx.sym.Pooling(data=block6, global_pool=True, kernel=(7, 7), stride=(1, 1), pad=(0, 0), pool_type="avg", name="global_pool") - dropout = mx.sym.Dropout(data=pool, p=0.2) - flatten = 
mx.sym.Flatten(data=dropout) - - # output - fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=num_classes, name='fc1') - softmax = mx.symbol.SoftmaxOutput(data=fc1, name='softmax') - return fc1, softmax - - - - - - - - - - - - - -