This commit is contained in:
nttstar
2019-10-16 14:37:38 +08:00
60 changed files with 8579 additions and 2299 deletions

6
PRNet.mxnet/README.md Normal file
View File

@@ -0,0 +1,6 @@
MXNet implementation of [Joint 3D Face Reconstruction and Dense Alignment with Position Map Regression Network](http://openaccess.thecvf.com/content_ECCV_2018/papers/Yao_Feng_Joint_3D_Face_ECCV_2018_paper.pdf).
Original [TensorFlow implementation](https://github.com/YadiraF/PRNet).
Pretrained Models and details coming soon.

89
PRNet.mxnet/config.py Normal file
View File

@@ -0,0 +1,89 @@
import numpy as np
from easydict import EasyDict as edict
config = edict()

# default training/dataset config
config.num_classes = 3            # UV position map channels (x, y, z)
config.input_img_size = 256       # network input resolution
config.output_label_size = 64     # UV label map resolution

# network settings -- one preset per architecture, selected by name in
# generate_config(); keys are merged into `config` and `default`.
network = edict()

network.hourglass = edict()
network.hourglass.net_sta = 0
network.hourglass.net_n = 4
network.hourglass.net_dcn = 0          # deformable convolution off
network.hourglass.net_stacks = 1
network.hourglass.net_block = 'resnet'
network.hourglass.net_binarize = False
network.hourglass.losstype = 'heatmap'
network.hourglass.multiplier = 1.0     # channel-width multiplier

network.prnet = edict()
network.prnet.net_sta = 0
network.prnet.net_n = 5
network.prnet.net_dcn = 0
network.prnet.net_stacks = 1
network.prnet.net_modules = 2
network.prnet.net_block = 'hpm'
network.prnet.net_binarize = False
network.prnet.losstype = 'heatmap'
network.prnet.multiplier = 0.25

network.hpm = edict()
network.hpm.net_sta = 0
network.hpm.net_n = 4
network.hpm.net_dcn = 0
network.hpm.net_stacks = 1
network.hpm.net_block = 'hpm'
network.hpm.net_binarize = False
network.hpm.losstype = 'heatmap'
network.hpm.multiplier = 1.0

# dataset settings -- presets merged the same way as `network`.
dataset = edict()

dataset.prnet = edict()
dataset.prnet.dataset = '3D'
dataset.prnet.landmark_type = 'dense'
dataset.prnet.dataset_path = './data64'
dataset.prnet.num_classes = 3
dataset.prnet.input_img_size = 256
dataset.prnet.output_label_size = 64
#dataset.prnet.label_xfirst = False
dataset.prnet.val_targets = ['']

# default settings (training hyper-parameters / CLI defaults)
default = edict()

# default network
default.network = 'hpm'
default.pretrained = ''
default.pretrained_epoch = 0
# default dataset
default.dataset = 'prnet'
default.frequent = 20          # speedometer log frequency (batches)
default.verbose = 200          # validation / checkpoint frequency (batches)
default.kvstore = 'device'

default.prefix = 'model/A'
default.end_epoch = 10000
default.lr = 0.00025
default.wd = 0.0
default.per_batch_size = 20
default.lr_step = '16000,24000,30000'  # batch counts at which lr decays
def generate_config(_network, _dataset):
    """Merge the chosen network and dataset presets into the module-level
    `config` and `default` dicts, and record which presets were used."""
    for preset in (network[_network], dataset[_dataset]):
        for key, value in preset.items():
            config[key] = value
            default[key] = value
    config.network = _network
    config.dataset = _dataset

164
PRNet.mxnet/data.py Normal file
View File

@@ -0,0 +1,164 @@
# pylint: skip-file
import mxnet as mx
import numpy as np
import sys, os
import random
import glob
import math
import scipy.misc
import cv2
import logging
import sklearn
import datetime
import img_helper
from mxnet.io import DataIter
from mxnet import ndarray as nd
from mxnet import io
from mxnet import recordio
from PIL import Image
from config import config
from skimage import transform as tf
class FaceSegIter(DataIter):
    """Data iterator for PRNet UV position-map regression.

    Scans `path` for ``*.jpg`` images, pairing each with a same-named
    ``.npy`` UV-map label, and yields batches of
    (image, [uv_label, weight_mask]).  The weight mask is a fixed
    per-pixel loss weight loaded from ``./uv-data`` and shared by every
    batch.
    """
    def __init__(self, path, batch_size,
                 per_batch_size = 0,
                 aug_level = 0,
                 force_mirror = False,
                 exf = 1,
                 args = None):
        self.aug_level = aug_level
        self.force_mirror = force_mirror
        self.exf = exf
        self.batch_size = batch_size
        self.per_batch_size = per_batch_size
        self.image_file_list = []
        self.uv_file_list = []
        for _file in glob.glob(os.path.join(path, '*.jpg')):
            self.image_file_list.append(_file)
        # The UV label shares the image's basename: "xxx.jpg" -> "xxx.npy".
        for img in self.image_file_list:
            uv_file = img[0:-3]+"npy"
            self.uv_file_list.append(uv_file)
        # NOTE(review): reset() shuffles this in place, which needs a list;
        # works on Python 2 where range() returns a list, would need
        # list(range(...)) under Python 3.
        self.seq = range(len(self.image_file_list))
        print('train size', len(self.seq))
        self.cur = 0
        self.reset()
        self.data_shape = (3, config.input_img_size, config.input_img_size)
        self.num_classes = config.num_classes
        self.input_img_size = config.input_img_size
        #self.label_classes = self.num_classes
        self.output_label_size = config.output_label_size
        #if aug_level>0:
        #  self.output_label_size = config.output_label_size
        #else:
        #  self.output_label_size = self.input_img_size
        self.label_shape = (self.num_classes, self.output_label_size, self.output_label_size)
        self.provide_data = [('data', (batch_size,) + self.data_shape)]
        self.provide_label = [('softmax_label', (batch_size,) + self.label_shape),
                              ('mask_label', (batch_size,)+ self.label_shape)]
        # Per-pixel loss weights; resized to the label resolution if needed.
        weight_mask = cv2.imread('./uv-data/uv_weight_mask.png')
        print('weight_mask', weight_mask.shape)
        if weight_mask.shape[0]!=self.output_label_size:
            weight_mask = cv2.resize(weight_mask, (self.output_label_size, self.output_label_size) )
        #idx = np.where(weight_mask>0)[0]
        #print('weight idx', idx)
        weight_mask = weight_mask.astype(np.float32)
        weight_mask /= 255.0
        # Face-region visibility mask; zeroes the weights outside the face.
        vis_mask = cv2.imread('./uv-data/uv_face_mask.png')
        print('vis_mask', vis_mask.shape)
        if vis_mask.shape[0]!=self.output_label_size:
            vis_mask = cv2.resize(vis_mask, (self.output_label_size, self.output_label_size) )
        vis_mask = vis_mask.astype(np.float32)
        vis_mask /= 255.0
        weight_mask *= vis_mask
        print('weight_mask', weight_mask.shape)
        # HWC -> CHW, then tile to a full batch so it can be fed as a label.
        weight_mask = weight_mask.transpose( (2,0,1) )
        #WM = np.zeros( (batch_size,)+self.label_shape, dtype=np.float32 )
        #for i in range(batch_size):
        #  WM[i] = weight_mask
        #weight_mask = WM
        #weight_mask = weight_mask.reshape( (1, 3, weight_mask.shape[0], weight_mask.shape[1]) )
        weight_mask = weight_mask[np.newaxis,:,:,:]
        print('weight_mask', weight_mask.shape)
        weight_mask = np.tile(weight_mask, (batch_size,1,1,1))
        print('weight_mask', weight_mask.shape)
        self.weight_mask = nd.array(weight_mask)
        self.img_num = 0
        self.invalid_num = 0
        self.mode = 1
        self.vis = 0
        self.stats = [0,0]

    def get_data_shape(self):
        # Shape of one image blob: (C, H, W).
        return self.data_shape

    #def get_label_shape(self):
    #  return self.label_shape

    def get_shape_dict(self):
        # Name -> full batch shape for every data and label blob.
        D = {}
        for (k,v) in self.provide_data:
            D[k] = v
        for (k,v) in self.provide_label:
            D[k] = v
        return D

    def get_label_names(self):
        # Label blob names, in provide_label order.
        D = []
        for (k,v) in self.provide_label:
            D.append(k)
        return D

    def reset(self):
        """Rewind the iterator; shuffle sample order when augmenting."""
        #print('reset')
        self.cur = 0
        if self.aug_level>0:
            random.shuffle(self.seq)

    def next_sample(self):
        """Helper function for reading in next sample."""
        if self.cur >= len(self.seq):
            raise StopIteration
        idx = self.seq[self.cur]
        self.cur += 1
        uv_path = self.uv_file_list[idx]
        image_path = self.image_file_list[idx]
        uvmap = np.load(uv_path)
        img = cv2.imread(image_path)[:,:,::-1]#to rgb
        hlabel = uvmap
        #print(hlabel.shape)
        #hlabel = np.array(header.label).reshape( (self.output_label_size, self.output_label_size, self.num_classes) )
        # Normalize UV coordinates by the input resolution (in-place on the
        # loaded array; assumes the .npy stores floats -- TODO confirm).
        hlabel /= self.input_img_size
        return img, hlabel

    def next(self):
        """Returns the next batch of data."""
        #print('next')
        batch_size = self.batch_size
        batch_data = nd.empty((batch_size,)+self.data_shape)
        batch_label = nd.empty((batch_size,)+self.label_shape)
        i = 0
        #self.cutoff = random.randint(800,1280)
        try:
            while i < batch_size:
                #print('N', i)
                data, label = self.next_sample()
                # HWC -> CHW for both the image and the UV label.
                data = nd.array(data)
                data = nd.transpose(data, axes=(2, 0, 1))
                label = nd.array(label)
                label = nd.transpose(label, axes=(2, 0, 1))
                batch_data[i][:] = data
                batch_label[i][:] = label
                i += 1
        except StopIteration:
            # Drop incomplete trailing batches instead of padding them.
            if i<batch_size:
                raise StopIteration
        #return {self.data_name : batch_data,
        #    self.label_name : batch_label}
        #print(batch_data.shape, batch_label.shape)
        return mx.io.DataBatch([batch_data], [batch_label, self.weight_mask], batch_size - i)

99
PRNet.mxnet/metric.py Normal file
View File

@@ -0,0 +1,99 @@
import mxnet as mx
import numpy as np
import math
import cv2
from config import config
class LossValueMetric(mx.metric.EvalMetric):
    """Metric that reports the running mean of the network's loss output."""

    def __init__(self):
        self.axis = 1
        super(LossValueMetric, self).__init__(
            'lossvalue', axis=self.axis,
            output_names=None, label_names=None)
        self.losses = []

    def update(self, labels, preds):
        """Fold the mean of the first prediction output into the metric."""
        batch_loss = preds[0].asnumpy()
        self.sum_metric += np.mean(batch_loss)
        self.num_inst += 1.0
class NMEMetric(mx.metric.EvalMetric):
    """Normalized Mean Error over facial-landmark heatmaps.

    Per-landmark error is the pixel distance between the argmax of the
    predicted heatmap (resized to input resolution) and the ground truth,
    normalized per face by inter-ocular distance (config.landmark_type
    '2d') or by the ground-truth bounding-box diagonal otherwise.
    """
    def __init__(self):
        self.axis = 1
        super(NMEMetric, self).__init__(
            'NME', axis=self.axis,
            output_names=None, label_names=None)
        #self.losses = []
        self.count = 0

    def cal_nme(self, label, pred_label):
        """Mean NME over a batch.

        label: (B, P, H, W) gt heatmaps when ndim==4, else (B, P, 2)-like
        coordinates; pred_label: (B, P, h, w) predicted heatmaps.
        NOTE(review): xrange below assumes Python 2.
        """
        nme = []
        for b in xrange(pred_label.shape[0]):
            # record[0..3] hold eye-corner landmarks 36/39/42/45;
            # record[4]/[5] accumulate elementwise min/max over all gt
            # points (the gt bounding box).
            record = [None]*6
            item = []
            # Skip faces whose ground truth is entirely zero (invalid).
            if label.ndim==4:
                _heatmap = label[b][36]
                if np.count_nonzero(_heatmap)==0:
                    continue
            else:#ndim==3
                #print(label[b])
                if np.count_nonzero(label[b])==0:
                    continue
            for p in xrange(pred_label.shape[1]):
                if label.ndim==4:
                    heatmap_gt = label[b][p]
                    ind_gt = np.unravel_index(np.argmax(heatmap_gt, axis=None), heatmap_gt.shape)
                    ind_gt = np.array(ind_gt)
                else:
                    ind_gt = label[b][p]
                #ind_gt = ind_gt.astype(np.int)
                #print(ind_gt)
                # Upscale the predicted heatmap to input resolution so the
                # argmax is comparable with the gt coordinates.
                heatmap_pred = pred_label[b][p]
                heatmap_pred = cv2.resize(heatmap_pred, (config.input_img_size, config.input_img_size))
                ind_pred = np.unravel_index(np.argmax(heatmap_pred, axis=None), heatmap_pred.shape)
                ind_pred = np.array(ind_pred)
                #print(ind_gt.shape)
                #print(ind_pred)
                if p==36:
                    #print('b', b, p, ind_gt, np.count_nonzero(heatmap_gt))
                    record[0] = ind_gt
                elif p==39:
                    record[1] = ind_gt
                elif p==42:
                    record[2] = ind_gt
                elif p==45:
                    record[3] = ind_gt
                if record[4] is None or record[5] is None:
                    record[4] = ind_gt
                    record[5] = ind_gt
                else:
                    record[4] = np.minimum(record[4], ind_gt)
                    record[5] = np.maximum(record[5], ind_gt)
                #print(ind_gt.shape, ind_pred.shape)
                value = np.sqrt(np.sum(np.square(ind_gt - ind_pred)))
                item.append(value)
            _nme = np.mean(item)
            if config.landmark_type=='2d':
                # Normalize by the distance between the two eye centers.
                left_eye = (record[0]+record[1])/2
                right_eye = (record[2]+record[3])/2
                _dist = np.sqrt(np.sum(np.square(left_eye - right_eye)))
                #print('eye dist', _dist, left_eye, right_eye)
                _nme /= _dist
            else:
                # Normalize by the gt bounding-box diagonal.
                #_dist = np.sqrt(float(label.shape[2]*label.shape[3]))
                _dist = np.sqrt(np.sum(np.square(record[5] - record[4])))
                #print(_dist)
                _nme /= _dist
            nme.append(_nme)
        return np.mean(nme)

    def update(self, labels, preds):
        """Accumulate NME for one batch; the last output is the prediction."""
        self.count+=1
        label = labels[0].asnumpy()
        pred_label = preds[-1].asnumpy()
        nme = self.cal_nme(label, pred_label)
        #print('nme', nme)
        #nme = np.mean(nme)
        self.sum_metric += np.mean(nme)
        self.num_inst += 1.0

View File

@@ -0,0 +1,435 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import mxnet as mx
import numpy as np
from config import config
ACT_BIT = 1        # activation bit width for the (disabled) binarized layers
bn_mom = 0.9       # BatchNorm momentum shared by every layer in this file
workspace = 256    # MXNet convolution workspace limit (MB)
memonger = False   # when True, tag shortcuts for memory-optimized training
def Conv(**kwargs):
    """Thin wrapper over mx.sym.Convolution, kept as a single swap point."""
    return mx.sym.Convolution(**kwargs)
def Act(data, act_type, name):
    """Activation helper: 'prelu' maps to LeakyReLU, anything else goes
    through the plain Activation operator."""
    if act_type == 'prelu':
        return mx.sym.LeakyReLU(data=data, act_type='prelu', name=name)
    return mx.symbol.Activation(data=data, act_type=act_type, name=name)
#def lin(data, num_filter, workspace, name, binarize, dcn):
# bit = 1
# if not binarize:
# if not dcn:
# conv1 = Conv(data=data, num_filter=num_filter, kernel=(1,1), stride=(1,1), pad=(0,0),
# no_bias=True, workspace=workspace, name=name + '_conv')
# bn1 = mx.sym.BatchNorm(data=conv1, fix_gamma=False, momentum=bn_mom, eps=2e-5, name=name + '_bn')
# act1 = Act(data=bn1, act_type='relu', name=name + '_relu')
# return act1
# else:
# bn1 = mx.sym.BatchNorm(data=data, fix_gamma=False, momentum=bn_mom, eps=2e-5, name=name + '_bn')
# act1 = Act(data=bn1, act_type='relu', name=name + '_relu')
# conv1_offset = mx.symbol.Convolution(name=name+'_conv_offset', data = act1,
# num_filter=18, pad=(1, 1), kernel=(3, 3), stride=(1, 1))
# conv1 = mx.contrib.symbol.DeformableConvolution(name=name+"_conv", data=act1, offset=conv1_offset,
# num_filter=num_filter, pad=(1,1), kernel=(3, 3), num_deformable_group=1, stride=(1, 1), dilate=(1, 1), no_bias=False)
# #conv1 = Conv(data=act1, num_filter=num_filter, kernel=(3,3), stride=(1,1), pad=(1,1),
# # no_bias=False, workspace=workspace, name=name + '_conv')
# return conv1
# else:
# bn1 = mx.sym.BatchNorm(data=data, fix_gamma=False, momentum=bn_mom, eps=2e-5, name=name + '_bn')
# act1 = Act(data=bn1, act_type='relu', name=name + '_relu')
# conv1 = mx.sym.QConvolution_v1(data=act1, num_filter=num_filter, kernel=(1,1), stride=(1,1), pad=(0,0),
# no_bias=True, workspace=workspace, name=name + '_conv', act_bit=ACT_BIT, weight_bit=bit)
# conv1 = mx.sym.BatchNorm(data=conv1, fix_gamma=False, momentum=bn_mom, eps=2e-5, name=name + '_bn2')
# return conv1
def lin3(data, num_filter, workspace, name, k, g=1, d=1):
    """k x k Conv -> BN -> ReLU.  Only 3x3 kernels honor the dilation `d`;
    other sizes use same-padding with no dilation."""
    if k == 3:
        conv1 = Conv(data=data, num_filter=num_filter, kernel=(k, k), stride=(1, 1),
                     pad=(d, d), num_group=g, dilate=(d, d),
                     no_bias=True, workspace=workspace, name=name + '_conv')
    else:
        conv1 = Conv(data=data, num_filter=num_filter, kernel=(k, k), stride=(1, 1),
                     pad=((k - 1) // 2, (k - 1) // 2), num_group=g,
                     no_bias=True, workspace=workspace, name=name + '_conv')
    bn1 = mx.sym.BatchNorm(data=conv1, fix_gamma=False, momentum=bn_mom, eps=2e-5, name=name + '_bn')
    return Act(data=bn1, act_type='relu', name=name + '_relu')
def ConvFactory(data, num_filter, kernel, stride=(1, 1), pad=(0, 0), act_type="relu", mirror_attr={}, with_act=True, dcn=False, name=''):
    """Conv (or deformable conv) -> BN, optionally followed by activation.

    NOTE: the deformable path hard-codes a 3x3 kernel/pad and ignores the
    `kernel` and `pad` arguments.
    """
    if dcn:
        offset = mx.symbol.Convolution(name=name + '_conv_offset', data=data,
                                       num_filter=18, pad=(1, 1), kernel=(3, 3), stride=(1, 1))
        conv = mx.contrib.symbol.DeformableConvolution(name=name + "_conv", data=data, offset=offset,
                                                       num_filter=num_filter, pad=(1, 1), kernel=(3, 3),
                                                       num_deformable_group=1, stride=stride, dilate=(1, 1), no_bias=False)
    else:
        conv = mx.symbol.Convolution(
            data=data, num_filter=num_filter, kernel=kernel, stride=stride,
            pad=pad, no_bias=True, workspace=workspace, name=name + '_conv')
    bn = mx.symbol.BatchNorm(data=conv, fix_gamma=False, momentum=bn_mom, eps=2e-5, name=name + '_bn')
    if not with_act:
        return bn
    return Act(bn, act_type, name=name + '_relu')
class CAB:
    """Cascaded Aggregation Block.

    Lazily builds a triangular grid of feature nodes; get_output(w, h)
    memoizes one (symbol, num_channels) pair per grid cell, and get()
    returns the aggregated output at cell (1, 1).  Cell (n, n) is the
    input feature map with `nFilters` channels.
    """
    def __init__(self, data, nFilters, nModules, n, workspace, name, dilate, group):
        self.data = data
        self.nFilters = nFilters
        self.nModules = nModules
        self.n = n                 # grid size; (n, n) is the input cell
        self.workspace = workspace
        self.name = name
        self.dilate = dilate
        self.group = group
        self.sym_map = {}          # (w, h) -> (symbol, channels) memo

    def get_output(self, w, h):
        """Memoized recursive construction of grid cell (w, h)."""
        key = (w, h)
        if key in self.sym_map:
            return self.sym_map[key]
        ret = None
        if h==self.n:
            if w==self.n:
                # Base case: the raw input feature map.
                ret = (self.data, self.nFilters)
            else:
                # Bottom row: step left, halving channels each step; the
                # last step before the input uses the configured dilation.
                x = self.get_output(w+1, h)
                f = int(x[1]*0.5)
                if w!=self.n-1:
                    body = lin3(x[0], f, self.workspace, "%s_w%d_h%d_1"%(self.name, w, h), 3, self.group, 1)
                else:
                    body = lin3(x[0], f, self.workspace, "%s_w%d_h%d_1"%(self.name, w, h), 3, self.group, self.dilate)
                ret = (body,f)
        else:
            # Interior cell: merge the diagonal neighbor (x) and the cell
            # below (y).
            x = self.get_output(w+1, h+1)
            y = self.get_output(w, h+1)
            if h%2==1 and h!=w:
                xbody = lin3(x[0], x[1], self.workspace, "%s_w%d_h%d_2"%(self.name, w, h), 3, x[1])
                #xbody = xbody+x[0]
            else:
                xbody = x[0]
            #xbody = x[0]
            #xbody = lin3(x[0], x[1], self.workspace, "%s_w%d_h%d_2"%(self.name, w, h), 3, x[1])
            if w==0:
                ybody = lin3(y[0], y[1], self.workspace, "%s_w%d_h%d_3"%(self.name, w, h), 3, self.group)
            else:
                ybody = y[0]
            # Concat along channels -- presumably to match xbody's channel
            # count before the elementwise add; TODO confirm widths.
            ybody = mx.sym.concat(y[0], ybody, dim=1)
            body = mx.sym.add_n(xbody,ybody, name="%s_w%d_h%d_add"%(self.name, w, h))
            body = body/2
            ret = (body, x[1])
        self.sym_map[key] = ret
        return ret

    def get(self):
        """Return the aggregated output symbol (cell (1, 1))."""
        return self.get_output(1, 1)[0]
def conv_resnet(data, num_filter, stride, dim_match, name, binarize, dcn, dilate, **kwargs):
    """Pre-activation residual bottleneck (BN-ReLU-Conv x3 + shortcut),
    following https://github.com/facebook/fb.resnet.torch#notes.

    `binarize`, `dcn` and `dilate` are accepted for signature compatibility
    with the other conv_* builders but are unused here.  (The unused local
    `bit` from the original was removed.)
    """
    bn1 = mx.sym.BatchNorm(data=data, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn1')
    act1 = Act(data=bn1, act_type='relu', name=name + '_relu1')
    # 1x1 reduce to half the output channels.
    conv1 = Conv(data=act1, num_filter=int(num_filter*0.5), kernel=(1,1), stride=(1,1), pad=(0,0),
                 no_bias=True, workspace=workspace, name=name + '_conv1')
    bn2 = mx.sym.BatchNorm(data=conv1, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn2')
    act2 = Act(data=bn2, act_type='relu', name=name + '_relu2')
    # 3x3 at the reduced width.
    conv2 = Conv(data=act2, num_filter=int(num_filter*0.5), kernel=(3,3), stride=(1,1), pad=(1,1),
                 no_bias=True, workspace=workspace, name=name + '_conv2')
    bn3 = mx.sym.BatchNorm(data=conv2, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn3')
    act3 = Act(data=bn3, act_type='relu', name=name + '_relu3')
    # 1x1 expand back to num_filter channels.
    conv3 = Conv(data=act3, num_filter=num_filter, kernel=(1,1), stride=(1,1), pad=(0,0), no_bias=True,
                 workspace=workspace, name=name + '_conv3')
    #if binarize:
    #  conv3 = mx.sym.BatchNorm(data=conv3, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn4')
    if dim_match:
        shortcut = data
    else:
        # Projection shortcut taken from the pre-activated input.
        shortcut = Conv(data=act1, num_filter=num_filter, kernel=(1,1), stride=stride, no_bias=True,
                        workspace=workspace, name=name+'_sc')
    if memonger:
        shortcut._set_attr(mirror_stage='True')
    return conv3 + shortcut
def conv_prnet(data, num_filter, stride, dim_match, name, binarize, dcn, dilate, **kwargs):
    """Pre-activation residual bottleneck used by the 'prnet' block type
    (BN-ReLU-Conv x3 plus identity or projection shortcut)."""
    def _bn_act_conv(x, filters, k, padding, suffix):
        # One BN -> ReLU -> Conv step; returns (activation, conv output).
        bn = mx.sym.BatchNorm(data=x, fix_gamma=False, eps=2e-5, momentum=bn_mom,
                              name=name + '_bn' + suffix)
        act = Act(data=bn, act_type='relu', name=name + '_relu' + suffix)
        conv = Conv(data=act, num_filter=filters, kernel=(k, k), stride=(1, 1),
                    pad=(padding, padding), no_bias=True, workspace=workspace,
                    name=name + '_conv' + suffix)
        return act, conv

    half = int(num_filter * 0.5)
    act1, conv1 = _bn_act_conv(data, half, 1, 0, '1')     # 1x1 reduce
    _, conv2 = _bn_act_conv(conv1, half, 3, 1, '2')       # 3x3
    _, conv3 = _bn_act_conv(conv2, num_filter, 1, 0, '3') # 1x1 expand

    if dim_match:
        shortcut = data
    else:
        shortcut = Conv(data=act1, num_filter=num_filter, kernel=(1, 1), stride=stride,
                        no_bias=True, workspace=workspace, name=name + '_sc')
    if memonger:
        shortcut._set_attr(mirror_stage='True')
    return conv3 + shortcut
def conv_hpm(data, num_filter, stride, dim_match, name, binarize, dcn, dilation, **kwargs):
    """Hierarchical-parallel-multiscale residual unit.

    Three chained dilated 3x3 BN-ReLU-Conv steps whose outputs are
    concatenated (0.5 + 0.25 + 0.25 = num_filter channels) and added to an
    identity or projection shortcut.  `binarize` and `dcn` are accepted for
    signature compatibility but unused.  (The unused local `bit` from the
    original was removed.)
    """
    bn1 = mx.sym.BatchNorm(data=data, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn1')
    act1 = Act(data=bn1, act_type='relu', name=name + '_relu1')
    conv1 = Conv(data=act1, num_filter=int(num_filter*0.5), kernel=(3,3), stride=(1,1), pad=(dilation,dilation), dilate=(dilation,dilation),
                 no_bias=True, workspace=workspace, name=name + '_conv1')
    bn2 = mx.sym.BatchNorm(data=conv1, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn2')
    act2 = Act(data=bn2, act_type='relu', name=name + '_relu2')
    conv2 = Conv(data=act2, num_filter=int(num_filter*0.25), kernel=(3,3), stride=(1,1), pad=(dilation,dilation), dilate=(dilation,dilation),
                 no_bias=True, workspace=workspace, name=name + '_conv2')
    bn3 = mx.sym.BatchNorm(data=conv2, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn3')
    act3 = Act(data=bn3, act_type='relu', name=name + '_relu3')
    conv3 = Conv(data=act3, num_filter=int(num_filter*0.25), kernel=(3,3), stride=(1,1), pad=(dilation,dilation), dilate=(dilation,dilation),
                 no_bias=True, workspace=workspace, name=name + '_conv3')
    # Multi-scale concat of all three stages' outputs.
    conv4 = mx.symbol.Concat(*[conv1, conv2, conv3])
    if dim_match:
        shortcut = data
    else:
        # Projection shortcut taken from the pre-activated input.
        shortcut = Conv(data=act1, num_filter=num_filter, kernel=(1,1), stride=stride, no_bias=True,
                        workspace=workspace, name=name+'_sc')
    if memonger:
        shortcut._set_attr(mirror_stage='True')
    return conv4 + shortcut
def block17(net, input_num_channels, scale=1.0, with_act=True, act_type='relu', mirror_attr={}, name=''):
    """Inception-ResNet style residual block with a 1x1 tower and a
    factorized 1x7 / 7x1 tower, scaled and added back to the input."""
    branch_a = ConvFactory(net, 192, (1, 1), name=name+'_conv')
    branch_b = ConvFactory(net, 129, (1, 1), name=name+'_conv1_0')
    branch_b = ConvFactory(branch_b, 160, (1, 7), pad=(1, 2), name=name+'_conv1_1')
    branch_b = ConvFactory(branch_b, 192, (7, 1), pad=(2, 1), name=name+'_conv1_2')
    mixed = mx.symbol.Concat(*[branch_a, branch_b])
    # Project back to the input channel count; no activation before the add.
    residual = ConvFactory(mixed, input_num_channels, (1, 1), with_act=False, name=name+'_conv_out')
    out = net + scale * residual
    if not with_act:
        return out
    return mx.symbol.Activation(data=out, act_type=act_type, attr=mirror_attr)
def block35(net, input_num_channels, scale=1.0, with_act=True, act_type='relu', mirror_attr={}, name=''):
    """Inception-ResNet style residual block with 1x1, 1x1-3x3 and
    1x1-3x3-3x3 towers, scaled and added back to the input."""
    M = 1.0
    quarter = int(input_num_channels * 0.25 * M)
    branch_a = ConvFactory(net, quarter, (1, 1), name=name+'_conv')
    branch_b = ConvFactory(net, quarter, (1, 1), name=name+'_conv1_0')
    branch_b = ConvFactory(branch_b, quarter, (3, 3), pad=(1, 1), name=name+'_conv1_1')
    branch_c = ConvFactory(net, quarter, (1, 1), name=name+'_conv2_0')
    branch_c = ConvFactory(branch_c, int(input_num_channels*0.375*M), (3, 3), pad=(1, 1), name=name+'_conv2_1')
    branch_c = ConvFactory(branch_c, int(input_num_channels*0.5*M), (3, 3), pad=(1, 1), name=name+'_conv2_2')
    mixed = mx.symbol.Concat(*[branch_a, branch_b, branch_c])
    # Project back to the input channel count; no activation before the add.
    residual = ConvFactory(mixed, input_num_channels, (1, 1), with_act=False, name=name+'_conv_out')
    out = net + scale * residual
    if not with_act:
        return out
    return mx.symbol.Activation(data=out, act_type=act_type, attr=mirror_attr)
def conv_inception(data, num_filter, stride, dim_match, name, binarize, dcn, dilate, **kwargs):
    """Inception-resnet unit; falls back to conv_resnet whenever the
    spatial size or channel count changes."""
    assert not binarize
    if stride[0] > 1 or not dim_match:
        return conv_resnet(data, num_filter, stride, dim_match, name, binarize, dcn, dilate, **kwargs)
    return block35(data, num_filter, name=name + '_block35')
def conv_cab(data, num_filter, stride, dim_match, name, binarize, dcn, dilate, **kwargs):
    """CAB unit; falls back to conv_hpm whenever the spatial size or
    channel count changes."""
    if stride[0] > 1 or not dim_match:
        return conv_hpm(data, num_filter, stride, dim_match, name, binarize, dcn, dilate, **kwargs)
    return CAB(data, num_filter, 1, 4, workspace, name, dilate, 1).get()
def conv_block(data, num_filter, stride, dim_match, name, binarize, dcn, dilate):
    """Dispatch to the residual-unit builder selected by config.net_block.

    Raises ValueError for an unknown block type -- the original fell
    through and returned None, which only crashed much later with a
    confusing symbol error.
    """
    builders = {
        'resnet': conv_resnet,
        'inception': conv_inception,
        'hpm': conv_hpm,
        'cab': conv_cab,
        'prnet': conv_prnet,
    }
    try:
        builder = builders[config.net_block]
    except KeyError:
        raise ValueError('unknown config.net_block: %r' % (config.net_block,))
    return builder(data, num_filter, stride, dim_match, name, binarize, dcn, dilate)
def hourglass(data, nFilters, nModules, n, workspace, name, binarize, dcn):
    """Recursive hourglass module (depth n).

    Builds conv blocks at the current resolution (up1), a max-pooled
    branch (low1 -> low2 -> low3) that recurses n-1 more levels, then
    nearest-neighbor upsampling back.  NOTE(review): only the upsampled
    branch is returned -- the usual up1+up2 skip connection is commented
    out below, so up1 is built but unused.
    """
    s = 2  # pooling / upsampling factor per level
    _dcn = False
    up1 = data
    for i in xrange(nModules):
        up1 = conv_block(up1, nFilters, (1,1), True, "%s_up1_%d"%(name,i), binarize, _dcn, 1)
    # Downsample by max-pooling (the conv alternatives are kept commented).
    low1 = mx.sym.Pooling(data=data, kernel=(s, s), stride=(s,s), pad=(0,0), pool_type='max')
    #low1 = ConvFactory(data, nFilters, (4,4), stride=(2,2), pad=(1,1), name=name+'_conv')
    #low1 = ConvFactory(data, nFilters, (3,3), stride=(2,2), pad=(1,1), name=name+'_conv')
    #low1 = ConvFactory(up1, nFilters, (3,3), stride=(2,2), pad=(1,1), name=name+'_conv')
    for i in xrange(nModules):
        low1 = conv_block(low1, nFilters, (1,1), True, "%s_low1_%d"%(name,i), binarize, _dcn, 1)
    if n>1:
        # Recurse one resolution level down.
        low2 = hourglass(low1, nFilters, nModules, n-1, workspace, "%s_%d"%(name, n-1), binarize, dcn)
    else:
        # Deepest level: plain conv blocks instead of recursion.
        low2 = low1
        for i in xrange(nModules):
            low2 = conv_block(low2, nFilters, (1,1), True, "%s_low2_%d"%(name,i), binarize, _dcn, 1) #TODO
    low3 = low2
    for i in xrange(nModules):
        low3 = conv_block(low3, nFilters, (1,1), True, "%s_low3_%d"%(name,i), binarize, _dcn, 1)
    up2 = mx.symbol.UpSampling(low3, scale=s, sample_type='nearest', workspace=512, name='%s_upsampling_%s'%(name,n), num_args=1)
    #up2 = mx.symbol.UpSampling(low3, scale=s, sample_type='bilinear', num_filter=nFilters, workspace=512, name='%s_upsampling_%s'%(name,n), num_args=1)
    #up2 = mx.symbol.Deconvolution(data=low3, num_filter=nFilters, kernel=(s*2,s*2),
    #    stride=(s, s), pad=(s//2, s//2),
    #    name='%s_upsampling_%s'%(name,n),
    #    attr={'lr_mult': '0.1'})
    #return mx.symbol.add_n(up1, up2)
    return up2
def prnet_loss(pred, gt_label, mask_label):
    """Per-pixel weighted L1 loss for UV position-map regression:
    |pred - gt| scaled elementwise by the broadcast weight mask."""
    abs_err = mx.symbol.abs(pred - gt_label)
    return mx.symbol.broadcast_mul(abs_err, mask_label)
def ce_loss(x, y):
    """Cross-entropy with a numerically stable softmax taken over the
    spatial axes (2, 3); returns the mean over all elements."""
    x_max = mx.sym.max(x, axis=[2, 3], keepdims=True)
    shifted = mx.sym.broadcast_minus(x, x_max)   # subtract max for stability
    expx = mx.sym.exp(shifted)
    prob = mx.sym.broadcast_div(expx, mx.sym.sum(expx, axis=[2, 3], keepdims=True))
    loss = mx.sym.log(prob) * y * -1.0
    return mx.symbol.mean(loss)
def get_symbol(num_classes):
    """Build the training symbol: normalization + stem convs -> one
    hourglass -> 1x1 head predicting `num_classes` UV channels, trained
    with the masked L1 loss (prnet_loss).

    Returns a Group of [MakeLoss(loss), BlockGrad(prediction)].
    """
    # Channel widths scaled by the config multiplier.
    m = config.multiplier
    sFilters = max(int(64*m), 16)
    mFilters = max(int(128*m), 32)
    nFilters = int(256*m)

    nModules = config.net_modules
    nStacks = config.net_stacks
    binarize = config.net_binarize
    input_size = config.input_img_size
    label_size = config.output_label_size
    use_STA = config.net_sta
    N = config.net_n
    DCN = config.net_dcn
    per_batch_size = config.per_batch_size
    print('binarize', binarize)
    print('use_STA', use_STA)
    print('use_N', N)
    print('use_DCN', DCN)
    print('per_batch_size', per_batch_size)
    #assert(label_size==64 or label_size==32)
    #assert(input_size==128 or input_size==256)
    # Downsampling ratio between input image and label map.
    D = input_size // label_size
    print(input_size, label_size, D)
    data = mx.sym.Variable(name='data')
    # Normalize pixel values to roughly [-1, 1].
    data = data-127.5
    data = data*0.0078125
    gt_label = mx.symbol.Variable(name='softmax_label')
    mask_label = mx.symbol.Variable(name='mask_label')
    losses = []
    closses = []
    #body = Conv(data=data, num_filter=sFilters, kernel=(3, 3), stride=(1,1), pad=(1, 1),
    #    no_bias=True, name="conv0", workspace=workspace)
    # Stem: 7x7 stride-2 conv (the 3x3 / 4x4 variants are kept commented).
    body = Conv(data=data, num_filter=sFilters, kernel=(7,7), stride=(2,2), pad=(3,3),
                no_bias=True, name="conv0", workspace=workspace)
    #body = Conv(data=data, num_filter=sFilters, kernel=(4,4), stride=(2,2), pad=(1,1),
    #    no_bias=True, name="conv0", workspace=workspace)
    body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name='bn0')
    body = Act(data=body, act_type='relu', name='relu0')

    dcn = False
    body = conv_block(body, mFilters, (1,1), sFilters==mFilters, 'res0', False, dcn, 1)

    # Second 2x downsampling; total stride is now 4.
    body = mx.sym.Pooling(data=body, kernel=(2, 2), stride=(2,2), pad=(0,0), pool_type='max')
    #body = Conv(data=body, num_filter=mFilters, kernel=(4,4), stride=(2,2), pad=(1,1),
    #    no_bias=True, name="conv1", workspace=workspace)
    #body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name='bn1')
    #body = Act(data=body, act_type='relu', name='relu1')
    #body = conv_block(body, mFilters, (1,1), True, 'res1', False, dcn, 1) #TODO
    body = conv_block(body, nFilters, (1,1), mFilters==nFilters, 'res2', binarize, dcn, 1) #binarize=True?

    heatmap = None
    outs = []
    # Single hourglass stack followed by nModules refinement blocks.
    body = hourglass(body, nFilters, nModules, config.net_n, workspace, 'stack0_hg', binarize, dcn)
    for j in xrange(nModules):
        body = conv_block(body, nFilters, (1,1), True, 'stack0_unit%d'%(j), binarize, dcn, 1)
    _dcn = False
    ll = ConvFactory(body, nFilters, (1,1), dcn = _dcn, name='stack0_ll')
    _name = 'heatmap'
    # 1x1 head producing the UV position map (num_classes channels).
    pred = Conv(data=ll, num_filter=num_classes, kernel=(1, 1), stride=(1,1), pad=(0,0),
                name=_name, workspace=workspace)
    loss = prnet_loss(pred, gt_label, mask_label)
    outs.append(mx.sym.MakeLoss(loss))
    # Expose the raw prediction without letting gradients flow through it.
    pred = mx.symbol.BlockGrad(pred)
    #loss = mx.symbol.add_n(*losses)
    #loss = mx.symbol.MakeLoss(loss)
    #syms = [loss]
    outs.append(pred)
    sym = mx.symbol.Group( outs )
    return sym
def init_weights(sym, data_shape_dict):
    """Create initial arg/aux params for layers that need custom init.

    - deformable-conv offset layers start at zero,
    - fc6_* layers get gaussian weights / zero biases,
    - upsampling layers get fixed bilinear kernels.

    Parameters whose names match none of the rules are left to the
    trainer's default initializer.  Returns (arg_params, aux_params);
    aux_params is always empty here.

    Fix: the original used the Python-2-only dict.iteritems(); .items()
    behaves identically on Python 2 and also works on Python 3.
    """
    arg_name = sym.list_arguments()
    aux_name = sym.list_auxiliary_states()
    arg_shape, _, aux_shape = sym.infer_shape(**data_shape_dict)
    arg_shape_dict = dict(zip(arg_name, arg_shape))
    aux_shape_dict = dict(zip(aux_name, aux_shape))
    #print(arg_shape_dict)
    arg_params = {}
    aux_params = {}
    for k, v in arg_shape_dict.items():
        #print(k,v)
        if k.endswith('offset_weight') or k.endswith('offset_bias'):
            print('initializing',k)
            arg_params[k] = mx.nd.zeros(shape = v)
        elif k.startswith('fc6_'):
            if k.endswith('_weight'):
                print('initializing',k)
                arg_params[k] = mx.random.normal(0, 0.01, shape=v)
            elif k.endswith('_bias'):
                print('initializing',k)
                arg_params[k] = mx.nd.zeros(shape=v)
        elif k.find('upsampling')>=0:
            print('initializing upsampling_weight', k)
            # Fixed bilinear kernel for the nearest/bilinear upsampling op.
            arg_params[k] = mx.nd.zeros(shape=arg_shape_dict[k])
            init = mx.init.Initializer()
            init._init_bilinear(k, arg_params[k])
    return arg_params, aux_params

215
PRNet.mxnet/train.py Normal file
View File

@@ -0,0 +1,215 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import logging
import argparse
from data import FaceSegIter
import mxnet as mx
import mxnet.optimizer as optimizer
import numpy as np
import os
import sys
import math
import random
import cv2
from config import config, default, generate_config
from optimizer import ONadam
from metric import LossValueMetric, NMEMetric
sys.path.append(os.path.join(os.path.dirname(__file__), 'symbol'))
import sym_heatmap
#import sym_fc
#from symbol import fc
args = None
logger = logging.getLogger()
logger.setLevel(logging.INFO)
def main(args):
_seed = 727
random.seed(_seed)
np.random.seed(_seed)
mx.random.seed(_seed)
ctx = []
cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
if len(cvd)>0:
for i in xrange(len(cvd.split(','))):
ctx.append(mx.gpu(i))
if len(ctx)==0:
ctx = [mx.cpu()]
print('use cpu')
else:
print('gpu num:', len(ctx))
#ctx = [mx.gpu(0)]
args.ctx_num = len(ctx)
args.batch_size = args.per_batch_size*args.ctx_num
config.per_batch_size = args.per_batch_size
print('Call with', args, config)
train_iter = FaceSegIter(path = config.dataset_path,
batch_size = args.batch_size,
per_batch_size = args.per_batch_size,
aug_level = 1,
exf = args.exf,
args = args,
)
data_shape = train_iter.get_data_shape()
#label_shape = train_iter.get_label_shape()
sym = sym_heatmap.get_symbol(num_classes=config.num_classes)
if len(args.pretrained)==0:
#data_shape_dict = {'data' : (args.per_batch_size,)+data_shape, 'softmax_label' : (args.per_batch_size,)+label_shape}
data_shape_dict = train_iter.get_shape_dict()
arg_params, aux_params = sym_heatmap.init_weights(sym, data_shape_dict)
else:
vec = args.pretrained.split(',')
print('loading', vec)
_, arg_params, aux_params = mx.model.load_checkpoint(vec[0], int(vec[1]))
#sym, arg_params, aux_params = get_symbol(args, arg_params, aux_params)
model = mx.mod.Module(
context = ctx,
symbol = sym,
label_names = train_iter.get_label_names(),
)
#lr = 1.0e-3
#lr = 2.5e-4
#_rescale_grad = 1.0/args.ctx_num
_rescale_grad = 1.0/args.batch_size
#lr = args.lr
#opt = optimizer.Nadam(learning_rate=args.lr, wd=args.wd, rescale_grad=_rescale_grad, clip_gradient=5.0)
if args.optimizer=='onadam':
opt = ONadam(learning_rate=args.lr, wd=args.wd, rescale_grad=_rescale_grad, clip_gradient=5.0)
elif args.optimizer=='nadam':
opt = optimizer.Nadam(learning_rate=args.lr, rescale_grad=_rescale_grad)
elif args.optimizer=='rmsprop':
opt = optimizer.RMSProp(learning_rate=args.lr, rescale_grad=_rescale_grad)
elif args.optimizer=='adam':
opt = optimizer.Adam(learning_rate=args.lr, rescale_grad=_rescale_grad)
else:
opt = optimizer.SGD(learning_rate=args.lr, momentum=0.9, wd=args.wd, rescale_grad=_rescale_grad)
initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="in", magnitude=2)
_cb = mx.callback.Speedometer(args.batch_size, args.frequent)
_metric = LossValueMetric()
#_metric = NMEMetric()
#_metric2 = AccMetric()
#eval_metrics = [_metric, _metric2]
eval_metrics = [_metric]
lr_steps = [int(x) for x in args.lr_step.split(',')]
print('lr-steps', lr_steps)
global_step = [0]
def val_test():
    """Evaluate the current model on every configured validation set.

    For each target in ``config.val_targets`` with an existing ``.rec``
    file, runs inference with the enclosing scope's training ``model``
    and prints the metric loss plus the mean absolute prediction error
    scaled to input-image pixel units.
    """
    # NOTE(review): the original code also constructed a second module
    # bound to the 'heatmap_output' internal layer (vmodel) but never
    # used it — the loop below forwards through `model`. The dead,
    # memory-consuming construction has been removed.
    for target in config.val_targets:
        _file = os.path.join(config.dataset_path, '%s.rec' % target)
        if not os.path.exists(_file):
            continue  # skip targets without a packed record file
        val_iter = FaceSegIter(path_imgrec=_file,
                               batch_size=args.batch_size,
                               aug_level=0,  # no augmentation at eval time
                               args=args,
                               )
        _metric = LossValueMetric()
        val_metric = mx.metric.create(_metric)
        val_metric.reset()
        val_iter.reset()
        diffs = []
        for i, eval_batch in enumerate(val_iter):
            batch_data = mx.io.DataBatch(eval_batch.data)
            model.forward(batch_data, is_train=False)
            _label = eval_batch.label[0].asnumpy()
            _pred = model.get_outputs()[-1].asnumpy()
            # Mean absolute error, rescaled to input-image pixels.
            _diff = np.mean(np.abs(_pred - _label)) * config.input_img_size
            diffs.append(_diff)
            model.update_metric(val_metric, eval_batch.label)
        nme_value = val_metric.get_name_value()[0][1]
        print('[%d][%s]LOSS: %f' % (global_step[0], target, nme_value))
        print('avg diff', np.mean(diffs))
def _batch_callback(param):
    """Per-batch hook: speedometer, lr schedule, periodic eval/checkpoint."""
    _cb(param)
    global_step[0] += 1
    mbatch = global_step[0]
    # Step the learning rate when a scheduled boundary is reached.
    if mbatch in lr_steps:
        opt.lr *= 0.1 if args.optimizer == 'sgd' else 0.5
        print('lr change to', opt.lr)
    if mbatch % 1000 == 0:
        print('lr-batch-epoch:', opt.lr, param.nbatch, param.epoch)
    # Periodic validation and (optionally) checkpointing.
    if mbatch > 0 and mbatch % args.verbose == 0:
        val_test()
        if args.ckpt == 1:
            msave = mbatch // args.verbose
            print('saving', msave)
            arg, aux = model.get_params()
            mx.model.save_checkpoint(args.prefix, msave, model.symbol, arg, aux)
    # Last scheduled step: optionally save a final snapshot, then stop.
    if mbatch == lr_steps[-1]:
        if args.ckpt == 2:
            msave = 1
            print('saving', msave)
            arg, aux = model.get_params()
            mx.model.save_checkpoint(args.prefix, msave, model.symbol, arg, aux)
        sys.exit(0)
# Overlap data loading with computation, then launch training. The epoch
# count is effectively unbounded; _batch_callback exits the process once
# the last lr step has been reached.
train_iter = mx.io.PrefetchingIter(train_iter)
model.fit(train_iter,
          begin_epoch=0,
          num_epoch=9999,
          eval_data=None,
          eval_metric=eval_metrics,
          kvstore='device',
          optimizer=opt,
          initializer=initializer,
          arg_params=arg_params,
          aux_params=aux_params,
          allow_missing=True,
          batch_end_callback=_batch_callback,
          epoch_end_callback=None)
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Train face alignment')
    # Network/dataset are parsed first so the merged config (and its
    # defaults) is available before the remaining options are registered.
    parser.add_argument('--network', type=str, default=default.network, help='network name')
    parser.add_argument('--dataset', type=str, default=default.dataset, help='dataset name')
    args, rest = parser.parse_known_args()
    generate_config(args.network, args.dataset)
    # Training hyper-parameters and bookkeeping options.
    parser.add_argument('--prefix', default=default.prefix, help='directory to save model.')
    parser.add_argument('--pretrained', default=default.pretrained, help='')
    parser.add_argument('--optimizer', default='nadam', help='')
    parser.add_argument('--lr', type=float, default=default.lr, help='')
    parser.add_argument('--wd', type=float, default=default.wd, help='')
    parser.add_argument('--per-batch-size', type=int, default=default.per_batch_size, help='')
    parser.add_argument('--lr-step', type=str, default=default.lr_step, help='learning rate steps (in epoch)')
    parser.add_argument('--ckpt', type=int, default=1, help='')
    parser.add_argument('--norm', type=int, default=0, help='')
    parser.add_argument('--exf', type=int, default=1, help='')
    parser.add_argument('--frequent', type=int, default=default.frequent, help='')
    parser.add_argument('--verbose', type=int, default=default.verbose, help='')
    main(parser.parse_args())

View File

@@ -5,8 +5,10 @@ By Jia Guo and [Jiankang Deng](https://jiankangdeng.github.io/)
## License
The code of InsightFace is released under the MIT License.
The code of InsightFace is released under the MIT License. There is no limitation on academic or commercial usage.
The training data containing the annotation (and the models trained with these data) are available for non-commercial research purposes only.
## ArcFace Video Demo
[![ArcFace Demo](https://github.com/deepinsight/insightface/blob/master/resources/facerecognitionfromvideo.PNG)](https://www.youtube.com/watch?v=y-D1tReryGA&t=81s)
@@ -15,10 +17,16 @@ Please click the image to watch the Youtube video. For Bilibili users, click [he
## Recent Update
**`2019.04.14`**: We will launch a Light-weight Face Recognition challenge/workshop on ICCV 2019.
**`2019.08.10`**: We achieved 2nd place at [WIDER Face Detection Challenge 2019](http://wider-challenge.org/2019.html).
**`2019.04.04`**: Arcface achieved state-of-the-art performance (5/109) on the NIST Face Recognition Vendor Test (FRVT) (1:1 verification)
[report](https://www.nist.gov/sites/default/files/documents/2019/04/04/frvt_report_2019_04_04.pdf) (name: Imperial-000). Our solution is based on [MS1MV2+DeepGlintAsian, ResNet100, ArcFace loss].
**`2019.05.30`**: [Presentation at cvmart](https://pan.baidu.com/s/1v9fFHBJ8Q9Kl9Z6GwhbY6A)
**`2019.04.30`**: Our Face detector ([RetinaFace](https://github.com/deepinsight/insightface/tree/master/RetinaFace)) obtains state-of-the-art results on [the WiderFace dataset](http://shuoyang1213.me/WIDERFACE/WiderFace_Results.html).
**`2019.04.14`**: We will launch a [Light-weight Face Recognition challenge/workshop](https://github.com/deepinsight/insightface/tree/master/iccv19-challenge) on ICCV 2019.
**`2019.04.04`**: Arcface achieved state-of-the-art performance (7/109) on the NIST Face Recognition Vendor Test (FRVT) (1:1 verification)
[report](https://www.nist.gov/sites/default/files/documents/2019/04/04/frvt_report_2019_04_04.pdf) (name: Imperial-000 and Imperial-001). Our solution is based on [MS1MV2+DeepGlintAsian, ResNet100, ArcFace loss].
**`2019.02.08`**: Please check [https://github.com/deepinsight/insightface/tree/master/recognition](https://github.com/deepinsight/insightface/tree/master/recognition) for our parallel training code which can easily and efficiently support one million identities on a single machine (8* 1080ti).
@@ -203,6 +211,7 @@ For single cropped face image(112x112), total inference time is only 17ms on our
- TensorFlow: [InsightFace_TF](https://github.com/auroua/InsightFace_TF)
- TensorFlow: [tf-insightface](https://github.com/AIInAi/tf-insightface)
- TensorFlow:[insightface](https://github.com/Fei-Wang/insightface)
- PyTorch: [InsightFace_Pytorch](https://github.com/TreB1eN/InsightFace_Pytorch)
- PyTorch: [arcface-pytorch](https://github.com/ronghuaiyang/arcface-pytorch)
- Caffe: [arcface-caffe](https://github.com/xialuxi/arcface-caffe)
@@ -212,23 +221,38 @@ For single cropped face image(112x112), total inference time is only 17ms on our
## Face Alignment
Todo
Please check the [Menpo](https://github.com/jiankangdeng/MenpoBenchmark) Benchmark and [Dense U-Net](https://github.com/deepinsight/insightface/tree/master/alignment) for more details.
## Face Detection
Todo
Please check [RetinaFace](https://github.com/deepinsight/insightface/tree/master/RetinaFace) for more details.
## Citation
If you find *InsightFace* useful in your research, please consider to cite the following related papers:
```
@inproceedings{deng2019retinaface,
title={RetinaFace: Single-stage Dense Face Localisation in the Wild},
author={Deng, Jiankang and Guo, Jia and Yuxiang, Zhou and Jinke Yu and Irene Kotsia and Zafeiriou, Stefanos},
booktitle={arxiv},
year={2019}
}
@inproceedings{guo2018stacked,
title={Stacked Dense U-Nets with Dual Transformers for Robust Face Alignment},
author={Guo, Jia and Deng, Jiankang and Xue, Niannan and Zafeiriou, Stefanos},
booktitle={BMVC},
year={2018}
}
@article{deng2018menpo,
title={The Menpo benchmark for multi-pose 2D and 3D facial landmark localisation and tracking},
author={Deng, Jiankang and Roussos, Anastasios and Chrysos, Grigorios and Ververas, Evangelos and Kotsia, Irene and Shen, Jie and Zafeiriou, Stefanos},
journal={IJCV},
year={2018}
}
@inproceedings{deng2018arcface,
title={ArcFace: Additive Angular Margin Loss for Deep Face Recognition},
author={Deng, Jiankang and Guo, Jia and Niannan, Xue and Zafeiriou, Stefanos},

View File

@@ -2,7 +2,7 @@
## Introduction
RetinaFace is a practical single-stage face detector which is initially described in [arXiv technical report](https://arxiv.org/abs/1905.00641)
RetinaFace is a practical single-stage [SOTA](http://shuoyang1213.me/WIDERFACE/WiderFace_Results.html) face detector which is initially described in [arXiv technical report](https://arxiv.org/abs/1905.00641)
![demoimg1](https://github.com/deepinsight/insightface/blob/master/resources/11513D05.jpg)
@@ -40,7 +40,7 @@ RetinaFace is a practical single-stage face detector which is initially describe
Please check ``train.py`` for training.
1. Copy ``rcnn/sample_config.py`` to ``rcnn/config.py``
2. Download pretrained models and put them into ``model/``.
2. Download ImageNet pretrained models and put them into ``model/`` (these models are used for parameter initialization during training, not for detection testing/inference).
ImageNet ResNet50 ([baidu cloud](https://pan.baidu.com/s/1WAkU9ZA_j-OmzO-sdk9whA) and [dropbox](https://www.dropbox.com/s/48b850vmnaaasfl/imagenet-resnet-50.zip?dl=0)).
@@ -54,7 +54,7 @@ Before training, you can check the ``resnet`` network configuration (e.g. pretra
Please check ``test.py`` for testing.
## Models
## RetinaFace Pretrained Models
Pretrained Model: RetinaFace-R50 ([baidu cloud](https://pan.baidu.com/s/1C6nKq122gJxRhb37vK0_LQ) or [dropbox](https://www.dropbox.com/s/53ftnlarhyrpkg2/retinaface-R50.zip?dl=0)) is a medium size model with ResNet50 backbone.
It can output face bounding boxes and five facial landmarks in a single forward pass.
@@ -63,6 +63,13 @@ WiderFace validation mAP: Easy 96.5, Medium 95.6, Hard 90.4.
To avoid a conflict with the WiderFace Challenge (ICCV 2019), we are postponing the release of our best model.
## Third-party Models
[yangfly](https://github.com/yangfly): RetinaFace-MobileNet0.25 ([baidu cloud](https://pan.baidu.com/s/1P1ypO7VYUbNAezdvLm2m9w)).
WiderFace validation mAP: Hard 82.5. (model size: 1.68Mb)
[clancylian](https://github.com/clancylian/retinaface): C++ version
## References
```

View File

@@ -82,7 +82,7 @@ def get_image(roidb, scale=False):
im = im.astype(np.float32)
boxes_mask = roi_rec['boxes_mask'].copy() * im_scale
boxes_mask = boxes_mask.astype(np.int)
for j in xrange(boxes_mask.shape[0]):
for j in range(boxes_mask.shape[0]):
m = boxes_mask[j]
im_tensor[:,:,m[1]:m[3],m[0]:m[2]] = 0.0
#print('find mask', m, file=sys.stderr)
@@ -131,7 +131,7 @@ def __get_crop_image(roidb):
#im = im.astype(np.float32)
boxes_mask = roi_rec['boxes_mask'].copy()
boxes_mask = boxes_mask.astype(np.int)
for j in xrange(boxes_mask.shape[0]):
for j in range(boxes_mask.shape[0]):
m = boxes_mask[j]
im[m[1]:m[3],m[0]:m[2],:] = 0
#print('find mask', m, file=sys.stderr)
@@ -143,7 +143,7 @@ def __get_crop_image(roidb):
TARGET_BOX_SCALES = np.array([16,32,64,128,256,512])
assert roi_rec['boxes'].shape[0]>0
candidates = []
for i in xrange(roi_rec['boxes'].shape[0]):
for i in range(roi_rec['boxes'].shape[0]):
box = roi_rec['boxes'][i]
box_size = max(box[2]-box[0], box[3]-box[1])
if box_size<config.TRAIN.MIN_BOX_SIZE:
@@ -181,7 +181,7 @@ def __get_crop_image(roidb):
im = cv2.warpAffine(im, M, (SIZE, SIZE), borderValue = tuple(config.PIXEL_MEANS))
#tbox = np.array([left, left+SIZE, up, up+SIZE], dtype=np.int)
#im_new = np.zeros( (SIZE, SIZE,3), dtype=im.dtype)
#for i in xrange(3):
#for i in range(3):
# im_new[:,:,i] = config.PIXEL_MEANS[i]
new_rec['boxes'][:,0] -= left
new_rec['boxes'][:,2] -= left
@@ -192,7 +192,7 @@ def __get_crop_image(roidb):
#print('before', new_rec['boxes'].shape[0])
boxes_new = []
classes_new = []
for i in xrange(new_rec['boxes'].shape[0]):
for i in range(new_rec['boxes'].shape[0]):
box = new_rec['boxes'][i]
box_size = max(box[2]-box[0], box[3]-box[1])
center = np.array(([box[0], box[1]]+[box[2], box[3]]))/2
@@ -211,7 +211,7 @@ def __get_crop_image(roidb):
global TMP_ID
if TMP_ID<10:
tim = im.copy()
for i in xrange(new_rec['boxes'].shape[0]):
for i in range(new_rec['boxes'].shape[0]):
box = new_rec['boxes'][i].copy().astype(np.int)
cv2.rectangle(tim, (box[0], box[1]), (box[2], box[3]), (255, 0, 0), 1)
filename = './trainimages/train%d.png' % TMP_ID
@@ -279,7 +279,7 @@ def get_crop_image1(roidb):
#im = im.astype(np.float32)
boxes_mask = roi_rec['boxes_mask'].copy()
boxes_mask = boxes_mask.astype(np.int)
for j in xrange(boxes_mask.shape[0]):
for j in range(boxes_mask.shape[0]):
m = boxes_mask[j]
im[m[1]:m[3],m[0]:m[2],:] = 127
#print('find mask', m, file=sys.stderr)
@@ -342,7 +342,7 @@ def get_crop_image1(roidb):
#print(origin_shape, im_new.shape, im_scale)
valid = []
valid_boxes = []
for i in xrange(boxes_new.shape[0]):
for i in range(boxes_new.shape[0]):
box = boxes_new[i]
#center = np.array(([box[0], box[1]]+[box[2], box[3]]))/2
centerx = (box[0]+box[2])/2
@@ -385,12 +385,12 @@ def get_crop_image1(roidb):
global TMP_ID
if TMP_ID>=0 and TMP_ID<10:
tim = im.copy().astype(np.uint8)
for i in xrange(new_rec['boxes'].shape[0]):
for i in range(new_rec['boxes'].shape[0]):
box = new_rec['boxes'][i].copy().astype(np.int)
cv2.rectangle(tim, (box[0], box[1]), (box[2], box[3]), (255, 0, 0), 1)
print('draw box:', box)
if config.FACE_LANDMARK:
for i in xrange(new_rec['landmarks'].shape[0]):
for i in range(new_rec['landmarks'].shape[0]):
landmark = new_rec['landmarks'][i].copy()
if landmark[0][2]<0:
print('zero', landmark)
@@ -444,14 +444,14 @@ def get_crop_image2(roidb):
#im = im.astype(np.float32)
boxes_mask = roi_rec['boxes_mask'].copy()
boxes_mask = boxes_mask.astype(np.int)
for j in xrange(boxes_mask.shape[0]):
for j in range(boxes_mask.shape[0]):
m = boxes_mask[j]
im[m[1]:m[3],m[0]:m[2],:] = 0
#print('find mask', m, file=sys.stderr)
SIZE = config.SCALES[0][0]
scale_array = np.array([16,32,64,128,256,512], dtype=np.float32)
candidates = []
for i in xrange(roi_rec['boxes'].shape[0]):
for i in range(roi_rec['boxes'].shape[0]):
box = roi_rec['boxes'][i]
box_size = max(box[2]-box[0], box[3]-box[1])
if box_size<config.TRAIN.MIN_BOX_SIZE:
@@ -594,7 +594,7 @@ def get_crop_image2(roidb):
#print(origin_shape, im_new.shape, im_scale)
valid = []
valid_boxes = []
for i in xrange(boxes_new.shape[0]):
for i in range(boxes_new.shape[0]):
box = boxes_new[i]
#center = np.array(([box[0], box[1]]+[box[2], box[3]]))/2
centerx = (box[0]+box[2])/2
@@ -633,12 +633,12 @@ def get_crop_image2(roidb):
global TMP_ID
if TMP_ID>=0 and TMP_ID<10:
tim = im.copy().astype(np.uint8)
for i in xrange(new_rec['boxes'].shape[0]):
for i in range(new_rec['boxes'].shape[0]):
box = new_rec['boxes'][i].copy().astype(np.int)
cv2.rectangle(tim, (box[0], box[1]), (box[2], box[3]), (255, 0, 0), 1)
print('draw box:', box)
if config.FACE_LANDMARK:
for i in xrange(new_rec['landmarks'].shape[0]):
for i in range(new_rec['landmarks'].shape[0]):
landmark = new_rec['landmarks'][i].copy()
if landmark[10]==0.0:
print('zero', landmark)

View File

@@ -6,7 +6,7 @@ import mxnet.autograd as ag
import numpy as np
from rcnn.config import config
from rcnn.PY_OP import rpn_fpn_ohem3
from symbol_common import get_sym_train
from rcnn.symbol.symbol_common import get_sym_train
def conv_only(from_layer, name, num_filter, kernel=(1,1), pad=(0,0), \

View File

@@ -6,7 +6,7 @@ import mxnet.autograd as ag
import numpy as np
from rcnn.config import config
from rcnn.PY_OP import rpn_fpn_ohem3
from symbol_common import get_sym_train
from rcnn.symbol.symbol_common import get_sym_train
def conv_only(from_layer, name, num_filter, kernel=(1,1), pad=(0,0), \
stride=(1,1), bias_wd_mult=0.0):

View File

@@ -159,7 +159,7 @@ class FaceSegIter(DataIter):
#ul = np.array( (50000,50000), dtype=np.int32)
#br = np.array( (0,0), dtype=np.int32)
#for i in xrange(hlabel.shape[0]):
#for i in range(hlabel.shape[0]):
# h = int(hlabel[i][0])
# w = int(hlabel[i][1])
# key = np.array((h,w))
@@ -171,9 +171,9 @@ class FaceSegIter(DataIter):
def get_flip(self, data, label):
data_flip = np.zeros_like(data)
label_flip = np.zeros_like(label)
for k in xrange(data_flip.shape[2]):
for k in range(data_flip.shape[2]):
data_flip[:,:,k] = np.fliplr(data[:,:,k])
for k in xrange(label_flip.shape[0]):
for k in range(label_flip.shape[0]):
label_flip[k,:] = np.fliplr(label[k,:])
#print(label[0,:].shape)
label_flip = label_flip[self.flip_order,:]
@@ -186,7 +186,7 @@ class FaceSegIter(DataIter):
# filename = './vis/raw_%d.jpg' % (self.img_num)
# print('save', filename)
# draw = data.copy()
# for i in xrange(label.shape[0]):
# for i in range(label.shape[0]):
# cv2.circle(draw, (label[i][1], label[i][0]), 1, (0, 0, 255), 2)
# scipy.misc.imsave(filename, draw)
@@ -223,7 +223,7 @@ class FaceSegIter(DataIter):
#data_out = img_helper.crop2(data, center, _scale, (self.input_img_size, self.input_img_size), rot=rotate)
label_out = np.zeros(self.label_shape, dtype=np.float32)
#print('out shapes', data_out.shape, label_out.shape)
for i in xrange(label.shape[0]):
for i in range(label.shape[0]):
pt = label[i].copy()
#pt = pt[::-1]
npt = img_helper.transform_pt(pt, trans)
@@ -277,7 +277,7 @@ class FaceSegIter(DataIter):
print('save', filename)
draw = data_out.copy()
alabel = label_out.copy()
for i in xrange(label.shape[0]):
for i in range(label.shape[0]):
a = cv2.resize(alabel[i], (self.input_img_size, self.input_img_size))
ind = np.unravel_index(np.argmax(a, axis=None), a.shape)
cv2.circle(draw, (ind[1], ind[0]), 1, (0, 0, 255), 2)

View File

@@ -28,7 +28,7 @@ class NMEMetric(mx.metric.EvalMetric):
def cal_nme(self, label, pred_label):
nme = []
for b in xrange(pred_label.shape[0]):
for b in range(pred_label.shape[0]):
record = [None]*6
item = []
if label.ndim==4:
@@ -39,7 +39,7 @@ class NMEMetric(mx.metric.EvalMetric):
#print(label[b])
if np.count_nonzero(label[b])==0:
continue
for p in xrange(pred_label.shape[1]):
for p in range(pred_label.shape[1]):
if label.ndim==4:
heatmap_gt = label[b][p]
ind_gt = np.unravel_index(np.argmax(heatmap_gt, axis=None), heatmap_gt.shape)

View File

@@ -313,19 +313,19 @@ def hourglass(data, nFilters, nModules, n, workspace, name, binarize, dcn):
s = 2
_dcn = False
up1 = data
for i in xrange(nModules):
for i in range(nModules):
up1 = conv_block(up1, nFilters, (1,1), True, "%s_up1_%d"%(name,i), binarize, _dcn, 1)
low1 = mx.sym.Pooling(data=data, kernel=(s, s), stride=(s,s), pad=(0,0), pool_type='max')
for i in xrange(nModules):
for i in range(nModules):
low1 = conv_block(low1, nFilters, (1,1), True, "%s_low1_%d"%(name,i), binarize, _dcn, 1)
if n>1:
low2 = hourglass(low1, nFilters, nModules, n-1, workspace, "%s_%d"%(name, n-1), binarize, dcn)
else:
low2 = low1
for i in xrange(nModules):
for i in range(nModules):
low2 = conv_block(low2, nFilters, (1,1), True, "%s_low2_%d"%(name,i), binarize, _dcn, 1) #TODO
low3 = low2
for i in xrange(nModules):
for i in range(nModules):
low3 = conv_block(low3, nFilters, (1,1), True, "%s_low3_%d"%(name,i), binarize, _dcn, 1)
up2 = mx.symbol.UpSampling(low3, scale=s, sample_type='nearest', workspace=512, name='%s_upsampling_%s'%(name,n), num_args=1)
return mx.symbol.add_n(up1, up2)
@@ -517,14 +517,14 @@ def get_symbol(num_classes):
heatmap = None
for i in xrange(nStacks):
for i in range(nStacks):
shortcut = body
if config.net_sta>0:
sta = STA(body, nFilters, nModules, config.net_n+1, workspace, 'sta%d'%(i))
body = sta.get()
else:
body = hourglass(body, nFilters, nModules, config.net_n, workspace, 'stack%d_hg'%(i), binarize, dcn)
for j in xrange(nModules):
for j in range(nModules):
body = conv_block(body, nFilters, (1,1), True, 'stack%d_unit%d'%(i,j), binarize, dcn, 1)
_dcn = True if config.net_dcn>=2 else False
ll = ConvFactory(body, nFilters, (1,1), dcn = _dcn, name='stack%d_ll'%(i))
@@ -596,7 +596,8 @@ def init_weights(sym, data_shape_dict):
#print(arg_shape_dict)
arg_params = {}
aux_params = {}
for k,v in arg_shape_dict.iteritems():
for k in arg_shape_dict:
v = arg_shape_dict[k]
#print(k,v)
if k.endswith('offset_weight') or k.endswith('offset_bias'):
print('initializing',k)

View File

@@ -54,7 +54,7 @@ class Handler:
tb = datetime.datetime.now()
print('module time cost', (tb-ta).total_seconds())
ret = np.zeros( (alabel.shape[0], 2), dtype=np.float32)
for i in xrange(alabel.shape[0]):
for i in range(alabel.shape[0]):
a = cv2.resize(alabel[i], (self.image_size[1], self.image_size[0]))
ind = np.unravel_index(np.argmax(a, axis=None), a.shape)
#ret[i] = (ind[0], ind[1]) #h, w

View File

@@ -35,7 +35,7 @@ def main(args):
ctx = []
cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
if len(cvd)>0:
for i in xrange(len(cvd.split(','))):
for i in range(len(cvd.split(','))):
ctx.append(mx.gpu(i))
if len(ctx)==0:
ctx = [mx.cpu()]

View File

@@ -19,7 +19,7 @@ model = face_embedding.FaceModel(args)
img = cv2.imread('/raid5data/dplearn/megaface/facescrubr/112x112/Tom_Hanks/Tom_Hanks_54745.png')
time_now = datetime.datetime.now()
for i in xrange(3000):
for i in range(3000):
f1 = model.get_feature(img)
time_now2 = datetime.datetime.now()
diff = time_now2 - time_now

View File

@@ -22,7 +22,7 @@ import face_preprocess
def do_flip(data):
for idx in xrange(data.shape[0]):
for idx in range(data.shape[0]):
data[idx,:,:] = np.fliplr(data[idx,:,:])
class FaceModel:

View File

@@ -22,7 +22,7 @@ import face_preprocess
def do_flip(data):
for idx in xrange(data.shape[0]):
for idx in range(data.shape[0]):
data[idx,:,:] = np.fliplr(data[idx,:,:])
def get_model(ctx, image_size, model_str, layer):

View File

@@ -1,52 +0,0 @@
import mxnet as mx
import numpy as np
import sys, os
source_dir = sys.argv[1]
input_dir = sys.argv[2]
idx_file = os.path.join(source_dir, 'traino.idx')
rec_file = os.path.join(source_dir, 'traino.rec')
writer = mx.recordio.MXIndexedRecordIO(os.path.join(source_dir,'train.idx'), os.path.join(source_dir,'train.rec'), 'w') # pylint: disable=redefined-variable-type
imgrec = mx.recordio.MXIndexedRecordIO(idx_file, rec_file, 'r') # pylint: disable=redefined-variable-type
seq = list(imgrec.keys)
widx = 0
for img_idx in seq:
s = imgrec.read_idx(img_idx)
assert widx==img_idx
writer.write_idx(widx, s)
widx+=1
stat = {}
for _file in os.listdir(input_dir):
if not _file.endswith('.rec'):
continue
rec_file = os.path.join(input_dir, _file)
print(rec_file)
idx_file = rec_file[:-4]+'.idx'
imgrec = mx.recordio.MXIndexedRecordIO(idx_file, rec_file, 'r') # pylint: disable=redefined-variable-type
seq = list(imgrec.keys)
for img_idx in seq:
if img_idx%100==0:
print(img_idx, stat)
s = imgrec.read_idx(img_idx)
header, img = mx.recordio.unpack(s)
try:
image = mx.image.imdecode(img).asnumpy()
except:
continue
age = int(header.label[0])
if age>=20:
continue
age_group = age//10
#if not age in stat:
stat[age_group] = 0
stat[age_group]+=1
label = [9999, age]
nheader = mx.recordio.IRHeader(0, label, widx, 0)
bgr = image[:,:,::-1]
s = mx.recordio.pack_img(nheader, bgr, quality=95, img_fmt='.jpg')
writer.write_idx(widx, s)
widx+=1

View File

@@ -124,7 +124,7 @@ class FaceImageIter(io.DataIter):
def mirror_aug(self, img):
_rd = random.randint(0,1)
if _rd==1:
for c in xrange(img.shape[2]):
for c in range(img.shape[2]):
img[:,:,c] = np.fliplr(img[:,:,c])
return img

View File

@@ -22,7 +22,7 @@ import face_preprocess
def do_flip(data):
for idx in xrange(data.shape[0]):
for idx in range(data.shape[0]):
data[idx,:,:] = np.fliplr(data[idx,:,:])
def get_model(ctx, image_size, model_str, layer):

View File

@@ -81,7 +81,7 @@ class MAEMetric(mx.metric.EvalMetric):
pred_age = np.zeros( label_age.shape, dtype=np.int)
#pred_age = np.zeros( label_age.shape, dtype=np.float32)
pred = preds[-1].asnumpy()
for i in xrange(AGE):
for i in range(AGE):
_pred = pred[:,2+i*2:4+i*2]
_pred = np.argmax(_pred, axis=1)
#pred = pred[:,1]
@@ -107,7 +107,7 @@ class CUMMetric(mx.metric.EvalMetric):
label_age = np.count_nonzero(label[:,1:], axis=1)
pred_age = np.zeros( label_age.shape, dtype=np.int)
pred = preds[-1].asnumpy()
for i in xrange(AGE):
for i in range(AGE):
_pred = pred[:,2+i*2:4+i*2]
_pred = np.argmax(_pred, axis=1)
#pred = pred[:,1]
@@ -184,7 +184,7 @@ def train_net(args):
ctx = []
cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
if len(cvd)>0:
for i in xrange(len(cvd.split(','))):
for i in range(len(cvd.split(','))):
ctx.append(mx.gpu(i))
if len(ctx)==0:
ctx = [mx.cpu()]

View File

@@ -1 +0,0 @@
Gluon interface, not totally working.

View File

@@ -1,271 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import random
import logging
import sys
import numbers
import math
import sklearn
import datetime
import numpy as np
import cv2
import mxnet as mx
from mxnet import ndarray as nd
from mxnet import io
from mxnet import recordio
sys.path.append(os.path.join(os.path.dirname(__file__), 'common'))
import face_preprocess
import multiprocessing
logger = logging.getLogger()
class FaceImageIter(io.DataIter):
def __init__(self, batch_size, data_shape,
path_imgrec = None, task = 'age',
shuffle=False, aug_list=None, mean = None,
rand_mirror = False, cutoff = 0,
data_name='data', label_name='softmax_label', **kwargs):
super(FaceImageIter, self).__init__()
assert path_imgrec
if path_imgrec:
logging.info('loading recordio %s...',
path_imgrec)
path_imgidx = path_imgrec[0:-4]+".idx"
self.imgrec = recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, 'r') # pylint: disable=redefined-variable-type
s = self.imgrec.read_idx(0)
header, _ = recordio.unpack(s)
self.imgidx = list(self.imgrec.keys)
if shuffle:
self.seq = self.imgidx
self.oseq = self.imgidx
print(len(self.seq))
else:
self.seq = None
self.mean = mean
self.nd_mean = None
if self.mean:
self.mean = np.array(self.mean, dtype=np.float32).reshape(1,1,3)
self.nd_mean = mx.nd.array(self.mean).reshape((1,1,3))
self.check_data_shape(data_shape)
self.provide_data = [(data_name, (batch_size,) + data_shape)]
self.batch_size = batch_size
self.data_shape = data_shape
self.shuffle = shuffle
self.image_size = '%d,%d'%(data_shape[1],data_shape[2])
self.rand_mirror = rand_mirror
print('rand_mirror', rand_mirror)
self.cutoff = cutoff
if task=='age':
self.provide_label = [(label_name, (batch_size,100))]
else:
self.provide_label = [(label_name, (batch_size,))]
#print(self.provide_label[0][1])
self.cur = 0
self.nbatch = 0
self.is_init = False
def reset(self):
"""Resets the iterator to the beginning of the data."""
print('call reset()')
self.cur = 0
if self.shuffle:
random.shuffle(self.seq)
if self.seq is None and self.imgrec is not None:
self.imgrec.reset()
def num_samples(self):
return len(self.seq)
def next_sample(self):
"""Helper function for reading in next sample."""
#set total batch size, for example, 1800, and maximum size for each people, for example 45
if self.seq is not None:
while True:
if self.cur >= len(self.seq):
raise StopIteration
idx = self.seq[self.cur]
self.cur += 1
if self.imgrec is not None:
s = self.imgrec.read_idx(idx)
header, img = recordio.unpack(s)
label = header.label
return label, img, None, None
else:
label, fname, bbox, landmark = self.imglist[idx]
return label, self.read_image(fname), bbox, landmark
else:
s = self.imgrec.read()
if s is None:
raise StopIteration
header, img = recordio.unpack(s)
return header.label, img, None, None
def brightness_aug(self, src, x):
alpha = 1.0 + random.uniform(-x, x)
src *= alpha
return src
def contrast_aug(self, src, x):
alpha = 1.0 + random.uniform(-x, x)
coef = np.array([[[0.299, 0.587, 0.114]]])
gray = src * coef
gray = (3.0 * (1.0 - alpha) / gray.size) * np.sum(gray)
src *= alpha
src += gray
return src
def saturation_aug(self, src, x):
alpha = 1.0 + random.uniform(-x, x)
coef = np.array([[[0.299, 0.587, 0.114]]])
gray = src * coef
gray = np.sum(gray, axis=2, keepdims=True)
gray *= (1.0 - alpha)
src *= alpha
src += gray
return src
def color_aug(self, img, x):
augs = [self.brightness_aug, self.contrast_aug, self.saturation_aug]
random.shuffle(augs)
for aug in augs:
#print(img.shape)
img = aug(img, x)
#print(img.shape)
return img
def mirror_aug(self, img):
_rd = random.randint(0,1)
if _rd==1:
for c in xrange(img.shape[2]):
img[:,:,c] = np.fliplr(img[:,:,c])
return img
def next(self):
if not self.is_init:
self.reset()
self.is_init = True
"""Returns the next batch of data."""
#print('in next', self.cur, self.labelcur)
self.nbatch+=1
batch_size = self.batch_size
c, h, w = self.data_shape
batch_data = nd.empty((batch_size, c, h, w))
if self.provide_label is not None:
batch_label = nd.empty(self.provide_label[0][1])
i = 0
try:
while i < batch_size:
label, s, bbox, landmark = self.next_sample()
#if label[1]>=0.0 or label[2]>=0.0:
# print(label[0:10])
_data = self.imdecode(s)
if self.rand_mirror:
_rd = random.randint(0,1)
if _rd==1:
_data = mx.ndarray.flip(data=_data, axis=1)
#_data = _data.astype('float32')
#_data -= 127.5
#_data *= 0.0078125
if self.cutoff>0:
centerh = random.randint(0, _data.shape[0]-1)
centerw = random.randint(0, _data.shape[1]-1)
half = self.cutoff//2
starth = max(0, centerh-half)
endh = min(_data.shape[0], centerh+half)
startw = max(0, centerw-half)
endw = min(_data.shape[1], centerw+half)
_data = _data.astype('float32')
#print(starth, endh, startw, endw, _data.shape)
_data[starth:endh, startw:endw, :] = 127.5
data = [_data]
try:
self.check_valid_image(data)
except RuntimeError as e:
logging.debug('Invalid image, skipping: %s', str(e))
continue
#print('aa',data[0].shape)
#data = self.augmentation_transform(data)
#print('bb',data[0].shape)
for datum in data:
assert i < batch_size, 'Batch size must be multiples of augmenter output length'
#print(datum.shape)
batch_data[i][:] = self.postprocess_data(datum)
batch_label[i][:] = label
i += 1
except StopIteration:
if i<batch_size:
raise StopIteration
return io.DataBatch([batch_data], [batch_label], batch_size - i)
def check_data_shape(self, data_shape):
"""Checks if the input data shape is valid"""
if not len(data_shape) == 3:
raise ValueError('data_shape should have length 3, with dimensions CxHxW')
if not data_shape[0] == 3:
raise ValueError('This iterator expects inputs to have 3 channels.')
def check_valid_image(self, data):
"""Checks if the input data is valid"""
if len(data[0].shape) == 0:
raise RuntimeError('Data shape is wrong')
def imdecode(self, s):
"""Decodes a string or byte string to an NDArray.
See mx.img.imdecode for more details."""
img = mx.image.imdecode(s) #mx.ndarray
return img
def read_image(self, fname):
"""Reads an input image `fname` and returns the decoded raw bytes.
Example usage:
----------
>>> dataIter.read_image('Face.jpg') # returns decoded raw bytes.
"""
with open(os.path.join(self.path_root, fname), 'rb') as fin:
img = fin.read()
return img
def augmentation_transform(self, data):
"""Transforms input data with specified augmentation."""
for aug in self.auglist:
data = [ret for src in data for ret in aug(src)]
return data
def postprocess_data(self, datum):
"""Final postprocessing step before image is loaded into the batch."""
return nd.transpose(datum, axes=(2, 0, 1))
class FaceImageIterList(io.DataIter):
def __init__(self, iter_list):
assert len(iter_list)>0
self.provide_data = iter_list[0].provide_data
self.provide_label = iter_list[0].provide_label
self.iter_list = iter_list
self.cur_iter = None
def reset(self):
self.cur_iter.reset()
def next(self):
self.cur_iter = random.choice(self.iter_list)
while True:
try:
ret = self.cur_iter.next()
except StopIteration:
self.cur_iter.reset()
continue
return ret

View File

@@ -1,195 +0,0 @@
import mxnet as mx
from mxnet import gluon
from mxnet import profiler
from mxnet.gluon import nn
from mxnet import ndarray as nd
import fresnet
class EmbeddingBlock(gluon.HybridBlock):
    """Head that turns a conv feature map into an embedding vector.

    mode 'D': BN -> relu -> global-avg-pool -> flatten -> Dense -> BN('fc1')
    mode 'E': BN(eps=2e-5) -> dropout(0.4) -> Dense -> BN('fc1', eps=2e-5)
    mode 'Z': BN(eps=2e-5) -> dropout(0.4) -> Dense (no final BN)
    other   : BN(eps=2e-5) -> relu -> global-avg-pool -> flatten (no Dense)
    """
    def __init__(self, emb_size = 512, mode='E', **kwargs):
        super(EmbeddingBlock, self).__init__(**kwargs)
        self.emb_size = emb_size
        print('mode', mode)
        with self.name_scope():
            self.body = nn.HybridSequential(prefix='')
            # Layers are created in list order, preserving gluon's
            # creation-order parameter naming.
            if mode == 'D':
                stages = [nn.BatchNorm(),
                          nn.Activation('relu'),
                          nn.GlobalAvgPool2D(),
                          nn.Flatten(),
                          nn.Dense(emb_size),
                          nn.BatchNorm(scale=False, prefix='fc1')]
            elif mode == 'E':
                stages = [nn.BatchNorm(epsilon=2e-5),
                          nn.Dropout(0.4),
                          nn.Dense(emb_size),
                          nn.BatchNorm(scale=False, epsilon=2e-5, prefix='fc1')]
            elif mode == 'Z':
                stages = [nn.BatchNorm(epsilon=2e-5),
                          nn.Dropout(0.4),
                          nn.Dense(emb_size)]
            else:
                stages = [nn.BatchNorm(epsilon=2e-5),
                          nn.Activation('relu'),
                          nn.GlobalAvgPool2D(),
                          nn.Flatten()]
            for stage in stages:
                self.body.add(stage)

    def hybrid_forward(self, F, x):
        """Map a backbone feature map to the embedding."""
        return self.body(x)
class ArcMarginBlock(gluon.HybridBlock):
    """Backbone + embedding + combined-margin classifier head.

    Implements the combined margin family cos(a*t + m) - b on scaled,
    L2-normalized logits (SphereFace / ArcFace / CosFace, selected by
    margin_a / margin_m / margin_b).  With margin_a == 0 the block degrades
    to a plain un-normalized fully-connected classifier.
    """
    def __init__(self, args, **kwargs):
        super(ArcMarginBlock, self).__init__(**kwargs)
        self.margin_s = args.margin_s   # feature scale s
        self.margin_m = args.margin_m   # additive angular margin m
        self.margin_a = args.margin_a   # angular multiplier a
        self.margin_b = args.margin_b   # additive cosine margin b
        self.num_classes = args.num_classes
        self.emb_size = args.emb_size
        with self.name_scope():
            # Classifier weight; passed into hybrid_forward by gluon.
            self.fc7_weight = self.params.get('fc7_weight', shape=(self.num_classes, self.emb_size))
        self.body = nn.HybridSequential(prefix='')
        feat = fresnet.get(args.num_layers,
                           version_unit=args.version_unit,
                           version_act=args.version_act)
        self.body.add(feat)
        self.body.add(EmbeddingBlock(args.emb_size, args.version_output, prefix=''))

    def feature(self, x):
        """Return the embedding only (no classifier logits)."""
        return self.body(x)

    def hybrid_forward(self, F, x, label, fc7_weight):
        feat = self.body(x)
        if self.margin_a == 0.0:
            # Plain softmax logits: no normalization, no margin.
            fc7 = F.FullyConnected(feat, fc7_weight, no_bias=True,
                                   num_hidden=self.num_classes, name='fc7')
            return fc7
        # Cosine logits: normalized embedding (scaled by s) x normalized weights.
        nx = F.L2Normalization(feat, mode='instance', name='fc1n')*self.margin_s
        w = F.L2Normalization(fc7_weight, mode='instance')
        fc7 = F.FullyConnected(nx, w, no_bias=True, num_hidden=self.num_classes, name='fc7')
        if self.margin_a != 1.0 or self.margin_m != 0.0 or self.margin_b != 0.0:
            if self.margin_a == 1.0 and self.margin_m == 0.0:
                # CosFace: subtract s*b from the target-class logit only.
                # Bug fix: this line previously read `s*self.margin_b` with
                # `s` undefined, raising NameError on this path.
                s_m = self.margin_s*self.margin_b
                gt_one_hot = F.one_hot(label, depth=self.num_classes, on_value=s_m, off_value=0.0)
                fc7 = fc7-gt_one_hot
            else:
                # General case: recover the target angle t, apply a*t + m,
                # re-cosine, subtract b, and patch the target logit.
                zy = F.pick(fc7, label, axis=1)
                cos_t = zy/self.margin_s
                t = F.arccos(cos_t)
                if self.margin_a != 1.0:
                    t = t*self.margin_a
                if self.margin_m > 0.0:
                    t = t+self.margin_m
                body = F.cos(t)
                if self.margin_b > 0.0:
                    body = body - self.margin_b
                new_zy = body*self.margin_s
                diff = new_zy - zy
                diff = F.expand_dims(diff, 1)
                gt_one_hot = F.one_hot(label, depth=self.num_classes, on_value=1.0, off_value=0.0)
                body = F.broadcast_mul(gt_one_hot, diff)
                fc7 = fc7+body
        return fc7
class DenseBlock(gluon.HybridBlock):
    """Backbone + embedding followed by a plain Dense classifier ('fc7')."""
    def __init__(self, args, **kwargs):
        super(DenseBlock, self).__init__(**kwargs)
        self.num_classes = args.num_classes
        self.emb_size = args.emb_size
        self.body = nn.HybridSequential(prefix='')
        backbone = fresnet.get(args.num_layers,
                               version_unit=args.version_unit,
                               version_act=args.version_act)
        self.body.add(backbone)
        self.body.add(EmbeddingBlock(args.emb_size, args.version_output, prefix=''))
        self.dense = nn.Dense(self.num_classes, prefix='fc7')

    def feature(self, x):
        """Return the embedding only (no classifier logits)."""
        return self.body(x)

    def hybrid_forward(self, F, x):
        return self.dense(self.body(x))
class ArcMarginTestBlock(gluon.Block):
    """Inference-only wrapper: backbone + embedding, no classifier head."""
    def __init__(self, args, **kwargs):
        super(ArcMarginTestBlock, self).__init__(**kwargs)
        self.body = nn.HybridSequential(prefix='')
        backbone = fresnet.get(args.num_layers,
                               version_unit=args.version_unit,
                               version_act=args.version_act)
        self.body.add(backbone)
        self.body.add(EmbeddingBlock(args.emb_size, args.version_output))

    def forward(self, x):
        return self.body(x)
class _GABlock(gluon.HybridBlock):
    """One head of GABlock: backbone + embedding + Dense(num_classes)."""
    def __init__(self, args, num_classes, **kwargs):
        super(_GABlock, self).__init__(**kwargs)
        with self.name_scope():
            self.body = nn.HybridSequential(prefix='')
            backbone = fresnet.get(args.num_layers,
                                   version_unit=args.version_unit,
                                   version_act=args.version_act)
            self.body.add(backbone)
            self.body.add(EmbeddingBlock(mode=args.version_output))
            self.body.add(nn.Dense(num_classes))

    def hybrid_forward(self, F, x):
        return self.body(x)
class GABlock(gluon.HybridBlock):
    """Two-headed classifier: gender (2-way) and age (200-way).

    hybrid_forward returns [concat(gender, age), gender, age].
    """
    def __init__(self, args, **kwargs):
        super(GABlock, self).__init__(**kwargs)
        with self.name_scope():
            self.bodyg = _GABlock(args, 2, prefix='gender_')
            self.bodya = _GABlock(args, 200, prefix='age_')

    def hybrid_forward(self, F, x):
        gender = self.bodyg(x)
        age = self.bodya(x)
        fused = F.concat(gender, age, dim=1, name='fc1')
        return [fused, gender, age]

View File

@@ -1,29 +0,0 @@
class EmbeddingBlock(HybridBlock):
    """Feature-map -> embedding head; layout selected by `mode`.

    'D' : BN -> relu -> global-avg-pool -> flatten -> Dense -> BN('fc1')
    'E' : BN -> dropout(0.4) -> Dense -> BN('fc1')
    else: BN -> relu -> global-avg-pool -> flatten (no Dense)
    """
    def __init__(self, emb_size = 512, mode='E', **kwargs):
        super(EmbeddingBlock, self).__init__(**kwargs)
        self.body = nn.HybridSequential(prefix='')
        # Layers are created in list order, preserving gluon's
        # creation-order parameter naming.
        if mode == 'D':
            stages = [nn.BatchNorm(),
                      nn.Activation('relu'),
                      nn.GlobalAvgPool2D(),
                      nn.Flatten(),
                      nn.Dense(emb_size),
                      nn.BatchNorm(scale=False, prefix='fc1')]
        elif mode == 'E':
            stages = [nn.BatchNorm(),
                      nn.Dropout(0.4),
                      nn.Dense(emb_size),
                      nn.BatchNorm(scale=False, prefix='fc1')]
        else:
            stages = [nn.BatchNorm(),
                      nn.Activation('relu'),
                      nn.GlobalAvgPool2D(),
                      nn.Flatten()]
        for stage in stages:
            self.body.add(stage)

    def hybrid_forward(self, F, x):
        return self.body(x)
class MarginBlock(HybridBlock):
def __init__(self, args, **kwargs):

View File

@@ -1,232 +0,0 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# coding: utf-8
# pylint: disable= arguments-differ
"""ResNets, implemented in Gluon."""
from __future__ import division
#__all__ = ['ResNetV1', 'ResNetV2',
# 'BasicBlockV1', 'BasicBlockV2',
# 'BottleneckV1', 'BottleneckV2',
# 'resnet18_v1', 'resnet34_v1', 'resnet50_v1', 'resnet101_v1', 'resnet152_v1',
# 'resnet18_v2', 'resnet34_v2', 'resnet50_v2', 'resnet101_v2', 'resnet152_v2',
# 'get_resnet']
import os
#from ....context import cpu
from mxnet import gluon
from mxnet import profiler
from mxnet.gluon import nn
from mxnet.gluon.block import HybridBlock
# Helpers
def _conv3x3(channels, stride, in_channels):
"""3x3 convolution with padding 1 and no bias (BN follows it in the blocks below)."""
return nn.Conv2D(channels, kernel_size=3, strides=stride, padding=1,
use_bias=False, in_channels=in_channels)
def _act(act_type):
"""Build an activation layer; PReLU needs its own block (it has learnable weights)."""
if act_type=='prelu':
return nn.PReLU()
else:
return nn.Activation(act_type)
# Blocks
class BasicBlockV1(HybridBlock):
r"""BasicBlock V1 from `"Deep Residual Learning for Image Recognition"
<http://arxiv.org/abs/1512.03385>`_ paper.
This is used for ResNet V1 for 18, 34 layers.
Parameters
----------
channels : int
Number of output channels.
stride : int
Stride size.
downsample : bool, default False
Whether to downsample the input.
in_channels : int, default 0
Number of input channels. Default is 0, to infer from the graph.
"""
def __init__(self, channels, stride, downsample=False, in_channels=0, act_type = 'relu', **kwargs):
super(BasicBlockV1, self).__init__(**kwargs)
self.act_type = act_type
# Residual branch: conv-BN-act-conv-BN.  Note the stride sits on the
# SECOND conv in this variant.
self.body = nn.HybridSequential(prefix='')
self.body.add(_conv3x3(channels, 1, in_channels))
self.body.add(nn.BatchNorm(epsilon=2e-5))
self.body.add(_act(act_type))
self.body.add(_conv3x3(channels, stride, channels))
self.body.add(nn.BatchNorm(epsilon=2e-5))
# PReLU for the post-add activation has parameters, so it must be
# created here rather than inline in hybrid_forward.
if self.act_type=='prelu':
self.prelu = nn.PReLU()
if downsample:
# 1x1 strided conv projection to match the residual's shape.
self.downsample = nn.HybridSequential(prefix='')
self.downsample.add(nn.Conv2D(channels, kernel_size=1, strides=stride,
use_bias=False, in_channels=in_channels))
self.downsample.add(nn.BatchNorm(epsilon=2e-5))
else:
self.downsample = None
def hybrid_forward(self, F, x):
residual = x
x = self.body(x)
if self.downsample:
residual = self.downsample(residual)
# Activation is applied AFTER the residual add (post-activation, V1).
if self.act_type=='prelu':
x = self.prelu(x+residual)
#x = F.LeakyReLU(residual+x, act_type = self.act_type)
else:
x = F.Activation(x+residual, act_type=self.act_type)
return x
class BasicBlockV2(HybridBlock):
    r"""BasicBlock V2 from
    `"Identity Mappings in Deep Residual Networks"
    <https://arxiv.org/abs/1603.05027>`_ paper.
    This is used for ResNet V2 for 18, 34 layers.
    Parameters
    ----------
    channels : int
        Number of output channels.
    stride : int
        Stride size.
    downsample : bool, default False
        Whether to downsample the input.
    in_channels : int, default 0
        Number of input channels. Default is 0, to infer from the graph.
    act_type : str, default 'relu'
        Activation type ('prelu' gets a parameterized layer via _act).
    """
    def __init__(self, channels, stride, downsample=False, in_channels=0, act_type='relu', **kwargs):
        # Bug fix: ResNet._make_layer passes act_type=..., but this class
        # previously did not accept it, so the kwarg fell through to
        # HybridBlock.__init__ and raised TypeError for version_unit=2.
        super(BasicBlockV2, self).__init__(**kwargs)
        # Pre-activation ordering: BN -> act -> conv, twice.
        self.bn1 = nn.BatchNorm()
        self.act1 = _act(act_type)
        self.conv1 = _conv3x3(channels, stride, in_channels)
        self.bn2 = nn.BatchNorm()
        self.act2 = _act(act_type)
        self.conv2 = _conv3x3(channels, 1, channels)
        if downsample:
            # 1x1 strided conv projection to match the residual's shape.
            self.downsample = nn.Conv2D(channels, 1, stride, use_bias=False,
                                        in_channels=in_channels)
        else:
            self.downsample = None

    def hybrid_forward(self, F, x):
        residual = x
        x = self.bn1(x)
        x = self.act1(x)
        if self.downsample:
            # V2 projects from the post-BN/act tensor, not the raw input.
            residual = self.downsample(x)
        x = self.conv1(x)
        x = self.bn2(x)
        x = self.act2(x)
        x = self.conv2(x)
        return x + residual
class ResNet(HybridBlock):
r"""ResNet V2 model from
`"Identity Mappings in Deep Residual Networks"
<https://arxiv.org/abs/1603.05027>`_ paper.
Parameters
----------
block : HybridBlock
Class for the residual block. Options are BasicBlockV1, BottleneckV1.
layers : list of int
Numbers of layers in each block
channels : list of int
Numbers of channels in each block. Length should be one larger than layers list.
classes : int, default 1000
Number of classification classes.
thumbnail : bool, default False
Enable thumbnail.
"""
def __init__(self, layers, channels, **kwargs):
# Pop the custom kwargs before forwarding the rest to HybridBlock.
version_unit = kwargs.get('version_unit', 1)
act_type = kwargs.get('version_act', 'prelu')
self.act_type = act_type
del kwargs['version_unit']
del kwargs['version_act']
super(ResNet, self).__init__(**kwargs)
assert len(layers) == len(channels) - 1
print(version_unit, act_type)
# NOTE(review): a version_unit outside {1, 2} leaves `block` unbound
# and _make_layer below raises NameError -- confirm callers.
if version_unit==1:
block = BasicBlockV1
elif version_unit==2:
block = BasicBlockV2
with self.name_scope():
self.features = nn.HybridSequential(prefix='')
#self.features.add(nn.BatchNorm(scale=False, center=False))
#self.features.add(nn.BatchNorm())
# Stem: 3x3 conv at full resolution, then BN + activation.
self.features.add(_conv3x3(channels[0], 1, 0))
self.features.add(nn.BatchNorm(epsilon=2e-5))
self.features.add(_act(act_type))
in_channels = channels[0]
for i, num_layer in enumerate(layers):
#stride = 1 if i == 0 else 2
# Every stage downsamples by 2, including the first.
stride = 2
self.features.add(self._make_layer(block, num_layer, channels[i+1],
stride, i+1, in_channels=in_channels))
in_channels = channels[i+1]
#self.features.add(nn.BatchNorm())
#self.features.add(nn.Activation('relu'))
#self.features.add(nn.GlobalAvgPool2D())
#self.features.add(nn.Flatten())
#self.output = nn.Dense(classes, in_units=in_channels)
def _make_layer(self, block, layers, channels, stride, stage_index, in_channels=0):
# One stage: a strided, projecting block followed by layers-1 plain ones.
layer = nn.HybridSequential(prefix='stage%d_'%stage_index)
with layer.name_scope():
layer.add(block(channels, stride, True, in_channels=in_channels, act_type = self.act_type,
prefix=''))
for _ in range(layers-1):
layer.add(block(channels, 1, False, in_channels=channels, act_type = self.act_type, prefix=''))
return layer
def hybrid_forward(self, F, x):
# Normalize raw [0, 255] pixels to roughly [-1, 1]: (x-127.5)/128.
x = x-127.5
x = x*0.0078125
x = self.features(x)
return x
# Specification
# num_layers -> (block label, units per stage, stage channel widths).
# NOTE(review): only basic blocks are implemented above; the 'bottle_neck'
# tag for 152 layers has no matching block class here -- confirm before
# requesting num_layers=152.
resnet_spec = {18: ('basic_block', [2, 2, 2, 2], [64, 64, 128, 256, 512]),
34: ('basic_block', [3, 4, 6, 3], [64, 64, 128, 256, 512]),
50: ('basic_block', [3, 4, 14, 3], [64, 64, 128, 256, 512]),
100: ('basic_block', [3, 13, 30, 3], [64, 64, 128, 256, 512]),
152: ('bottle_neck', [3, 8, 36, 3], [64, 256, 512, 1024, 2048])}
# Constructor
def get(num_layers, **kwargs):
    """Construct a ResNet; `num_layers` must be a key of `resnet_spec`.

    Extra keyword arguments (version_unit, version_act, ...) are forwarded
    to the ResNet constructor.
    """
    assert num_layers in resnet_spec, \
        "Invalid number of layers: %d. Options are %s"%(
            num_layers, str(resnet_spec.keys()))
    _, units, widths = resnet_spec[num_layers]
    return ResNet(units, widths, **kwargs)

View File

@@ -1,285 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import random
import logging
import sys
import numbers
import math
import sklearn
import datetime
import numpy as np
import cv2
import mxnet as mx
from mxnet import ndarray as nd
from mxnet import io
from mxnet import recordio
#sys.path.append(os.path.join(os.path.dirname(__file__), 'common'))
sys.path.append(os.path.join(os.path.dirname(os.path.dirname(__file__)), 'src', 'common'))
import face_preprocess
import multiprocessing
logger = logging.getLogger()
class FaceImageIter(io.DataIter):
    """Batch iterator over an indexed RecordIO face dataset.

    Yields DataBatch([data], [label]) with data of shape (batch, C, H, W)
    and a scalar identity label per image.  Optional augmentations: random
    horizontal mirror, and "cutoff" (a random square overwritten with the
    gray value 127.5).
    """
    def __init__(self, batch_size, data_shape,
                 path_imgrec = None,
                 shuffle=False, aug_list=None, mean = None,
                 rand_mirror = False, cutoff = 0,
                 data_name='data', label_name='softmax_label', **kwargs):
        super(FaceImageIter, self).__init__()
        assert path_imgrec
        if path_imgrec:
            logging.info('loading recordio %s...',
                         path_imgrec)
            path_imgidx = path_imgrec[0:-4]+".idx"
            self.imgrec = recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, 'r')  # pylint: disable=redefined-variable-type
            s = self.imgrec.read_idx(0)
            header, _ = recordio.unpack(s)
            if header.flag > 0:
                # Record 0 carries [image-count, last-identity-index]; the
                # trailing identity records each hold an (a, b) image range.
                print('header0 label', header.label)
                self.header0 = (int(header.label[0]), int(header.label[1]))
                # Bug fix: wrap in list() so random.shuffle() works under
                # Python 3, where range() is a lazy immutable sequence.
                self.imgidx = list(range(1, int(header.label[0])))
                self.id2range = {}
                self.seq_identity = range(int(header.label[0]), int(header.label[1]))
                for identity in self.seq_identity:
                    s = self.imgrec.read_idx(identity)
                    header, _ = recordio.unpack(s)
                    a, b = int(header.label[0]), int(header.label[1])
                    self.id2range[identity] = (a, b)
                print('id2range', len(self.id2range))
            else:
                self.imgidx = list(self.imgrec.keys)
            if shuffle:
                self.seq = self.imgidx
                self.oseq = self.imgidx
                print(len(self.seq))
            else:
                self.seq = None

        self.mean = mean
        self.nd_mean = None
        if self.mean:
            self.mean = np.array(self.mean, dtype=np.float32).reshape(1, 1, 3)
            self.nd_mean = mx.nd.array(self.mean).reshape((1, 1, 3))

        self.check_data_shape(data_shape)
        self.provide_data = [(data_name, (batch_size,) + data_shape)]
        self.batch_size = batch_size
        self.data_shape = data_shape
        self.shuffle = shuffle
        self.image_size = '%d,%d'%(data_shape[1], data_shape[2])
        self.rand_mirror = rand_mirror
        print('rand_mirror', rand_mirror)
        self.cutoff = cutoff
        self.provide_label = [(label_name, (batch_size,))]
        self.cur = 0
        self.nbatch = 0
        self.is_init = False

    def reset(self):
        """Resets the iterator to the beginning of the data."""
        print('call reset()')
        self.cur = 0
        if self.shuffle:
            random.shuffle(self.seq)
        if self.seq is None and self.imgrec is not None:
            self.imgrec.reset()

    def num_samples(self):
        return len(self.seq)

    def next_sample(self):
        """Return the next (label, raw image bytes, bbox, landmark) tuple."""
        if self.seq is not None:
            while True:
                if self.cur >= len(self.seq):
                    raise StopIteration
                idx = self.seq[self.cur]
                self.cur += 1
                if self.imgrec is not None:
                    s = self.imgrec.read_idx(idx)
                    header, img = recordio.unpack(s)
                    label = header.label
                    if not isinstance(label, numbers.Number):
                        label = label[0]
                    return label, img, None, None
                else:
                    label, fname, bbox, landmark = self.imglist[idx]
                    return label, self.read_image(fname), bbox, landmark
        else:
            # Sequential mode: stream records in file order.
            s = self.imgrec.read()
            if s is None:
                raise StopIteration
            header, img = recordio.unpack(s)
            return header.label, img, None, None

    def brightness_aug(self, src, x):
        """Scale pixel values in-place by a random factor in [1-x, 1+x]."""
        alpha = 1.0 + random.uniform(-x, x)
        src *= alpha
        return src

    def contrast_aug(self, src, x):
        """Blend toward the image's mean gray level; strength in [1-x, 1+x]."""
        alpha = 1.0 + random.uniform(-x, x)
        coef = np.array([[[0.299, 0.587, 0.114]]])
        gray = src * coef
        gray = (3.0 * (1.0 - alpha) / gray.size) * np.sum(gray)
        src *= alpha
        src += gray
        return src

    def saturation_aug(self, src, x):
        """Blend each pixel toward its own luminance; strength in [1-x, 1+x]."""
        alpha = 1.0 + random.uniform(-x, x)
        coef = np.array([[[0.299, 0.587, 0.114]]])
        gray = src * coef
        gray = np.sum(gray, axis=2, keepdims=True)
        gray *= (1.0 - alpha)
        src *= alpha
        src += gray
        return src

    def color_aug(self, img, x):
        """Apply brightness/contrast/saturation jitter in random order."""
        augs = [self.brightness_aug, self.contrast_aug, self.saturation_aug]
        random.shuffle(augs)
        for aug in augs:
            img = aug(img, x)
        return img

    def mirror_aug(self, img):
        """Horizontally flip `img` channel-by-channel with probability 0.5."""
        _rd = random.randint(0, 1)
        if _rd == 1:
            # Bug fix: range() -- Python-2-only xrange is a NameError on py3.
            for c in range(img.shape[2]):
                img[:, :, c] = np.fliplr(img[:, :, c])
        return img

    def next(self):
        """Returns the next batch of data."""
        if not self.is_init:
            self.reset()
            self.is_init = True
        self.nbatch += 1
        batch_size = self.batch_size
        c, h, w = self.data_shape
        batch_data = nd.empty((batch_size, c, h, w))
        if self.provide_label is not None:
            batch_label = nd.empty(self.provide_label[0][1])
        i = 0
        try:
            while i < batch_size:
                label, s, bbox, landmark = self.next_sample()
                _data = self.imdecode(s)
                if self.rand_mirror:
                    _rd = random.randint(0, 1)
                    if _rd == 1:
                        _data = mx.ndarray.flip(data=_data, axis=1)
                if self.nd_mean is not None:
                    # Normalize raw pixels to roughly [-1, 1].
                    _data = _data.astype('float32')
                    _data -= 127.5
                    _data *= 0.0078125
                if self.cutoff > 0:
                    # Overwrite a random cutoff x cutoff square with gray.
                    centerh = random.randint(0, _data.shape[0]-1)
                    centerw = random.randint(0, _data.shape[1]-1)
                    half = self.cutoff//2
                    starth = max(0, centerh-half)
                    endh = min(_data.shape[0], centerh+half)
                    startw = max(0, centerw-half)
                    endw = min(_data.shape[1], centerw+half)
                    _data = _data.astype('float32')
                    _data[starth:endh, startw:endw, :] = 127.5
                data = [_data]
                try:
                    self.check_valid_image(data)
                except RuntimeError as e:
                    logging.debug('Invalid image, skipping: %s', str(e))
                    continue
                for datum in data:
                    assert i < batch_size, 'Batch size must be multiples of augmenter output length'
                    batch_data[i][:] = self.postprocess_data(datum)
                    batch_label[i][:] = label
                    i += 1
        except StopIteration:
            # A partial final batch is discarded.
            if i < batch_size:
                raise StopIteration
        # Third argument is the pad count (0 for a full batch).
        return io.DataBatch([batch_data], [batch_label], batch_size - i)

    def check_data_shape(self, data_shape):
        """Checks if the input data shape is valid"""
        if not len(data_shape) == 3:
            raise ValueError('data_shape should have length 3, with dimensions CxHxW')
        if not data_shape[0] == 3:
            raise ValueError('This iterator expects inputs to have 3 channels.')

    def check_valid_image(self, data):
        """Checks if the input data is valid"""
        if len(data[0].shape) == 0:
            raise RuntimeError('Data shape is wrong')

    def imdecode(self, s):
        """Decodes a string or byte string to an NDArray.
        See mx.img.imdecode for more details."""
        return mx.image.imdecode(s)

    def read_image(self, fname):
        """Read `fname` under self.path_root and return its raw encoded bytes."""
        with open(os.path.join(self.path_root, fname), 'rb') as fin:
            img = fin.read()
        return img

    def augmentation_transform(self, data):
        """Transforms input data with specified augmentation."""
        for aug in self.auglist:
            data = [ret for src in data for ret in aug(src)]
        return data

    def postprocess_data(self, datum):
        """Final postprocessing step (HWC -> CHW) before batching."""
        return nd.transpose(datum, axes=(2, 0, 1))
class FaceImageIterList(io.DataIter):
    """DataIter that draws each batch from a randomly chosen member iterator.

    All member iterators must share the same data/label layout; the
    provide_data/provide_label signatures are taken from the first one.
    """
    def __init__(self, iter_list):
        assert len(iter_list) > 0
        self.provide_data = iter_list[0].provide_data
        self.provide_label = iter_list[0].provide_label
        self.iter_list = iter_list
        self.cur_iter = None

    def reset(self):
        """Reset the most recently used member iterator.

        Bug fix: previously this dereferenced ``self.cur_iter`` even when
        ``next()`` had never been called, raising AttributeError on None.
        """
        if self.cur_iter is not None:
            self.cur_iter.reset()

    def next(self):
        """Pick a random member iterator and return its next batch,
        resetting and retrying any member that is exhausted."""
        self.cur_iter = random.choice(self.iter_list)
        while True:
            try:
                ret = self.cur_iter.next()
            except StopIteration:
                self.cur_iter.reset()
                continue
            return ret

View File

@@ -1,747 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
import math
import random
import logging
import time
import pickle
import numpy as np
import sklearn
from image_iter import FaceImageIter
from age_iter import FaceImageIter as FaceImageIterAge
#from image_iter import FaceImageIterList
import mxnet as mx
from mxnet import gluon
from mxnet import profiler
from mxnet.gluon import nn
from mxnet import ndarray as nd
from mxnet import autograd as ag
from mxnet.test_utils import get_mnist_iterator
from mxnet.metric import Accuracy, TopKAccuracy, CompositeEvalMetric
import argparse
import mxnet.optimizer as optimizer
#sys.path.append(os.path.join(os.path.dirname(os.path.dirname(__file__)), 'src', 'eval'))
import verification
#sys.path.append(os.path.join(os.path.dirname(__file__), 'common'))
sys.path.append(os.path.join(os.path.dirname(os.path.dirname(__file__)), 'src', 'common'))
import face_image
#sys.path.append(os.path.join(os.path.dirname(__file__), 'eval'))
sys.path.append(os.path.join(os.path.dirname(__file__), 'blocks'))
import fresnet
from UDD import *
#import finception_resnet_v2
#import fmobilenet
#import fmobilenetv2
#import fmobilefacenet
#import fxception
#import fdensenet
#import fdpn
#import fnasnet
#import spherenet
#sys.path.append(os.path.join(os.path.dirname(__file__), 'losses'))
#import center_loss
# Root logger for training progress output.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# Number of age bins; the age metrics below read predictions as AGE
# consecutive 2-way softmax slices.
AGE = 100
# Global CLI namespace, populated by parse_args().
args = None
# Classification accuracy: argmax of raw scores vs integer labels.
class AccMetric(mx.metric.EvalMetric):
def __init__(self):
self.axis = 1
super(AccMetric, self).__init__(
'acc', axis=self.axis,
output_names=None, label_names=None)
self.losses = []
self.count = 0
def update(self, labels, preds):
self.count+=1
#preds = [preds[1]] #use softmax output
for label, pred_label in zip(labels, preds):
# Raw scores -> predicted class indices (skipped if already indices).
if pred_label.shape != label.shape:
pred_label = mx.ndarray.argmax(pred_label, axis=self.axis)
pred_label = pred_label.asnumpy().astype('int32').flatten()
label = label.asnumpy()
# 2-D labels: only the first column is compared -- presumably the
# class id, remaining columns auxiliary; confirm against the iterator.
if label.ndim==2:
label = label[:,0]
label = label.astype('int32').flatten()
assert label.shape==pred_label.shape
self.sum_metric += (pred_label.flat == label.flat).sum()
self.num_inst += len(pred_label.flat)
# Reports the scalar carried in the LAST prediction output as "the loss".
class LossValueMetric(mx.metric.EvalMetric):
def __init__(self):
self.axis = 1
super(LossValueMetric, self).__init__(
'lossvalue', axis=self.axis,
output_names=None, label_names=None)
self.losses = []
def update(self, labels, preds):
# preds[-1] is assumed to hold a 1-element loss array -- TODO confirm caller.
loss = preds[-1].asnumpy()[0]
self.sum_metric += loss
self.num_inst += 1.0
# NOTE(review): fetched but never used; forces a device sync on preds[-2].
gt_label = preds[-2].asnumpy()
#print(gt_label)
class MAEMetric(mx.metric.EvalMetric):
    """Mean absolute error for the binned age head.

    Labels are rows of AGE 0/1 flags whose non-zero count is the true age;
    predictions carry AGE consecutive 2-way score slices whose argmaxes are
    summed into the predicted age.
    """
    def __init__(self):
        self.axis = 1
        super(MAEMetric, self).__init__(
            'MAE', axis=self.axis,
            output_names=None, label_names=None)
        self.losses = []
        self.count = 0

    def update(self, labels, preds):
        self.count += 1
        label = labels[0].asnumpy()
        label_age = np.count_nonzero(label, axis=1)
        # Bug fixes: builtin int (np.int alias was removed in NumPy 1.24)
        # and range (Python-2-only xrange is a NameError on py3).
        pred_age = np.zeros(label_age.shape, dtype=int)
        pred = preds[0].asnumpy()
        for i in range(AGE):
            _pred = pred[:, i*2:(i*2+2)]
            pred_age += np.argmax(_pred, axis=1)
        mae = np.mean(np.abs(label_age - pred_age))
        self.sum_metric += mae
        self.num_inst += 1.0
class CUMMetric(mx.metric.EvalMetric):
    """Cumulative score: count of samples whose |age error| < n.

    Uses the same binned-age decoding as MAEMetric: the true age is the
    number of non-zero label flags, the predicted age is the sum of argmaxes
    over AGE consecutive 2-way score slices.
    """
    def __init__(self, n=5):
        self.axis = 1
        self.n = n   # error tolerance in years (exclusive)
        super(CUMMetric, self).__init__(
            'CUM_%d'%n, axis=self.axis,
            output_names=None, label_names=None)
        self.losses = []
        self.count = 0

    def update(self, labels, preds):
        self.count += 1
        label = labels[0].asnumpy()
        label_age = np.count_nonzero(label, axis=1)
        # Bug fixes: builtin int (np.int alias was removed in NumPy 1.24)
        # and range (Python-2-only xrange is a NameError on py3).
        pred_age = np.zeros(label_age.shape, dtype=int)
        pred = preds[0].asnumpy()
        for i in range(AGE):
            _pred = pred[:, i*2:(i*2+2)]
            pred_age += np.argmax(_pred, axis=1)
        diff = np.abs(label_age - pred_age)
        cum = np.sum(diff < self.n)
        self.sum_metric += cum
        self.num_inst += len(label_age)
def parse_args():
"""Parse command-line options into the module-level `args` and return them."""
global args
parser = argparse.ArgumentParser(description='Train face network')
# general
parser.add_argument('--data-dir', default='', help='training set directory')
parser.add_argument('--gender-data-dir', default='', help='training set directory')
parser.add_argument('--age-data-dir', default='', help='training set directory')
parser.add_argument('--prefix', default='../model/model', help='directory to save model.')
parser.add_argument('--pretrained', default='', help='pretrained model to load')
parser.add_argument('--ckpt', type=int, default=1, help='checkpoint saving option. 0: discard saving. 1: save when necessary. 2: always save')
parser.add_argument('--loss-type', type=int, default=4, help='loss type')
parser.add_argument('--verbose', type=int, default=2000, help='do verification testing and model saving every verbose batches')
parser.add_argument('--max-steps', type=int, default=0, help='max training batches')
parser.add_argument('--end-epoch', type=int, default=100000, help='training epoch size.')
# network architecture
parser.add_argument('--network', default='r50', help='specify network')
parser.add_argument('--version-output', type=str, default='E', help='network embedding output config')
parser.add_argument('--version-unit', type=int, default=1, help='resnet unit config')
parser.add_argument('--version-act', type=str, default='relu', help='network activation config')
# optimization
parser.add_argument('--lr', type=float, default=0.1, help='start learning rate')
parser.add_argument('--lr-steps', type=str, default='', help='steps of lr changing')
parser.add_argument('--wd', type=float, default=0.0005, help='weight decay')
parser.add_argument('--fc7-wd-mult', type=float, default=1.0, help='weight decay mult for fc7')
parser.add_argument('--bn-mom', type=float, default=0.9, help='bn mom')
parser.add_argument('--mom', type=float, default=0.9, help='momentum')
parser.add_argument('--emb-size', type=int, default=512, help='embedding length')
parser.add_argument('--per-batch-size', type=int, default=128, help='batch size in each context')
# margin-loss hyper-parameters (cos(a*t + m) - b, scaled by s)
parser.add_argument('--margin-m', type=float, default=0.5, help='margin for loss')
parser.add_argument('--margin-s', type=float, default=64.0, help='scale for feature')
parser.add_argument('--margin-a', type=float, default=1.0, help='')
parser.add_argument('--margin-b', type=float, default=0.0, help='')
# augmentation / evaluation / task selection
parser.add_argument('--rand-mirror', type=int, default=1, help='if do random mirror in training')
parser.add_argument('--cutoff', type=int, default=0, help='cut off aug')
parser.add_argument('--eval', type=str, default='lfw,cfp_fp,agedb_30', help='verification targets')
parser.add_argument('--task', type=str, default='', help='')
parser.add_argument('--mode', type=str, default='gluon', help='')
args = parser.parse_args()
return args
def get_model():
    """Instantiate the network selected by the global `args`.

    Empty task -> face-recognition model (margin head when margin_a > 0,
    plain dense classifier otherwise); any other task -> gender/age model.
    """
    if args.task != '':
        # AGE or GENDER training.
        return GABlock(args, prefix='')
    if args.margin_a > 0.0:
        return ArcMarginBlock(args, prefix='')
    return DenseBlock(args, prefix='')
#def get_symbol(args, arg_params, aux_params):
# data_shape = (args.image_channel,args.image_h,args.image_w)
# image_shape = ",".join([str(x) for x in data_shape])
# margin_symbols = []
# if args.network[0]=='d':
# embedding = fdensenet.get_symbol(args.emb_size, args.num_layers,
# version_se=args.version_se, version_input=args.version_input,
# version_output=args.version_output, version_unit=args.version_unit)
# elif args.network[0]=='m':
# print('init mobilenet', args.num_layers)
# if args.num_layers==1:
# embedding = fmobilenet.get_symbol(args.emb_size,
# version_se=args.version_se, version_input=args.version_input,
# version_output=args.version_output, version_unit=args.version_unit)
# else:
# embedding = fmobilenetv2.get_symbol(args.emb_size)
# elif args.network[0]=='i':
# print('init inception-resnet-v2', args.num_layers)
# embedding = finception_resnet_v2.get_symbol(args.emb_size,
# version_se=args.version_se, version_input=args.version_input,
# version_output=args.version_output, version_unit=args.version_unit)
# elif args.network[0]=='x':
# print('init xception', args.num_layers)
# embedding = fxception.get_symbol(args.emb_size,
# version_se=args.version_se, version_input=args.version_input,
# version_output=args.version_output, version_unit=args.version_unit)
# elif args.network[0]=='p':
# print('init dpn', args.num_layers)
# embedding = fdpn.get_symbol(args.emb_size, args.num_layers,
# version_se=args.version_se, version_input=args.version_input,
# version_output=args.version_output, version_unit=args.version_unit)
# elif args.network[0]=='n':
# print('init nasnet', args.num_layers)
# embedding = fnasnet.get_symbol(args.emb_size)
# elif args.network[0]=='s':
# print('init spherenet', args.num_layers)
# embedding = spherenet.get_symbol(args.emb_size, args.num_layers)
# elif args.network[0]=='y':
# print('init mobilefacenet', args.num_layers)
# embedding = fmobilefacenet.get_symbol(args.emb_size, bn_mom = args.bn_mom, wd_mult = args.fc7_wd_mult)
# else:
# print('init resnet', args.num_layers)
# embedding = fresnet.get_symbol(args.emb_size, args.num_layers,
# version_se=args.version_se, version_input=args.version_input,
# version_output=args.version_output, version_unit=args.version_unit,
# version_act=args.version_act)
# all_label = mx.symbol.Variable('softmax_label')
# gt_label = all_label
# extra_loss = None
# _weight = mx.symbol.Variable("fc7_weight", shape=(args.num_classes, args.emb_size), lr_mult=1.0, wd_mult=args.fc7_wd_mult)
# if args.loss_type==0: #softmax
# _bias = mx.symbol.Variable('fc7_bias', lr_mult=2.0, wd_mult=0.0)
# fc7 = mx.sym.FullyConnected(data=embedding, weight = _weight, bias = _bias, num_hidden=args.num_classes, name='fc7')
# elif args.loss_type==1: #sphere
# _weight = mx.symbol.L2Normalization(_weight, mode='instance')
# fc7 = mx.sym.LSoftmax(data=embedding, label=gt_label, num_hidden=args.num_classes,
# weight = _weight,
# beta=args.beta, margin=args.margin, scale=args.scale,
# beta_min=args.beta_min, verbose=1000, name='fc7')
# elif args.loss_type==2:
# s = args.margin_s
# m = args.margin_m
# assert(s>0.0)
# assert(m>0.0)
# _weight = mx.symbol.L2Normalization(_weight, mode='instance')
# nembedding = mx.symbol.L2Normalization(embedding, mode='instance', name='fc1n')*s
# fc7 = mx.sym.FullyConnected(data=nembedding, weight = _weight, no_bias = True, num_hidden=args.num_classes, name='fc7')
# s_m = s*m
# gt_one_hot = mx.sym.one_hot(gt_label, depth = args.num_classes, on_value = s_m, off_value = 0.0)
# fc7 = fc7-gt_one_hot
# elif args.loss_type==4:
# s = args.margin_s
# m = args.margin_m
# assert s>0.0
# assert m>=0.0
# assert m<(math.pi/2)
# _weight = mx.symbol.L2Normalization(_weight, mode='instance')
# nembedding = mx.symbol.L2Normalization(embedding, mode='instance', name='fc1n')*s
# fc7 = mx.sym.FullyConnected(data=nembedding, weight = _weight, no_bias = True, num_hidden=args.num_classes, name='fc7')
# zy = mx.sym.pick(fc7, gt_label, axis=1)
# cos_t = zy/s
# cos_m = math.cos(m)
# sin_m = math.sin(m)
# mm = math.sin(math.pi-m)*m
# #threshold = 0.0
# threshold = math.cos(math.pi-m)
# if args.easy_margin:
# cond = mx.symbol.Activation(data=cos_t, act_type='relu')
# else:
# cond_v = cos_t - threshold
# cond = mx.symbol.Activation(data=cond_v, act_type='relu')
# body = cos_t*cos_t
# body = 1.0-body
# sin_t = mx.sym.sqrt(body)
# new_zy = cos_t*cos_m
# b = sin_t*sin_m
# new_zy = new_zy - b
# new_zy = new_zy*s
# if args.easy_margin:
# zy_keep = zy
# else:
# zy_keep = zy - s*mm
# new_zy = mx.sym.where(cond, new_zy, zy_keep)
#
# diff = new_zy - zy
# diff = mx.sym.expand_dims(diff, 1)
# gt_one_hot = mx.sym.one_hot(gt_label, depth = args.num_classes, on_value = 1.0, off_value = 0.0)
# body = mx.sym.broadcast_mul(gt_one_hot, diff)
# fc7 = fc7+body
# elif args.loss_type==5:
# s = args.margin_s
# m = args.margin_m
# assert s>0.0
# _weight = mx.symbol.L2Normalization(_weight, mode='instance')
# nembedding = mx.symbol.L2Normalization(embedding, mode='instance', name='fc1n')*s
# fc7 = mx.sym.FullyConnected(data=nembedding, weight = _weight, no_bias = True, num_hidden=args.num_classes, name='fc7')
# if args.margin_a!=1.0 or args.margin_m!=0.0 or args.margin_b!=0.0:
# if args.margin_a==1.0 and args.margin_m==0.0:
# s_m = s*args.margin_b
# gt_one_hot = mx.sym.one_hot(gt_label, depth = args.num_classes, on_value = s_m, off_value = 0.0)
# fc7 = fc7-gt_one_hot
# else:
# zy = mx.sym.pick(fc7, gt_label, axis=1)
# cos_t = zy/s
# t = mx.sym.arccos(cos_t)
# if args.margin_a!=1.0:
# t = t*args.margin_a
# if args.margin_m>0.0:
# t = t+args.margin_m
# body = mx.sym.cos(t)
# if args.margin_b>0.0:
# body = body - args.margin_b
# new_zy = body*s
# diff = new_zy - zy
# diff = mx.sym.expand_dims(diff, 1)
# gt_one_hot = mx.sym.one_hot(gt_label, depth = args.num_classes, on_value = 1.0, off_value = 0.0)
# body = mx.sym.broadcast_mul(gt_one_hot, diff)
# fc7 = fc7+body
# out_list = [mx.symbol.BlockGrad(embedding)]
# softmax = mx.symbol.SoftmaxOutput(data=fc7, label = gt_label, name='softmax', normalization='valid')
# out_list.append(softmax)
# out = mx.symbol.Group(out_list)
# return (out, arg_params, aux_params)
#
def train_net(args):
    """Train a model for face recognition (task=='') or gender/age estimation.

    Wires up devices from CUDA_VISIBLE_DEVICES, loads dataset properties,
    builds the network via ``get_model()`` and trains either through the
    symbolic Module API (``args.mode != 'gluon'``) or a hand-written Gluon
    loop.  Learning-rate decay, periodic evaluation and checkpointing all
    happen inside the nested ``_batch_callback``.
    """
    # ---- device setup: one mx.gpu per entry in CUDA_VISIBLE_DEVICES ----
    # NOTE(review): os.environ['CUDA_VISIBLE_DEVICES'] raises KeyError when the
    # variable is unset — confirm callers always export it.
    ctx = []
    cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
    if len(cvd)>0:
        for i in xrange(len(cvd.split(','))):
            ctx.append(mx.gpu(i))
    if len(ctx)==0:
        ctx = [mx.cpu()]
        print('use cpu')
    else:
        print('gpu num:', len(ctx))
    prefix = args.prefix
    prefix_dir = os.path.dirname(prefix)
    if not os.path.exists(prefix_dir):
        os.makedirs(prefix_dir)
    end_epoch = args.end_epoch
    args.ctx_num = len(ctx)
    # Network depth is encoded in the name, e.g. 'r100' -> 100 layers.
    args.num_layers = int(args.network[1:])
    print('num_layers', args.num_layers)
    if args.per_batch_size==0:
        args.per_batch_size = 128
    # Global batch size = per-device batch size * number of devices.
    args.batch_size = args.per_batch_size*args.ctx_num
    args.image_channel = 3

    # ---- dataset selection: each task may come from its own directory ----
    data_dir = args.data_dir
    if args.task=='gender':
        data_dir = args.gender_data_dir
    elif args.task=='age':
        data_dir = args.age_data_dir
    print('data dir', data_dir)
    path_imgrec = None
    path_imglist = None
    prop = face_image.load_property(data_dir)
    args.num_classes = prop.num_classes
    image_size = prop.image_size
    args.image_h = image_size[0]
    args.image_w = image_size[1]
    print('image_size', image_size)
    assert(args.num_classes>0)
    print('num_classes', args.num_classes)
    path_imgrec = os.path.join(data_dir, "train.rec")

    print('Called with argument:', args)
    data_shape = (args.image_channel,image_size[0],image_size[1])
    mean = None

    begin_epoch = 0
    net = get_model()
    #if args.task=='':
    #  test_net = get_model_test(net)
    #print(net.__class__)
    #net = net0[0]
    # Initializer choice follows the network-family naming convention
    # (first letter of args.network).
    if args.network[0]=='r' or args.network[0]=='y':
        initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="out", magnitude=2) #resnet style
    elif args.network[0]=='i' or args.network[0]=='x':
        initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="in", magnitude=2) #inception
    else:
        initializer = mx.init.Xavier(rnd_type='uniform', factor_type="in", magnitude=2)
    net.hybridize()
    if args.mode=='gluon':
        # Optionally warm-start from a pretrained parameter file, then
        # initialize any remaining parameters and move everything to ctx.
        if len(args.pretrained)==0:
            pass
        else:
            net.load_params(args.pretrained, allow_missing=True, ignore_extra = True)
        net.initialize(initializer)
        net.collect_params().reset_ctx(ctx)

    val_iter = None
    if args.task=='':
        train_iter = FaceImageIter(
            batch_size = args.batch_size,
            data_shape = data_shape,
            path_imgrec = path_imgrec,
            shuffle = True,
            rand_mirror = args.rand_mirror,
            mean = mean,
            cutoff = args.cutoff,
        )
    else:
        train_iter = FaceImageIterAge(
            batch_size = args.batch_size,
            data_shape = data_shape,
            path_imgrec = path_imgrec,
            task = args.task,
            shuffle = True,
            rand_mirror = args.rand_mirror,
            mean = mean,
            cutoff = args.cutoff,
        )

    if args.task=='age':
        metric = CompositeEvalMetric([MAEMetric(), CUMMetric()])
    elif args.task=='gender':
        metric = CompositeEvalMetric([AccMetric()])
    else:
        metric = CompositeEvalMetric([AccMetric()])

    # Verification benchmark sets (lfw etc.) are only used for the
    # recognition task.
    ver_list = []
    ver_name_list = []
    if args.task=='':
        for name in args.eval.split(','):
            path = os.path.join(data_dir,name+".bin")
            if os.path.exists(path):
                data_set = verification.load_bin(path, image_size)
                ver_list.append(data_set)
                ver_name_list.append(name)
                print('ver', name)

    def ver_test(nbatch):
        # Run every loaded verification benchmark; returns the flip accuracies.
        results = []
        for i in xrange(len(ver_list)):
            acc1, std1, acc2, std2, xnorm, embeddings_list = verification.test(ver_list[i], net, ctx, batch_size = args.batch_size)
            print('[%s][%d]XNorm: %f' % (ver_name_list[i], nbatch, xnorm))
            #print('[%s][%d]Accuracy: %1.5f+-%1.5f' % (ver_name_list[i], nbatch, acc1, std1))
            print('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' % (ver_name_list[i], nbatch, acc2, std2))
            results.append(acc2)
        return results

    def val_test(nbatch=0):
        # Validation for the age/gender tasks on their respective val.rec sets.
        # Returns the metric value matching args.task (CUM for age, Acc for gender).
        acc = 0.0
        #if args.task=='age':
        if len(args.age_data_dir)>0:
            val_iter = FaceImageIterAge(
                batch_size = args.batch_size,
                data_shape = data_shape,
                path_imgrec = os.path.join(args.age_data_dir, 'val.rec'),
                task = args.task,
                shuffle = False,
                rand_mirror = False,
                mean = mean,
            )
            _metric = MAEMetric()
            val_metric = mx.metric.create(_metric)
            val_metric.reset()
            _metric2 = CUMMetric()
            val_metric2 = mx.metric.create(_metric2)
            val_metric2.reset()
            val_iter.reset()
            for batch in val_iter:
                data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0)
                label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0)
                outputs = []
                for x in data:
                    # net(x)[2] is the output head evaluated with the age metrics.
                    outputs.append(net(x)[2])
                val_metric.update(label, outputs)
                val_metric2.update(label, outputs)
            _value = val_metric.get_name_value()[0][1]
            print('[%d][VMAE]: %f'%(nbatch, _value))
            _value = val_metric2.get_name_value()[0][1]
            if args.task=='age':
                acc = _value
            print('[%d][VCUM]: %f'%(nbatch, _value))
        if len(args.gender_data_dir)>0:
            val_iter = FaceImageIterAge(
                batch_size = args.batch_size,
                data_shape = data_shape,
                path_imgrec = os.path.join(args.gender_data_dir, 'val.rec'),
                task = args.task,
                shuffle = False,
                rand_mirror = False,
                mean = mean,
            )
            _metric = AccMetric()
            val_metric = mx.metric.create(_metric)
            val_metric.reset()
            val_iter.reset()
            for batch in val_iter:
                data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0)
                label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0)
                outputs = []
                for x in data:
                    # net(x)[1] is the output head evaluated with the gender accuracy metric.
                    outputs.append(net(x)[1])
                val_metric.update(label, outputs)
            _value = val_metric.get_name_value()[0][1]
            if args.task=='gender':
                acc = _value
            print('[%d][VACC]: %f'%(nbatch, _value))
        return acc

    total_time = 0
    num_epochs = 0
    best_acc = [0]
    highest_acc = [0.0, 0.0]  #lfw and target
    # Mutable one-element lists so the nested callbacks can update them.
    global_step = [0]
    save_step = [0]
    # Default LR schedule is expressed for an effective batch size of 512 and
    # rescaled to the actual global batch size.
    if len(args.lr_steps)==0:
        lr_steps = [100000, 140000, 160000]
        p = 512.0/args.batch_size
        for l in xrange(len(lr_steps)):
            lr_steps[l] = int(lr_steps[l]*p)
    else:
        lr_steps = [int(x) for x in args.lr_steps.split(',')]
    print('lr_steps', lr_steps)

    kv = mx.kv.create('device')
    #kv = mx.kv.create('local')
    #_rescale = 1.0/args.ctx_num
    #opt = optimizer.SGD(learning_rate=args.lr, momentum=args.mom, wd=args.wd, rescale_grad=_rescale)
    #opt = optimizer.SGD(learning_rate=args.lr, momentum=args.mom, wd=args.wd)
    if args.mode=='gluon':
        trainer = gluon.Trainer(net.collect_params(), 'sgd',
                {'learning_rate': args.lr, 'wd': args.wd, 'momentum': args.mom, 'multi_precision': True},
                kvstore=kv)
    else:
        _rescale = 1.0/args.ctx_num
        opt = optimizer.SGD(learning_rate=args.lr, momentum=args.mom, wd=args.wd, rescale_grad=_rescale)
    _cb = mx.callback.Speedometer(args.batch_size, 20)

    # ---- symbolic graph, used by the Module training path ----
    arg_params = None
    aux_params = None
    data = mx.sym.var('data')
    label = mx.sym.var('softmax_label')
    if args.margin_a>0.0:
        # Margin-based heads need the label at forward time.
        fc7 = net(data, label)
    else:
        fc7 = net(data)
    #sym = mx.symbol.SoftmaxOutput(data=fc7, label = label, name='softmax', normalization='valid')
    ceop = gluon.loss.SoftmaxCrossEntropyLoss()
    loss = ceop(fc7, label)
    #loss = loss/args.per_batch_size
    loss = mx.sym.mean(loss)
    sym = mx.sym.Group( [mx.symbol.BlockGrad(fc7), mx.symbol.MakeLoss(loss, name='softmax')] )

    def _batch_callback():
        # Per-batch bookkeeping: LR decay, periodic evaluation, checkpointing,
        # and a hard stop once args.max_steps is exceeded.
        mbatch = global_step[0]
        global_step[0]+=1
        for _lr in lr_steps:
            if mbatch==_lr:
                args.lr *= 0.1
                if args.mode=='gluon':
                    trainer.set_learning_rate(args.lr)
                else:
                    opt.lr = args.lr
                print('lr change to', args.lr)
                break

        #_cb(param)
        if mbatch%1000==0:
            print('lr-batch-epoch:',args.lr, mbatch)

        if mbatch>0 and mbatch%args.verbose==0:
            save_step[0]+=1
            msave = save_step[0]
            do_save = False
            is_highest = False
            if args.task=='age' or args.task=='gender':
                acc = val_test(mbatch)
                if acc>=highest_acc[-1]:
                    highest_acc[-1] = acc
                    is_highest = True
                    do_save = True
            else:
                acc_list = ver_test(mbatch)
                if len(acc_list)>0:
                    # acc_list[0] is the first benchmark (conventionally lfw);
                    # acc_list[-1] is the target benchmark used for "best" tracking.
                    lfw_score = acc_list[0]
                    if lfw_score>highest_acc[0]:
                        highest_acc[0] = lfw_score
                        if lfw_score>=0.998:
                            do_save = True
                    if acc_list[-1]>=highest_acc[-1]:
                        highest_acc[-1] = acc_list[-1]
                        if lfw_score>=0.99:
                            do_save = True
                            is_highest = True
            # args.ckpt: 0 = never save, 1 = save on new best only, >1 = always save.
            if args.ckpt==0:
                do_save = False
            elif args.ckpt>1:
                do_save = True
            if do_save:
                print('saving', msave)
                #print('saving gluon params')
                fname = os.path.join(args.prefix, 'model-gluon.params')
                net.save_params(fname)
                fname = os.path.join(args.prefix, 'model')
                net.export(fname, msave)
                #arg, aux = model.get_params()
                #mx.model.save_checkpoint(prefix, msave, model.symbol, arg, aux)
            print('[%d]Accuracy-Highest: %1.5f'%(mbatch, highest_acc[-1]))
        if args.max_steps>0 and mbatch>args.max_steps:
            sys.exit(0)

    def _batch_callback_sym(param):
        # Module-API wrapper: run the Speedometer, then the shared callback.
        _cb(param)
        _batch_callback()

    if args.mode!='gluon':
        # ---- symbolic (Module) training path ----
        model = mx.mod.Module(
            context = ctx,
            symbol = sym,
        )
        model.fit(train_iter,
            begin_epoch = 0,
            num_epoch = args.end_epoch,
            eval_data = None,
            eval_metric = metric,
            kvstore = 'device',
            optimizer = opt,
            initializer = initializer,
            arg_params = arg_params,
            aux_params = aux_params,
            allow_missing = True,
            batch_end_callback = _batch_callback_sym,
            epoch_end_callback = None )
    else:
        # ---- Gluon training loop ----
        loss_weight = 1.0
        if args.task=='age':
            loss_weight = 1.0/AGE
        #loss = gluon.loss.SoftmaxCrossEntropyLoss(weight = loss_weight)
        loss = nd.SoftmaxOutput
        #loss = gluon.loss.SoftmaxCrossEntropyLoss()
        # Runs until _batch_callback calls sys.exit (args.max_steps).
        while True:
            #trainer = update_learning_rate(opt.lr, trainer, epoch, opt.lr_factor, lr_steps)
            tic = time.time()
            train_iter.reset()
            metric.reset()
            btic = time.time()
            for i, batch in enumerate(train_iter):
                _batch_callback()
                #data = gluon.utils.split_and_load(batch.data[0].astype(opt.dtype), ctx_list=ctx, batch_axis=0)
                #label = gluon.utils.split_and_load(batch.label[0].astype(opt.dtype), ctx_list=ctx, batch_axis=0)
                data = gluon.utils.split_and_load(batch.data[0], ctx_list=ctx, batch_axis=0)
                label = gluon.utils.split_and_load(batch.label[0], ctx_list=ctx, batch_axis=0)
                outputs = []
                Ls = []
                with ag.record():
                    for x, y in zip(data, label):
                        #print(y.asnumpy())
                        if args.task=='':
                            if args.margin_a>0.0:
                                z = net(x,y)
                            else:
                                z = net(x)
                            #print(z[0].shape, z[1].shape)
                        else:
                            z = net(x)
                        if args.task=='gender':
                            L = loss(z[1], y)
                            #L = L/args.per_batch_size
                            Ls.append(L)
                            outputs.append(z[1])
                        elif args.task=='age':
                            # Age head is trained as AGE independent binary
                            # classifiers: slice out 2 logits per age unit.
                            for k in xrange(AGE):
                                _z = nd.slice_axis(z[2], axis=1, begin=k*2, end=k*2+2)
                                _y = nd.slice_axis(y, axis=1, begin=k, end=k+1)
                                _y = nd.flatten(_y)
                                L = loss(_z, _y)
                                #L = L/args.per_batch_size
                                #L /= AGE
                                Ls.append(L)
                            outputs.append(z[2])
                        else:
                            L = loss(z, y)
                            #L = L/args.per_batch_size
                            Ls.append(L)
                            outputs.append(z)
                    # store the loss and do backward after we have done forward
                    # on all GPUs for better speed on multiple GPUs.
                    ag.backward(Ls)
                #trainer.step(batch.data[0].shape[0], ignore_stale_grad=True)
                #trainer.step(args.ctx_num)
                n = batch.data[0].shape[0]
                #print(n,n)
                trainer.step(n)
                metric.update(label, outputs)
                if i>0 and i%20==0:
                    name, acc = metric.get()
                    if len(name)==2:
                        logger.info('Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f, %s=%f'%(
                            num_epochs, i, args.batch_size/(time.time()-btic), name[0], acc[0], name[1], acc[1]))
                    else:
                        logger.info('Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f'%(
                            num_epochs, i, args.batch_size/(time.time()-btic), name[0], acc[0]))
                    #metric.reset()
                    btic = time.time()

            epoch_time = time.time()-tic

            # First epoch will usually be much slower than the subsequent epochs,
            # so don't factor it into the average
            if num_epochs > 0:
                total_time = total_time + epoch_time

            #name, acc = metric.get()
            #logger.info('[Epoch %d] training: %s=%f, %s=%f'%(num_epochs, name[0], acc[0], name[1], acc[1]))
            logger.info('[Epoch %d] time cost: %f'%(num_epochs, epoch_time))
            num_epochs = num_epochs + 1
            #name, val_acc = test(ctx, val_data)
            #logger.info('[Epoch %d] validation: %s=%f, %s=%f'%(epoch, name[0], val_acc[0], name[1], val_acc[1]))

            # save model if meet requirements
            #save_checkpoint(epoch, val_acc[0], best_acc)
            if num_epochs > 1:
                print('Average epoch time: {}'.format(float(total_time)/(num_epochs - 1)))
def main():
    """CLI entry point: parse arguments into the module-level ``args`` and train."""
    #time.sleep(3600*6.5)
    global args  # train_net and its nested callbacks read the shared module-level args
    args = parse_args()
    train_net(args)

if __name__ == '__main__':
    main()

View File

@@ -1,369 +0,0 @@
"""Helper for evaluation on the Labeled Faces in the Wild dataset
"""
# MIT License
#
# Copyright (c) 2016 David Sandberg
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import argparse
import sys
import numpy as np
from scipy import misc
from sklearn.model_selection import KFold
from scipy import interpolate
import sklearn
import cv2
import math
import datetime
import pickle
from sklearn.decomposition import PCA
import mxnet as mx
from mxnet import gluon
from mxnet import ndarray as nd
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))
import face_image
class LFold:
    """K-fold splitter that degenerates gracefully for a single fold.

    With ``n_splits > 1`` this is a thin wrapper around sklearn's KFold;
    with ``n_splits == 1`` it yields one split where the whole index set
    serves as both train and test.
    """

    def __init__(self, n_splits=2, shuffle=False):
        self.n_splits = n_splits
        if self.n_splits > 1:
            self.k_fold = KFold(n_splits=n_splits, shuffle=shuffle)

    def split(self, indices):
        # Single-fold case: train and test are the same index set.
        if self.n_splits <= 1:
            return [(indices, indices)]
        return self.k_fold.split(indices)
def calculate_roc(thresholds, embeddings1, embeddings2, actual_issame, nrof_folds=10, pca = 0):
    """Cross-validated ROC sweep over distance thresholds.

    For each fold, the threshold maximizing accuracy is picked on the train
    split and accuracy is reported on the test split; TPR/FPR are averaged
    across folds per threshold.  With ``pca > 0`` a PCA is fit per fold on the
    training embeddings before computing distances.
    Returns (mean tpr per threshold, mean fpr per threshold, per-fold accuracy).
    """
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = LFold(n_splits=nrof_folds, shuffle=False)

    tprs = np.zeros((nrof_folds,nrof_thresholds))
    fprs = np.zeros((nrof_folds,nrof_thresholds))
    accuracy = np.zeros((nrof_folds))
    indices = np.arange(nrof_pairs)
    #print('pca', pca)

    if pca==0:
        # No PCA: squared L2 distance is fold-independent, compute it once.
        diff = np.subtract(embeddings1, embeddings2)
        dist = np.sum(np.square(diff),1)

    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
        #print('train_set', train_set)
        #print('test_set', test_set)
        if pca>0:
            # Fit PCA on the fold's training embeddings only, then project and
            # re-normalize all embeddings before computing distances.
            print('doing pca on', fold_idx)
            embed1_train = embeddings1[train_set]
            embed2_train = embeddings2[train_set]
            _embed_train = np.concatenate( (embed1_train, embed2_train), axis=0 )
            #print(_embed_train.shape)
            pca_model = PCA(n_components=pca)
            pca_model.fit(_embed_train)
            embed1 = pca_model.transform(embeddings1)
            embed2 = pca_model.transform(embeddings2)
            embed1 = sklearn.preprocessing.normalize(embed1)
            embed2 = sklearn.preprocessing.normalize(embed2)
            #print(embed1.shape, embed2.shape)
            diff = np.subtract(embed1, embed2)
            dist = np.sum(np.square(diff),1)

        # Find the best threshold for the fold
        acc_train = np.zeros((nrof_thresholds))
        for threshold_idx, threshold in enumerate(thresholds):
            _, _, acc_train[threshold_idx] = calculate_accuracy(threshold, dist[train_set], actual_issame[train_set])
        best_threshold_index = np.argmax(acc_train)
        #print('threshold', thresholds[best_threshold_index])
        # Evaluate every threshold on the held-out split for the ROC curve...
        for threshold_idx, threshold in enumerate(thresholds):
            tprs[fold_idx,threshold_idx], fprs[fold_idx,threshold_idx], _ = calculate_accuracy(threshold, dist[test_set], actual_issame[test_set])
        # ...but report accuracy only at the train-selected best threshold.
        _, _, accuracy[fold_idx] = calculate_accuracy(thresholds[best_threshold_index], dist[test_set], actual_issame[test_set])

    tpr = np.mean(tprs,0)
    fpr = np.mean(fprs,0)
    return tpr, fpr, accuracy
def calculate_accuracy(threshold, dist, actual_issame):
    """Score a single distance threshold against ground-truth pair labels.

    A pair is predicted "same" when its distance is strictly below
    *threshold*.  Returns (tpr, fpr, accuracy); tpr/fpr fall back to 0 when
    their denominator is empty.
    """
    predicted = np.less(dist, threshold)
    not_predicted = np.logical_not(predicted)
    not_actual = np.logical_not(actual_issame)

    # Confusion-matrix counts.
    tp = np.sum(np.logical_and(predicted, actual_issame))
    fp = np.sum(np.logical_and(predicted, not_actual))
    tn = np.sum(np.logical_and(not_predicted, not_actual))
    fn = np.sum(np.logical_and(not_predicted, actual_issame))

    tpr = float(tp) / float(tp + fn) if (tp + fn) != 0 else 0
    fpr = float(fp) / float(fp + tn) if (fp + tn) != 0 else 0
    acc = float(tp + tn) / dist.size
    return tpr, fpr, acc
def calculate_val(thresholds, embeddings1, embeddings2, actual_issame, far_target, nrof_folds=10):
    """Cross-validated VAL at a fixed FAR target.

    Per fold, the threshold achieving ``far_target`` is interpolated from the
    FAR(threshold) curve on the training split, then VAL/FAR are measured on
    the test split.  Returns (mean VAL, std VAL, mean FAR) over the folds.
    """
    assert(embeddings1.shape[0] == embeddings2.shape[0])
    assert(embeddings1.shape[1] == embeddings2.shape[1])
    nrof_pairs = min(len(actual_issame), embeddings1.shape[0])
    nrof_thresholds = len(thresholds)
    k_fold = LFold(n_splits=nrof_folds, shuffle=False)

    val = np.zeros(nrof_folds)
    far = np.zeros(nrof_folds)

    # Squared L2 distance for every pair.
    diff = np.subtract(embeddings1, embeddings2)
    dist = np.sum(np.square(diff),1)
    indices = np.arange(nrof_pairs)

    for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)):
        # Find the threshold that gives FAR = far_target
        far_train = np.zeros(nrof_thresholds)
        for threshold_idx, threshold in enumerate(thresholds):
            _, far_train[threshold_idx] = calculate_val_far(threshold, dist[train_set], actual_issame[train_set])
        if np.max(far_train)>=far_target:
            # Invert the FAR(threshold) curve by piecewise-linear interpolation.
            f = interpolate.interp1d(far_train, thresholds, kind='slinear')
            threshold = f(far_target)
        else:
            # Target FAR is unreachable on this fold; fall back to threshold 0.
            threshold = 0.0
        val[fold_idx], far[fold_idx] = calculate_val_far(threshold, dist[test_set], actual_issame[test_set])

    val_mean = np.mean(val)
    far_mean = np.mean(far)
    val_std = np.std(val)
    return val_mean, val_std, far_mean
def calculate_val_far(threshold, dist, actual_issame):
    """Compute VAL and FAR at a given distance threshold.

    VAL (validation rate) is the fraction of genuine ("same") pairs accepted,
    FAR (false accept rate) the fraction of impostor pairs accepted; a pair
    is accepted when its distance is strictly below *threshold*.
    Returns (val, far) as floats.
    """
    predict_issame = np.less(dist, threshold)
    true_accept = np.sum(np.logical_and(predict_issame, actual_issame))
    false_accept = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame)))
    n_same = np.sum(actual_issame)
    n_diff = np.sum(np.logical_not(actual_issame))
    #print(true_accept, false_accept)
    #print(n_same, n_diff)
    # Fix: guard the divisions — the unguarded version raised
    # ZeroDivisionError whenever a fold contained no positive (or no
    # negative) pairs, e.g. for empty or single-class input.
    val = float(true_accept) / float(n_same) if n_same > 0 else 0.0
    far = float(false_accept) / float(n_diff) if n_diff > 0 else 0.0
    return val, far
def evaluate(embeddings, actual_issame, nrof_folds=10, pca = 0):
    """Run the full verification protocol on interleaved pair embeddings.

    ``embeddings`` holds both images of every pair interleaved: even rows are
    the first image, odd rows the second.  Returns
    (tpr, fpr, accuracy, val, val_std, far).
    """
    first = embeddings[0::2]
    second = embeddings[1::2]
    issame = np.asarray(actual_issame)
    # Coarse threshold sweep (step 0.01) for ROC and accuracy.
    tpr, fpr, accuracy = calculate_roc(np.arange(0, 4, 0.01), first, second,
                                       issame, nrof_folds=nrof_folds, pca=pca)
    # Finer sweep (step 0.001) so the FAR target can be interpolated precisely.
    val, val_std, far = calculate_val(np.arange(0, 4, 0.001), first, second,
                                      issame, 1e-3, nrof_folds=nrof_folds)
    return tpr, fpr, accuracy, val, val_std, far
def load_bin(path, image_size):
    """Load a packed verification set (``*.bin``) into NDArray tensors.

    The .bin file is a pickle of ``(bins, issame_list)`` where ``bins`` holds
    one encoded image per entry (two images per pair) and ``issame_list`` one
    bool per pair.  Returns ``(data_list, issame_list)`` where
    ``data_list[0]`` holds the decoded images in NCHW layout and
    ``data_list[1]`` their horizontal flips.
    """
    # Fix: close the pickle file deterministically (the original leaked the
    # handle) and use range() so the loader also runs on Python 3, where
    # xrange does not exist.
    with open(path, 'rb') as f:
        bins, issame_list = pickle.load(f)
    data_list = []
    for flip in [0, 1]:
        data = nd.empty((len(issame_list)*2, 3, image_size[0], image_size[1]))
        data_list.append(data)
    for i in range(len(issame_list)*2):
        _bin = bins[i]
        img = mx.image.imdecode(_bin)
        # HWC -> CHW for the network input layout.
        img = nd.transpose(img, axes=(2, 0, 1))
        for flip in [0, 1]:
            if flip == 1:
                # Mirror along the width axis for the flip-augmented copy.
                img = mx.ndarray.flip(data=img, axis=2)
            data_list[flip][i][:] = img
        if i % 1000 == 0:
            print('loading bin', i)
    print(data_list[0].shape)
    return (data_list, issame_list)
def test(data_set, net, ctx, batch_size, nfolds=10):
    """Embed a verification set with a Gluon net and score it.

    ``data_set`` is the (data_list, issame_list) pair from load_bin; the two
    entries of data_list are the original and horizontally flipped images.
    Returns (acc1, std1, acc2, std2, xnorm, embeddings_list).  acc1/std1 are
    left at 0 — the unflipped evaluation path is commented out below — so
    acc2/std2 carry the flip-augmented accuracy.
    """
    print('testing verification..')
    data_list = data_set[0]
    issame_list = data_set[1]
    embeddings_list = []
    time_consumed = 0.0
    for i in xrange( len(data_list) ):
        data = data_list[i]
        embeddings = None
        ba = 0
        while ba<data.shape[0]:
            bb = min(ba+batch_size, data.shape[0])
            count = bb-ba
            #print(ba, bb)
            # Always slice a full batch ending at bb: for the final partial
            # batch the first (batch_size-count) rows overlap the previous
            # batch and are discarded below when copying into `embeddings`.
            x = nd.slice_axis(data, axis=0, begin=bb-batch_size, end=bb)
            #print(_data.shape, _label.shape)
            time0 = datetime.datetime.now()
            #x = x.as_in_context(ctx[0])
            xs = gluon.utils.split_and_load(x, ctx_list=ctx, batch_axis=0)
            zs = []
            for x in xs:
                # Inference only: no autograd recording.
                with mx.autograd.predict_mode():
                    z = net.feature(x)
                zs.append(z)
            # Gather per-device outputs back to one numpy batch.
            zss = []
            for z in zs:
                zss.append(z.asnumpy())
            zss = np.concatenate(zss, axis=0)
            #print(zss.shape)
            _embeddings = zss
            #_arg, _aux = model.get_params()
            #__arg = {}
            #for k,v in _arg.iteritems():
            #  __arg[k] = v.as_in_context(_ctx)
            #_arg = __arg
            #_arg["data"] = _data.as_in_context(_ctx)
            #_arg["softmax_label"] = _label.as_in_context(_ctx)
            #for k,v in _arg.iteritems():
            #  print(k,v.context)
            #exe = sym.bind(_ctx, _arg ,args_grad=None, grad_req="null", aux_states=_aux)
            #exe.forward(is_train=False)
            #net_out = exe.outputs
            #_embeddings = z.asnumpy()
            time_now = datetime.datetime.now()
            diff = time_now - time0
            time_consumed+=diff.total_seconds()
            #print(_embeddings.shape)
            if embeddings is None:
                embeddings = np.zeros( (data.shape[0], _embeddings.shape[1]) )
            # Drop the overlapping prefix rows of a partial final batch.
            embeddings[ba:bb,:] = _embeddings[(batch_size-count):,:]
            ba = bb
        embeddings_list.append(embeddings)

    # Mean embedding L2 norm over all images (pre-normalization), reported
    # as a sanity statistic.
    _xnorm = 0.0
    _xnorm_cnt = 0
    for embed in embeddings_list:
        for i in xrange(embed.shape[0]):
            _em = embed[i]
            _norm=np.linalg.norm(_em)
            #print(_em.shape, _norm)
            _xnorm+=_norm
            _xnorm_cnt+=1
    _xnorm /= _xnorm_cnt

    embeddings = embeddings_list[0].copy()
    embeddings = sklearn.preprocessing.normalize(embeddings)
    acc1 = 0.0
    std1 = 0.0
    #_, _, accuracy, val, val_std, far = evaluate(embeddings, issame_list, nrof_folds=10)
    #acc1, std1 = np.mean(accuracy), np.std(accuracy)
    #print('Validation rate: %2.5f+-%2.5f @ FAR=%2.5f' % (val, val_std, far))
    #embeddings = np.concatenate(embeddings_list, axis=1)
    # Flip augmentation: sum original + flipped embeddings, then normalize.
    embeddings = embeddings_list[0] + embeddings_list[1]
    embeddings = sklearn.preprocessing.normalize(embeddings)
    print(embeddings.shape)
    print('infer time', time_consumed)
    _, _, accuracy, val, val_std, far = evaluate(embeddings, issame_list, nrof_folds=nfolds)
    acc2, std2 = np.mean(accuracy), np.std(accuracy)
    return acc1, std1, acc2, std2, _xnorm, embeddings_list
if __name__ == '__main__':
    # Standalone verification driver: load one or more Module checkpoints and
    # score them on the requested .bin benchmark sets.
    parser = argparse.ArgumentParser(description='do verification')
    # general
    parser.add_argument('--data-dir', default='', help='')
    parser.add_argument('--model', default='../model/softmax,50', help='path to load model.')
    parser.add_argument('--target', default='lfw,cfp_ff,cfp_fp,agedb_30', help='test targets.')
    parser.add_argument('--gpu', default=0, type=int, help='gpu id')
    parser.add_argument('--batch-size', default=32, type=int, help='')
    parser.add_argument('--max', default='', type=str, help='')
    parser.add_argument('--mode', default=0, type=int, help='')
    parser.add_argument('--nfolds', default=10, type=int, help='')
    args = parser.parse_args()

    prop = face_image.load_property(args.data_dir)
    image_size = prop.image_size
    print('image_size', image_size)
    ctx = mx.gpu(args.gpu)
    nets = []
    # --model is either "prefix" (scan the directory for all epochs) or
    # "prefix,e1|e2|..." (explicit epoch list).
    vec = args.model.split(',')
    prefix = args.model.split(',')[0]
    epochs = []
    if len(vec)==1:
        # No epoch given: collect every checkpoint epoch matching the prefix,
        # newest first, optionally windowed by --max "start,end".
        pdir = os.path.dirname(prefix)
        for fname in os.listdir(pdir):
            if not fname.endswith('.params'):
                continue
            _file = os.path.join(pdir, fname)
            if _file.startswith(prefix):
                epoch = int(fname.split('.')[0].split('-')[1])
                epochs.append(epoch)
        epochs = sorted(epochs, reverse=True)
        if len(args.max)>0:
            _max = [int(x) for x in args.max.split(',')]
            assert len(_max)==2
            if len(epochs)>_max[1]:
                epochs = epochs[_max[0]:_max[1]]
    else:
        epochs = [int(x) for x in vec[1].split('|')]
    print('model number', len(epochs))

    time0 = datetime.datetime.now()
    for epoch in epochs:
        print('loading',prefix, epoch)
        sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch)
        #arg_params, aux_params = ch_dev(arg_params, aux_params, ctx)
        # Truncate the graph at the embedding layer output.
        all_layers = sym.get_internals()
        sym = all_layers['fc1_output']
        model = mx.mod.Module(symbol=sym, context=ctx, label_names = None)
        #model.bind(data_shapes=[('data', (args.batch_size, 3, image_size[0], image_size[1]))], label_shapes=[('softmax_label', (args.batch_size,))])
        model.bind(data_shapes=[('data', (args.batch_size, 3, image_size[0], image_size[1]))])
        model.set_params(arg_params, aux_params)
        nets.append(model)
    time_now = datetime.datetime.now()
    diff = time_now - time0
    print('model loading time', diff.total_seconds())

    # Load every requested benchmark that actually exists on disk.
    ver_list = []
    ver_name_list = []
    for name in args.target.split(','):
        path = os.path.join(args.data_dir,name+".bin")
        if os.path.exists(path):
            print('loading.. ', name)
            data_set = load_bin(path, image_size)
            ver_list.append(data_set)
            ver_name_list.append(name)

    if args.mode==0:
        for i in xrange(len(ver_list)):
            results = []
            for model in nets:
                # NOTE(review): test() above is defined as
                # test(data_set, net, ctx, batch_size, nfolds) and embeds via
                # net.feature() on a Gluon net; this call passes args.batch_size
                # in the ctx position and a Module as net — confirm which
                # test() implementation this script is meant to call.
                acc1, std1, acc2, std2, xnorm, embeddings_list = test(ver_list[i], model, args.batch_size, args.nfolds)
                print('[%s]XNorm: %f' % (ver_name_list[i], xnorm))
                print('[%s]Accuracy: %1.5f+-%1.5f' % (ver_name_list[i], acc1, std1))
                print('[%s]Accuracy-Flip: %1.5f+-%1.5f' % (ver_name_list[i], acc2, std2))
                results.append(acc2)
            print('Max of [%s] is %1.5f' % (ver_name_list[i], np.max(results)))
    elif args.mode==1:
        # NOTE(review): test_badcase is not defined in this file — presumably
        # provided by a sibling module; verify before running with --mode 1.
        model = nets[0]
        test_badcase(ver_list[0], model, args.batch_size, args.target)
    else:
        # NOTE(review): dumpR is not defined in this file either — verify.
        model = nets[0]
        dumpR(ver_list[0], model, args.batch_size, args.target)

View File

@@ -1,6 +1,53 @@
[The Lightweight Face Recognition Challenge & Workshop](https://ibug.doc.ic.ac.uk/resources/lightweight-face-recognition-challenge-workshop/) will be held in conjunction with the International Conference on Computer Vision (ICCV) 2019, Seoul Korea.
[Test Server](http://39.104.128.76/overview)
Please strictly follow the rules. For example, please use the same [method](https://github.com/deepinsight/insightface/blob/master/common/flops_counter.py) for the FLOPs calculation regardless of whether your training framework is insightface or not.
[Test Server](http://www.insightface-challenge.com/overview)
**Sponsors:**
The Lightweight Face Recognition Challenge has been supported by
EPSRC project FACER2VM (EP/N007743/1)
Huawei (5000$)
DeepGlint (3000$)
iQIYI (3000$)
Kingsoft Cloud (3000$)
Pensees (3000$)
Dynamic funding pool: (17000$)
Cash sponsors and gift donations are welcome.
Contact:
insightface.challenge@gmail.com
**Discussion Group**
*For Chinese:*
![wechat](https://github.com/deepinsight/insightface/blob/master/resources/lfr19_wechat1.jpg)
*For English:*
(in #lfr2019 channel)
https://join.slack.com/t/insightface/shared_invite/enQtNjU0NDk2MjYyMTMzLTIzNDEwNmIxMjU5OGYzYzFhMjlkNjlhMTBkNWFiNjU4MTVhNTgzYjQ5ZTZiMGM3MzUyNzQ3OTBhZTg3MzM5M2I
**NEWS**
``2019.06.21`` We updated the groundtruth of Glint test dataset.
``2019.06.04`` We will clean the groundtruth of the deepglint testset.
``2019.05.21`` Baseline models and training logs available.
``2019.05.16`` The four tracks (deepglint-light, deepglint-large, iQIYI-light, iQIYI-large) will equally share the dynamic funding pool (14000$). From each track, the top 3 players will share the funding pool for 50%, 30% and 20% respectively.
==================
@@ -10,20 +57,21 @@
1. Download ms1m-retinaface from [baiducloud](https://pan.baidu.com/s/1rQxJ3drqm_071vpxBtp98A) or [dropbox](https://www.dropbox.com/s/ev5ezzcz79p2hge/ms1m-retinaface-t1.zip?dl=0) and unzip it to `$INSIGHTFACE_ROOT/datasets/`
2. Go into `$INSIGHTFACE_ROOT/recognition/`
3. Refer to the `retina` dataset config section in `sample_config.py` and copy it to your own`config.py`.
3. Refer to the `retina` dataset configuration section in `sample_config.py` and copy it as your own configuration file `config.py`.
4. Start training with `CUDA_VISIBLE_DEVICES='0,1,2,3' python -u train.py --dataset retina --network [your-network] --loss arcface`. It will output the accuracy of lfw, cfp_fp and agedb_30 every 2000 batches by default.
5. Storing the training dataset on an SSD will achieve better training efficiency.
------------------
**Testing:**
1. testdata-image from [baiducloud](https://pan.baidu.com/s/1UKUYsRfVTSzj1tfU3BVFrw) or [dropbox](https://www.dropbox.com/s/r5y6xt754m36rh8/iccv19-challenge-data-v1.zip?dl=0). These face images are all pre-processed and aligned so no need to do further modification.
2. To download testdata-video from iQIYI, please visit <http://challenge.ai.iqiyi.com/data-cluster>. You must download iQIYI-VID-FACE.z01, iQIYI-VID-FACE.z02 and iQIYI-VID-FACE.zip after signin. These face images are all pre-processed and aligned so no need to do further modification.
1. To unzip: ``zip iQIYI_VID_FACE.zip -s=0 --out iQIYI_VID_FACE_ALL.zip; unzip iQIYI_VID_FACE_ALL.zip``
2. We can get a directory named ``iQIYI_VID_FACE`` after decompression. Then we have to move ``video_filelist.txt`` in testdata-image package to ``iQIYI_VID_FACE/filelist.txt``, to indicate the order of videos in our submission feature file.
1. Download testdata-image from [baiducloud](https://pan.baidu.com/s/1UKUYsRfVTSzj1tfU3BVFrw) or [dropbox](https://www.dropbox.com/s/r5y6xt754m36rh8/iccv19-challenge-data-v1.zip?dl=0). These face images are all pre-processed and aligned.
2. To download testdata-video from iQIYI, please visit <http://challenge.ai.iqiyi.com/data-cluster>. You need to download iQIYI-VID-FACE.z01, iQIYI-VID-FACE.z02 and iQIYI-VID-FACE.zip after registration. These face frames are also pre-processed and aligned.
1. Unzip: ``zip iQIYI_VID_FACE.zip -s=0 --out iQIYI_VID_FACE_ALL.zip; unzip iQIYI_VID_FACE_ALL.zip``
2. We can get a directory named ``iQIYI_VID_FACE`` after decompression. Then, we have to move ``video_filelist.txt`` in testdata-image package to ``iQIYI_VID_FACE/filelist.txt``, to indicate the order of videos in our submission feature file.
3. To generate image feature submission file: check ``gen_image_feature.py``
4. To generate video feature submission file: check ``gen_video_feature.py``
5. Submit binary feature to the right section on test server.
5. Submit binary feature to the right track of the test server.
You can also check the verification performance during training time on LFW,CFP_FP,AgeDB_30 datasets.
@@ -35,10 +83,16 @@ Final ranking is determined by the TAR under 1:1 protocal only, for all valid su
For image testset, we evaluate the TAR under FAR@e-8 while we choose the TAR under FAR@e-4 for video testset.
For track-1, we will rank all players as following formula: ``TAR(glint-light)+TAR(iqiyi-light)``
------------------
For track-2, we will rank all players as following formula: ``TAR(glint-large)+TAR(iqiyi-large)``
**Baseline:**
1. Network y2(a deeper mobilefacenet): 933M FLOPs. TAR_image: 0.64691, TAR_video: 0.47191
2. Network r100fc(ResNet100FC-IR): 24G FLOPs. TAR_image: 0.80312, TAR_video: 0.64894
Baseline models download link: [baidu cloud](https://pan.baidu.com/s/1Em0ZFnefSoTsZoTd-9m8Nw) [dropbox](https://www.dropbox.com/s/yqaziktiv38ehrv/iccv19-baseline-models.zip?dl=0)
Training logs: [baidu cloud](https://pan.baidu.com/s/12rsp-oMzsjTeU6nugEvA9g) [dropbox](https://www.dropbox.com/s/4ufb9g7n76rfav5/iccv-baseline-log.zip?dl=0)
------------------
@@ -48,17 +102,10 @@ For track-2, we will rank all players as following formula: ``TAR(glint-large)+T
------------------
**Baseline:**
1. Network y2(a deeper mobilefacenet): 933M FLOPs. TAR_image: 0.64691, TAR_video: [TODO]
2. Network r100fc(ResNet100FC-IR): 24G FLOPs. TAR_image: 0.80312, TAR_video: [TODO]
------------------
**Candidate solutions:**
1. Use slightly deeper or wider mobile-level networks.
2. Try different training methods/losses than straightforward arcface.
1. Manually design or automatically search different networks/losses.
2. Use slightly deeper or wider mobile-level networks.
3. [OctConv](https://arxiv.org/abs/1904.05049), to reduce FLOPs.
4. [HRNet](https://arxiv.org/abs/1904.04514), for large FLOPs track.
and so on

View File

@@ -30,7 +30,7 @@ use_flip = True
def do_flip(data):
for idx in xrange(data.shape[0]):
for idx in range(data.shape[0]):
data[idx,:,:] = np.fliplr(data[idx,:,:])
def get_feature(buffer):
@@ -83,7 +83,7 @@ def main(args):
ctx = []
cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
if len(cvd)>0:
for i in xrange(len(cvd.split(','))):
for i in range(len(cvd.split(','))):
ctx.append(mx.gpu(i))
if len(ctx)==0:
ctx = [mx.cpu()]

View File

@@ -32,7 +32,7 @@ ctx_num = 0
def do_flip(data):
for idx in xrange(data.shape[0]):
for idx in range(data.shape[0]):
data[idx,:,:] = np.fliplr(data[idx,:,:])
def get_feature(buffer):
@@ -89,7 +89,7 @@ def main(args):
ctx = []
cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
if len(cvd)>0:
for i in xrange(len(cvd.split(','))):
for i in range(len(cvd.split(','))):
ctx.append(mx.gpu(i))
if len(ctx)==0:
ctx = [mx.cpu()]

3
python-package/README.md Normal file
View File

@@ -0,0 +1,3 @@
InsightFace.ai README

View File

@@ -0,0 +1,28 @@
# coding: utf-8
# pylint: disable=wrong-import-position
"""InsightFace: A Face Analysis Toolkit."""
from __future__ import absolute_import

# mxnet version check
#mx_version = '1.4.0'
try:
    # mxnet must be importable before any submodule loads; the minimum-version
    # check below is currently disabled.
    import mxnet as mx
    #from distutils.version import LooseVersion
    #if LooseVersion(mx.__version__) < LooseVersion(mx_version):
    #    msg = (
    #        "Legacy mxnet-mkl=={} detected, some new modules may not work properly. "
    #        "mxnet-mkl>={} is required. You can use pip to upgrade mxnet "
    #        "`pip install mxnet-mkl --pre --upgrade` "
    #        "or `pip install mxnet-cu90mkl --pre --upgrade`").format(mx.__version__, mx_version)
    #    raise ImportError(msg)
except ImportError:
    # Re-raise with an actionable installation hint for the user.
    raise ImportError(
        "Unable to import dependency mxnet. "
        "A quick tip is to install via `pip install mxnet-mkl/mxnet-cu90mkl --pre`. ")

__version__ = '0.1.3'

from . import model_zoo
from . import utils
from . import app

View File

@@ -0,0 +1 @@
from .face_analysis import *

View File

@@ -0,0 +1,72 @@
from __future__ import division
import collections
import mxnet as mx
import numpy as np
from numpy.linalg import norm
import mxnet.ndarray as nd
from ..model_zoo import model_zoo
from ..utils import face_align
__all__ = ['FaceAnalysis',
'Face']
# Lightweight record for one detected face.  All fields default to None;
# detection always fills bbox/landmark/det_score, while embedding, gender
# and age are populated only when the corresponding models are loaded.
Face = collections.namedtuple('Face', [
    'bbox', 'landmark', 'det_score', 'embedding', 'gender', 'age', 'embedding_norm', 'normed_embedding'])
Face.__new__.__defaults__ = (None,) * len(Face._fields)
class FaceAnalysis:
    """High-level pipeline bundling face detection, recognition and
    gender/age estimation models from the model zoo."""

    def __init__(self, det_name='retinaface_r50_v1', rec_name='arcface_r100_v1', ga_name='genderage_v1'):
        # Detection is mandatory; recognition and gender/age are optional
        # (pass None to disable them).
        assert det_name is not None
        self.det_model = model_zoo.get_model(det_name)
        self.rec_model = model_zoo.get_model(rec_name) if rec_name is not None else None
        self.ga_model = model_zoo.get_model(ga_name) if ga_name is not None else None

    def prepare(self, ctx_id, nms=0.4):
        """Bind every loaded model on the given device (ctx_id < 0 -> CPU)."""
        self.det_model.prepare(ctx_id, nms)
        if self.rec_model is not None:
            self.rec_model.prepare(ctx_id)
        if self.ga_model is not None:
            self.ga_model.prepare(ctx_id)

    def get(self, img, det_thresh=0.8, det_scale=1.0, max_num=0):
        """Detect faces in a BGR image and return a list of Face records.

        When max_num > 0, only the max_num faces scoring best on
        (area - 2 * squared distance from image center) are kept.
        """
        bboxes, landmarks = self.det_model.detect(img, threshold=det_thresh, scale=det_scale)
        if bboxes.shape[0] == 0:
            return []
        if max_num > 0 and bboxes.shape[0] > max_num:
            area = (bboxes[:, 2] - bboxes[:, 0]) * (bboxes[:, 3] - bboxes[:, 1])
            img_center = img.shape[0] // 2, img.shape[1] // 2
            offsets = np.vstack([
                (bboxes[:, 0] + bboxes[:, 2]) / 2 - img_center[1],
                (bboxes[:, 1] + bboxes[:, 3]) / 2 - img_center[0]
            ])
            offset_dist_squared = np.sum(np.power(offsets, 2.0), 0)
            # BUGFIX: np.argmax returns a scalar index, which cannot be
            # sliced by [0:max_num]; rank all candidates instead.
            values = area - offset_dist_squared * 2.0  # some extra weight on the centering
            bindex = np.argsort(values)[::-1][0:max_num]
            bboxes = bboxes[bindex, :]
            landmarks = landmarks[bindex, :]
        ret = []
        for i in range(bboxes.shape[0]):
            bbox = bboxes[i, 0:4]
            det_score = bboxes[i, 4]
            landmark = landmarks[i]
            # Align the face crop before feeding the downstream models.
            _img = face_align.norm_crop(img, landmark=landmark)
            embedding = None
            embedding_norm = None
            normed_embedding = None
            gender = None
            age = None
            if self.rec_model is not None:
                embedding = self.rec_model.get_embedding(_img).flatten()
                embedding_norm = norm(embedding)
                normed_embedding = embedding / embedding_norm
            if self.ga_model is not None:
                gender, age = self.ga_model.get(_img)
            face = Face(bbox=bbox, landmark=landmark, det_score=det_score,
                        embedding=embedding, gender=gender, age=age,
                        normed_embedding=normed_embedding, embedding_norm=embedding_norm)
            ret.append(face)
        return ret

View File

@@ -0,0 +1 @@
from .model_zoo import get_model, get_model_list

View File

@@ -0,0 +1,425 @@
from __future__ import division
import mxnet as mx
import numpy as np
import mxnet.ndarray as nd
__all__ = ['FaceDetector',
'retinaface_r50_v1',
'retinaface_mnet025_v1',
'retinaface_mnet025_v2',
'get_retinaface']
def _whctrs(anchor):
"""
Return width, height, x center, and y center for an anchor (window).
"""
w = anchor[2] - anchor[0] + 1
h = anchor[3] - anchor[1] + 1
x_ctr = anchor[0] + 0.5 * (w - 1)
y_ctr = anchor[1] + 0.5 * (h - 1)
return w, h, x_ctr, y_ctr
def _mkanchors(ws, hs, x_ctr, y_ctr):
"""
Given a vector of widths (ws) and heights (hs) around a center
(x_ctr, y_ctr), output a set of anchors (windows).
"""
ws = ws[:, np.newaxis]
hs = hs[:, np.newaxis]
anchors = np.hstack((x_ctr - 0.5 * (ws - 1),
y_ctr - 0.5 * (hs - 1),
x_ctr + 0.5 * (ws - 1),
y_ctr + 0.5 * (hs - 1)))
return anchors
def _ratio_enum(anchor, ratios):
    """Enumerate anchors sharing (approximately) the area of `anchor`,
    one per aspect ratio."""
    w, h, x_ctr, y_ctr = _whctrs(anchor)
    area = w * h
    ws = np.round(np.sqrt(area / ratios))
    hs = np.round(ws * ratios)
    return _mkanchors(ws, hs, x_ctr, y_ctr)
def _scale_enum(anchor, scales):
    """Enumerate anchors scaled by each factor in `scales`, keeping the
    center of `anchor` fixed."""
    w, h, x_ctr, y_ctr = _whctrs(anchor)
    return _mkanchors(w * scales, h * scales, x_ctr, y_ctr)
def anchors_plane(height, width, stride, base_anchors):
    """
    Spread a base anchor set over every cell of a feature-map plane.

    Parameters
    ----------
    height: height of plane
    width: width of plane
    stride: stride of the original image
    base_anchors: (A, 4) a base set of anchors

    Returns
    -------
    all_anchors: (height, width, A, 4) float32 ndarray of anchors spreading
    over the plane

    Vectorized with broadcasting instead of the original O(H*W*A) Python
    triple loop; output values and dtype are unchanged.
    """
    A = base_anchors.shape[0]
    # Per-cell pixel offsets in the original image.
    shift_x = np.arange(width, dtype=np.float32) * stride
    shift_y = np.arange(height, dtype=np.float32) * stride
    shifts = np.zeros((height, width, 1, 4), dtype=np.float32)
    # x1/x2 shift with the column, y1/y2 with the row.
    shifts[:, :, 0, 0] = shift_x[np.newaxis, :]
    shifts[:, :, 0, 2] = shift_x[np.newaxis, :]
    shifts[:, :, 0, 1] = shift_y[:, np.newaxis]
    shifts[:, :, 0, 3] = shift_y[:, np.newaxis]
    all_anchors = base_anchors.astype(np.float32)[np.newaxis, np.newaxis, :, :] + shifts
    return all_anchors
def generate_anchors(base_size=16, ratios=(0.5, 1, 2),
                     scales=2 ** np.arange(3, 6), stride=16):
    """
    Generate anchor (reference) windows by enumerating aspect ratios X
    scales wrt a reference (0, 0, base_size-1, base_size-1) window.

    `ratios` default changed from a mutable list to an equivalent tuple
    (never-mutate-default idiom); callers may still pass any sequence.
    `stride` is unused here but kept for signature compatibility.
    """
    base_anchor = np.array([1, 1, base_size, base_size]) - 1
    ratio_anchors = _ratio_enum(base_anchor, ratios)
    return np.vstack([_scale_enum(ratio_anchors[i, :], scales)
                      for i in range(ratio_anchors.shape[0])])
def generate_anchors_fpn(cfg):
    """Build one base-anchor array per FPN level, ordered by decreasing
    stride.  `cfg` maps stride strings to dicts with BASE_SIZE, RATIOS
    and SCALES entries."""
    strides = sorted((int(key) for key in cfg), reverse=True)
    anchors = []
    for stride in strides:
        settings = cfg[str(stride)]
        level_anchors = generate_anchors(settings['BASE_SIZE'],
                                         np.array(settings['RATIOS']),
                                         np.array(settings['SCALES']),
                                         stride)
        anchors.append(level_anchors)
    return anchors
def clip_pad(tensor, pad_shape):
    """
    Crop the padded area off an [n, c, H, W] tensor.

    :param tensor: [n, c, H, W]
    :param pad_shape: [h, w] target spatial size
    :return: [n, c, h, w]; the input is returned unchanged when it is
             already no larger than pad_shape.
    """
    h, w = pad_shape
    H, W = tensor.shape[2:]
    if h < H or w < W:
        return tensor[:, :, :h, :w].copy()
    return tensor
def bbox_pred(boxes, box_deltas):
    """
    Transform the set of class-agnostic boxes into class-specific boxes
    by applying the predicted offsets (box_deltas).

    :param boxes: [N, 4] anchors as (x1, y1, x2, y2)
    :param box_deltas: [N, 4 * num_classes] regression deltas
                       (dx, dy scale with box size; dw, dh are log-space)
    :return: [N, 4 * num_classes]; columns past 4 are copied through
    """
    if boxes.shape[0] == 0:
        return np.zeros((0, box_deltas.shape[1]))
    # BUGFIX: the `np.float` alias was removed in NumPy >= 1.24;
    # np.float64 is the exact equivalent.
    boxes = boxes.astype(np.float64, copy=False)
    widths = boxes[:, 2] - boxes[:, 0] + 1.0
    heights = boxes[:, 3] - boxes[:, 1] + 1.0
    ctr_x = boxes[:, 0] + 0.5 * (widths - 1.0)
    ctr_y = boxes[:, 1] + 0.5 * (heights - 1.0)
    dx = box_deltas[:, 0:1]
    dy = box_deltas[:, 1:2]
    dw = box_deltas[:, 2:3]
    dh = box_deltas[:, 3:4]
    pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis]
    pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis]
    pred_w = np.exp(dw) * widths[:, np.newaxis]
    pred_h = np.exp(dh) * heights[:, np.newaxis]
    pred_boxes = np.zeros(box_deltas.shape)
    pred_boxes[:, 0:1] = pred_ctr_x - 0.5 * (pred_w - 1.0)  # x1
    pred_boxes[:, 1:2] = pred_ctr_y - 0.5 * (pred_h - 1.0)  # y1
    pred_boxes[:, 2:3] = pred_ctr_x + 0.5 * (pred_w - 1.0)  # x2
    pred_boxes[:, 3:4] = pred_ctr_y + 0.5 * (pred_h - 1.0)  # y2
    if box_deltas.shape[1] > 4:
        # Extra columns (e.g. auxiliary predictions) pass through unchanged.
        pred_boxes[:, 4:] = box_deltas[:, 4:]
    return pred_boxes
def landmark_pred(boxes, landmark_deltas):
    """Apply landmark regression deltas to anchor boxes.

    :param boxes: [N, 4] anchors as (x1, y1, x2, y2)
    :param landmark_deltas: [N, 5, 2] per-point offsets, in units of
                            anchor width/height relative to the anchor center
    :return: [N, 5, 2] absolute landmark coordinates
    """
    if boxes.shape[0] == 0:
        return np.zeros((0, landmark_deltas.shape[1]))
    # BUGFIX: the `np.float` alias was removed in NumPy >= 1.24;
    # np.float64 is the exact equivalent.
    boxes = boxes.astype(np.float64, copy=False)
    widths = boxes[:, 2] - boxes[:, 0] + 1.0
    heights = boxes[:, 3] - boxes[:, 1] + 1.0
    ctr_x = boxes[:, 0] + 0.5 * (widths - 1.0)
    ctr_y = boxes[:, 1] + 0.5 * (heights - 1.0)
    pred = landmark_deltas.copy()
    for i in range(5):
        pred[:, i, 0] = landmark_deltas[:, i, 0] * widths + ctr_x
        pred[:, i, 1] = landmark_deltas[:, i, 1] * heights + ctr_y
    return pred
class FaceDetector:
    """RetinaFace-style anchor-based face detector producing bounding boxes
    and (when the network provides them) 5-point landmarks."""

    def __init__(self, param_file, rac):
        self.param_file = param_file
        # Anchor-configuration name: 'net3', 'net3l' or 'net5'.
        self.rac = rac
        self.default_image_size = (480, 640)

    def prepare(self, ctx_id, nms=0.4, fix_image_size=None):
        """Load the checkpoint, bind the network (ctx_id < 0 -> CPU) and
        build the per-stride anchor configuration."""
        # param_file looks like '<prefix>-<epoch>.params'.
        pos = self.param_file.rfind('-')
        prefix = self.param_file[0:pos]
        pos2 = self.param_file.rfind('.')
        epoch = int(self.param_file[pos+1:pos2])
        sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch)
        if ctx_id >= 0:
            ctx = mx.gpu(ctx_id)
        else:
            ctx = mx.cpu()
        model = mx.mod.Module(symbol=sym, context=ctx, label_names=None)
        if fix_image_size is not None:
            data_shape = (1, 3) + fix_image_size
        else:
            data_shape = (1, 3) + self.default_image_size
        model.bind(data_shapes=[('data', data_shape)])
        model.set_params(arg_params, aux_params)
        # Warmup forward pass so the first real detect() is not slowed by
        # lazy initialization; asnumpy() forces the computation to finish.
        data = mx.nd.zeros(shape=data_shape)
        db = mx.io.DataBatch(data=(data,))
        model.forward(db, is_train=False)
        model.get_outputs()[0].asnumpy()
        self.model = model
        self.nms_threshold = nms
        self.landmark_std = 1.0
        _ratio = (1.,)
        fmc = 3
        if self.rac == 'net3':
            _ratio = (1.,)
        elif self.rac == 'net3l':
            _ratio = (1.,)
            self.landmark_std = 0.2
        elif self.rac == 'net5':  # retinaface
            # BUGFIX: was `elif network=='net5'`, referencing an undefined
            # name and raising NameError before the assert could fire.
            fmc = 5
        else:
            assert False, 'rac setting error %s' % self.rac
        if fmc == 3:
            self._feat_stride_fpn = [32, 16, 8]
            self.anchor_cfg = {
                '32': {'SCALES': (32, 16), 'BASE_SIZE': 16, 'RATIOS': _ratio, 'ALLOWED_BORDER': 9999},
                '16': {'SCALES': (8, 4), 'BASE_SIZE': 16, 'RATIOS': _ratio, 'ALLOWED_BORDER': 9999},
                '8': {'SCALES': (2, 1), 'BASE_SIZE': 16, 'RATIOS': _ratio, 'ALLOWED_BORDER': 9999},
            }
        elif fmc == 5:
            self._feat_stride_fpn = [64, 32, 16, 8, 4]
            self.anchor_cfg = {}
            # Three scales per stride, geometrically spaced by 2^(1/3).
            _ass = 2.0**(1.0/3)
            _basescale = 1.0
            for _stride in [4, 8, 16, 32, 64]:
                key = str(_stride)
                value = {'BASE_SIZE': 16, 'RATIOS': _ratio, 'ALLOWED_BORDER': 9999}
                scales = []
                for _ in range(3):
                    scales.append(_basescale)
                    _basescale *= _ass
                value['SCALES'] = tuple(scales)
                self.anchor_cfg[key] = value
        print(self._feat_stride_fpn, self.anchor_cfg)
        # Three output heads per stride (scores, bboxes, landmarks) means
        # the network predicts landmarks; two heads means bboxes only.
        self.use_landmarks = False
        if len(sym) // len(self._feat_stride_fpn) == 3:
            self.use_landmarks = True
        print('use_landmarks', self.use_landmarks)
        self.fpn_keys = []
        for s in self._feat_stride_fpn:
            self.fpn_keys.append('stride%s' % s)
        self._anchors_fpn = dict(zip(self.fpn_keys, generate_anchors_fpn(cfg=self.anchor_cfg)))
        for k in self._anchors_fpn:
            v = self._anchors_fpn[k].astype(np.float32)
            self._anchors_fpn[k] = v
        # Cache of per-(height, width, stride) anchor planes, bounded below.
        self.anchor_plane_cache = {}
        self._num_anchors = dict(zip(self.fpn_keys, [anchors.shape[0] for anchors in self._anchors_fpn.values()]))

    def detect(self, img, threshold=0.5, scale=1.0):
        """Run detection on a BGR image.

        Returns (det, landmarks): det is [K, 5] boxes with score in column
        4; landmarks is [K, 5, 2] or None when the network has no landmark
        head.  `scale` resizes the image before inference; outputs are
        mapped back to original-image coordinates.
        """
        proposals_list = []
        scores_list = []
        landmarks_list = []
        if scale == 1.0:
            im = img
        else:
            # BUGFIX: cv2 is only needed here and was never imported in
            # this module; import locally so the no-resize path works
            # even without opencv installed.
            import cv2
            im = cv2.resize(img, None, None, fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR)
        im_info = [im.shape[0], im.shape[1]]
        # BGR (H, W, 3) -> RGB (1, 3, H, W) float tensor.
        im_tensor = np.zeros((1, 3, im.shape[0], im.shape[1]))
        for i in range(3):
            im_tensor[0, i, :, :] = im[:, :, 2 - i]
        data = nd.array(im_tensor)
        db = mx.io.DataBatch(data=(data,), provide_data=[('data', data.shape)])
        self.model.forward(db, is_train=False)
        net_out = self.model.get_outputs()
        for _idx, s in enumerate(self._feat_stride_fpn):
            _key = 'stride%s' % s
            stride = int(s)
            # Outputs are grouped per stride: scores, bboxes[, landmarks].
            if self.use_landmarks:
                idx = _idx * 3
            else:
                idx = _idx * 2
            scores = net_out[idx].asnumpy()
            # Keep only the foreground half of the score channels.
            scores = scores[:, self._num_anchors['stride%s' % s]:, :, :]
            idx += 1
            bbox_deltas = net_out[idx].asnumpy()
            height, width = bbox_deltas.shape[2], bbox_deltas.shape[3]
            A = self._num_anchors['stride%s' % s]
            K = height * width
            key = (height, width, stride)
            if key in self.anchor_plane_cache:
                anchors = self.anchor_plane_cache[key]
            else:
                anchors_fpn = self._anchors_fpn['stride%s' % s]
                anchors = anchors_plane(height, width, stride, anchors_fpn)
                anchors = anchors.reshape((K * A, 4))
                if len(self.anchor_plane_cache) < 100:
                    self.anchor_plane_cache[key] = anchors
            scores = clip_pad(scores, (height, width))
            scores = scores.transpose((0, 2, 3, 1)).reshape((-1, 1))
            bbox_deltas = clip_pad(bbox_deltas, (height, width))
            bbox_deltas = bbox_deltas.transpose((0, 2, 3, 1))
            bbox_pred_len = bbox_deltas.shape[3] // A
            bbox_deltas = bbox_deltas.reshape((-1, bbox_pred_len))
            proposals = bbox_pred(anchors, bbox_deltas)
            #proposals = clip_boxes(proposals, im_info[:2])
            scores_ravel = scores.ravel()
            order = np.where(scores_ravel >= threshold)[0]
            proposals = proposals[order, :]
            scores = scores[order]
            # Map back to original-image coordinates.
            proposals[:, 0:4] /= scale
            proposals_list.append(proposals)
            scores_list.append(scores)
            if self.use_landmarks:
                idx += 1
                landmark_deltas = net_out[idx].asnumpy()
                landmark_deltas = clip_pad(landmark_deltas, (height, width))
                landmark_pred_len = landmark_deltas.shape[1] // A
                landmark_deltas = landmark_deltas.transpose((0, 2, 3, 1)).reshape((-1, 5, landmark_pred_len // 5))
                landmark_deltas *= self.landmark_std
                landmarks = landmark_pred(anchors, landmark_deltas)
                landmarks = landmarks[order, :]
                landmarks[:, :, 0:2] /= scale
                landmarks_list.append(landmarks)
        proposals = np.vstack(proposals_list)
        landmarks = None
        if proposals.shape[0] == 0:
            if self.use_landmarks:
                landmarks = np.zeros((0, 5, 2))
            return np.zeros((0, 5)), landmarks
        scores = np.vstack(scores_list)
        scores_ravel = scores.ravel()
        order = scores_ravel.argsort()[::-1]
        proposals = proposals[order, :]
        scores = scores[order]
        if self.use_landmarks:
            landmarks = np.vstack(landmarks_list)
            landmarks = landmarks[order].astype(np.float32, copy=False)
        pre_det = np.hstack((proposals[:, 0:4], scores)).astype(np.float32, copy=False)
        keep = self.nms(pre_det)
        det = np.hstack((pre_det, proposals[:, 4:]))
        det = det[keep, :]
        if self.use_landmarks:
            landmarks = landmarks[keep]
        return det, landmarks

    def nms(self, dets):
        """Greedy non-maximum suppression on [N, 5] (x1, y1, x2, y2, score)
        rows; returns the indices of the kept detections."""
        thresh = self.nms_threshold
        x1 = dets[:, 0]
        y1 = dets[:, 1]
        x2 = dets[:, 2]
        y2 = dets[:, 3]
        scores = dets[:, 4]
        areas = (x2 - x1 + 1) * (y2 - y1 + 1)
        order = scores.argsort()[::-1]
        keep = []
        while order.size > 0:
            i = order[0]
            keep.append(i)
            xx1 = np.maximum(x1[i], x1[order[1:]])
            yy1 = np.maximum(y1[i], y1[order[1:]])
            xx2 = np.minimum(x2[i], x2[order[1:]])
            yy2 = np.minimum(y2[i], y2[order[1:]])
            w = np.maximum(0.0, xx2 - xx1 + 1)
            h = np.maximum(0.0, yy2 - yy1 + 1)
            inter = w * h
            ovr = inter / (areas[i] + areas[order[1:]] - inter)
            # Drop everything overlapping the current best above threshold.
            inds = np.where(ovr <= thresh)[0]
            order = order[inds + 1]
        return keep
def get_retinaface(name, rac='net3',
                   root='~/.insightface/models', **kwargs):
    """Fetch the pretrained RetinaFace weights named `name` from the model
    store (downloading into `root` if needed) and wrap them in a
    FaceDetector configured for anchor scheme `rac`."""
    from .model_store import get_model_file
    params_path = get_model_file("retinaface_%s" % name, root=root)
    return FaceDetector(params_path, rac)
def retinaface_r50_v1(**kwargs):
    # ResNet-50 backbone, standard 'net3' anchor scheme.
    return get_retinaface("r50_v1", rac='net3', **kwargs)

def retinaface_mnet025_v1(**kwargs):
    # MobileNet-0.25 backbone, standard 'net3' anchor scheme.
    return get_retinaface("mnet025_v1", rac='net3', **kwargs)

def retinaface_mnet025_v2(**kwargs):
    # 'net3l' variant: landmark deltas are scaled by a 0.2 regression std.
    return get_retinaface("mnet025_v2", rac='net3l', **kwargs)

View File

@@ -0,0 +1,77 @@
from __future__ import division
import mxnet as mx
import numpy as np
import cv2
__all__ = ['FaceGenderage',
'genderage_v1',
'get_genderage']
class FaceGenderage:
    """Gender + age estimator operating on aligned 112x112 BGR face crops."""

    def __init__(self, name, download, param_file):
        self.name = name
        self.download = download
        self.param_file = param_file
        # Fixed input resolution expected by the pretrained network.
        self.image_size = (112, 112)
        if download:
            assert param_file

    def prepare(self, ctx_id):
        """Load the checkpoint and bind the network on GPU `ctx_id`
        (any negative value selects the CPU)."""
        if self.param_file:
            # param_file looks like '<prefix>-<epoch>.params'.
            pos = self.param_file.rfind('-')
            prefix = self.param_file[0:pos]
            pos2 = self.param_file.rfind('.')
            epoch = int(self.param_file[pos+1:pos2])
            sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch)
            all_layers = sym.get_internals()
            # Truncate the graph at the 'fc1' output head.
            sym = all_layers['fc1_output']
            if ctx_id>=0:
                ctx = mx.gpu(ctx_id)
            else:
                ctx = mx.cpu()
            model = mx.mod.Module(symbol=sym, context=ctx, label_names = None)
            data_shape = (1,3)+self.image_size
            model.bind(data_shapes=[('data', data_shape)])
            model.set_params(arg_params, aux_params)
            #warmup forward pass; asnumpy() forces the computation to finish
            data = mx.nd.zeros(shape=data_shape)
            db = mx.io.DataBatch(data=(data,))
            model.forward(db, is_train=False)
            embedding = model.get_outputs()[0].asnumpy()
            self.model = model
        else:
            # No parameter file; nothing to load.
            pass

    def get(self, img):
        """Predict (gender, age) for one aligned 112x112x3 BGR crop.

        gender is the argmax over the first two outputs (presumably
        0=female, 1=male — confirm against the training labels); age is the
        sum of 100 two-way age-bin decisions from outputs 2..201.
        """
        assert self.param_file and self.model
        assert img.shape[2]==3 and img.shape[0:2]==self.image_size
        data = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        data = np.transpose(data, (2,0,1))  # HWC -> CHW
        data = np.expand_dims(data, axis=0)
        data = mx.nd.array(data)
        db = mx.io.DataBatch(data=(data,))
        self.model.forward(db, is_train=False)
        ret = self.model.get_outputs()[0].asnumpy()
        g = ret[:,0:2].flatten()
        gender = np.argmax(g)
        # 100 binary age-bin classifiers; summing their argmaxes gives age.
        a = ret[:,2:202].reshape( (100,2) )
        a = np.argmax(a, axis=1)
        age = int(sum(a))
        return gender, age
def get_genderage(name, download=True,
                  root='~/.insightface/models', **kwargs):
    """Create a FaceGenderage model; when `download` is True the pretrained
    weights are fetched into `root` first."""
    if not download:
        return FaceGenderage(name, False, None)
    from .model_store import get_model_file
    params_path = get_model_file("genderage_%s" % name, root=root)
    return FaceGenderage(name, True, params_path)
def genderage_v1(**kwargs):
    # Pretrained gender+age model; weights are downloaded on first use.
    return get_genderage("v1", download=True, **kwargs)

View File

@@ -0,0 +1,83 @@
from __future__ import division
import mxnet as mx
import numpy as np
import cv2
__all__ = ['FaceRecognition',
'arcface_r100_v1', 'arcface_outofreach_v1', 'arcface_mfn_v1',
'get_arcface']
class FaceRecognition:
    """ArcFace-style embedding extractor operating on aligned 112x112 BGR
    face crops."""

    def __init__(self, name, download, param_file):
        self.name = name
        self.download = download
        self.param_file = param_file
        # Fixed input resolution expected by the pretrained network.
        self.image_size = (112, 112)
        if download:
            assert param_file

    def prepare(self, ctx_id):
        """Load the checkpoint and bind the network on GPU `ctx_id`
        (any negative value selects the CPU)."""
        if self.param_file:
            # param_file looks like '<prefix>-<epoch>.params'.
            pos = self.param_file.rfind('-')
            prefix = self.param_file[0:pos]
            pos2 = self.param_file.rfind('.')
            epoch = int(self.param_file[pos+1:pos2])
            sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch)
            all_layers = sym.get_internals()
            # Use the 'fc1' embedding layer as the network output.
            sym = all_layers['fc1_output']
            if ctx_id>=0:
                ctx = mx.gpu(ctx_id)
            else:
                ctx = mx.cpu()
            model = mx.mod.Module(symbol=sym, context=ctx, label_names = None)
            data_shape = (1,3)+self.image_size
            model.bind(data_shapes=[('data', data_shape)])
            model.set_params(arg_params, aux_params)
            #warmup forward pass; asnumpy() forces the computation to finish
            data = mx.nd.zeros(shape=data_shape)
            db = mx.io.DataBatch(data=(data,))
            model.forward(db, is_train=False)
            embedding = model.get_outputs()[0].asnumpy()
            self.model = model
        else:
            # No parameter file; nothing to load.
            pass

    def get_embedding(self, img):
        """Return the raw (un-normalized) embedding for one aligned
        112x112x3 BGR crop."""
        assert self.param_file and self.model
        assert img.shape[2]==3 and img.shape[0:2]==self.image_size
        data = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        data = np.transpose(data, (2,0,1))  # HWC -> CHW
        data = np.expand_dims(data, axis=0)
        data = mx.nd.array(data)
        db = mx.io.DataBatch(data=(data,))
        self.model.forward(db, is_train=False)
        embedding = self.model.get_outputs()[0].asnumpy()
        return embedding

    def compute_sim(self, img1, img2):
        """Cosine similarity between the embeddings of two aligned crops."""
        emb1 = self.get_embedding(img1).flatten()
        emb2 = self.get_embedding(img2).flatten()
        from numpy.linalg import norm
        sim = np.dot(emb1, emb2)/(norm(emb1)*norm(emb2))
        return sim
def get_arcface(name, download=True,
                root='~/.insightface/models', **kwargs):
    """Create a FaceRecognition model; when `download` is True the
    pretrained weights are fetched into `root` first."""
    if not download:
        return FaceRecognition(name, False, None)
    from .model_store import get_model_file
    params_path = get_model_file("arcface_%s" % name, root=root)
    return FaceRecognition(name, True, params_path)
def arcface_r100_v1(**kwargs):
    # ResNet-100 recognition model; weights are downloaded on first use.
    return get_arcface("r100_v1", download=True, **kwargs)

def arcface_mfn_v1(**kwargs):
    # MobileFaceNet recognition model; weights are downloaded on first use.
    return get_arcface("mfn_v1", download=True, **kwargs)

def arcface_outofreach_v1(**kwargs):
    # No published download for this model; weights must exist locally.
    return get_arcface("outofreach_v1", download=False, **kwargs)

View File

@@ -0,0 +1,97 @@
"""
This code file mainly comes from https://github.com/dmlc/gluon-cv/blob/master/gluoncv/model_zoo/model_store.py
"""
from __future__ import print_function
__all__ = ['get_model_file']
import os
import zipfile
import glob
from ..utils import download, check_sha1
# Registry mapping model name -> expected sha1 checksum of its .params file.
# An empty checksum means the entry is registered but not yet published.
_model_sha1 = {name: checksum for checksum, name in [
    ('95be21b58e29e9c1237f229dae534bd854009ce0', 'arcface_r100_v1'),
    ('', 'arcface_mfn_v1'),
    ('39fd1e087a2a2ed70a154ac01fecaa86c315d01b', 'retinaface_r50_v1'),
    ('2c9de8116d1f448fd1d4661f90308faae34c990a', 'retinaface_mnet025_v1'),
    ('0db1d07921d005e6c9a5b38e059452fc5645e5a4', 'retinaface_mnet025_v2'),
    ('7dd8111652b7aac2490c5dcddeb268e53ac643e6', 'genderage_v1'),
]}

# All model archives are served as zip files from the insightface file host.
base_repo_url = 'http://insightface.ai/files/'
_url_format = '{repo_url}models/{file_name}.zip'
def short_hash(name):
    """Return the first 8 hex characters of a model's registered sha1 sum,
    raising ValueError for unknown names."""
    if name not in _model_sha1:
        raise ValueError('Pretrained model for {name} is not available.'.format(name=name))
    return _model_sha1[name][:8]
def find_params_file(dir_path):
    """Return the lexicographically last *.params file inside `dir_path`,
    or None when the directory is missing or holds no such file."""
    if not os.path.exists(dir_path):
        return None
    candidates = sorted(glob.glob("%s/*.params" % dir_path))
    return candidates[-1] if candidates else None
def get_model_file(name, root=os.path.join('~', '.insightface', 'models')):
    r"""Return location for the pretrained on local file system.

    This function will download from online model zoo when model cannot be
    found or has mismatch.  The root directory will be created if it
    doesn't exist.

    Parameters
    ----------
    name : str
        Name of the model.
    root : str, default '~/.insightface/models'
        Location for keeping the model parameters.

    Returns
    -------
    file_path
        Path to the requested pretrained model file.
    """
    file_name = name
    root = os.path.expanduser(root)
    # Each model unpacks into its own subdirectory of `root`.
    dir_path = os.path.join(root, name)
    file_path = find_params_file(dir_path)
    #file_path = os.path.join(root, file_name + '.params')
    sha1_hash = _model_sha1[name]
    if file_path is not None:
        # Re-download when the cached file's checksum no longer matches.
        if check_sha1(file_path, sha1_hash):
            return file_path
        else:
            print('Mismatch in the content of model file detected. Downloading again.')
    else:
        print('Model file is not found. Downloading.')

    if not os.path.exists(root):
        os.makedirs(root)
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)

    # Download the zip archive next to the model directory, extract, clean up.
    zip_file_path = os.path.join(root, file_name + '.zip')
    repo_url = base_repo_url
    if repo_url[-1] != '/':
        repo_url = repo_url + '/'
    download(_url_format.format(repo_url=repo_url, file_name=file_name),
             path=zip_file_path,
             overwrite=True)
    with zipfile.ZipFile(zip_file_path) as zf:
        zf.extractall(dir_path)
    os.remove(zip_file_path)
    file_path = find_params_file(dir_path)

    if check_sha1(file_path, sha1_hash):
        return file_path
    else:
        raise ValueError('Downloaded file has different hash. Please try again.')

View File

@@ -0,0 +1,57 @@
# pylint: disable=wildcard-import, unused-wildcard-import
"""
This code file mainly comes from https://github.com/dmlc/gluon-cv/blob/master/gluoncv/model_zoo/model_zoo.py
"""
from .face_recognition import *
from .face_detection import *
from .face_genderage import *
#from .face_alignment import *
__all__ = ['get_model', 'get_model_list']
# Registry of model factory functions; keys are the names accepted by
# get_model().  Entries without published weights stay commented out.
_models = {
    'arcface_r100_v1': arcface_r100_v1,
    #'arcface_mfn_v1': arcface_mfn_v1,
    #'arcface_outofreach_v1': arcface_outofreach_v1,
    'retinaface_r50_v1': retinaface_r50_v1,
    'retinaface_mnet025_v1': retinaface_mnet025_v1,
    'retinaface_mnet025_v2': retinaface_mnet025_v2,
    'genderage_v1': genderage_v1,
}
def get_model(name, **kwargs):
    """Returns a pre-defined model by name.

    Parameters
    ----------
    name : str
        Name of the model (case-insensitive).
    root : str, default '~/.insightface/models'
        Location for keeping the model parameters.

    Returns
    -------
    Model
        The model.

    Raises
    ------
    ValueError
        When `name` is not a registered model, listing the valid names.
    """
    key = name.lower()
    if key not in _models:
        available = '\n\t'.join(sorted(_models.keys()))
        raise ValueError('"%s" is not among the following model list:\n\t%s' % (key, available))
    return _models[key](**kwargs)
def get_model_list():
    """Get the entire list of model names in model_zoo.

    Returns
    -------
    list of str
        Entire list of model names in model_zoo, alphabetically sorted.
    """
    return sorted(_models)

View File

@@ -0,0 +1,17 @@
from __future__ import absolute_import
#from . import bbox
#from . import viz
#from . import random
#from . import metrics
#from . import parallel
from .download import download, check_sha1
from .filesystem import makedirs
from .filesystem import try_import_dali
#from .bbox import bbox_iou
#from .block import recursive_visit, set_lr_mult, freeze_bn
#from .lr_scheduler import LRSequential, LRScheduler
#from .plot_history import TrainingHistory
#from .export_helper import export_block
#from .sync_loader_helper import split_data, split_and_load

View File

@@ -0,0 +1,90 @@
"""
This code file mainly comes from https://github.com/dmlc/gluon-cv/blob/master/gluoncv/utils/download.py
"""
import os
import hashlib
import requests
from tqdm import tqdm
def check_sha1(filename, sha1_hash):
    """Check whether the sha1 hash of the file content matches the expected hash.

    Parameters
    ----------
    filename : str
        Path to the file.
    sha1_hash : str
        Expected sha1 hash in hexadecimal digits.  May be a prefix: only
        the common-length prefix of the two hashes is compared.

    Returns
    -------
    bool
        Whether the file content matches the expected hash.
    """
    sha1 = hashlib.sha1()
    with open(filename, 'rb') as f:
        # Stream in 1 MiB chunks so large files need not fit in memory.
        while True:
            data = f.read(1048576)
            if not data:
                break
            sha1.update(data)
    # Reuse the digest instead of recomputing it (original called
    # hexdigest() twice), and avoid the ambiguous single-letter name `l`.
    sha1_file = sha1.hexdigest()
    n = min(len(sha1_file), len(sha1_hash))
    return sha1_file[0:n] == sha1_hash[0:n]
def download(url, path=None, overwrite=False, sha1_hash=None):
    """Download an given URL

    Parameters
    ----------
    url : str
        URL to download
    path : str, optional
        Destination path to store downloaded file. By default stores to the
        current directory with same name as in url.
    overwrite : bool, optional
        Whether to overwrite destination file if already exists.
    sha1_hash : str, optional
        Expected sha1 hash in hexadecimal digits. Will ignore existing file when hash is specified
        but doesn't match.

    Returns
    -------
    str
        The file path of the downloaded file.
    """
    if path is None:
        fname = url.split('/')[-1]
    else:
        path = os.path.expanduser(path)
        if os.path.isdir(path):
            fname = os.path.join(path, url.split('/')[-1])
        else:
            fname = path

    # Skip the download entirely when a valid cached copy exists.
    if overwrite or not os.path.exists(fname) or (sha1_hash and not check_sha1(fname, sha1_hash)):
        dirname = os.path.dirname(os.path.abspath(os.path.expanduser(fname)))
        if not os.path.exists(dirname):
            os.makedirs(dirname)

        print('Downloading %s from %s...'%(fname, url))
        r = requests.get(url, stream=True)
        if r.status_code != 200:
            raise RuntimeError("Failed downloading url %s"%url)
        total_length = r.headers.get('content-length')
        with open(fname, 'wb') as f:
            if total_length is None: # no content length header
                for chunk in r.iter_content(chunk_size=1024):
                    if chunk: # filter out keep-alive new chunks
                        f.write(chunk)
            else:
                # Known size: stream with a tqdm progress bar in KB units.
                total_length = int(total_length)
                for chunk in tqdm(r.iter_content(chunk_size=1024),
                                  total=int(total_length / 1024. + 0.5),
                                  unit='KB', unit_scale=False, dynamic_ncols=True):
                    f.write(chunk)

        # NOTE(review): this *raises* UserWarning (it is an exception here,
        # not a warnings.warn call) — kept as-is for upstream compatibility.
        if sha1_hash and not check_sha1(fname, sha1_hash):
            raise UserWarning('File {} is downloaded but the content hash does not match. ' \
                              'The repo may be outdated or download may be incomplete. ' \
                              'If the "repo_url" is overridden, consider switching to ' \
                              'the default repo.'.format(fname))

    return fname

View File

@@ -0,0 +1,88 @@
import cv2
import numpy as np
from skimage import transform as trans
# Five canonical 5-point landmark templates (eye-L, eye-R, nose, mouth-L,
# mouth-R) for a 112x112 crop, covering head poses from left profile
# through frontal to right profile.
src1 = np.array([
    [51.642,50.115],
    [57.617,49.990],
    [35.740,69.007],
    [51.157,89.050],
    [57.025,89.702]], dtype=np.float32)
#<--left
src2 = np.array([
    [45.031,50.118],
    [65.568,50.872],
    [39.677,68.111],
    [45.177,86.190],
    [64.246,86.758]], dtype=np.float32)
#---frontal
src3 = np.array([
    [39.730,51.138],
    [72.270,51.138],
    [56.000,68.493],
    [42.463,87.010],
    [69.537,87.010]], dtype=np.float32)
#-->right
src4 = np.array([
    [46.845,50.872],
    [67.382,50.118],
    [72.737,68.111],
    [48.167,86.758],
    [67.236,86.190]], dtype=np.float32)
#-->right profile
src5 = np.array([
    [54.796,49.990],
    [60.771,50.115],
    [76.673,69.007],
    [55.388,89.702],
    [61.257,89.050]], dtype=np.float32)

# Stacked pose templates; index selects the pose bucket.
src = np.array([src1,src2,src3,src4,src5])
# Templates scale linearly with crop size.
src_map = {112 : src, 224 : src*2}

# Canonical ArcFace 5-point template (valid for 112x112 crops only),
# expanded to shape (1, 5, 2) so it iterates like the pose stacks above.
arcface_src = np.array([
    [38.2946, 51.6963],
    [73.5318, 51.5014],
    [56.0252, 71.7366],
    [41.5493, 92.3655],
    [70.7299, 92.2041] ], dtype=np.float32 )
arcface_src = np.expand_dims(arcface_src, axis=0)

# lmk is prediction; src is template
def estimate_norm(lmk, image_size = 112, mode='arcface'):
    """Estimate the best similarity transform mapping the 5 detected
    landmarks `lmk` onto a canonical template.

    Returns (M, index): M is the 2x3 affine matrix for cv2.warpAffine and
    index is the template that produced the smallest alignment error
    (always 0 in 'arcface' mode, which has a single template).
    """
    assert lmk.shape==(5,2)
    tform = trans.SimilarityTransform()
    # Homogeneous coordinates so the 2x3 matrix can be applied directly.
    lmk_tran = np.insert(lmk, 2, values=np.ones(5), axis=1)
    min_M = []
    min_index = []
    min_error = float('inf')
    if mode=='arcface':
        # The ArcFace template is defined for 112x112 crops only.
        assert image_size==112
        src = arcface_src
    else:
        src = src_map[image_size]
    for i in np.arange(src.shape[0]):
        tform.estimate(lmk, src[i])
        M = tform.params[0:2,:]
        results = np.dot(M, lmk_tran.T)
        results = results.T
        # Total Euclidean distance between warped landmarks and template.
        error = np.sum(np.sqrt(np.sum((results - src[i]) ** 2,axis=1)))
        if error< min_error:
            min_error = error
            min_M = M
            min_index = i
    return min_M, min_index
def norm_crop(img, landmark, image_size=112, mode='arcface'):
    """Align a face: fit the best similarity transform from the 5-point
    `landmark` to the template and warp `img` into an
    (image_size, image_size) crop with black borders."""
    M, pose_index = estimate_norm(landmark, image_size, mode)
    warped = cv2.warpAffine(img,M, (image_size, image_size), borderValue = 0.0)
    return warped

View File

@@ -0,0 +1,137 @@
"""
This code file mainly comes from https://github.com/dmlc/gluon-cv/blob/master/gluoncv/utils/filesystem.py
"""
import os
import errno
def makedirs(path):
    """Create directory recursively if not exists.

    Similar to `mkdir -p`; safe to call on an already-existing directory.

    Parameters
    ----------
    path : str
        Path of the desired dir
    """
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno == errno.EEXIST:
            return  # already there: that is fine
        raise
def try_import(package, message=None):
    """Try import specified package, with custom message support.

    Parameters
    ----------
    package : str
        The name of the targeting package.
    message : str, default is None
        If not None, raise an ImportError carrying this message instead of
        the original one when the import fails.

    Returns
    -------
    module if found, raise ImportError otherwise
    """
    try:
        return __import__(package)
    except ImportError as err:
        if message:
            raise ImportError(message)
        raise err
def try_import_cv2():
    """Try import cv2 at runtime.

    Returns
    -------
    cv2 module if found. Raise ImportError otherwise
    """
    msg = "cv2 is required, you can install by package manager, e.g. 'apt-get', \
or `pip install opencv-python --user` (note that this is unofficial PYPI package)."
    return try_import('cv2', msg)

def try_import_mmcv():
    """Try import mmcv at runtime.

    Returns
    -------
    mmcv module if found. Raise ImportError otherwise
    """
    msg = "mmcv is required, you can install by first `pip install Cython --user` \
and then `pip install mmcv --user` (note that this is unofficial PYPI package)."
    return try_import('mmcv', msg)

def try_import_rarfile():
    """Try import rarfile at runtime.

    Returns
    -------
    rarfile module if found. Raise ImportError otherwise
    """
    msg = "rarfile is required, you can install by first `sudo apt-get install unrar` \
and then `pip install rarfile --user` (note that this is unofficial PYPI package)."
    return try_import('rarfile', msg)
def import_try_install(package, extern_url=None):
    """Try import the specified package.
    If the package not installed, try use pip to install and import if success.

    Parameters
    ----------
    package : str
        The name of the package trying to import.
    extern_url : str or None, optional
        The external url if package is not hosted on PyPI.
        For example, you can install a package using:
        "pip install git+http://github.com/user/repo/tarball/master/egginfo=xxx".
        In this case, you can pass the url to the extern_url.

    Returns
    -------
    <class 'Module'>
        The imported python module.
    """
    try:
        return __import__(package)
    except ImportError:
        # pip moved its entry point between versions; try both locations.
        try:
            from pip import main as pipmain
        except ImportError:
            from pip._internal import main as pipmain

        # trying to install package
        url = package if extern_url is None else extern_url
        pipmain(['install', '--user', url])  # will raise SystemExit Error if fails

        # trying to load again
        try:
            return __import__(package)
        except ImportError:
            # A --user install may land in a site dir not yet on sys.path.
            import sys
            import site
            user_site = site.getusersitepackages()
            if user_site not in sys.path:
                sys.path.append(user_site)
            return __import__(package)
    # NOTE(review): unreachable — every path above returns or raises.
    return __import__(package)
def try_import_dali():
    """Try import NVIDIA DALI at runtime.

    Returns the real `nvidia.dali` module (with `Pipeline` hoisted onto it)
    when installed; otherwise a stub class whose Pipeline raises
    NotImplementedError on construction.
    """
    try:
        dali = __import__('nvidia.dali', fromlist=['pipeline', 'ops', 'types'])
        dali.Pipeline = dali.pipeline.Pipeline
    except ImportError:
        # Fallback stub keeps call sites importable without DALI.
        class dali:
            class Pipeline:
                def __init__(self):
                    raise NotImplementedError(
                        "DALI not found, please check if you installed it correctly.")
    return dali

61
python-package/setup.py Normal file
View File

@@ -0,0 +1,61 @@
#!/usr/bin/env python
import os
import io
import re
import shutil
import sys
from setuptools import setup, find_packages
def read(*names, **kwargs):
    """Read and return the text of a file located relative to this setup
    script.  Path segments are joined; `encoding` kwarg defaults to utf8."""
    target = os.path.join(os.path.dirname(__file__), *names)
    encoding = kwargs.get("encoding", "utf8")
    with io.open(target, encoding=encoding) as fp:
        return fp.read()
def find_version(*file_paths):
    """Extract the ``__version__`` string from a source file.

    Reads the file via :func:`read` and scans for a line of the form
    ``__version__ = '...'``. Raises ``RuntimeError`` when no such line
    exists, so packaging fails loudly instead of shipping an empty version.
    """
    contents = read(*file_paths)
    match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]",
                      contents, re.M)
    if match is None:
        raise RuntimeError("Unable to find version string.")
    return match.group(1)
# Prefer a reStructuredText long description (what older PyPI renders best)
# by converting README.md with pypandoc; fall back to the raw Markdown text
# when pypandoc is missing or the conversion fails.
try:
    import pypandoc
    long_description = pypandoc.convert('README.md', 'rst')
except(IOError, ImportError):
    long_description = open('README.md').read()
# Single-source the package version from insightface/__init__.py.
VERSION = find_version('insightface', '__init__.py')
# Runtime dependencies installed alongside the package.
requirements = [
    'numpy',
    'tqdm',
    'requests',
    'matplotlib',
    'Pillow',
    'scipy',
    'opencv-python',
    'scikit-learn',
    'scikit-image',
    'easydict',
]
# Package definition: metadata plus discovered packages and dependencies
# (VERSION, long_description and requirements are defined above).
setup(
    # Metadata
    name='insightface',
    version=VERSION,
    author='InsightFace Contributors',
    url='https://github.com/deepinsight/insightface',
    description='InsightFace Toolkit',
    long_description=long_description,
    license='Apache-2.0',
    # Package info
    packages=find_packages(exclude=('docs', 'tests', 'scripts')),
    zip_safe=True,
    include_package_data=True,
    install_requires=requirements,
)

View File

@@ -0,0 +1,124 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
#import mxnet as mx
#from mxnet import ndarray as nd
import argparse
import cv2
import pickle
import numpy as np
import sys
import os
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', 'common'))
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', 'RetinaFace'))
import face_align
from retinaface import RetinaFace
def to_rgb(img):
    """Replicate a 2-D (grayscale) image into a 3-channel uint8 image.

    The input is broadcast into every channel of a freshly allocated
    ``(rows, cols, 3)`` uint8 array (values are cast to uint8 on assignment,
    matching the original behavior).
    """
    rows, cols = img.shape
    out = np.empty((rows, cols, 3), dtype=np.uint8)
    # Broadcast the single plane across all three channels at once.
    out[:] = img[:, :, np.newaxis]
    return out
def IOU(Reframe, GTframe):
    """Intersection-over-union of two boxes given as [x1, y1, x2, y2].

    Returns 0 when the boxes do not overlap, otherwise
    ``area(intersection) / area(union)`` as a float.
    """
    ax, ay = Reframe[0], Reframe[1]
    aw = Reframe[2] - Reframe[0]
    ah = Reframe[3] - Reframe[1]

    bx, by = GTframe[0], GTframe[1]
    bw = GTframe[2] - GTframe[0]
    bh = GTframe[3] - GTframe[1]

    # Overlap extent along each axis: sum of the two extents minus the
    # span of their union; non-positive means the boxes are disjoint.
    inter_w = aw + bw - (max(ax + aw, bx + bw) - min(ax, bx))
    inter_h = ah + bh - (max(ay + ah, by + bh) - min(ay, by))

    if inter_w <= 0 or inter_h <= 0:
        ratio = 0
    else:
        inter_area = inter_w * inter_h
        union_area = aw * ah + bw * bh - inter_area
        ratio = inter_area * 1. / union_area
    return ratio
# Command-line options for packaging verification-pair images into a
# pickled binary set (crops + same/different labels).
parser = argparse.ArgumentParser(description='Package eval images')
# general
parser.add_argument('--data-dir', default='', help='')
parser.add_argument('--image-size', type=int, default=112, help='')
parser.add_argument('--gpu', type=int, default=0, help='')
parser.add_argument('--det-prefix', type=str, default='./model/R50', help='')
parser.add_argument('--output', default='./', help='path to save.')
parser.add_argument('--align-mode', default='arcface', help='align mode.')
args = parser.parse_args()
gpu_id = args.gpu
# RetinaFace detector loaded once at module level and reused for every image.
detector = RetinaFace(args.det_prefix, 0, gpu_id, network='net3')
# Detection-time rescaling bounds: short side is scaled toward target_size,
# but the long side is never allowed to exceed max_size.
target_size = 400
max_size = 800
def get_norm_crop(image_path):
    """Detect a face in the image at ``image_path`` and return an aligned crop.

    Uses the module-level ``detector``, ``args``, ``target_size`` and
    ``max_size`` globals. Returns the warped face image of side
    ``args.image_size``, or None when no face is found even after the retry.
    """
    im = cv2.imread(image_path)
    im_shape = im.shape
    im_size_min = np.min(im_shape[0:2])
    im_size_max = np.max(im_shape[0:2])
    # Scale the short side up/down to target_size ...
    im_scale = float(target_size) / float(im_size_min)
    # prevent bigger axis from being more than max_size:
    if np.round(im_scale * im_size_max) > max_size:
        im_scale = float(max_size) / float(im_size_max)
    bbox, landmark = detector.detect(im, threshold=0.5, scales=[im_scale])
    #print(im.shape, bbox.shape, landmark.shape)
    if bbox.shape[0]==0:
        # No detection: retry with a much lower threshold over several scales.
        bbox, landmark = detector.detect(im, threshold=0.05, scales=[im_scale*0.75, im_scale, im_scale*2.0])
        print('refine', im.shape, bbox.shape, landmark.shape)
    nrof_faces = bbox.shape[0]
    if nrof_faces>0:
        det = bbox[:,0:4]
        img_size = np.asarray(im.shape)[0:2]
        bindex = 0
        if nrof_faces>1:
            # Pick one face by trading off box area against squared distance
            # from the image center (bigger and more central wins).
            bounding_box_size = (det[:,2]-det[:,0])*(det[:,3]-det[:,1])
            img_center = img_size / 2
            offsets = np.vstack([ (det[:,0]+det[:,2])/2-img_center[1], (det[:,1]+det[:,3])/2-img_center[0] ])
            offset_dist_squared = np.sum(np.power(offsets,2.0),0)
            bindex = np.argmax(bounding_box_size-offset_dist_squared*2.0) # some extra weight on the centering
        #_bbox = bounding_boxes[bindex, 0:4]
        _landmark = landmark[bindex]
        warped = face_align.norm_crop(im, landmark = _landmark, image_size=args.image_size, mode=args.align_mode)
        return warped
    else:
        return None
# Walk pairs_label.txt (each line: path1 path2 label, label '0' = different
# person), align-crop both images, JPEG-encode them, and pickle the encoded
# crops together with the same/different flags into args.output.
bins = []
issame_list = []
pp = 0
for line in open(os.path.join(args.data_dir, 'pairs_label.txt'), 'r'):
    pp+=1
    if pp%100==0:
        print('processing', pp)
    line = line.strip().split()
    assert len(line)==3
    path1 = os.path.join(args.data_dir, line[0])
    path2 = os.path.join(args.data_dir, line[1])
    im1 = get_norm_crop(path1)
    im2 = get_norm_crop(path2)
    # NOTE(review): get_norm_crop may return None (no face found); cv2.imencode
    # below would then fail — presumably the dataset guarantees detections.
    issame = True
    if line[2]=='0':
        issame = False
    issame_list.append(issame)
    for im in [im1, im2]:
        _, s = cv2.imencode('.jpg', im)
        bins.append(s)
with open(args.output, 'wb') as f:
    pickle.dump((bins, issame_list), f, protocol=pickle.HIGHEST_PROTOCOL)

File diff suppressed because it is too large Load Diff

View File

@@ -221,7 +221,7 @@ def test(lfw_set, mx_model, batch_size):
issame_list = lfw_set[1]
model = mx_model
embeddings_list = []
for i in xrange( len(lfw_data_list) ):
for i in range( len(lfw_data_list) ):
lfw_data = lfw_data_list[i]
embeddings = None
ba = 0
@@ -256,7 +256,7 @@ def test(lfw_set, mx_model, batch_size):
_xnorm = 0.0
_xnorm_cnt = 0
for embed in embeddings_list:
for i in xrange(embed.shape[0]):
for i in range(embed.shape[0]):
_em = embed[i]
_norm=np.linalg.norm(_em)
#print(_em.shape, _norm)

View File

@@ -180,12 +180,17 @@ def evaluate(embeddings, actual_issame, nrof_folds=10, pca = 0):
return tpr, fpr, accuracy, val, val_std, far
def load_bin(path, image_size):
bins, issame_list = pickle.load(open(path, 'rb'))
try:
with open(path, 'rb') as f:
bins, issame_list = pickle.load(f) #py2
except UnicodeDecodeError as e:
with open(path, 'rb') as f:
bins, issame_list = pickle.load(f, encoding='bytes') #py3
data_list = []
for flip in [0,1]:
data = nd.empty((len(issame_list)*2, 3, image_size[0], image_size[1]))
data_list.append(data)
for i in xrange(len(issame_list)*2):
for i in range(len(issame_list)*2):
_bin = bins[i]
img = mx.image.imdecode(_bin)
if img.shape[1]!=image_size[0]:
@@ -213,7 +218,7 @@ def test(data_set, mx_model, batch_size, nfolds=10, data_extra = None, label_sha
_label = nd.ones( (batch_size,) )
else:
_label = nd.ones( label_shape )
for i in xrange( len(data_list) ):
for i in range( len(data_list) ):
data = data_list[i]
embeddings = None
ba = 0
@@ -255,7 +260,7 @@ def test(data_set, mx_model, batch_size, nfolds=10, data_extra = None, label_sha
_xnorm = 0.0
_xnorm_cnt = 0
for embed in embeddings_list:
for i in xrange(embed.shape[0]):
for i in range(embed.shape[0]):
_em = embed[i]
_norm=np.linalg.norm(_em)
#print(_em.shape, _norm)
@@ -293,7 +298,7 @@ def test_badcase(data_set, mx_model, batch_size, name='', data_extra = None, lab
_label = nd.ones( (batch_size,) )
else:
_label = nd.ones( label_shape )
for i in xrange( len(data_list) ):
for i in range( len(data_list) ):
data = data_list[i]
embeddings = None
ba = 0
@@ -438,7 +443,7 @@ def test_badcase(data_set, mx_model, batch_size, name='', data_extra = None, lab
# imgb = cv2.transpose(imgb)
# imgb = cv2.flip(imgb, 0)
#else:
# for ii in xrange(2):
# for ii in range(2):
# imgb = cv2.transpose(imgb)
# imgb = cv2.flip(imgb, 1)
dist = out[2]
@@ -469,7 +474,7 @@ def dumpR(data_set, mx_model, batch_size, name='', data_extra = None, label_shap
_label = nd.ones( (batch_size,) )
else:
_label = nd.ones( label_shape )
for i in xrange( len(data_list) ):
for i in range( len(data_list) ):
data = data_list[i]
embeddings = None
ba = 0
@@ -571,7 +576,7 @@ if __name__ == '__main__':
ver_name_list.append(name)
if args.mode==0:
for i in xrange(len(ver_list)):
for i in range(len(ver_list)):
results = []
for model in nets:
acc1, std1, acc2, std2, xnorm, embeddings_list = test(ver_list[i], model, args.batch_size, args.nfolds)

View File

@@ -12,8 +12,6 @@ import sklearn
import datetime
import numpy as np
import cv2
from PIL import Image
from io import BytesIO
import mxnet as mx
from mxnet import ndarray as nd
@@ -166,11 +164,13 @@ class FaceImageIter(io.DataIter):
def mirror_aug(self, img):
_rd = random.randint(0,1)
if _rd==1:
for c in xrange(img.shape[2]):
for c in range(img.shape[2]):
img[:,:,c] = np.fliplr(img[:,:,c])
return img
def compress_aug(self, img):
from PIL import Image
from io import BytesIO
buf = BytesIO()
img = Image.fromarray(img.asnumpy(), 'RGB')
q = random.randint(2, 20)

View File

@@ -122,7 +122,8 @@ class ParallModule(BaseModule):
#ag = {}
#ax = {}
rk = []
for k,v in g.iteritems():
for k in g:
v = g[k]
if k.startswith('fc7'):
p1 = k.find('_')
p2 = k.rfind('_')
@@ -131,10 +132,6 @@ class ParallModule(BaseModule):
rk.append(k)
for k in rk:
del g[k]
#for k,v in g.iteritems():
# print('g', k, v.shape)
#for k,v in ag.iteritems():
# print('ag', k, v.shape)
self._curr_module.set_params(g, x)
#self._arcface_module.set_params(ag, ax)

View File

@@ -606,6 +606,12 @@ def get_symbol():
units = [3, 8, 35, 3]
elif num_layers == 100:
units = [3, 13, 30, 3]
elif num_layers == 134:
units = [3, 10, 50, 3]
elif num_layers == 136:
units = [3, 13, 48, 3]
elif num_layers == 140:
units = [3, 15, 48, 3]
elif num_layers == 124:
units = [3, 13, 40, 5]
elif num_layers == 160:

View File

@@ -149,7 +149,7 @@ def train_net(args):
ctx = []
cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
if len(cvd)>0:
for i in xrange(len(cvd.split(','))):
for i in range(len(cvd.split(','))):
ctx.append(mx.gpu(i))
if len(ctx)==0:
ctx = [mx.cpu()]
@@ -270,7 +270,7 @@ def train_net(args):
def ver_test(nbatch):
results = []
for i in xrange(len(ver_list)):
for i in range(len(ver_list)):
acc1, std1, acc2, std2, xnorm, embeddings_list = verification.test(ver_list[i], model, args.batch_size, 10, None, None)
print('[%s][%d]XNorm: %f' % (ver_name_list[i], nbatch, xnorm))
#print('[%s][%d]Accuracy: %1.5f+-%1.5f' % (ver_name_list[i], nbatch, acc1, std1))
@@ -281,7 +281,7 @@ def train_net(args):
highest_acc = [0.0, 0.0] #lfw and target
#for i in xrange(len(ver_list)):
#for i in range(len(ver_list)):
# highest_acc.append(0.0)
global_step = [0]
save_step = [0]

View File

@@ -62,6 +62,7 @@ def parse_args():
parser.add_argument('--per-batch-size', type=int, default=default.per_batch_size, help='batch size in each context')
parser.add_argument('--kvstore', type=str, default=default.kvstore, help='kvstore setting')
parser.add_argument('--worker-id', type=int, default=0, help='worker id for dist training, starts from 0')
parser.add_argument('--extra-model-name', type=str, default='', help='extra model name')
args = parser.parse_args()
return args
@@ -126,14 +127,17 @@ def train_net(args):
ctx = []
cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
if len(cvd)>0:
for i in xrange(len(cvd.split(','))):
for i in range(len(cvd.split(','))):
ctx.append(mx.gpu(i))
if len(ctx)==0:
ctx = [mx.cpu()]
print('use cpu')
else:
print('gpu num:', len(ctx))
prefix = os.path.join(args.models_root, '%s-%s-%s'%(args.network, args.loss, args.dataset), 'model')
if len(args.extra_model_name)==0:
prefix = os.path.join(args.models_root, '%s-%s-%s'%(args.network, args.loss, args.dataset), 'model')
else:
prefix = os.path.join(args.models_root, '%s-%s-%s-%s'%(args.network, args.loss, args.dataset, args.extra_model_name), 'model')
prefix_dir = os.path.dirname(prefix)
print('prefix', prefix)
if not os.path.exists(prefix_dir):
@@ -249,7 +253,7 @@ def train_net(args):
def ver_test(nbatch):
results = []
for i in xrange(len(ver_list)):
for i in range(len(ver_list)):
acc1, std1, acc2, std2, xnorm, embeddings_list = verification.test(ver_list[i], model, args.batch_size, 10, None, None)
print('[%s][%d]XNorm: %f' % (ver_name_list[i], nbatch, xnorm))
#print('[%s][%d]Accuracy: %1.5f+-%1.5f' % (ver_name_list[i], nbatch, acc1, std1))
@@ -259,7 +263,7 @@ def train_net(args):
highest_acc = [0.0, 0.0] #lfw and target
#for i in xrange(len(ver_list)):
#for i in range(len(ver_list)):
# highest_acc.append(0.0)
global_step = [0]
save_step = [0]

View File

@@ -111,12 +111,12 @@ class FaceImageIter(io.DataIter):
def pairwise_dists(self, embeddings):
nd_embedding_list = []
for i in xrange(self.ctx_num):
for i in range(self.ctx_num):
nd_embedding = mx.nd.array(embeddings, mx.gpu(i))
nd_embedding_list.append(nd_embedding)
nd_pdists = []
pdists = []
for idx in xrange(embeddings.shape[0]):
for idx in range(embeddings.shape[0]):
emb_idx = idx%self.ctx_num
nd_embedding = nd_embedding_list[emb_idx]
a_embedding = nd_embedding[idx]
@@ -138,16 +138,16 @@ class FaceImageIter(io.DataIter):
pdists = self.pairwise_dists(embeddings)
#self.times[3] += self.time_elapsed()
for i in xrange(people_per_batch):
for i in range(people_per_batch):
nrof_images = int(nrof_images_per_class[i])
for j in xrange(1,nrof_images):
for j in range(1,nrof_images):
#self.time_reset()
a_idx = emb_start_idx + j - 1
#neg_dists_sqr = np.sum(np.square(embeddings[a_idx] - embeddings), 1)
neg_dists_sqr = pdists[a_idx]
#self.times[3] += self.time_elapsed()
for pair in xrange(j, nrof_images): # For every possible positive pair.
for pair in range(j, nrof_images): # For every possible positive pair.
p_idx = emb_start_idx + pair
#self.time_reset()
pos_dist_sqr = np.sum(np.square(embeddings[a_idx]-embeddings[p_idx]))
@@ -234,7 +234,7 @@ class FaceImageIter(io.DataIter):
#_label = _batch.label[0].asnumpy()
#data[ba:bb,:,:,:] = _data
#label[ba:bb] = _label
for i in xrange(ba, bb):
for i in range(ba, bb):
#print(ba, bb, self.triplet_cur, i, len(self.triplet_seq))
_idx = self.triplet_seq[i+self.triplet_cur]
s = self.imgrec.read_idx(_idx)
@@ -269,7 +269,7 @@ class FaceImageIter(io.DataIter):
self.times[1] += self.time_elapsed()
self.time_reset()
nrof_images_per_class = [1]
for i in xrange(1, bag_size):
for i in range(1, bag_size):
if tag[i][0]==tag[i-1][0]:
nrof_images_per_class[-1]+=1
else:
@@ -283,7 +283,7 @@ class FaceImageIter(io.DataIter):
if bb>len(triplets):
break
_triplets = triplets[ba:bb]
for i in xrange(3):
for i in range(3):
for triplet in _triplets:
_pos = triplet[i]
_idx = tag[_pos][1]
@@ -306,7 +306,7 @@ class FaceImageIter(io.DataIter):
print('loading batch',batch_num, ba)
bb = min(ba+self.batch_size, len(self.oseq))
_count = bb-ba
for i in xrange(_count):
for i in range(_count):
idx = self.oseq[i+ba]
s = self.imgrec.read_idx(idx)
header, img = recordio.unpack(s)
@@ -323,7 +323,7 @@ class FaceImageIter(io.DataIter):
if X is None:
X = np.zeros( (len(self.id2range), nembedding.shape[1]), dtype=np.float32 )
nplabel = label.asnumpy()
for i in xrange(_count):
for i in range(_count):
ilabel = int(nplabel[i])
#print(ilabel, ilabel.__class__)
X[ilabel] += nembedding[i]
@@ -331,14 +331,14 @@ class FaceImageIter(io.DataIter):
X = sklearn.preprocessing.normalize(X)
d = X.shape[1]
t = AnnoyIndex(d, metric='euclidean')
for i in xrange(X.shape[0]):
for i in range(X.shape[0]):
t.add_item(i, X[i])
print('start to build index')
t.build(20)
print(X.shape)
k = self.per_identities
self.seq = []
for i in xrange(X.shape[0]):
for i in range(X.shape[0]):
nnlist = t.get_nns_by_item(i, k)
assert nnlist[0]==i
for _label in nnlist:
@@ -350,7 +350,7 @@ class FaceImageIter(io.DataIter):
random.shuffle(_list)
else:
_list = np.random.choice(_list, self.images_per_identity, replace=False)
for i in xrange(self.images_per_identity):
for i in range(self.images_per_identity):
_idx = _list[i%len(_list)]
self.seq.append(_idx)
#faiss_params = [20,5]
@@ -365,9 +365,9 @@ class FaceImageIter(io.DataIter):
#D, I = index.search(X, k) # actual search
#print(I.shape)
#self.seq = []
#for i in xrange(I.shape[0]):
#for i in range(I.shape[0]):
# #assert I[i][0]==i
# for j in xrange(k):
# for j in range(k):
# _label = I[i][j]
# assert _label<len(self.id2range)
# _id = self.header0[0]+_label
@@ -377,7 +377,7 @@ class FaceImageIter(io.DataIter):
# random.shuffle(_list)
# else:
# _list = np.random.choice(_list, self.images_per_identity, replace=False)
# for i in xrange(self.images_per_identity):
# for i in range(self.images_per_identity):
# _idx = _list[i%len(_list)]
# self.seq.append(_idx)
@@ -391,9 +391,10 @@ class FaceImageIter(io.DataIter):
elif not self.hard_mining:
self.seq = []
idlist = []
for _id,v in self.id2range.iteritems():
for _id in self.id2range:
v = self.id2range[_id]
idlist.append((_id,range(*v)))
for r in xrange(self.repeat):
for r in range(self.repeat):
if r%10==0:
print('repeat', r)
if self.shuffle:
@@ -406,7 +407,7 @@ class FaceImageIter(io.DataIter):
random.shuffle(_list)
else:
_list = np.random.choice(_list, self.images_per_identity, replace=False)
for i in xrange(self.images_per_identity):
for i in range(self.images_per_identity):
_idx = _list[i%len(_list)]
self.seq.append(_idx)
else:
@@ -470,7 +471,7 @@ class FaceImageIter(io.DataIter):
def mirror_aug(self, img):
_rd = random.randint(0,1)
if _rd==1:
for c in xrange(img.shape[2]):
for c in range(img.shape[2]):
img[:,:,c] = np.fliplr(img[:,:,c])
return img

BIN
resources/lfr19_wechat1.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 50 KiB