Support pre-shuffled .rec training data and FP16 (mixed-precision) training

This commit is contained in:
nttstar
2021-03-09 01:02:25 +08:00
parent 7f9047a94e
commit d482f3d095
7 changed files with 177 additions and 35 deletions

View File

@@ -318,3 +318,50 @@ class FaceImageIterList(io.DataIter):
self.cur_iter.reset()
continue
return ret
def get_face_image_iter(cfg, data_shape, path_imgrec):
    """Build the training data iterator for ``path_imgrec``.

    When ``cfg.is_shuffled_rec`` is False the record file is assumed to need
    online shuffling/augmentation, so the project's ``FaceImageIter`` is used
    and wrapped in a prefetcher.  When True the record file was pre-shuffled
    offline and the built-in ``mx.io.ImageRecordIter`` streams it directly
    with chunk-level shuffling.

    Parameters
    ----------
    cfg : edict
        Training configuration (``batch_size``, ``is_shuffled_rec``,
        augmentation flags, ...).
    data_shape : tuple
        Input shape as ``(channels, height, width)``.
    path_imgrec : str
        Path to the ``.rec`` file.

    Returns
    -------
    mx.io.DataIter
        The ready-to-use training iterator.
    """
    print('loading:', path_imgrec, cfg.is_shuffled_rec)
    if not cfg.is_shuffled_rec:
        train_dataiter = FaceImageIter(
            batch_size=cfg.batch_size,
            data_shape=data_shape,
            path_imgrec=path_imgrec,
            shuffle=True,
            # NOTE(review): originally read config.data_rand_mirror etc. from
            # the module-level global; use the cfg argument consistently
            # (callers pass config as cfg, so behavior is unchanged).
            rand_mirror=cfg.data_rand_mirror,
            mean=None,
            cutoff=cfg.data_cutoff,
            color_jittering=cfg.data_color,
            images_filter=cfg.data_images_filter,
        )
        train_dataiter = mx.io.PrefetchingIter(train_dataiter)
    else:
        train_dataiter = mx.io.ImageRecordIter(
            path_imgrec=path_imgrec,
            data_shape=data_shape,
            batch_size=cfg.batch_size,
            rand_mirror=cfg.data_rand_mirror,
            preprocess_threads=2,
            shuffle=True,
            shuffle_chunk_size=1024,
        )
    return train_dataiter
def test_face_image_iter(path_imgrec):
    """Smoke-test streaming a pre-shuffled record file.

    Iterates ``path_imgrec`` with ``mx.io.ImageRecordIter`` and prints each
    batch's shape plus a 5x5 corner of the first image, so decoding and
    augmentation can be eyeballed manually.
    """
    rec_iter = mx.io.ImageRecordIter(
        path_imgrec=path_imgrec,
        data_shape=(3, 112, 112),
        batch_size=512,
        rand_mirror=True,
        preprocess_threads=2,
        shuffle=True,
        shuffle_chunk_size=1024,
    )
    for batch in rec_iter:
        batch_array = batch.data[0].asnumpy()
        print(batch_array.shape)
        first_image = batch_array[0]
        print(first_image[0, :5, :5])
# Manual smoke test: dump sample batches from a known shuffled record file.
if __name__ == '__main__':
    test_face_image_iter('/train_tmp/ms1mv3shuf/train.rec')

View File

@@ -234,12 +234,12 @@ class ParallModule(BaseModule):
return
self._curr_module.init_optimizer(kvstore,
optimizer,
optimizer[0],
optimizer_params,
force_init=force_init)
for _module in self._arcface_modules:
_module.init_optimizer(kvstore,
optimizer,
optimizer[1],
optimizer_params,
force_init=force_init)
self.optimizer_initialized = True

View File

@@ -27,6 +27,7 @@ config.data_color = 0
config.data_images_filter = 0
config.count_flops = True
config.memonger = False #not work now
config.is_shuffled_rec = False
# network settings
network = edict()

View File

@@ -325,18 +325,20 @@ def train_net(args):
_metric = LossValueMetric()
eval_metrics = [mx.metric.create(_metric)]
else:
from image_iter import FaceImageIter
train_dataiter = FaceImageIter(
batch_size=args.batch_size,
data_shape=data_shape,
path_imgrec=path_imgrec,
shuffle=True,
rand_mirror=config.data_rand_mirror,
mean=mean,
cutoff=config.data_cutoff,
color_jittering=config.data_color,
images_filter=config.data_images_filter,
)
#from image_iter import FaceImageIter
#train_dataiter = FaceImageIter(
# batch_size=args.batch_size,
# data_shape=data_shape,
# path_imgrec=path_imgrec,
# shuffle=True,
# rand_mirror=config.data_rand_mirror,
# mean=mean,
# cutoff=config.data_cutoff,
# color_jittering=config.data_color,
# images_filter=config.data_images_filter,
#)
from image_iter import get_face_image_iter
train_dataiter = get_face_image_iter(config, data_shape, path_imgrec)
metric1 = AccMetric()
eval_metrics = [mx.metric.create(metric1)]
if config.ce_loss:
@@ -454,7 +456,6 @@ def train_net(args):
sys.exit(0)
epoch_cb = None
train_dataiter = mx.io.PrefetchingIter(train_dataiter)
model.fit(
train_dataiter,

View File

@@ -14,7 +14,8 @@ import logging
import pickle
import sklearn
import numpy as np
from image_iter import FaceImageIter
#from image_iter import FaceImageIter
from image_iter import get_face_image_iter
import mxnet as mx
from mxnet import ndarray as nd
import argparse
@@ -107,6 +108,7 @@ def parse_args():
type=str,
default='',
help='extra model name')
parser.add_argument('--fp16-scale', type=float, default=0.0, help='')
args = parser.parse_args()
return args
@@ -186,6 +188,12 @@ def train_net(args):
#random.seed(_seed)
#np.random.seed(_seed)
#mx.random.seed(_seed)
config.fp16 = False
config.fp16_scale = 0.0
if args.fp16_scale>0.0:
config.fp16 = True
config.fp16_scale = args.fp16_scale
print('use fp16, scale=', config.fp16_scale)
ctx = []
cvd = os.environ['CUDA_VISIBLE_DEVICES'].strip()
if len(cvd) > 0:
@@ -290,17 +298,6 @@ def train_net(args):
args=args,
)
val_dataiter = None
train_dataiter = FaceImageIter(
batch_size=args.batch_size,
data_shape=data_shape,
path_imgrec=path_imgrec,
shuffle=True,
rand_mirror=config.data_rand_mirror,
mean=mean,
cutoff=config.data_cutoff,
color_jittering=config.data_color,
images_filter=config.data_images_filter,
)
if config.net_name == 'fresnet' or config.net_name == 'fmobilefacenet':
initializer = mx.init.Xavier(rnd_type='gaussian',
@@ -312,10 +309,11 @@ def train_net(args):
magnitude=2)
_rescale = 1.0 / args.batch_size
opt = optimizer.SGD(learning_rate=base_lr,
momentum=base_mom,
wd=base_wd,
rescale_grad=_rescale)
if config.fp16:
opt = optimizer.SGD(learning_rate=base_lr, momentum=base_mom, wd=base_wd, rescale_grad=_rescale, multi_precision=True)
else:
opt = optimizer.SGD(learning_rate=base_lr, momentum=base_mom, wd=base_wd, rescale_grad=_rescale, multi_precision=False)
opt_fc7 = optimizer.SGD(learning_rate=base_lr, momentum=base_mom, wd=base_wd, rescale_grad=_rescale, multi_precision=False)
_cb = mx.callback.Speedometer(args.batch_size, args.frequent)
ver_list = []
@@ -355,12 +353,13 @@ def train_net(args):
for step in lr_steps:
if mbatch == step:
opt.lr *= 0.1
print('lr change to', opt.lr)
opt_fc7.lr *= 0.1
print('lr change to', opt.lr, opt_fc7.lr)
break
_cb(param)
if mbatch % 1000 == 0:
print('lr-batch-epoch:', opt.lr, param.nbatch, param.epoch)
print('lr-batch-epoch:', opt.lr, opt_fc7.lr, param.nbatch, param.epoch)
if mbatch >= 0 and mbatch % args.verbose == 0:
acc_list = ver_test(mbatch)
@@ -402,10 +401,28 @@ def train_net(args):
mx.model.save_checkpoint(prefix, msave, _sym, arg, aux)
print('[%d]Accuracy-Highest: %1.5f' % (mbatch, highest_acc[-1]))
if config.max_steps > 0 and mbatch > config.max_steps:
msave = 0
config.fp16 = False
print('saving last', msave)
arg, aux = model.get_export_params()
_sym = eval(config.net_name).get_symbol()
mx.model.save_checkpoint(prefix, msave, _sym, arg, aux)
sys.exit(0)
epoch_cb = None
train_dataiter = mx.io.PrefetchingIter(train_dataiter)
train_dataiter = get_face_image_iter(config, data_shape, path_imgrec)
#train_dataiter = FaceImageIter(
# batch_size=args.batch_size,
# data_shape=data_shape,
# path_imgrec=path_imgrec,
# shuffle=True,
# rand_mirror=config.data_rand_mirror,
# mean=mean,
# cutoff=config.data_cutoff,
# color_jittering=config.data_color,
# images_filter=config.data_images_filter,
#)
#train_dataiter = mx.io.PrefetchingIter(train_dataiter)
model.fit(
train_dataiter,
@@ -414,7 +431,7 @@ def train_net(args):
eval_data=val_dataiter,
#eval_metric = eval_metrics,
kvstore=args.kvstore,
optimizer=opt,
optimizer=[opt, opt_fc7],
#optimizer_params = optimizer_params,
initializer=initializer,
arg_params=arg_params,

View File

@@ -0,0 +1,72 @@
import os
import os.path as osp
import sys
import datetime
import glob
import shutil
import numbers
import mxnet as mx
from mxnet import ndarray as nd
from mxnet import io
from mxnet import recordio
import random
import argparse
import cv2
import time
import numpy as np
def main(args):
    """Rewrite an identity-grouped ``train.rec`` as a globally shuffled one.

    Reads every sample from ``<input>/train.rec`` (via ``train.idx``),
    shuffles the sample order, and writes a new sequential ``train.rec`` in
    ``<output>`` whose headers carry only the scalar class label.  Also emits
    a ``property`` file describing the new set and, with ``--copy-vers``,
    copies the verification ``*.bin`` files across.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide ``input``, ``output`` and ``copy_vers``.
    """
    ds = args.input
    path_imgrec = osp.join(ds, 'train.rec')
    path_imgidx = osp.join(ds, 'train.idx')
    imgrec = mx.recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, 'r')  # pylint: disable=redefined-variable-type
    if not osp.exists(args.output):
        os.makedirs(args.output)
    writer = mx.recordio.MXRecordIO(osp.join(args.output, 'train.rec'), 'w')
    # Record 0 may be a meta header: flag>0 means its label encodes the
    # valid index range, so real samples are 1..label[0]-1.
    s = imgrec.read_idx(0)
    header, _ = recordio.unpack(s)
    if header.flag > 0:
        print('header0 label', header.label)
        imgidx = list(range(1, int(header.label[0])))
    else:
        imgidx = list(imgrec.keys)
    random.shuffle(imgidx)
    label_stat = None  # running [min_label, max_label]
    print('total images:', len(imgidx))
    for i, idx in enumerate(imgidx):
        if i % 10000 == 0:
            print('processing', i, idx)
        s = imgrec.read_idx(idx)
        # Use recordio.unpack consistently (originally mixed with
        # mx.recordio.unpack, which is the same function).
        header, img = recordio.unpack(s)
        label = header.label
        if not isinstance(label, numbers.Number):
            # label may be an array; the class id is its first entry
            label = label[0]
        if label_stat is None:
            label_stat = [label, label]
        else:
            label_stat[0] = min(label, label_stat[0])
            label_stat[1] = max(label, label_stat[1])
        wheader = mx.recordio.IRHeader(0, label, i, 0)
        ws = mx.recordio.pack(wheader, img)
        writer.write(ws)
    print('label_stat:', label_stat)
    writer.close()
    if args.copy_vers:
        for binfile in glob.glob(osp.join(args.input, '*.bin')):
            # osp.basename instead of split('/') for portability
            target_file = osp.join(args.output, osp.basename(binfile))
            shutil.copyfile(binfile, target_file)
    with open(osp.join(args.output, 'property'), 'w') as f:
        # num_classes assumes labels are contiguous 0..max — TODO confirm
        f.write("%d,112,112\n" % (int(label_stat[1]) + 1))
        f.write("%d\n" % len(imgidx))
        f.write("shuffled\n")
        f.write("%s\n" % (datetime.datetime.now()))
if __name__ == '__main__':
    # Command-line entry point: shuffle an input record set into output.
    arg_parser = argparse.ArgumentParser(description='convert rec to shuffled rec')
    # general
    arg_parser.add_argument('--input', default='', type=str, help='')
    arg_parser.add_argument('--output', default='', type=str, help='')
    arg_parser.add_argument('--copy-vers', action='store_true', help='copy verification bins')
    main(arg_parser.parse_args())

View File

@@ -1012,6 +1012,8 @@ def resnet(units, num_stages, filter_list, num_classes, bottle_neck):
num_unit = len(units)
assert (num_unit == num_stages)
data = mx.sym.Variable(name='data')
if config.fp16:
data = mx.sym.Cast(data=data, dtype=np.float16)
if version_input == 0:
#data = mx.sym.BatchNorm(data=data, fix_gamma=True, eps=2e-5, momentum=bn_mom, name='bn_data')
data = mx.sym.identity(data=data, name='id')
@@ -1079,6 +1081,8 @@ def resnet(units, num_stages, filter_list, num_classes, bottle_neck):
#else:
# body = residual_unit(body, filter_list[i+1], (2, 2), False,
# name='stage%d_unit%d' % (i + 1, 1), bottle_neck=bottle_neck, **kwargs)
if i==num_stages-1 and config.fp16:
body = mx.sym.Cast(data=body, dtype=np.float32)
body = residual_unit(body,
filter_list[i + 1], (2, 2),
False,