dataset merge and clean script

This commit is contained in:
Jia Guo
2018-01-17 16:09:21 +08:00
parent 538363af17
commit dfc5df190e
2 changed files with 430 additions and 0 deletions

187
src/common/dataset_clean.py Normal file
View File

@@ -0,0 +1,187 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
import mxnet as mx
from mxnet import ndarray as nd
import random
import argparse
import cv2
import time
import sklearn
from sklearn.decomposition import PCA
from easydict import EasyDict as edict
import face_image
from sklearn.cluster import DBSCAN
import numpy as np
def do_clean(args):
    """Filter outlier images out of an indexed RecordIO face dataset.

    Reads ``train.rec``/``train.idx`` from ``args.input``. Record 0's header
    label holds the [start, end) range of identity records, and each identity
    record's header label holds the [start, end) range of its image records.
    For every identity the images are embedded with the model named by
    ``args.model`` (``prefix,epoch``) and filtered:

    * ``args.mode == 1``: keep images whose similarity to the normalized mean
      embedding of the identity is not below ``args.threshold``.
    * any other mode: cluster with DBSCAN (``eps=args.threshold``,
      ``min_samples=2``) and keep only the largest cluster.

    Identities with more than ``_MAX_IMAGES_PER_ID`` images are dropped
    entirely and counted as removed. When ``args.test == 0`` the surviving
    records are written to ``args.output`` in the same layout; otherwise only
    statistics are printed.
    """
    ctx = _select_contexts()
    path_imgrec = os.path.join(args.input, 'train.rec')
    path_imgidx = os.path.join(args.input, 'train.idx')
    imgrec = mx.recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, 'r')  # pylint: disable=redefined-variable-type
    writer = None
    try:
        header, _ = mx.recordio.unpack(imgrec.read_idx(0))
        assert header.flag > 0, 'record 0 must carry the identity range in its label'
        print('header0 label', header.label)
        id2range = {}
        for identity in range(int(header.label[0]), int(header.label[1])):
            id_header, _ = mx.recordio.unpack(imgrec.read_idx(identity))
            id2range[identity] = (int(id_header.label[0]), int(id_header.label[1]))
        print('id2range', len(id2range))
        prop = face_image.load_property(args.input)
        image_size = prop.image_size
        print('image_size', image_size)
        vec = args.model.split(',')
        prefix = vec[0]
        epoch = int(vec[1])
        print('loading', prefix, epoch)
        model = mx.mod.Module.load(prefix, epoch, context=ctx)
        model.bind(data_shapes=[('data', (args.batch_size, 3, image_size[0], image_size[1]))],
                   label_shapes=[('softmax_label', (args.batch_size,))])
        if args.test == 0:
            if not os.path.exists(args.output):
                os.makedirs(args.output)
            writer = mx.recordio.MXIndexedRecordIO(os.path.join(args.output, 'train.idx'),
                                                   os.path.join(args.output, 'train.rec'), 'w')
        nrof_images = 0
        nrof_removed = 0
        idx = 1  # next output record index; index 0 is reserved for the dataset header
        id2label = {}
        for pp, (_id, v) in enumerate(id2range.items(), 1):
            if pp % 100 == 0:
                print('stat', nrof_images, nrof_removed)
            record_ids = range(*v)
            # Check the size cap before touching the disk: oversized identities are dropped whole.
            if len(record_ids) > _MAX_IMAGES_PER_ID:
                nrof_removed += len(record_ids)
                continue
            ocontents = [imgrec.read_idx(_idx) for _idx in record_ids]
            if not ocontents:
                # Nothing to embed; without this guard normalize() would be handed None.
                continue
            embeddings = _embed_records(model, ocontents, args.batch_size, len(ctx), image_size)
            if args.mode == 1:
                keep = _indices_near_center(embeddings, args.threshold)
            else:
                keep = _indices_of_largest_cluster(embeddings, args.threshold)
            contents = [ocontents[_idx] for _idx in keep]
            nrof_removed += len(ocontents) - len(contents)
            if not contents:
                continue
            id2label[_id] = (idx, idx + len(contents))
            nrof_images += len(contents)
            for content in contents:
                if writer is not None:
                    writer.write_idx(idx, content)
                idx += 1
        id_idx = idx  # first identity record follows the last image record
        if writer is not None:
            for _label in id2label.values():
                _header = mx.recordio.IRHeader(1, _label, idx, 0)
                # b'' equals '' on Python 2 and is the bytes payload pack() needs on Python 3.
                writer.write_idx(idx, mx.recordio.pack(_header, b''))
                idx += 1
            _header = mx.recordio.IRHeader(1, (id_idx, idx), 0, 0)
            writer.write_idx(0, mx.recordio.pack(_header, b''))
        print(nrof_images, nrof_removed)
    finally:
        if writer is not None:
            writer.close()
        imgrec.close()


# Identities with more images than this are removed entirely by do_clean.
_MAX_IMAGES_PER_ID = 15


def _select_contexts():
    """Return one GPU context per CUDA_VISIBLE_DEVICES entry, or a single CPU context."""
    # .get() so that an unset variable falls back to CPU instead of raising KeyError.
    cvd = os.environ.get('CUDA_VISIBLE_DEVICES', '').strip()
    ctx = []
    if cvd:
        ctx = [mx.gpu(i) for i in range(len(cvd.split(',')))]
    if not ctx:
        print('use cpu')
        return [mx.cpu()]
    print('gpu num:', len(ctx))
    return ctx


def _embed_records(model, records, batch_size, ctx_num, image_size):
    """Run packed image records through the model and return L2-normalized embeddings, one row per record."""
    import sklearn.preprocessing  # explicit: a bare `import sklearn` does not load the submodule
    embeddings = None
    for ba in range(0, len(records), batch_size):
        bb = min(ba + batch_size, len(records))
        n_real = bb - ba
        # Pad short batches up to ctx_num rows (one row per context at minimum).
        n_padded = max(n_real, ctx_num)
        data = nd.zeros((n_padded, 3, image_size[0], image_size[1]))
        label = nd.zeros((n_padded,))
        for ii, record in enumerate(records[ba:bb]):
            header, img = mx.recordio.unpack(record)
            img = mx.image.imdecode(img)
            data[ii][:] = nd.transpose(img, axes=(2, 0, 1))
            label[ii][:] = header.label
        # Padding rows repeat the first sample; their outputs are discarded below.
        for ii in range(n_real, n_padded):
            data[ii][:] = data[0][:]
            label[ii][:] = label[0][:]
        db = mx.io.DataBatch(data=(data,), label=(label,))
        model.forward(db, is_train=False)
        net_out = model.get_outputs()[0].asnumpy()
        if embeddings is None:
            embeddings = np.zeros((len(records), net_out.shape[1]))
        embeddings[ba:bb, :] = net_out[0:n_real, :]
    return sklearn.preprocessing.normalize(embeddings)


def _indices_near_center(embeddings, threshold):
    """Return row indices, in ascending similarity order, whose similarity to the normalized mean is not below threshold."""
    import sklearn.preprocessing
    center = sklearn.preprocessing.normalize(np.mean(embeddings, axis=0, keepdims=True))
    sim = np.dot(embeddings, center.T).flatten()
    return [int(_idx) for _idx in np.argsort(sim) if not sim[_idx] < threshold]


def _indices_of_largest_cluster(embeddings, threshold):
    """Return row indices of the largest DBSCAN cluster (lowest label wins ties); empty if every row is noise."""
    y_pred = DBSCAN(eps=threshold, min_samples=2).fit_predict(embeddings)
    clustered = y_pred[y_pred >= 0]  # label -1 marks noise and is never kept
    if clustered.size == 0:
        return []
    # argmax returns the first maximum, i.e. the lowest-numbered cluster among equals.
    best = int(np.argmax(np.bincount(clustered)))
    return [int(_idx) for _idx in np.where(y_pred == best)[0]]
if __name__ == '__main__':
    # Command-line entry point: parse the options and run the cleaning pass.
    parser = argparse.ArgumentParser(description='do data clean')
    # general
    parser.add_argument('--input', default='', type=str,
                        help='input dataset directory containing train.rec and train.idx')
    parser.add_argument('--output', default='', type=str,
                        help='directory the cleaned train.rec/train.idx are written to (created if missing)')
    parser.add_argument('--model', default='../model/softmax,50',
                        help='path to load model, given as "prefix,epoch".')
    parser.add_argument('--batch-size', default=32, type=int,
                        help='maximum number of images per forward pass')
    parser.add_argument('--threshold', default=0.6, type=float,
                        help='mode 1: minimum similarity to the identity mean embedding; '
                             'other modes: DBSCAN eps')
    parser.add_argument('--mode', default=1, type=int,
                        help='1: filter by similarity to the identity mean; '
                             'any other value: keep the largest DBSCAN cluster')
    parser.add_argument('--test', default=0, type=int,
                        help='0: write the cleaned dataset; any other value: only print statistics')
    args = parser.parse_args()
    do_clean(args)

243
src/common/dataset_merge.py Normal file
View File

@@ -0,0 +1,243 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
import mxnet as mx
from mxnet import ndarray as nd
import random
import argparse
import cv2
import time
import sklearn
from sklearn.decomposition import PCA
from easydict import EasyDict as edict
import face_image
from sklearn.cluster import DBSCAN
import numpy as np
def get_embedding(args, imgrec, id, image_size, model):
    """Return one L2-normalized embedding vector representing an identity.

    ``id`` is the index of an identity record in ``imgrec``; that record's
    header label holds the [start, end) range of the identity's image records.
    Every image is run through ``model`` in batches of at most
    ``args.batch_size`` (padded up to ``args.ctx_num`` rows), each output row
    is L2-normalized, and the normalized mean of those rows is returned as a
    flat array.

    Raises:
        ValueError: if the identity record references no images.
    """
    import sklearn.preprocessing  # explicit: a bare `import sklearn` does not load the submodule
    header, _ = mx.recordio.unpack(imgrec.read_idx(id))
    ocontents = [imgrec.read_idx(idx)
                 for idx in range(int(header.label[0]), int(header.label[1]))]
    if not ocontents:
        # Previously this surfaced as an obscure failure inside normalize(None).
        raise ValueError('identity record %s references no images' % (id,))
    embeddings = None
    for ba in range(0, len(ocontents), args.batch_size):
        bb = min(ba + args.batch_size, len(ocontents))
        _batch_size = bb - ba
        # Pad short batches up to ctx_num rows (one row per context at minimum).
        _batch_size2 = max(_batch_size, args.ctx_num)
        data = nd.zeros((_batch_size2, 3, image_size[0], image_size[1]))
        label = nd.zeros((_batch_size2,))
        for ii, content in enumerate(ocontents[ba:bb]):
            header, img = mx.recordio.unpack(content)
            img = mx.image.imdecode(img)
            data[ii][:] = nd.transpose(img, axes=(2, 0, 1))
            label[ii][:] = header.label
        # Padding rows repeat the first sample; their outputs are discarded below.
        for ii in range(_batch_size, _batch_size2):
            data[ii][:] = data[0][:]
            label[ii][:] = label[0][:]
        db = mx.io.DataBatch(data=(data,), label=(label,))
        model.forward(db, is_train=False)
        net_out = model.get_outputs()[0].asnumpy()
        if embeddings is None:
            embeddings = np.zeros((len(ocontents), net_out.shape[1]))
        embeddings[ba:bb, :] = net_out[0:_batch_size, :]
    embeddings = sklearn.preprocessing.normalize(embeddings)
    embedding = np.mean(embeddings, axis=0, keepdims=True)
    return sklearn.preprocessing.normalize(embedding).flatten()
def main(args):
    """Merge several indexed RecordIO face datasets, dropping duplicate identities.

    ``args.include`` is a comma-separated list of dataset directories, each
    holding ``train.rec``/``train.idx``. Every identity is reduced to one
    embedding via ``get_embedding``. All identities of the first dataset are
    kept; an identity from a later dataset is kept only if its similarity to
    every identity merged from earlier datasets is below ``args.param1``.
    If ``args.exclude`` is set, merged identities whose similarity to any
    identity of that dataset is at least ``args.param2`` are dropped.

    Unless ``args.test > 0``, the surviving identities are written to
    ``args.output`` together with a ``property`` file containing
    ``<num identities>,<height>,<width>``.
    """
    ctx = _merge_contexts()
    args.ctx_num = len(ctx)
    include_datasets = args.include.split(',')
    prop = face_image.load_property(include_datasets[0])
    image_size = prop.image_size
    print('image_size', image_size)
    vec = args.model.split(',')
    prefix = vec[0]
    epoch = int(vec[1])
    print('loading', prefix, epoch)
    model = mx.mod.Module.load(prefix, epoch, context=ctx)
    model.bind(data_shapes=[('data', (args.batch_size, 3, image_size[0], image_size[1]))],
               label_shapes=[('softmax_label', (args.batch_size,))])
    test_limit = 0  # >0 caps the identities scanned per dataset (debugging aid)
    opened = []  # every reader/writer opened here, closed in the finally below
    try:
        rec_list = []
        for ds in include_datasets:
            imgrec = _open_reader(ds)
            opened.append(imgrec)
            rec_list.append(imgrec)
        all_id_list = []
        for ds_id, imgrec in enumerate(rec_list):
            id_list = _embed_identities(args, imgrec, ds_id, image_size, model,
                                        test_limit, 'processing id')
            if all_id_list:
                # X is fixed before the loop: identities are compared only against
                # earlier datasets, never against others from the same dataset.
                X = np.array([id_item[2] for id_item in all_id_list])
                for id_item in id_list:
                    if np.any(np.dot(X, id_item[2].T) >= args.param1):
                        continue
                    all_id_list.append(id_item)
            else:
                # Nothing merged yet (always true for the first dataset): keep everything.
                all_id_list += id_list
            if ds_id == 0:
                print(ds_id, len(id_list))
            else:
                print(ds_id, len(id_list), len(all_id_list))
        if len(args.exclude) > 0:
            _imgrec = _open_reader(args.exclude)
            opened.append(_imgrec)
            _id_list = _embed_identities(args, _imgrec, len(rec_list), image_size, model,
                                         test_limit, 'processing ex id')
            excluded = set()
            if all_id_list:
                X = np.array([id_item[2] for id_item in all_id_list])
                for id_item in _id_list:
                    sim = np.dot(X, id_item[2].T)
                    for j in np.where(sim >= args.param2)[0]:
                        excluded.add(int(j))
                        # A negative identity index marks the entry as dropped.
                        all_id_list[int(j)][1] = -1
            print('exclude', len(excluded))
        if args.test > 0:
            return
        if not os.path.exists(args.output):
            os.makedirs(args.output)
        writer = mx.recordio.MXIndexedRecordIO(os.path.join(args.output, 'train.idx'),
                                               os.path.join(args.output, 'train.rec'), 'w')
        opened.append(writer)
        idx = 1  # next output record index; index 0 is reserved for the dataset header
        identities = []
        for ds_id, identity, _ in all_id_list:
            if identity < 0:
                continue
            imgrec = rec_list[ds_id]
            header, _ = mx.recordio.unpack(imgrec.read_idx(identity))
            a, b = int(header.label[0]), int(header.label[1])
            identities.append((idx, idx + b - a))
            # NOTE(review): image records are copied verbatim, so each keeps the label
            # stored in its source dataset; if those are per-dataset class indices they
            # may collide after merging -- confirm with the consumer of the output.
            for _idx in range(a, b):
                writer.write_idx(idx, imgrec.read_idx(_idx))
                idx += 1
        id_idx = idx  # first identity record follows the last image record
        for id_label in identities:
            _header = mx.recordio.IRHeader(1, id_label, idx, 0)
            # b'' equals '' on Python 2 and is the bytes payload pack() needs on Python 3.
            writer.write_idx(idx, mx.recordio.pack(_header, b''))
            idx += 1
        _header = mx.recordio.IRHeader(1, (id_idx, idx), 0, 0)
        writer.write_idx(0, mx.recordio.pack(_header, b''))
        with open(os.path.join(args.output, 'property'), 'w') as f:
            f.write("%d,%d,%d" % (len(identities), image_size[0], image_size[1]))
    finally:
        for rec in opened:
            rec.close()


def _merge_contexts():
    """Return one GPU context per CUDA_VISIBLE_DEVICES entry, or a single CPU context."""
    # .get() so that an unset variable falls back to CPU instead of raising KeyError.
    cvd = os.environ.get('CUDA_VISIBLE_DEVICES', '').strip()
    ctx = []
    if cvd:
        ctx = [mx.gpu(i) for i in range(len(cvd.split(',')))]
    if not ctx:
        print('use cpu')
        return [mx.cpu()]
    print('gpu num:', len(ctx))
    return ctx


def _open_reader(dataset_dir):
    """Open the train.rec/train.idx pair of a dataset directory for reading."""
    path_imgrec = os.path.join(dataset_dir, 'train.rec')
    path_imgidx = os.path.join(dataset_dir, 'train.idx')
    return mx.recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, 'r')  # pylint: disable=redefined-variable-type


def _embed_identities(args, imgrec, ds_id, image_size, model, test_limit, progress_tag):
    """Return a [ds_id, identity_index, embedding] list for each identity record in imgrec."""
    header, _ = mx.recordio.unpack(imgrec.read_idx(0))
    assert header.flag > 0, 'record 0 must carry the identity range in its label'
    print('header0 label', header.label)
    items = []
    seq_identity = range(int(header.label[0]), int(header.label[1]))
    for pp, identity in enumerate(seq_identity, 1):
        if pp % 10 == 0:
            print(progress_tag, pp)
        embedding = get_embedding(args, imgrec, identity, image_size, model)
        # A list, not a tuple: main() overwrites the identity index to mark exclusions.
        items.append([ds_id, identity, embedding])
        if test_limit > 0 and pp >= test_limit:
            break
    return items
if __name__ == '__main__':
    # Command-line entry point: parse the options and run the merge.
    parser = argparse.ArgumentParser(description='do dataset merge')
    # general
    parser.add_argument('--include', default='', type=str,
                        help='comma-separated dataset directories to merge, each containing '
                             'train.rec and train.idx; the first one supplies the image size')
    parser.add_argument('--exclude', default='', type=str,
                        help='optional dataset directory; merged identities similar to any '
                             'identity in it (see --param2) are dropped')
    parser.add_argument('--output', default='', type=str,
                        help='directory the merged train.rec/train.idx/property are written to '
                             '(created if missing)')
    parser.add_argument('--model', default='../model/softmax,50',
                        help='path to load model, given as "prefix,epoch".')
    parser.add_argument('--batch-size', default=32, type=int,
                        help='maximum number of images per forward pass')
    parser.add_argument('--param1', default=0.3, type=float,
                        help='an identity from a later dataset is skipped when its similarity to '
                             'an already merged identity is at least this value')
    parser.add_argument('--param2', default=0.45, type=float,
                        help='a merged identity is dropped when its similarity to an --exclude '
                             'identity is at least this value')
    parser.add_argument('--mode', default=1, type=int,
                        help='currently not read by main()')
    parser.add_argument('--test', default=0, type=int,
                        help='values > 0 stop after the duplicate analysis without writing output')
    args = parser.parse_args()
    main(args)