mirror of
https://github.com/deepinsight/insightface.git
synced 2026-05-18 22:57:49 +00:00
dataset merge and clean script
This commit is contained in:
187
src/common/dataset_clean.py
Normal file
187
src/common/dataset_clean.py
Normal file
@@ -0,0 +1,187 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
import os
|
||||
import sys
|
||||
import mxnet as mx
|
||||
from mxnet import ndarray as nd
|
||||
import random
|
||||
import argparse
|
||||
import cv2
|
||||
import time
|
||||
import sklearn
|
||||
from sklearn.decomposition import PCA
|
||||
from easydict import EasyDict as edict
|
||||
import face_image
|
||||
from sklearn.cluster import DBSCAN
|
||||
import numpy as np
|
||||
|
||||
|
||||
|
||||
def _clean_embed_records(model, records, batch_size, ctx_num, image_size):
    """Run ``model`` over packed image records and return the raw outputs.

    Args:
        model: bound ``mx.mod.Module`` used for the forward pass.
        records: non-empty list of packed recordio strings (one per image).
        batch_size: maximum number of images per forward pass.
        ctx_num: number of devices the model is bound to.
        image_size: ``(height, width)`` pair used to allocate the batch.

    Returns:
        A ``(len(records), dim)`` numpy array holding the first model output
        for every record, in the same order as ``records``.
    """
    embeddings = None
    for start in range(0, len(records), batch_size):
        stop = min(start + batch_size, len(records))
        valid = stop - start
        # A batch is never smaller than the device count; the extra rows are
        # copies of row 0 and their outputs are discarded below.
        # (presumably so that every device receives a sample -- confirm)
        padded = max(valid, ctx_num)
        data = nd.zeros((padded, 3, image_size[0], image_size[1]))
        label = nd.zeros((padded,))
        for row, record in enumerate(records[start:stop]):
            header, img = mx.recordio.unpack(record)
            img = mx.image.imdecode(img)
            # decoded image is HWC; the network input is CHW
            data[row][:] = nd.transpose(img, axes=(2, 0, 1))
            label[row][:] = header.label
        for row in range(valid, padded):
            data[row][:] = data[0][:]
            label[row][:] = label[0][:]
        batch = mx.io.DataBatch(data=(data,), label=(label,))
        model.forward(batch, is_train=False)
        net_out = model.get_outputs()[0].asnumpy()
        if embeddings is None:
            embeddings = np.zeros((len(records), net_out.shape[1]))
        embeddings[start:stop, :] = net_out[0:valid, :]
    return embeddings


def _clean_keep_near_mean(embeddings, threshold):
    """Return row indices whose similarity to the mean embedding passes.

    ``embeddings`` must already be row-normalised. Rows are returned in
    ascending order of similarity; rows with similarity below ``threshold``
    are dropped.
    """
    emb_mean = np.mean(embeddings, axis=0, keepdims=True)
    emb_mean = sklearn.preprocessing.normalize(emb_mean)
    sim = np.dot(embeddings, emb_mean.T).flatten()
    return [row for row in np.argsort(sim) if not sim[row] < threshold]


def _clean_keep_largest_cluster(embeddings, eps):
    """Return the row indices of the largest DBSCAN cluster.

    Rows labelled as noise (negative label) never win. On a size tie the
    cluster with the lowest label is kept. Returns an empty list when DBSCAN
    finds no cluster at all.
    """
    y_pred = DBSCAN(eps=eps, min_samples=2).fit_predict(embeddings)
    clusters = {}
    for row, cluster in enumerate(y_pred):
        clusters.setdefault(int(cluster), []).append(row)
    best = []
    for cluster in sorted(clusters):
        if cluster < 0:
            continue
        if len(clusters[cluster]) > len(best):
            best = clusters[cluster]
    return best


def do_clean(args):
    """Drop outlier images from a face recordio dataset.

    Reads ``train.rec`` / ``train.idx`` from ``args.input``. Record 0 holds
    the index range of the identity records, and every identity record holds
    the index range of its image records. For each identity the images are
    embedded with the model given by ``args.model`` (``"prefix,epoch"``) and
    filtered:

    * ``args.mode == 1``: keep images whose similarity to the identity's mean
      embedding is at least ``args.threshold``.
    * otherwise: keep the largest DBSCAN cluster (``eps=args.threshold``).

    Unless ``args.test`` is non-zero, the surviving images, the rebuilt
    identity records and a new record 0 are written to ``args.output``.
    Finally the number of kept and removed images is printed.

    Args:
        args: namespace with ``input``, ``output``, ``model``, ``batch_size``,
            ``threshold``, ``mode`` and ``test`` attributes.
    """
    # An unset CUDA_VISIBLE_DEVICES is treated like an empty one: run on CPU.
    cvd = os.environ.get('CUDA_VISIBLE_DEVICES', '').strip()
    ctx = [mx.gpu(i) for i in range(len(cvd.split(',')))] if cvd else []
    if not ctx:
        ctx = [mx.cpu()]
        print('use cpu')
    else:
        print('gpu num:', len(ctx))
    ctx_num = len(ctx)

    path_imgrec = os.path.join(args.input, 'train.rec')
    path_imgidx = os.path.join(args.input, 'train.idx')
    imgrec = mx.recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, 'r')  # pylint: disable=redefined-variable-type
    writer = None
    try:
        header, _ = mx.recordio.unpack(imgrec.read_idx(0))
        assert header.flag > 0
        print('header0 label', header.label)
        id2range = {}
        for identity in range(int(header.label[0]), int(header.label[1])):
            id_header, _ = mx.recordio.unpack(imgrec.read_idx(identity))
            id2range[identity] = (int(id_header.label[0]), int(id_header.label[1]))
        print('id2range', len(id2range))

        prop = face_image.load_property(args.input)
        image_size = prop.image_size
        print('image_size', image_size)

        vec = args.model.split(',')
        prefix = vec[0]
        epoch = int(vec[1])
        print('loading', prefix, epoch)
        model = mx.mod.Module.load(prefix, epoch, context=ctx)
        model.bind(data_shapes=[('data', (args.batch_size, 3, image_size[0], image_size[1]))], label_shapes=[('softmax_label', (args.batch_size,))])

        if args.test == 0:
            if not os.path.exists(args.output):
                os.makedirs(args.output)
            writer = mx.recordio.MXIndexedRecordIO(os.path.join(args.output, 'train.idx'), os.path.join(args.output, 'train.rec'), 'w')

        # NOTE(review): identities with more images than this are discarded
        # wholesale and counted as removed -- confirm this is intended.
        max_images_per_id = 15
        nrof_images = 0
        nrof_removed = 0
        idx = 1  # next output record index; 0 is reserved for the dataset header
        id2label = {}
        for pp, (_id, v) in enumerate(id2range.items(), 1):
            if pp % 100 == 0:
                print('stat', nrof_images, nrof_removed)
            ocontents = [imgrec.read_idx(_idx) for _idx in range(*v)]
            if not ocontents:
                # nothing to embed; an empty identity contributes no images
                continue
            if len(ocontents) > max_images_per_id:
                nrof_removed += len(ocontents)
                continue
            embeddings = _clean_embed_records(model, ocontents, args.batch_size, ctx_num, image_size)
            embeddings = sklearn.preprocessing.normalize(embeddings)
            if args.mode == 1:
                kept = _clean_keep_near_mean(embeddings, args.threshold)
            else:
                kept = _clean_keep_largest_cluster(embeddings, args.threshold)
            contents = [ocontents[row] for row in kept]

            nrof_removed += len(ocontents) - len(contents)
            if not contents:
                continue
            id2label[_id] = (idx, idx + len(contents))
            nrof_images += len(contents)
            # NOTE(review): image records are copied verbatim, so the label
            # stored in each image header is not renumbered.
            for content in contents:
                if writer is not None:
                    writer.write_idx(idx, content)
                idx += 1

        id_idx = idx
        if writer is not None:
            # identity records follow the images; each stores its image range
            for _label in id2label.values():
                _header = mx.recordio.IRHeader(1, _label, idx, 0)
                writer.write_idx(idx, mx.recordio.pack(_header, b''))
                idx += 1
            # record 0 stores the range of the identity records
            _header = mx.recordio.IRHeader(1, (id_idx, idx), 0, 0)
            writer.write_idx(0, mx.recordio.pack(_header, b''))
        print(nrof_images, nrof_removed)
    finally:
        if writer is not None:
            writer.close()
        imgrec.close()
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point for the cleaning script.
    cli = argparse.ArgumentParser(description='do data clean')
    # general
    cli_options = (
        # dataset directory holding train.rec / train.idx
        ('--input', dict(default='', type=str, help='')),
        # directory that receives the cleaned train.rec / train.idx
        ('--output', dict(default='', type=str, help='')),
        # "prefix,epoch" of the checkpoint used to embed images
        ('--model', dict(default='../model/softmax,50', help='path to load model.')),
        ('--batch-size', dict(default=32, type=int, help='')),
        # similarity cut-off in mode 1, DBSCAN eps otherwise
        ('--threshold', dict(default=0.6, type=float, help='')),
        # 1: compare against the identity mean; anything else: DBSCAN
        ('--mode', dict(default=1, type=int, help='')),
        # 0 writes the output dataset; other values only print statistics
        ('--test', dict(default=0, type=int, help='')),
    )
    for flag, spec in cli_options:
        cli.add_argument(flag, **spec)
    args = cli.parse_args()
    do_clean(args)
|
||||
|
||||
243
src/common/dataset_merge.py
Normal file
243
src/common/dataset_merge.py
Normal file
@@ -0,0 +1,243 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
import os
|
||||
import sys
|
||||
import mxnet as mx
|
||||
from mxnet import ndarray as nd
|
||||
import random
|
||||
import argparse
|
||||
import cv2
|
||||
import time
|
||||
import sklearn
|
||||
from sklearn.decomposition import PCA
|
||||
from easydict import EasyDict as edict
|
||||
import face_image
|
||||
from sklearn.cluster import DBSCAN
|
||||
import numpy as np
|
||||
|
||||
|
||||
def get_embedding(args, imgrec, id, image_size, model):
    """Return the normalised mean embedding of one identity's images.

    The identity record ``id`` stores the index range of its image records.
    Every image is decoded, run through ``model`` in batches of at most
    ``args.batch_size``, and the row-normalised outputs are averaged and
    normalised again.

    Args:
        args: namespace providing ``batch_size`` and ``ctx_num``.
        imgrec: indexed recordio reader the identity belongs to.
        id: index of the identity record inside ``imgrec``.
        image_size: ``(height, width)`` pair used to allocate the batch.
        model: bound ``mx.mod.Module`` used for the forward pass.

    Returns:
        A 1-D numpy array holding the identity embedding.

    Raises:
        ValueError: if the identity record references no images.
    """
    header, _ = mx.recordio.unpack(imgrec.read_idx(id))
    first, last = int(header.label[0]), int(header.label[1])
    ocontents = [imgrec.read_idx(idx) for idx in range(first, last)]
    if not ocontents:
        # without this guard the code below would fail on ``embeddings=None``
        raise ValueError('identity record %s references no images' % (id,))

    embeddings = None
    for ba in range(0, len(ocontents), args.batch_size):
        bb = min(ba + args.batch_size, len(ocontents))
        _batch_size = bb - ba
        # A batch is never smaller than the device count; the extra rows are
        # copies of row 0 and their outputs are discarded below.
        _batch_size2 = max(_batch_size, args.ctx_num)
        data = nd.zeros((_batch_size2, 3, image_size[0], image_size[1]))
        label = nd.zeros((_batch_size2,))
        for ii, content in enumerate(ocontents[ba:bb]):
            img_header, img = mx.recordio.unpack(content)
            img = mx.image.imdecode(img)
            # decoded image is HWC; the network input is CHW
            data[ii][:] = nd.transpose(img, axes=(2, 0, 1))
            label[ii][:] = img_header.label
        for ii in range(_batch_size, _batch_size2):
            data[ii][:] = data[0][:]
            label[ii][:] = label[0][:]
        db = mx.io.DataBatch(data=(data,), label=(label,))
        model.forward(db, is_train=False)
        net_out = model.get_outputs()[0].asnumpy()
        if embeddings is None:
            embeddings = np.zeros((len(ocontents), net_out.shape[1]))
        embeddings[ba:bb, :] = net_out[0:_batch_size, :]

    embeddings = sklearn.preprocessing.normalize(embeddings)
    embedding = np.mean(embeddings, axis=0, keepdims=True)
    return sklearn.preprocessing.normalize(embedding).flatten()
|
||||
|
||||
def _merge_identity_span(imgrec):
    """Return the ``(start, end)`` range of identity records from record 0."""
    header, _ = mx.recordio.unpack(imgrec.read_idx(0))
    assert header.flag > 0
    print('header0 label', header.label)
    return int(header.label[0]), int(header.label[1])


def _merge_embed_identities(args, imgrec, ds_id, image_size, model, progress_tag, test_limit=0):
    """Embed every identity of one dataset.

    Returns a list of mutable ``[ds_id, identity_index, embedding]`` items.
    ``progress_tag`` is printed with the running count every 10 identities;
    a positive ``test_limit`` stops after that many identities.
    """
    start, end = _merge_identity_span(imgrec)
    items = []
    for pp, identity in enumerate(range(start, end), 1):
        if pp % 10 == 0:
            print(progress_tag, pp)
        embedding = get_embedding(args, imgrec, identity, image_size, model)
        items.append([ds_id, identity, embedding])
        if test_limit > 0 and pp >= test_limit:
            break
    return items


def main(args):
    """Merge several face recordio datasets, skipping duplicate identities.

    ``args.include`` is a comma-separated list of dataset directories. All
    identities of the first dataset are kept. An identity of a later dataset
    is kept only if its embedding has similarity below ``args.param1`` to
    every identity collected from the earlier datasets. If ``args.exclude``
    names a dataset, every collected identity whose similarity to any of its
    identities is at least ``args.param2`` is dropped.

    Unless ``args.test`` is positive, the surviving identities are written to
    ``args.output`` as ``train.rec`` / ``train.idx`` plus a ``property`` file
    containing ``"<identity count>,<height>,<width>"``.

    Args:
        args: namespace with ``include``, ``exclude``, ``output``, ``model``,
            ``batch_size``, ``param1``, ``param2`` and ``test`` attributes.
            ``args.ctx_num`` is set here for ``get_embedding``.
    """
    # An unset CUDA_VISIBLE_DEVICES is treated like an empty one: run on CPU.
    cvd = os.environ.get('CUDA_VISIBLE_DEVICES', '').strip()
    ctx = [mx.gpu(i) for i in range(len(cvd.split(',')))] if cvd else []
    if not ctx:
        ctx = [mx.cpu()]
        print('use cpu')
    else:
        print('gpu num:', len(ctx))
    args.ctx_num = len(ctx)

    include_datasets = args.include.split(',')
    # image size is taken from the first dataset only
    prop = face_image.load_property(include_datasets[0])
    image_size = prop.image_size
    print('image_size', image_size)

    vec = args.model.split(',')
    prefix = vec[0]
    epoch = int(vec[1])
    print('loading', prefix, epoch)
    model = mx.mod.Module.load(prefix, epoch, context=ctx)
    model.bind(data_shapes=[('data', (args.batch_size, 3, image_size[0], image_size[1]))], label_shapes=[('softmax_label', (args.batch_size,))])

    rec_list = []
    writer = None
    try:
        for ds in include_datasets:
            path_imgrec = os.path.join(ds, 'train.rec')
            path_imgidx = os.path.join(ds, 'train.idx')
            rec_list.append(mx.recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, 'r'))  # pylint: disable=redefined-variable-type

        all_id_list = []
        test_limit = 0  # debugging aid: set > 0 to embed only that many identities per dataset
        for ds_id, imgrec in enumerate(rec_list):
            id_list = _merge_embed_identities(args, imgrec, ds_id, image_size, model, 'processing id', test_limit)
            if ds_id == 0:
                all_id_list += id_list
                print(ds_id, len(id_list))
                continue
            # Snapshot of the identities collected from earlier datasets;
            # identities of the current dataset are not compared to each other.
            X = np.array([item[2] for item in all_id_list])
            for id_item in id_list:
                # X is empty when nothing has been collected yet
                if X.size > 0 and np.any(np.dot(X, id_item[2].T) >= args.param1):
                    continue
                all_id_list.append(id_item)
            print(ds_id, len(id_list), len(all_id_list))

        if len(args.exclude) > 0:
            _path_imgrec = os.path.join(args.exclude, 'train.rec')
            _path_imgidx = os.path.join(args.exclude, 'train.idx')
            _imgrec = mx.recordio.MXIndexedRecordIO(_path_imgidx, _path_imgrec, 'r')  # pylint: disable=redefined-variable-type
            try:
                _id_list = _merge_embed_identities(args, _imgrec, len(rec_list), image_size, model, 'processing ex id', test_limit)
            finally:
                _imgrec.close()

            X = np.array([item[2] for item in all_id_list])
            excluded = set()
            if X.size > 0:
                for id_item in _id_list:
                    sim = np.dot(X, id_item[2].T)
                    for j in np.where(sim >= args.param2)[0]:
                        excluded.add(int(j))
                        # a negative identity index marks the item as dropped
                        all_id_list[j][1] = -1
            print('exclude', len(excluded))

        if args.test > 0:
            return

        if not os.path.exists(args.output):
            os.makedirs(args.output)
        writer = mx.recordio.MXIndexedRecordIO(os.path.join(args.output, 'train.idx'), os.path.join(args.output, 'train.rec'), 'w')
        idx = 1  # next output record index; 0 is reserved for the dataset header
        identities = []
        for id_item in all_id_list:
            if id_item[1] < 0:
                continue
            imgrec = rec_list[id_item[0]]
            header, _ = mx.recordio.unpack(imgrec.read_idx(id_item[1]))
            a, b = int(header.label[0]), int(header.label[1])
            identities.append((idx, idx + b - a))
            # NOTE(review): image records are copied verbatim, so the label in
            # each image header still refers to its source dataset -- confirm
            # that downstream readers do not rely on it.
            for _idx in range(a, b):
                writer.write_idx(idx, imgrec.read_idx(_idx))
                idx += 1

        id_idx = idx
        # identity records follow the images; each stores its image range
        for id_label in identities:
            _header = mx.recordio.IRHeader(1, id_label, idx, 0)
            writer.write_idx(idx, mx.recordio.pack(_header, b''))
            idx += 1
        # record 0 stores the range of the identity records
        _header = mx.recordio.IRHeader(1, (id_idx, idx), 0, 0)
        writer.write_idx(0, mx.recordio.pack(_header, b''))
        with open(os.path.join(args.output, 'property'), 'w') as f:
            f.write("%d,%d,%d" % (len(identities), image_size[0], image_size[1]))
    finally:
        if writer is not None:
            writer.close()
        for imgrec in rec_list:
            imgrec.close()
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point for the merge script.
    cli = argparse.ArgumentParser(description='do dataset merge')
    # general
    cli_options = (
        # comma-separated dataset directories to merge, in priority order
        ('--include', dict(default='', type=str, help='')),
        # optional dataset whose identities are removed from the merge
        ('--exclude', dict(default='', type=str, help='')),
        # directory that receives train.rec / train.idx / property
        ('--output', dict(default='', type=str, help='')),
        # "prefix,epoch" of the checkpoint used to embed images
        ('--model', dict(default='../model/softmax,50', help='path to load model.')),
        ('--batch-size', dict(default=32, type=int, help='')),
        # similarity at or above which an included identity counts as a duplicate
        ('--param1', dict(default=0.3, type=float, help='')),
        # similarity at or above which an identity matches the exclude set
        ('--param2', dict(default=0.45, type=float, help='')),
        # accepted but not read by main() in this file
        ('--mode', dict(default=1, type=int, help='')),
        # values > 0 stop before anything is written
        ('--test', dict(default=0, type=int, help='')),
    )
    for flag, spec in cli_options:
        cli.add_argument(flag, **spec)
    args = cli.parse_args()
    main(args)
|
||||
|
||||
Reference in New Issue
Block a user