From 7770ca74f65b9b72fc3c6a39139fbf2a91143031 Mon Sep 17 00:00:00 2001 From: olojuwin <502401452@qq.com> Date: Thu, 4 Nov 2021 19:35:06 +0800 Subject: [PATCH] Refactoring with new APIs --- recognition/arcface_oneflow/README.md | 113 +---- recognition/arcface_oneflow/README_CH.md | 108 +---- .../arcface_oneflow/backbones/__init__.py | 15 +- .../arcface_oneflow/backbones/common.py | 242 ----------- .../backbones/fmobilefacenet.py | 257 ------------ .../arcface_oneflow/backbones/ir_resnet.py | 360 ++++++++-------- recognition/arcface_oneflow/configs/base.py | 55 +-- .../configs/face_emore_r100.py | 35 -- .../arcface_oneflow/configs/glint360k_mbf.py | 24 +- .../arcface_oneflow/configs/glint360k_r100.py | 15 +- .../arcface_oneflow/configs/glint360k_r18.py | 20 +- .../arcface_oneflow/configs/glint360k_r34.py | 20 +- .../arcface_oneflow/configs/glint360k_r50.py | 20 +- .../arcface_oneflow/configs/ms1V3_r100.py | 37 -- .../arcface_oneflow/configs/ms1mv3_mbf.py | 36 +- .../arcface_oneflow/configs/ms1mv3_r18.py | 24 +- .../arcface_oneflow/configs/ms1mv3_r34.py | 23 +- .../arcface_oneflow/configs/ms1mv3_r50.py | 19 +- recognition/arcface_oneflow/configs/speed.py | 8 +- recognition/arcface_oneflow/convert.sh | 3 +- .../arcface_oneflow/{ => eval}/onnx_helper.py | 96 +++-- .../arcface_oneflow/{ => eval}/onnx_ijbc.py | 130 +++--- .../arcface_oneflow/eval/verification.py | 178 +++++--- recognition/arcface_oneflow/function.py | 387 +++++++++++------- recognition/arcface_oneflow/graph.py | 75 ++++ recognition/arcface_oneflow/oneflow2onnx.py | 52 ++- recognition/arcface_oneflow/requirements.txt | 7 + recognition/arcface_oneflow/run.sh | 11 - recognition/arcface_oneflow/test.sh | 3 - .../mx_recordio_2_ofrecord.py | 20 +- .../mx_recordio_2_ofrecord_shuffled_npart.py | 19 +- recognition/arcface_oneflow/train.py | 96 ++--- recognition/arcface_oneflow/train_ddp.sh | 25 ++ .../train_graph_distributed.sh | 26 ++ recognition/arcface_oneflow/utils/losses.py | 66 +++ .../utils/ofrecord_data_utils.py | 198 ++++++--- .../arcface_oneflow/utils/utils_callbacks.py | 174 +++++--- .../arcface_oneflow/utils/utils_config.py | 5 +- .../arcface_oneflow/utils/utils_logging.py | 22 +- recognition/arcface_oneflow/val.py | 49 ++- recognition/arcface_oneflow/val.sh | 3 + 41 files changed, 1389 insertions(+), 1687 deletions(-) delete mode 100644 recognition/arcface_oneflow/backbones/common.py delete mode 100644 recognition/arcface_oneflow/backbones/fmobilefacenet.py delete mode 100644 recognition/arcface_oneflow/configs/face_emore_r100.py delete mode 100644 recognition/arcface_oneflow/configs/ms1V3_r100.py rename recognition/arcface_oneflow/{ => eval}/onnx_helper.py (70%) rename recognition/arcface_oneflow/{ => eval}/onnx_ijbc.py (68%) create mode 100644 recognition/arcface_oneflow/graph.py create mode 100644 recognition/arcface_oneflow/requirements.txt delete mode 100644 recognition/arcface_oneflow/run.sh delete mode 100644 recognition/arcface_oneflow/test.sh rename recognition/arcface_oneflow/tools/{dataset_convert => }/mx_recordio_2_ofrecord.py (86%) rename recognition/arcface_oneflow/tools/{dataset_convert => }/mx_recordio_2_ofrecord_shuffled_npart.py (89%) create mode 100644 recognition/arcface_oneflow/train_ddp.sh create mode 100644 recognition/arcface_oneflow/train_graph_distributed.sh create mode 100644 recognition/arcface_oneflow/utils/losses.py create mode 100644 recognition/arcface_oneflow/val.sh diff --git a/recognition/arcface_oneflow/README.md b/recognition/arcface_oneflow/README.md index 9b50306..21f980c 100644 --- a/recognition/arcface_oneflow/README.md +++ b/recognition/arcface_oneflow/README.md @@ -27,16 +27,11 @@ It introduces how to train InsightFace in OneFlow, and do verification over the \- [2. Transformation from MS1M recordio to OFRecord](#2-transformation-from-ms1m-recordio-to-ofrecord) - \- [Pretrained model](#Pretrained-model) - \- [Training and verification](#training-and-verification) \- [Training](#training) - \- [Varification](#varification) - - \- [Benchmark](#benchmark) - + \- [OneFLow2ONNX](#OneFLow2ONNX) ## Background @@ -109,7 +104,7 @@ First of all, before execution, please make sure that: According to steps in [Install OneFlow](https://github.com/Oneflow-Inc/oneflow#install-oneflow) install the newest release master whl packages. ``` -python3 -m pip install --find-links https://release.oneflow.info oneflow_cu102 --user +python3 -m pip install oneflow -f https://oneflow-staging.oss-cn-beijing.aliyuncs.com/branch/master/cu102/6aa719d70119b65837b25cc5f186eb19ef2b7891/index.html --user ``` @@ -160,7 +155,7 @@ Only need to execute 2.1 or 2.2 Run ``` -python tools/mx_recordio_2_ofrecord_shuffled_npart.py --data_dir datasets/faces_emore --output_filepath faces_emore/ofrecord/train --part_num 16 +python tools/mx_recordio_2_ofrecord_shuffled_npart.py --data_dir datasets/faces_emore --output_filepath faces_emore/ofrecord/train --num_part 16 ``` And you will get the number of `part_num` parts of OFRecord, it's 16 parts in this example, it showed like this ``` @@ -237,29 +232,6 @@ ofrecord/test/ ``` - -## Pretrained model - -The accuracy comparison of OneFlow and MXNet pretrained models on the verification set of the 1:1 verification accuracy on insightface recognition test (IFRT) are as follows: - -| **Framework** | **African** | **Caucasian** | **Indian** | **Asian** | **All** | -| ------------- | ----------- | ------------- | ---------- | --------- | ------- | -| OneFlow | 90.4076 | 94.583 | 93.702 | 68.754 | 89.684 | -| MXNet | 90.45 | 94.60 | 93.96 | 63.91 | 88.23 | - -The download link of the OneFlow pretrain model:[of_005_model.tar.gz](http://oneflow-public.oss-cn-beijing.aliyuncs.com/face_dataset/pretrained_model/of_glint360k_partial_fc/of_005_model.tar.gz) - -We also provide the MXNet model which converted from OneFlow:[of_to_mxnet_model_005.tar.gz](http://oneflow-public.oss-cn-beijing.aliyuncs.com/face_dataset/pretrained_model/of_2_mxnet_glint360k_partial_fc/of_to_mxnet_model_005.tar.gz) - - - -## OneFLow2ONNX - -``` -pip install oneflow-onnx==0.3.4 -./convert.sh -``` - ## Training and verification @@ -268,9 +240,15 @@ pip install oneflow-onnx==0.3.4 To reduce the usage cost of user, OneFlow draws close the scripts to Torch style, you can directly modify parameters via configs/*.py +#### eager ``` -./run.sh +./train_ddp.sh ``` +#### Graph +``` +train_graph_distributed.sh +``` + ### Varification @@ -280,70 +258,9 @@ Moreover, OneFlow offers a validation script to do verification separately, val. ./val.sh ``` +## OneFLow2ONNX -## Benchmark - -### Training Speed Benchmark - -#### Face_emore Dataset & FP32 - -| Backbone | GPU | model_parallel | partial_fc | BatchSize / it | Throughput img / sec | -| -------- | ------------------------ | -------------- | ---------- | -------------- | -------------------- | -| R100 | 8 * Tesla V100-SXM2-16GB | False | False | 64 | 1836.8 | -| R100 | 8 * Tesla V100-SXM2-16GB | True | False | 64 | 1854.15 | -| R100 | 8 * Tesla V100-SXM2-16GB | True | True | 64 | 1872.81 | -| R100 | 8 * Tesla V100-SXM2-16GB | False | False | 96(Max) | 1931.76 | -| R100 | 8 * Tesla V100-SXM2-16GB | True | False | 115(Max) | 1921.87 | -| R100 | 8 * Tesla V100-SXM2-16GB | True | True | 120(Max) | 1962.76 | -| Y1 | 8 * Tesla V100-SXM2-16GB | False | False | 256 | 14298.02 | -| Y1 | 8 * Tesla V100-SXM2-16GB | True | False | 256 | 14049.75 | -| Y1 | 8 * Tesla V100-SXM2-16GB | False | False | 350(Max) | 14756.03 | -| Y1 | 8 * Tesla V100-SXM2-16GB | True | True | 400(Max) | 14436.38 | - -#### Glint360k Dataset & FP32 - -| Backbone | GPU | partial_fc sample_ratio | BatchSize / it | Throughput img / sec | -| -------- | ------------------------ | ----------------------- | -------------- | -------------------- | -| R100 | 8 * Tesla V100-SXM2-16GB | 0.1 | 64 | 1858.57 | -| R100 | 8 * Tesla V100-SXM2-16GB | 0.1 | 115 | 1933.88 | - - - -### Evaluation on Lfw, Cfp_fp, Agedb_30 - -- Data Parallelism - -| Backbone | Dataset | Lfw | Cfp_fp | Agedb_30 | -| ------------- | ------- | ------ | ------ | -------- | -| R100 | MS1M | 99.717 | 98.643 | 98.150 | -| MobileFaceNet | MS1M | 99.5 | 92.657 | 95.6 | - -- Model Parallelism - -| Backbone | Dataset | Lfw | Cfp_fp | Agedb_30 | -| ------------- | ------- | ------ | ------ | -------- | -| R100 | MS1M | 99.733 | 98.329 | 98.033 | -| MobileFaceNet | MS1M | 99.483 | 93.457 | 95.7 | - -- Partial FC - -| Backbone | Dataset | Lfw | Cfp_fp | Agedb_30 | -| -------- | ------- | ------ | ------ | -------- | -| R100 | MS1M | 99.817 | 98.443 | 98.217 | - -### Evaluation on IFRT - -r denotes the sampling rate of negative class centers. - -| Backbone | Dataset | African | Caucasian | Indian | Asian | ALL | -| -------- | -------------------- | ------- | --------- | ------ | ------ | ------ | -| R100 | **Glint360k**(r=0.1) | 90.4076 | 94.583 | 93.702 | 68.754 | 89.684 | - -### Max num_classses - -| node_num | gpu_num_per_node | batch_size_per_device | fp16 | Model Parallel | Partial FC | num_classes | -| -------- | ---------------- | --------------------- | ---- | -------------- | ---------- | ----------- | -| 1 | 1 | 64 | True | True | True | 2000000 | -| 1 | 8 | 64 | True | True | True | 13500000 | - -More test details could refer to [OneFlow DLPerf](https://github.com/Oneflow-Inc/DLPerf#insightface). +``` +pip install oneflow-onnx==0.5.1 +./convert.sh +``` \ No newline at end of file diff --git a/recognition/arcface_oneflow/README_CH.md b/recognition/arcface_oneflow/README_CH.md index 8b091e5..5a62f82 100644 --- a/recognition/arcface_oneflow/README_CH.md +++ b/recognition/arcface_oneflow/README_CH.md @@ -17,17 +17,11 @@ - [1. 下载数据集](#1-下载数据集) - [2. 将训练数据集 MS1M 从 recordio 格式转换为 OFRecord 格式](#2-将训练数据集-ms1m-从-recordio-格式转换为-ofrecord-格式) - - [预训练模型](#预训练模型) - [训练和验证](#训练和验证) - [训练](#训练) - [验证](#验证) - - [基准测试](#基准测试) - - [训练速度基准](#训练速度基准) - - [Face_emore 数据集 & FP32](#face_emore-数据集--fp32) - - [Glint360k 数据集 & FP32](#glint360k-数据集--fp32) - - [Evaluation on Lfw, Cfp_fp, Agedb_30](#evaluation-on-lfw-cfp_fp-agedb_30) - - [Evaluation on IFRT](#evaluation-on-ifrt) - - [Max num_classses](#max-num_classses) + - [OneFLow2ONNX](#OneFLow2ONNX) + ## 背景介绍 @@ -80,7 +74,7 @@ 根据 [Install OneFlow](https://github.com/Oneflow-Inc/oneflow#install-oneflow) 的步骤进行安装最新 master whl 包即可。 ``` -python3 -m pip install --find-links https://release.oneflow.info oneflow_cu102 --user +python3 -m pip install oneflow -f https://oneflow-staging.oss-cn-beijing.aliyuncs.com/branch/master/cu102/6aa719d70119b65837b25cc5f186eb19ef2b7891/index.html --user ``` ### 准备数据集 @@ -199,24 +193,7 @@ ofrecord/test/ ``` -## 预训练模型 -基于 oneflow 的人脸识别模型在 The 1:1 verification accuracy on InsightFace Recognition Test (IFRT) 验证集上与 MXNet 的预训练模型精度对比如下: - -| **Framework** | **African** | **Caucasian** | **Indian** | **Asian** | **All** | -| ------------- | ----------- | ------------- | ---------- | --------- | ------- | -| OneFlow | 90.4076 | 94.583 | 93.702 | 68.754 | 89.684 | -| MXNet | 90.45 | 94.60 | 93.96 | 63.91 | 88.23 | - -oneflow 的人脸预训练模型下载链接:[of_005_model.tar.gz](http://oneflow-public.oss-cn-beijing.aliyuncs.com/face_dataset/pretrained_model/of_glint360k_partial_fc/of_005_model.tar.gz) - -我们也提供了转换成 MXNet 的模型:[of_to_mxnet_model_005.tar.gz](http://oneflow-public.oss-cn-beijing.aliyuncs.com/face_dataset/pretrained_model/of_2_mxnet_glint360k_partial_fc/of_to_mxnet_model_005.tar.gz) - - ## 模型转换 -``` -pip install oneflow-onnx==0.3.4 -./convert.sh -``` ## 训练和验证 @@ -227,10 +204,14 @@ pip install oneflow-onnx==0.3.4 运行脚本: +#### eager ``` -./run.sh +./train_ddp.sh +``` +#### Graph +``` +train_graph_distributed.sh ``` - ### 验证 @@ -242,70 +223,9 @@ pip install oneflow-onnx==0.3.4 ./val.sh ``` +## OneFLow2ONNX -## 基准测试 - -### 训练速度基准 - -#### Face_emore 数据集 & FP32 - -| Backbone | GPU | model_parallel | partial_fc | BatchSize / it | Throughput img / sec | -| -------- | ------------------------ | -------------- | ---------- | -------------- | -------------------- | -| R100 | 8 * Tesla V100-SXM2-16GB | False | False | 64 | 1832.02 | -| R100 | 8 * Tesla V100-SXM2-16GB | True | False | 64 | 1851.63 | -| R100 | 8 * Tesla V100-SXM2-16GB | True | True | 64 | 1854.25 | -| R100 | 8 * Tesla V100-SXM2-16GB | True | True | 96(Max) | 1925.6 | -| R100 | 8 * Tesla V100-SXM2-16GB | True | False | 115(Max) | 1925.59 | -| R100 | 8 * Tesla V100-SXM2-16GB | True | True | 128(Max) | 1953.46 | -| Y1 | 8 * Tesla V100-SXM2-16GB | False | False | 256 | 14298.02 | -| Y1 | 8 * Tesla V100-SXM2-16GB | True | False | 256 | 14049.75 | -| Y1 | 8 * Tesla V100-SXM2-16GB | False | False | 350(Max) | 14756.03 | -| Y1 | 8 * Tesla V100-SXM2-16GB | True | True | 400(Max) | 14436.38 | - -#### Glint360k 数据集 & FP32 - -| Backbone | GPU | partial_fc sample_ratio | BatchSize / it | Throughput img / sec | -| -------- | ------------------------ | ----------------------- | -------------- | -------------------- | -| R100 | 8 * Tesla V100-SXM2-16GB | 1 | 64 | 1808.27 | -| R100 | 8 * Tesla V100-SXM2-16GB | 0.1 | 64 | 1858.57 | - - - -### Evaluation on Lfw, Cfp_fp, Agedb_30 - -- Data Parallelism - -| Backbone | Dataset | Lfw | Cfp_fp | Agedb_30 | -| ------------- | ------- | ------ | ------ | -------- | -| R100 | MS1M | 99.717 | 98.643 | 98.150 | -| MobileFaceNet | MS1M | 99.5 | 92.657 | 95.6 | - -- Model Parallelism - -| Backbone | Dataset | Lfw | Cfp_fp | Agedb_30 | -| ------------- | ------- | ------ | ------ | -------- | -| R100 | MS1M | 99.733 | 98.329 | 98.033 | -| MobileFaceNet | MS1M | 99.483 | 93.457 | 95.7 | - -- Partial FC - -| Backbone | Dataset | Lfw | Cfp_fp | Agedb_30 | -| -------- | ------- | ------ | ------ | -------- | -| R100 | MS1M | 99.817 | 98.443 | 98.217 | - -### Evaluation on IFRT - -r denotes the sampling rate of negative class centers. - -| Backbone | Dataset | African | Caucasian | Indian | Asian | ALL | -| -------- | -------------------- | ------- | --------- | ------ | ------ | ------ | -| R100 | **Glint360k**(r=0.1) | 90.4076 | 94.583 | 93.702 | 68.754 | 89.684 | - -### Max num_classses - -| node_num | gpu_num_per_node | batch_size_per_device | fp16 | Model Parallel | Partial FC | num_classes | -| -------- | ---------------- | --------------------- | ---- | -------------- | ---------- | ----------- | -| 1 | 1 | 64 | True | True | True | 2000000 | -| 1 | 8 | 64 | True | True | True | 13500000 | - -更多详情请移步 [OneFlow DLPerf](https://github.com/Oneflow-Inc/DLPerf#insightface). +``` +pip install oneflow-onnx==0.5.1 +./convert.sh +``` \ No newline at end of file diff --git a/recognition/arcface_oneflow/backbones/__init__.py b/recognition/arcface_oneflow/backbones/__init__.py index 2b95a04..2448255 100644 --- a/recognition/arcface_oneflow/backbones/__init__.py +++ b/recognition/arcface_oneflow/backbones/__init__.py @@ -1,19 +1,16 @@ from .ir_resnet import iresnet18, iresnet34, iresnet50, iresnet100, iresnet200 -from .fmobilefacenet import mobilefacenet -def get_model(name, input_blob, cfg): +def get_model(name, **kwargs): if name == "r18": - return iresnet18(input_blob, cfg) + return iresnet18(False, **kwargs) elif name == "r34": - return iresnet34(input_blob, cfg) + return iresnet34(False, **kwargs) elif name == "r50": - return iresnet50(input_blob, cfg) + return iresnet50(False, **kwargs) elif name == "r100": - return iresnet100(input_blob, cfg) + return iresnet100(False, **kwargs) elif name == "r200": - return iresnet200(input_blob, cfg) - elif name == "mbf": - return mobilefacenet(input_blob, cfg) + return iresnet200(False, **kwargs) else: raise ValueError() diff --git a/recognition/arcface_oneflow/backbones/common.py b/recognition/arcface_oneflow/backbones/common.py deleted file mode 100644 index 3088efb..0000000 --- a/recognition/arcface_oneflow/backbones/common.py +++ /dev/null @@ -1,242 +0,0 @@ -import oneflow as flow - - -# same as torch -def _get_initializer(): - return flow.random_normal_initializer(mean=0.0, stddev=0.1) - - -def _get_initializer_FC(): - return flow.random_normal_initializer(mean=0.0, stddev=0.01) - - -def _get_regularizer(name): - return flow.regularizers.l2(0.0005) - - -def _dropout(input_blob, dropout_prob): - return flow.nn.dropout(input_blob, rate=dropout_prob) - - -def _prelu(inputs, data_format="NCHW", name=None): - return flow.layers.prelu( - inputs, - alpha_initializer=flow.constant_initializer(0.25), - alpha_regularizer=_get_regularizer("alpha"), - shared_axes=[2, 3] if data_format == "NCHW" else [1, 2], - name=name, - ) - - -def _relu(inputs, data_format="NCHW", name=None): - return flow.nn.relu( - inputs, - name=name, - ) - - -def _avg_pool(inputs, pool_size, strides, padding, data_format="NCHW", name=None): - return flow.nn.avg_pool2d( - input=inputs, ksize=pool_size, strides=strides, padding=padding, data_format=data_format, name=name - ) - - -def _batch_norm( - inputs, - epsilon, - center=True, - scale=True, - trainable=True, - is_training=True, - data_format="NCHW", - name=None, -): - - return flow.layers.batch_normalization( - inputs=inputs, - axis=3 if data_format == "NHWC" and inputs.shape == 4 else 1, - momentum=0.9, - epsilon=epsilon, - center=center, - scale=scale, - beta_initializer=flow.zeros_initializer(), - gamma_initializer=flow.ones_initializer(), - beta_regularizer=_get_regularizer("beta"), - gamma_regularizer=_get_regularizer("gamma"), - moving_mean_initializer=flow.zeros_initializer(), - moving_variance_initializer=flow.ones_initializer(), - trainable=trainable, - training=is_training, - name=name, - ) - - -def _conv2d_layer( - name, - input, - filters, - kernel_size=3, - strides=1, - padding="SAME", - group_num=1, - data_format="NCHW", - dilation_rate=1, - activation=None, - use_bias=False, - weight_initializer=_get_initializer(), - bias_initializer=flow.zeros_initializer(), - weight_regularizer=_get_regularizer("weight"), - bias_regularizer=_get_regularizer("bias"), -): - return flow.layers.conv2d(inputs=input, filters=filters, kernel_size=kernel_size, strides=strides, padding=padding, data_format=data_format, dilation_rate=dilation_rate, groups=group_num, activation=activation, use_bias=use_bias, kernel_initializer=weight_initializer, bias_initializer=bias_initializer, kernel_regularizer=weight_regularizer, bias_regularizer=bias_regularizer, name=name) - - -def Linear( - input_blob, - num_filter=1, - kernel=None, - stride=None, - pad="valid", - num_group=1, - bn_is_training=True, - data_format="NCHW", - name=None, - suffix="", -): - conv = _conv2d_layer( - name="%s%s_conv2d" % (name, suffix), - input=input_blob, - filters=num_filter, - kernel_size=kernel, - strides=stride, - padding=pad, - data_format=data_format, - group_num=num_group, - use_bias=False, - dilation_rate=1, - activation=None, - ) - - bn = _batch_norm( - conv, - epsilon=0.001, - is_training=bn_is_training, - data_format=data_format, - name="%s%s_batchnorm" % (name, suffix), - ) - return bn - - -def get_fc1(last_conv, num_classes, fc_type, input_channel=512): - body = last_conv - if fc_type == "Z": - body = _batch_norm( - body, - epsilon=2e-5, - scale=False, - center=True, - is_training=True, - data_format="NCHW", - name="bn2" - ) - body = _dropout(body, 0.4) - fc1 = body - elif fc_type == "E": - body = _batch_norm( - body, - epsilon=2e-5, - is_training=True, - data_format="NCHW", - name="bn2" - ) - body = _dropout(body, dropout_prob=0.4) - - body = flow.flatten(body, 1) - fc1 = flow.layers.dense( - inputs=body, - units=num_classes, - activation=None, - use_bias=True, - kernel_initializer=_get_initializer(), - bias_initializer=flow.zeros_initializer(), - kernel_regularizer=_get_regularizer("weight"), - bias_regularizer=_get_regularizer("bias"), - trainable=True, - name="pre_fc1", - ) - fc1 = _batch_norm( - fc1, - epsilon=2e-5, - scale=False, - center=True, - is_training=True, - data_format="NCHW", - name="fc1", - ) - elif fc_type == "FC": - body = _batch_norm( - body, - epsilon=2e-5, - is_training=True, - data_format="NCHW", - name="bn2" - ) - - - body = flow.flatten(body, 1) - fc1 = flow.layers.dense( - inputs=body, - units=num_classes, - activation=None, - use_bias=True, - kernel_initializer=_get_initializer(), - bias_initializer=flow.zeros_initializer(), - kernel_regularizer=_get_regularizer("weight"), - bias_regularizer=_get_regularizer("bias"), - trainable=True, - name="fc" - ) - fc1 = _batch_norm( - fc1, - epsilon=2e-5, - scale=False, - center=True, - is_training=True, - data_format="NCHW", - name="features" - ) - elif fc_type == "GDC": - conv_6_dw = Linear( - last_conv, - num_filter=input_channel, # 512 - num_group=input_channel, # 512 - kernel=7, - pad="valid", - stride=[1, 1], - bn_is_training=True, - data_format="NCHW", - name="conv_6dw7_7", - ) - conv_6_dw = flow.reshape(conv_6_dw, (body.shape[0], -1)) - conv_6_f = flow.layers.dense( - inputs=conv_6_dw, - units=num_classes, - activation=None, - use_bias=True, - kernel_initializer=_get_initializer(), - bias_initializer=flow.zeros_initializer(), - kernel_regularizer=_get_regularizer("weight"), - bias_regularizer=_get_regularizer("bias"), - trainable=True, - name="pre_fc1", - ) - fc1 = _batch_norm( - conv_6_f, - epsilon=2e-5, - scale=False, - center=True, - is_training=True, - data_format="NCHW", - name="fc1", - ) - return fc1 diff --git a/recognition/arcface_oneflow/backbones/fmobilefacenet.py b/recognition/arcface_oneflow/backbones/fmobilefacenet.py deleted file mode 100644 index 5aa2883..0000000 --- a/recognition/arcface_oneflow/backbones/fmobilefacenet.py +++ /dev/null @@ -1,257 +0,0 @@ -import oneflow as flow -import oneflow.core.operator.op_conf_pb2 as op_conf_util -from .common import _get_initializer, _conv2d_layer, _batch_norm, _prelu, Linear, get_fc1 - - -""" -References: -https://github.com/deepinsight/insightface/blob/master/recognition/symbol/fmobilefacenet.py -""" - - -def Conv( - input_blob, - num_filter=1, - kernel=None, - stride=None, - pad="valid", - data_format="NCHW", - num_group=1, - bn_is_training=True, - name=None, - suffix="", -): - conv = _conv2d_layer( - name="%s%s_conv2d" % (name, suffix), - input=input_blob, - filters=num_filter, - kernel_size=kernel, - strides=stride, - padding=pad, - data_format=data_format, - group_num=num_group, - dilation_rate=1, - activation=None, - use_bias=False, - ) - - bn = _batch_norm( - conv, - epsilon=0.001, - is_training=bn_is_training, - data_format=data_format, - name="%s%s_batchnorm" % (name, suffix), - ) - prelu = _prelu(bn, data_format, name="%s%s_relu" % (name, suffix)) - - return prelu - - -def DResidual_v1( - input_blob, - num_out=1, - kernel=None, - stride=None, - pad="same", - num_group=1, - bn_is_training=True, - data_format="NCHW", - name=None, - suffix="", -): - conv = Conv( - input_blob=input_blob, - num_filter=num_group, - kernel=1, - pad="valid", - data_format=data_format, - stride=[1, 1], - bn_is_training=bn_is_training, - name="%s%s_conv_sep" % (name, suffix), - ) - conv_dw = Conv( - input_blob=conv, - num_filter=num_group, - num_group=num_group, - kernel=kernel, - pad=pad, - data_format=data_format, - stride=stride, - bn_is_training=bn_is_training, - name="%s%s_conv_dw" % (name, suffix), - ) - proj = Linear( - input_blob=conv_dw, - num_filter=num_out, - kernel=1, - pad="valid", - data_format=data_format, - stride=[1, 1], - bn_is_training=bn_is_training, - name="%s%s_conv_proj" % (name, suffix), - ) - return proj - - -def Residual( - input_blob, - num_block=1, - num_out=1, - kernel=None, - stride=None, - pad="same", - data_format="NCHW", - num_group=1, - bn_is_training=True, - name=None, - suffix="", -): - identity = input_blob - for i in range(num_block): - shortcut = identity - conv = DResidual_v1( - input_blob=identity, - num_out=num_out, - kernel=kernel, - stride=stride, - pad=pad, - data_format=data_format, - num_group=num_group, - - name="%s%s_block" % (name, suffix), - suffix="%d" % i, - ) - identity = flow.math.add(conv, shortcut) - return identity - - -def get_symbol(input_blob, net_blocks, config): - num_classes = config.embedding_size - fc_type = 'GDC' - data_format = "NCHW" - bn_is_training = True - - conv_1 = Conv( - input_blob, - num_filter=64, - kernel=3, - stride=[2, 2], - pad="same", - data_format=data_format, - bn_is_training=bn_is_training, - name="conv_1", - ) - - if net_blocks[0] == 1: - conv_2_dw = Conv( - conv_1, - num_filter=64, - kernel=3, - stride=[1, 1], - pad="same", - data_format=data_format, - num_group=64, - bn_is_training=bn_is_training, - name="conv_2_dw", - ) - else: - conv_2_dw = Residual( - conv_1, - num_block=net_blocks[0], - num_out=64, - kernel=3, - stride=[1, 1], - pad="same", - data_format=data_format, - num_group=64, - bn_is_training=bn_is_training, - name="res_2", - ) - - conv_23 = DResidual_v1( - conv_2_dw, - num_out=64, - kernel=3, - stride=[2, 2], - pad="same", - data_format=data_format, - num_group=128, - bn_is_training=bn_is_training, - name="dconv_23", - ) - conv_3 = Residual( - conv_23, - num_block=net_blocks[1], - num_out=64, - kernel=3, - stride=[1, 1], - pad="same", - data_format=data_format, - num_group=128, - bn_is_training=bn_is_training, - name="res_3", - ) - - conv_34 = DResidual_v1( - conv_3, - num_out=128, - kernel=3, - stride=[2, 2], - pad="same", - data_format=data_format, - num_group=256, - bn_is_training=bn_is_training, - name="dconv_34", - ) - conv_4 = Residual( - conv_34, - num_block=net_blocks[2], - num_out=128, - kernel=3, - stride=[1, 1], - pad="same", - data_format=data_format, - num_group=256, - bn_is_training=bn_is_training, - name="res_4", - ) - - conv_45 = DResidual_v1( - conv_4, - num_out=128, - kernel=3, - stride=[2, 2], - pad="same", - data_format=data_format, - num_group=512, - bn_is_training=bn_is_training, - name="dconv_45", - ) - conv_5 = Residual( - conv_45, - num_block=net_blocks[3], - num_out=128, - kernel=3, - stride=[1, 1], - pad="same", - data_format=data_format, - num_group=256, - bn_is_training=bn_is_training, - name="res_5", - ) - conv_6_sep = Conv( - conv_5, - num_filter=512, - kernel=1, - pad="valid", - data_format=data_format, - stride=[1, 1], - bn_is_training=bn_is_training, - name="conv_6sep", - ) - fc1 = get_fc1(conv_6_sep, num_classes, fc_type, input_channel=512) - return fc1 - - -def mobilefacenet(input_blob, cfg): - return get_symbol(input_blob, [1, 4, 6, 2], cfg) diff --git a/recognition/arcface_oneflow/backbones/ir_resnet.py b/recognition/arcface_oneflow/backbones/ir_resnet.py index 1202ca9..1e9a995 100644 --- a/recognition/arcface_oneflow/backbones/ir_resnet.py +++ b/recognition/arcface_oneflow/backbones/ir_resnet.py @@ -1,189 +1,219 @@ import oneflow as flow -from .common import _batch_norm, _conv2d_layer, _avg_pool, _prelu, get_fc1 +import oneflow.nn as nn +from typing import Type, Any, Callable, Union, List, Optional -def residual_unit_v3( - in_data, num_filter, stride, dim_match, bn_is_training, data_format, name -): - - suffix = "" - use_se = 0 - bn1 = _batch_norm( - in_data, - epsilon=2e-5, - is_training=bn_is_training, - data_format=data_format, - name="%s%s.bn1" % (name, suffix), - ) - conv1 = _conv2d_layer( - name="%s%s.conv1" % (name, suffix), - input=bn1, - filters=num_filter, +def conv3x3( + in_planes: int, out_planes: int, stride: int = 1, groups: int = 1, dilation: int = 1 +) -> nn.Conv2d: + """3x3 convolution with padding""" + return nn.Conv2d( + in_planes, + out_planes, kernel_size=3, - strides=[1, 1], - padding="same", - data_format=data_format, - use_bias=False, - dilation_rate=1, - activation=None, - ) - bn2 = _batch_norm( - conv1, - epsilon=2e-5, - is_training=bn_is_training, - data_format=data_format, - name="%s%s.bn2" % (name, suffix), - ) - prelu = _prelu(bn2, data_format=data_format, - name="%s%s_relu1" % (name, suffix)) - conv2 = _conv2d_layer( - name="%s%s.conv2" % (name, suffix), - input=prelu, - filters=num_filter, - kernel_size=3, - strides=stride, - padding="same", - data_format=data_format, - use_bias=False, - dilation_rate=1, - activation=None, - ) - bn3 = _batch_norm( - conv2, - epsilon=2e-5, - is_training=bn_is_training, - data_format=data_format, - name="%s%s.bn3" % (name, suffix), + stride=stride, + padding=dilation, + groups=groups, + bias=False, + dilation=dilation, ) - if use_se: - # se begin - input_blob = _avg_pool( - bn3, pool_size=[7, 7], strides=[1, 1], padding="VALID" - ) - input_blob = _conv2d_layer( - name="%s%s_se_conv1" % (name, suffix), - input=input_blob, - filters=num_filter // 16, - kernel_size=1, - strides=[1, 1], - padding="valid", - data_format=data_format, - use_bias=True, - dilation_rate=1, - activation=None, - ) - input_blob = _prelu(input_blob, name="%s%s_se_relu1" % (name, suffix)) - input_blob = _conv2d_layer( - name="%s%s_se_conv2" % (name, suffix), - input=input_blob, - filters=num_filter, - kernel_size=1, - strides=[1, 1], - padding="valid", - data_format=data_format, - use_bias=True, - dilation_rate=1, - activation=None, - ) - input_blob = flow.math.sigmoid(input=input_blob) - bn3 = flow.math.multiply(x=input_blob, y=bn3) - # se end - if dim_match: - input_blob = in_data - else: - input_blob = _conv2d_layer( - name="%s%s.downsample.0" % (name, suffix), - input=in_data, - filters=num_filter, - kernel_size=1, - strides=stride, - padding="valid", - data_format=data_format, - use_bias=False, - dilation_rate=1, - activation=None, - ) - input_blob = _batch_norm( - input_blob, - epsilon=2e-5, - is_training=bn_is_training, - data_format=data_format, - name="%s%s.downsample.1" % (name, suffix), - ) - - identity = flow.math.add(x=bn3, y=input_blob) - return identity +def conv1x1(in_planes: int, out_planes: int, stride: int = 1) -> nn.Conv2d: + """1x1 convolution""" + return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) -def get_symbol(input_blob, units, cfg): - filter_list = [64, 64, 128, 256, 512] - num_stages = 4 - units = units +class IBasicBlock(nn.Module): + expansion = 1 - num_classes = cfg.embedding_size + def __init__( + self, + inplanes, + planes, + stride=1, + downsample=None, + groups=1, + base_width=64, + dilation=1, + ): + super(IBasicBlock, self).__init__() + if groups != 1 or base_width != 64: + raise ValueError("BasicBlock only supports groups=1 and base_width=64") + if dilation > 1: + raise NotImplementedError("Dilation > 1 not supported in BasicBlock") + self.bn1 = nn.BatchNorm2d(inplanes, eps=1e-05,) + self.conv1 = conv3x3(inplanes, planes) + self.bn2 = nn.BatchNorm2d(planes, eps=1e-05,) + self.prelu = nn.ReLU(planes) + self.conv2 = conv3x3(planes, planes, stride) + self.bn3 = nn.BatchNorm2d(planes, eps=1e-05,) + self.downsample = downsample + self.stride = stride - fc_type = cfg.fc_type - bn_is_training = True - data_format = "NCHW" + def forward(self, x): + identity = x + out = self.bn1(x) + out = self.conv1(out) + out = self.bn2(out) + out = self.prelu(out) + out = self.conv2(out) + out = self.bn3(out) + if self.downsample is not None: + identity = self.downsample(x) + out += identity + return out - input_blob = _conv2d_layer( - name="conv1", - input=input_blob, - filters=filter_list[0], - kernel_size=3, - strides=[1, 1], - padding="same", - data_format=data_format, - use_bias=False, - dilation_rate=1, - activation=None, - ) - input_blob = _batch_norm( - input_blob, epsilon=2e-5, is_training=bn_is_training, data_format=data_format, name="bn1" - ) - input_blob = _prelu(input_blob, data_format=data_format, name="relu0") - for i in range(num_stages): - input_blob = residual_unit_v3( - input_blob, - filter_list[i + 1], - [2, 2], - False, - bn_is_training=bn_is_training, - data_format=data_format, - name="layer%d.%d" % (i + 1, 0), - ) - for j in range(units[i] - 1): - input_blob = residual_unit_v3( - input_blob, - filter_list[i + 1], - [1, 1], - True, - bn_is_training=bn_is_training, - data_format=data_format, - name="layer%d.%d" % (i + 1, j + 1), +class IResNet(nn.Module): + fc_scale = 7 * 7 + + def __init__( + self, + block, + layers, + dropout=0, + num_features=512, + zero_init_residual=False, + groups=1, + width_per_group=64, + replace_stride_with_dilation=None, + fp16=False, + ): + super(IResNet, self).__init__() + self.fp16 = fp16 + self.inplanes = 64 + self.dilation = 1 + if replace_stride_with_dilation is None: + replace_stride_with_dilation = [False, False, False] + if len(replace_stride_with_dilation) != 3: + raise ValueError( + "replace_stride_with_dilation should be None " + "or a 3-element tuple, got {}".format(replace_stride_with_dilation) ) - fc1 = get_fc1(input_blob, num_classes, fc_type) - return fc1 + self.groups = groups + self.base_width = width_per_group + self.conv1 = nn.Conv2d( + 3, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False + ) + self.bn1 = nn.BatchNorm2d(self.inplanes, eps=1e-05) + self.prelu = nn.ReLU(self.inplanes) + self.layer1 = self._make_layer(block, 64, layers[0], stride=2) + self.layer2 = self._make_layer( + block, 128, layers[1], stride=2, dilate=replace_stride_with_dilation[0] + ) + self.layer3 = self._make_layer( + block, 256, layers[2], stride=2, dilate=replace_stride_with_dilation[1] + ) + self.layer4 = self._make_layer( + block, 512, layers[3], stride=2, dilate=replace_stride_with_dilation[2] + ) + self.bn2 = nn.BatchNorm2d(512 * block.expansion, eps=1e-05,) + self.dropout = nn.Dropout(p=dropout, inplace=True) + self.fc = nn.Linear(512 * block.expansion * self.fc_scale, num_features) + self.features = nn.BatchNorm1d(num_features, eps=1e-05) + nn.init.constant_(self.features.weight, 1.0) + self.features.weight.requires_grad = False + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.normal_(m.weight, 0, 0.1) + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + if zero_init_residual: + for m in self.modules(): + if isinstance(m, IBasicBlock): + nn.init.constant_(m.bn2.weight, 0) + + def _make_layer(self, block, planes, blocks, stride=1, dilate=False): + downsample = None + previous_dilation = self.dilation + if dilate: + self.dilation *= stride + stride = 1 + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + conv1x1(self.inplanes, planes * block.expansion, stride), + nn.BatchNorm2d(planes * block.expansion, eps=1e-05,), + ) + layers = [] + layers.append( + block( + self.inplanes, + planes, + stride, + downsample, + self.groups, + self.base_width, + previous_dilation, + ) + ) + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append( + block( + self.inplanes, + planes, + groups=self.groups, + base_width=self.base_width, + dilation=self.dilation, + ) + ) + + return nn.Sequential(*layers) + + def forward(self, x): + + x = self.conv1(x) + x = self.bn1(x) + x = self.prelu(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + x = self.bn2(x) + x = flow.flatten(x, 1) + x = self.dropout(x) + x = self.fc(x) + x = self.features(x) + + return x -def iresnet18(input_blob, cfg): - return get_symbol([2, 2, 2, 2], cfg) +def _iresnet(arch, block, layers, pretrained, progress, **kwargs): + model = IResNet(block, layers, **kwargs) + if pretrained: + raise ValueError() + return model -def iresnet34(input_blob, cfg): - return get_symbol(input_blob, [3, 4, 6, 3], cfg) +def iresnet18(pretrained=False, progress=True, **kwargs): + return _iresnet( + "iresnet18", IBasicBlock, [2, 2, 2, 2], pretrained, progress, **kwargs + ) -def iresnet50(input_blob, cfg): - return get_symbol(input_blob, [3, 4, 14, 3], cfg) +def iresnet34(pretrained=False, progress=True, **kwargs): + return _iresnet( + "iresnet34", IBasicBlock, [3, 4, 6, 3], pretrained, progress, **kwargs + ) -def iresnet100(input_blob, cfg): - return get_symbol(input_blob, [3, 13, 30, 3], cfg) +def iresnet50(pretrained=False, progress=True, **kwargs): + return _iresnet( + "iresnet50", IBasicBlock, [3, 4, 14, 3], pretrained, progress, **kwargs + ) -def iresnet200(input_blob, cfg): - return get_symbol(input_blob, [6, 26, 60, 6], cfg) +def iresnet100(pretrained=False, progress=True, **kwargs): + return _iresnet( + "iresnet100", IBasicBlock, [3, 13, 30, 3], pretrained, progress, **kwargs + ) + + +def iresnet200(pretrained=False, progress=True, **kwargs): + return _iresnet( + "iresnet200", IBasicBlock, [6, 26, 60, 6], pretrained, progress, **kwargs + ) diff --git a/recognition/arcface_oneflow/configs/base.py b/recognition/arcface_oneflow/configs/base.py index 7f14c05..e427248 100644 --- a/recognition/arcface_oneflow/configs/base.py +++ b/recognition/arcface_oneflow/configs/base.py @@ -1,68 +1,53 @@ +from pickle import TRUE from easydict import EasyDict as edict -import math -import numpy as np + # make training faster # our RAM is 256G # mount -t tmpfs -o size=140G tmpfs /train_tmp config = edict() -config.loss = "cosface" +config.loss = "arcface" config.network = "r50" config.resume = False config.output = "ms1mv3_arcface_r50" config.dataset = "ms1m-retinaface-t1" config.embedding_size = 512 -config.sample_rate = 1 -config.fp16 = True +config.fp16 = False +config.model_parallel = False +config.sample_rate = 1.0 +config.partial_fc = False +config.graph = True +config.synthetic = False +config.scale_grad = False + config.momentum = 0.9 config.weight_decay = 5e-4 config.batch_size = 128 config.lr = 0.1 # batch size is 512 -config.model_load_dir = '' - -config.val_batch_size = 10 - -config.node_ips = ["192.168.1.13"] -config.num_nodes = 1 -config.device_num_per_node = 1 -config.model_parallel = 1 -config.partial_fc = 0 - -config.use_synthetic_data = False - - -config.fc_type = "FC" -config.nccl_fusion_threshold_mb = 16 -config.nccl_fusion_max_ops = 64 -config.val_dataset_dir = "/train_tmp/glint360k/val" - - -config.part_name_prefix = "part-" -config.part_name_suffix_length = 5 -config.train_data_part_num = 16 -config.shuffle = True - - +config.val_image_num = {"lfw": 12000, "cfp_fp": 14000, "agedb_30": 12000} if config.dataset == "emore": config.ofrecord_path = "/train_tmp/faces_emore" config.num_classes = 85742 config.num_image = 5822653 config.num_epoch = 16 config.warmup_epoch = -1 - config.decay_epoch = [8, 14, ] - config.val_targets = ["lfw", ] - config.train_data_part_num = 32 + config.decay_epoch = [ + 8, + 14, + ] + config.val_targets = [ + "lfw", + ] elif config.dataset == "ms1m-retinaface-t1": config.ofrecord_path = "/dev/shm/ms1m-retinaface-t1/ofrecord" - config.num_classes = 93432 + config.num_classes = 93431 config.num_image = 5179510 config.num_epoch = 25 config.warmup_epoch = -1 config.decay_epoch = [11, 17, 22] config.val_targets = ["lfw", "cfp_fp", "agedb_30"] - config.train_data_part_num = 32 elif config.dataset == "glint360k": config.ofrecord_path = "/train_tmp/glint360k" diff --git a/recognition/arcface_oneflow/configs/face_emore_r100.py b/recognition/arcface_oneflow/configs/face_emore_r100.py deleted file mode 100644 index 6d6364c..0000000 --- a/recognition/arcface_oneflow/configs/face_emore_r100.py +++ /dev/null @@ -1,35 +0,0 @@ -from easydict import EasyDict as edict - -# make training faster -# our RAM is 256G -# mount -t tmpfs -o size=140G tmpfs /train_tmp - -config = edict() -config.loss = "cosface" -config.network = "r100" -config.resume = False -config.output = "lazy_r100" -config.embedding_size = 512 -config.fp16 = True -config.momentum = 0.9 -config.weight_decay = 5e-4 -config.batch_size = 128 -config.lr = 0.1 # batch size is 512 -config.model_parallel = True -config.partial_fc = 1 -config.sample_rate = 1.0 -config.device_num_per_node = 8 - - -config.ofrecord_path = "/dev/shm/faces_emore/ofrecord/train" -config.eval_ofrecord_path = "/dev/shm/faces_emore/ofrecord/val" -config.num_classes = 85742 -config.num_image = 5822653 -config.num_epoch = 16 -config.warmup_epoch = -1 -config.decay_epoch = [8, 14, ] -config.val_targets = ["lfw", "cfp_fp", "agedb_30"] - - -config.node_ips = ["192.168.1.13"] -config.num_nodes = 1 diff --git a/recognition/arcface_oneflow/configs/glint360k_mbf.py b/recognition/arcface_oneflow/configs/glint360k_mbf.py index fa05ca7..96697a1 100644 --- a/recognition/arcface_oneflow/configs/glint360k_mbf.py +++ b/recognition/arcface_oneflow/configs/glint360k_mbf.py @@ -8,26 +8,20 @@ config = edict() config.loss = "cosface" config.network = "mbf" config.resume = False -config.output = "lazy_mbf" -config.embedding_size = 128 +config.output = None +config.embedding_size = 512 +config.partial_fc = 1 +config.sample_rate = 0.1 +config.model_parallel = True config.fp16 = True config.momentum = 0.9 -config.weight_decay = 5e-4 +config.weight_decay = 2e-4 config.batch_size = 128 config.lr = 0.1 # batch size is 512 -config.model_parallel = True -config.partial_fc = 1 -config.sample_rate = 1.0 -config.device_num_per_node = 8 - -config.ofrecord_path = "/train_tmp/glint360k/train" -config.eval_ofrecord_path = "/train_tmp/glint360k/val" -config.num_classes = 93432 -config.num_image = 5179510 -config.train_data_part_num = 200 - -config.ofrecord_path = "/train_tmp/glint360k" +config.dataset = "glint360k" +config.ofrecord_path = "/train_tmp/glint360k/" +config.ofrecord_part_num = 200 config.num_classes = 360232 config.num_image = 17091657 config.num_epoch = 20 diff --git a/recognition/arcface_oneflow/configs/glint360k_r100.py b/recognition/arcface_oneflow/configs/glint360k_r100.py index b0ab366..7a41518 100644 --- a/recognition/arcface_oneflow/configs/glint360k_r100.py +++ b/recognition/arcface_oneflow/configs/glint360k_r100.py @@ -8,22 +8,21 @@ config = edict() config.loss = "cosface" config.network = "r100" config.resume = False -config.output = "lazy_r100" +config.output = None config.embedding_size = 512 +config.partial_fc = 1 +config.sample_rate = 0.1 +config.model_parallel = True config.fp16 = True config.momentum = 0.9 config.weight_decay = 5e-4 config.batch_size = 128 config.lr = 0.1 # batch size is 512 -config.model_parallel = True -config.partial_fc = 1 -config.sample_rate = 1.0 -config.device_num_per_node = 8 -config.ofrecord_path = "/train_tmp/glint360k/train" -config.eval_ofrecord_path = "/train_tmp/glint360k/val" -config.train_data_part_num = 200 +config.dataset = "glint360k" +config.ofrecord_path = "/train_tmp/glint360k/" +config.ofrecord_part_num = 200 config.num_classes = 360232 config.num_image = 17091657 config.num_epoch = 20 diff --git a/recognition/arcface_oneflow/configs/glint360k_r18.py b/recognition/arcface_oneflow/configs/glint360k_r18.py index 8371559..031fd02 100644 --- a/recognition/arcface_oneflow/configs/glint360k_r18.py +++ b/recognition/arcface_oneflow/configs/glint360k_r18.py @@ -8,26 +8,20 @@ config = edict() config.loss = "cosface" config.network = "r18" config.resume = False -config.output = "lazy_r18" +config.output = None config.embedding_size = 512 +config.partial_fc = 1 +config.sample_rate = 0.1 +config.model_parallel = True config.fp16 = True config.momentum = 0.9 config.weight_decay = 5e-4 config.batch_size = 128 config.lr = 0.1 # batch size is 512 -config.model_parallel = True -config.partial_fc = 1 -config.sample_rate = 1.0 -config.device_num_per_node = 8 - -config.ofrecord_path = "/train_tmp/glint360k/train" -config.eval_ofrecord_path = "/train_tmp/glint360k/val" -config.num_classes = 93432 -config.num_image = 5179510 -config.train_data_part_num = 200 - -config.ofrecord_path = "/train_tmp/glint360k" +config.dataset = "glint360k" +config.ofrecord_path = "/train_tmp/glint360k/" +config.ofrecord_part_num = 200 config.num_classes = 360232 config.num_image = 17091657 config.num_epoch = 20 diff --git a/recognition/arcface_oneflow/configs/glint360k_r34.py b/recognition/arcface_oneflow/configs/glint360k_r34.py index 8542f49..b072b02 100644 --- a/recognition/arcface_oneflow/configs/glint360k_r34.py +++ b/recognition/arcface_oneflow/configs/glint360k_r34.py @@ -8,26 +8,20 @@ config = edict() config.loss = "cosface" config.network = "r34" config.resume = False -config.output = "lazy_r34" +config.output = None config.embedding_size = 512 +config.partial_fc = 1 +config.sample_rate = 0.1 +config.model_parallel = True config.fp16 = True config.momentum = 0.9 config.weight_decay = 5e-4 config.batch_size = 128 config.lr = 0.1 # batch size is 512 -config.model_parallel = True -config.partial_fc = 1 -config.sample_rate = 1.0 -config.device_num_per_node = 8 - -config.ofrecord_path = "/train_tmp/glint360k/train" -config.eval_ofrecord_path = "/train_tmp/glint360k/val" -config.num_classes = 93432 -config.num_image = 5179510 -config.train_data_part_num = 200 - -config.ofrecord_path = "/train_tmp/glint360k" +config.dataset = "glint360k" +config.ofrecord_path = "/train_tmp/glint360k/" +config.ofrecord_part_num = 200 config.num_classes = 360232 config.num_image = 17091657 config.num_epoch = 20 diff --git a/recognition/arcface_oneflow/configs/glint360k_r50.py b/recognition/arcface_oneflow/configs/glint360k_r50.py index b050b16..90add48 100644 --- a/recognition/arcface_oneflow/configs/glint360k_r50.py +++ b/recognition/arcface_oneflow/configs/glint360k_r50.py @@ -8,26 +8,20 @@ config = edict() config.loss = "cosface" config.network = "r50" config.resume = False -config.output = "lazy_r50" +config.output = None config.embedding_size = 512 +config.partial_fc = 1 +config.sample_rate = 0.1 +config.model_parallel = True config.fp16 = True config.momentum = 0.9 config.weight_decay = 5e-4 config.batch_size = 128 config.lr = 0.1 # batch size is 512 -config.model_parallel = True -config.partial_fc = 1 -config.sample_rate = 1.0 -config.device_num_per_node = 8 - -config.ofrecord_path = "/train_tmp/glint360k/train" -config.eval_ofrecord_path = "/train_tmp/glint360k/val" -config.num_classes = 93432 -config.num_image = 5179510 -config.train_data_part_num = 200 - -config.ofrecord_path = "/train_tmp/glint360k" +config.dataset = "glint360k" +config.ofrecord_path = "/train_tmp/glint360k/" +config.ofrecord_part_num = 200 config.num_classes = 360232 config.num_image = 17091657 config.num_epoch = 20 diff --git a/recognition/arcface_oneflow/configs/ms1V3_r100.py b/recognition/arcface_oneflow/configs/ms1V3_r100.py deleted file mode 100644 index 4822026..0000000 --- a/recognition/arcface_oneflow/configs/ms1V3_r100.py +++ /dev/null @@ -1,37 +0,0 @@ -from easydict import EasyDict as edict - -# make training faster -# our RAM is 256G -# mount -t tmpfs -o size=140G tmpfs /train_tmp - -config = edict() -config.loss = "cosface" -config.network = "r100" -config.resume = False -config.output = "lazy_r100" -config.embedding_size = 512 -config.fp16 = True -config.momentum = 0.9 -config.weight_decay = 5e-4 -config.batch_size = 128 -config.lr = 0.1 # batch size is 512 -config.model_parallel = True -config.partial_fc = 1 -config.sample_rate = 1.0 -config.device_num_per_node = 8 - - -config.ofrecord_path = "/dev/shm/ms1m-retinaface-t1/ofrecord/train" -config.eval_ofrecord_path = "/dev/shm/ms1m-retinaface-t1/ofrecord/val" -config.num_classes = 93432 -config.num_image = 5179510 -config.train_data_part_num = 32 - -config.num_epoch = 25 -config.warmup_epoch = -1 -config.decay_epoch = [10, 16, 22] -config.val_targets = ["lfw", "cfp_fp", "agedb_30"] -#config.val_targets = [] - -config.node_ips = ["192.168.1.13"] -config.num_nodes = 1 diff --git a/recognition/arcface_oneflow/configs/ms1mv3_mbf.py b/recognition/arcface_oneflow/configs/ms1mv3_mbf.py index 1ded274..917fb13 100644 --- a/recognition/arcface_oneflow/configs/ms1mv3_mbf.py +++ b/recognition/arcface_oneflow/configs/ms1mv3_mbf.py @@ -5,33 +5,25 @@ from easydict import EasyDict as edict # mount -t tmpfs -o size=140G tmpfs /train_tmp config = edict() -config.loss = "cosface" +config.loss = "arcface" config.network = "mbf" config.resume = False -config.output = "lazy_mbf" -config.embedding_size = 128 -config.fp16 = True -config.momentum = 0.9 -config.weight_decay = 5e-4 -config.batch_size = 128 -config.lr = 0.1 # batch size is 512 +config.output = None +config.embedding_size = 512 config.model_parallel = True config.partial_fc = 1 -config.sample_rate = 1.0 -config.device_num_per_node = 8 +config.sample_rate = 0.1 +config.fp16 = True +config.momentum = 0.9 +config.weight_decay = 2e-4 +config.batch_size = 128 +config.lr = 0.1 # batch size is 512 - -config.ofrecord_path = "/dev/shm/ms1m-retinaface-t1/ofrecord/train" -config.eval_ofrecord_path = "/dev/shm/ms1m-retinaface-t1/ofrecord/val" -config.num_classes = 93432 +config.ofrecord_path = "/train_tmp/ms1m-retinaface-t1" +config.ofrecord_part_num = 8 +config.num_classes = 93431 config.num_image = 5179510 -config.train_data_part_num = 32 - -config.num_epoch = 25 +config.num_epoch = 30 config.warmup_epoch = -1 -config.decay_epoch = [10, 16, 22] +config.decay_epoch = [10, 20, 25] config.val_targets = ["lfw", "cfp_fp", "agedb_30"] - - -config.node_ips = ["192.168.1.13"] -config.num_nodes = 1 diff --git a/recognition/arcface_oneflow/configs/ms1mv3_r18.py b/recognition/arcface_oneflow/configs/ms1mv3_r18.py index 15ba990..7b2168a 100644 --- a/recognition/arcface_oneflow/configs/ms1mv3_r18.py +++ b/recognition/arcface_oneflow/configs/ms1mv3_r18.py @@ -5,33 +5,25 @@ from easydict import EasyDict as edict # mount -t tmpfs -o size=140G tmpfs /train_tmp config = edict() -config.loss = "cosface" +config.loss = "arcface" config.network = "r18" config.resume = False -config.output = "lazy_r18" +config.output = None config.embedding_size = 512 +config.model_parallel = True +config.partial_fc = 1 +config.sample_rate = 0.1 config.fp16 = True config.momentum = 0.9 config.weight_decay = 5e-4 config.batch_size = 128 config.lr = 0.1 # batch size is 512 -config.model_parallel = True -config.partial_fc = 1 -config.sample_rate = 1.0 -config.device_num_per_node = 8 - -config.ofrecord_path = "/dev/shm/ms1m-retinaface-t1/ofrecord/train" -config.eval_ofrecord_path = "/dev/shm/ms1m-retinaface-t1/ofrecord/val" -config.num_classes = 93432 +config.ofrecord_path = "/train_tmp/ms1m-retinaface-t1" +config.ofrecord_part_num = 8 +config.num_classes = 93431 config.num_image = 5179510 -config.train_data_part_num = 32 - config.num_epoch = 25 config.warmup_epoch = -1 config.decay_epoch = [10, 16, 22] config.val_targets = ["lfw", "cfp_fp", "agedb_30"] -#config.val_targets = [] - -config.node_ips = ["192.168.1.13"] -config.num_nodes = 1 diff --git a/recognition/arcface_oneflow/configs/ms1mv3_r34.py b/recognition/arcface_oneflow/configs/ms1mv3_r34.py index e61eb3a..c95aef0 100644 --- a/recognition/arcface_oneflow/configs/ms1mv3_r34.py +++ b/recognition/arcface_oneflow/configs/ms1mv3_r34.py @@ -5,33 +5,26 @@ from easydict import EasyDict as edict # mount -t tmpfs -o size=140G tmpfs /train_tmp config = edict() -config.loss = "cosface" +config.loss = "arcface" config.network = "r34" config.resume = False -config.output = "lazy_r34" +config.output = None config.embedding_size = 512 +config.model_parallel = True +config.partial_fc = 1 +config.sample_rate = 0.1 config.fp16 = True config.momentum = 0.9 config.weight_decay = 5e-4 config.batch_size = 128 config.lr = 0.1 # batch size is 512 -config.model_parallel = True -config.partial_fc = 1 -config.sample_rate = 1.0 -config.device_num_per_node = 8 -config.ofrecord_path = "/dev/shm/ms1m-retinaface-t1/ofrecord/train" -config.eval_ofrecord_path = "/dev/shm/ms1m-retinaface-t1/ofrecord/val" -config.num_classes = 93432 +config.ofrecord_path = "/train_tmp/ms1m-retinaface-t1" +config.ofrecord_part_num = 8 +config.num_classes = 93431 config.num_image = 5179510 -config.train_data_part_num = 32 - config.num_epoch = 25 config.warmup_epoch = -1 config.decay_epoch = [10, 16, 22] config.val_targets = ["lfw", "cfp_fp", "agedb_30"] -#config.val_targets = [] - -config.node_ips = ["192.168.1.13"] -config.num_nodes = 1 diff --git a/recognition/arcface_oneflow/configs/ms1mv3_r50.py b/recognition/arcface_oneflow/configs/ms1mv3_r50.py index cdde5b9..c7dede3 100644 --- a/recognition/arcface_oneflow/configs/ms1mv3_r50.py +++ b/recognition/arcface_oneflow/configs/ms1mv3_r50.py @@ -8,30 +8,23 @@ config = edict() config.loss = "cosface" config.network = "r50" config.resume = False -config.output = "lazy_r50" +config.output = "partial_fc" config.embedding_size = 512 +config.model_parallel = True +config.partial_fc = 0 +config.sample_rate = 0.1 config.fp16 = True config.momentum = 0.9 config.weight_decay = 5e-4 config.batch_size = 128 config.lr = 0.1 # batch size is 512 -config.model_parallel = True -config.partial_fc = 1 -config.sample_rate = 1.0 -config.device_num_per_node = 8 -config.ofrecord_path = "/dev/shm/ms1m-retinaface-t1/ofrecord/train" -config.eval_ofrecord_path = "/dev/shm/ms1m-retinaface-t1/ofrecord/val" +config.ofrecord_path = "/train_tmp/ms1m-retinaface-t1/ofrecord/" +config.ofrecord_part_num = 8 config.num_classes = 93432 config.num_image = 5179510 -config.train_data_part_num = 32 - config.num_epoch = 25 config.warmup_epoch = -1 config.decay_epoch = [10, 16, 22] config.val_targets = ["lfw", "cfp_fp", "agedb_30"] - - -config.node_ips = ["192.168.1.13"] -config.num_nodes = 1 diff --git a/recognition/arcface_oneflow/configs/speed.py b/recognition/arcface_oneflow/configs/speed.py index 4c63f77..cae0b8b 100644 --- a/recognition/arcface_oneflow/configs/speed.py +++ b/recognition/arcface_oneflow/configs/speed.py @@ -8,17 +8,17 @@ config.network = "r50" config.resume = False config.output = None config.embedding_size = 512 +config.model_parallel = True config.sample_rate = 1.0 -config.fp16 = True +config.fp16 = False config.momentum = 0.9 config.weight_decay = 5e-4 config.batch_size = 128 config.lr = 0.1 # batch size is 512 -config.rec = "synthetic" -config.num_classes = 100 * 10000 +config.synthetic = True +config.num_classes = 100000 config.num_epoch = 30 config.warmup_epoch = -1 config.decay_epoch = [10, 16, 22] config.val_targets = [] -config.use_synthetic_data = True diff --git a/recognition/arcface_oneflow/convert.sh b/recognition/arcface_oneflow/convert.sh index 1f23d68..1ad7d8a 100644 --- a/recognition/arcface_oneflow/convert.sh +++ b/recognition/arcface_oneflow/convert.sh @@ -1 +1,2 @@ -python oneflow2onnx.py configs/ms1mv3_r50 --model_path work_dir/lazy_r50/snapshot_0 \ No newline at end of file + +python3 oneflow2onnx.py configs/ms1mv3_r50 --model_path /workdir/epoch_0 diff --git a/recognition/arcface_oneflow/onnx_helper.py b/recognition/arcface_oneflow/eval/onnx_helper.py similarity index 70% rename from recognition/arcface_oneflow/onnx_helper.py rename to recognition/arcface_oneflow/eval/onnx_helper.py index 4060503..b58cd53 100644 --- a/recognition/arcface_oneflow/onnx_helper.py +++ b/recognition/arcface_oneflow/eval/onnx_helper.py @@ -25,13 +25,13 @@ class ArcFaceORT: return "model_path should be directory" onnx_files = [] for _file in os.listdir(self.model_path): - print('file_:', _file) - if _file.endswith('.onnx'): + print("file_:", _file) + if _file.endswith(".onnx"): onnx_files.append(osp.join(self.model_path, _file)) if len(onnx_files) == 0: return "do not have onnx files" self.model_file = sorted(onnx_files)[-1] - print('use onnx-model:', self.model_file) + print("use onnx-model:", self.model_file) try: session = onnxruntime.InferenceSession(self.model_file, None) except: @@ -39,18 +39,18 @@ class ArcFaceORT: input_cfg = session.get_inputs()[0] input_shape = input_cfg.shape - print('input-shape:', input_shape) + print("input-shape:", input_shape) if len(input_shape) != 4: return "length of input_shape should be 4" if not isinstance(input_shape[0], str): # return "input_shape[0] should be str to support batch-inference" - print('reset input-shape[0] to None') + print("reset input-shape[0] to None") model = onnx.load(self.model_file) - model.graph.input[0].type.tensor_type.shape.dim[0].dim_param = 'None' - new_model_file = osp.join(self.model_path, 'zzzzrefined.onnx') + model.graph.input[0].type.tensor_type.shape.dim[0].dim_param = "None" + new_model_file = osp.join(self.model_path, "zzzzrefined.onnx") onnx.save(model, new_model_file) self.model_file = new_model_file - print('use new onnx-model:', self.model_file) + print("use new onnx-model:", self.model_file) try: session = onnxruntime.InferenceSession(self.model_file, None) except: @@ -58,7 +58,7 @@ class ArcFaceORT: input_cfg = session.get_inputs()[0] input_shape = input_cfg.shape - print('new-input-shape:', input_shape) + print("new-input-shape:", input_shape) self.image_size = tuple(input_shape[2:4][::-1]) @@ -82,28 +82,30 @@ class ArcFaceORT: input_size = (112, 112) self.crop = None if True: - crop_file = osp.join(self.model_path, 'crop.txt') + crop_file = osp.join(self.model_path, "crop.txt") if osp.exists(crop_file): - lines = open(crop_file, 'r').readlines() + lines = open(crop_file, "r").readlines() if len(lines) != 6: return "crop.txt should contain 6 lines" lines = [int(x) for x in lines] self.crop = lines[:4] input_size = tuple(lines[4:6]) if input_size != self.image_size: - return "input-size is inconsistant with onnx model input, %s vs %s" % (input_size, self.image_size) + return "input-size is inconsistant with onnx model input, %s vs %s" % ( + input_size, + self.image_size, + ) - self.model_size_mb = os.path.getsize( - self.model_file) / float(1024 * 1024) + self.model_size_mb = os.path.getsize(self.model_file) / float(1024 * 1024) if self.model_size_mb > max_model_size_mb: return "max model size exceed, given %.3f-MB" % self.model_size_mb input_mean = None input_std = None if True: - pn_file = osp.join(self.model_path, 'pixel_norm.txt') + pn_file = osp.join(self.model_path, "pixel_norm.txt") if osp.exists(pn_file): - lines = open(pn_file, 'r').readlines() + lines = open(pn_file, "r").readlines() if len(lines) != 2: return "pixel_norm.txt should contain 2 lines" input_mean = float(lines[0]) @@ -116,9 +118,9 @@ class ArcFaceORT: find_mul = False for nid, node in enumerate(graph.node[:8]): print(nid, node.name) - if node.name.startswith('Sub') or node.name.startswith('_minus'): + if node.name.startswith("Sub") or node.name.startswith("_minus"): find_sub = True - if node.name.startswith('Mul') or node.name.startswith('_mul'): + if node.name.startswith("Mul") or node.name.startswith("_mul"): find_mul = True if find_sub and find_mul: # mxnet arcface model @@ -134,10 +136,11 @@ class ArcFaceORT: dt = weight_array.dtype if dt.itemsize < 4: - return 'invalid weight type - (%s:%s)' % (initn.name, dt.name) + return "invalid weight type - (%s:%s)" % (initn.name, dt.name) if test_img is None: - test_img = np.random.randint(0, 255, size=( - self.image_size[1], self.image_size[0], 3), dtype=np.uint8) + test_img = np.random.randint( + 0, 255, size=(self.image_size[1], self.image_size[0], 3), dtype=np.uint8 + ) else: test_img = cv2.resize(test_img, self.image_size) feat, cost = self.benchmark(test_img) @@ -149,12 +152,23 @@ class ArcFaceORT: return "max time cost exceed, given %.4f" % cost_ms self.cost_ms = cost_ms print( - 'check stat:, model-size-mb: %.4f, feat-dim: %d, time-cost-ms: %.4f, input-mean: %.3f, input-std: %.3f' % ( - self.model_size_mb, self.feat_dim, self.cost_ms, self.input_mean, self.input_std)) + "check stat:, model-size-mb: %.4f, feat-dim: %d, time-cost-ms: %.4f, input-mean: %.3f, input-std: %.3f" + % ( + self.model_size_mb, + self.feat_dim, + self.cost_ms, + self.input_mean, + self.input_std, + ) + ) return None def meta_info(self): - return {'model-size-mb': self.model_size_mb, 'feature-dim': self.feat_dim, 'infer': self.cost_ms} + return { + "model-size-mb": self.model_size_mb, + "feature-dim": self.feat_dim, + "infer": self.cost_ms, + } def forward(self, imgs): if not isinstance(imgs, list): @@ -163,32 +177,39 @@ class ArcFaceORT: if self.crop is not None: nimgs = [] for img in imgs: - nimg = img[self.crop[1]:self.crop[3], - self.crop[0]:self.crop[2], :] + nimg = img[self.crop[1] : self.crop[3], self.crop[0] : self.crop[2], :] if nimg.shape[0] != input_size[1] or nimg.shape[1] != input_size[0]: nimg = cv2.resize(nimg, input_size) nimgs.append(nimg) imgs = nimgs - blob = cv2.dnn.blobFromImages(imgs, 1.0 / self.input_std, input_size, - (self.input_mean, self.input_mean, self.input_mean), swapRB=True) - net_out = self.session.run( - self.output_names, {self.input_name: blob})[0] + blob = cv2.dnn.blobFromImages( + imgs, + 1.0 / self.input_std, + input_size, + (self.input_mean, self.input_mean, self.input_mean), + swapRB=True, + ) + net_out = self.session.run(self.output_names, {self.input_name: blob})[0] return net_out def benchmark(self, img): input_size = self.image_size if self.crop is not None: - nimg = img[self.crop[1]:self.crop[3], self.crop[0]:self.crop[2], :] + nimg = img[self.crop[1] : self.crop[3], self.crop[0] : self.crop[2], :] if nimg.shape[0] != input_size[1] or nimg.shape[1] != input_size[0]: nimg = cv2.resize(nimg, input_size) img = nimg - blob = cv2.dnn.blobFromImage(img, 1.0 / self.input_std, input_size, - (self.input_mean, self.input_mean, self.input_mean), swapRB=True) + blob = cv2.dnn.blobFromImage( + img, + 1.0 / self.input_std, + input_size, + (self.input_mean, self.input_mean, self.input_mean), + swapRB=True, + ) costs = [] for _ in range(50): ta = datetime.datetime.now() - net_out = self.session.run( - self.output_names, {self.input_name: blob})[0] + net_out = self.session.run(self.output_names, {self.input_name: blob})[0] tb = datetime.datetime.now() cost = (tb - ta).total_seconds() costs.append(cost) @@ -197,9 +218,10 @@ class ArcFaceORT: return net_out, cost -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( - "--model_root", help="onnx model root, default is './'", default="./") + "--model_root", help="onnx model root, default is './'", default="./" + ) args = parser.parse_args() ArcFaceORT(args.model_root).check() diff --git a/recognition/arcface_oneflow/onnx_ijbc.py b/recognition/arcface_oneflow/eval/onnx_ijbc.py similarity index 68% rename from recognition/arcface_oneflow/onnx_ijbc.py rename to recognition/arcface_oneflow/eval/onnx_ijbc.py index 05b50bf..0d07d81 100644 --- a/recognition/arcface_oneflow/onnx_ijbc.py +++ b/recognition/arcface_oneflow/eval/onnx_ijbc.py @@ -20,8 +20,10 @@ SRC = np.array( [65.5318, 51.5014], [48.0252, 71.7366], [33.5493, 92.3655], - [62.7299, 92.2041]] - , dtype=np.float32) + [62.7299, 92.2041], + ], + dtype=np.float32, +) SRC[:, 0] += 8.0 @@ -36,10 +38,12 @@ class AlignedDataSet(mx.gluon.data.Dataset): def __getitem__(self, idx): each_line = self.lines[idx] - name_lmk_score = each_line.strip().split(' ') + name_lmk_score = each_line.strip().split(" ") name = os.path.join(self.root, name_lmk_score[0]) img = cv2.cvtColor(cv2.imread(name), cv2.COLOR_BGR2RGB) - landmark5 = np.array([float(x) for x in name_lmk_score[1:-1]], dtype=np.float32).reshape((5, 2)) + landmark5 = np.array( + [float(x) for x in name_lmk_score[1:-1]], dtype=np.float32 + ).reshape((5, 2)) st = skimage.transform.SimilarityTransform() st.estimate(landmark5, SRC) img = cv2.warpAffine(img, st.params[0:2, :], (112, 112), borderValue=0.0) @@ -60,15 +64,21 @@ def extract(model_root, dataset): return mx.nd.concat(*data, dim=0) data_loader = mx.gluon.data.DataLoader( - dataset, 128, last_batch='keep', num_workers=4, - thread_pool=True, prefetch=16, batchify_fn=batchify_fn) + dataset, + 128, + last_batch="keep", + num_workers=4, + thread_pool=True, + prefetch=16, + batchify_fn=batchify_fn, + ) num_iter = 0 for batch in data_loader: batch = batch.asnumpy() batch = (batch - model.input_mean) / model.input_std feat = model.session.run(model.output_names, {model.input_name: batch})[0] feat = np.reshape(feat, (-1, model.feat_dim * 2)) - feat_mat[128 * num_iter: 128 * num_iter + feat.shape[0], :] = feat + feat_mat[128 * num_iter : 128 * num_iter + feat.shape[0], :] = feat num_iter += 1 if num_iter % 50 == 0: print(num_iter) @@ -76,14 +86,14 @@ def extract(model_root, dataset): def read_template_media_list(path): - ijb_meta = pd.read_csv(path, sep=' ', header=None).values + ijb_meta = pd.read_csv(path, sep=" ", header=None).values templates = ijb_meta[:, 1].astype(np.int) medias = ijb_meta[:, 2].astype(np.int) return templates, medias def read_template_pair_list(path): - pairs = pd.read_csv(path, sep=' ', header=None).values + pairs = pd.read_csv(path, sep=" ", header=None).values t1 = pairs[:, 0].astype(np.int) t2 = pairs[:, 1].astype(np.int) label = pairs[:, 2].astype(np.int) @@ -91,14 +101,12 @@ def read_template_pair_list(path): def read_image_feature(path): - with open(path, 'rb') as fid: + with open(path, "rb") as fid: img_feats = pickle.load(fid) return img_feats -def image2template_feature(img_feats=None, - templates=None, - medias=None): +def image2template_feature(img_feats=None, templates=None, medias=None): unique_templates = np.unique(templates) template_feats = np.zeros((len(unique_templates), img_feats.shape[1])) for count_template, uqt in enumerate(unique_templates): @@ -112,27 +120,25 @@ def image2template_feature(img_feats=None, if ct == 1: media_norm_feats += [face_norm_feats[ind_m]] else: # image features from the same video will be aggregated into one feature - media_norm_feats += [np.mean(face_norm_feats[ind_m], axis=0, keepdims=True), ] + media_norm_feats += [ + np.mean(face_norm_feats[ind_m], axis=0, keepdims=True), + ] media_norm_feats = np.array(media_norm_feats) template_feats[count_template] = np.sum(media_norm_feats, axis=0) if count_template % 2000 == 0: - print('Finish Calculating {} template features.'.format( - count_template)) + print("Finish Calculating {} template features.".format(count_template)) template_norm_feats = normalize(template_feats) return template_norm_feats, unique_templates -def verification(template_norm_feats=None, - unique_templates=None, - p1=None, - p2=None): +def verification(template_norm_feats=None, unique_templates=None, p1=None, p2=None): template2id = np.zeros((max(unique_templates) + 1, 1), dtype=int) for count_template, uqt in enumerate(unique_templates): template2id[uqt] = count_template score = np.zeros((len(p1),)) total_pairs = np.array(range(len(p1))) batchsize = 100000 - sublists = [total_pairs[i: i + batchsize] for i in range(0, len(p1), batchsize)] + sublists = [total_pairs[i : i + batchsize] for i in range(0, len(p1), batchsize)] total_sublists = len(sublists) for c, s in enumerate(sublists): feat1 = template_norm_feats[template2id[p1[s]]] @@ -140,21 +146,19 @@ def verification(template_norm_feats=None, similarity_score = np.sum(feat1 * feat2, -1) score[s] = similarity_score.flatten() if c % 10 == 0: - print('Finish {}/{} pairs.'.format(c, total_sublists)) + print("Finish {}/{} pairs.".format(c, total_sublists)) return score -def verification2(template_norm_feats=None, - unique_templates=None, - p1=None, - p2=None): +def verification2(template_norm_feats=None, unique_templates=None, p1=None, p2=None): template2id = np.zeros((max(unique_templates) + 1, 1), dtype=int) for count_template, uqt in enumerate(unique_templates): template2id[uqt] = count_template score = np.zeros((len(p1),)) # save cosine distance between pairs total_pairs = np.array(range(len(p1))) - batchsize = 100000 # small batchsize instead of all pairs in one batch due to the memory limiation - sublists = [total_pairs[i:i + batchsize] for i in range(0, len(p1), batchsize)] + # small batchsize instead of all pairs in one batch due to the memory limiation + batchsize = 100000 + sublists = [total_pairs[i : i + batchsize] for i in range(0, len(p1), batchsize)] total_sublists = len(sublists) for c, s in enumerate(sublists): feat1 = template_norm_feats[template2id[p1[s]]] @@ -162,7 +166,7 @@ def verification2(template_norm_feats=None, similarity_score = np.sum(feat1 * feat2, -1) score[s] = similarity_score.flatten() if c % 10 == 0: - print('Finish {}/{} pairs.'.format(c, total_sublists)) + print("Finish {}/{} pairs.".format(c, total_sublists)) return score @@ -170,24 +174,33 @@ def main(args): use_norm_score = True # if Ture, TestMode(N1) use_detector_score = True # if Ture, TestMode(D1) use_flip_test = True # if Ture, TestMode(F1) - assert args.target == 'IJBC' or args.target == 'IJBB' + assert args.target == "IJBC" or args.target == "IJBB" start = timeit.default_timer() templates, medias = read_template_media_list( - os.path.join('%s/meta' % args.image_path, '%s_face_tid_mid.txt' % args.target.lower())) + os.path.join( + "%s/meta" % args.image_path, "%s_face_tid_mid.txt" % args.target.lower() + ) + ) stop = timeit.default_timer() - print('Time: %.2f s. ' % (stop - start)) + print("Time: %.2f s. " % (stop - start)) start = timeit.default_timer() p1, p2, label = read_template_pair_list( - os.path.join('%s/meta' % args.image_path, - '%s_template_pair_label.txt' % args.target.lower())) + os.path.join( + "%s/meta" % args.image_path, + "%s_template_pair_label.txt" % args.target.lower(), + ) + ) stop = timeit.default_timer() - print('Time: %.2f s. ' % (stop - start)) + print("Time: %.2f s. " % (stop - start)) start = timeit.default_timer() - img_path = '%s/loose_crop' % args.image_path - img_list_path = '%s/meta/%s_name_5pts_score.txt' % (args.image_path, args.target.lower()) + img_path = "%s/loose_crop" % args.image_path + img_list_path = "%s/meta/%s_name_5pts_score.txt" % ( + args.image_path, + args.target.lower(), + ) img_list = open(img_list_path) files = img_list.readlines() dataset = AlignedDataSet(root=img_path, lines=files, align=True) @@ -199,19 +212,24 @@ def main(args): faceness_scores.append(name_lmk_score[-1]) faceness_scores = np.array(faceness_scores).astype(np.float32) stop = timeit.default_timer() - print('Time: %.2f s. ' % (stop - start)) - print('Feature Shape: ({} , {}) .'.format(img_feats.shape[0], img_feats.shape[1])) + print("Time: %.2f s. " % (stop - start)) + print("Feature Shape: ({} , {}) .".format(img_feats.shape[0], img_feats.shape[1])) start = timeit.default_timer() if use_flip_test: - img_input_feats = img_feats[:, 0:img_feats.shape[1] // 2] + img_feats[:, img_feats.shape[1] // 2:] + img_input_feats = ( + img_feats[:, 0 : img_feats.shape[1] // 2] + + img_feats[:, img_feats.shape[1] // 2 :] + ) else: - img_input_feats = img_feats[:, 0:img_feats.shape[1] // 2] + img_input_feats = img_feats[:, 0 : img_feats.shape[1] // 2] if use_norm_score: img_input_feats = img_input_feats else: - img_input_feats = img_input_feats / np.sqrt(np.sum(img_input_feats ** 2, -1, keepdims=True)) + img_input_feats = img_input_feats / np.sqrt( + np.sum(img_input_feats ** 2, -1, keepdims=True) + ) if use_detector_score: print(img_input_feats.shape, faceness_scores.shape) @@ -220,14 +238,15 @@ def main(args): img_input_feats = img_input_feats template_norm_feats, unique_templates = image2template_feature( - img_input_feats, templates, medias) + img_input_feats, templates, medias + ) stop = timeit.default_timer() - print('Time: %.2f s. ' % (stop - start)) + print("Time: %.2f s. " % (stop - start)) start = timeit.default_timer() score = verification(template_norm_feats, unique_templates, p1, p2) stop = timeit.default_timer() - print('Time: %.2f s. ' % (stop - start)) + print("Time: %.2f s. " % (stop - start)) save_path = os.path.join(args.result_dir, "{}_result".format(args.target)) if not os.path.exists(save_path): os.makedirs(save_path) @@ -242,7 +261,7 @@ def main(args): methods = np.array(methods) scores = dict(zip(methods, scores)) x_labels = [10 ** -6, 10 ** -5, 10 ** -4, 10 ** -3, 10 ** -2, 10 ** -1] - tpr_fpr_table = prettytable.PrettyTable(['Methods'] + [str(x) for x in x_labels]) + tpr_fpr_table = prettytable.PrettyTable(["Methods"] + [str(x) for x in x_labels]) for method in methods: fpr, tpr, _ = roc_curve(label, scores[method]) fpr = np.flipud(fpr) @@ -251,17 +270,20 @@ def main(args): tpr_fpr_row.append("%s-%s" % (method, args.target)) for fpr_iter in np.arange(len(x_labels)): _, min_index = min( - list(zip(abs(fpr - x_labels[fpr_iter]), range(len(fpr))))) - tpr_fpr_row.append('%.2f' % (tpr[min_index] * 100)) + list(zip(abs(fpr - x_labels[fpr_iter]), range(len(fpr)))) + ) + tpr_fpr_row.append("%.2f" % (tpr[min_index] * 100)) tpr_fpr_table.add_row(tpr_fpr_row) print(tpr_fpr_table) -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='do ijb test') +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="do ijb test") # general - parser.add_argument('--model-root', default='', help='path to load model.') - parser.add_argument('--image-path', default='', type=str, help='') - parser.add_argument('--result-dir', default='.', type=str, help='') - parser.add_argument('--target', default='IJBC', type=str, help='target, set to IJBC or IJBB') + parser.add_argument("--model-root", default="", help="path to load model.") + parser.add_argument("--image-path", default="", type=str, help="") + parser.add_argument("--result-dir", default=".", type=str, help="") + parser.add_argument( + "--target", default="IJBC", type=str, help="target, set to IJBC or IJBB" + ) main(parser.parse_args()) diff --git a/recognition/arcface_oneflow/eval/verification.py b/recognition/arcface_oneflow/eval/verification.py index dfffdb6..8fc5b61 100644 --- a/recognition/arcface_oneflow/eval/verification.py +++ b/recognition/arcface_oneflow/eval/verification.py @@ -1,4 +1,4 @@ -"""Helper for evaluation on the Labeled Faces in the Wild dataset +"""Helper for evaluation on the Labeled Faces in the Wild dataset """ # MIT License @@ -28,6 +28,8 @@ import datetime import os import pickle + +import numpy as np import sklearn import oneflow as flow @@ -35,8 +37,7 @@ from scipy import interpolate from sklearn.decomposition import PCA from sklearn.model_selection import KFold import cv2 as cv -import numpy as np -import sklearn +import logging class LFold: @@ -52,14 +53,11 @@ class LFold: return [(indices, indices)] -def calculate_roc(thresholds, - embeddings1, - embeddings2, - actual_issame, - nrof_folds=10, - pca=0): - assert (embeddings1.shape[0] == embeddings2.shape[0]) - assert (embeddings1.shape[1] == embeddings2.shape[1]) +def calculate_roc( + thresholds, embeddings1, embeddings2, actual_issame, nrof_folds=10, pca=0 +): + assert embeddings1.shape[0] == embeddings2.shape[0] + assert embeddings1.shape[1] == embeddings2.shape[1] nrof_pairs = min(len(actual_issame), embeddings1.shape[0]) nrof_thresholds = len(thresholds) k_fold = LFold(n_splits=nrof_folds, shuffle=False) @@ -75,7 +73,7 @@ def calculate_roc(thresholds, for fold_idx, (train_set, test_set) in enumerate(k_fold.split(indices)): if pca > 0: - print('doing pca on', fold_idx) + print("doing pca on", fold_idx) embed1_train = embeddings1[train_set] embed2_train = embeddings2[train_set] _embed_train = np.concatenate((embed1_train, embed2_train), axis=0) @@ -92,15 +90,18 @@ def calculate_roc(thresholds, acc_train = np.zeros((nrof_thresholds)) for threshold_idx, threshold in enumerate(thresholds): _, _, acc_train[threshold_idx] = calculate_accuracy( - threshold, dist[train_set], actual_issame[train_set]) + threshold, dist[train_set], actual_issame[train_set] + ) best_threshold_index = np.argmax(acc_train) for threshold_idx, threshold in enumerate(thresholds): - tprs[fold_idx, threshold_idx], fprs[fold_idx, threshold_idx], _ = calculate_accuracy( - threshold, dist[test_set], - actual_issame[test_set]) + ( + tprs[fold_idx, threshold_idx], + fprs[fold_idx, threshold_idx], + _, + ) = calculate_accuracy(threshold, dist[test_set], actual_issame[test_set]) _, _, accuracy[fold_idx] = calculate_accuracy( - thresholds[best_threshold_index], dist[test_set], - actual_issame[test_set]) + thresholds[best_threshold_index], dist[test_set], actual_issame[test_set] + ) tpr = np.mean(tprs, 0) fpr = np.mean(fprs, 0) @@ -112,8 +113,8 @@ def calculate_accuracy(threshold, dist, actual_issame): tp = np.sum(np.logical_and(predict_issame, actual_issame)) fp = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame))) tn = np.sum( - np.logical_and(np.logical_not(predict_issame), - np.logical_not(actual_issame))) + np.logical_and(np.logical_not(predict_issame), np.logical_not(actual_issame)) + ) fn = np.sum(np.logical_and(np.logical_not(predict_issame), actual_issame)) tpr = 0 if (tp + fn == 0) else float(tp) / float(tp + fn) @@ -122,14 +123,11 @@ def calculate_accuracy(threshold, dist, actual_issame): return tpr, fpr, acc -def calculate_val(thresholds, - embeddings1, - embeddings2, - actual_issame, - far_target, - nrof_folds=10): - assert (embeddings1.shape[0] == embeddings2.shape[0]) - assert (embeddings1.shape[1] == embeddings2.shape[1]) +def calculate_val( + thresholds, embeddings1, embeddings2, actual_issame, far_target, nrof_folds=10 +): + assert embeddings1.shape[0] == embeddings2.shape[0] + assert embeddings1.shape[1] == embeddings2.shape[1] nrof_pairs = min(len(actual_issame), embeddings1.shape[0]) nrof_thresholds = len(thresholds) k_fold = LFold(n_splits=nrof_folds, shuffle=False) @@ -147,15 +145,17 @@ def calculate_val(thresholds, far_train = np.zeros(nrof_thresholds) for threshold_idx, threshold in enumerate(thresholds): _, far_train[threshold_idx] = calculate_val_far( - threshold, dist[train_set], actual_issame[train_set]) + threshold, dist[train_set], actual_issame[train_set] + ) if np.max(far_train) >= far_target: - f = interpolate.interp1d(far_train, thresholds, kind='slinear') + f = interpolate.interp1d(far_train, thresholds, kind="slinear") threshold = f(far_target) else: threshold = 0.0 val[fold_idx], far[fold_idx] = calculate_val_far( - threshold, dist[test_set], actual_issame[test_set]) + threshold, dist[test_set], actual_issame[test_set] + ) val_mean = np.mean(val) far_mean = np.mean(far) @@ -166,11 +166,11 @@ def calculate_val(thresholds, def calculate_val_far(threshold, dist, actual_issame): predict_issame = np.less(dist, threshold) true_accept = np.sum(np.logical_and(predict_issame, actual_issame)) - false_accept = np.sum( - np.logical_and(predict_issame, np.logical_not(actual_issame))) + false_accept = np.sum(np.logical_and(predict_issame, np.logical_not(actual_issame))) n_same = np.sum(actual_issame) n_diff = np.sum(np.logical_not(actual_issame)) - + # print(true_accept, false_accept) + # print(n_same, n_diff) val = float(true_accept) / float(n_same) far = float(false_accept) / float(n_diff) return val, far @@ -181,28 +181,31 @@ def evaluate(embeddings, actual_issame, nrof_folds=10, pca=0): thresholds = np.arange(0, 4, 0.01) embeddings1 = embeddings[0::2] embeddings2 = embeddings[1::2] - tpr, fpr, accuracy = calculate_roc(thresholds, - embeddings1, - embeddings2, - np.asarray(actual_issame), - nrof_folds=nrof_folds, - pca=pca) + tpr, fpr, accuracy = calculate_roc( + thresholds, + embeddings1, + embeddings2, + np.asarray(actual_issame), + nrof_folds=nrof_folds, + pca=pca, + ) thresholds = np.arange(0, 4, 0.001) - val, val_std, far = calculate_val(thresholds, - embeddings1, - embeddings2, - np.asarray(actual_issame), - 1e-3, - nrof_folds=nrof_folds) + val, val_std, far = calculate_val( + thresholds, + embeddings1, + embeddings2, + np.asarray(actual_issame), + 1e-3, + nrof_folds=nrof_folds, + ) return tpr, fpr, accuracy, val, val_std, far def load_bin_cv(path, image_size): - bins, issame_list = pickle.load(open(path, 'rb'), encoding='bytes') + bins, issame_list = pickle.load(open(path, "rb"), encoding="bytes") data_list = [] for flip in [0, 1]: - data = np.empty( - (len(issame_list) * 2, 3, image_size[0], image_size[1])) + data = flow.empty(len(issame_list) * 2, 3, image_size[0], image_size[1]) data_list.append(data) for i in range(len(issame_list) * 2): _bin = bins[i] @@ -214,21 +217,24 @@ def load_bin_cv(path, image_size): img = cv.flip(img, 1) img = np.array(img).transpose((2, 0, 1)) img = (img - 127.5) * 0.00784313725 - data_list[flip][i][:] = img + data_list[flip][i] = flow.tensor(img, dtype=flow.float) + if i % 1000 == 0: - print('loading bin', i) - print(data_list[0].shape) + logging.info("loading bin:%d", i) + logging.info(data_list[0].shape) return data_list, issame_list -def test(data_set, backbone, batch_size, nfolds=10): - print('testing verification..') - +@flow.no_grad() +def test(data_set, backbone, batch_size, nfolds=10, is_consistent=False): + logging.info("testing verification..") data_list = data_set[0] - issame_list = data_set[1] embeddings_list = [] time_consumed = 0.0 + if is_consistent: + placement = flow.env.all_device_placement("cpu") + sbp = flow.sbp.split(0) for i in range(len(data_list)): data = data_list[i] @@ -237,17 +243,23 @@ def test(data_set, backbone, batch_size, nfolds=10): while ba < data.shape[0]: bb = min(ba + batch_size, data.shape[0]) count = bb - ba - _data = data[bb - batch_size: bb] + img = data[bb - batch_size : bb] time0 = datetime.datetime.now() + with flow.no_grad(): + if is_consistent: + img = img.to_consistent(placement=placement, sbp=sbp) + net_out = backbone(img.to("cuda")) - net_out = backbone(_data) - _embeddings = net_out.get().numpy() + if is_consistent: + _embeddings = net_out.to_local().numpy() + else: + _embeddings = net_out.detach().numpy() time_now = datetime.datetime.now() diff = time_now - time0 time_consumed += diff.total_seconds() if embeddings is None: embeddings = np.zeros((data.shape[0], _embeddings.shape[1])) - embeddings[ba:bb, :] = _embeddings[(batch_size - count):, :] + embeddings[ba:bb, :] = _embeddings[(batch_size - count) :, :] ba = bb embeddings_list.append(embeddings) @@ -267,9 +279,49 @@ def test(data_set, backbone, batch_size, nfolds=10): std1 = 0.0 embeddings = embeddings_list[0] + embeddings_list[1] embeddings = sklearn.preprocessing.normalize(embeddings) - print(embeddings.shape) - print('infer time', time_consumed) + logging.info(embeddings.shape) + logging.info("infer time:%f" % time_consumed) _, _, accuracy, val, val_std, far = evaluate( - embeddings, issame_list, nrof_folds=nfolds) + embeddings, issame_list, nrof_folds=nfolds + ) acc2, std2 = np.mean(accuracy), np.std(accuracy) return acc1, std1, acc2, std2, _xnorm, embeddings_list + + +def dumpR(data_set, backbone, batch_size, name="", data_extra=None, label_shape=None): + print("dump verification embedding..") + data_list = data_set[0] + issame_list = data_set[1] + embeddings_list = [] + time_consumed = 0.0 + for i in range(len(data_list)): + data = data_list[i] + embeddings = None + ba = 0 + while ba < data.shape[0]: + bb = min(ba + batch_size, data.shape[0]) + count = bb - ba + + _data = nd.slice_axis(data, axis=0, begin=bb - batch_size, end=bb) + time0 = datetime.datetime.now() + if data_extra is None: + db = mx.io.DataBatch(data=(_data,), label=(_label,)) + else: + db = mx.io.DataBatch(data=(_data, _data_extra), label=(_label,)) + model.forward(db, is_train=False) + net_out = model.get_outputs() + _embeddings = net_out[0].asnumpy() + time_now = datetime.datetime.now() + diff = time_now - time0 + time_consumed += diff.total_seconds() + if embeddings is None: + embeddings = np.zeros((data.shape[0], _embeddings.shape[1])) + embeddings[ba:bb, :] = _embeddings[(batch_size - count) :, :] + ba = bb + embeddings_list.append(embeddings) + embeddings = embeddings_list[0] + embeddings_list[1] + embeddings = sklearn.preprocessing.normalize(embeddings) + actual_issame = np.asarray(issame_list) + outname = os.path.join("temp.bin") + with open(outname, "wb") as f: + pickle.dump((embeddings, issame_list), f, protocol=pickle.HIGHEST_PROTOCOL) diff --git a/recognition/arcface_oneflow/function.py b/recognition/arcface_oneflow/function.py index 1a9256d..07cbbd1 100644 --- a/recognition/arcface_oneflow/function.py +++ b/recognition/arcface_oneflow/function.py @@ -1,174 +1,261 @@ - -import argparse -import logging -import os - import oneflow as flow -import oneflow.nn as nn - -import sys +from oneflow.nn.parallel import DistributedDataParallel as ddp +from utils.ofrecord_data_utils import OFRecordDataLoader, SyntheticDataLoader +from utils.utils_logging import AverageMeter +from utils.utils_callbacks import CallBackVerification, CallBackLogging, CallBackModelCheckpoint from backbones import get_model -import math -from utils.utils_config import get_config -import numpy as np -import pickle -import time -from utils.ofrecord_data_utils import load_train_dataset, load_synthetic +from graph import TrainGraph, EvalGraph +from losses import CrossEntropyLoss_sbp +import logging -class Validator(object): - def __init__(self, cfg): - self.cfg = cfg +def make_data_loader(args, mode, is_consistent=False, synthetic=False): + assert mode in ("train", "validation") - def get_val_config(): - config = flow.function_config() - config.default_logical_view(flow.scope.consistent_view()) - config.default_data_type(flow.float) - return config - function_config = get_val_config() + if mode == "train": + total_batch_size = args.batch_size*flow.env.get_world_size() + batch_size = args.batch_size + num_samples = args.num_image + else: + total_batch_size = args.val_global_batch_size + batch_size = args.val_batch_size + num_samples = args.val_samples_per_epoch - @flow.global_function(type="predict", function_config=function_config) - def get_symbol_val_job( - images: flow.typing.Numpy.Placeholder( - (self.cfg.val_batch_size, 3, 112, 112) - ) - ): - print("val batch data: ", images.shape) - embedding = get_model(cfg.network, images, cfg) - return embedding + placement = None + sbp = None - self.get_symbol_val_fn = get_symbol_val_job + if is_consistent: + placement = flow.env.all_device_placement("cpu") + sbp = flow.sbp.split(0) + batch_size = total_batch_size - def load_checkpoint(self, model_path): - flow.load_variables(flow.checkpoint.get(model_path)) + if synthetic: - -def get_train_config(cfg): - - cfg.cudnn_conv_heuristic_search_algo = False - cfg.enable_fuse_model_update_ops = True - cfg.enable_fuse_add_to_output = True - func_config = flow.FunctionConfig() - func_config.default_logical_view(flow.scope.consistent_view()) - func_config.default_data_type(flow.float) - func_config.cudnn_conv_heuristic_search_algo( - cfg.cudnn_conv_heuristic_search_algo - ) - - func_config.enable_fuse_model_update_ops( - cfg.enable_fuse_model_update_ops) - func_config.enable_fuse_add_to_output(cfg.enable_fuse_add_to_output) - if cfg.fp16: - logging.info("Training with FP16 now.") - func_config.enable_auto_mixed_precision(True) - if cfg.partial_fc: - func_config.enable_fuse_model_update_ops(False) - func_config.indexed_slices_optimizer_conf( - dict(include_op_names=dict(op_name=['fc7-weight']))) - if cfg.fp16 and (cfg.num_nodes * cfg.device_num_per_node) > 1: - flow.config.collective_boxing.nccl_fusion_all_reduce_use_buffer(False) - if cfg.nccl_fusion_threshold_mb: - flow.config.collective_boxing.nccl_fusion_threshold_mb( - cfg.nccl_fusion_threshold_mb) - if cfg.nccl_fusion_max_ops: - flow.config.collective_boxing.nccl_fusion_max_ops( - cfg.nccl_fusion_max_ops) - - return func_config - - -def make_train_func(cfg): - @flow.global_function(type="train", function_config=get_train_config(cfg)) - def get_symbol_train_job(): - if cfg.use_synthetic_data: - (labels, images) = load_synthetic(cfg) - else: - labels, images = load_train_dataset(cfg) - image_size = images.shape[2:] - assert len( - image_size) == 2, "The length of image size must be equal to 2." - assert image_size[0] == image_size[1], "image_size[0] should be equal to image_size[1]." - - embedding = get_model(cfg.network, images, cfg) - - def _get_initializer(): - return flow.random_normal_initializer(mean=0.0, stddev=0.01) - - trainable = True - - if cfg.model_parallel and cfg.device_num_per_node > 1: - logging.info("Training is using model parallelism now.") - labels = labels.with_distribute(flow.distribute.broadcast()) - fc1_distribute = flow.distribute.broadcast() - fc7_data_distribute = flow.distribute.split(1) - fc7_model_distribute = flow.distribute.split(0) - else: - fc1_distribute = flow.distribute.split(0) - fc7_data_distribute = flow.distribute.split(0) - fc7_model_distribute = flow.distribute.broadcast() - weight_regularizer = flow.regularizers.l2(0.0005) - fc7_weight = flow.get_variable( - name="fc7-weight", - shape=(cfg.num_classes, embedding.shape[1]), - dtype=embedding.dtype, - initializer=_get_initializer(), - regularizer=weight_regularizer, - trainable=trainable, - model_name="weight", - distribute=fc7_model_distribute, + data_loader = SyntheticDataLoader( + batch_size=batch_size, + num_classes=args.num_classes, + placement=placement, + sbp=sbp, ) - if cfg.partial_fc and cfg.model_parallel: - logging.info( - "Training is using model parallelism and optimized by partial_fc now." - ) + return data_loader.to("cuda") - size = cfg.device_num_per_node * cfg.num_nodes - num_local = (cfg.num_classes + size - 1) // size - num_sample = int(num_local * cfg.sample_rate) - total_num_sample = num_sample * size + ofrecord_data_loader = OFRecordDataLoader( + ofrecord_root=args.ofrecord_path, + mode=mode, + dataset_size=num_samples, + batch_size=batch_size, + total_batch_size=total_batch_size, + data_part_num=args.ofrecord_part_num, + placement=placement, + sbp=sbp, + ) + return ofrecord_data_loader + + +def make_optimizer(args, model): + param_group = {"params": [p for p in model.parameters() if p is not None]} + + optimizer = flow.optim.SGD( + [param_group], + lr=args.lr, + momentum=args.momentum, + weight_decay=args.weight_decay, + ) + return optimizer + + +class FC7(flow.nn.Module): + def __init__(self, embedding_size, num_classes, cfg, partial_fc=False, bias=False): + super(FC7, self).__init__() + self.weight = flow.nn.Parameter( + flow.empty(num_classes, embedding_size)) + flow.nn.init.normal_(self.weight, mean=0, std=0.01) + + self.partial_fc = partial_fc + + size = flow.env.get_world_size() + num_local = (cfg.num_classes + size - 1) // size + self.num_sample = int(num_local * cfg.sample_rate) + self.total_num_sample = self.num_sample * size + + def forward(self, x, label): + x = flow.nn.functional.l2_normalize(input=x, dim=1, epsilon=1e-10) + if self.partial_fc: ( mapped_label, sampled_label, sampled_weight, ) = flow.distributed_partial_fc_sample( - weight=fc7_weight, label=labels, num_sample=total_num_sample, + weight=self.weight, label=label, num_sample=self.total_num_sample, ) - labels = mapped_label - fc7_weight = sampled_weight - fc7_weight = flow.math.l2_normalize( - input=fc7_weight, axis=1, epsilon=1e-10) - fc1 = flow.math.l2_normalize( - input=embedding, axis=1, epsilon=1e-10) - fc7 = flow.matmul( - a=fc1.with_distribute(fc1_distribute), b=fc7_weight, transpose_b=True - ) - fc7 = fc7.with_distribute(fc7_data_distribute) - - if cfg.loss == "cosface": - fc7 = (flow.combined_margin_loss( - fc7, labels, m1=1, m2=0.0, m3=0.4) * 64) - elif cfg.loss == "arcface": - fc7 = (flow.combined_margin_loss( - fc7, labels, m1=1, m2=0.5, m3=0.0) * 64) + label = mapped_label + weight = sampled_weight else: - raise ValueError() + weight = self.weight + weight = flow.nn.functional.l2_normalize( + input=weight, dim=1, epsilon=1e-10) + x = flow.matmul(x, weight, transpose_b=True) + if x.is_consistent: + return x, label + else: + return x - fc7 = fc7.with_distribute(fc7_data_distribute) - loss = flow.nn.sparse_softmax_cross_entropy_with_logits( - labels, fc7, name="softmax_loss" +class Train_Module(flow.nn.Module): + def __init__(self, cfg, backbone, placement, world_size): + super(Train_Module, self).__init__() + self.placement = placement + + if cfg.graph: + if cfg.model_parallel: + input_size = cfg.embedding_size + output_size = int(cfg.num_classes/world_size) + self.fc = FC7(input_size, output_size, cfg, partial_fc=cfg.partial_fc).to_consistent( + placement=placement, sbp=flow.sbp.split(0)) + else: + self.fc = FC7(cfg.embedding_size, cfg.num_classes, cfg).to_consistent( + placement=placement, sbp=flow.sbp.broadcast) + self.backbone = backbone.to_consistent( + placement=placement, sbp=flow.sbp.broadcast) + else: + self.backbone = backbone + self.fc = FC7(cfg.embedding_size, cfg.num_classes, cfg) + + def forward(self, x, labels): + x = self.backbone(x) + if x.is_consistent: + x = x.to_consistent(sbp=flow.sbp.broadcast) + x = self.fc(x, labels) + return x + + +class Trainer(object): + def __init__(self, cfg, placement, load_path, world_size, rank): + self.placement = placement + self.load_path = load_path + self.cfg = cfg + self.world_size = world_size + self.rank = rank + + # model + self.backbone = get_model(cfg.network, dropout=0.0, + num_features=cfg.embedding_size).to("cuda") + self.train_module = Train_Module( + cfg, self.backbone, self.placement, world_size).to("cuda") + if cfg.resume: + if load_path is not None: + self.load_state_dict() + else: + logging.info("Model resume failed! load path is None ") + + # optimizer + self.optimizer = make_optimizer(cfg, self.train_module) + + # data + self.train_data_loader = make_data_loader( + cfg, 'train', self.cfg.graph, self.cfg.synthetic) + + # loss + if cfg.loss == "cosface": + self.margin_softmax = flow.nn.CombinedMarginLoss( + 1, 0., 0.4).to("cuda") + else: + self.margin_softmax = flow.nn.CombinedMarginLoss( + 1, 0.5, 0.).to("cuda") + + self.of_cross_entropy = CrossEntropyLoss_sbp() + + # lr_scheduler + self.decay_step = self.cal_decay_step() + self.scheduler = flow.optim.lr_scheduler.MultiStepLR( + optimizer=self.optimizer, milestones=self.decay_step, gamma=0.1 ) - lr_scheduler = flow.optimizer.PiecewiseScalingScheduler( - base_lr=cfg.lr, - boundaries=cfg.lr_steps, - scale=cfg.lr_scales, - warmup=None - ) - flow.optimizer.SGD(lr_scheduler, - momentum=cfg.momentum if cfg.momentum > 0 else None, - ).minimize(loss) + # log + self.callback_logging = CallBackLogging( + 50, rank, cfg.total_step, cfg.batch_size, world_size, None) + # val + self.callback_verification = CallBackVerification( + 600, rank, cfg.val_targets, cfg.ofrecord_path, is_consistent=cfg.graph) + # save checkpoint + self.callback_checkpoint = CallBackModelCheckpoint(rank, cfg.output) - return loss + self.losses = AverageMeter() + self.start_epoch = 0 + self.global_step = 0 - return get_symbol_train_job + def __call__(self): + # Train + if self.cfg.graph: + self.train_graph() + else: + self.train_eager() + + def load_state_dict(self): + + if self.is_consistent: + state_dict = flow.load(self.load_path, consistent_src_rank=0) + elif self.rank == 0: + state_dict = flow.load(self.load_path) + else: + return + logging.info("Model resume successfully!") + self.model.load_state_dict(state_dict) + + def cal_decay_step(self): + cfg = self.cfg + num_image = cfg.num_image + total_batch_size = cfg.batch_size * self.world_size + self.warmup_step = num_image // total_batch_size * cfg.warmup_epoch + self.cfg.total_step = num_image // total_batch_size * cfg.num_epoch + logging.info("Total Step is:%d" % self.cfg.total_step) + return [x * num_image // total_batch_size for x in cfg.decay_epoch] + + def train_graph(self): + train_graph = TrainGraph(self.train_module, self.cfg, self.margin_softmax, + self.of_cross_entropy, self.train_data_loader, self.optimizer, self.scheduler) + # train_graph.debug() + val_graph = EvalGraph(self.backbone, self.cfg) + + for epoch in range(self.start_epoch, self.cfg.num_epoch): + self.train_module.train() + one_epoch_steps = len(self.train_data_loader) + for steps in range(one_epoch_steps): + self.global_step += 1 + loss = train_graph() + loss = loss.to_consistent( + sbp=flow.sbp.broadcast).to_local().numpy() + self.losses.update(loss, 1) + self.callback_logging(self.global_step, self.losses, epoch, False, + self.scheduler.get_last_lr()[0]) + self.callback_verification( + self.global_step, self.train_module, val_graph) + self.callback_checkpoint(self.global_step, epoch, + self.train_module, is_consistent=True) + + def train_eager(self): + self.train_module = ddp(self.train_module) + for epoch in range(self.start_epoch, self.cfg.num_epoch): + self.train_module.train() + + one_epoch_steps = len(self.train_data_loader) + for steps in range(one_epoch_steps): + self.global_step += 1 + image, label = self.train_data_loader() + image = image.to("cuda") + label = label.to("cuda") + features_fc7 = self.train_module(image, label) + features_fc7 = self.margin_softmax(features_fc7, label)*64 + loss = self.of_cross_entropy(features_fc7, label) + loss.backward() + self.optimizer.step() + self.optimizer.zero_grad() + + loss = loss.numpy() + self.losses.update(loss, 1) + self.callback_logging(self.global_step, self.losses, epoch, False, + self.scheduler.get_last_lr()[0]) + self.callback_verification(self.global_step, self.backbone) + self.scheduler.step() + self.callback_checkpoint( + self.global_step, epoch, self.train_module) diff --git a/recognition/arcface_oneflow/graph.py b/recognition/arcface_oneflow/graph.py new file mode 100644 index 0000000..891fcee --- /dev/null +++ b/recognition/arcface_oneflow/graph.py @@ -0,0 +1,75 @@ +import oneflow as flow +import oneflow.nn as nn + + +def make_static_grad_scaler(): + return flow.amp.StaticGradScaler(flow.env.get_world_size()) + + +def make_grad_scaler(): + return flow.amp.GradScaler( + init_scale=2 ** 30, growth_factor=2.0, backoff_factor=0.5, growth_interval=2000, + ) + + +def meter(self, mkey, *args): + assert mkey in self.m + self.m[mkey]["meter"].record(*args) + + +class TrainGraph(flow.nn.Graph): + def __init__( + self, + model, + cfg, + combine_margin, + cross_entropy, + data_loader, + optimizer, + lr_scheduler=None, + ): + super().__init__() + + if cfg.use_fp16: + self.config.enable_amp(True) + self.set_grad_scaler(make_grad_scaler()) + elif cfg.scale_grad: + self.set_grad_scaler(make_static_grad_scaler()) + + + + self.config.allow_fuse_add_to_output(True) + self.config.allow_fuse_model_update_ops(True) + + self.model = model + + self.cross_entropy = cross_entropy + self.combine_margin = combine_margin + self.data_loader = data_loader + self.add_optimizer(optimizer, lr_sch=lr_scheduler) + + def build(self): + image, label = self.data_loader() + + image = image.to("cuda") + label = label.to("cuda") + + logits, label = self.model(image, label) + logits = self.combine_margin(logits, label) * 64 + loss = self.cross_entropy(logits, label) + + loss.backward() + return loss + + +class EvalGraph(flow.nn.Graph): + def __init__(self, model, cfg): + super().__init__() + self.config.allow_fuse_add_to_output(True) + self.model = model + if cfg.fp16: + self.config.enable_amp(True) + + def build(self, image): + logits = self.model(image) + return logits diff --git a/recognition/arcface_oneflow/oneflow2onnx.py b/recognition/arcface_oneflow/oneflow2onnx.py index dd3aa7a..b1713bc 100644 --- a/recognition/arcface_oneflow/oneflow2onnx.py +++ b/recognition/arcface_oneflow/oneflow2onnx.py @@ -1,27 +1,49 @@ import os from os import mkdir -import oneflow.typing as tp -import onnx -import onnxruntime as ort -import numpy as np from oneflow_onnx.oneflow2onnx.util import convert_to_onnx_and_check import oneflow as flow import logging -from easydict import EasyDict as edict from backbones import get_model from utils.utils_config import get_config import argparse +import tempfile -def convert_func(cfg, model_path, out_path): - @flow.global_function() - def InferenceNet(images: tp.Numpy.Placeholder((1, 3, 112, 112))): +class ModelGraph(flow.nn.Graph): + def __init__(self, model): + super().__init__() + self.backbone = model - logits = get_model(cfg.network, images, cfg) - return logits - print(convert_to_onnx_and_check(InferenceNet, - flow_weight_dir=None, onnx_model_path=out_path)) + def build(self, x): + x = x.to("cuda") + out = self.backbone(x) + return out + + +def convert_func(cfg, model_path, out_path,image_size): + + model_module = get_model(cfg.network, dropout=0.0, + num_features=cfg.embedding_size).to("cuda") + model_module.eval() + print(model_module) + model_graph = ModelGraph(model_module) + model_graph._compile(flow.randn(1, 3, image_size, image_size).to("cuda")) + + with tempfile.TemporaryDirectory() as tmpdirname: + new_parameters = dict() + parameters = flow.load(model_path) + for key, value in parameters.items(): + if "num_batches_tracked" not in key: + if key == "fc.weight": + continue + val = value + new_key = key.replace("backbone.", "") + new_parameters[new_key] = val + model_module.load_state_dict(new_parameters) + flow.save(model_module.state_dict(), tmpdirname) + convert_to_onnx_and_check( + model_graph, flow_weight_dir=tmpdirname, onnx_model_path="./", print_outlier=True) def main(args): @@ -30,7 +52,7 @@ def main(args): cfg = get_config(args.config) if not os.path.exists(args.out_path): mkdir(args.out_path) - convert_func(cfg, args.model_path, args.out_path) + convert_func(cfg, args.model_path, args.out_path,args.image_size) if __name__ == "__main__": @@ -38,6 +60,8 @@ if __name__ == "__main__": parser = argparse.ArgumentParser(description='OneFlow ArcFace val') parser.add_argument('config', type=str, help='py config file') parser.add_argument('--model_path', type=str, help='model path') + parser.add_argument('--image_size', type=int, + default=112, help='input image size') parser.add_argument('--out_path', type=str, default="onnx_model", help='out path') - main(parser.parse_args()) + diff --git a/recognition/arcface_oneflow/requirements.txt b/recognition/arcface_oneflow/requirements.txt new file mode 100644 index 0000000..9b31a23 --- /dev/null +++ b/recognition/arcface_oneflow/requirements.txt @@ -0,0 +1,7 @@ +numpy +matplotlib +Pillow +opencv-python +scikit-learn +scipy +easydict \ No newline at end of file diff --git a/recognition/arcface_oneflow/run.sh b/recognition/arcface_oneflow/run.sh deleted file mode 100644 index ac2cf2c..0000000 --- a/recognition/arcface_oneflow/run.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -export PYTHONUNBUFFERED=1 -echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED -export NCCL_LAUNCH_MODE=PARALLEL -echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE -export NCCL_DEBUG=False -export ONEFLOW_DEBUG_MODE=False - -#CUDA_VISIBLE_DEVICES='1' -python train.py configs/ms1mv3_r50.py --device_num_per_node 8 \ No newline at end of file diff --git a/recognition/arcface_oneflow/test.sh b/recognition/arcface_oneflow/test.sh deleted file mode 100644 index 5c682ce..0000000 --- a/recognition/arcface_oneflow/test.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -python val.py configs/ms1mv3_r50 --model_path work_dir/lazy_r50/snapshot_0 \ No newline at end of file diff --git a/recognition/arcface_oneflow/tools/dataset_convert/mx_recordio_2_ofrecord.py b/recognition/arcface_oneflow/tools/mx_recordio_2_ofrecord.py similarity index 86% rename from recognition/arcface_oneflow/tools/dataset_convert/mx_recordio_2_ofrecord.py rename to recognition/arcface_oneflow/tools/mx_recordio_2_ofrecord.py index 3001984..b7047a3 100644 --- a/recognition/arcface_oneflow/tools/dataset_convert/mx_recordio_2_ofrecord.py +++ b/recognition/arcface_oneflow/tools/mx_recordio_2_ofrecord.py @@ -37,10 +37,7 @@ def load_train_data(data_dir): ) ) - imgrec = recordio.MXIndexedRecordIO( - path_imgidx, path_imgrec, "r", key_type=int - ) - # TODO: key_type ?? + imgrec = recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, "r", key_type=int) # Read header0 to get some info. identity_key_start = 0 @@ -64,15 +61,6 @@ def load_train_data(data_dir): else: imgidx_list = imgrec.keys - - # print id2range to txt file - # with open('id2range.txt', 'w') as f: - # for identity in range(identity_key_start, identity_key_end): - # l = str(identity) \ - # + ' ' \ - # + str(id2range[identity][0]) \ - # + ' ' + str(id2range[identity][1]) + '\n' - # f.write(l) return imgrec, imgidx_list @@ -129,11 +117,7 @@ def main(args): with open(output_file, "wb") as f: for idx in imgidx_list: if idx % 10000 == 0: - print( - "Converting images: {} of {}".format( - idx, len(imgidx_list) - ) - ) + print("Converting images: {} of {}".format(idx, len(imgidx_list))) img_data = {} rec = imgrec.read_idx(idx) diff --git a/recognition/arcface_oneflow/tools/dataset_convert/mx_recordio_2_ofrecord_shuffled_npart.py b/recognition/arcface_oneflow/tools/mx_recordio_2_ofrecord_shuffled_npart.py similarity index 89% rename from recognition/arcface_oneflow/tools/dataset_convert/mx_recordio_2_ofrecord_shuffled_npart.py rename to recognition/arcface_oneflow/tools/mx_recordio_2_ofrecord_shuffled_npart.py index 8a97570..ba3b82c 100644 --- a/recognition/arcface_oneflow/tools/dataset_convert/mx_recordio_2_ofrecord_shuffled_npart.py +++ b/recognition/arcface_oneflow/tools/mx_recordio_2_ofrecord_shuffled_npart.py @@ -25,10 +25,7 @@ def parse_arguement(argv): help="Path to output OFRecord.", ) parser.add_argument( - "--num_part", - type=int, - default=96, - help="num_part of OFRecord to generate.", + "--num_part", type=int, default=96, help="num_part of OFRecord to generate.", ) return parser.parse_args(argv) @@ -40,14 +37,12 @@ def load_train_data(data_dir): print( "Loading recordio {}\n\ - Corresponding record idx is {}".format( + Corresponding record idx is {}".format( path_imgrec, path_imgidx ) ) - imgrec = recordio.MXIndexedRecordIO( - path_imgidx, path_imgrec, "r", key_type=int - ) + imgrec = recordio.MXIndexedRecordIO(path_imgidx, path_imgrec, "r", key_type=int) # Read header0 to get some info. identity_key_start = 0 @@ -135,16 +130,12 @@ def main(args): output_file = os.path.join(output_dir, part_name) file_idx_start = part_id * num_images_per_part file_idx_end = min((part_id + 1) * num_images_per_part, num_images) - print("part-"+str(part_id), "start", file_idx_start, "end", file_idx_end) + print("part-" + str(part_id), "start", file_idx_start, "end", file_idx_end) with open(output_file, "wb") as f: for file_idx in range(file_idx_start, file_idx_end): idx = imgidx_list[file_idx] if idx % 10000 == 0: - print( - "Converting images: {} of {}".format( - idx, len(imgidx_list) - ) - ) + print("Converting images: {} of {}".format(idx, len(imgidx_list))) img_data = {} rec = imgrec.read_idx(idx) diff --git a/recognition/arcface_oneflow/train.py b/recognition/arcface_oneflow/train.py index 2f8201d..477db3c 100644 --- a/recognition/arcface_oneflow/train.py +++ b/recognition/arcface_oneflow/train.py @@ -2,92 +2,42 @@ import argparse import logging import os import oneflow as flow -import oneflow.nn as nn -import sys -import math -import numpy as np -import pickle -import time -from backbones import get_model -from utils.utils_callbacks import CallBackVerification, CallBackLogging + +from function import Trainer +from utils.utils_logging import init_logging from utils.utils_config import get_config -from utils.utils_logging import AverageMeter, init_logging -from utils.ofrecord_data_utils import load_train_dataset, load_synthetic -from function import make_train_func, Validator def main(args): cfg = get_config(args.config) + cfg.graph = args.graph + rank = flow.env.get_rank() + world_size = flow.env.get_world_size() + placement = flow.env.all_device_placement("cuda") - cfg.device_num_per_node = args.device_num_per_node - cfg.total_batch_size = cfg.batch_size*cfg.device_num_per_node*cfg.num_nodes - cfg.steps_per_epoch = math.ceil(cfg.num_image / cfg.total_batch_size) - cfg.total_step = cfg.num_epoch*cfg.steps_per_epoch - cfg.lr_steps = (np.array(cfg.decay_epoch)*cfg.steps_per_epoch).tolist() - lr_scales = [0.1, 0.01, 0.001, 0.0001] - cfg.lr_scales = lr_scales[:len(cfg.lr_steps)] - cfg.output = os.path.join("work_dir", cfg.output, cfg.loss) - - world_size = cfg.num_nodes os.makedirs(cfg.output, exist_ok=True) - log_root = logging.getLogger() - init_logging(log_root, cfg.output) - flow.config.gpu_device_num(cfg.device_num_per_node) - logging.info("gpu num: %d" % cfg.device_num_per_node) - if cfg.num_nodes > 1: - assert cfg.num_nodes <= len( - cfg.node_ips), "The number of nodes should not be greater than length of node_ips list." - flow.env.ctrl_port(12138) - nodes = [] - for ip in cfg.node_ips: - addr_dict = {} - addr_dict["addr"] = ip - nodes.append(addr_dict) - flow.env.machine(nodes) - flow.env.log_dir(cfg.output) + init_logging(log_root, rank, cfg.output) + + # root dir of loading checkpoint + load_path = None for key, value in cfg.items(): - num_space = 35 - len(key) + num_space = 25 - len(key) logging.info(": " + key + " " * num_space + str(value)) - train_func = make_train_func(cfg) - val_infer = Validator(cfg) - - callback_verification = CallBackVerification( - 3000, cfg.val_targets, cfg.eval_ofrecord_path) - callback_logging = CallBackLogging( - 50, cfg.total_step, cfg.total_batch_size, world_size, None) - - if cfg.resume and os.path.exists(cfg.model_load_dir): - logging.info("Loading model from {}".format(cfg.model_load_dir)) - variables = flow.checkpoint.get(cfg.model_load_dir) - flow.load_variables(variables) - - start_epoch = 0 - global_step = 0 - lr = cfg.lr - for epoch in range(start_epoch, cfg.num_epoch): - for steps in range(cfg.steps_per_epoch): - train_func().async_get(callback_logging.metric_cb(global_step, epoch, lr)) - callback_verification(global_step, val_infer.get_symbol_val_fn) - global_step += 1 - if epoch in cfg.decay_epoch: - lr *= 0.1 - logging.info("lr_steps: %d" % global_step) - logging.info("lr change to %f" % lr) - - # snapshot - path = os.path.join( - cfg.output, "snapshot_" + str(epoch)) - flow.checkpoint.save(path) - logging.info("oneflow Model Saved in '{}'".format(path)) + trainer = Trainer(cfg, placement, load_path, world_size, rank) + trainer() if __name__ == "__main__": - parser = argparse.ArgumentParser(description='OneFlow ArcFace Training') - parser.add_argument('config', type=str, help='py config file') - parser.add_argument('--local_rank', type=int, default=0, help='local_rank') - parser.add_argument('--device_num_per_node', type=int, - default=1, help='local_rank') + + parser = argparse.ArgumentParser(description="OneFlow ArcFace Training") + parser.add_argument("config", type=str, help="py config file") + parser.add_argument( + "--graph", + action="store_true", + help="Run model in graph mode,else run model in ddp mode.", + ) + parser.add_argument("--local_rank", type=int, default=0, help="local_rank") main(parser.parse_args()) diff --git a/recognition/arcface_oneflow/train_ddp.sh b/recognition/arcface_oneflow/train_ddp.sh new file mode 100644 index 0000000..86ed541 --- /dev/null +++ b/recognition/arcface_oneflow/train_ddp.sh @@ -0,0 +1,25 @@ +# set -aux + +MASTER_ADDR=127.0.0.1 +MASTER_PORT=17788 +DEVICE_NUM_PER_NODE=8 +NUM_NODES=1 +NODE_RANK=0 + + +export PYTHONUNBUFFERED=1 +echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED +export NCCL_LAUNCH_MODE=PARALLEL +echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE +export NCCL_DEBUG=INFO +export ONEFLOW_DEBUG_MODE=True + + +NCCL_DEBUG=INFO \ +python3 -m oneflow.distributed.launch \ +--nproc_per_node $DEVICE_NUM_PER_NODE \ +--nnodes $NUM_NODES \ +--node_rank $NODE_RANK \ +--master_addr $MASTER_ADDR \ +--master_port $MASTER_PORT \ +train.py configs/ms1mv3_r50.py diff --git a/recognition/arcface_oneflow/train_graph_distributed.sh b/recognition/arcface_oneflow/train_graph_distributed.sh new file mode 100644 index 0000000..4f57250 --- /dev/null +++ b/recognition/arcface_oneflow/train_graph_distributed.sh @@ -0,0 +1,26 @@ +# set -aux + +MASTER_ADDR=127.0.0.1 +MASTER_PORT=17788 +DEVICE_NUM_PER_NODE=8 +NUM_NODES=1 +NODE_RANK=0 + + + +export PYTHONUNBUFFERED=1 +echo PYTHONUNBUFFERED=$PYTHONUNBUFFERED +export NCCL_LAUNCH_MODE=PARALLEL +echo NCCL_LAUNCH_MODE=$NCCL_LAUNCH_MODE +#export NCCL_DEBUG=INFO +export ONEFLOW_DEBUG_MODE=True + + +#NCCL_DEBUG=INFO +python3 -m oneflow.distributed.launch \ +--nproc_per_node $DEVICE_NUM_PER_NODE \ +--nnodes $NUM_NODES \ +--node_rank $NODE_RANK \ +--master_addr $MASTER_ADDR \ +--master_port $MASTER_PORT \ +train.py configs/ms1mv3_r50.py --graph diff --git a/recognition/arcface_oneflow/utils/losses.py b/recognition/arcface_oneflow/utils/losses.py new file mode 100644 index 0000000..5d4aa1d --- /dev/null +++ b/recognition/arcface_oneflow/utils/losses.py @@ -0,0 +1,66 @@ +import oneflow as flow +from oneflow import nn + + +def get_loss(name): + if name == "cosface": + return CosFace() + elif name == "arcface": + return ArcFace() + else: + raise ValueError() + + +class CrossEntropyLoss_sbp(nn.Module): + def __init__(self): + super(CrossEntropyLoss_sbp, self).__init__() + + def forward(self, logits, label): + loss = flow._C.sparse_softmax_cross_entropy( + logits, label) + loss = flow.mean(loss) + return loss + + +def get_loss(name): + if name == "cosface": + return CosFace() + elif name == "arcface": + return ArcFace() + else: + raise ValueError() + + +class CosFace(nn.Module): + def __init__(self, s=64.0, m=0.40): + super(CosFace, self).__init__() + self.s = s + self.m = m + + def forward(self, cosine, label): + index = flow.where(label != -1)[0] + m_hot = flow.zeros(index.size()[0], cosine.size()[ + 1], device=cosine.device) + + m_hot = flow.scatter(m_hot, 1, label[index, None], self.m) + cosine = cosine[index] - m_hot + + ret = cosine * self.s + return ret + + +class ArcFace(nn.Module): + def __init__(self, s=64.0, m=0.5): + super(ArcFace, self).__init__() + self.s = s + self.m = m + + def forward(self, cosine: flow.Tensor, label): + index = flow.where(label != -1)[0] + m_hot = flow.zeros(index.size()[0], cosine.size()[ + 1], device=cosine.device) + m_hot.scatter_(1, label[index, None], self.m) + cosine.acos_() + cosine[index] += m_hot + cosine.cos_().mul_(self.s) + return cosine diff --git a/recognition/arcface_oneflow/utils/ofrecord_data_utils.py b/recognition/arcface_oneflow/utils/ofrecord_data_utils.py index 63999a7..449fe25 100644 --- a/recognition/arcface_oneflow/utils/ofrecord_data_utils.py +++ b/recognition/arcface_oneflow/utils/ofrecord_data_utils.py @@ -1,70 +1,148 @@ -import os import oneflow as flow +import oneflow.nn as nn +import os +from typing import List, Union -def train_dataset_reader( - args, data_dir, batch_size, data_part_num, part_name_suffix_length=1 -): - if os.path.exists(data_dir): - print("Loading train data from {}".format(data_dir)) - else: - raise Exception("Invalid train dataset dir", data_dir) - image_blob_conf = flow.data.BlobConf( - "encoded", - shape=(112, 112, 3), - dtype=flow.float, - codec=flow.data.ImageCodec( - image_preprocessors=[ - flow.data.ImagePreprocessor("bgr2rgb"), - flow.data.ImagePreprocessor("mirror"), - ] - ), - preprocessors=[ - flow.data.NormByChannelPreprocessor( - mean_values=(127.5, 127.5, 127.5), std_values=(127.5, 127.5, 127.5), data_format="NCHW" - ), - ], - ) +class OFRecordDataLoader(nn.Module): + def __init__( + self, + ofrecord_root: str = "./ofrecord", + mode: str = "train", # "val" + dataset_size: int = 9469, + batch_size: int = 1, + total_batch_size: int = 1, + data_part_num: int = 8, + placement: flow.placement = None, + sbp: Union[flow.sbp.sbp, List[flow.sbp.sbp]] = None, + ): + super().__init__() + channel_last = False + output_layout = "NHWC" if channel_last else "NCHW" + assert (ofrecord_root, mode) + self.train_record_reader = flow.nn.OfrecordReader( + os.path.join(ofrecord_root, mode), + batch_size=batch_size, + data_part_num=data_part_num, + part_name_suffix_length=5, + random_shuffle=True if mode == "train" else False, + shuffle_after_epoch=True if mode == "train" else False, + placement=placement, + sbp=sbp, + ) + self.record_label_decoder = flow.nn.OfrecordRawDecoder( + "label", shape=(), dtype=flow.int32 + ) - label_blob_conf = flow.data.BlobConf( - "label", shape=(), dtype=flow.int32, codec=flow.data.RawCodec() - ) + color_space = "RGB" + height = 112 + width = 112 - return flow.data.decode_ofrecord( - data_dir, - (label_blob_conf, image_blob_conf), - batch_size=batch_size, - data_part_num=data_part_num, - part_name_prefix=args.part_name_prefix, - part_name_suffix_length=args.part_name_suffix_length, - shuffle=args.shuffle, - buffer_size=16384, - ) + self.record_image_decoder = flow.nn.OFRecordImageDecoder( + "encoded", color_space=color_space + ) + self.resize = ( + flow.nn.image.Resize(target_size=[height, width]) + if mode == "train" + else flow.nn.image.Resize( + resize_side="shorter", keep_aspect_ratio=True, target_size=112 + ) + ) + + self.flip = ( + flow.nn.CoinFlip(batch_size=batch_size, placement=placement, sbp=sbp) + if mode == "train" + else None + ) + + rgb_mean = [127.5, 127.5, 127.5] + rgb_std = [127.5, 127.5, 127.5] + self.crop_mirror_norm = ( + flow.nn.CropMirrorNormalize( + color_space=color_space, + output_layout=output_layout, + mean=rgb_mean, + std=rgb_std, + output_dtype=flow.float, + ) + if mode == "train" + else flow.nn.CropMirrorNormalize( + color_space=color_space, + output_layout=output_layout, + crop_h=0, + crop_w=0, + crop_pos_y=0.5, + crop_pos_x=0.5, + mean=rgb_mean, + std=rgb_std, + output_dtype=flow.float, + ) + ) + + self.batch_size = batch_size + self.total_batch_size = total_batch_size + self.dataset_size = dataset_size + + def __len__(self): + return self.dataset_size // self.total_batch_size + + def forward(self): + train_record = self.train_record_reader() + label = self.record_label_decoder(train_record) + image_raw_buffer = self.record_image_decoder(train_record) + + image = self.resize(image_raw_buffer)[0] + + rng = self.flip() if self.flip != None else None + image = self.crop_mirror_norm(image, rng) + + return image, label -def load_synthetic(config): - batch_size = config.train_batch_size - image_size = 112 - label = flow.data.decode_random( - shape=(), - dtype=flow.int32, - batch_size=batch_size, - initializer=flow.zeros_initializer(flow.int32), - ) +class SyntheticDataLoader(flow.nn.Module): + def __init__( + self, batch_size, image_size=112, num_classes=10000, placement=None, sbp=None, + ): + super().__init__() - image = flow.data.decode_random( - shape=(image_size, image_size, 3), dtype=flow.float, batch_size=batch_size, - ) - return label, image + self.image_shape = (batch_size, 3, image_size, image_size) + self.label_shape = (batch_size,) + self.num_classes = num_classes + self.placement = placement + self.sbp = sbp + if self.placement is not None and self.sbp is not None: + self.image = flow.nn.Parameter( + flow.randint( + 0, + high=255, + size=self.image_shape, + dtype=flow.float32, + placement=self.placement, + sbp=self.sbp, + ), + requires_grad=False, + ) + self.label = flow.nn.Parameter( + flow.randint( + 0, + high=self.num_classes, + size=self.label_shape, + placement=self.placement, + sbp=self.sbp, + ).to(dtype=flow.int32), + requires_grad=False, + ) + else: + self.image = flow.randint( + 0, high=255, size=self.image_shape, dtype=flow.float32, device="cuda" + ) + self.label = flow.randint( + 0, high=self.num_classes, size=self.label_shape, device="cuda", + ).to(dtype=flow.int32) -def load_train_dataset(args): - data_dir = args.ofrecord_path - batch_size = args.total_batch_size - data_part_num = args.train_data_part_num - part_name_suffix_length = args.part_name_suffix_length - print("train batch size in load train dataset: ", batch_size) - labels, images = train_dataset_reader( - args, data_dir, batch_size, data_part_num, part_name_suffix_length - ) - return labels, images + def __len__(self): + return 10000 + + def forward(self): + return self.image, self.label diff --git a/recognition/arcface_oneflow/utils/utils_callbacks.py b/recognition/arcface_oneflow/utils/utils_callbacks.py index ff5e785..fa1a1df 100644 --- a/recognition/arcface_oneflow/utils/utils_callbacks.py +++ b/recognition/arcface_oneflow/utils/utils_callbacks.py @@ -9,55 +9,85 @@ from eval import verification from utils.utils_logging import AverageMeter - class CallBackVerification(object): - def __init__(self, frequent, val_targets, rec_prefix, image_size=(112, 112),world_size=1): + def __init__( + self, + frequent, + rank, + val_targets, + rec_prefix, + image_size=(112, 112), + world_size=1, + is_consistent=False, + ): self.frequent: int = frequent - + self.rank: int = rank self.highest_acc: float = 0.0 self.highest_acc_list: List[float] = [0.0] * len(val_targets) self.ver_list: List[object] = [] self.ver_name_list: List[str] = [] - self.world_size=world_size - - self.init_dataset(val_targets=val_targets, data_dir=rec_prefix, image_size=image_size) + self.world_size = world_size + self.is_consistent = is_consistent + if self.is_consistent: + self.init_dataset( + val_targets=val_targets, data_dir=rec_prefix, image_size=image_size + ) + else: + if self.rank is 0: + self.init_dataset( + val_targets=val_targets, data_dir=rec_prefix, image_size=image_size + ) def ver_test(self, backbone: flow.nn.Module, global_step: int): results = [] for i in range(len(self.ver_list)): - + acc1, std1, acc2, std2, xnorm, embeddings_list = verification.test( - self.ver_list[i], backbone, 10, 10) - logging.info('[%s][%d]XNorm: %f' % (self.ver_name_list[i], global_step, xnorm)) - logging.info('[%s][%d]Accuracy-Flip: %1.5f+-%1.5f' % (self.ver_name_list[i], global_step, acc2, std2)) + self.ver_list[i], backbone, 10, 10, self.is_consistent + ) + logging.info( + "[%s][%d]XNorm: %f" % (self.ver_name_list[i], global_step, xnorm) + ) + logging.info( + "[%s][%d]Accuracy-Flip: %1.5f+-%1.5f" + % (self.ver_name_list[i], global_step, acc2, std2) + ) if acc2 > self.highest_acc_list[i]: self.highest_acc_list[i] = acc2 logging.info( - '[%s][%d]Accuracy-Highest: %1.5f' % (self.ver_name_list[i], global_step, self.highest_acc_list[i])) + "[%s][%d]Accuracy-Highest: %1.5f" + % (self.ver_name_list[i], global_step, self.highest_acc_list[i]) + ) results.append(acc2) - def init_dataset(self, val_targets, data_dir, image_size): for name in val_targets: - path = os.path.join(data_dir, name + ".bin") + path = os.path.join(data_dir, name + ".bin") if os.path.exists(path): data_set = verification.load_bin_cv(path, image_size) self.ver_list.append(data_set) self.ver_name_list.append(name) + def __call__(self, num_update, backbone: flow.nn.Module, backbone_graph=None): - def __call__(self, num_update, backbone): - if num_update > 0 and num_update % self.frequent == 0: - self.ver_test(backbone, num_update) - + if self.is_consistent: + if num_update > 0 and num_update % self.frequent == 0: + backbone.eval() + self.ver_test(backbone_graph, num_update) + backbone.train() + else: + if self.rank is 0 and num_update > 0 and num_update % self.frequent == 0: + backbone.eval() + self.ver_test(backbone, num_update) + backbone.train() class CallBackLogging(object): - def __init__(self, frequent, total_step, batch_size, world_size, writer=None): + def __init__(self, frequent, rank, total_step, batch_size, world_size, writer=None): self.frequent: int = frequent - + self.rank: int = rank self.time_start = time.time() self.total_step: int = total_step self.batch_size: int = batch_size @@ -66,42 +96,80 @@ class CallBackLogging(object): self.init = False self.tic = 0 - self.losses=AverageMeter() - def metric_cb(self, - global_step: int, - epoch: int, - learning_rate: float): - def callback(loss): - loss=loss.mean() - self.losses.update(loss, 1) - if global_step % self.frequent == 0: + def __call__( + self, + global_step: int, + loss: AverageMeter, + epoch: int, + fp16: bool, + learning_rate: float, + grad_scaler=None, + ): + if self.rank == 0 and global_step % self.frequent == 0: + if self.init: + try: + speed: float = self.frequent * self.batch_size / ( + time.time() - self.tic + ) + speed_total = speed * self.world_size + except ZeroDivisionError: + speed_total = float("inf") - if self.init: - try: - speed: float = self.frequent * self.batch_size / (time.time() - self.tic) - speed_total = speed * self.world_size - except ZeroDivisionError: - speed_total = float('inf') - - time_now = (time.time() - self.time_start) / 3600 - time_total = time_now / ((global_step + 1) / self.total_step) - time_for_end = time_total - time_now - if self.writer is not None: - self.writer.add_scalar('time_for_end', time_for_end, global_step) - self.writer.add_scalar('learning_rate', learning_rate, global_step) - self.writer.add_scalar('loss', loss.avg, global_step) - else: - msg = "Speed %.2f samples/sec Loss %.4f LearningRate %.4f Epoch: %d Global Step: %d " \ - "Required: %1.f hours" % ( - speed_total, self.losses.avg, learning_rate, epoch, global_step, time_for_end - ) - logging.info(msg) - self.losses.reset() - self.tic = time.time() + time_now = (time.time() - self.time_start) / 3600 + time_total = time_now / ((global_step + 1) / self.total_step) + time_for_end = time_total - time_now + if self.writer is not None: + self.writer.add_scalar("time_for_end", time_for_end, global_step) + self.writer.add_scalar("learning_rate", learning_rate, global_step) + self.writer.add_scalar("loss", loss.avg, global_step) + if fp16: + msg = ( + "Speed %.2f samples/sec Loss %.4f LearningRate %.4f Epoch: %d Global Step: %d " + "Fp16 Grad Scale: %2.f Required: %1.f hours" + % ( + speed_total, + loss.avg, + learning_rate, + epoch, + global_step, + time_for_end, + ) + ) else: - self.init = True - self.tic = time.time() - return callback + msg = ( + "Speed %.2f samples/sec Loss %.4f LearningRate %.4f Epoch: %d Global Step: %d " + "Required: %1.f hours" + % ( + speed_total, + loss.avg, + learning_rate, + epoch, + global_step, + time_for_end, + ) + ) + logging.info(msg) + loss.reset() + self.tic = time.time() + else: + self.init = True + self.tic = time.time() +class CallBackModelCheckpoint(object): + def __init__(self, rank, output="./"): + self.rank: int = rank + self.output: str = output + + def __call__(self, global_step, epoch, backbone, is_consistent=False): + + if global_step > 100 and backbone is not None: + path_module = os.path.join(self.output, "epoch_%d" % (epoch)) + + if is_consistent: + flow.save(backbone.state_dict(), path_module, consistent_dst_rank=0) + else: + if self.rank == 0: + flow.save(backbone.state_dict(), path_module) + logging.info("oneflow Model Saved in '{}'".format(path_module)) diff --git a/recognition/arcface_oneflow/utils/utils_config.py b/recognition/arcface_oneflow/utils/utils_config.py index 67f06a4..b61724b 100644 --- a/recognition/arcface_oneflow/utils/utils_config.py +++ b/recognition/arcface_oneflow/utils/utils_config.py @@ -4,7 +4,8 @@ import os.path as osp def get_config(config_file): assert config_file.startswith( - 'configs/'), 'config file setting must start with configs/' + "configs/" + ), "config file setting must start with configs/" temp_config_name = osp.basename(config_file) temp_module_name = osp.splitext(temp_config_name)[0] config = importlib.import_module("configs.base") @@ -13,5 +14,5 @@ def get_config(config_file): job_cfg = config.config cfg.update(job_cfg) if cfg.output is None: - cfg.output = osp.join('work_dirs', temp_module_name) + cfg.output = osp.join("work_dirs", temp_module_name) return cfg diff --git a/recognition/arcface_oneflow/utils/utils_logging.py b/recognition/arcface_oneflow/utils/utils_logging.py index 34bcdfc..543a7e1 100644 --- a/recognition/arcface_oneflow/utils/utils_logging.py +++ b/recognition/arcface_oneflow/utils/utils_logging.py @@ -27,14 +27,14 @@ class AverageMeter(object): self.avg = self.sum / self.count -def init_logging(log_root, models_root): - - log_root.setLevel(logging.INFO) - formatter = logging.Formatter("Training: %(asctime)s-%(message)s") - handler_file = logging.FileHandler( - os.path.join(models_root, "training.log")) - handler_stream = logging.StreamHandler(sys.stdout) - handler_file.setFormatter(formatter) - handler_stream.setFormatter(formatter) - log_root.addHandler(handler_file) - log_root.addHandler(handler_stream) +def init_logging(log_root, rank, models_root): + if rank is 0: + log_root.setLevel(logging.INFO) + formatter = logging.Formatter("Training: %(asctime)s-%(message)s") + handler_file = logging.FileHandler(os.path.join(models_root, "training.log")) + handler_stream = logging.StreamHandler(sys.stdout) + handler_file.setFormatter(formatter) + handler_stream.setFormatter(formatter) + log_root.addHandler(handler_file) + log_root.addHandler(handler_stream) + log_root.info("rank_id: %d" % rank) diff --git a/recognition/arcface_oneflow/val.py b/recognition/arcface_oneflow/val.py index 876e293..3645c3a 100644 --- a/recognition/arcface_oneflow/val.py +++ b/recognition/arcface_oneflow/val.py @@ -1,33 +1,44 @@ -from utils.utils_logging import AverageMeter, init_logging -import argparse -from function import Validator -from utils.utils_config import get_config -import logging -import os -from backbones import get_model -from utils.utils_callbacks import CallBackVerification -from eval import verification +import backbones import oneflow as flow -import sys +from utils.utils_callbacks import CallBackVerification +from backbones import get_model +from graph import TrainGraph, EvalGraph +import logging +import argparse +from utils.utils_config import get_config +from function import EvalGraph def main(args): cfg = get_config(args.config) - logging.basicConfig(level=logging.NOTSET) logging.info(args.model_path) - val_infer = Validator(cfg) - val_callback = CallBackVerification( - 1, cfg.val_targets, cfg.eval_ofrecord_path, image_nums=cfg.val_image_num) - val_infer.load_checkpoint(args.model_path) - val_callback(1000, val_infer.get_symbol_val_fn) + backbone = get_model(cfg.network, dropout=0.0, num_features=cfg.embedding_size).to( + "cuda" + ) + val_callback = CallBackVerification(1, 0, cfg.val_targets, cfg.ofrecord_path) + + state_dict = flow.load(args.model_path) + + new_parameters = dict() + for key, value in state_dict.items(): + if "num_batches_tracked" not in key: + if key == "fc.weight": + continue + new_key = key.replace("backbone.", "") + new_parameters[new_key] = value + + backbone.load_state_dict(new_parameters) + + infer_graph = EvalGraph(backbone) + val_callback(1000, backbone, infer_graph) if __name__ == "__main__": - parser = argparse.ArgumentParser(description='OneFlow ArcFace val') - parser.add_argument('config', type=str, help='py config file') - parser.add_argument('--model_path', type=str, help='model path') + parser = argparse.ArgumentParser(description="OneFlow ArcFace val") + parser.add_argument("config", type=str, help="py config file") + parser.add_argument("--model_path", type=str, help="model path") main(parser.parse_args()) diff --git a/recognition/arcface_oneflow/val.sh b/recognition/arcface_oneflow/val.sh new file mode 100644 index 0000000..8151f40 --- /dev/null +++ b/recognition/arcface_oneflow/val.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +python val.py configs/ms1mv3_r50 --model_path eager_test/epoch_0