From f5c678e4ea301f07c336093cda5252d408e979da Mon Sep 17 00:00:00 2001 From: Jia Guo Date: Thu, 16 Nov 2017 14:29:48 +0800 Subject: [PATCH] tiny --- src/data.py | 6 + src/operator/lsoftmax-inl.h | 379 ++++++++++++++++++++++++++++++++++++ src/operator/lsoftmax.cc | 75 +++++++ src/operator/lsoftmax.cu | 322 ++++++++++++++++++++++++++++++ src/train.sh | 64 ++++++ src/train_softmax.py | 49 +++-- 6 files changed, 880 insertions(+), 15 deletions(-) create mode 100644 src/operator/lsoftmax-inl.h create mode 100644 src/operator/lsoftmax.cc create mode 100644 src/operator/lsoftmax.cu create mode 100755 src/train.sh diff --git a/src/data.py b/src/data.py index ba1077b..92c3a74 100644 --- a/src/data.py +++ b/src/data.py @@ -489,6 +489,12 @@ class FaceImageIter2(io.DataIter): else: label, fname, bbox, landmark = self.imglist[idx] return label, self.read_image(fname), bbox, landmark + else: + s = self.imgrec.read() + if s is None: + raise StopIteration + header, img = recordio.unpack(s) + return header.label, img, None, None def brightness_aug(self, src, x): alpha = 1.0 + random.uniform(-x, x) diff --git a/src/operator/lsoftmax-inl.h b/src/operator/lsoftmax-inl.h new file mode 100644 index 0000000..33d51bf --- /dev/null +++ b/src/operator/lsoftmax-inl.h @@ -0,0 +1,379 @@ +/*! + * Copyright (c) 2016 by Contributors + * \file lsoftmax-inl.h + * \brief LSoftmax from + * \author luoyetx + */ +#ifndef MXNET_OPERATOR_LSOFTMAX_INL_H_ +#define MXNET_OPERATOR_LSOFTMAX_INL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include "./operator_common.h" + +namespace mxnet { +namespace op { + +namespace lsoftmax_enum { +enum LSoftmaxOpInputs {kData, kWeight, kLabel}; +enum LSoftmaxOpOutputs {kOut, kDataNorm, kWeightNorm}; +enum LSoftmaxResource {kTempSpace}; +} + +struct LSoftmaxParam : public dmlc::Parameter { + int margin; + float beta; + float beta_min; + float scale; + int num_hidden; + bool grad_norm; + int verbose; + float eps; + DMLC_DECLARE_PARAMETER(LSoftmaxParam) { + DMLC_DECLARE_FIELD(margin).set_default(2).set_lower_bound(1) + .describe("LSoftmax margin"); + DMLC_DECLARE_FIELD(beta).set_default(1).set_lower_bound(0) + .describe("LSoftmax beta, same as lambda to weight original value"); + DMLC_DECLARE_FIELD(beta_min).set_default(0).set_lower_bound(0) + .describe("Minimum beta"); + DMLC_DECLARE_FIELD(scale).set_default(1).set_range(0, 1) + .describe("Scale of beta during training for every iteration"); + DMLC_DECLARE_FIELD(num_hidden).set_lower_bound(1) + .describe("Number of hidden nodes of the output"); + DMLC_DECLARE_FIELD(grad_norm).set_default(false) + .describe("do grad norm"); + DMLC_DECLARE_FIELD(verbose).set_default(0) + .describe("Log for beta change"); + DMLC_DECLARE_FIELD(eps).set_default(1e-10f) + .describe("l2 eps"); + } +}; + +template +class LSoftmaxOp : public Operator { + public: + explicit LSoftmaxOp(LSoftmaxParam param) { + this->param_ = param; + // setup global lookup table + k_table_.clear(); + c_table_.clear(); + k_table_.push_back(1); + c_table_.push_back(1); + const int margin = param.margin; + const double pi = std::atan(1) * 4; + double factor = 1; + for (int i = 1; i <= margin; ++i) { + factor = factor * (margin - i + 1) / i; + k_table_.push_back(std::cos(i * pi / margin)); + c_table_.push_back(factor); + } + //next_beta_ = param.beta * 0.1f; + count_ = 0; + if(const char* env_p = std::getenv("BETA")) { + float _beta = std::atof(env_p); + if (param_.verbose) { + LOG(INFO)<<"beta:"<<_beta; + } + param_.beta = _beta; + } + else if(const char* env_p = std::getenv("GLOBAL_STEP")) { + int nbatch = std::atoi(env_p); + if (param_.verbose) { + LOG(INFO)<<"nbatch:"< &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(in_data.size(), 3); + CHECK_EQ(out_data.size(), 3); + CHECK_EQ(req.size(), 3); + CHECK_EQ(req[lsoftmax_enum::kOut], kWriteTo); + CHECK(req[lsoftmax_enum::kDataNorm] == kNullOp || + req[lsoftmax_enum::kDataNorm] == kWriteTo); + CHECK(req[lsoftmax_enum::kWeightNorm] == kNullOp || + req[lsoftmax_enum::kWeightNorm] == kWriteTo); + Stream *s = ctx.get_stream(); + const int n = in_data[lsoftmax_enum::kData].size(0); + const int m = in_data[lsoftmax_enum::kWeight].size(0); + Tensor x = in_data[lsoftmax_enum::kData].FlatTo2D(s); + Tensor w = in_data[lsoftmax_enum::kWeight].FlatTo2D(s); + Tensor label = in_data[lsoftmax_enum::kLabel].get_with_shape(Shape1(n), s); + Tensor out = out_data[lsoftmax_enum::kOut].FlatTo2D(s); + Tensor x_norm = out_data[lsoftmax_enum::kDataNorm].get_with_shape(Shape1(n), s); + Tensor w_norm = out_data[lsoftmax_enum::kWeightNorm].get_with_shape(Shape1(m), s); +#if defined(__CUDACC__) + CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) + << "Must init CuBLAS handle in stream"; +#endif + // original fully connected + out = dot(x, w.T()); + if (ctx.is_train) { + // large margin fully connected + const int margin = param_.margin; + if(const char* env_p = std::getenv("BETA")) { + float _beta = std::atof(env_p); + param_.beta = _beta; + } + const DType beta = static_cast(param_.beta); + //LOG(INFO)<<"beta:"< k_table_cpu(k_table_.data(), Shape1(k_table_.size())); + Tensor c_table_cpu(c_table_.data(), Shape1(c_table_.size())); + Tensor k_table_xpu(Shape1(k_table_.size())); + Tensor c_table_xpu(Shape1(c_table_.size())); + k_table_xpu.set_stream(s); + c_table_xpu.set_stream(s); + AllocSpace(&k_table_xpu); + AllocSpace(&c_table_xpu); + Copy(k_table_xpu, k_table_cpu, s); + Copy(c_table_xpu, c_table_cpu, s); + LSoftmaxForward(x, w, label, out, x_norm, w_norm, k_table_xpu, c_table_xpu, margin, beta); + FreeSpace(&k_table_xpu); + FreeSpace(&c_table_xpu); + } + } + + //virtual void GradNorm(mshadow::Tensor grad, mshadow::Stream* s) { + // using namespace mshadow; + // using namespace mshadow::expr; + // Tensor grad_cpu(grad.shape_); + // AllocSpace(&grad_cpu); + // Copy(grad_cpu, grad, s); + // DType grad_norm = param_.eps; + // for(uint32_t i=0;i grad, mshadow::Stream* s) { + using namespace mshadow; + using namespace mshadow::expr; + Tensor grad_cpu(grad.shape_); + AllocSpace(&grad_cpu); + Copy(grad_cpu, grad, s); + DType grad_norm = param_.eps; + for(uint32_t i=0;i &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(out_grad.size(), 1); + CHECK_EQ(in_data.size(), 3); + CHECK_EQ(out_data.size(), 3); + CHECK_GE(in_grad.size(), 2); + CHECK_GE(req.size(), 2); + CHECK_EQ(req[lsoftmax_enum::kData], kWriteTo); + CHECK_EQ(req[lsoftmax_enum::kWeight], kWriteTo); + Stream *s = ctx.get_stream(); + const int n = in_data[lsoftmax_enum::kData].size(0); + const int m = in_data[lsoftmax_enum::kWeight].size(0); + Tensor x = in_data[lsoftmax_enum::kData].FlatTo2D(s); + Tensor w = in_data[lsoftmax_enum::kWeight].FlatTo2D(s); + Tensor label = in_data[lsoftmax_enum::kLabel].get_with_shape(Shape1(n), s); + Tensor x_norm = out_data[lsoftmax_enum::kDataNorm].get_with_shape(Shape1(n), s); + Tensor w_norm = out_data[lsoftmax_enum::kWeightNorm].get_with_shape(Shape1(m), s); + Tensor o_grad = out_grad[lsoftmax_enum::kOut].FlatTo2D(s); + Tensor x_grad = in_grad[lsoftmax_enum::kData].FlatTo2D(s); + Tensor w_grad = in_grad[lsoftmax_enum::kWeight].FlatTo2D(s); + // workspace is used for cos_t, cos_mt, k, sin2_t, fo and cos_t_m for every data point + Tensor workspace = ctx.requested[lsoftmax_enum::kTempSpace].get_space_typed(Shape2(6, n), s); +#if defined(__CUDACC__) + CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) + << "Must init CuBLAS handle in stream"; +#endif + // original fully connected + x_grad = dot(o_grad, w); + w_grad = dot(o_grad.T(), x); + // large margin fully connected + const int margin = param_.margin; + const DType beta = static_cast(param_.beta); + count_+=1; + if (param_.verbose) { + if(count_%param_.verbose==0) { + LOG(INFO)<<"["< k_table_cpu(k_table_.data(), Shape1(k_table_.size())); + Tensor c_table_cpu(c_table_.data(), Shape1(c_table_.size())); + Tensor k_table_xpu(Shape1(k_table_.size())); + Tensor c_table_xpu(Shape1(c_table_.size())); + k_table_xpu.set_stream(s); + c_table_xpu.set_stream(s); + AllocSpace(&k_table_xpu); + AllocSpace(&c_table_xpu); + Copy(k_table_xpu, k_table_cpu, s); + Copy(c_table_xpu, c_table_cpu, s); + LSoftmaxBackward(x, w, label, x_norm, w_norm, o_grad, x_grad, w_grad, workspace, + k_table_xpu, c_table_xpu, margin, beta); + FreeSpace(&k_table_xpu); + FreeSpace(&c_table_xpu); + //if(param_.grad_norm) { + // GradNorm(x_grad, s); + // GradNorm(w_grad, s); + //} + // dirty hack, should also work for multi device + if(std::getenv("BETA")==NULL) { + param_.beta *= param_.scale; + param_.beta = std::max(param_.beta, param_.beta_min); + } + //LOG(INFO)<<"w_grad:"<(F(w_grad), 2); + //norm = F(norm + param_.eps); + //out = data / broadcast_with_axis(norm, 1, dshape[2]); + //if (param_.beta < next_beta_) { + // next_beta_ *= 0.1f; + // if (param_.verbose) { + // LOG(INFO) << "LSoftmax changes beta to " << param_.beta; + // } + //} + } + + //Tensor grad_norm(const Tensor grad) { + //} + + + + + private: + LSoftmaxParam param_; + // global lookup table + std::vector k_table_; + std::vector c_table_; + //float next_beta_; + uint32_t count_; +}; // class LSoftmaxOp + +template +Operator *CreateOp(LSoftmaxParam param, int dtype); + +#if DMLC_USE_CXX11 +class LSoftmaxProp : public OperatorProperty { + public: + void Init(const std::vector > &kwargs) override { + param_.Init(kwargs); + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + std::vector ListArguments() const override { + return {"data", "weight", "label"}; + } + + std::vector ListOutputs() const override { + return {"output", "data_norm", "weight_norm"}; + } + + int NumOutputs() const override { + return 3; + } + + int NumVisibleOutputs() const override { + return 1; + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + CHECK_EQ(in_shape->size(), 3) << "Input:[data, label, weight]"; + const TShape &dshape = in_shape->at(lsoftmax_enum::kData); + const TShape &lshape = in_shape->at(lsoftmax_enum::kLabel); + CHECK_EQ(dshape.ndim(), 2) << "data shape should be (batch_size, feature_dim)"; + CHECK_EQ(lshape.ndim(), 1) << "label shape should be (batch_size,)"; + const int n = dshape[0]; + const int feature_dim = dshape[1]; + const int m = param_.num_hidden; + SHAPE_ASSIGN_CHECK(*in_shape, lsoftmax_enum::kWeight, Shape2(m, feature_dim)); + out_shape->clear(); + out_shape->push_back(Shape2(n, m)); // output + out_shape->push_back(Shape1(n)); // data norm + out_shape->push_back(Shape1(m)); // weight norm + aux_shape->clear(); + return true; + } + + std::vector BackwardResource( + const std::vector &in_shape) const override { + return {ResourceRequest::kTempSpace}; + } + + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + return {out_grad[lsoftmax_enum::kOut], out_data[lsoftmax_enum::kDataNorm], + out_data[lsoftmax_enum::kWeightNorm], in_data[lsoftmax_enum::kData], + in_data[lsoftmax_enum::kWeight], in_data[lsoftmax_enum::kLabel]}; + } + + std::string TypeString() const override { + return "LSoftmax"; + } + + OperatorProperty *Copy() const override { + auto ptr = new LSoftmaxProp(); + ptr->param_ = param_; + return ptr; + } + + Operator *CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented."; + return NULL; + } + + Operator *CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; + + private: + LSoftmaxParam param_; +}; // class LSoftmaxProp +#endif // DMLC_USE_CXX11 + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_LSOFTMAX_INL_H_ diff --git a/src/operator/lsoftmax.cc b/src/operator/lsoftmax.cc new file mode 100644 index 0000000..cbf708b --- /dev/null +++ b/src/operator/lsoftmax.cc @@ -0,0 +1,75 @@ +/*! + * Copyright (c) 2016 by Contributors + * \file lsoftmax.cc + * \brief LSoftmax from + * \author luoyetx + */ +#include "./lsoftmax-inl.h" + +namespace mshadow { + +template +inline void LSoftmaxForward(const Tensor &x, + const Tensor &w, + const Tensor &label, + const Tensor &out, + const Tensor &x_norm, + const Tensor &w_norm, + const Tensor &k_table, + const Tensor &c_table, + const int margin, + const DType beta) { + LOG(FATAL) << "Not Implemented."; +} + +template +inline void LSoftmaxBackward(const Tensor &x, + const Tensor &w, + const Tensor &label, + const Tensor &x_norm, + const Tensor &w_norm, + const Tensor &o_grad, + const Tensor &x_grad, + const Tensor &w_grad, + const Tensor &workspace, + const Tensor &k_table, + const Tensor &c_table, + const int margin, + const DType beta) { + LOG(FATAL) << "Not Implemented."; +} + +} // namespace mshadow + +namespace mxnet { +namespace op { + +template<> +Operator *CreateOp(LSoftmaxParam param, int dtype) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new LSoftmaxOp(param); + }) + return op; +} + +Operator *LSoftmaxProp::CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const { + std::vector out_shape, aux_shape; + std::vector out_type, aux_type; + CHECK(InferType(in_type, &out_type, &aux_type)); + CHECK(InferShape(in_shape, &out_shape, &aux_shape)); + DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0)); +} + +DMLC_REGISTER_PARAMETER(LSoftmaxParam); + +MXNET_REGISTER_OP_PROPERTY(LSoftmax, LSoftmaxProp) +.describe("LSoftmax from ") +.add_argument("data", "Symbol", "data") +.add_argument("weight", "Symbol", "weight") +.add_argument("label", "Symbol", "label") +.add_arguments(LSoftmaxParam::__FIELDS__()); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/lsoftmax.cu b/src/operator/lsoftmax.cu new file mode 100644 index 0000000..6055c1b --- /dev/null +++ b/src/operator/lsoftmax.cu @@ -0,0 +1,322 @@ +/*! + * Copyright (c) 2016 by Contributors + * \file lsoftmax.cu + * \brief LSoftmax from + * \author luoyetx + */ +#include "./lsoftmax-inl.h" + +namespace mshadow { +namespace cuda { + +namespace { +// workspace variables +enum LSoftmaxTempSpaceType {kCost, kCosmt, kK, kSin2t, kFo, kCostM}; +} + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +MSHADOW_XINLINE int LSPowOfMO(const int k) { + return 1 - ((k&0x01) << 1); +} + +template +__global__ void LSCalcNorm(const Tensor x, + Tensor x_norm) { + const int n = x.size(0); + const int m = x.size(1); + CUDA_KERNEL_LOOP(i, n) { + DType norm = 0; + for (int j = 0; j < m; ++j) { + norm += x[i][j] * x[i][j]; + } + x_norm[i] = sqrt(norm); + } +} + +template +__device__ int LSFindK(const DType *k_table, const int n, const DType cos_t) { + const DType eps = 1e-5; + for (int i = 0; i < n; ++i) { + if (((k_table[i+1] < cos_t) || (abs(k_table[i+1] - cos_t) < eps)) && + ((k_table[i] > cos_t) || (abs(k_table[i] - cos_t) < eps))) { + return i; + } + } + return 0; +} + +template +__device__ DType LSCalcCosmt(const DType *c_table, const int n, + const DType cos_t, const int margin) { + const DType sin2_t = 1 - cos_t * cos_t; + DType cos_t_p = pow(cos_t, margin); + DType sin2_t_p = 1; + DType cos_mt = cos_t_p; // p = 0 + for (int p = 1; p <= margin / 2; ++p) { + cos_t_p /= cos_t * cos_t; // don't replace `cos_t*cos_t` with `1-sin2_t`, this can cause numeric issue if cos_t --> 0 + sin2_t_p *= sin2_t; + cos_mt += LSPowOfMO(p) * c_table[2*p] * cos_t_p * sin2_t_p; + } + return cos_mt; +} + +template +__global__ void LSoftmaxForwardKernel(const Tensor x, + const Tensor w, + const Tensor label, + const Tensor x_norm, + const Tensor w_norm, + Tensor out, + const Tensor k_table, + const Tensor c_table, + const int margin, + const DType beta) { + const int n = x.size(0); + const int feature_dim = x.size(1); + const int m = w.size(0); + CUDA_KERNEL_LOOP(i, n) { + const int yi = static_cast(label[i]); + const DType fo_i_yi = out[i][yi]; + const DType cos_t = fo_i_yi / (x_norm[i] * w_norm[yi]); + const int k = LSFindK(k_table.dptr_, k_table.size(0), cos_t); + const DType cos_mt = LSCalcCosmt(c_table.dptr_, c_table.size(0), cos_t, margin); + const DType f_i_yi = (LSPowOfMO(k) * cos_mt - 2*k) * (w_norm[yi] * x_norm[i]); + out[i][yi] = (f_i_yi + beta * fo_i_yi) / (1 + beta); + } +} + +template +inline void LSoftmaxForward(const Tensor &x, + const Tensor &w, + const Tensor &label, + const Tensor &out, + const Tensor &x_norm, + const Tensor &w_norm, + const Tensor &k_table, + const Tensor &c_table, + const int margin, + const DType beta) { + const int n = x.size(0); + const int m = w.size(0); + dim3 dimBlock(kBaseThreadNum); + dim3 dimGrid((n + kBaseThreadNum - 1) / kBaseThreadNum); + LSCalcNorm<<>>(x, x_norm); + dimGrid.x = ((m + kBaseThreadNum - 1) / kBaseThreadNum); + LSCalcNorm<<>>(w, w_norm); + dimGrid.x = ((n + kBaseThreadNum - 1) / kBaseThreadNum); + LSoftmaxForwardKernel<<>>(x, w, label, x_norm, w_norm, out, k_table, c_table, margin, beta); +} + +template +__global__ void LSoftmaxBackwardRequired(const Tensor x, + const Tensor w, + const Tensor label, + const Tensor x_norm, + const Tensor w_norm, + Tensor workspace, + const Tensor k_table, + const Tensor c_table, + const int margin) { + const int n = x.size(0); + const int feature_dim = x.size(1); + CUDA_KERNEL_LOOP(i, n) { + const int yi = static_cast(label[i]); + // fo_i_yi = dot(w_yi, x_i) + DType fo_i_yi = 0; + for (int p = 0; p < feature_dim; ++p) { + fo_i_yi += w[yi][p] * x[i][p]; + } + const DType cos_t = fo_i_yi / (x_norm[i] * w_norm[yi]); + const int k = LSFindK(k_table.dptr_, k_table.size(0), cos_t); + const DType cos_mt = LSCalcCosmt(c_table.dptr_, c_table.size(0), cos_t, margin); + const DType sin2_t = 1 - cos_t * cos_t; + workspace[kCost][i] = cos_t; + workspace[kCosmt][i] = cos_mt; + workspace[kK][i] = static_cast(k); + workspace[kSin2t][i] = sin2_t; + workspace[kFo][i] = fo_i_yi; + workspace[kCostM][i] = pow(cos_t, margin - 1); + } +} + +template +__global__ void LSoftmaxBackwardXKernel(const Tensor x, + const Tensor w, + const Tensor label, + const Tensor x_norm, + const Tensor w_norm, + const Tensor o_grad, + Tensor x_grad, + const Tensor workspace, + const Tensor c_table, + const int margin, + const DType beta) { + const int nthreads = x.size(0) * x.size(1); + const int feature_dim = x.size(1); + CUDA_KERNEL_LOOP(idx, nthreads) { + const int i = idx / feature_dim; + const int l = idx % feature_dim; + const int yi = static_cast(label[i]); + const DType cos_t = workspace[kCost][i]; + const DType cos_mt = workspace[kCosmt][i]; + const int k = static_cast(workspace[kK][i]); + const DType sin2_t = workspace[kSin2t][i]; + const DType fo_i_yi = workspace[kFo][i]; + const DType w_norm_yi = w_norm[yi]; + const DType x_norm_i = x_norm[i]; + + const DType dcos_dx = w[yi][l] / (w_norm_yi * x_norm_i) - \ + fo_i_yi * x[i][l] / (w_norm_yi * x_norm_i * x_norm_i * x_norm_i); + const DType dsin2_dx = -2 * cos_t * dcos_dx; + DType cos_t_p = workspace[kCostM][i]; + DType sin2_t_p = 1; + DType dcosm_dx = margin * cos_t_p * dcos_dx; // p = 0 + for (int p = 1; p <= margin / 2; ++p) { + cos_t_p /= cos_t * cos_t; + dcosm_dx += LSPowOfMO(p) * c_table[2*p] * (p * cos_t * dsin2_dx + \ + (margin - 2*p) * sin2_t * dcos_dx) * cos_t_p * sin2_t_p; + sin2_t_p *= sin2_t; + } + const DType df_dx = (LSPowOfMO(k) * cos_mt - 2*k) * w_norm_yi / x_norm_i * x[i][l] + \ + LSPowOfMO(k) * w_norm_yi * x_norm_i * dcosm_dx; + const DType alpha = 1 / (1 + beta); + x_grad[i][l] += alpha * o_grad[i][yi] * (df_dx - w[yi][l]); + } +} + +template +__global__ void LSoftmaxBackwardWKernel(const Tensor x, + const Tensor w, + const Tensor label, + const Tensor x_norm, + const Tensor w_norm, + const Tensor o_grad, + Tensor w_grad, + const Tensor workspace, + const Tensor c_table, + const int margin, + const DType beta) { + const int nthreads = w.size(0) * w.size(1); + const int n = x.size(0); + const int feature_dim = w.size(1); + CUDA_KERNEL_LOOP(idx, nthreads) { + const int j = idx / feature_dim; + const int l = idx % feature_dim; + DType dw = 0; + for (int i = 0; i < n; ++i) { + const int yi = static_cast(label[i]); + if (yi == j) { + const DType cos_t = workspace[kCost][i]; + const DType cos_mt = workspace[kCosmt][i]; + const int k = static_cast(workspace[kK][i]); + const DType sin2_t = workspace[kSin2t][i]; + const DType fo_i_yi = workspace[kFo][i]; + const DType x_norm_i = x_norm[i]; + const DType w_norm_yi = w_norm[yi]; + + const DType dcos_dw = x[i][l] / (w_norm_yi * x_norm_i) - \ + fo_i_yi * w[yi][l] / (x_norm_i * w_norm_yi * w_norm_yi * w_norm_yi); + const DType dsin2_dw = -2 * cos_t * dcos_dw; + DType cos_t_p = workspace[kCostM][i]; + DType sin2_t_p = 1; + DType dcosm_dw = margin * cos_t_p * dcos_dw; // p = 0 + for (int p = 1; p <= margin / 2; ++p) { + cos_t_p /= cos_t * cos_t; + dcosm_dw += LSPowOfMO(p) * c_table[2*p] * (p * cos_t * dsin2_dw + \ + (margin - 2*p) * sin2_t * dcos_dw) * cos_t_p * sin2_t_p; + sin2_t_p *= sin2_t; + } + const DType df_dw_j = (LSPowOfMO(k) * cos_mt - 2*k) * x_norm_i / w_norm_yi * w[yi][l] + \ + LSPowOfMO(k) * w_norm_yi * x_norm_i * dcosm_dw; + dw += o_grad[i][yi] * (df_dw_j - x[i][l]); + } + } + const DType alpha = 1 / (1 + beta); + w_grad[j][l] += alpha * dw; + } +} + +template +inline void LSoftmaxBackward(const Tensor &x, + const Tensor &w, + const Tensor &label, + const Tensor &x_norm, + const Tensor &w_norm, + const Tensor &o_grad, + const Tensor &x_grad, + const Tensor &w_grad, + const Tensor &workspace, + const Tensor &k_table, + const Tensor &c_table, + const int margin, + const DType beta) { + const int n = x.size(0); + const int feature_dim = x.size(1); + const int m = w.size(0); + dim3 dimBlock(kBaseThreadNum); + dim3 dimGrid((n + kBaseThreadNum - 1) / kBaseThreadNum); + LSoftmaxBackwardRequired<<>>(x, w, label, x_norm, w_norm, workspace, + k_table, c_table, margin); + dimGrid.x = ((n * feature_dim + kBaseThreadNum - 1) / kBaseThreadNum); + LSoftmaxBackwardXKernel<<>>(x, w, label, x_norm, w_norm, o_grad, x_grad, workspace, + c_table, margin, beta); + dimGrid.x = ((m * feature_dim + kBaseThreadNum - 1) / kBaseThreadNum); + LSoftmaxBackwardWKernel<<>>(x, w, label, x_norm, w_norm, o_grad, w_grad, workspace, + c_table, margin, beta); +} + +} // namespace cuda + +template +inline void LSoftmaxForward(const Tensor &x, + const Tensor &w, + const Tensor &label, + const Tensor &out, + const Tensor &x_norm, + const Tensor &w_norm, + const Tensor &k_table, + const Tensor &c_table, + const int margin, + const DType beta) { + cuda::LSoftmaxForward(x, w, label, out, x_norm, w_norm, + k_table, c_table, margin, beta); +} + +template +inline void LSoftmaxBackward(const Tensor &x, + const Tensor &w, + const Tensor &label, + const Tensor &x_norm, + const Tensor &w_norm, + const Tensor &o_grad, + const Tensor &x_grad, + const Tensor &w_grad, + const Tensor &workspace, + const Tensor &k_table, + const Tensor &c_table, + const int margin, + const DType beta) { + cuda::LSoftmaxBackward(x, w, label, x_norm, w_norm, o_grad, x_grad, w_grad, workspace, + k_table, c_table, margin, beta); +} + +} // namespace mshadow + +namespace mxnet { +namespace op { + +template<> +Operator *CreateOp(LSoftmaxParam param, int dtype) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new LSoftmaxOp(param); + }) + return op; +} + +} // namespace op +} // namespace mxnet diff --git a/src/train.sh b/src/train.sh new file mode 100755 index 0000000..04bc2c0 --- /dev/null +++ b/src/train.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash +export MXNET_CPU_WORKER_NTHREADS=15 +export MXNET_CUDNN_AUTOTUNE_DEFAULT=0 +export MXNET_ENGINE_TYPE=ThreadedEnginePerDevice + +#export CUDA_VISIBLE_DEVICES='4,5' +#python -u train_softmax.py --retrain --pretrained '../model/sphereface-152-0-0' --load-epoch 8 --prefix '../model/sphereface-retrain-0' --loss-type 0 +export CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' +export CUDA_VISIBLE_DEVICES='0,1,2,3' +export CUDA_VISIBLE_DEVICES='4,5,6,7' +export CUDA_VISIBLE_DEVICES='4,5' +export CUDA_VISIBLE_DEVICES='0,1' +#python -u train_softmax.py --network 's60' --patch '16_0_96_112_0' --loss-type 1 > logs60_l1_v4 2>&1 & +#python -u train_softmax.py --network 's60' --patch '0_0_96_95_0' --loss-type 1 --prefix '../model/spherefacex' +#python -u train_softmax.py --network 's20' --patch '0_0_96_112_0' --loss-type 0 +#python -u train_softmax.py --network 'r50' --patch '0_0_96_112_0' --loss-type 0 +#python -u train_softmax.py --network 'm4' --patch '0_0_96_112_0' --loss-type 1 --prefix '../model/spherefacem' --per-batch-size 224 > celm.log 2>&1 & +#python -u train_softmax.py --network 'm29' --patch '0_0_96_112_0' --loss-type 0 --lr 0.1 --prefix '../model/softmax' --verbose 2000 --per-batch-size 128 > sx_m29.log 2>&1 & +#python -u train_softmax.py --network 'm29' --patch '0_0_96_112_0' --loss-type 1 --lr 0.1 --prefix '../model/sphere47' --verbose 2000 --per-batch-size 224 --beta-min 4.7 > sp_m29_47.log 2>&1 & +export CUDA_VISIBLE_DEVICES='2,3' +#python -u train_softmax.py --network 'm1' --patch '0_0_96_112_0' --loss-type 0 --lr 0.01 --prefix '../model/marginal0' --verbose 2000 +#python -u train_softmax.py --network 's60' --patch '0_0_96_95_0' --loss-type 1 +#python -u train_softmax.py --network 's20' --patch '0_0_96_95_0' --loss-type 1 +#python -u train_softmax.py --network 's60' --patch '0_0_96_112_0' --loss-type 1 --prefix '../model/spherefacec' > logs60_c 2>&1 & +#python -u train_marginal.py --patch '0_0_96_112_0' --network 's36' --verbose 1000 --lr 0.01 +#python -u train_coco.py --patch '0_0_96_112_0' --images-per-identity 32 +#python -u train_softmax.py --network 's36' --patch '0_0_96_112_0' --loss-type 1 --prefix '../model/spherefacei36' --per-batch-size 256 +#python -u train_softmax.py --network 's36' --patch '0_0_96_112_0' --loss-type 1 --prefix '../model/spherefacei36' --per-batch-size 256 > cel4.log 2>&1 & +#python -u train_softmax.py --network 'm28' --patch '0_0_96_112_0' --loss-type 11 --lr 0.1 --prefix '../model/L11' --verbose 500 --per-batch-size 128 --images-per-identity 4 +#python -u train_softmax.py --network 'm27' --patch '0_0_96_112_0' --loss-type 1 --lr 0.1 --prefix '../model/sphere' --verbose 2000 --per-batch-size 224 > sp_m27.log 2>&1 & +#python -u train_softmax.py --network 'm27' --patch '0_0_96_112_0' --loss-type 0 --lr 0.1 --prefix '../model/softmax' --verbose 2000 --per-batch-size 128 > sx_m27.log 2>&1 & +export CUDA_VISIBLE_DEVICES='4,5' +#python -u train_softmax.py --network 'm29' --patch '0_0_96_112_0' --loss-type 0 --lr 0.1 --prefix '../model/softmax' --verbose 2000 --per-batch-size 128 > sx_m29.log 2>&1 & +#python -u train_softmax.py --network 'm27' --patch '0_0_96_112_0' --loss-type 1 --lr 0.1 --prefix '../model/sphere' --verbose 2000 --per-batch-size 224 > sp_m27.log 2>&1 & +#python -u train_softmax.py --network 'm28' --patch '0_0_96_112_0' --loss-type 1 --lr 0.1 --prefix '../model/sphere' --verbose 2000 --per-batch-size 224 > sp_m28.log 2>&1 & +export CUDA_VISIBLE_DEVICES='6,7' +#python -u train_softmax.py --network 'm29' --patch '0_0_96_112_0' --loss-type 1 --lr 0.1 --prefix '../model/spherem' --verbose 2000 --per-batch-size 224 +#python -u train_softmax.py --network 'm28' --patch '0_0_96_112_0' --loss-type 0 --lr 0.1 --prefix '../model/softmax' --verbose 2000 --per-batch-size 128 > sx_m28.log 2>&1 & +#python -u train_marginal.py --patch '0_0_96_112_0' --network 'i4' --verbose 2000 --lr 0.01 +#python -u train_softmax.py --network 'i4' --patch '0_0_96_112_0' --loss-type 1 --gamma 0.06 --beta-min 4 +#python -u train_softmax.py --network 'x4' --patch '0_0_96_112_0' --loss-type 1 --gamma 0.09 +#python -u train_softmax.py --network 's60' --patch '0_0_80_95_0' --loss-type 1 > logs60_l1_v3 2>&1 & +#python -u train_softmax.py --network 's60' --patch '0_0_96_95_0' --loss-type 1 > logs60_l1_v2 2>&1 & +#python -u train_softmax.py --network 's20' --patch '0_0_96_112_0' +export CUDA_VISIBLE_DEVICES='4,5,6,7' +python -u train_softmax.py --network 'm29' --patch '0_0_96_112_0' --loss-type 0 --lr 0.1 --prefix '../model/softmax' --verbose 2000 --per-batch-size 128 > sx_m29.log 2>&1 & +#python -u train_softmax.py --network 's60' --patch '0_0_96_95_0' --loss-type 1 --gamma 0.06 --beta-freeze 5000 --prefix '../model/spherefacei' > cel2.log 2>&1 & +export CUDA_VISIBLE_DEVICES='0,1,2,3' +#python -u train_softmax.py --network 'm29' --patch '0_0_96_112_0' --loss-type 1 --lr 0.1 --prefix '../model/spherem' --verbose 2000 --per-batch-size 224 --lr-steps '60000,80000,90000' > spm_m29.log 2>&1 & +#python -u train_softmax.py --network 's60' --patch '0_15_96_112_0' --loss-type 1 --gamma 0.06 --beta-freeze 5000 --prefix '../model/spherefacei' > cel3.log 2>&1 & +export CUDA_VISIBLE_DEVICES='2' +#python -u train_marginal.py --patch '0_0_96_112_0' --network 's36' --verbose 2000 --lr 0.01 > mar_s36.log 2>&1 & +export CUDA_VISIBLE_DEVICES='3' +#python -u train_marginal.py --patch '0_0_96_112_0' --network 'i4' --verbose 2000 --lr 0.01 > mar_i4.log 2>&1 & +#python -u train_softmax.py --network 'i4' --patch '0_0_96_112_0' --loss-type 1 --gamma 0.06 --beta-freeze 5000 +#python -u train_softmax.py --network 'r50' --patch '0_0_96_112_0' --loss-type 1 --gamma 0.24 > logr50_l1 2>&1 & +#python -u train_softmax.py --network 'r50' --patch '0_0_96_112_0' --loss-type 2 --verbose 100 +#python -u train_softmax.py --network 'r50' --patch '0_0_96_95_0' > logr101_pu 2>&1 & +#python -u train_softmax.py --network 'r50' --patch '0_0_96_112_0' +#python -u train_softmax.py --network 'r101' --patch '0_0_96_95_0' +#python -u train_softmax.py --loss-type 1 --num-layers 64 --patch '0_0_96_112_0' +#python -u train_softmax.py --loss-type 1 --num-layers 36 --patch '0_0_96_95_0' +#python -u train_softmax.py --loss-type 1 --num-layers 20 --patch '0_0_80_95_0' + diff --git a/src/train_softmax.py b/src/train_softmax.py index 7f653bf..83cbe0a 100644 --- a/src/train_softmax.py +++ b/src/train_softmax.py @@ -17,7 +17,6 @@ from mxnet import ndarray as nd import argparse import mxnet.optimizer as optimizer sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common')) -import resnet_dcn import spherenet import marginalnet import inceptions @@ -26,7 +25,8 @@ import lfw import sklearn from sklearn.decomposition import PCA #from center_loss import * -import asoftmax +#import resnet_dcn +#import asoftmax logger = logging.getLogger() @@ -105,6 +105,8 @@ def parse_args(): help='') parser.add_argument('--loss-type', type=int, default=1, help='') + parser.add_argument('--incay', action='store_true', default=False, + help='feature incay') parser.add_argument('--use-deformable', type=int, default=0, help='') parser.add_argument('--patch', type=str, default='0_0_96_112_0', @@ -138,6 +140,7 @@ def get_symbol(args, arg_params, aux_params): _,_,embedding,_ = resnet_dcn.get_symbol(512, args.num_layers) gt_label = mx.symbol.Variable('softmax_label') assert args.loss_type>=0 + extra_loss = None if args.loss_type==0: _weight = mx.symbol.Variable('fc7_weight') _bias = mx.symbol.Variable('fc7_bias', lr_mult=2.0, wd_mult=0.0) @@ -206,12 +209,22 @@ def get_symbol(args, arg_params, aux_params): softmax = mx.symbol.SoftmaxOutput(data=fc7, label = gt_label, name='softmax', normalization='valid') else: softmax = mx.symbol.SoftmaxOutput(data=fc7, label = gt_label, name='softmax') + if args.loss_type<=1 and args.incay: + params = [1.e-10, 0.01] + sel = mx.symbol.argmax(data = fc7, axis=1) + sel = (sel==gt_label) + norm = embedding*embedding + norm = mx.symbol.sum(norm, axis=1) + norm += params[0] + feature_incay = sel/norm + feature_incay = mx.symbol.mean(feature_incay) * params[1] + extra_loss = mx.symbol.MakeLoss(feature_incay) #out = softmax #l2_embedding = mx.symbol.L2Normalization(embedding) #ce = mx.symbol.softmax_cross_entropy(fc7, gt_label, name='softmax_ce')/args.per_batch_size #out = mx.symbol.Group([mx.symbol.BlockGrad(embedding), softmax, mx.symbol.BlockGrad(ce)]) - if args.loss_type>=10 and extra_loss is not None: + if extra_loss is not None: out = mx.symbol.Group([mx.symbol.BlockGrad(embedding), softmax, extra_loss]) else: out = mx.symbol.Group([mx.symbol.BlockGrad(embedding), softmax]) @@ -277,7 +290,7 @@ def train_net(args): path_imglist = "/raid5data/dplearn/faces_normed/train.lst" args.num_classes = 82395 - args.use_val = True + args.use_val = False val_path = "/raid5data/dplearn/faces_normed/val.lst" path_imgrec = "/opt/jiaguo/faces_normed/train.rec" val_rec = "/opt/jiaguo/faces_normed/val.rec" @@ -291,7 +304,7 @@ def train_net(args): data_shape = (args.image_channel,112,96) mean = [127.5,127.5,127.5] - if args.use_val and args.loss_type<=1: + if args.use_val: val_dataiter = FaceImageIter2( batch_size = args.batch_size, data_shape = data_shape, @@ -349,7 +362,7 @@ def train_net(args): ) - if args.loss_type<=1: + if args.loss_type<=9: train_dataiter = FaceImageIter2( batch_size = args.batch_size, data_shape = data_shape, @@ -570,7 +583,8 @@ def train_net(args): global_step = [0] save_step = [0] if len(args.lr_steps)==0: - lr_steps = [40000, 70000, 90000] + #lr_steps = [40000, 70000, 90000] + lr_steps = [30000, 50000, 70000, 90000] if args.loss_type==1: lr_steps = [70000, 100000] else: @@ -595,16 +609,21 @@ def train_net(args): acc, embeddings_list = lfw_test(mbatch) save_step[0]+=1 msave = save_step[0] + do_save = False if acc>=highest_acc[0]: highest_acc[0] = acc - if acc>=0.992: - print('saving', msave) - arg, aux = model.get_params() - mx.model.save_checkpoint(prefix, msave, model.symbol, arg, aux) - lfw_npy = "%s-lfw-%04d" % (prefix, msave) - X = np.concatenate(embeddings_list, axis=0) - print(X.shape) - np.save(lfw_npy, X) + if acc>=0.995: + do_save = True + if mbatch>lr_steps[-1] and msave%5==0: + do_save = True + if do_save: + print('saving', msave) + arg, aux = model.get_params() + mx.model.save_checkpoint(prefix, msave, model.symbol, arg, aux) + #lfw_npy = "%s-lfw-%04d" % (prefix, msave) + #X = np.concatenate(embeddings_list, axis=0) + #print(X.shape) + #np.save(lfw_npy, X) print('[%d]Accuracy-Highest: %1.5f'%(mbatch, highest_acc[0])) if mbatch<=args.beta_freeze: _beta = args.beta