From f5c678e4ea301f07c336093cda5252d408e979da Mon Sep 17 00:00:00 2001
From: Jia Guo <guojia@gmail.com>
Date: Thu, 16 Nov 2017 14:29:48 +0800
Subject: [PATCH] tiny

---
 src/data.py                 |   6 +
 src/operator/lsoftmax-inl.h | 379 ++++++++++++++++++++++++++++++++++++
 src/operator/lsoftmax.cc    |  75 +++++++
 src/operator/lsoftmax.cu    | 322 ++++++++++++++++++++++++++++++
 src/train.sh                |  64 ++++++
 src/train_softmax.py        |  49 +++--
 6 files changed, 880 insertions(+), 15 deletions(-)
 create mode 100644 src/operator/lsoftmax-inl.h
 create mode 100644 src/operator/lsoftmax.cc
 create mode 100644 src/operator/lsoftmax.cu
 create mode 100755 src/train.sh
diff --git a/src/data.py b/src/data.py
index ba1077b..92c3a74 100644
--- a/src/data.py
+++ b/src/data.py
@@ -489,6 +489,12 @@ class FaceImageIter2(io.DataIter):
           else:
             label, fname, bbox, landmark = self.imglist[idx]
             return label, self.read_image(fname), bbox, landmark
+        else:
+            s = self.imgrec.read()
+            if s is None:
+                raise StopIteration
+            header, img = recordio.unpack(s)
+            return header.label, img, None, None
 
     def brightness_aug(self, src, x):
       alpha = 1.0 + random.uniform(-x, x)
diff --git a/src/operator/lsoftmax-inl.h b/src/operator/lsoftmax-inl.h
new file mode 100644
index 0000000..33d51bf
--- /dev/null
+++ b/src/operator/lsoftmax-inl.h
@@ -0,0 +1,379 @@
+/*!
+ * Copyright (c) 2016 by Contributors
+ * \file lsoftmax-inl.h
+ * \brief LSoftmax from <Large-Margin Softmax Loss for Convolutional Neural Networks>
+ * \author luoyetx
+ */
+#ifndef MXNET_OPERATOR_LSOFTMAX_INL_H_
+#define MXNET_OPERATOR_LSOFTMAX_INL_H_
+
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
+#include <mxnet/operator.h>
+#include <cmath>
+#include <map>
+#include <vector>
+#include <string>
+#include "./operator_common.h"
+
+namespace mxnet {
+namespace op {
+
+namespace lsoftmax_enum {  // positional indices into the operator's blob / resource vectors
+enum LSoftmaxOpInputs {kData, kWeight, kLabel};  // order of in_data (and in_grad) entries
+enum LSoftmaxOpOutputs {kOut, kDataNorm, kWeightNorm};  // order of out_data entries
+enum LSoftmaxResource {kTempSpace};  // index into ctx.requested (backward workspace)
+}
+
+struct LSoftmaxParam : public dmlc::Parameter<LSoftmaxParam> {  // user-settable op parameters
+  int margin;      // angular margin; the lookup tables hold margin + 1 entries
+  float beta;      // blend weight: label logit = (f + beta * fo) / (1 + beta)
+  float beta_min;  // floor applied to beta whenever it is decayed
+  float scale;     // beta *= scale after every Backward call (unless $BETA is set)
+  int num_hidden;  // number of rows of the weight matrix, i.e. output width
+  bool grad_norm;  // NOTE(review): only referenced from commented-out code in this header
+  int verbose;     // 0 = silent; otherwise Backward logs every `verbose` calls
+  float eps;       // starting value of the squared-sum accumulator in GradNorm
+  DMLC_DECLARE_PARAMETER(LSoftmaxParam) {
+    DMLC_DECLARE_FIELD(margin).set_default(2).set_lower_bound(1)
+    .describe("LSoftmax margin");
+    DMLC_DECLARE_FIELD(beta).set_default(1).set_lower_bound(0)
+    .describe("LSoftmax beta, same as lambda to weight original value");
+    DMLC_DECLARE_FIELD(beta_min).set_default(0).set_lower_bound(0)
+    .describe("Minimum beta");
+    DMLC_DECLARE_FIELD(scale).set_default(1).set_range(0, 1)
+    .describe("Scale of beta during training for every iteration");
+    DMLC_DECLARE_FIELD(num_hidden).set_lower_bound(1)
+    .describe("Number of hidden nodes of the output");
+    DMLC_DECLARE_FIELD(grad_norm).set_default(false)
+    .describe("do grad norm");
+    DMLC_DECLARE_FIELD(verbose).set_default(0)
+    .describe("Log for beta change");
+    DMLC_DECLARE_FIELD(eps).set_default(1e-10f)
+    .describe("l2 eps");
+  }
+};
+
+template<typename xpu, typename DType>
+class LSoftmaxOp : public Operator {
+ public:
+  explicit LSoftmaxOp(LSoftmaxParam param) {
+    // Fills the k/c lookup tables and resolves the starting beta: $BETA wins, otherwise
+    // $GLOBAL_STEP decays the configured beta (floored at beta_min), otherwise it is kept.
+    this->param_ = param;
+    count_ = 0;
+    // k_table_[i] = cos(i * pi / margin), c_table_[i] = C(margin, i); entry 0 is 1 in both
+    k_table_.clear();
+    k_table_.push_back(1);
+    c_table_.clear();
+    c_table_.push_back(1);
+    const int margin = param.margin;
+    const double pi = std::atan(1) * 4;
+    double binom = 1;
+    for (int step = 1; step <= margin; ++step) {
+      binom = binom * (margin - step + 1) / step;
+      k_table_.push_back(std::cos(step * pi / margin));
+      c_table_.push_back(binom);
+    }
+    // environment overrides applied at construction; verbose only gates the log lines
+    const bool chatty = param_.verbose != 0;
+    const char *beta_env = std::getenv("BETA");
+    const char *step_env = std::getenv("GLOBAL_STEP");
+    if (beta_env) {
+      const float env_beta = std::atof(beta_env);
+      if (chatty) LOG(INFO)<<"beta:"<<env_beta;
+      param_.beta = env_beta;
+    } else if (step_env) {
+      const int nbatch = std::atoi(step_env);
+      if (chatty) LOG(INFO)<<"nbatch:"<<nbatch;
+      const float decayed = param.beta * std::pow(double(param.scale), double(nbatch));
+      param_.beta = std::max(decayed, param_.beta_min);
+    }
+    if (chatty) {
+      LOG(INFO)<<param_.margin<<","<<param_.beta<<","<<param_.beta_min<<","<<param_.scale;
+    }
+  }
+
+  virtual void Forward(const OpContext &ctx,  // out = x * w^T, then margin step if training
+                       const std::vector<TBlob> &in_data,
+                       const std::vector<OpReqType> &req,
+                       const std::vector<TBlob> &out_data,
+                       const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(in_data.size(), 3);
+    CHECK_EQ(out_data.size(), 3);
+    CHECK_EQ(req.size(), 3);
+    CHECK_EQ(req[lsoftmax_enum::kOut], kWriteTo);  // the main output is always overwritten
+    CHECK(req[lsoftmax_enum::kDataNorm] == kNullOp ||
+          req[lsoftmax_enum::kDataNorm] == kWriteTo);
+    CHECK(req[lsoftmax_enum::kWeightNorm] == kNullOp ||
+          req[lsoftmax_enum::kWeightNorm] == kWriteTo);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    const int n = in_data[lsoftmax_enum::kData].size(0);  // rows of data
+    const int m = in_data[lsoftmax_enum::kWeight].size(0);  // rows of weight
+    Tensor<xpu, 2, DType> x = in_data[lsoftmax_enum::kData].FlatTo2D<xpu, DType>(s);
+    Tensor<xpu, 2, DType> w = in_data[lsoftmax_enum::kWeight].FlatTo2D<xpu, DType>(s);
+    Tensor<xpu, 1, DType> label = in_data[lsoftmax_enum::kLabel].get_with_shape<xpu, 1, DType>(Shape1(n), s);
+    Tensor<xpu, 2, DType> out = out_data[lsoftmax_enum::kOut].FlatTo2D<xpu, DType>(s);
+    Tensor<xpu, 1, DType> x_norm = out_data[lsoftmax_enum::kDataNorm].get_with_shape<xpu, 1, DType>(Shape1(n), s);
+    Tensor<xpu, 1, DType> w_norm = out_data[lsoftmax_enum::kWeightNorm].get_with_shape<xpu, 1, DType>(Shape1(m), s);
+#if defined(__CUDACC__)
+    CHECK_EQ(s->blas_handle_ownership_, Stream<xpu>::OwnHandle)
+        << "Must init CuBLAS handle in stream";
+#endif
+    // original fully connected
+    out = dot(x, w.T());
+    if (ctx.is_train) {  // outside training the plain product above is the final output
+      // large margin fully connected
+      const int margin = param_.margin;
+      if(const char* env_p = std::getenv("BETA")) {  // $BETA, when set, overrides beta each call
+        float _beta = std::atof(env_p);
+        param_.beta = _beta;
+      }
+      const DType beta = static_cast<DType>(param_.beta);
+      //LOG(INFO)<<"beta:"<<beta<<std::endl;
+      Tensor<cpu, 1, DType> k_table_cpu(k_table_.data(), Shape1(k_table_.size()));  // host views
+      Tensor<cpu, 1, DType> c_table_cpu(c_table_.data(), Shape1(c_table_.size()));
+      Tensor<xpu, 1, DType> k_table_xpu(Shape1(k_table_.size()));  // per-call device copies
+      Tensor<xpu, 1, DType> c_table_xpu(Shape1(c_table_.size()));
+      k_table_xpu.set_stream(s);
+      c_table_xpu.set_stream(s);
+      AllocSpace(&k_table_xpu);
+      AllocSpace(&c_table_xpu);
+      Copy(k_table_xpu, k_table_cpu, s);
+      Copy(c_table_xpu, c_table_cpu, s);
+      LSoftmaxForward(x, w, label, out, x_norm, w_norm, k_table_xpu, c_table_xpu, margin, beta);
+      FreeSpace(&k_table_xpu);  // released right after the call that consumes them
+      FreeSpace(&c_table_xpu);
+    }
+  }
+
+  //virtual void GradNorm(mshadow::Tensor<xpu, 2, DType> grad, mshadow::Stream<xpu>* s) {
+  //  using namespace mshadow;
+  //  using namespace mshadow::expr;
+  //  Tensor<cpu, 2, DType> grad_cpu(grad.shape_);
+  //  AllocSpace(&grad_cpu);
+  //  Copy(grad_cpu, grad, s);
+  //  DType grad_norm = param_.eps;
+  //  for(uint32_t i=0;i<grad_cpu.shape_[0];i++) {
+  //    for(uint32_t j=0;j<grad_cpu.shape_[1];j++) {
+  //      grad_norm += grad_cpu[i][j]*grad_cpu[i][j];
+  //    }
+  //  }
+  //  grad_norm = sqrt(grad_norm);
+  //  grad_cpu /= grad_norm;
+  //  Copy(grad, grad_cpu, s);
+  //  FreeSpace(&grad_cpu);
+  //}
+
+  // Copies `grad` into a temporary host tensor and returns sqrt(eps + sum of squared entries).
+  virtual DType GradNorm(mshadow::Tensor<xpu, 2, DType> grad, mshadow::Stream<xpu>* s) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    Tensor<cpu, 2, DType> host(grad.shape_);
+    AllocSpace(&host);
+    Copy(host, grad, s);
+    const index_t rows = host.shape_[0];
+    const index_t cols = host.shape_[1];
+    DType sq_sum = param_.eps;
+    for (uint32_t r = 0; r < rows; ++r) {
+      for (uint32_t c = 0; c < cols; ++c) {
+        sq_sum += host[r][c] * host[r][c];
+      }
+    }
+    FreeSpace(&host);  // the host copy is only needed for the reduction above
+    return sqrt(sq_sum);
+  }
+
+  // Computes the plain fully-connected gradients, then lets LSoftmaxBackward add the margin
+  // correction on top of them. Also advances the call counter (used for periodic logging)
+  // and decays beta towards beta_min unless the BETA environment variable is set.
+  virtual void Backward(const OpContext &ctx, const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data, const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req, const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    using namespace mshadow::expr;
+    CHECK_EQ(out_grad.size(), 1);
+    CHECK_EQ(in_data.size(), 3);
+    CHECK_EQ(out_data.size(), 3);
+    CHECK_GE(in_grad.size(), 2);
+    CHECK_GE(req.size(), 2);
+    CHECK_EQ(req[lsoftmax_enum::kData], kWriteTo);
+    CHECK_EQ(req[lsoftmax_enum::kWeight], kWriteTo);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    const int n = in_data[lsoftmax_enum::kData].size(0);
+    const int m = in_data[lsoftmax_enum::kWeight].size(0);
+    Tensor<xpu, 2, DType> x = in_data[lsoftmax_enum::kData].FlatTo2D<xpu, DType>(s);
+    Tensor<xpu, 2, DType> w = in_data[lsoftmax_enum::kWeight].FlatTo2D<xpu, DType>(s);
+    Tensor<xpu, 1, DType> label = in_data[lsoftmax_enum::kLabel].get_with_shape<xpu, 1, DType>(Shape1(n), s);
+    Tensor<xpu, 1, DType> x_norm = out_data[lsoftmax_enum::kDataNorm].get_with_shape<xpu, 1, DType>(Shape1(n), s);
+    Tensor<xpu, 1, DType> w_norm = out_data[lsoftmax_enum::kWeightNorm].get_with_shape<xpu, 1, DType>(Shape1(m), s);
+    Tensor<xpu, 2, DType> o_grad = out_grad[lsoftmax_enum::kOut].FlatTo2D<xpu, DType>(s);
+    Tensor<xpu, 2, DType> x_grad = in_grad[lsoftmax_enum::kData].FlatTo2D<xpu, DType>(s);
+    Tensor<xpu, 2, DType> w_grad = in_grad[lsoftmax_enum::kWeight].FlatTo2D<xpu, DType>(s);
+    // workspace is used for cos_t, cos_mt, k, sin2_t, fo and cos_t_m for every data point
+    Tensor<xpu, 2, DType> workspace = ctx.requested[lsoftmax_enum::kTempSpace].get_space_typed<xpu, 2, DType>(Shape2(6, n), s);
+#if defined(__CUDACC__)
+    CHECK_EQ(s->blas_handle_ownership_, Stream<xpu>::OwnHandle)
+        << "Must init CuBLAS handle in stream";
+#endif
+    // original fully connected
+    x_grad = dot(o_grad, w);
+    w_grad = dot(o_grad.T(), x);
+    // large margin fully connected
+    const int margin = param_.margin;
+    const DType beta = static_cast<DType>(param_.beta);
+    count_+=1;
+    if (param_.verbose) {
+      if(count_%param_.verbose==0) {
+        LOG(INFO)<<"["<<count_<<"]current beta:"<<beta;
+        const DType x_grad_norm = GradNorm(x_grad, s);  // not named `n`: that is the batch size
+        LOG(INFO)<<"x_grad norm:"<<x_grad_norm;
+        const DType w_grad_norm = GradNorm(w_grad, s);
+        LOG(INFO)<<"w_grad norm:"<<w_grad_norm;
+      }
+    }
+    Tensor<cpu, 1, DType> k_table_cpu(k_table_.data(), Shape1(k_table_.size()));
+    Tensor<cpu, 1, DType> c_table_cpu(c_table_.data(), Shape1(c_table_.size()));
+    Tensor<xpu, 1, DType> k_table_xpu(Shape1(k_table_.size()));
+    Tensor<xpu, 1, DType> c_table_xpu(Shape1(c_table_.size()));
+    k_table_xpu.set_stream(s);
+    c_table_xpu.set_stream(s);
+    AllocSpace(&k_table_xpu);
+    AllocSpace(&c_table_xpu);
+    Copy(k_table_xpu, k_table_cpu, s);
+    Copy(c_table_xpu, c_table_cpu, s);
+    LSoftmaxBackward(x, w, label, x_norm, w_norm, o_grad, x_grad, w_grad, workspace,
+                     k_table_xpu, c_table_xpu, margin, beta);
+    FreeSpace(&k_table_xpu);
+    FreeSpace(&c_table_xpu);
+    //if(param_.grad_norm) {
+    //  GradNorm(x_grad, s);
+    //  GradNorm(w_grad, s);
+    //}
+    // dirty hack, should also work for multi device
+    if(std::getenv("BETA")==NULL) {
+      param_.beta *= param_.scale;
+      param_.beta = std::max(param_.beta, param_.beta_min);
+    }
+    //LOG(INFO)<<"w_grad:"<<w_grad.shape_[0]<<","<<w_grad.shape_[1];
+    //norm = reduce_with_axis<red::sum, false>(F<mxnet::op::mshadow_op::square>(w_grad), 2);
+    //norm = F<mxnet::op::mshadow_op::square_root>(norm + param_.eps);
+    //out = data / broadcast_with_axis(norm, 1, dshape[2]);
+    //if (param_.beta < next_beta_) {
+    //  next_beta_ *= 0.1f;
+    //  if (param_.verbose) {
+    //    LOG(INFO) << "LSoftmax changes beta to " << param_.beta;
+    //  }
+    //}
+  }
+
+  //Tensor<xpu, 2, DType> grad_norm(const Tensor<xpu, 2, DType> grad) {
+  //}
+
+
+
+
+ private:
+  LSoftmaxParam param_;
+  // global lookup table
+  std::vector<DType> k_table_;
+  std::vector<DType> c_table_;
+  //float next_beta_;
+  uint32_t count_;
+};  // class LSoftmaxOp
+
+template<typename xpu>
+Operator *CreateOp(LSoftmaxParam param, int dtype);
+
+#if DMLC_USE_CXX11
+// Symbol-level description of LSoftmax: argument/output names, shapes, backward dependencies.
+class LSoftmaxProp : public OperatorProperty {
+ public:
+  void Init(const std::vector<std::pair<std::string, std::string> > &kwargs) override {
+    param_.Init(kwargs);
+  }
+
+  std::map<std::string, std::string> GetParams() const override {
+    return param_.__DICT__();
+  }
+
+  std::vector<std::string> ListArguments() const override {
+    return {"data", "weight", "label"};  // same order as lsoftmax_enum::LSoftmaxOpInputs
+  }
+
+  std::vector<std::string> ListOutputs() const override {
+    return {"output", "data_norm", "weight_norm"};  // same order as LSoftmaxOpOutputs
+  }
+
+  int NumOutputs() const override {
+    return 3;
+  }
+
+  int NumVisibleOutputs() const override {
+    return 1;  // fewer than NumOutputs(): the two norm outputs are not counted as visible
+  }
+
+  bool InferShape(std::vector<TShape> *in_shape, std::vector<TShape> *out_shape,
+                  std::vector<TShape> *aux_shape) const override {
+    using namespace mshadow;
+    CHECK_EQ(in_shape->size(), 3) << "Input:[data, weight, label]";
+    const TShape &dshape = in_shape->at(lsoftmax_enum::kData);
+    const TShape &lshape = in_shape->at(lsoftmax_enum::kLabel);
+    CHECK_EQ(dshape.ndim(), 2) << "data shape should be (batch_size, feature_dim)";
+    CHECK_EQ(lshape.ndim(), 1) << "label shape should be (batch_size,)";
+    CHECK_EQ(lshape[0], dshape[0]) << "label and data must have the same batch_size";
+    const int n = dshape[0];
+    const int feature_dim = dshape[1];
+    const int m = param_.num_hidden;
+    SHAPE_ASSIGN_CHECK(*in_shape, lsoftmax_enum::kWeight, Shape2(m, feature_dim));
+    out_shape->clear();
+    out_shape->push_back(Shape2(n, m));  // output
+    out_shape->push_back(Shape1(n));  // data norm
+    out_shape->push_back(Shape1(m));  // weight norm
+    aux_shape->clear();
+    return true;
+  }
+
+  std::vector<ResourceRequest> BackwardResource(
+      const std::vector<TShape> &in_shape) const override {
+    return {ResourceRequest::kTempSpace};  // Backward carves a (6, n) workspace out of this
+  }
+
+  std::vector<int> DeclareBackwardDependency(const std::vector<int> &out_grad,
+                                             const std::vector<int> &in_data,
+                                             const std::vector<int> &out_data) const override {
+    return {out_grad[lsoftmax_enum::kOut], out_data[lsoftmax_enum::kDataNorm],
+            out_data[lsoftmax_enum::kWeightNorm], in_data[lsoftmax_enum::kData],
+            in_data[lsoftmax_enum::kWeight], in_data[lsoftmax_enum::kLabel]};
+  }
+
+  std::string TypeString() const override {
+    return "LSoftmax";
+  }
+
+  OperatorProperty *Copy() const override {
+    auto ptr = new LSoftmaxProp();
+    ptr->param_ = param_;
+    return ptr;
+  }
+
+  Operator *CreateOperator(Context ctx) const override {
+    LOG(FATAL) << "Not Implemented.";  // creation goes through CreateOperatorEx instead
+    return NULL;
+  }
+
+  Operator *CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                             std::vector<int> *in_type) const override;
+
+ private:
+  LSoftmaxParam param_;
+};  // class LSoftmaxProp
+#endif  // DMLC_USE_CXX11
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_OPERATOR_LSOFTMAX_INL_H_
diff --git a/src/operator/lsoftmax.cc b/src/operator/lsoftmax.cc
new file mode 100644
index 0000000..cbf708b
--- /dev/null
+++ b/src/operator/lsoftmax.cc
@@ -0,0 +1,75 @@
+/*!
+ * Copyright (c) 2016 by Contributors
+ * \file lsoftmax.cc
+ * \brief LSoftmax from <Large-Margin Softmax Loss for Convolutional Neural Networks>
+ * \author luoyetx
+ */
+#include "./lsoftmax-inl.h"
+
+namespace mshadow {
+
+template <typename DType>  // CPU overload: a stub that aborts with LOG(FATAL) when called
+inline void LSoftmaxForward(const Tensor<cpu, 2, DType> &x,
+                            const Tensor<cpu, 2, DType> &w,
+                            const Tensor<cpu, 1, DType> &label,
+                            const Tensor<cpu, 2, DType> &out,
+                            const Tensor<cpu, 1, DType> &x_norm,
+                            const Tensor<cpu, 1, DType> &w_norm,
+                            const Tensor<cpu, 1, DType> &k_table,
+                            const Tensor<cpu, 1, DType> &c_table,
+                            const int margin,
+                            const DType beta) {
+  LOG(FATAL) << "Not Implemented.";  // reached from LSoftmaxOp<cpu>::Forward when training
+}
+
+template <typename DType>  // CPU overload: a stub that aborts with LOG(FATAL) when called
+inline void LSoftmaxBackward(const Tensor<cpu, 2, DType> &x,
+                             const Tensor<cpu, 2, DType> &w,
+                             const Tensor<cpu, 1, DType> &label,
+                             const Tensor<cpu, 1, DType> &x_norm,
+                             const Tensor<cpu, 1, DType> &w_norm,
+                             const Tensor<cpu, 2, DType> &o_grad,
+                             const Tensor<cpu, 2, DType> &x_grad,
+                             const Tensor<cpu, 2, DType> &w_grad,
+                             const Tensor<cpu, 2, DType> &workspace,
+                             const Tensor<cpu, 1, DType> &k_table,
+                             const Tensor<cpu, 1, DType> &c_table,
+                             const int margin,
+                             const DType beta) {
+  LOG(FATAL) << "Not Implemented.";  // reached from LSoftmaxOp<cpu>::Backward
+}
+
+}  // namespace mshadow
+
+namespace mxnet {
+namespace op {
+
+template<>  // CPU factory; note the CPU forward/backward helpers in this file are stubs
+Operator *CreateOp<cpu>(LSoftmaxParam param, int dtype) {
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {  // binds DType according to the runtime dtype code
+    op = new LSoftmaxOp<cpu, DType>(param);
+  })
+  return op;  // raw owning pointer handed to the caller
+}
+
+Operator *LSoftmaxProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
+                                         std::vector<int> *in_type) const {
+  std::vector<TShape> out_shape, aux_shape;
+  std::vector<int> out_type, aux_type;
+  CHECK(InferType(in_type, &out_type, &aux_type));  // both inferences must succeed first
+  CHECK(InferShape(in_shape, &out_shape, &aux_shape));
+  DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0));  // dtype is taken from input 0 (data)
+}
+
+DMLC_REGISTER_PARAMETER(LSoftmaxParam);
+
+MXNET_REGISTER_OP_PROPERTY(LSoftmax, LSoftmaxProp)  // arguments listed as in ListArguments()
+.describe("LSoftmax from <Large-Margin Softmax Loss for Convolutional Neural Networks>")
+.add_argument("data", "Symbol", "data")
+.add_argument("weight", "Symbol", "weight")
+.add_argument("label", "Symbol", "label")
+.add_arguments(LSoftmaxParam::__FIELDS__());
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/lsoftmax.cu b/src/operator/lsoftmax.cu
new file mode 100644
index 0000000..6055c1b
--- /dev/null
+++ b/src/operator/lsoftmax.cu
@@ -0,0 +1,322 @@
+/*!
+ * Copyright (c) 2016 by Contributors
+ * \file lsoftmax.cu
+ * \brief LSoftmax from <Large-Margin Softmax Loss for Convolutional Neural Networks>
+ * \author luoyetx
+ */
+#include "./lsoftmax-inl.h"
+
+namespace mshadow {
+namespace cuda {
+
+namespace {
+// row indices into the backward workspace: cos_t, cos_mt, k, sin2_t, fo, cos_t^(margin-1)
+enum LSoftmaxTempSpaceType {kCost, kCosmt, kK, kSin2t, kFo, kCostM};
+}
+
+#define CUDA_KERNEL_LOOP(i, n) \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
+       i < (n); \
+       i += blockDim.x * gridDim.x)  // start at the global thread id, step by the grid size
+
+MSHADOW_XINLINE int LSPowOfMO(const int k) {  // (-1)^k: +1 when k is even, -1 when k is odd
+  return (k & 0x01) ? -1 : 1;
+}
+
+template<typename DType>  // per row i of x: x_norm[i] = sqrt(sum_j x[i][j]^2)
+__global__ void LSCalcNorm(const Tensor<gpu, 2, DType> x,
+                           Tensor<gpu, 1, DType> x_norm) {
+  const int n = x.size(0);
+  const int m = x.size(1);
+  CUDA_KERNEL_LOOP(i, n) {  // each loop iteration handles one whole row serially
+    DType norm = 0;  // sum of squares, accumulated in DType
+    for (int j = 0; j < m; ++j) {
+      norm += x[i][j] * x[i][j];
+    }
+    x_norm[i] = sqrt(norm);  // no epsilon is added: an all-zero row gives a norm of 0
+  }
+}
+
+// Returns the first k with k_table[k+1] <= cos_t <= k_table[k] (eps slack), or 0 if none fits.
+template<typename DType>
+__device__ int LSFindK(const DType *k_table, const int n, const DType cos_t) {
+  const DType eps = 1e-5;
+  for (int i = 0; i + 1 < n; ++i) {  // k_table[i+1] is read, so stop one short of the end
+    const bool above_lower = (k_table[i+1] < cos_t) || (abs(k_table[i+1] - cos_t) < eps);
+    const bool below_upper = (k_table[i] > cos_t) || (abs(k_table[i] - cos_t) < eps);
+    if (above_lower && below_upper) return i;
+  }
+  return 0;  // also reached for NaN or out-of-range cos_t
+}
+
+template<typename DType>  // cos(margin*t) from cos(t): sum over p = 0..margin/2 of the terms below
+__device__ DType LSCalcCosmt(const DType *c_table, const int n,
+                             const DType cos_t, const int margin) {
+  const DType sin2_t = 1 - cos_t * cos_t;
+  DType cos_t_p = pow(cos_t, margin);  // holds cos_t^(margin - 2p) as p advances
+  DType sin2_t_p = 1;  // holds sin2_t^p
+  DType cos_mt = cos_t_p;  // p = 0
+  for (int p = 1; p <= margin / 2; ++p) {
+    cos_t_p /= cos_t * cos_t;  // don't replace `cos_t*cos_t` with `1-sin2_t`, this can cause numeric issue if cos_t --> 0
+    sin2_t_p *= sin2_t;
+    cos_mt += LSPowOfMO(p) * c_table[2*p] * cos_t_p * sin2_t_p;  // (-1)^p * c[2p] * cos^.. * sin^..
+  }
+  return cos_mt;  // NOTE(review): parameter n is never read in this function
+}
+
+// For every sample i, overwrites the label logit out[i][yi] with (f + beta * fo) / (1 + beta),
+// where fo is the value already stored in `out` and f is its margin-adjusted counterpart.
+template<typename DType>
+__global__ void LSoftmaxForwardKernel(const Tensor<gpu, 2, DType> x,
+                                      const Tensor<gpu, 2, DType> w,
+                                      const Tensor<gpu, 1, DType> label,
+                                      const Tensor<gpu, 1, DType> x_norm,
+                                      const Tensor<gpu, 1, DType> w_norm,
+                                      Tensor<gpu, 2, DType> out,
+                                      const Tensor<gpu, 1, DType> k_table,
+                                      const Tensor<gpu, 1, DType> c_table,
+                                      const int margin,
+                                      const DType beta) {
+  const int n = x.size(0);
+  CUDA_KERNEL_LOOP(i, n) {
+    const int yi = static_cast<int>(label[i]);  // assumes 0 <= yi < w.size(0); not checked here
+    const DType fo_i_yi = out[i][yi];
+    const DType cos_t = fo_i_yi / (x_norm[i] * w_norm[yi]);  // NaN/inf if either norm is 0
+    const int k = LSFindK(k_table.dptr_, k_table.size(0), cos_t);
+    const DType cos_mt = LSCalcCosmt(c_table.dptr_, c_table.size(0), cos_t, margin);
+    const DType f_i_yi = (LSPowOfMO(k) * cos_mt - 2*k) * (w_norm[yi] * x_norm[i]);
+    out[i][yi] = (f_i_yi + beta * fo_i_yi) / (1 + beta);
+  }
+}
+
+// Host-side launcher: fills x_norm / w_norm, then rewrites the label logits in `out`.
+// Kernels are enqueued on the stream attached to `out` instead of the default stream.
+template<typename DType>
+inline void LSoftmaxForward(
+    const Tensor<gpu, 2, DType> &x, const Tensor<gpu, 2, DType> &w,
+    const Tensor<gpu, 1, DType> &label, const Tensor<gpu, 2, DType> &out,
+    const Tensor<gpu, 1, DType> &x_norm, const Tensor<gpu, 1, DType> &w_norm,
+    const Tensor<gpu, 1, DType> &k_table, const Tensor<gpu, 1, DType> &c_table,
+    const int margin, const DType beta) {
+  const int n = x.size(0);
+  const int m = w.size(0);
+  cudaStream_t stream = Stream<gpu>::GetStream(out.stream_);
+  dim3 dimBlock(kBaseThreadNum);
+  dim3 dimGrid((n + kBaseThreadNum - 1) / kBaseThreadNum);
+  LSCalcNorm<<<dimGrid, dimBlock, 0, stream>>>(x, x_norm);
+  dimGrid.x = ((m + kBaseThreadNum - 1) / kBaseThreadNum);
+  LSCalcNorm<<<dimGrid, dimBlock, 0, stream>>>(w, w_norm);
+  dimGrid.x = ((n + kBaseThreadNum - 1) / kBaseThreadNum);
+  LSoftmaxForwardKernel<<<dimGrid, dimBlock, 0, stream>>>(x, w, label, x_norm, w_norm, out,
+                                                          k_table, c_table, margin, beta);
+}
+
+template<typename DType>  // fills the six workspace rows with per-sample backward quantities
+__global__ void LSoftmaxBackwardRequired(const Tensor<gpu, 2, DType> x,
+                                         const Tensor<gpu, 2, DType> w,
+                                         const Tensor<gpu, 1, DType> label,
+                                         const Tensor<gpu, 1, DType> x_norm,
+                                         const Tensor<gpu, 1, DType> w_norm,
+                                         Tensor<gpu, 2, DType> workspace,
+                                         const Tensor<gpu, 1, DType> k_table,
+                                         const Tensor<gpu, 1, DType> c_table,
+                                         const int margin) {
+  const int n = x.size(0);
+  const int feature_dim = x.size(1);
+  CUDA_KERNEL_LOOP(i, n) {
+    const int yi = static_cast<int>(label[i]);  // label is not range-checked here
+    // fo_i_yi = dot(w_yi, x_i)
+    DType fo_i_yi = 0;
+    for (int p = 0; p < feature_dim; ++p) {
+      fo_i_yi += w[yi][p] * x[i][p];
+    }
+    const DType cos_t = fo_i_yi / (x_norm[i] * w_norm[yi]);
+    const int k = LSFindK(k_table.dptr_, k_table.size(0), cos_t);
+    const DType cos_mt = LSCalcCosmt(c_table.dptr_, c_table.size(0), cos_t, margin);
+    const DType sin2_t = 1 - cos_t * cos_t;
+    workspace[kCost][i] = cos_t;
+    workspace[kCosmt][i] = cos_mt;
+    workspace[kK][i] = static_cast<DType>(k);  // stored as DType, cast back to int by readers
+    workspace[kSin2t][i] = sin2_t;
+    workspace[kFo][i] = fo_i_yi;
+    workspace[kCostM][i] = pow(cos_t, margin - 1);  // seed power for the derivative loops
+  }
+}
+
+template<typename DType>  // adds the margin correction to x_grad, one element (i, l) at a time
+__global__ void LSoftmaxBackwardXKernel(const Tensor<gpu, 2, DType> x,
+                                        const Tensor<gpu, 2, DType> w,
+                                        const Tensor<gpu, 1, DType> label,
+                                        const Tensor<gpu, 1, DType> x_norm,
+                                        const Tensor<gpu, 1, DType> w_norm,
+                                        const Tensor<gpu, 2, DType> o_grad,
+                                        Tensor<gpu, 2, DType> x_grad,
+                                        const Tensor<gpu, 2, DType> workspace,
+                                        const Tensor<gpu, 1, DType> c_table,
+                                        const int margin,
+                                        const DType beta) {
+  const int nthreads = x.size(0) * x.size(1);
+  const int feature_dim = x.size(1);
+  CUDA_KERNEL_LOOP(idx, nthreads) {
+    const int i = idx / feature_dim;  // sample index
+    const int l = idx % feature_dim;  // feature index
+    const int yi = static_cast<int>(label[i]);
+    const DType cos_t = workspace[kCost][i];  // values precomputed by LSoftmaxBackwardRequired
+    const DType cos_mt = workspace[kCosmt][i];
+    const int k = static_cast<int>(workspace[kK][i]);
+    const DType sin2_t = workspace[kSin2t][i];
+    const DType fo_i_yi = workspace[kFo][i];
+    const DType w_norm_yi = w_norm[yi];
+    const DType x_norm_i = x_norm[i];
+
+    const DType dcos_dx = w[yi][l] / (w_norm_yi * x_norm_i) - \
+                          fo_i_yi * x[i][l] / (w_norm_yi * x_norm_i * x_norm_i * x_norm_i);
+    const DType dsin2_dx = -2 * cos_t * dcos_dx;  // d(1 - cos_t^2)/dx
+    DType cos_t_p = workspace[kCostM][i];  // starts at cos_t^(margin - 1)
+    DType sin2_t_p = 1;
+    DType dcosm_dx = margin * cos_t_p * dcos_dx;  // p = 0
+    for (int p = 1; p <= margin / 2; ++p) {
+      cos_t_p /= cos_t * cos_t;
+      dcosm_dx += LSPowOfMO(p) * c_table[2*p] * (p * cos_t * dsin2_dx + \
+                    (margin - 2*p) * sin2_t * dcos_dx) * cos_t_p * sin2_t_p;
+      sin2_t_p *= sin2_t;  // updated after use: the term above needs sin2_t^(p-1)
+    }
+    const DType df_dx = (LSPowOfMO(k) * cos_mt - 2*k) * w_norm_yi / x_norm_i * x[i][l] + \
+                         LSPowOfMO(k) * w_norm_yi * x_norm_i * dcosm_dx;
+    const DType alpha = 1 / (1 + beta);
+    x_grad[i][l] += alpha * o_grad[i][yi] * (df_dx - w[yi][l]);  // accumulates onto x_grad
+  }
+}
+
+template<typename DType>  // adds the margin correction to w_grad, one element (j, l) at a time
+__global__ void LSoftmaxBackwardWKernel(const Tensor<gpu, 2, DType> x,
+                                        const Tensor<gpu, 2, DType> w,
+                                        const Tensor<gpu, 1, DType> label,
+                                        const Tensor<gpu, 1, DType> x_norm,
+                                        const Tensor<gpu, 1, DType> w_norm,
+                                        const Tensor<gpu, 2, DType> o_grad,
+                                        Tensor<gpu, 2, DType> w_grad,
+                                        const Tensor<gpu, 2, DType> workspace,
+                                        const Tensor<gpu, 1, DType> c_table,
+                                        const int margin,
+                                        const DType beta) {
+  const int nthreads = w.size(0) * w.size(1);
+  const int n = x.size(0);
+  const int feature_dim = w.size(1);
+  CUDA_KERNEL_LOOP(idx, nthreads) {
+    const int j = idx / feature_dim;  // weight row
+    const int l = idx % feature_dim;  // feature index
+    DType dw = 0;
+    for (int i = 0; i < n; ++i) {  // scan the whole batch for samples labelled j
+      const int yi = static_cast<int>(label[i]);
+      if (yi == j) {
+        const DType cos_t = workspace[kCost][i];
+        const DType cos_mt = workspace[kCosmt][i];
+        const int k = static_cast<int>(workspace[kK][i]);
+        const DType sin2_t = workspace[kSin2t][i];
+        const DType fo_i_yi = workspace[kFo][i];
+        const DType x_norm_i = x_norm[i];
+        const DType w_norm_yi = w_norm[yi];
+
+        const DType dcos_dw = x[i][l] / (w_norm_yi * x_norm_i) - \
+                              fo_i_yi * w[yi][l] / (x_norm_i * w_norm_yi * w_norm_yi * w_norm_yi);
+        const DType dsin2_dw = -2 * cos_t * dcos_dw;  // d(1 - cos_t^2)/dw
+        DType cos_t_p = workspace[kCostM][i];  // starts at cos_t^(margin - 1)
+        DType sin2_t_p = 1;
+        DType dcosm_dw = margin * cos_t_p * dcos_dw;  // p = 0
+        for (int p = 1; p <= margin / 2; ++p) {
+          cos_t_p /= cos_t * cos_t;
+          dcosm_dw += LSPowOfMO(p) * c_table[2*p] * (p * cos_t * dsin2_dw + \
+                        (margin - 2*p) * sin2_t * dcos_dw) * cos_t_p * sin2_t_p;
+          sin2_t_p *= sin2_t;  // updated after use: the term above needs sin2_t^(p-1)
+        }
+        const DType df_dw_j = (LSPowOfMO(k) * cos_mt - 2*k) * x_norm_i / w_norm_yi * w[yi][l] + \
+                               LSPowOfMO(k) * w_norm_yi * x_norm_i * dcosm_dw;
+        dw += o_grad[i][yi] * (df_dw_j - x[i][l]);
+      }
+    }
+    const DType alpha = 1 / (1 + beta);
+    w_grad[j][l] += alpha * dw;  // accumulates onto the value already in w_grad
+  }
+}
+
+// Host-side launcher: precomputes the per-sample workspace rows, then adds the margin
+// correction to x_grad and w_grad. Kernels are enqueued on the stream attached to `x_grad`
+// instead of the default stream, so they are ordered after work queued on that stream.
+template<typename DType>
+inline void LSoftmaxBackward(
+    const Tensor<gpu, 2, DType> &x, const Tensor<gpu, 2, DType> &w,
+    const Tensor<gpu, 1, DType> &label,
+    const Tensor<gpu, 1, DType> &x_norm, const Tensor<gpu, 1, DType> &w_norm,
+    const Tensor<gpu, 2, DType> &o_grad,
+    const Tensor<gpu, 2, DType> &x_grad, const Tensor<gpu, 2, DType> &w_grad,
+    const Tensor<gpu, 2, DType> &workspace,
+    const Tensor<gpu, 1, DType> &k_table, const Tensor<gpu, 1, DType> &c_table,
+    const int margin, const DType beta) {
+  const int n = x.size(0);
+  const int feature_dim = x.size(1);
+  const int m = w.size(0);
+  cudaStream_t stream = Stream<gpu>::GetStream(x_grad.stream_);
+  dim3 dimBlock(kBaseThreadNum);
+  dim3 dimGrid((n + kBaseThreadNum - 1) / kBaseThreadNum);
+  LSoftmaxBackwardRequired<<<dimGrid, dimBlock, 0, stream>>>(x, w, label, x_norm, w_norm,
+                                                             workspace, k_table, c_table, margin);
+  dimGrid.x = ((n * feature_dim + kBaseThreadNum - 1) / kBaseThreadNum);
+  LSoftmaxBackwardXKernel<<<dimGrid, dimBlock, 0, stream>>>(x, w, label, x_norm, w_norm, o_grad,
+                                                            x_grad, workspace, c_table, margin, beta);
+  dimGrid.x = ((m * feature_dim + kBaseThreadNum - 1) / kBaseThreadNum);
+  LSoftmaxBackwardWKernel<<<dimGrid, dimBlock, 0, stream>>>(x, w, label, x_norm, w_norm, o_grad,
+                                                            w_grad, workspace, c_table, margin, beta);
+}
+
+}  // namespace cuda
+
+template<typename DType>  // mshadow-level GPU overload; forwards unchanged to the cuda:: version
+inline void LSoftmaxForward(const Tensor<gpu, 2, DType> &x,
+                            const Tensor<gpu, 2, DType> &w,
+                            const Tensor<gpu, 1, DType> &label,
+                            const Tensor<gpu, 2, DType> &out,
+                            const Tensor<gpu, 1, DType> &x_norm,
+                            const Tensor<gpu, 1, DType> &w_norm,
+                            const Tensor<gpu, 1, DType> &k_table,
+                            const Tensor<gpu, 1, DType> &c_table,
+                            const int margin,
+                            const DType beta) {
+  cuda::LSoftmaxForward(x, w, label, out, x_norm, w_norm,
+                        k_table, c_table, margin, beta);
+}
+
+template<typename DType>  // mshadow-level GPU overload; forwards unchanged to the cuda:: version
+inline void LSoftmaxBackward(const Tensor<gpu, 2, DType> &x,
+                             const Tensor<gpu, 2, DType> &w,
+                             const Tensor<gpu, 1, DType> &label,
+                             const Tensor<gpu, 1, DType> &x_norm,
+                             const Tensor<gpu, 1, DType> &w_norm,
+                             const Tensor<gpu, 2, DType> &o_grad,
+                             const Tensor<gpu, 2, DType> &x_grad,
+                             const Tensor<gpu, 2, DType> &w_grad,
+                             const Tensor<gpu, 2, DType> &workspace,
+                             const Tensor<gpu, 1, DType> &k_table,
+                             const Tensor<gpu, 1, DType> &c_table,
+                             const int margin,
+                             const DType beta) {
+  cuda::LSoftmaxBackward(x, w, label, x_norm, w_norm, o_grad, x_grad, w_grad, workspace,
+                         k_table, c_table, margin, beta);
+}
+
+}  // namespace mshadow
+
+namespace mxnet {
+namespace op {
+
+template<>  // GPU factory: instantiates LSoftmaxOp<gpu, DType> for the requested dtype
+Operator *CreateOp<gpu>(LSoftmaxParam param, int dtype) {
+  Operator *op = NULL;
+  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {  // binds DType according to the runtime dtype code
+    op = new LSoftmaxOp<gpu, DType>(param);
+  })
+  return op;  // raw owning pointer handed to the caller
+}
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/train.sh b/src/train.sh
new file mode 100755
index 0000000..04bc2c0
--- /dev/null
+++ b/src/train.sh
@@ -0,0 +1,64 @@
+#!/usr/bin/env bash
+export MXNET_CPU_WORKER_NTHREADS=15  # MXNet env var: CPU worker thread count (see MXNet docs)
+export MXNET_CUDNN_AUTOTUNE_DEFAULT=0  # MXNet env var: 0 presumably turns cuDNN autotune off - confirm
+export MXNET_ENGINE_TYPE=ThreadedEnginePerDevice  # MXNet env var: selects the execution engine
+
+#export CUDA_VISIBLE_DEVICES='4,5'
+#python -u train_softmax.py --retrain --pretrained '../model/sphereface-152-0-0' --load-epoch 8 --prefix '../model/sphereface-retrain-0' --loss-type 0
+export CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7'  # superseded by a later export before any job starts
+export CUDA_VISIBLE_DEVICES='0,1,2,3'  # superseded by a later export before any job starts
+export CUDA_VISIBLE_DEVICES='4,5,6,7'  # superseded by a later export before any job starts
+export CUDA_VISIBLE_DEVICES='4,5'  # superseded by a later export before any job starts
+export CUDA_VISIBLE_DEVICES='0,1'  # superseded by a later export before any job starts
+#python -u train_softmax.py --network 's60' --patch '16_0_96_112_0' --loss-type 1 > logs60_l1_v4 2>&1 &
+#python -u train_softmax.py --network 's60' --patch '0_0_96_95_0' --loss-type 1 --prefix '../model/spherefacex'
+#python -u train_softmax.py --network 's20' --patch '0_0_96_112_0' --loss-type 0
+#python -u train_softmax.py --network 'r50' --patch '0_0_96_112_0' --loss-type 0
+#python -u train_softmax.py --network 'm4' --patch '0_0_96_112_0' --loss-type 1 --prefix '../model/spherefacem' --per-batch-size 224 > celm.log 2>&1 &
+#python -u train_softmax.py --network 'm29' --patch '0_0_96_112_0' --loss-type 0 --lr 0.1 --prefix '../model/softmax' --verbose 2000 --per-batch-size 128 > sx_m29.log 2>&1 &
+#python -u train_softmax.py --network 'm29' --patch '0_0_96_112_0' --loss-type 1 --lr 0.1 --prefix '../model/sphere47' --verbose 2000 --per-batch-size 224 --beta-min 4.7 > sp_m29_47.log 2>&1 &
+export CUDA_VISIBLE_DEVICES='2,3'  # superseded by a later export before any job starts
+#python -u train_softmax.py --network 'm1' --patch '0_0_96_112_0' --loss-type 0 --lr 0.01 --prefix '../model/marginal0' --verbose 2000
+#python -u train_softmax.py --network 's60' --patch '0_0_96_95_0' --loss-type 1
+#python -u train_softmax.py --network 's20' --patch '0_0_96_95_0' --loss-type 1
+#python -u train_softmax.py --network 's60' --patch '0_0_96_112_0' --loss-type 1 --prefix '../model/spherefacec' > logs60_c 2>&1 &
+#python -u train_marginal.py --patch '0_0_96_112_0' --network 's36' --verbose 1000 --lr 0.01
+#python -u train_coco.py --patch '0_0_96_112_0' --images-per-identity 32
+#python -u train_softmax.py --network 's36' --patch '0_0_96_112_0' --loss-type 1 --prefix '../model/spherefacei36' --per-batch-size 256
+#python -u train_softmax.py --network 's36' --patch '0_0_96_112_0' --loss-type 1 --prefix '../model/spherefacei36' --per-batch-size 256 > cel4.log 2>&1 &
+#python -u train_softmax.py --network 'm28' --patch '0_0_96_112_0' --loss-type 11 --lr 0.1 --prefix '../model/L11' --verbose 500 --per-batch-size 128 --images-per-identity 4
+#python -u train_softmax.py --network 'm27' --patch '0_0_96_112_0' --loss-type 1 --lr 0.1 --prefix '../model/sphere' --verbose 2000 --per-batch-size 224 > sp_m27.log 2>&1 &
+#python -u train_softmax.py --network 'm27' --patch '0_0_96_112_0' --loss-type 0 --lr 0.1 --prefix '../model/softmax' --verbose 2000 --per-batch-size 128 > sx_m27.log 2>&1 &
+export CUDA_VISIBLE_DEVICES='4,5'  # superseded by a later export before any job starts
+#python -u train_softmax.py --network 'm29' --patch '0_0_96_112_0' --loss-type 0 --lr 0.1 --prefix '../model/softmax' --verbose 2000 --per-batch-size 128 > sx_m29.log 2>&1 &
+#python -u train_softmax.py --network 'm27' --patch '0_0_96_112_0' --loss-type 1 --lr 0.1 --prefix '../model/sphere' --verbose 2000 --per-batch-size 224 > sp_m27.log 2>&1 &
+#python -u train_softmax.py --network 'm28' --patch '0_0_96_112_0' --loss-type 1 --lr 0.1 --prefix '../model/sphere' --verbose 2000 --per-batch-size 224 > sp_m28.log 2>&1 &
+export CUDA_VISIBLE_DEVICES='6,7'  # superseded by a later export before any job starts
+#python -u train_softmax.py --network 'm29' --patch '0_0_96_112_0' --loss-type 1 --lr 0.1 --prefix '../model/spherem' --verbose 2000 --per-batch-size 224
+#python -u train_softmax.py --network 'm28' --patch '0_0_96_112_0' --loss-type 0 --lr 0.1 --prefix '../model/softmax' --verbose 2000 --per-batch-size 128 > sx_m28.log 2>&1 &
+#python -u train_marginal.py --patch '0_0_96_112_0' --network 'i4' --verbose 2000 --lr 0.01
+#python -u train_softmax.py --network 'i4' --patch '0_0_96_112_0' --loss-type 1 --gamma 0.06 --beta-min 4
+#python -u train_softmax.py --network 'x4' --patch '0_0_96_112_0' --loss-type 1 --gamma 0.09
+#python -u train_softmax.py --network 's60' --patch '0_0_80_95_0' --loss-type 1 > logs60_l1_v3 2>&1 &
+#python -u train_softmax.py --network 's60' --patch '0_0_96_95_0' --loss-type 1 > logs60_l1_v2 2>&1 &
+#python -u train_softmax.py --network 's20' --patch '0_0_96_112_0'
+export CUDA_VISIBLE_DEVICES='4,5,6,7'  # value in effect for the job launched on the next line
+python -u train_softmax.py --network 'm29' --patch '0_0_96_112_0' --loss-type 0 --lr 0.1 --prefix '../model/softmax' --verbose 2000 --per-batch-size 128 > sx_m29.log 2>&1 &  # only active command; backgrounded, stdout+stderr go to sx_m29.log
+#python -u train_softmax.py --network 's60' --patch '0_0_96_95_0' --loss-type 1 --gamma 0.06 --beta-freeze 5000 --prefix '../model/spherefacei' > cel2.log 2>&1 &
+export CUDA_VISIBLE_DEVICES='0,1,2,3'  # no active command follows in this script
+#python -u train_softmax.py --network 'm29' --patch '0_0_96_112_0' --loss-type 1 --lr 0.1 --prefix '../model/spherem' --verbose 2000 --per-batch-size 224 --lr-steps '60000,80000,90000' > spm_m29.log 2>&1 &
+#python -u train_softmax.py --network 's60' --patch '0_15_96_112_0' --loss-type 1 --gamma 0.06 --beta-freeze 5000 --prefix '../model/spherefacei' > cel3.log 2>&1 &
+export CUDA_VISIBLE_DEVICES='2'  # no active command follows in this script
+#python -u train_marginal.py --patch '0_0_96_112_0' --network 's36' --verbose 2000 --lr 0.01 > mar_s36.log 2>&1 &
+export CUDA_VISIBLE_DEVICES='3'  # no active command follows in this script
+#python -u train_marginal.py --patch '0_0_96_112_0' --network 'i4' --verbose 2000 --lr 0.01 > mar_i4.log 2>&1 &
+#python -u train_softmax.py --network 'i4' --patch '0_0_96_112_0' --loss-type 1 --gamma 0.06 --beta-freeze 5000
+#python -u train_softmax.py --network 'r50' --patch '0_0_96_112_0' --loss-type 1 --gamma 0.24 > logr50_l1 2>&1 &
+#python -u train_softmax.py --network 'r50' --patch '0_0_96_112_0' --loss-type 2 --verbose 100
+#python -u train_softmax.py --network 'r50' --patch '0_0_96_95_0' > logr101_pu 2>&1 &
+#python -u train_softmax.py --network 'r50' --patch '0_0_96_112_0'
+#python -u train_softmax.py --network 'r101' --patch '0_0_96_95_0'
+#python -u train_softmax.py --loss-type 1 --num-layers 64 --patch '0_0_96_112_0'
+#python -u train_softmax.py --loss-type 1 --num-layers 36 --patch '0_0_96_95_0'
+#python -u train_softmax.py --loss-type 1 --num-layers 20 --patch '0_0_80_95_0'
+
diff --git a/src/train_softmax.py b/src/train_softmax.py
index 7f653bf..83cbe0a 100644
--- a/src/train_softmax.py
+++ b/src/train_softmax.py
@@ -17,7 +17,6 @@ from mxnet import ndarray as nd
 import argparse
 import mxnet.optimizer as optimizer
 sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))
-import resnet_dcn
 import spherenet
 import marginalnet
 import inceptions
@@ -26,7 +25,8 @@ import lfw
 import sklearn
 from sklearn.decomposition import PCA
 #from center_loss import *
-import asoftmax
+#import resnet_dcn
+#import asoftmax
 
 
 logger = logging.getLogger()
@@ -105,6 +105,8 @@ def parse_args():
       help='')
   parser.add_argument('--loss-type', type=int, default=1,
       help='')
+  parser.add_argument('--incay', action='store_true', default=False,
+      help='feature incay')
   parser.add_argument('--use-deformable', type=int, default=0,
       help='')
   parser.add_argument('--patch', type=str, default='0_0_96_112_0',
@@ -138,6 +140,7 @@ def get_symbol(args, arg_params, aux_params):
     _,_,embedding,_ = resnet_dcn.get_symbol(512, args.num_layers)
   gt_label = mx.symbol.Variable('softmax_label')
   assert args.loss_type>=0
+  extra_loss = None
   if args.loss_type==0:
     _weight = mx.symbol.Variable('fc7_weight')
     _bias = mx.symbol.Variable('fc7_bias', lr_mult=2.0, wd_mult=0.0)
@@ -206,12 +209,22 @@ def get_symbol(args, arg_params, aux_params):
     softmax = mx.symbol.SoftmaxOutput(data=fc7, label = gt_label, name='softmax', normalization='valid')
   else:
     softmax = mx.symbol.SoftmaxOutput(data=fc7, label = gt_label, name='softmax')
+  if args.loss_type<=1 and args.incay:
+    params = [1.e-10, 0.01]
+    sel = mx.symbol.argmax(data = fc7, axis=1)
+    sel = (sel==gt_label)
+    norm = embedding*embedding
+    norm = mx.symbol.sum(norm, axis=1)
+    norm += params[0]
+    feature_incay = sel/norm
+    feature_incay = mx.symbol.mean(feature_incay) * params[1]
+    extra_loss = mx.symbol.MakeLoss(feature_incay)
   #out = softmax
   #l2_embedding = mx.symbol.L2Normalization(embedding)
 
   #ce = mx.symbol.softmax_cross_entropy(fc7, gt_label, name='softmax_ce')/args.per_batch_size
   #out = mx.symbol.Group([mx.symbol.BlockGrad(embedding), softmax, mx.symbol.BlockGrad(ce)])
-  if args.loss_type>=10 and extra_loss is not None:
+  if extra_loss is not None:
     out = mx.symbol.Group([mx.symbol.BlockGrad(embedding), softmax, extra_loss])
   else:
     out = mx.symbol.Group([mx.symbol.BlockGrad(embedding), softmax])
@@ -277,7 +290,7 @@ def train_net(args):
 
     path_imglist = "/raid5data/dplearn/faces_normed/train.lst"
     args.num_classes = 82395
-    args.use_val = True
+    args.use_val = False
     val_path = "/raid5data/dplearn/faces_normed/val.lst"
     path_imgrec = "/opt/jiaguo/faces_normed/train.rec"
     val_rec = "/opt/jiaguo/faces_normed/val.rec"
@@ -291,7 +304,7 @@ def train_net(args):
     data_shape = (args.image_channel,112,96)
     mean = [127.5,127.5,127.5]
 
-    if args.use_val and args.loss_type<=1:
+    if args.use_val:
       val_dataiter = FaceImageIter2(
           batch_size           = args.batch_size,
           data_shape           = data_shape,
@@ -349,7 +362,7 @@ def train_net(args):
       )
 
 
-    if args.loss_type<=1:
+    if args.loss_type<=9:
       train_dataiter = FaceImageIter2(
           batch_size           = args.batch_size,
           data_shape           = data_shape,
@@ -570,7 +583,8 @@ def train_net(args):
     global_step = [0]
     save_step = [0]
     if len(args.lr_steps)==0:
-      lr_steps = [40000, 70000, 90000]
+      #lr_steps = [40000, 70000, 90000]
+      lr_steps = [30000, 50000, 70000, 90000]
       if args.loss_type==1:
         lr_steps = [70000, 100000]
     else:
@@ -595,16 +609,21 @@ def train_net(args):
         acc, embeddings_list = lfw_test(mbatch)
         save_step[0]+=1
         msave = save_step[0]
+        do_save = False
         if acc>=highest_acc[0]:
           highest_acc[0] = acc
-          if acc>=0.992:
-            print('saving', msave)
-            arg, aux = model.get_params()
-            mx.model.save_checkpoint(prefix, msave, model.symbol, arg, aux)
-            lfw_npy = "%s-lfw-%04d" % (prefix, msave)
-            X = np.concatenate(embeddings_list, axis=0)
-            print(X.shape)
-            np.save(lfw_npy, X)
+          if acc>=0.995:
+            do_save = True
+        if mbatch>lr_steps[-1] and msave%5==0:
+          do_save = True
+        if do_save:
+          print('saving', msave)
+          arg, aux = model.get_params()
+          mx.model.save_checkpoint(prefix, msave, model.symbol, arg, aux)
+          #lfw_npy = "%s-lfw-%04d" % (prefix, msave)
+          #X = np.concatenate(embeddings_list, axis=0)
+          #print(X.shape)
+          #np.save(lfw_npy, X)
         print('[%d]Accuracy-Highest: %1.5f'%(mbatch, highest_acc[0]))
       if mbatch<=args.beta_freeze:
         _beta = args.beta