This commit is contained in:
nttstar
2017-11-16 20:20:19 +08:00
6 changed files with 880 additions and 15 deletions

View File

@@ -491,6 +491,12 @@ class FaceImageIter2(io.DataIter):
else:
label, fname, bbox, landmark = self.imglist[idx]
return label, self.read_image(fname), bbox, landmark
else:
s = self.imgrec.read()
if s is None:
raise StopIteration
header, img = recordio.unpack(s)
return header.label, img, None, None
def brightness_aug(self, src, x):
alpha = 1.0 + random.uniform(-x, x)

379
src/operator/lsoftmax-inl.h Normal file
View File

@@ -0,0 +1,379 @@
/*!
* Copyright (c) 2016 by Contributors
* \file lsoftmax-inl.h
* \brief LSoftmax from <Large-Margin Softmax Loss for Convolutional Neural Networks>
* \author luoyetx
*/
#ifndef MXNET_OPERATOR_LSOFTMAX_INL_H_
#define MXNET_OPERATOR_LSOFTMAX_INL_H_
#include <dmlc/logging.h>
#include <dmlc/parameter.h>
#include <mxnet/operator.h>
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <map>
#include <string>
#include <vector>
#include "./operator_common.h"
namespace mxnet {
namespace op {
namespace lsoftmax_enum {
enum LSoftmaxOpInputs {kData, kWeight, kLabel};
enum LSoftmaxOpOutputs {kOut, kDataNorm, kWeightNorm};
enum LSoftmaxResource {kTempSpace};
}
// Hyper-parameters for the LSoftmax (large-margin softmax) operator.
struct LSoftmaxParam : public dmlc::Parameter<LSoftmaxParam> {
  int margin;      // angular margin multiplier m in cos(m*theta)
  float beta;      // lambda: weight of the original logit when blending
  float beta_min;  // lower clamp applied while beta is annealed
  float scale;     // per-backward-call decay factor applied to beta
  int num_hidden;  // number of output classes (rows of the weight matrix)
  bool grad_norm;  // NOTE(review): only referenced from commented-out code in this header
  int verbose;     // log beta / grad norms every `verbose` backward calls (0 = silent)
  float eps;       // epsilon added under the sqrt in l2-norm computations
  DMLC_DECLARE_PARAMETER(LSoftmaxParam) {
    DMLC_DECLARE_FIELD(margin).set_default(2).set_lower_bound(1)
    .describe("LSoftmax margin");
    DMLC_DECLARE_FIELD(beta).set_default(1).set_lower_bound(0)
    .describe("LSoftmax beta, same as lambda to weight original value");
    DMLC_DECLARE_FIELD(beta_min).set_default(0).set_lower_bound(0)
    .describe("Minimum beta");
    DMLC_DECLARE_FIELD(scale).set_default(1).set_range(0, 1)
    .describe("Scale of beta during training for every iteration");
    DMLC_DECLARE_FIELD(num_hidden).set_lower_bound(1)
    .describe("Number of hidden nodes of the output");
    DMLC_DECLARE_FIELD(grad_norm).set_default(false)
    .describe("do grad norm");
    DMLC_DECLARE_FIELD(verbose).set_default(0)
    .describe("Log for beta change");
    DMLC_DECLARE_FIELD(eps).set_default(1e-10f)
    .describe("l2 eps");
  }
};
// Large-Margin Softmax operator (from "Large-Margin Softmax Loss for
// Convolutional Neural Networks"). Behaves as a fully-connected layer whose
// target-class logit is, during training, blended with the large-margin form
//   |w_yi| * |x_i| * ((-1)^k * cos(m*theta) - 2k)
// using the annealed weight beta. The margin math itself lives in the free
// functions LSoftmaxForward/LSoftmaxBackward (GPU implementation in
// lsoftmax.cu; the CPU versions in lsoftmax.cc abort).
template<typename xpu, typename DType>
class LSoftmaxOp : public Operator {
 public:
  explicit LSoftmaxOp(LSoftmaxParam param) {
    this->param_ = param;
    // setup global lookup table:
    //   k_table_[i] = cos(i*pi/margin)  -- interval boundaries used to find k
    //   c_table_[i] = C(margin, i)      -- binomial coefficients of the
    //                                      cos(m*theta) expansion
    k_table_.clear();
    c_table_.clear();
    k_table_.push_back(1);
    c_table_.push_back(1);
    const int margin = param.margin;
    const double pi = std::atan(1) * 4;
    double factor = 1;
    for (int i = 1; i <= margin; ++i) {
      factor = factor * (margin - i + 1) / i;  // binomial recurrence C(m, i)
      k_table_.push_back(std::cos(i * pi / margin));
      c_table_.push_back(factor);
    }
    //next_beta_ = param.beta * 0.1f;
    count_ = 0;
    // Environment overrides: BETA pins beta to a fixed value; otherwise
    // GLOBAL_STEP fast-forwards the annealing schedule beta * scale^step
    // (clamped below by beta_min), e.g. when resuming training.
    if(const char* env_p = std::getenv("BETA")) {
      float _beta = std::atof(env_p);
      if (param_.verbose) {
        LOG(INFO)<<"beta:"<<_beta;
      }
      param_.beta = _beta;
    }
    else if(const char* env_p = std::getenv("GLOBAL_STEP")) {
      int nbatch = std::atoi(env_p);
      if (param_.verbose) {
        LOG(INFO)<<"nbatch:"<<nbatch;
      }
      float _beta = param.beta*std::pow((double)param.scale, (double)nbatch);
      param_.beta = std::max(_beta, param_.beta_min);
    }
    if (param_.verbose) {
      LOG(INFO)<<param_.margin<<","<<param_.beta<<","<<param_.beta_min<<","<<param_.scale;
    }
  }

  // Forward: out = x * w^T; in training mode the target-class logits are then
  // rewritten on-device by LSoftmaxForward, which also fills x_norm / w_norm
  // (row l2 norms) that the backward pass reuses.
  virtual void Forward(const OpContext &ctx,
                       const std::vector<TBlob> &in_data,
                       const std::vector<OpReqType> &req,
                       const std::vector<TBlob> &out_data,
                       const std::vector<TBlob> &aux_args) {
    using namespace mshadow;
    using namespace mshadow::expr;
    CHECK_EQ(in_data.size(), 3);
    CHECK_EQ(out_data.size(), 3);
    CHECK_EQ(req.size(), 3);
    CHECK_EQ(req[lsoftmax_enum::kOut], kWriteTo);
    CHECK(req[lsoftmax_enum::kDataNorm] == kNullOp ||
          req[lsoftmax_enum::kDataNorm] == kWriteTo);
    CHECK(req[lsoftmax_enum::kWeightNorm] == kNullOp ||
          req[lsoftmax_enum::kWeightNorm] == kWriteTo);
    Stream<xpu> *s = ctx.get_stream<xpu>();
    const int n = in_data[lsoftmax_enum::kData].size(0);    // batch size
    const int m = in_data[lsoftmax_enum::kWeight].size(0);  // num classes
    Tensor<xpu, 2, DType> x = in_data[lsoftmax_enum::kData].FlatTo2D<xpu, DType>(s);
    Tensor<xpu, 2, DType> w = in_data[lsoftmax_enum::kWeight].FlatTo2D<xpu, DType>(s);
    Tensor<xpu, 1, DType> label = in_data[lsoftmax_enum::kLabel].get_with_shape<xpu, 1, DType>(Shape1(n), s);
    Tensor<xpu, 2, DType> out = out_data[lsoftmax_enum::kOut].FlatTo2D<xpu, DType>(s);
    Tensor<xpu, 1, DType> x_norm = out_data[lsoftmax_enum::kDataNorm].get_with_shape<xpu, 1, DType>(Shape1(n), s);
    Tensor<xpu, 1, DType> w_norm = out_data[lsoftmax_enum::kWeightNorm].get_with_shape<xpu, 1, DType>(Shape1(m), s);
#if defined(__CUDACC__)
    CHECK_EQ(s->blas_handle_ownership_, Stream<xpu>::OwnHandle)
        << "Must init CuBLAS handle in stream";
#endif
    // original fully connected
    out = dot(x, w.T());
    if (ctx.is_train) {
      // large margin fully connected
      const int margin = param_.margin;
      // BETA env var is re-read every forward call so beta can be changed
      // while training is running.
      if(const char* env_p = std::getenv("BETA")) {
        float _beta = std::atof(env_p);
        param_.beta = _beta;
      }
      const DType beta = static_cast<DType>(param_.beta);
      //LOG(INFO)<<"beta:"<<beta<<std::endl;
      // NOTE(review): the lookup tables are re-allocated on the device and
      // copied host->device on every call; they are constant and could be
      // cached.
      Tensor<cpu, 1, DType> k_table_cpu(k_table_.data(), Shape1(k_table_.size()));
      Tensor<cpu, 1, DType> c_table_cpu(c_table_.data(), Shape1(c_table_.size()));
      Tensor<xpu, 1, DType> k_table_xpu(Shape1(k_table_.size()));
      Tensor<xpu, 1, DType> c_table_xpu(Shape1(c_table_.size()));
      k_table_xpu.set_stream(s);
      c_table_xpu.set_stream(s);
      AllocSpace(&k_table_xpu);
      AllocSpace(&c_table_xpu);
      Copy(k_table_xpu, k_table_cpu, s);
      Copy(c_table_xpu, c_table_cpu, s);
      LSoftmaxForward(x, w, label, out, x_norm, w_norm, k_table_xpu, c_table_xpu, margin, beta);
      FreeSpace(&k_table_xpu);
      FreeSpace(&c_table_xpu);
    }
  }

  //virtual void GradNorm(mshadow::Tensor<xpu, 2, DType> grad, mshadow::Stream<xpu>* s) {
  //  using namespace mshadow;
  //  using namespace mshadow::expr;
  //  Tensor<cpu, 2, DType> grad_cpu(grad.shape_);
  //  AllocSpace(&grad_cpu);
  //  Copy(grad_cpu, grad, s);
  //  DType grad_norm = param_.eps;
  //  for(uint32_t i=0;i<grad_cpu.shape_[0];i++) {
  //    for(uint32_t j=0;j<grad_cpu.shape_[1];j++) {
  //      grad_norm += grad_cpu[i][j]*grad_cpu[i][j];
  //    }
  //  }
  //  grad_norm = sqrt(grad_norm);
  //  grad_cpu /= grad_norm;
  //  Copy(grad, grad_cpu, s);
  //  FreeSpace(&grad_cpu);
  //}

  // Debug helper: returns sqrt(eps + sum of squares) of `grad`, computed by
  // copying the tensor to the host. Used only for verbose logging; does not
  // modify `grad`.
  virtual DType GradNorm(mshadow::Tensor<xpu, 2, DType> grad, mshadow::Stream<xpu>* s) {
    using namespace mshadow;
    using namespace mshadow::expr;
    Tensor<cpu, 2, DType> grad_cpu(grad.shape_);
    AllocSpace(&grad_cpu);
    Copy(grad_cpu, grad, s);
    DType grad_norm = param_.eps;
    for(uint32_t i=0;i<grad_cpu.shape_[0];i++) {
      for(uint32_t j=0;j<grad_cpu.shape_[1];j++) {
        grad_norm += grad_cpu[i][j]*grad_cpu[i][j];
      }
    }
    grad_norm = sqrt(grad_norm);
    //grad_cpu /= grad_norm;
    //Copy(grad, grad_cpu, s);
    FreeSpace(&grad_cpu);
    return grad_norm;
  }

  // Backward: computes the plain fully-connected gradients via GEMM, then
  // LSoftmaxBackward corrects the target-class contributions for the
  // large-margin blend. Also decays beta (beta *= scale, clamped at
  // beta_min) once per call unless the BETA env var pins it.
  virtual void Backward(const OpContext &ctx,
                        const std::vector<TBlob> &out_grad,
                        const std::vector<TBlob> &in_data,
                        const std::vector<TBlob> &out_data,
                        const std::vector<OpReqType> &req,
                        const std::vector<TBlob> &in_grad,
                        const std::vector<TBlob> &aux_args) {
    using namespace mshadow;
    using namespace mshadow::expr;
    CHECK_EQ(out_grad.size(), 1);
    CHECK_EQ(in_data.size(), 3);
    CHECK_EQ(out_data.size(), 3);
    CHECK_GE(in_grad.size(), 2);
    CHECK_GE(req.size(), 2);
    CHECK_EQ(req[lsoftmax_enum::kData], kWriteTo);
    CHECK_EQ(req[lsoftmax_enum::kWeight], kWriteTo);
    Stream<xpu> *s = ctx.get_stream<xpu>();
    const int n = in_data[lsoftmax_enum::kData].size(0);    // batch size
    const int m = in_data[lsoftmax_enum::kWeight].size(0);  // num classes
    Tensor<xpu, 2, DType> x = in_data[lsoftmax_enum::kData].FlatTo2D<xpu, DType>(s);
    Tensor<xpu, 2, DType> w = in_data[lsoftmax_enum::kWeight].FlatTo2D<xpu, DType>(s);
    Tensor<xpu, 1, DType> label = in_data[lsoftmax_enum::kLabel].get_with_shape<xpu, 1, DType>(Shape1(n), s);
    Tensor<xpu, 1, DType> x_norm = out_data[lsoftmax_enum::kDataNorm].get_with_shape<xpu, 1, DType>(Shape1(n), s);
    Tensor<xpu, 1, DType> w_norm = out_data[lsoftmax_enum::kWeightNorm].get_with_shape<xpu, 1, DType>(Shape1(m), s);
    Tensor<xpu, 2, DType> o_grad = out_grad[lsoftmax_enum::kOut].FlatTo2D<xpu, DType>(s);
    Tensor<xpu, 2, DType> x_grad = in_grad[lsoftmax_enum::kData].FlatTo2D<xpu, DType>(s);
    Tensor<xpu, 2, DType> w_grad = in_grad[lsoftmax_enum::kWeight].FlatTo2D<xpu, DType>(s);
    // workspace is used for cos_t, cos_mt, k, sin2_t, fo and cos_t_m for every data point
    Tensor<xpu, 2, DType> workspace = ctx.requested[lsoftmax_enum::kTempSpace].get_space_typed<xpu, 2, DType>(Shape2(6, n), s);
#if defined(__CUDACC__)
    CHECK_EQ(s->blas_handle_ownership_, Stream<xpu>::OwnHandle)
        << "Must init CuBLAS handle in stream";
#endif
    // original fully connected
    x_grad = dot(o_grad, w);
    w_grad = dot(o_grad.T(), x);
    // large margin fully connected
    const int margin = param_.margin;
    const DType beta = static_cast<DType>(param_.beta);
    count_+=1;
    // Periodic diagnostics: log current beta and the l2 norms of both
    // gradients. (`DType n` below shadows the earlier `const int n`.)
    if (param_.verbose) {
      if(count_%param_.verbose==0) {
        LOG(INFO)<<"["<<count_<<"]current beta:"<<beta;
        DType n = GradNorm(x_grad, s);
        LOG(INFO)<<"x_grad norm:"<<n;
        n = GradNorm(w_grad, s);
        LOG(INFO)<<"w_grad norm:"<<n;
      }
    }
    // Copy the constant lookup tables to the device for the kernel (see the
    // matching note in Forward about caching).
    Tensor<cpu, 1, DType> k_table_cpu(k_table_.data(), Shape1(k_table_.size()));
    Tensor<cpu, 1, DType> c_table_cpu(c_table_.data(), Shape1(c_table_.size()));
    Tensor<xpu, 1, DType> k_table_xpu(Shape1(k_table_.size()));
    Tensor<xpu, 1, DType> c_table_xpu(Shape1(c_table_.size()));
    k_table_xpu.set_stream(s);
    c_table_xpu.set_stream(s);
    AllocSpace(&k_table_xpu);
    AllocSpace(&c_table_xpu);
    Copy(k_table_xpu, k_table_cpu, s);
    Copy(c_table_xpu, c_table_cpu, s);
    LSoftmaxBackward(x, w, label, x_norm, w_norm, o_grad, x_grad, w_grad, workspace,
                     k_table_xpu, c_table_xpu, margin, beta);
    FreeSpace(&k_table_xpu);
    FreeSpace(&c_table_xpu);
    //if(param_.grad_norm) {
    //  GradNorm(x_grad, s);
    //  GradNorm(w_grad, s);
    //}
    // dirty hack, should also work for multi device
    if(std::getenv("BETA")==NULL) {
      param_.beta *= param_.scale;
      param_.beta = std::max(param_.beta, param_.beta_min);
    }
    //LOG(INFO)<<"w_grad:"<<w_grad.shape_[0]<<","<<w_grad.shape_[1];
    //norm = reduce_with_axis<red::sum, false>(F<mxnet::op::mshadow_op::square>(w_grad), 2);
    //norm = F<mxnet::op::mshadow_op::square_root>(norm + param_.eps);
    //out = data / broadcast_with_axis(norm, 1, dshape[2]);
    //if (param_.beta < next_beta_) {
    //  next_beta_ *= 0.1f;
    //  if (param_.verbose) {
    //    LOG(INFO) << "LSoftmax changes beta to " << param_.beta;
    //  }
    //}
  }

  //Tensor<xpu, 2, DType> grad_norm(const Tensor<xpu, 2, DType> grad) {
  //}

 private:
  LSoftmaxParam param_;
  // global lookup table (see constructor for contents)
  std::vector<DType> k_table_;
  std::vector<DType> c_table_;
  //float next_beta_;
  uint32_t count_;  // number of backward calls, drives verbose logging cadence
};  // class LSoftmaxOp
template<typename xpu>
Operator *CreateOp(LSoftmaxParam param, int dtype);
#if DMLC_USE_CXX11
// Operator property for LSoftmax: argument/output naming, shape inference,
// backward dependencies, and operator creation dispatch.
class LSoftmaxProp : public OperatorProperty {
 public:
  void Init(const std::vector<std::pair<std::string, std::string> > &kwargs) override {
    param_.Init(kwargs);
  }

  std::map<std::string, std::string> GetParams() const override {
    return param_.__DICT__();
  }

  // Argument order matches lsoftmax_enum::LSoftmaxOpInputs.
  std::vector<std::string> ListArguments() const override {
    return {"data", "weight", "label"};
  }

  std::vector<std::string> ListOutputs() const override {
    return {"output", "data_norm", "weight_norm"};
  }

  int NumOutputs() const override {
    return 3;
  }

  // Only `output` is user-visible; the two norm outputs exist solely so the
  // backward pass can reuse them (see DeclareBackwardDependency).
  int NumVisibleOutputs() const override {
    return 1;
  }

  // Infers weight shape (num_hidden, feature_dim) from data, and the three
  // output shapes: (n, m), (n,), (m,).
  bool InferShape(std::vector<TShape> *in_shape,
                  std::vector<TShape> *out_shape,
                  std::vector<TShape> *aux_shape) const override {
    using namespace mshadow;
    // Fixed: message previously said "[data, label, weight]", which
    // contradicts the actual argument order in ListArguments().
    CHECK_EQ(in_shape->size(), 3) << "Input:[data, weight, label]";
    const TShape &dshape = in_shape->at(lsoftmax_enum::kData);
    const TShape &lshape = in_shape->at(lsoftmax_enum::kLabel);
    CHECK_EQ(dshape.ndim(), 2) << "data shape should be (batch_size, feature_dim)";
    CHECK_EQ(lshape.ndim(), 1) << "label shape should be (batch_size,)";
    const int n = dshape[0];
    const int feature_dim = dshape[1];
    const int m = param_.num_hidden;
    SHAPE_ASSIGN_CHECK(*in_shape, lsoftmax_enum::kWeight, Shape2(m, feature_dim));
    out_shape->clear();
    out_shape->push_back(Shape2(n, m));  // output
    out_shape->push_back(Shape1(n));  // data norm
    out_shape->push_back(Shape1(m));  // weight norm
    aux_shape->clear();
    return true;
  }

  // Backward needs temp space for the per-sample cached quantities.
  std::vector<ResourceRequest> BackwardResource(
      const std::vector<TShape> &in_shape) const override {
    return {ResourceRequest::kTempSpace};
  }

  std::vector<int> DeclareBackwardDependency(
      const std::vector<int> &out_grad,
      const std::vector<int> &in_data,
      const std::vector<int> &out_data) const override {
    return {out_grad[lsoftmax_enum::kOut], out_data[lsoftmax_enum::kDataNorm],
            out_data[lsoftmax_enum::kWeightNorm], in_data[lsoftmax_enum::kData],
            in_data[lsoftmax_enum::kWeight], in_data[lsoftmax_enum::kLabel]};
  }

  std::string TypeString() const override {
    return "LSoftmax";
  }

  OperatorProperty *Copy() const override {
    auto ptr = new LSoftmaxProp();
    ptr->param_ = param_;
    return ptr;
  }

  // Shape/type-aware creation goes through CreateOperatorEx instead.
  Operator *CreateOperator(Context ctx) const override {
    LOG(FATAL) << "Not Implemented.";
    return NULL;
  }

  Operator *CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
                             std::vector<int> *in_type) const override;

 private:
  LSoftmaxParam param_;
};  // class LSoftmaxProp
#endif // DMLC_USE_CXX11
} // namespace op
} // namespace mxnet
#endif // MXNET_OPERATOR_LSOFTMAX_INL_H_

75
src/operator/lsoftmax.cc Normal file
View File

@@ -0,0 +1,75 @@
/*!
* Copyright (c) 2016 by Contributors
* \file lsoftmax.cc
* \brief LSoftmax from <Large-Margin Softmax Loss for Convolutional Neural Networks>
* \author luoyetx
*/
#include "./lsoftmax-inl.h"
namespace mshadow {
// CPU placeholder for the large-margin forward pass: the operator is only
// implemented for GPU (see lsoftmax.cu), so running it on CPU aborts.
template <typename DType>
inline void LSoftmaxForward(const Tensor<cpu, 2, DType> &x,
                            const Tensor<cpu, 2, DType> &w,
                            const Tensor<cpu, 1, DType> &label,
                            const Tensor<cpu, 2, DType> &out,
                            const Tensor<cpu, 1, DType> &x_norm,
                            const Tensor<cpu, 1, DType> &w_norm,
                            const Tensor<cpu, 1, DType> &k_table,
                            const Tensor<cpu, 1, DType> &c_table,
                            const int margin,
                            const DType beta) {
  LOG(FATAL) << "Not Implemented.";
}
// CPU placeholder for the large-margin backward pass; GPU-only operator
// (see lsoftmax.cu), aborts when invoked on CPU.
template <typename DType>
inline void LSoftmaxBackward(const Tensor<cpu, 2, DType> &x,
                             const Tensor<cpu, 2, DType> &w,
                             const Tensor<cpu, 1, DType> &label,
                             const Tensor<cpu, 1, DType> &x_norm,
                             const Tensor<cpu, 1, DType> &w_norm,
                             const Tensor<cpu, 2, DType> &o_grad,
                             const Tensor<cpu, 2, DType> &x_grad,
                             const Tensor<cpu, 2, DType> &w_grad,
                             const Tensor<cpu, 2, DType> &workspace,
                             const Tensor<cpu, 1, DType> &k_table,
                             const Tensor<cpu, 1, DType> &c_table,
                             const int margin,
                             const DType beta) {
  LOG(FATAL) << "Not Implemented.";
}
} // namespace mshadow
namespace mxnet {
namespace op {
// Instantiates the CPU operator for the requested real dtype
// (float/double/half via MSHADOW_REAL_TYPE_SWITCH).
template<>
Operator *CreateOp<cpu>(LSoftmaxParam param, int dtype) {
  Operator *op = NULL;
  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
    op = new LSoftmaxOp<cpu, DType>(param);
  })
  return op;
}
// Runs type/shape inference, then dispatches CreateOp on the input dtype.
// DO_BIND_DISPATCH (from operator_common.h) presumably expands to the
// return of the created operator for the given context.
Operator *LSoftmaxProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
                                         std::vector<int> *in_type) const {
  std::vector<TShape> out_shape, aux_shape;
  std::vector<int> out_type, aux_type;
  CHECK(InferType(in_type, &out_type, &aux_type));
  CHECK(InferShape(in_shape, &out_shape, &aux_shape));
  DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0));
}
// Register the parameter struct and expose the operator to the MXNet
// front-ends under the name "LSoftmax".
DMLC_REGISTER_PARAMETER(LSoftmaxParam);

MXNET_REGISTER_OP_PROPERTY(LSoftmax, LSoftmaxProp)
.describe("LSoftmax from <Large-Margin Softmax Loss for Convolutional Neural Networks>")
.add_argument("data", "Symbol", "data")
.add_argument("weight", "Symbol", "weight")
.add_argument("label", "Symbol", "label")
.add_arguments(LSoftmaxParam::__FIELDS__());
} // namespace op
} // namespace mxnet

322
src/operator/lsoftmax.cu Normal file
View File

@@ -0,0 +1,322 @@
/*!
* Copyright (c) 2016 by Contributors
* \file lsoftmax.cu
* \brief LSoftmax from <Large-Margin Softmax Loss for Convolutional Neural Networks>
* \author luoyetx
*/
#include "./lsoftmax-inl.h"
namespace mshadow {
namespace cuda {
namespace {
// workspace variables
enum LSoftmaxTempSpaceType {kCost, kCosmt, kK, kSin2t, kFo, kCostM};
}
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
i < (n); \
i += blockDim.x * gridDim.x)
// Returns (-1)^k: +1 for even k, -1 for odd k. Used to alternate signs in
// the cos(m*theta) binomial expansion and its derivatives.
MSHADOW_XINLINE int LSPowOfMO(const int k) {
  return (k & 0x01) ? -1 : 1;
}
// One thread per row: x_norm[i] = ||x[i]||_2 (sqrt of the sum of squares;
// no epsilon is added here).
template<typename DType>
__global__ void LSCalcNorm(const Tensor<gpu, 2, DType> x,
                           Tensor<gpu, 1, DType> x_norm) {
  const int n = x.size(0);
  const int m = x.size(1);
  CUDA_KERNEL_LOOP(i, n) {
    DType norm = 0;
    for (int j = 0; j < m; ++j) {
      norm += x[i][j] * x[i][j];
    }
    x_norm[i] = sqrt(norm);
  }
}
// Finds the interval index k such that cos_t lies in
// [k_table[k+1], k_table[k]], where k_table[i] = cos(i*pi/margin) and
// n = margin + 1 is the table length; i.e. theta is in
// [k*pi/margin, (k+1)*pi/margin]. `eps` absorbs floating-point error at the
// interval boundaries; returns 0 if no interval matches.
template<typename DType>
__device__ int LSFindK(const DType *k_table, const int n, const DType cos_t) {
  const DType eps = 1e-5;
  // Fixed: loop bound was `i < n`, which read k_table[n] (one past the end)
  // on the final iteration whenever no earlier interval matched. The valid
  // intervals are i = 0 .. n-2, covering all of [-1, 1].
  for (int i = 0; i < n - 1; ++i) {
    if (((k_table[i+1] < cos_t) || (abs(k_table[i+1] - cos_t) < eps)) &&
        ((k_table[i] > cos_t) || (abs(k_table[i] - cos_t) < eps))) {
      return i;
    }
  }
  return 0;
}
// Computes cos(margin * theta) from cos(theta) via the binomial expansion
//   cos(mt) = sum_{p=0}^{m/2} (-1)^p * C(m, 2p) * cos(t)^(m-2p) * sin2(t)^p
// where c_table[2p] = C(m, 2p) and sin2_t = 1 - cos_t^2.
template<typename DType>
__device__ DType LSCalcCosmt(const DType *c_table, const int n,
                             const DType cos_t, const int margin) {
  const DType sin2_t = 1 - cos_t * cos_t;
  DType cos_t_p = pow(cos_t, margin);  // tracks cos(t)^(m-2p)
  DType sin2_t_p = 1;                  // tracks sin2(t)^p
  DType cos_mt = cos_t_p;  // p = 0
  for (int p = 1; p <= margin / 2; ++p) {
    cos_t_p /= cos_t * cos_t;  // don't replace `cos_t*cos_t` with `1-sin2_t`, this can cause numeric issue if cos_t --> 0
    sin2_t_p *= sin2_t;
    cos_mt += LSPowOfMO(p) * c_table[2*p] * cos_t_p * sin2_t_p;
  }
  return cos_mt;
}
// One thread per sample: rewrites only the target-class logit out[i][yi].
//   f_i_yi     = |w_yi| * |x_i| * ((-1)^k * cos(m*theta) - 2k)
//   out[i][yi] = (f_i_yi + beta * fo_i_yi) / (1 + beta)
// where fo_i_yi is the original inner product already stored in `out` by the
// preceding GEMM, and beta anneals between the plain and large-margin logits.
// (Fixed: removed unused locals `feature_dim` and `m`.)
template<typename DType>
__global__ void LSoftmaxForwardKernel(const Tensor<gpu, 2, DType> x,
                                      const Tensor<gpu, 2, DType> w,
                                      const Tensor<gpu, 1, DType> label,
                                      const Tensor<gpu, 1, DType> x_norm,
                                      const Tensor<gpu, 1, DType> w_norm,
                                      Tensor<gpu, 2, DType> out,
                                      const Tensor<gpu, 1, DType> k_table,
                                      const Tensor<gpu, 1, DType> c_table,
                                      const int margin,
                                      const DType beta) {
  const int n = x.size(0);
  CUDA_KERNEL_LOOP(i, n) {
    const int yi = static_cast<int>(label[i]);
    const DType fo_i_yi = out[i][yi];
    // cos(theta) between x_i and w_yi, recovered from the GEMM output.
    const DType cos_t = fo_i_yi / (x_norm[i] * w_norm[yi]);
    // k indexes the monotonicity interval of theta (see LSFindK).
    const int k = LSFindK(k_table.dptr_, k_table.size(0), cos_t);
    const DType cos_mt = LSCalcCosmt(c_table.dptr_, c_table.size(0), cos_t, margin);
    const DType f_i_yi = (LSPowOfMO(k) * cos_mt - 2*k) * (w_norm[yi] * x_norm[i]);
    out[i][yi] = (f_i_yi + beta * fo_i_yi) / (1 + beta);
  }
}
// Host-side launcher for the forward pass: computes the row norms of x and
// w, then rewrites the target-class logits in `out` (which must already hold
// the x*w^T GEMM result).
template<typename DType>
inline void LSoftmaxForward(const Tensor<gpu, 2, DType> &x,
                            const Tensor<gpu, 2, DType> &w,
                            const Tensor<gpu, 1, DType> &label,
                            const Tensor<gpu, 2, DType> &out,
                            const Tensor<gpu, 1, DType> &x_norm,
                            const Tensor<gpu, 1, DType> &w_norm,
                            const Tensor<gpu, 1, DType> &k_table,
                            const Tensor<gpu, 1, DType> &c_table,
                            const int margin,
                            const DType beta) {
  const int n = x.size(0);
  const int m = w.size(0);
  dim3 dimBlock(kBaseThreadNum);
  // One thread per row for each norm kernel, one per sample for the rewrite.
  dim3 dimGrid((n + kBaseThreadNum - 1) / kBaseThreadNum);
  LSCalcNorm<<<dimGrid, dimBlock>>>(x, x_norm);
  dimGrid.x = ((m + kBaseThreadNum - 1) / kBaseThreadNum);
  LSCalcNorm<<<dimGrid, dimBlock>>>(w, w_norm);
  dimGrid.x = ((n + kBaseThreadNum - 1) / kBaseThreadNum);
  LSoftmaxForwardKernel<<<dimGrid, dimBlock>>>(x, w, label, x_norm, w_norm, out, k_table, c_table, margin, beta);
}
// One thread per sample: recomputes and caches in `workspace` the per-sample
// quantities both backward kernels need (rows indexed by
// LSoftmaxTempSpaceType): cos_t, cos_mt, k, sin2_t, the raw inner product
// fo_i_yi, and cos_t^(margin-1).
template<typename DType>
__global__ void LSoftmaxBackwardRequired(const Tensor<gpu, 2, DType> x,
                                         const Tensor<gpu, 2, DType> w,
                                         const Tensor<gpu, 1, DType> label,
                                         const Tensor<gpu, 1, DType> x_norm,
                                         const Tensor<gpu, 1, DType> w_norm,
                                         Tensor<gpu, 2, DType> workspace,
                                         const Tensor<gpu, 1, DType> k_table,
                                         const Tensor<gpu, 1, DType> c_table,
                                         const int margin) {
  const int n = x.size(0);
  const int feature_dim = x.size(1);
  CUDA_KERNEL_LOOP(i, n) {
    const int yi = static_cast<int>(label[i]);
    // fo_i_yi = dot(w_yi, x_i)
    DType fo_i_yi = 0;
    for (int p = 0; p < feature_dim; ++p) {
      fo_i_yi += w[yi][p] * x[i][p];
    }
    const DType cos_t = fo_i_yi / (x_norm[i] * w_norm[yi]);
    const int k = LSFindK(k_table.dptr_, k_table.size(0), cos_t);
    const DType cos_mt = LSCalcCosmt(c_table.dptr_, c_table.size(0), cos_t, margin);
    const DType sin2_t = 1 - cos_t * cos_t;
    workspace[kCost][i] = cos_t;
    workspace[kCosmt][i] = cos_mt;
    workspace[kK][i] = static_cast<DType>(k);
    workspace[kSin2t][i] = sin2_t;
    workspace[kFo][i] = fo_i_yi;
    workspace[kCostM][i] = pow(cos_t, margin - 1);  // cos(t)^(m-1)
  }
}
// One thread per element (i, l) of x_grad. The GEMM x_grad = o_grad * w has
// already run; this kernel adds the correction for the target-class column
// so that the effective derivative is (df_dx + beta * w[yi][l]) / (1 + beta),
// matching the blended forward output.
template<typename DType>
__global__ void LSoftmaxBackwardXKernel(const Tensor<gpu, 2, DType> x,
                                        const Tensor<gpu, 2, DType> w,
                                        const Tensor<gpu, 1, DType> label,
                                        const Tensor<gpu, 1, DType> x_norm,
                                        const Tensor<gpu, 1, DType> w_norm,
                                        const Tensor<gpu, 2, DType> o_grad,
                                        Tensor<gpu, 2, DType> x_grad,
                                        const Tensor<gpu, 2, DType> workspace,
                                        const Tensor<gpu, 1, DType> c_table,
                                        const int margin,
                                        const DType beta) {
  const int nthreads = x.size(0) * x.size(1);
  const int feature_dim = x.size(1);
  CUDA_KERNEL_LOOP(idx, nthreads) {
    const int i = idx / feature_dim;  // sample index
    const int l = idx % feature_dim;  // feature index
    const int yi = static_cast<int>(label[i]);
    // Cached per-sample quantities from LSoftmaxBackwardRequired.
    const DType cos_t = workspace[kCost][i];
    const DType cos_mt = workspace[kCosmt][i];
    const int k = static_cast<int>(workspace[kK][i]);
    const DType sin2_t = workspace[kSin2t][i];
    const DType fo_i_yi = workspace[kFo][i];
    const DType w_norm_yi = w_norm[yi];
    const DType x_norm_i = x_norm[i];
    // d cos(t) / d x[i][l] via the quotient rule on fo / (|w||x|).
    const DType dcos_dx = w[yi][l] / (w_norm_yi * x_norm_i) - \
      fo_i_yi * x[i][l] / (w_norm_yi * x_norm_i * x_norm_i * x_norm_i);
    const DType dsin2_dx = -2 * cos_t * dcos_dx;
    DType cos_t_p = workspace[kCostM][i];  // cos(t)^(m-2p-1), starts at p=0
    DType sin2_t_p = 1;                    // sin2(t)^(p-1); updated after use
    // d cos(mt) / d x, term by term of the binomial expansion; the p-th term
    // uses cos^(m-2p-1) and sin2^(p-1), matching the forward series.
    DType dcosm_dx = margin * cos_t_p * dcos_dx;  // p = 0
    for (int p = 1; p <= margin / 2; ++p) {
      cos_t_p /= cos_t * cos_t;
      dcosm_dx += LSPowOfMO(p) * c_table[2*p] * (p * cos_t * dsin2_dx + \
        (margin - 2*p) * sin2_t * dcos_dx) * cos_t_p * sin2_t_p;
      sin2_t_p *= sin2_t;
    }
    // d f_i_yi / d x[i][l] with f = |w||x| * ((-1)^k cos(mt) - 2k).
    const DType df_dx = (LSPowOfMO(k) * cos_mt - 2*k) * w_norm_yi / x_norm_i * x[i][l] + \
      LSPowOfMO(k) * w_norm_yi * x_norm_i * dcosm_dx;
    const DType alpha = 1 / (1 + beta);
    // Correct the GEMM gradient: alpha*(df - w) on top of w gives
    // (df + beta*w) / (1 + beta) for the yi column.
    x_grad[i][l] += alpha * o_grad[i][yi] * (df_dx - w[yi][l]);
  }
}
// One thread per element (j, l) of w_grad. Mirrors LSoftmaxBackwardXKernel
// but accumulates, over all samples whose label equals row j, the correction
// that turns the GEMM gradient into the blended derivative
// (df_dw + beta * x[i][l]) / (1 + beta).
template<typename DType>
__global__ void LSoftmaxBackwardWKernel(const Tensor<gpu, 2, DType> x,
                                        const Tensor<gpu, 2, DType> w,
                                        const Tensor<gpu, 1, DType> label,
                                        const Tensor<gpu, 1, DType> x_norm,
                                        const Tensor<gpu, 1, DType> w_norm,
                                        const Tensor<gpu, 2, DType> o_grad,
                                        Tensor<gpu, 2, DType> w_grad,
                                        const Tensor<gpu, 2, DType> workspace,
                                        const Tensor<gpu, 1, DType> c_table,
                                        const int margin,
                                        const DType beta) {
  const int nthreads = w.size(0) * w.size(1);
  const int n = x.size(0);
  const int feature_dim = w.size(1);
  CUDA_KERNEL_LOOP(idx, nthreads) {
    const int j = idx / feature_dim;  // class (weight row) index
    const int l = idx % feature_dim;  // feature index
    DType dw = 0;
    // Sum contributions from every sample labelled with class j.
    for (int i = 0; i < n; ++i) {
      const int yi = static_cast<int>(label[i]);
      if (yi == j) {
        // Cached per-sample quantities from LSoftmaxBackwardRequired.
        const DType cos_t = workspace[kCost][i];
        const DType cos_mt = workspace[kCosmt][i];
        const int k = static_cast<int>(workspace[kK][i]);
        const DType sin2_t = workspace[kSin2t][i];
        const DType fo_i_yi = workspace[kFo][i];
        const DType x_norm_i = x_norm[i];
        const DType w_norm_yi = w_norm[yi];
        // d cos(t) / d w[yi][l] (symmetric to the x case).
        const DType dcos_dw = x[i][l] / (w_norm_yi * x_norm_i) - \
          fo_i_yi * w[yi][l] / (x_norm_i * w_norm_yi * w_norm_yi * w_norm_yi);
        const DType dsin2_dw = -2 * cos_t * dcos_dw;
        DType cos_t_p = workspace[kCostM][i];  // cos(t)^(m-2p-1)
        DType sin2_t_p = 1;                    // sin2(t)^(p-1); updated after use
        DType dcosm_dw = margin * cos_t_p * dcos_dw;  // p = 0
        for (int p = 1; p <= margin / 2; ++p) {
          cos_t_p /= cos_t * cos_t;
          dcosm_dw += LSPowOfMO(p) * c_table[2*p] * (p * cos_t * dsin2_dw + \
            (margin - 2*p) * sin2_t * dcos_dw) * cos_t_p * sin2_t_p;
          sin2_t_p *= sin2_t;
        }
        // d f_i_yi / d w[j][l].
        const DType df_dw_j = (LSPowOfMO(k) * cos_mt - 2*k) * x_norm_i / w_norm_yi * w[yi][l] + \
          LSPowOfMO(k) * w_norm_yi * x_norm_i * dcosm_dw;
        dw += o_grad[i][yi] * (df_dw_j - x[i][l]);
      }
    }
    const DType alpha = 1 / (1 + beta);
    w_grad[j][l] += alpha * dw;
  }
}
// Host-side launcher for the backward pass: caches per-sample quantities,
// then applies the X and W gradient corrections. x_grad / w_grad must
// already hold the plain GEMM gradients; x_norm / w_norm come from forward.
template<typename DType>
inline void LSoftmaxBackward(const Tensor<gpu, 2, DType> &x,
                             const Tensor<gpu, 2, DType> &w,
                             const Tensor<gpu, 1, DType> &label,
                             const Tensor<gpu, 1, DType> &x_norm,
                             const Tensor<gpu, 1, DType> &w_norm,
                             const Tensor<gpu, 2, DType> &o_grad,
                             const Tensor<gpu, 2, DType> &x_grad,
                             const Tensor<gpu, 2, DType> &w_grad,
                             const Tensor<gpu, 2, DType> &workspace,
                             const Tensor<gpu, 1, DType> &k_table,
                             const Tensor<gpu, 1, DType> &c_table,
                             const int margin,
                             const DType beta) {
  const int n = x.size(0);
  const int feature_dim = x.size(1);
  const int m = w.size(0);
  dim3 dimBlock(kBaseThreadNum);
  // One thread per sample for the cache, per gradient element afterwards.
  dim3 dimGrid((n + kBaseThreadNum - 1) / kBaseThreadNum);
  LSoftmaxBackwardRequired<<<dimGrid, dimBlock>>>(x, w, label, x_norm, w_norm, workspace,
                                                  k_table, c_table, margin);
  dimGrid.x = ((n * feature_dim + kBaseThreadNum - 1) / kBaseThreadNum);
  LSoftmaxBackwardXKernel<<<dimGrid, dimBlock>>>(x, w, label, x_norm, w_norm, o_grad, x_grad, workspace,
                                                 c_table, margin, beta);
  dimGrid.x = ((m * feature_dim + kBaseThreadNum - 1) / kBaseThreadNum);
  LSoftmaxBackwardWKernel<<<dimGrid, dimBlock>>>(x, w, label, x_norm, w_norm, o_grad, w_grad, workspace,
                                                 c_table, margin, beta);
}
} // namespace cuda
// mshadow-namespace dispatch wrapper called from LSoftmaxOp<gpu, DType>
// (lsoftmax-inl.h); forwards straight to the cuda:: implementation.
template<typename DType>
inline void LSoftmaxForward(const Tensor<gpu, 2, DType> &x,
                            const Tensor<gpu, 2, DType> &w,
                            const Tensor<gpu, 1, DType> &label,
                            const Tensor<gpu, 2, DType> &out,
                            const Tensor<gpu, 1, DType> &x_norm,
                            const Tensor<gpu, 1, DType> &w_norm,
                            const Tensor<gpu, 1, DType> &k_table,
                            const Tensor<gpu, 1, DType> &c_table,
                            const int margin,
                            const DType beta) {
  cuda::LSoftmaxForward(x, w, label, out, x_norm, w_norm,
                        k_table, c_table, margin, beta);
}
// mshadow-namespace dispatch wrapper called from LSoftmaxOp<gpu, DType>
// (lsoftmax-inl.h); forwards straight to the cuda:: implementation.
template<typename DType>
inline void LSoftmaxBackward(const Tensor<gpu, 2, DType> &x,
                             const Tensor<gpu, 2, DType> &w,
                             const Tensor<gpu, 1, DType> &label,
                             const Tensor<gpu, 1, DType> &x_norm,
                             const Tensor<gpu, 1, DType> &w_norm,
                             const Tensor<gpu, 2, DType> &o_grad,
                             const Tensor<gpu, 2, DType> &x_grad,
                             const Tensor<gpu, 2, DType> &w_grad,
                             const Tensor<gpu, 2, DType> &workspace,
                             const Tensor<gpu, 1, DType> &k_table,
                             const Tensor<gpu, 1, DType> &c_table,
                             const int margin,
                             const DType beta) {
  cuda::LSoftmaxBackward(x, w, label, x_norm, w_norm, o_grad, x_grad, w_grad, workspace,
                         k_table, c_table, margin, beta);
}
} // namespace mshadow
namespace mxnet {
namespace op {
// Instantiates the GPU operator for the requested real dtype
// (float/double/half via MSHADOW_REAL_TYPE_SWITCH).
template<>
Operator *CreateOp<gpu>(LSoftmaxParam param, int dtype) {
  Operator *op = NULL;
  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
    op = new LSoftmaxOp<gpu, DType>(param);
  })
  return op;
}
} // namespace op
} // namespace mxnet

64
src/train.sh Executable file
View File

@@ -0,0 +1,64 @@
#!/usr/bin/env bash
# Training launcher scratchpad: historical experiment command lines are kept
# commented out; only the single un-commented python line below actually runs.
export MXNET_CPU_WORKER_NTHREADS=15
export MXNET_CUDNN_AUTOTUNE_DEFAULT=0
export MXNET_ENGINE_TYPE=ThreadedEnginePerDevice
#export CUDA_VISIBLE_DEVICES='4,5'
#python -u train_softmax.py --retrain --pretrained '../model/sphereface-152-0-0' --load-epoch 8 --prefix '../model/sphereface-retrain-0' --loss-type 0
# NOTE: CUDA_VISIBLE_DEVICES is exported repeatedly below; each export
# overwrites the previous one, so only the last value set before the active
# command takes effect.
export CUDA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7'
export CUDA_VISIBLE_DEVICES='0,1,2,3'
export CUDA_VISIBLE_DEVICES='4,5,6,7'
export CUDA_VISIBLE_DEVICES='4,5'
export CUDA_VISIBLE_DEVICES='0,1'
#python -u train_softmax.py --network 's60' --patch '16_0_96_112_0' --loss-type 1 > logs60_l1_v4 2>&1 &
#python -u train_softmax.py --network 's60' --patch '0_0_96_95_0' --loss-type 1 --prefix '../model/spherefacex'
#python -u train_softmax.py --network 's20' --patch '0_0_96_112_0' --loss-type 0
#python -u train_softmax.py --network 'r50' --patch '0_0_96_112_0' --loss-type 0
#python -u train_softmax.py --network 'm4' --patch '0_0_96_112_0' --loss-type 1 --prefix '../model/spherefacem' --per-batch-size 224 > celm.log 2>&1 &
#python -u train_softmax.py --network 'm29' --patch '0_0_96_112_0' --loss-type 0 --lr 0.1 --prefix '../model/softmax' --verbose 2000 --per-batch-size 128 > sx_m29.log 2>&1 &
#python -u train_softmax.py --network 'm29' --patch '0_0_96_112_0' --loss-type 1 --lr 0.1 --prefix '../model/sphere47' --verbose 2000 --per-batch-size 224 --beta-min 4.7 > sp_m29_47.log 2>&1 &
export CUDA_VISIBLE_DEVICES='2,3'
#python -u train_softmax.py --network 'm1' --patch '0_0_96_112_0' --loss-type 0 --lr 0.01 --prefix '../model/marginal0' --verbose 2000
#python -u train_softmax.py --network 's60' --patch '0_0_96_95_0' --loss-type 1
#python -u train_softmax.py --network 's20' --patch '0_0_96_95_0' --loss-type 1
#python -u train_softmax.py --network 's60' --patch '0_0_96_112_0' --loss-type 1 --prefix '../model/spherefacec' > logs60_c 2>&1 &
#python -u train_marginal.py --patch '0_0_96_112_0' --network 's36' --verbose 1000 --lr 0.01
#python -u train_coco.py --patch '0_0_96_112_0' --images-per-identity 32
#python -u train_softmax.py --network 's36' --patch '0_0_96_112_0' --loss-type 1 --prefix '../model/spherefacei36' --per-batch-size 256
#python -u train_softmax.py --network 's36' --patch '0_0_96_112_0' --loss-type 1 --prefix '../model/spherefacei36' --per-batch-size 256 > cel4.log 2>&1 &
#python -u train_softmax.py --network 'm28' --patch '0_0_96_112_0' --loss-type 11 --lr 0.1 --prefix '../model/L11' --verbose 500 --per-batch-size 128 --images-per-identity 4
#python -u train_softmax.py --network 'm27' --patch '0_0_96_112_0' --loss-type 1 --lr 0.1 --prefix '../model/sphere' --verbose 2000 --per-batch-size 224 > sp_m27.log 2>&1 &
#python -u train_softmax.py --network 'm27' --patch '0_0_96_112_0' --loss-type 0 --lr 0.1 --prefix '../model/softmax' --verbose 2000 --per-batch-size 128 > sx_m27.log 2>&1 &
export CUDA_VISIBLE_DEVICES='4,5'
#python -u train_softmax.py --network 'm29' --patch '0_0_96_112_0' --loss-type 0 --lr 0.1 --prefix '../model/softmax' --verbose 2000 --per-batch-size 128 > sx_m29.log 2>&1 &
#python -u train_softmax.py --network 'm27' --patch '0_0_96_112_0' --loss-type 1 --lr 0.1 --prefix '../model/sphere' --verbose 2000 --per-batch-size 224 > sp_m27.log 2>&1 &
#python -u train_softmax.py --network 'm28' --patch '0_0_96_112_0' --loss-type 1 --lr 0.1 --prefix '../model/sphere' --verbose 2000 --per-batch-size 224 > sp_m28.log 2>&1 &
export CUDA_VISIBLE_DEVICES='6,7'
#python -u train_softmax.py --network 'm29' --patch '0_0_96_112_0' --loss-type 1 --lr 0.1 --prefix '../model/spherem' --verbose 2000 --per-batch-size 224
#python -u train_softmax.py --network 'm28' --patch '0_0_96_112_0' --loss-type 0 --lr 0.1 --prefix '../model/softmax' --verbose 2000 --per-batch-size 128 > sx_m28.log 2>&1 &
#python -u train_marginal.py --patch '0_0_96_112_0' --network 'i4' --verbose 2000 --lr 0.01
#python -u train_softmax.py --network 'i4' --patch '0_0_96_112_0' --loss-type 1 --gamma 0.06 --beta-min 4
#python -u train_softmax.py --network 'x4' --patch '0_0_96_112_0' --loss-type 1 --gamma 0.09
#python -u train_softmax.py --network 's60' --patch '0_0_80_95_0' --loss-type 1 > logs60_l1_v3 2>&1 &
#python -u train_softmax.py --network 's60' --patch '0_0_96_95_0' --loss-type 1 > logs60_l1_v2 2>&1 &
#python -u train_softmax.py --network 's20' --patch '0_0_96_112_0'
export CUDA_VISIBLE_DEVICES='4,5,6,7'
# Active experiment: softmax (loss-type 0) on network m29, GPUs 4-7,
# backgrounded with output redirected to sx_m29.log.
python -u train_softmax.py --network 'm29' --patch '0_0_96_112_0' --loss-type 0 --lr 0.1 --prefix '../model/softmax' --verbose 2000 --per-batch-size 128 > sx_m29.log 2>&1 &
#python -u train_softmax.py --network 's60' --patch '0_0_96_95_0' --loss-type 1 --gamma 0.06 --beta-freeze 5000 --prefix '../model/spherefacei' > cel2.log 2>&1 &
export CUDA_VISIBLE_DEVICES='0,1,2,3'
#python -u train_softmax.py --network 'm29' --patch '0_0_96_112_0' --loss-type 1 --lr 0.1 --prefix '../model/spherem' --verbose 2000 --per-batch-size 224 --lr-steps '60000,80000,90000' > spm_m29.log 2>&1 &
#python -u train_softmax.py --network 's60' --patch '0_15_96_112_0' --loss-type 1 --gamma 0.06 --beta-freeze 5000 --prefix '../model/spherefacei' > cel3.log 2>&1 &
export CUDA_VISIBLE_DEVICES='2'
#python -u train_marginal.py --patch '0_0_96_112_0' --network 's36' --verbose 2000 --lr 0.01 > mar_s36.log 2>&1 &
export CUDA_VISIBLE_DEVICES='3'
#python -u train_marginal.py --patch '0_0_96_112_0' --network 'i4' --verbose 2000 --lr 0.01 > mar_i4.log 2>&1 &
#python -u train_softmax.py --network 'i4' --patch '0_0_96_112_0' --loss-type 1 --gamma 0.06 --beta-freeze 5000
#python -u train_softmax.py --network 'r50' --patch '0_0_96_112_0' --loss-type 1 --gamma 0.24 > logr50_l1 2>&1 &
#python -u train_softmax.py --network 'r50' --patch '0_0_96_112_0' --loss-type 2 --verbose 100
#python -u train_softmax.py --network 'r50' --patch '0_0_96_95_0' > logr101_pu 2>&1 &
#python -u train_softmax.py --network 'r50' --patch '0_0_96_112_0'
#python -u train_softmax.py --network 'r101' --patch '0_0_96_95_0'
#python -u train_softmax.py --loss-type 1 --num-layers 64 --patch '0_0_96_112_0'
#python -u train_softmax.py --loss-type 1 --num-layers 36 --patch '0_0_96_95_0'
#python -u train_softmax.py --loss-type 1 --num-layers 20 --patch '0_0_80_95_0'

View File

@@ -17,7 +17,6 @@ from mxnet import ndarray as nd
import argparse
import mxnet.optimizer as optimizer
sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'common'))
import resnet_dcn
import spherenet
import marginalnet
import inceptions
@@ -26,7 +25,8 @@ import lfw
import sklearn
from sklearn.decomposition import PCA
#from center_loss import *
import asoftmax
#import resnet_dcn
#import asoftmax
logger = logging.getLogger()
@@ -105,6 +105,8 @@ def parse_args():
help='')
parser.add_argument('--loss-type', type=int, default=1,
help='')
parser.add_argument('--incay', action='store_true', default=False,
help='feature incay')
parser.add_argument('--use-deformable', type=int, default=0,
help='')
parser.add_argument('--patch', type=str, default='0_0_96_112_0',
@@ -138,6 +140,7 @@ def get_symbol(args, arg_params, aux_params):
_,_,embedding,_ = resnet_dcn.get_symbol(512, args.num_layers)
gt_label = mx.symbol.Variable('softmax_label')
assert args.loss_type>=0
extra_loss = None
if args.loss_type==0:
_weight = mx.symbol.Variable('fc7_weight')
_bias = mx.symbol.Variable('fc7_bias', lr_mult=2.0, wd_mult=0.0)
@@ -206,12 +209,22 @@ def get_symbol(args, arg_params, aux_params):
softmax = mx.symbol.SoftmaxOutput(data=fc7, label = gt_label, name='softmax', normalization='valid')
else:
softmax = mx.symbol.SoftmaxOutput(data=fc7, label = gt_label, name='softmax')
if args.loss_type<=1 and args.incay:
params = [1.e-10, 0.01]
sel = mx.symbol.argmax(data = fc7, axis=1)
sel = (sel==gt_label)
norm = embedding*embedding
norm = mx.symbol.sum(norm, axis=1)
norm += params[0]
feature_incay = sel/norm
feature_incay = mx.symbol.mean(feature_incay) * params[1]
extra_loss = mx.symbol.MakeLoss(feature_incay)
#out = softmax
#l2_embedding = mx.symbol.L2Normalization(embedding)
#ce = mx.symbol.softmax_cross_entropy(fc7, gt_label, name='softmax_ce')/args.per_batch_size
#out = mx.symbol.Group([mx.symbol.BlockGrad(embedding), softmax, mx.symbol.BlockGrad(ce)])
if args.loss_type>=10 and extra_loss is not None:
if extra_loss is not None:
out = mx.symbol.Group([mx.symbol.BlockGrad(embedding), softmax, extra_loss])
else:
out = mx.symbol.Group([mx.symbol.BlockGrad(embedding), softmax])
@@ -277,7 +290,7 @@ def train_net(args):
path_imglist = "/raid5data/dplearn/faces_normed/train.lst"
args.num_classes = 82395
args.use_val = True
args.use_val = False
val_path = "/raid5data/dplearn/faces_normed/val.lst"
path_imgrec = "/opt/jiaguo/faces_normed/train.rec"
val_rec = "/opt/jiaguo/faces_normed/val.rec"
@@ -291,7 +304,7 @@ def train_net(args):
data_shape = (args.image_channel,112,96)
mean = [127.5,127.5,127.5]
if args.use_val and args.loss_type<=1:
if args.use_val:
val_dataiter = FaceImageIter2(
batch_size = args.batch_size,
data_shape = data_shape,
@@ -349,7 +362,7 @@ def train_net(args):
)
if args.loss_type<=1:
if args.loss_type<=9:
train_dataiter = FaceImageIter2(
batch_size = args.batch_size,
data_shape = data_shape,
@@ -570,7 +583,8 @@ def train_net(args):
global_step = [0]
save_step = [0]
if len(args.lr_steps)==0:
lr_steps = [40000, 70000, 90000]
#lr_steps = [40000, 70000, 90000]
lr_steps = [30000, 50000, 70000, 90000]
if args.loss_type==1:
lr_steps = [70000, 100000]
else:
@@ -595,16 +609,21 @@ def train_net(args):
acc, embeddings_list = lfw_test(mbatch)
save_step[0]+=1
msave = save_step[0]
do_save = False
if acc>=highest_acc[0]:
highest_acc[0] = acc
if acc>=0.992:
print('saving', msave)
arg, aux = model.get_params()
mx.model.save_checkpoint(prefix, msave, model.symbol, arg, aux)
lfw_npy = "%s-lfw-%04d" % (prefix, msave)
X = np.concatenate(embeddings_list, axis=0)
print(X.shape)
np.save(lfw_npy, X)
if acc>=0.995:
do_save = True
if mbatch>lr_steps[-1] and msave%5==0:
do_save = True
if do_save:
print('saving', msave)
arg, aux = model.get_params()
mx.model.save_checkpoint(prefix, msave, model.symbol, arg, aux)
#lfw_npy = "%s-lfw-%04d" % (prefix, msave)
#X = np.concatenate(embeddings_list, axis=0)
#print(X.shape)
#np.save(lfw_npy, X)
print('[%d]Accuracy-Highest: %1.5f'%(mbatch, highest_acc[0]))
if mbatch<=args.beta_freeze:
_beta = args.beta