Files
insightface/3rdparty/operator/lsoftmax.cu
2017-11-29 17:12:26 +08:00

323 lines
14 KiB
Plaintext

/*!
* Copyright (c) 2016 by Contributors
* \file lsoftmax.cu
* \brief LSoftmax from <Large-Margin Softmax Loss for Convolutional Neural Networks>
* \author luoyetx
*/
#include "./lsoftmax-inl.h"
namespace mshadow {
namespace cuda {
namespace {
// workspace variables
enum LSoftmaxTempSpaceType {kCost, kCosmt, kK, kSin2t, kFo, kCostM};
}
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
i < (n); \
i += blockDim.x * gridDim.x)
MSHADOW_XINLINE int LSPowOfMO(const int k) {
return 1 - ((k&0x01) << 1);
}
template<typename DType>
__global__ void LSCalcNorm(const Tensor<gpu, 2, DType> x,
Tensor<gpu, 1, DType> x_norm) {
const int n = x.size(0);
const int m = x.size(1);
CUDA_KERNEL_LOOP(i, n) {
DType norm = 0;
for (int j = 0; j < m; ++j) {
norm += x[i][j] * x[i][j];
}
x_norm[i] = sqrt(norm);
}
}
template<typename DType>
__device__ int LSFindK(const DType *k_table, const int n, const DType cos_t) {
const DType eps = 1e-5;
for (int i = 0; i < n; ++i) {
if (((k_table[i+1] < cos_t) || (abs(k_table[i+1] - cos_t) < eps)) &&
((k_table[i] > cos_t) || (abs(k_table[i] - cos_t) < eps))) {
return i;
}
}
return 0;
}
template<typename DType>
__device__ DType LSCalcCosmt(const DType *c_table, const int n,
const DType cos_t, const int margin) {
const DType sin2_t = 1 - cos_t * cos_t;
DType cos_t_p = pow(cos_t, margin);
DType sin2_t_p = 1;
DType cos_mt = cos_t_p; // p = 0
for (int p = 1; p <= margin / 2; ++p) {
cos_t_p /= cos_t * cos_t; // don't replace `cos_t*cos_t` with `1-sin2_t`, this can cause numeric issue if cos_t --> 0
sin2_t_p *= sin2_t;
cos_mt += LSPowOfMO(p) * c_table[2*p] * cos_t_p * sin2_t_p;
}
return cos_mt;
}
template<typename DType>
__global__ void LSoftmaxForwardKernel(const Tensor<gpu, 2, DType> x,
const Tensor<gpu, 2, DType> w,
const Tensor<gpu, 1, DType> label,
const Tensor<gpu, 1, DType> x_norm,
const Tensor<gpu, 1, DType> w_norm,
Tensor<gpu, 2, DType> out,
const Tensor<gpu, 1, DType> k_table,
const Tensor<gpu, 1, DType> c_table,
const int margin,
const DType beta) {
const int n = x.size(0);
const int feature_dim = x.size(1);
const int m = w.size(0);
CUDA_KERNEL_LOOP(i, n) {
const int yi = static_cast<int>(label[i]);
const DType fo_i_yi = out[i][yi];
const DType cos_t = fo_i_yi / (x_norm[i] * w_norm[yi]);
const int k = LSFindK(k_table.dptr_, k_table.size(0), cos_t);
const DType cos_mt = LSCalcCosmt(c_table.dptr_, c_table.size(0), cos_t, margin);
const DType f_i_yi = (LSPowOfMO(k) * cos_mt - 2*k) * (w_norm[yi] * x_norm[i]);
out[i][yi] = (f_i_yi + beta * fo_i_yi) / (1 + beta);
}
}
template<typename DType>
inline void LSoftmaxForward(const Tensor<gpu, 2, DType> &x,
const Tensor<gpu, 2, DType> &w,
const Tensor<gpu, 1, DType> &label,
const Tensor<gpu, 2, DType> &out,
const Tensor<gpu, 1, DType> &x_norm,
const Tensor<gpu, 1, DType> &w_norm,
const Tensor<gpu, 1, DType> &k_table,
const Tensor<gpu, 1, DType> &c_table,
const int margin,
const DType beta) {
const int n = x.size(0);
const int m = w.size(0);
dim3 dimBlock(kBaseThreadNum);
dim3 dimGrid((n + kBaseThreadNum - 1) / kBaseThreadNum);
LSCalcNorm<<<dimGrid, dimBlock>>>(x, x_norm);
dimGrid.x = ((m + kBaseThreadNum - 1) / kBaseThreadNum);
LSCalcNorm<<<dimGrid, dimBlock>>>(w, w_norm);
dimGrid.x = ((n + kBaseThreadNum - 1) / kBaseThreadNum);
LSoftmaxForwardKernel<<<dimGrid, dimBlock>>>(x, w, label, x_norm, w_norm, out, k_table, c_table, margin, beta);
}
template<typename DType>
__global__ void LSoftmaxBackwardRequired(const Tensor<gpu, 2, DType> x,
const Tensor<gpu, 2, DType> w,
const Tensor<gpu, 1, DType> label,
const Tensor<gpu, 1, DType> x_norm,
const Tensor<gpu, 1, DType> w_norm,
Tensor<gpu, 2, DType> workspace,
const Tensor<gpu, 1, DType> k_table,
const Tensor<gpu, 1, DType> c_table,
const int margin) {
const int n = x.size(0);
const int feature_dim = x.size(1);
CUDA_KERNEL_LOOP(i, n) {
const int yi = static_cast<int>(label[i]);
// fo_i_yi = dot(w_yi, x_i)
DType fo_i_yi = 0;
for (int p = 0; p < feature_dim; ++p) {
fo_i_yi += w[yi][p] * x[i][p];
}
const DType cos_t = fo_i_yi / (x_norm[i] * w_norm[yi]);
const int k = LSFindK(k_table.dptr_, k_table.size(0), cos_t);
const DType cos_mt = LSCalcCosmt(c_table.dptr_, c_table.size(0), cos_t, margin);
const DType sin2_t = 1 - cos_t * cos_t;
workspace[kCost][i] = cos_t;
workspace[kCosmt][i] = cos_mt;
workspace[kK][i] = static_cast<DType>(k);
workspace[kSin2t][i] = sin2_t;
workspace[kFo][i] = fo_i_yi;
workspace[kCostM][i] = pow(cos_t, margin - 1);
}
}
template<typename DType>
__global__ void LSoftmaxBackwardXKernel(const Tensor<gpu, 2, DType> x,
const Tensor<gpu, 2, DType> w,
const Tensor<gpu, 1, DType> label,
const Tensor<gpu, 1, DType> x_norm,
const Tensor<gpu, 1, DType> w_norm,
const Tensor<gpu, 2, DType> o_grad,
Tensor<gpu, 2, DType> x_grad,
const Tensor<gpu, 2, DType> workspace,
const Tensor<gpu, 1, DType> c_table,
const int margin,
const DType beta) {
const int nthreads = x.size(0) * x.size(1);
const int feature_dim = x.size(1);
CUDA_KERNEL_LOOP(idx, nthreads) {
const int i = idx / feature_dim;
const int l = idx % feature_dim;
const int yi = static_cast<int>(label[i]);
const DType cos_t = workspace[kCost][i];
const DType cos_mt = workspace[kCosmt][i];
const int k = static_cast<int>(workspace[kK][i]);
const DType sin2_t = workspace[kSin2t][i];
const DType fo_i_yi = workspace[kFo][i];
const DType w_norm_yi = w_norm[yi];
const DType x_norm_i = x_norm[i];
const DType dcos_dx = w[yi][l] / (w_norm_yi * x_norm_i) - \
fo_i_yi * x[i][l] / (w_norm_yi * x_norm_i * x_norm_i * x_norm_i);
const DType dsin2_dx = -2 * cos_t * dcos_dx;
DType cos_t_p = workspace[kCostM][i];
DType sin2_t_p = 1;
DType dcosm_dx = margin * cos_t_p * dcos_dx; // p = 0
for (int p = 1; p <= margin / 2; ++p) {
cos_t_p /= cos_t * cos_t;
dcosm_dx += LSPowOfMO(p) * c_table[2*p] * (p * cos_t * dsin2_dx + \
(margin - 2*p) * sin2_t * dcos_dx) * cos_t_p * sin2_t_p;
sin2_t_p *= sin2_t;
}
const DType df_dx = (LSPowOfMO(k) * cos_mt - 2*k) * w_norm_yi / x_norm_i * x[i][l] + \
LSPowOfMO(k) * w_norm_yi * x_norm_i * dcosm_dx;
const DType alpha = 1 / (1 + beta);
x_grad[i][l] += alpha * o_grad[i][yi] * (df_dx - w[yi][l]);
}
}
template<typename DType>
__global__ void LSoftmaxBackwardWKernel(const Tensor<gpu, 2, DType> x,
const Tensor<gpu, 2, DType> w,
const Tensor<gpu, 1, DType> label,
const Tensor<gpu, 1, DType> x_norm,
const Tensor<gpu, 1, DType> w_norm,
const Tensor<gpu, 2, DType> o_grad,
Tensor<gpu, 2, DType> w_grad,
const Tensor<gpu, 2, DType> workspace,
const Tensor<gpu, 1, DType> c_table,
const int margin,
const DType beta) {
const int nthreads = w.size(0) * w.size(1);
const int n = x.size(0);
const int feature_dim = w.size(1);
CUDA_KERNEL_LOOP(idx, nthreads) {
const int j = idx / feature_dim;
const int l = idx % feature_dim;
DType dw = 0;
for (int i = 0; i < n; ++i) {
const int yi = static_cast<int>(label[i]);
if (yi == j) {
const DType cos_t = workspace[kCost][i];
const DType cos_mt = workspace[kCosmt][i];
const int k = static_cast<int>(workspace[kK][i]);
const DType sin2_t = workspace[kSin2t][i];
const DType fo_i_yi = workspace[kFo][i];
const DType x_norm_i = x_norm[i];
const DType w_norm_yi = w_norm[yi];
const DType dcos_dw = x[i][l] / (w_norm_yi * x_norm_i) - \
fo_i_yi * w[yi][l] / (x_norm_i * w_norm_yi * w_norm_yi * w_norm_yi);
const DType dsin2_dw = -2 * cos_t * dcos_dw;
DType cos_t_p = workspace[kCostM][i];
DType sin2_t_p = 1;
DType dcosm_dw = margin * cos_t_p * dcos_dw; // p = 0
for (int p = 1; p <= margin / 2; ++p) {
cos_t_p /= cos_t * cos_t;
dcosm_dw += LSPowOfMO(p) * c_table[2*p] * (p * cos_t * dsin2_dw + \
(margin - 2*p) * sin2_t * dcos_dw) * cos_t_p * sin2_t_p;
sin2_t_p *= sin2_t;
}
const DType df_dw_j = (LSPowOfMO(k) * cos_mt - 2*k) * x_norm_i / w_norm_yi * w[yi][l] + \
LSPowOfMO(k) * w_norm_yi * x_norm_i * dcosm_dw;
dw += o_grad[i][yi] * (df_dw_j - x[i][l]);
}
}
const DType alpha = 1 / (1 + beta);
w_grad[j][l] += alpha * dw;
}
}
template<typename DType>
inline void LSoftmaxBackward(const Tensor<gpu, 2, DType> &x,
const Tensor<gpu, 2, DType> &w,
const Tensor<gpu, 1, DType> &label,
const Tensor<gpu, 1, DType> &x_norm,
const Tensor<gpu, 1, DType> &w_norm,
const Tensor<gpu, 2, DType> &o_grad,
const Tensor<gpu, 2, DType> &x_grad,
const Tensor<gpu, 2, DType> &w_grad,
const Tensor<gpu, 2, DType> &workspace,
const Tensor<gpu, 1, DType> &k_table,
const Tensor<gpu, 1, DType> &c_table,
const int margin,
const DType beta) {
const int n = x.size(0);
const int feature_dim = x.size(1);
const int m = w.size(0);
dim3 dimBlock(kBaseThreadNum);
dim3 dimGrid((n + kBaseThreadNum - 1) / kBaseThreadNum);
LSoftmaxBackwardRequired<<<dimGrid, dimBlock>>>(x, w, label, x_norm, w_norm, workspace,
k_table, c_table, margin);
dimGrid.x = ((n * feature_dim + kBaseThreadNum - 1) / kBaseThreadNum);
LSoftmaxBackwardXKernel<<<dimGrid, dimBlock>>>(x, w, label, x_norm, w_norm, o_grad, x_grad, workspace,
c_table, margin, beta);
dimGrid.x = ((m * feature_dim + kBaseThreadNum - 1) / kBaseThreadNum);
LSoftmaxBackwardWKernel<<<dimGrid, dimBlock>>>(x, w, label, x_norm, w_norm, o_grad, w_grad, workspace,
c_table, margin, beta);
}
} // namespace cuda
template<typename DType>
inline void LSoftmaxForward(const Tensor<gpu, 2, DType> &x,
const Tensor<gpu, 2, DType> &w,
const Tensor<gpu, 1, DType> &label,
const Tensor<gpu, 2, DType> &out,
const Tensor<gpu, 1, DType> &x_norm,
const Tensor<gpu, 1, DType> &w_norm,
const Tensor<gpu, 1, DType> &k_table,
const Tensor<gpu, 1, DType> &c_table,
const int margin,
const DType beta) {
cuda::LSoftmaxForward(x, w, label, out, x_norm, w_norm,
k_table, c_table, margin, beta);
}
template<typename DType>
inline void LSoftmaxBackward(const Tensor<gpu, 2, DType> &x,
const Tensor<gpu, 2, DType> &w,
const Tensor<gpu, 1, DType> &label,
const Tensor<gpu, 1, DType> &x_norm,
const Tensor<gpu, 1, DType> &w_norm,
const Tensor<gpu, 2, DType> &o_grad,
const Tensor<gpu, 2, DType> &x_grad,
const Tensor<gpu, 2, DType> &w_grad,
const Tensor<gpu, 2, DType> &workspace,
const Tensor<gpu, 1, DType> &k_table,
const Tensor<gpu, 1, DType> &c_table,
const int margin,
const DType beta) {
cuda::LSoftmaxBackward(x, w, label, x_norm, w_norm, o_grad, x_grad, w_grad, workspace,
k_table, c_table, margin, beta);
}
} // namespace mshadow
namespace mxnet {
namespace op {
template<>
Operator *CreateOp<gpu>(LSoftmaxParam param, int dtype) {
Operator *op = NULL;
MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
op = new LSoftmaxOp<gpu, DType>(param);
})
return op;
}
} // namespace op
} // namespace mxnet