mirror of
https://github.com/deepinsight/insightface.git
synced 2026-05-15 12:52:47 +00:00
323 lines
14 KiB
Plaintext
323 lines
14 KiB
Plaintext
/*!
|
|
* Copyright (c) 2016 by Contributors
|
|
* \file lsoftmax.cu
|
|
* \brief LSoftmax from <Large-Margin Softmax Loss for Convolutional Neural Networks>
|
|
* \author luoyetx
|
|
*/
|
|
#include "./lsoftmax-inl.h"
|
|
|
|
namespace mshadow {
|
|
namespace cuda {
|
|
|
|
namespace {
|
|
// workspace variables
|
|
enum LSoftmaxTempSpaceType {kCost, kCosmt, kK, kSin2t, kFo, kCostM};
|
|
}
|
|
|
|
#define CUDA_KERNEL_LOOP(i, n) \
|
|
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
|
|
i < (n); \
|
|
i += blockDim.x * gridDim.x)
|
|
|
|
MSHADOW_XINLINE int LSPowOfMO(const int k) {
|
|
return 1 - ((k&0x01) << 1);
|
|
}
|
|
|
|
template<typename DType>
|
|
__global__ void LSCalcNorm(const Tensor<gpu, 2, DType> x,
|
|
Tensor<gpu, 1, DType> x_norm) {
|
|
const int n = x.size(0);
|
|
const int m = x.size(1);
|
|
CUDA_KERNEL_LOOP(i, n) {
|
|
DType norm = 0;
|
|
for (int j = 0; j < m; ++j) {
|
|
norm += x[i][j] * x[i][j];
|
|
}
|
|
x_norm[i] = sqrt(norm);
|
|
}
|
|
}
|
|
|
|
template<typename DType>
|
|
__device__ int LSFindK(const DType *k_table, const int n, const DType cos_t) {
|
|
const DType eps = 1e-5;
|
|
for (int i = 0; i < n; ++i) {
|
|
if (((k_table[i+1] < cos_t) || (abs(k_table[i+1] - cos_t) < eps)) &&
|
|
((k_table[i] > cos_t) || (abs(k_table[i] - cos_t) < eps))) {
|
|
return i;
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
template<typename DType>
|
|
__device__ DType LSCalcCosmt(const DType *c_table, const int n,
|
|
const DType cos_t, const int margin) {
|
|
const DType sin2_t = 1 - cos_t * cos_t;
|
|
DType cos_t_p = pow(cos_t, margin);
|
|
DType sin2_t_p = 1;
|
|
DType cos_mt = cos_t_p; // p = 0
|
|
for (int p = 1; p <= margin / 2; ++p) {
|
|
cos_t_p /= cos_t * cos_t; // don't replace `cos_t*cos_t` with `1-sin2_t`, this can cause numeric issue if cos_t --> 0
|
|
sin2_t_p *= sin2_t;
|
|
cos_mt += LSPowOfMO(p) * c_table[2*p] * cos_t_p * sin2_t_p;
|
|
}
|
|
return cos_mt;
|
|
}
|
|
|
|
template<typename DType>
|
|
__global__ void LSoftmaxForwardKernel(const Tensor<gpu, 2, DType> x,
|
|
const Tensor<gpu, 2, DType> w,
|
|
const Tensor<gpu, 1, DType> label,
|
|
const Tensor<gpu, 1, DType> x_norm,
|
|
const Tensor<gpu, 1, DType> w_norm,
|
|
Tensor<gpu, 2, DType> out,
|
|
const Tensor<gpu, 1, DType> k_table,
|
|
const Tensor<gpu, 1, DType> c_table,
|
|
const int margin,
|
|
const DType beta) {
|
|
const int n = x.size(0);
|
|
const int feature_dim = x.size(1);
|
|
const int m = w.size(0);
|
|
CUDA_KERNEL_LOOP(i, n) {
|
|
const int yi = static_cast<int>(label[i]);
|
|
const DType fo_i_yi = out[i][yi];
|
|
const DType cos_t = fo_i_yi / (x_norm[i] * w_norm[yi]);
|
|
const int k = LSFindK(k_table.dptr_, k_table.size(0), cos_t);
|
|
const DType cos_mt = LSCalcCosmt(c_table.dptr_, c_table.size(0), cos_t, margin);
|
|
const DType f_i_yi = (LSPowOfMO(k) * cos_mt - 2*k) * (w_norm[yi] * x_norm[i]);
|
|
out[i][yi] = (f_i_yi + beta * fo_i_yi) / (1 + beta);
|
|
}
|
|
}
|
|
|
|
template<typename DType>
|
|
inline void LSoftmaxForward(const Tensor<gpu, 2, DType> &x,
|
|
const Tensor<gpu, 2, DType> &w,
|
|
const Tensor<gpu, 1, DType> &label,
|
|
const Tensor<gpu, 2, DType> &out,
|
|
const Tensor<gpu, 1, DType> &x_norm,
|
|
const Tensor<gpu, 1, DType> &w_norm,
|
|
const Tensor<gpu, 1, DType> &k_table,
|
|
const Tensor<gpu, 1, DType> &c_table,
|
|
const int margin,
|
|
const DType beta) {
|
|
const int n = x.size(0);
|
|
const int m = w.size(0);
|
|
dim3 dimBlock(kBaseThreadNum);
|
|
dim3 dimGrid((n + kBaseThreadNum - 1) / kBaseThreadNum);
|
|
LSCalcNorm<<<dimGrid, dimBlock>>>(x, x_norm);
|
|
dimGrid.x = ((m + kBaseThreadNum - 1) / kBaseThreadNum);
|
|
LSCalcNorm<<<dimGrid, dimBlock>>>(w, w_norm);
|
|
dimGrid.x = ((n + kBaseThreadNum - 1) / kBaseThreadNum);
|
|
LSoftmaxForwardKernel<<<dimGrid, dimBlock>>>(x, w, label, x_norm, w_norm, out, k_table, c_table, margin, beta);
|
|
}
|
|
|
|
template<typename DType>
|
|
__global__ void LSoftmaxBackwardRequired(const Tensor<gpu, 2, DType> x,
|
|
const Tensor<gpu, 2, DType> w,
|
|
const Tensor<gpu, 1, DType> label,
|
|
const Tensor<gpu, 1, DType> x_norm,
|
|
const Tensor<gpu, 1, DType> w_norm,
|
|
Tensor<gpu, 2, DType> workspace,
|
|
const Tensor<gpu, 1, DType> k_table,
|
|
const Tensor<gpu, 1, DType> c_table,
|
|
const int margin) {
|
|
const int n = x.size(0);
|
|
const int feature_dim = x.size(1);
|
|
CUDA_KERNEL_LOOP(i, n) {
|
|
const int yi = static_cast<int>(label[i]);
|
|
// fo_i_yi = dot(w_yi, x_i)
|
|
DType fo_i_yi = 0;
|
|
for (int p = 0; p < feature_dim; ++p) {
|
|
fo_i_yi += w[yi][p] * x[i][p];
|
|
}
|
|
const DType cos_t = fo_i_yi / (x_norm[i] * w_norm[yi]);
|
|
const int k = LSFindK(k_table.dptr_, k_table.size(0), cos_t);
|
|
const DType cos_mt = LSCalcCosmt(c_table.dptr_, c_table.size(0), cos_t, margin);
|
|
const DType sin2_t = 1 - cos_t * cos_t;
|
|
workspace[kCost][i] = cos_t;
|
|
workspace[kCosmt][i] = cos_mt;
|
|
workspace[kK][i] = static_cast<DType>(k);
|
|
workspace[kSin2t][i] = sin2_t;
|
|
workspace[kFo][i] = fo_i_yi;
|
|
workspace[kCostM][i] = pow(cos_t, margin - 1);
|
|
}
|
|
}
|
|
|
|
template<typename DType>
|
|
__global__ void LSoftmaxBackwardXKernel(const Tensor<gpu, 2, DType> x,
|
|
const Tensor<gpu, 2, DType> w,
|
|
const Tensor<gpu, 1, DType> label,
|
|
const Tensor<gpu, 1, DType> x_norm,
|
|
const Tensor<gpu, 1, DType> w_norm,
|
|
const Tensor<gpu, 2, DType> o_grad,
|
|
Tensor<gpu, 2, DType> x_grad,
|
|
const Tensor<gpu, 2, DType> workspace,
|
|
const Tensor<gpu, 1, DType> c_table,
|
|
const int margin,
|
|
const DType beta) {
|
|
const int nthreads = x.size(0) * x.size(1);
|
|
const int feature_dim = x.size(1);
|
|
CUDA_KERNEL_LOOP(idx, nthreads) {
|
|
const int i = idx / feature_dim;
|
|
const int l = idx % feature_dim;
|
|
const int yi = static_cast<int>(label[i]);
|
|
const DType cos_t = workspace[kCost][i];
|
|
const DType cos_mt = workspace[kCosmt][i];
|
|
const int k = static_cast<int>(workspace[kK][i]);
|
|
const DType sin2_t = workspace[kSin2t][i];
|
|
const DType fo_i_yi = workspace[kFo][i];
|
|
const DType w_norm_yi = w_norm[yi];
|
|
const DType x_norm_i = x_norm[i];
|
|
|
|
const DType dcos_dx = w[yi][l] / (w_norm_yi * x_norm_i) - \
|
|
fo_i_yi * x[i][l] / (w_norm_yi * x_norm_i * x_norm_i * x_norm_i);
|
|
const DType dsin2_dx = -2 * cos_t * dcos_dx;
|
|
DType cos_t_p = workspace[kCostM][i];
|
|
DType sin2_t_p = 1;
|
|
DType dcosm_dx = margin * cos_t_p * dcos_dx; // p = 0
|
|
for (int p = 1; p <= margin / 2; ++p) {
|
|
cos_t_p /= cos_t * cos_t;
|
|
dcosm_dx += LSPowOfMO(p) * c_table[2*p] * (p * cos_t * dsin2_dx + \
|
|
(margin - 2*p) * sin2_t * dcos_dx) * cos_t_p * sin2_t_p;
|
|
sin2_t_p *= sin2_t;
|
|
}
|
|
const DType df_dx = (LSPowOfMO(k) * cos_mt - 2*k) * w_norm_yi / x_norm_i * x[i][l] + \
|
|
LSPowOfMO(k) * w_norm_yi * x_norm_i * dcosm_dx;
|
|
const DType alpha = 1 / (1 + beta);
|
|
x_grad[i][l] += alpha * o_grad[i][yi] * (df_dx - w[yi][l]);
|
|
}
|
|
}
|
|
|
|
template<typename DType>
|
|
__global__ void LSoftmaxBackwardWKernel(const Tensor<gpu, 2, DType> x,
|
|
const Tensor<gpu, 2, DType> w,
|
|
const Tensor<gpu, 1, DType> label,
|
|
const Tensor<gpu, 1, DType> x_norm,
|
|
const Tensor<gpu, 1, DType> w_norm,
|
|
const Tensor<gpu, 2, DType> o_grad,
|
|
Tensor<gpu, 2, DType> w_grad,
|
|
const Tensor<gpu, 2, DType> workspace,
|
|
const Tensor<gpu, 1, DType> c_table,
|
|
const int margin,
|
|
const DType beta) {
|
|
const int nthreads = w.size(0) * w.size(1);
|
|
const int n = x.size(0);
|
|
const int feature_dim = w.size(1);
|
|
CUDA_KERNEL_LOOP(idx, nthreads) {
|
|
const int j = idx / feature_dim;
|
|
const int l = idx % feature_dim;
|
|
DType dw = 0;
|
|
for (int i = 0; i < n; ++i) {
|
|
const int yi = static_cast<int>(label[i]);
|
|
if (yi == j) {
|
|
const DType cos_t = workspace[kCost][i];
|
|
const DType cos_mt = workspace[kCosmt][i];
|
|
const int k = static_cast<int>(workspace[kK][i]);
|
|
const DType sin2_t = workspace[kSin2t][i];
|
|
const DType fo_i_yi = workspace[kFo][i];
|
|
const DType x_norm_i = x_norm[i];
|
|
const DType w_norm_yi = w_norm[yi];
|
|
|
|
const DType dcos_dw = x[i][l] / (w_norm_yi * x_norm_i) - \
|
|
fo_i_yi * w[yi][l] / (x_norm_i * w_norm_yi * w_norm_yi * w_norm_yi);
|
|
const DType dsin2_dw = -2 * cos_t * dcos_dw;
|
|
DType cos_t_p = workspace[kCostM][i];
|
|
DType sin2_t_p = 1;
|
|
DType dcosm_dw = margin * cos_t_p * dcos_dw; // p = 0
|
|
for (int p = 1; p <= margin / 2; ++p) {
|
|
cos_t_p /= cos_t * cos_t;
|
|
dcosm_dw += LSPowOfMO(p) * c_table[2*p] * (p * cos_t * dsin2_dw + \
|
|
(margin - 2*p) * sin2_t * dcos_dw) * cos_t_p * sin2_t_p;
|
|
sin2_t_p *= sin2_t;
|
|
}
|
|
const DType df_dw_j = (LSPowOfMO(k) * cos_mt - 2*k) * x_norm_i / w_norm_yi * w[yi][l] + \
|
|
LSPowOfMO(k) * w_norm_yi * x_norm_i * dcosm_dw;
|
|
dw += o_grad[i][yi] * (df_dw_j - x[i][l]);
|
|
}
|
|
}
|
|
const DType alpha = 1 / (1 + beta);
|
|
w_grad[j][l] += alpha * dw;
|
|
}
|
|
}
|
|
|
|
template<typename DType>
|
|
inline void LSoftmaxBackward(const Tensor<gpu, 2, DType> &x,
|
|
const Tensor<gpu, 2, DType> &w,
|
|
const Tensor<gpu, 1, DType> &label,
|
|
const Tensor<gpu, 1, DType> &x_norm,
|
|
const Tensor<gpu, 1, DType> &w_norm,
|
|
const Tensor<gpu, 2, DType> &o_grad,
|
|
const Tensor<gpu, 2, DType> &x_grad,
|
|
const Tensor<gpu, 2, DType> &w_grad,
|
|
const Tensor<gpu, 2, DType> &workspace,
|
|
const Tensor<gpu, 1, DType> &k_table,
|
|
const Tensor<gpu, 1, DType> &c_table,
|
|
const int margin,
|
|
const DType beta) {
|
|
const int n = x.size(0);
|
|
const int feature_dim = x.size(1);
|
|
const int m = w.size(0);
|
|
dim3 dimBlock(kBaseThreadNum);
|
|
dim3 dimGrid((n + kBaseThreadNum - 1) / kBaseThreadNum);
|
|
LSoftmaxBackwardRequired<<<dimGrid, dimBlock>>>(x, w, label, x_norm, w_norm, workspace,
|
|
k_table, c_table, margin);
|
|
dimGrid.x = ((n * feature_dim + kBaseThreadNum - 1) / kBaseThreadNum);
|
|
LSoftmaxBackwardXKernel<<<dimGrid, dimBlock>>>(x, w, label, x_norm, w_norm, o_grad, x_grad, workspace,
|
|
c_table, margin, beta);
|
|
dimGrid.x = ((m * feature_dim + kBaseThreadNum - 1) / kBaseThreadNum);
|
|
LSoftmaxBackwardWKernel<<<dimGrid, dimBlock>>>(x, w, label, x_norm, w_norm, o_grad, w_grad, workspace,
|
|
c_table, margin, beta);
|
|
}
|
|
|
|
} // namespace cuda
|
|
|
|
template<typename DType>
|
|
inline void LSoftmaxForward(const Tensor<gpu, 2, DType> &x,
|
|
const Tensor<gpu, 2, DType> &w,
|
|
const Tensor<gpu, 1, DType> &label,
|
|
const Tensor<gpu, 2, DType> &out,
|
|
const Tensor<gpu, 1, DType> &x_norm,
|
|
const Tensor<gpu, 1, DType> &w_norm,
|
|
const Tensor<gpu, 1, DType> &k_table,
|
|
const Tensor<gpu, 1, DType> &c_table,
|
|
const int margin,
|
|
const DType beta) {
|
|
cuda::LSoftmaxForward(x, w, label, out, x_norm, w_norm,
|
|
k_table, c_table, margin, beta);
|
|
}
|
|
|
|
template<typename DType>
|
|
inline void LSoftmaxBackward(const Tensor<gpu, 2, DType> &x,
|
|
const Tensor<gpu, 2, DType> &w,
|
|
const Tensor<gpu, 1, DType> &label,
|
|
const Tensor<gpu, 1, DType> &x_norm,
|
|
const Tensor<gpu, 1, DType> &w_norm,
|
|
const Tensor<gpu, 2, DType> &o_grad,
|
|
const Tensor<gpu, 2, DType> &x_grad,
|
|
const Tensor<gpu, 2, DType> &w_grad,
|
|
const Tensor<gpu, 2, DType> &workspace,
|
|
const Tensor<gpu, 1, DType> &k_table,
|
|
const Tensor<gpu, 1, DType> &c_table,
|
|
const int margin,
|
|
const DType beta) {
|
|
cuda::LSoftmaxBackward(x, w, label, x_norm, w_norm, o_grad, x_grad, w_grad, workspace,
|
|
k_table, c_table, margin, beta);
|
|
}
|
|
|
|
} // namespace mshadow
|
|
|
|
namespace mxnet {
|
|
namespace op {
|
|
|
|
template<>
|
|
Operator *CreateOp<gpu>(LSoftmaxParam param, int dtype) {
|
|
Operator *op = NULL;
|
|
MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
|
|
op = new LSoftmaxOp<gpu, DType>(param);
|
|
})
|
|
return op;
|
|
}
|
|
|
|
} // namespace op
|
|
} // namespace mxnet
|