diff --git a/lib/local/LandmarkDetector/include/FaceDetectorMTCNN.h b/lib/local/LandmarkDetector/include/FaceDetectorMTCNN.h index 04c7e1eb..8ec1c20b 100644 --- a/lib/local/LandmarkDetector/include/FaceDetectorMTCNN.h +++ b/lib/local/LandmarkDetector/include/FaceDetectorMTCNN.h @@ -81,8 +81,8 @@ namespace LandmarkDetector // Copy constructor CNN(const CNN& other); - // Given an image, orientation and detected landmarks output the result of the appropriate regressor - std::vector > Inference(const cv::Mat& input_img); + // Given an image apply a CNN on it, the boolean direct controls if direct convolution is used (through matrix multiplication) or an FFT optimization + std::vector > Inference(const cv::Mat& input_img, bool direct = true); // Reading in the model void Read(string location); @@ -99,6 +99,7 @@ namespace LandmarkDetector // layer -> input maps -> kernels // Bit ugly with so much nesting, but oh well vector > > > cnn_convolutional_layers; + vector > cnn_convolutional_layers_weights; vector > > > cnn_convolutional_layers_rearr; vector > cnn_convolutional_layers_bias; vector > cnn_fully_connected_layers_weights; diff --git a/lib/local/LandmarkDetector/src/FaceDetectorMTCNN.cpp b/lib/local/LandmarkDetector/src/FaceDetectorMTCNN.cpp index b8790714..ded812bc 100644 --- a/lib/local/LandmarkDetector/src/FaceDetectorMTCNN.cpp +++ b/lib/local/LandmarkDetector/src/FaceDetectorMTCNN.cpp @@ -111,6 +111,13 @@ CNN::CNN(const CNN& other) : cnn_layer_types(other.cnn_layer_types), cnn_max_poo } } + this->cnn_convolutional_layers_weights.resize(other.cnn_convolutional_layers_weights.size()); + for (size_t l = 0; l < other.cnn_convolutional_layers_weights.size(); ++l) + { + // Make sure the matrix is copied. 
+ this->cnn_convolutional_layers_weights[l] = other.cnn_convolutional_layers_weights[l].clone(); + } + this->cnn_convolutional_layers_rearr.resize(other.cnn_convolutional_layers_rearr.size()); for (size_t l = 0; l < other.cnn_convolutional_layers_rearr.size(); ++l) { @@ -404,11 +411,17 @@ void convolution_single_kern_fft(const vector >& input_imgs, vec for (size_t k = 0; k < input_imgs.size(); ++k) { cv::Mat dftTempl1(dftTempl[k], cv::Rect(0, 0, dftsize.width, dftsize.height)); - cv::mulSpectrums(img_dfts[k], dftTempl1, dftImgs[k], 0, true); - dft_img = dft_img + dftImgs[k]; + if (k == 0) + { + cv::mulSpectrums(img_dfts[k], dftTempl1, dft_img, 0, true); + } + else + { + cv::mulSpectrums(img_dfts[k], dftTempl1, dftImgs[k], 0, true); + dft_img = dft_img + dftImgs[k]; + } } - cv::dft(dft_img, dft_img, cv::DFT_INVERSE + cv::DFT_SCALE, bsz.height); src = dft_img(cv::Rect(0, 0, bsz.width, bsz.height)); @@ -417,6 +430,105 @@ void convolution_single_kern_fft(const vector >& input_imgs, vec } +void im2colBias(const cv::Mat_& input, int width, int height, cv::Mat_& output) +{ + + int m = input.rows; + int n = input.cols; + + // determine how many blocks there will be with a sliding window of width x height in the input + int yB = m - height + 1; + int xB = n - width + 1; + + // Allocate the output size + if (output.rows != xB*yB && output.cols != width * height + 1) + { + output = cv::Mat::ones(xB*yB, width * height + 1, CV_32F); + } + + // Iterate over the blocks + for (int i = 0; i< yB; i++) + { + for (int j = 0; j< xB; j++) + { + // here yours is in different order than I first thought: + //int rowIdx = j + i*xB; // my intuition how to index the result + int rowIdx = i + j*yB; + + for (unsigned int yy = 0; yy < height; ++yy) + for (unsigned int xx = 0; xx < width; ++xx) + { + int colIdx = xx*height + yy; + output.at(rowIdx, colIdx + 1) = input.at(i + yy, j + xx); + } + } + } +} + +void im2col(const cv::Mat_& input, int width, int height, cv::Mat_& output) +{ + + int m = 
input.rows; + int n = input.cols; + + // determine how many blocks there will be with a sliding window of width x height in the input + int yB = m - height + 1; + int xB = n - width + 1; + + // Allocate the output size + if (output.rows != xB*yB && output.cols != width * height + 1) + { + output = cv::Mat::ones(xB*yB, width * height, CV_32F); + } + + // Iterate over the blocks + for (int i = 0; i< yB; i++) + { + for (int j = 0; j< xB; j++) + { + int rowIdx = i + j*yB; + + for (unsigned int yy = 0; yy < height; ++yy) + for (unsigned int xx = 0; xx < width; ++xx) + { + int colIdx = xx*height + yy; + output.at(rowIdx, colIdx) = input.at(i + yy, j + xx); + } + } + } +} + +void convolution_direct(std::vector >& outputs, const std::vector >& input_maps, const cv::Mat_& weight_matrix, const std::vector& biases, int height_k, int width_k) +{ + outputs.clear(); + + int height_in = input_maps[0].rows; + int width_n = input_maps[0].cols; + + // determine how many blocks there will be with a sliding window of width x height in the input + int yB = height_in - height_k + 1; + int xB = width_n - width_k + 1; + + cv::Mat_ input_matrix(yB * xB, input_maps.size() * height_k * width_k); + + // Combine im2col across channels to prepare for matrix multiplication + for (size_t i = 0; i < input_maps.size(); ++i) + { + im2col(input_maps[i], width_k, height_k, input_matrix(cv::Rect(i * height_k * width_k, 0, height_k * width_k, yB * xB))); + } + + // Actual multiplication + cv::Mat_ out = input_matrix * weight_matrix; + + // Move back to vectors and reshape accordingly (also add the bias) + for (size_t k = 0; k < weight_matrix.cols; ++k) + { + cv::Mat_ reshaped = out.col(k).clone() + biases[k]; + reshaped = reshaped.reshape(1, xB).t(); + outputs.push_back(reshaped); + } + +} void convolution_fft2(std::vector >& outputs, const std::vector >& input_maps, const std::vector > >& kernels, const std::vector& biases, vector > > >& precomp_dfts) { @@ -438,27 +550,6 @@ void 
convolution_fft2(std::vector >& outputs, const std::vector< } } -//void convolution_fft2_tiled(std::vector >& outputs, const std::vector >& input_maps, const std::vector > >& kernels, const std::vector& biases, vector >& precomp_dfts) -//{ -// outputs.clear(); -// -// // Useful precomputed data placeholders for quick correlation (convolution) -// cv::Mat_ input_image_dft_tiled; -// -// for (size_t k = 0; k < kernels.size(); ++k) -// { -// -// // The convolution (with precomputation) -// cv::Mat_ output; -// convolution_single_kern_fft_tiled(input_maps, input_image_dft, kernels[k], precomp_dfts[k], output); -// -// // Combining the maps -// outputs.push_back(output + biases[k]); -// -// } -//} - - void convolution_fft(std::vector >& outputs, const std::vector >& input_maps, const std::vector > >& kernels, const std::vector& biases, vector > > >& precomp_dfts) { outputs.clear(); @@ -514,7 +605,7 @@ void convolution_fft(std::vector >& outputs, const std::vector> CNN::Inference(const cv::Mat& input_img) +std::vector> CNN::Inference(const cv::Mat& input_img, bool direct) { if (input_img.channels() == 1) { @@ -548,8 +639,15 @@ std::vector> CNN::Inference(const cv::Mat& input_img) if (layer_type == 0) { - convolution_fft2(outputs, input_maps, cnn_convolutional_layers_rearr[cnn_layer], cnn_convolutional_layers_bias[cnn_layer], cnn_convolutional_layers_dft2[cnn_layer]); - + // Either perform direct convolution through matrix multiplication or use an FFT optimized version, which one is optimal depends on the kernel and input sizes + if (direct) + { + convolution_direct(outputs, input_maps, cnn_convolutional_layers_weights[cnn_layer], cnn_convolutional_layers_bias[cnn_layer], cnn_convolutional_layers_rearr[cnn_layer][0][0].rows, cnn_convolutional_layers_rearr[cnn_layer][0][0].cols); + } + else + { + convolution_fft2(outputs, input_maps, cnn_convolutional_layers_rearr[cnn_layer], cnn_convolutional_layers_bias[cnn_layer], cnn_convolutional_layers_dft2[cnn_layer]); + } //vector 
> outs; //convolution_fft(outs, input_maps, cnn_convolutional_layers[cnn_layer], cnn_convolutional_layers_bias[cnn_layer], cnn_convolutional_layers_dft[cnn_layer]); @@ -687,6 +785,7 @@ void CNN::Read(string location) cnn_convolutional_layers.push_back(kernels); cnn_convolutional_layers_dft.push_back(kernel_dfts); + vector > > > cnn_convolutional_layers_dft2_curr_layer; cnn_convolutional_layers_dft2_curr_layer.resize(num_kernels); cnn_convolutional_layers_dft2.push_back(cnn_convolutional_layers_dft2_curr_layer); @@ -706,6 +805,20 @@ void CNN::Read(string location) cnn_convolutional_layers_rearr.push_back(kernels_rearr); + // Rearrange the flattened kernels into weight matrices for direct convolution computation + cv::Mat_ weight_matrix(num_in_maps * kernels_rearr[0][0].rows * kernels_rearr[0][0].cols, num_kernels); + for (size_t k = 0; k < num_kernels; ++k) + { + for (size_t i = 0; i < num_in_maps; ++i) + { + // Flatten the kernel + cv::Mat_ k_flat = kernels_rearr[k][i].t(); + k_flat = k_flat.reshape(0, 1).t(); + k_flat.copyTo(weight_matrix(cv::Rect(k, i * kernels_rearr[0][0].rows * kernels_rearr[0][0].cols, 1, kernels_rearr[0][0].rows * kernels_rearr[0][0].cols))); + } + } + cnn_convolutional_layers_weights.push_back(weight_matrix); + } else if (layer_type == 1) { @@ -1010,8 +1123,8 @@ bool FaceDetectorMTCNN::DetectFaces(vector >& o_regions, const normalised_img = (normalised_img - 127.5) * 0.0078125; // Actual PNet CNN step - std::vector > pnet_out = PNet.Inference(normalised_img); - + std::vector > pnet_out = PNet.Inference(normalised_img, true); + // Clear the precomputations, as the image sizes will be different (TODO could be useful for videos) for (size_t k1 = 0; k1 < PNet.cnn_convolutional_layers_dft.size(); ++k1) { @@ -1098,7 +1211,7 @@ bool FaceDetectorMTCNN::DetectFaces(vector >& o_regions, const prop_img = (prop_img - 127.5) * 0.0078125; // Perform RNet on the proposal image - std::vector > rnet_out = RNet.Inference(prop_img); + std::vector > rnet_out = 
RNet.Inference(prop_img, true); float prob = 1.0 / (1.0 + cv::exp(rnet_out[0].at(0) - rnet_out[0].at(1))); scores_all[k] = prob; @@ -1156,7 +1269,7 @@ bool FaceDetectorMTCNN::DetectFaces(vector >& o_regions, const prop_img = (prop_img - 127.5) * 0.0078125; // Perform RNet on the proposal image - std::vector > onet_out = ONet.Inference(prop_img); + std::vector > onet_out = ONet.Inference(prop_img, true); float prob = 1.0 / (1.0 + cv::exp(onet_out[0].at(0) - onet_out[0].at(1))); scores_all[k] = prob;