From 2bd085cb47d9a89e2d7653f9d362457d62c0684c Mon Sep 17 00:00:00 2001 From: Tadas Baltrusaitis Date: Wed, 16 Aug 2017 17:14:00 +0100 Subject: [PATCH] Exploring CNN optimization --- .../include/FaceDetectorMTCNN.h | 2 + .../src/FaceDetectorMTCNN.cpp | 219 +++++++++++++++++- 2 files changed, 220 insertions(+), 1 deletion(-) diff --git a/lib/local/LandmarkDetector/include/FaceDetectorMTCNN.h b/lib/local/LandmarkDetector/include/FaceDetectorMTCNN.h index cfe7a656..04c7e1eb 100644 --- a/lib/local/LandmarkDetector/include/FaceDetectorMTCNN.h +++ b/lib/local/LandmarkDetector/include/FaceDetectorMTCNN.h @@ -89,6 +89,7 @@ namespace LandmarkDetector // Precomputations for faster convolution vector > > > > cnn_convolutional_layers_dft; + vector > > > > cnn_convolutional_layers_dft2; private: //========================================== @@ -98,6 +99,7 @@ namespace LandmarkDetector // layer -> input maps -> kernels // Bit ugly with so much nesting, but oh well vector > > > cnn_convolutional_layers; + vector > > > cnn_convolutional_layers_rearr; vector > cnn_convolutional_layers_bias; vector > cnn_fully_connected_layers_weights; vector > cnn_fully_connected_layers_biases; diff --git a/lib/local/LandmarkDetector/src/FaceDetectorMTCNN.cpp b/lib/local/LandmarkDetector/src/FaceDetectorMTCNN.cpp index 6f95a188..b8790714 100644 --- a/lib/local/LandmarkDetector/src/FaceDetectorMTCNN.cpp +++ b/lib/local/LandmarkDetector/src/FaceDetectorMTCNN.cpp @@ -111,6 +111,23 @@ CNN::CNN(const CNN& other) : cnn_layer_types(other.cnn_layer_types), cnn_max_poo } } + this->cnn_convolutional_layers_rearr.resize(other.cnn_convolutional_layers_rearr.size()); + for (size_t l = 0; l < other.cnn_convolutional_layers_rearr.size(); ++l) + { + this->cnn_convolutional_layers_rearr[l].resize(other.cnn_convolutional_layers_rearr[l].size()); + + for (size_t i = 0; i < other.cnn_convolutional_layers_rearr[l].size(); ++i) + { + this->cnn_convolutional_layers_rearr[l][i].resize(other.cnn_convolutional_layers_rearr[l][i].size()); + + for (size_t k = 0; k < other.cnn_convolutional_layers_rearr[l][i].size(); ++k) + { + // Make sure the matrix is copied. + this->cnn_convolutional_layers_rearr[l][i][k] = other.cnn_convolutional_layers_rearr[l][i][k].clone(); + } + } + } + this->cnn_fully_connected_layers_weights.resize(other.cnn_fully_connected_layers_weights.size()); for (size_t l = 0; l < other.cnn_fully_connected_layers_weights.size(); ++l) @@ -282,6 +299,166 @@ void max_pooling(std::vector >& outputs, const std::vector >& input_imgs, vector >& img_dfts, const vector >& _templs, map > >& _templ_dfts, cv::Mat_& result) +{ + // Assume result is defined properly + if (result.empty()) + { + cv::Size corrSize(input_imgs[0].cols - _templs[0].cols + 1, input_imgs[0].rows - _templs[0].rows + 1); + result.create(corrSize); + } + + // Our model will always be under min block size so can ignore this + //const double blockScale = 4.5; + //const int minBlockSize = 256; + + int maxDepth = CV_64F; + + cv::Size dftsize; + + dftsize.width = cv::getOptimalDFTSize(result.cols + _templs[0].cols - 1); + dftsize.height = cv::getOptimalDFTSize(result.rows + _templs[0].rows - 1); + + // Compute block size + cv::Size blocksize; + blocksize.width = dftsize.width - _templs[0].cols + 1; + blocksize.width = MIN(blocksize.width, result.cols); + blocksize.height = dftsize.height - _templs[0].rows + 1; + blocksize.height = MIN(blocksize.height, result.rows); + + vector> dftTempl; + + // if this has not been precomputed, precompute it, otherwise use it + if (_templ_dfts.find(dftsize.width) == _templ_dfts.end()) + { + dftTempl.resize(_templs.size()); + for (size_t k = 0; k < _templs.size(); ++k) + { + dftTempl[k].create(dftsize.height, dftsize.width); + + cv::Mat_ src = _templs[k]; + + cv::Mat_ dst(dftTempl[k], cv::Rect(0, 0, dftsize.width, dftsize.height)); + + cv::Mat_ dst1(dftTempl[k], cv::Rect(0, 0, _templs[k].cols, _templs[k].rows)); + + if (dst1.data != src.data) + src.convertTo(dst1, dst1.depth()); + + if (dst.cols > _templs[k].cols) + { + cv::Mat_ part(dst, cv::Range(0, _templs[k].rows), cv::Range(_templs[k].cols, dst.cols)); + part.setTo(0); + } + + // Perform DFT of the template + dft(dst, dst, 0, _templs[k].rows); + + } + // TODO is a deep copy needed + _templ_dfts[dftsize.width] = dftTempl; + + } + else + { + dftTempl = _templ_dfts[dftsize.width]; + } + + cv::Size bsz(std::min(blocksize.width, result.cols), std::min(blocksize.height, result.rows)); + cv::Mat src; + + cv::Mat cdst(result, cv::Rect(0, 0, bsz.width, bsz.height)); + + vector > dftImgs; + dftImgs.resize(input_imgs.size()); + + if (img_dfts.empty()) + { + for(size_t k = 0; k < input_imgs.size(); ++k) + { + dftImgs[k].create(dftsize); + dftImgs[k].setTo(0.0); + + cv::Size dsz(bsz.width + _templs[k].cols - 1, bsz.height + _templs[k].rows - 1); + + int x2 = std::min(input_imgs[k].cols, dsz.width); + int y2 = std::min(input_imgs[k].rows, dsz.height); + + cv::Mat src0(input_imgs[k], cv::Range(0, y2), cv::Range(0, x2)); + cv::Mat dst(dftImgs[k], cv::Rect(0, 0, dsz.width, dsz.height)); + cv::Mat dst1(dftImgs[k], cv::Rect(0, 0, x2, y2)); + + src = src0; + + if (dst1.data != src.data) + src.convertTo(dst1, dst1.depth()); + + dft(dftImgs[k], dftImgs[k], 0, dsz.height); + img_dfts.push_back(dftImgs[k].clone()); + } + } + + cv::Mat_ dft_img(img_dfts[0].rows, img_dfts[0].cols, 0.0); + for (size_t k = 0; k < input_imgs.size(); ++k) + { + cv::Mat dftTempl1(dftTempl[k], cv::Rect(0, 0, dftsize.width, dftsize.height)); + cv::mulSpectrums(img_dfts[k], dftTempl1, dftImgs[k], 0, true); + dft_img = dft_img + dftImgs[k]; + } + + + cv::dft(dft_img, dft_img, cv::DFT_INVERSE + cv::DFT_SCALE, bsz.height); + + src = dft_img(cv::Rect(0, 0, bsz.width, bsz.height)); + + src.convertTo(cdst, CV_32F); + +} + + +void convolution_fft2(std::vector >& outputs, const std::vector >& input_maps, const std::vector > >& kernels, const std::vector& biases, vector > > >& precomp_dfts) +{ + outputs.clear(); + + // Useful precomputed data placeholders for quick correlation (convolution) + vector > input_image_dft; + + for (size_t k = 0; k < kernels.size(); ++k) + { + + // The convolution (with precomputation) + cv::Mat_ output; + convolution_single_kern_fft(input_maps, input_image_dft, kernels[k], precomp_dfts[k], output); + + // Combining the maps + outputs.push_back(output + biases[k]); + + } +} + +//void convolution_fft2_tiled(std::vector >& outputs, const std::vector >& input_maps, const std::vector > >& kernels, const std::vector& biases, vector >& precomp_dfts) +//{ +// outputs.clear(); +// +// // Useful precomputed data placeholders for quick correlation (convolution) +// cv::Mat_ input_image_dft_tiled; +// +// for (size_t k = 0; k < kernels.size(); ++k) +// { +// +// // The convolution (with precomputation) +// cv::Mat_ output; +// convolution_single_kern_fft_tiled(input_maps, input_image_dft, kernels[k], precomp_dfts[k], output); +// +// // Combining the maps +// outputs.push_back(output + biases[k]); +// +// } +//} + + void convolution_fft(std::vector >& outputs, const std::vector >& input_maps, const std::vector > >& kernels, const std::vector& biases, vector > > >& precomp_dfts) { outputs.clear(); @@ -370,7 +547,18 @@ std::vector> CNN::Inference(const cv::Mat& input_img) // Convolutional layer if (layer_type == 0) { - convolution_fft(outputs, input_maps, cnn_convolutional_layers[cnn_layer], cnn_convolutional_layers_bias[cnn_layer], cnn_convolutional_layers_dft[cnn_layer]); + + convolution_fft2(outputs, input_maps, cnn_convolutional_layers_rearr[cnn_layer], cnn_convolutional_layers_bias[cnn_layer], cnn_convolutional_layers_dft2[cnn_layer]); + + //vector > outs; + //convolution_fft(outs, input_maps, cnn_convolutional_layers[cnn_layer], cnn_convolutional_layers_bias[cnn_layer], cnn_convolutional_layers_dft[cnn_layer]); + + //double diff = 0; + //for (size_t i = 0; i < outs.size(); ++i) + //{ + // diff += cv::mean(cv::abs(outs[i] - outputs[i]))[0]; + //} + //cout << diff << endl; cnn_layer++; } @@ -498,6 +686,26 @@ void CNN::Read(string location) cnn_convolutional_layers.push_back(kernels); cnn_convolutional_layers_dft.push_back(kernel_dfts); + + vector > > > cnn_convolutional_layers_dft2_curr_layer; + cnn_convolutional_layers_dft2_curr_layer.resize(num_kernels); + cnn_convolutional_layers_dft2.push_back(cnn_convolutional_layers_dft2_curr_layer); + + // Rearrange the kernels for faster inference with FFT + vector > > kernels_rearr; + kernels_rearr.resize(num_kernels); + + // Fill up the rearranged layer + for (int k = 0; k < num_kernels; ++k) + { + for (int in = 0; in < num_in_maps; ++in) + { + kernels_rearr[k].push_back(kernels[in][k]); + } + } + + cnn_convolutional_layers_rearr.push_back(kernels_rearr); + } else if (layer_type == 1) { @@ -816,6 +1024,15 @@ bool FaceDetectorMTCNN::DetectFaces(vector >& o_regions, const } } + for (size_t k1 = 0; k1 < PNet.cnn_convolutional_layers_dft2.size(); ++k1) + { + for (size_t k2 = 0; k2 < PNet.cnn_convolutional_layers_dft2[k1].size(); ++k2) + { + PNet.cnn_convolutional_layers_dft2[k1][k2].clear(); + } + } + + // Extract the probabilities from PNet response cv::Mat_ prob_heatmap; cv::exp(pnet_out[0]- pnet_out[1], prob_heatmap);