From b71b2f1cfc2a2c1c973c3e03ef249905cc9082fa Mon Sep 17 00:00:00 2001 From: Tadas Baltrusaitis Date: Wed, 30 Aug 2017 15:20:56 +0100 Subject: [PATCH] Another performance improvement by rearranging GEMM order. --- .gitignore | 1 + .../LandmarkDetector/LandmarkDetector.vcxproj | 2 + .../LandmarkDetector.vcxproj.filters | 96 +- .../LandmarkDetector/include/CNN_utils.h | 64 ++ .../LandmarkDetector/src/CEN_patch_expert.cpp | 159 +--- lib/local/LandmarkDetector/src/CNN_utils.cpp | 434 +++++++++ .../src/FaceDetectorMTCNN.cpp | 848 +++++------------- 7 files changed, 810 insertions(+), 794 deletions(-) create mode 100644 lib/local/LandmarkDetector/include/CNN_utils.h create mode 100644 lib/local/LandmarkDetector/src/CNN_utils.cpp diff --git a/.gitignore b/.gitignore index 4826b944..d886c992 100644 --- a/.gitignore +++ b/.gitignore @@ -53,3 +53,4 @@ matlab_runners/Demos/demo_img/ matlab_runners/Demos/demo_vid/ matlab_runners/Demos/output_features_seq/ matlab_runners/Demos/output_features_vid/ +exe/FeatureExtraction/experiments/ diff --git a/lib/local/LandmarkDetector/LandmarkDetector.vcxproj b/lib/local/LandmarkDetector/LandmarkDetector.vcxproj index c86c51f4..017aa60e 100644 --- a/lib/local/LandmarkDetector/LandmarkDetector.vcxproj +++ b/lib/local/LandmarkDetector/LandmarkDetector.vcxproj @@ -195,6 +195,7 @@ xcopy /I /E /Y /D "$(SolutionDir)lib\3rdParty\OpenCV3.1\classifiers" "$(OutDir)c Use + Use @@ -255,6 +256,7 @@ xcopy /I /E /Y /D "$(SolutionDir)lib\3rdParty\OpenCV3.1\classifiers" "$(OutDir)c + diff --git a/lib/local/LandmarkDetector/LandmarkDetector.vcxproj.filters b/lib/local/LandmarkDetector/LandmarkDetector.vcxproj.filters index 07e70f23..fdef973c 100644 --- a/lib/local/LandmarkDetector/LandmarkDetector.vcxproj.filters +++ b/lib/local/LandmarkDetector/LandmarkDetector.vcxproj.filters @@ -4,6 +4,30 @@ source + + source + + + source + + + source + + + source + + + source + + + source + + + source + + + source + source @@ -19,30 +43,38 @@ source - - source - - - source - - - source - - - source - - - source - - - - source - headers + + headers + + + headers + + + headers + + + headers + + + headers + + + headers + + + headers + + + headers + + + headers + headers @@ -58,35 +90,13 @@ headers - - headers - - - headers - - - headers - - - headers - - - headers - - - headers - - - - headers - - {75157eb9-2d21-482a-a1e2-e3e4100fffd7} + {4c468912-7bbd-415e-a34e-3d8470740945} - {f50f893d-a6b2-4e45-8f94-529042834e49} + {55e469df-489d-4acb-86cf-bd2d915e69ad} \ No newline at end of file diff --git a/lib/local/LandmarkDetector/include/CNN_utils.h b/lib/local/LandmarkDetector/include/CNN_utils.h new file mode 100644 index 00000000..80db6e10 --- /dev/null +++ b/lib/local/LandmarkDetector/include/CNN_utils.h @@ -0,0 +1,64 @@ +/////////////////////////////////////////////////////////////////////////////// +// Copyright (C) 2017, Tadas Baltrusaitis, all rights reserved. +// +// ACADEMIC OR NON-PROFIT ORGANIZATION NONCOMMERCIAL RESEARCH USE ONLY +// +// BY USING OR DOWNLOADING THE SOFTWARE, YOU ARE AGREEING TO THE TERMS OF THIS LICENSE AGREEMENT. +// IF YOU DO NOT AGREE WITH THESE TERMS, YOU MAY NOT USE OR DOWNLOAD THE SOFTWARE. +// +// License can be found in OpenFace-license.txt +// +// * Any publications arising from the use of this software, including but +// not limited to academic journal and conference publications, technical +// reports and manuals, must cite at least one of the following works: +// +// OpenFace: an open source facial behavior analysis toolkit +// Tadas Baltrušaitis, Peter Robinson, and Louis-Philippe Morency +// in IEEE Winter Conference on Applications of Computer Vision, 2016 +// +// Rendering of Eyes for Eye-Shape Registration and Gaze Estimation +// Erroll Wood, Tadas Baltrušaitis, Xucong Zhang, Yusuke Sugano, Peter Robinson, and Andreas Bulling +// in IEEE International. Conference on Computer Vision (ICCV), 2015 +// +// Cross-dataset learning and person-speci?c normalisation for automatic Action Unit detection +// Tadas Baltrušaitis, Marwa Mahmoud, and Peter Robinson +// in Facial Expression Recognition and Analysis Challenge, +// IEEE International Conference on Automatic Face and Gesture Recognition, 2015 +// +// Constrained Local Neural Fields for robust facial landmark detection in the wild. +// Tadas Baltrušaitis, Peter Robinson, and Louis-Philippe Morency. +// in IEEE Int. Conference on Computer Vision Workshops, 300 Faces in-the-Wild Challenge, 2013. +// +/////////////////////////////////////////////////////////////////////////////// + +// Header for all external CLNF/CLM-Z/CLM methods of interest to the user +#ifndef __CNN_UTILS_h_ +#define __CNN_UTILS_h_ + +// OpenCV includes +#include + +using namespace std; + +namespace LandmarkDetector +{ + //=========================================================================== + // Various CNN layers + + // Parametric ReLU with leaky weights (separate ones per channel) + void PReLU(std::vector >& input_output_maps, cv::Mat_ prelu_weights); + + // The fully connected layer + void fully_connected(std::vector >& outputs, const std::vector >& input_maps, cv::Mat_ weights, cv::Mat_ biases); + + // Max pooling layer with parametrized stride and kernel sizes + void max_pooling(std::vector >& outputs, const std::vector >& input_maps, int stride_x, int stride_y, int kernel_size_x, int kernel_size_y); + + // Convolution using FFT optimization rather than matrix multiplication + void convolution_fft2(std::vector >& outputs, const std::vector >& input_maps, const std::vector > >& kernels, const std::vector& biases, vector > > >& precomp_dfts); + + // Convolution using matrix multiplication and OpenBLAS optimization + void convolution_direct_blas(std::vector >& outputs, const std::vector >& input_maps, const cv::Mat_& weight_matrix, const std::vector& biases, int height_k, int width_k); + +} +#endif diff --git a/lib/local/LandmarkDetector/src/CEN_patch_expert.cpp b/lib/local/LandmarkDetector/src/CEN_patch_expert.cpp index 0e6c419c..65ee0e08 100644 --- a/lib/local/LandmarkDetector/src/CEN_patch_expert.cpp +++ b/lib/local/LandmarkDetector/src/CEN_patch_expert.cpp @@ -134,7 +134,7 @@ void CEN_patch_expert::Read(ifstream &stream) cv::Mat_ weight; LandmarkDetector::ReadMatBin(stream, weight); - weights[i] = weight; + weights[i] = weight.t(); biases[i] = bias; } @@ -251,6 +251,8 @@ void CEN_patch_expert::Response(const cv::Mat_ &area_of_interest, cv::Mat float beta = 0.0; sgemm_("N", "N", &weights[layer].cols, &response.rows, &response.cols, &alpha, m2, &weights[layer].cols, m1, &response.cols, &beta, m3, &weights[layer].cols); + // TODO correct this with weight transpose + response = resp_blas; // TODO bias could be pre-allocated to the window size so that addition could be quicker @@ -406,126 +408,6 @@ void im2colBiasSparse(const cv::Mat_& input, int width, int height, cv::M } } -//=========================================================================== -//void CEN_patch_expert::ResponseSparse_ocv(const cv::Mat_ &area_of_interest, cv::Mat_ &response) -//{ -// -// int response_height = area_of_interest.rows - height + 1; -// int response_width = area_of_interest.cols - width + 1; -// -// cv::Mat_ input_col; -// // Extract im2col but in a sparse way -// im2colBiasSparse(area_of_interest, width, height, input_col); -// -// // Mean and standard deviation normalization -// contrastNorm(input_col, response); -// -// cv::Mat_ response_blas = response.clone(); -// -// for (size_t layer = 0; layer < activation_function.size(); ++layer) -// { -// -// // We are performing response = response * weights[layers], but in OpenBLAS as that is significantly quicker than OpenCV -// response_blas = response.clone(); -// -// float* m1 = (float*)response_blas.data; -// float* m2 = (float*)weights[layer].data; -// -// cv::Mat_ resp_blas(response_blas.rows, weights[layer].cols, 1.0); -// float* m3 = (float*)resp_blas.data; -// -// cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, weights[layer].cols, response.rows, response.cols, 1, m2, weights[layer].cols, m1, response.cols, 0.0, m3, weights[layer].cols); -// -// response = resp_blas; -// -// // TODO bias could be pre-allocated to the window size so that addition could be quicker -// for (size_t y = 0; y < response.rows; ++y) -// { -// response(cv::Rect(0, y, response.cols, 1)) = response(cv::Rect(0, y, response.cols, 1)) + biases[layer]; -// } -// -// // Perform activation -// if (activation_function[layer] == 0) // Sigmoid -// { -// for (cv::MatIterator_ p = response.begin(); p != response.end(); p++) -// { -// *p = 1.0 / (1.0 + exp(-(*p))); -// } -// } -// else if (activation_function[layer] == 2)// ReLU -// { -// cv::threshold(response, response, 0, 0, cv::THRESH_TOZERO); -// } -// -// } -// -// // Restructure the output with interpolation -// cv::Mat_ mapMatrix(response.rows, response_height * response_width, 0.0f); -// -// // Find a mapping from indices in the computed sparse response and the original full response -// cv::Mat_ value_id_matrix(response_width, response_height, 0); -// -// int ind = 0; -// for (int k = 0; k < value_id_matrix.rows * value_id_matrix.cols; ++k) -// { -// if (k % 2 != 0) -// { -// value_id_matrix.at(k) = ind; -// ind++; -// } -// } -// value_id_matrix = value_id_matrix.t(); -// -// int skip_counter = 0; -// for (int x = 0; x < response_width; ++x) -// { -// for (int y = 0; y < response_height; ++y) -// { -// int mapping_col = x * response_height + y; -// skip_counter++; -// if (skip_counter % 2 == 0) -// { -// int val_id = value_id_matrix.at(y, x); -// mapMatrix.at(val_id, mapping_col) = 1; -// continue; -// } -// -// double num_neigh = 0.0; -// vector val_ids; -// if (x - 1 >= 0) -// { -// num_neigh++; -// val_ids.push_back(value_id_matrix.at(y, x - 1)); -// } -// if (y - 1 >= 0) -// { -// num_neigh++; -// val_ids.push_back(value_id_matrix.at(y - 1, x)); -// } -// if (x + 1 < response_width) -// { -// num_neigh++; -// val_ids.push_back(value_id_matrix.at(y, x + 1)); -// } -// if (y + 1 < response_height) -// { -// num_neigh++; -// val_ids.push_back(value_id_matrix.at(y + 1, x)); -// } -// -// for (size_t k = 0; k < val_ids.size(); ++k) -// { -// mapMatrix.at(val_ids[k], mapping_col) = 1.0 / num_neigh; -// } -// } -// } -// -// response = response.t() * mapMatrix; -// response = response.t(); -// response = response.reshape(1, response_height); -// response = response.t(); -//} - // As the sparse patch expert output with interpolation, this function creates an interpolation matrix void LandmarkDetector::interpolationMatrix(cv::Mat_& mapMatrix, int response_height, int response_width, int input_width, int input_height) { @@ -610,41 +492,46 @@ void CEN_patch_expert::ResponseSparse(const cv::Mat_ &area_of_interest, c // Extract im2col but in a sparse way and contrast normalize im2colBiasSparseContrastNorm(area_of_interest, width, height, response); + response = response.t(); for (size_t layer = 0; layer < activation_function.size(); ++layer) { - // We are performing response = response * weights[layers], but in OpenBLAS as that is significantly quicker than OpenCV - float* m1 = (float*)response.data; - float* m2 = (float*)weights[layer].data; + // We are performing response = weights[layers] * response(t), but in OpenBLAS as that is significantly quicker than OpenCV + cv::Mat_ resp = response; + float* m1 = (float*)resp.data; + cv::Mat_ weight = weights[layer]; + float* m2 = (float*)weight.data; - cv::Mat_ resp_blas(response.rows, weights[layer].cols); + cv::Mat_ resp_blas(weight.rows, resp.cols); float* m3 = (float*)resp_blas.data; - // Perform matrix multiplication - float alpha = 1.0; - float beta = 0.0; - sgemm_("N", "N", &weights[layer].cols, &response.rows, &response.cols, &alpha, m2, &weights[layer].cols, m1, &response.cols, &beta, m3, &weights[layer].cols); + // Perform matrix multiplication in OpenBLAS (fortran call) + float alpha1 = 1.0; + float beta1 = 0.0; + sgemm_("N", "N", &resp.cols, &weight.rows, &weight.cols, &alpha1, m1, &resp.cols, m2, &weight.cols, &beta1, m3, &resp.cols); // The above is a faster version of this, by calling the fortran version directly - //cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, weights[layer].cols, response.rows, response.cols, 1, m2, weights[layer].cols, m1, response.cols, 0.0, m3, weights[layer].cols); + //cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, resp.cols, weight.rows, weight.cols, 1, m1, resp.cols, m2, weight.cols, 0.0, m3, resp.cols); // Adding the bias (bit ugly, but the fastest way to do this) response = resp_blas; + float* data = (float*)response.data; size_t height = response.rows; size_t width = response.cols; float* data_b = (float*)biases[layer].data; for (size_t y = 0; y < height; ++y) { + float bias = data_b[y]; for (size_t x = 0; x < width; ++x) { - float in = *data; - *data++ = in + data_b[x]; + float in = *data + bias; + *data++ = in; } } - // Perform activation + // Perform activation and add bias at the same time if (activation_function[layer] == 0) // Sigmoid { @@ -662,15 +549,13 @@ void CEN_patch_expert::ResponseSparse(const cv::Mat_ &area_of_interest, c } else if (activation_function[layer] == 2)// ReLU { - // TODO this could be spedup by iterating over it cv::threshold(response, response, 0, 0, cv::THRESH_TOZERO); } } - //response = response.t() * mapMatrix; - //response = response.t(); - response = mapMatrix * response; + response = response * mapMatrix; + response = response.t(); response = response.reshape(1, response_height); response = response.t(); } diff --git a/lib/local/LandmarkDetector/src/CNN_utils.cpp b/lib/local/LandmarkDetector/src/CNN_utils.cpp new file mode 100644 index 00000000..7128eafa --- /dev/null +++ b/lib/local/LandmarkDetector/src/CNN_utils.cpp @@ -0,0 +1,434 @@ +/////////////////////////////////////////////////////////////////////////////// +// Copyright (C) 2017, Tadas Baltrusaitis, all rights reserved. +// +// ACADEMIC OR NON-PROFIT ORGANIZATION NONCOMMERCIAL RESEARCH USE ONLY +// +// BY USING OR DOWNLOADING THE SOFTWARE, YOU ARE AGREEING TO THE TERMS OF THIS LICENSE AGREEMENT. +// IF YOU DO NOT AGREE WITH THESE TERMS, YOU MAY NOT USE OR DOWNLOAD THE SOFTWARE. +// +// License can be found in OpenFace-license.txt +// +// * Any publications arising from the use of this software, including but +// not limited to academic journal and conference publications, technical +// reports and manuals, must cite at least one of the following works: +// +// OpenFace: an open source facial behavior analysis toolkit +// Tadas Baltrušaitis, Peter Robinson, and Louis-Philippe Morency +// in IEEE Winter Conference on Applications of Computer Vision, 2016 +// +// Rendering of Eyes for Eye-Shape Registration and Gaze Estimation +// Erroll Wood, Tadas Baltrušaitis, Xucong Zhang, Yusuke Sugano, Peter Robinson, and Andreas Bulling +// in IEEE International. Conference on Computer Vision (ICCV), 2015 +// +// Cross-dataset learning and person-speci?c normalisation for automatic Action Unit detection +// Tadas Baltrušaitis, Marwa Mahmoud, and Peter Robinson +// in Facial Expression Recognition and Analysis Challenge, +// IEEE International Conference on Automatic Face and Gesture Recognition, 2015 +// +// Constrained Local Neural Fields for robust facial landmark detection in the wild. +// Tadas Baltrušaitis, Peter Robinson, and Louis-Philippe Morency. +// in IEEE Int. Conference on Computer Vision Workshops, 300 Faces in-the-Wild Challenge, 2013. +// +/////////////////////////////////////////////////////////////////////////////// + +#include "stdafx.h" + +#include "CNN_utils.h" + +// OpenBLAS +#include +#include + +using namespace std; + +namespace LandmarkDetector +{ + + // Parametric ReLU with leaky weights (separate ones per channel) + void PReLU(std::vector >& input_output_maps, cv::Mat_ prelu_weights) + { + + if (input_output_maps.size() > 1) + { + int h = input_output_maps[0].rows; + int w = input_output_maps[0].cols; + size_t size_in = h * w; + + for (size_t k = 0; k < input_output_maps.size(); ++k) + { + // Apply the PReLU + auto iter = input_output_maps[k].begin(); + + float neg_mult = prelu_weights.at(k); + + for (size_t i = 0; i < size_in; ++i) + { + float in_val = *iter; + + // The prelu step + *iter++ = in_val >= 0 ? in_val : in_val * neg_mult; + + } + } + } + else + { + + int w = input_output_maps[0].cols; + + for (size_t k = 0; k < prelu_weights.rows; ++k) + { + auto iter = input_output_maps[0].row(k).begin(); + float neg_mult = prelu_weights.at(k); + + for (size_t i = 0; i < w; ++i) + { + float in_val = *iter; + // Apply the PReLU + *iter++ = in_val >= 0 ? in_val : in_val * neg_mult; + } + } + + } + + } + + void fully_connected(std::vector >& outputs, const std::vector >& input_maps, cv::Mat_ weights, cv::Mat_ biases) + { + outputs.clear(); + + if (input_maps.size() > 1) + { + // Concatenate all the maps + cv::Size orig_size = input_maps[0].size(); + cv::Mat_ input_concat(input_maps.size(), input_maps[0].cols * input_maps[0].rows); + + for (size_t in = 0; in < input_maps.size(); ++in) + { + cv::Mat_ add = input_maps[in]; + + // Reshape if all of the data will be flattened + if (input_concat.rows != weights.cols) + { + add = add.t(); + } + + add = add.reshape(0, 1); + add.copyTo(input_concat.row(in)); + } + + // Treat the input as separate feature maps + if (input_concat.rows == weights.cols) + { + input_concat = weights * input_concat; + // Add biases + for (size_t k = 0; k < biases.rows; ++k) + { + input_concat.row(k) = input_concat.row(k) + biases.at(k); + } + + outputs.clear(); + // Resize and add as output + for (size_t k = 0; k < biases.rows; ++k) + { + cv::Mat_ reshaped = input_concat.row(k).clone(); + reshaped = reshaped.reshape(1, orig_size.height); + outputs.push_back(reshaped); + } + } + else + { + // Flatten the input + input_concat = input_concat.reshape(0, input_concat.rows * input_concat.cols); + + input_concat = weights * input_concat + biases; + + outputs.clear(); + outputs.push_back(input_concat); + } + + } + else + { + cv::Mat out = weights * input_maps[0] + biases; + outputs.clear(); + outputs.push_back(out.t()); + } + + } + + + void max_pooling(std::vector >& outputs, const std::vector >& input_maps, int stride_x, int stride_y, int kernel_size_x, int kernel_size_y) + { + vector > outputs_sub; + + // Iterate over kernel height and width, based on stride + for (size_t in = 0; in < input_maps.size(); ++in) + { + // Help with rounding up a bit, to match caffe style output + int out_x = round((double)(input_maps[in].cols - kernel_size_x) / (double)stride_x) + 1; + int out_y = round((double)(input_maps[in].rows - kernel_size_y) / (double)stride_y) + 1; + + cv::Mat_ sub_out(out_y, out_x, 0.0); + cv::Mat_ in_map = input_maps[in]; + + for (int x = 0; x < input_maps[in].cols; x += stride_x) + { + int max_x = cv::min(input_maps[in].cols, x + kernel_size_x); + int x_in_out = floor(x / stride_x); + + if (x_in_out >= out_x) + continue; + + for (int y = 0; y < input_maps[in].rows; y += stride_y) + { + int y_in_out = floor(y / stride_y); + + if (y_in_out >= out_y) + continue; + + int max_y = cv::min(input_maps[in].rows, y + kernel_size_y); + + float curr_max = -FLT_MAX; + + for (int x_in = x; x_in < max_x; ++x_in) + { + for (int y_in = y; y_in < max_y; ++y_in) + { + float curr_val = in_map.at(y_in, x_in); + if (curr_val > curr_max) + { + curr_max = curr_val; + } + } + } + sub_out.at(y_in_out, x_in_out) = curr_max; + } + } + + outputs_sub.push_back(sub_out); + + } + outputs = outputs_sub; + + } + + void convolution_single_kern_fft(const vector >& input_imgs, vector >& img_dfts, const vector >& _templs, map > >& _templ_dfts, cv::Mat_& result) + { + // Assume result is defined properly + if (result.empty()) + { + cv::Size corrSize(input_imgs[0].cols - _templs[0].cols + 1, input_imgs[0].rows - _templs[0].rows + 1); + result.create(corrSize); + } + + // Our model will always be under min block size so can ignore this + //const double blockScale = 4.5; + //const int minBlockSize = 256; + + int maxDepth = CV_64F; + + cv::Size dftsize; + + dftsize.width = cv::getOptimalDFTSize(result.cols + _templs[0].cols - 1); + dftsize.height = cv::getOptimalDFTSize(result.rows + _templs[0].rows - 1); + + // Compute block size + cv::Size blocksize; + blocksize.width = dftsize.width - _templs[0].cols + 1; + blocksize.width = MIN(blocksize.width, result.cols); + blocksize.height = dftsize.height - _templs[0].rows + 1; + blocksize.height = MIN(blocksize.height, result.rows); + + vector> dftTempl; + + // if this has not been precomputed, precompute it, otherwise use it + if (_templ_dfts.find(dftsize.width) == _templ_dfts.end()) + { + dftTempl.resize(_templs.size()); + for (size_t k = 0; k < _templs.size(); ++k) + { + dftTempl[k].create(dftsize.height, dftsize.width); + + cv::Mat_ src = _templs[k]; + + cv::Mat_ dst(dftTempl[k], cv::Rect(0, 0, dftsize.width, dftsize.height)); + + cv::Mat_ dst1(dftTempl[k], cv::Rect(0, 0, _templs[k].cols, _templs[k].rows)); + + if (dst1.data != src.data) + src.convertTo(dst1, dst1.depth()); + + if (dst.cols > _templs[k].cols) + { + cv::Mat_ part(dst, cv::Range(0, _templs[k].rows), cv::Range(_templs[k].cols, dst.cols)); + part.setTo(0); + } + + // Perform DFT of the template + dft(dst, dst, 0, _templs[k].rows); + + } + _templ_dfts[dftsize.width] = dftTempl; + + } + else + { + dftTempl = _templ_dfts[dftsize.width]; + } + + cv::Size bsz(std::min(blocksize.width, result.cols), std::min(blocksize.height, result.rows)); + cv::Mat src; + + cv::Mat cdst(result, cv::Rect(0, 0, bsz.width, bsz.height)); + + vector > dftImgs; + dftImgs.resize(input_imgs.size()); + + if (img_dfts.empty()) + { + for (size_t k = 0; k < input_imgs.size(); ++k) + { + dftImgs[k].create(dftsize); + dftImgs[k].setTo(0.0); + + cv::Size dsz(bsz.width + _templs[k].cols - 1, bsz.height + _templs[k].rows - 1); + + int x2 = std::min(input_imgs[k].cols, dsz.width); + int y2 = std::min(input_imgs[k].rows, dsz.height); + + cv::Mat src0(input_imgs[k], cv::Range(0, y2), cv::Range(0, x2)); + cv::Mat dst(dftImgs[k], cv::Rect(0, 0, dsz.width, dsz.height)); + cv::Mat dst1(dftImgs[k], cv::Rect(0, 0, x2, y2)); + + src = src0; + + if (dst1.data != src.data) + src.convertTo(dst1, dst1.depth()); + + dft(dftImgs[k], dftImgs[k], 0, dsz.height); + img_dfts.push_back(dftImgs[k].clone()); + } + } + + cv::Mat_ dft_img(img_dfts[0].rows, img_dfts[0].cols, 0.0); + for (size_t k = 0; k < input_imgs.size(); ++k) + { + cv::Mat dftTempl1(dftTempl[k], cv::Rect(0, 0, dftsize.width, dftsize.height)); + if (k == 0) + { + cv::mulSpectrums(img_dfts[k], dftTempl1, dft_img, 0, true); + } + else + { + cv::mulSpectrums(img_dfts[k], dftTempl1, dftImgs[k], 0, true); + dft_img = dft_img + dftImgs[k]; + } + } + + cv::dft(dft_img, dft_img, cv::DFT_INVERSE + cv::DFT_SCALE, bsz.height); + + src = dft_img(cv::Rect(0, 0, bsz.width, bsz.height)); + + src.convertTo(cdst, CV_32F); + + } + + void convolution_fft2(std::vector >& outputs, const std::vector >& input_maps, const std::vector > >& kernels, const std::vector& biases, vector > > >& precomp_dfts) + { + outputs.clear(); + + // Useful precomputed data placeholders for quick correlation (convolution) + vector > input_image_dft; + + for (size_t k = 0; k < kernels.size(); ++k) + { + + // The convolution (with precomputation) + cv::Mat_ output; + convolution_single_kern_fft(input_maps, input_image_dft, kernels[k], precomp_dfts[k], output); + + // Combining the maps + outputs.push_back(output + biases[k]); + + } + } + + void im2col_t(const cv::Mat_& input, int width, int height, cv::Mat_& output) + { + + int m = input.cols; + int n = input.rows; + + // determine how many blocks there will be with a sliding window of width x height in the input + int yB = m - height + 1; + int xB = n - width + 1; + + // Allocate the output size + if (output.rows != width * height && output.cols != xB*yB) + { + output = cv::Mat::ones(width * height, xB*yB, CV_32F); + } + + // Iterate over the whole image + for (int i = 0; i< yB; i++) + { + int rowIdx = i; + for (int j = 0; j< xB; j++) + { + //int rowIdx = i; +j*yB; + // iterate over the blocks within the image + for (unsigned int yy = 0; yy < height; ++yy) + { + // Faster iteration over the image + const float* Mi = input.ptr(j + yy); + for (unsigned int xx = 0; xx < width; ++xx) + { + int colIdx = xx*height + yy; + + output.at(colIdx, rowIdx) = Mi[i + xx]; + } + } + rowIdx += yB; + + } + } + } + + + void convolution_direct_blas(std::vector >& outputs, const std::vector >& input_maps, const cv::Mat_& weight_matrix, const std::vector& biases, int height_k, int width_k) + { + outputs.clear(); + + int height_in = input_maps[0].rows; + int width_n = input_maps[0].cols; + + // determine how many blocks there will be with a sliding window of width x height in the input + int yB = height_in - height_k + 1; + int xB = width_n - width_k + 1; + + cv::Mat_ input_matrix(input_maps.size() * height_k * width_k + 1.0, yB * xB, 1.0f); + + // Comibine im2col accross channels to prepare for matrix multiplication + for (size_t i = 0; i < input_maps.size(); ++i) + { + im2col_t(input_maps[i], width_k, height_k, input_matrix(cv::Rect(0, i * height_k * width_k, yB * xB, height_k * width_k))); + } + + float* m1 = (float*)weight_matrix.data; + float* m2 = (float*)input_matrix.data; + + cv::Mat_ out(weight_matrix.rows, input_matrix.cols, 1.0); + float* m3 = (float*)out.data; + + cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, input_matrix.cols, weight_matrix.rows, weight_matrix.cols, 1, m2, input_matrix.cols, m1, weight_matrix.cols, 0.0, m3, input_matrix.cols); + + // Move back to vectors and reshape accordingly (also add the bias) + for (size_t k = 0; k < out.rows; ++k) + { + outputs.push_back(out.row(k).reshape(1, yB)); + } + + } + +} \ No newline at end of file diff --git a/lib/local/LandmarkDetector/src/FaceDetectorMTCNN.cpp b/lib/local/LandmarkDetector/src/FaceDetectorMTCNN.cpp index 6ca5ea9f..0703010b 100644 --- a/lib/local/LandmarkDetector/src/FaceDetectorMTCNN.cpp +++ b/lib/local/LandmarkDetector/src/FaceDetectorMTCNN.cpp @@ -85,6 +85,9 @@ #include "LandmarkDetectorUtils.h" +// CNN includes +#include "CNN_utils.h" + // OpenBLAS #include #include @@ -153,623 +156,240 @@ CNN::CNN(const CNN& other) : cnn_layer_types(other.cnn_layer_types), cnn_max_poo } } -void PReLU(std::vector >& input_output_maps, cv::Mat_ prelu_weights) -{ - - if (input_output_maps.size() > 1) - { - int h = input_output_maps[0].rows; - int w = input_output_maps[0].cols; - size_t size_in = h * w; - - for (size_t k = 0; k < input_output_maps.size(); ++k) - { - // Apply the PReLU - auto iter = input_output_maps[k].begin(); - - float neg_mult = prelu_weights.at(k); - - for(size_t i = 0; i < size_in; ++i) - { - float in_val = *iter; - - // The prelu step - *iter++ = in_val >= 0 ? in_val : in_val * neg_mult; - - } - } - } - else - { - - int w = input_output_maps[0].cols; - - for (size_t k = 0; k < prelu_weights.rows; ++k) - { - auto iter = input_output_maps[0].row(k).begin(); - float neg_mult = prelu_weights.at(k); - - for (size_t i = 0; i < w; ++i) - { - float in_val = *iter; - // Apply the PReLU - *iter++ = in_val >= 0 ? in_val : in_val * neg_mult; - } - } - - } - -} - -void fully_connected(std::vector >& outputs, const std::vector >& input_maps, cv::Mat_ weights, cv::Mat_ biases) -{ - outputs.clear(); - - if (input_maps.size() > 1) - { - // Concatenate all the maps - cv::Size orig_size = input_maps[0].size(); - cv::Mat_ input_concat(input_maps.size(), input_maps[0].cols * input_maps[0].rows); - - for (size_t in = 0; in < input_maps.size(); ++in) - { - cv::Mat_ add = input_maps[in]; - - // Reshape if all of the data will be flattened - if (input_concat.rows != weights.cols) - { - add = add.t(); - } - - add = add.reshape(0, 1); - add.copyTo(input_concat.row(in)); - } - - // Treat the input as separate feature maps - if (input_concat.rows == weights.cols) - { - input_concat = weights * input_concat; - // Add biases - for (size_t k = 0; k < biases.rows; ++k) - { - input_concat.row(k) = input_concat.row(k) + biases.at(k); - } - - outputs.clear(); - // Resize and add as output - for (size_t k = 0; k < biases.rows; ++k) - { - cv::Mat_ reshaped = input_concat.row(k).clone(); - reshaped = reshaped.reshape(1, orig_size.height); - outputs.push_back(reshaped); - } - } - else - { - // Flatten the input - input_concat = input_concat.reshape(0, input_concat.rows * input_concat.cols); - - input_concat = weights * input_concat + biases; - - outputs.clear(); - outputs.push_back(input_concat); - } - - } - else - { - cv::Mat out = weights * input_maps[0] + biases; - outputs.clear(); - outputs.push_back(out.t()); - } - -} - -void max_pooling(std::vector >& outputs, const std::vector >& input_maps, int stride_x, int stride_y, int kernel_size_x, int kernel_size_y) -{ - vector > outputs_sub; - - // Iterate over kernel height and width, based on stride - for (size_t in = 0; in < input_maps.size(); ++in) - { - // Help with rounding up a bit, to match caffe style output - int out_x = round((double)(input_maps[in].cols - kernel_size_x) / (double)stride_x) + 1; - int out_y = round((double)(input_maps[in].rows - kernel_size_y) / (double)stride_y) + 1; - - cv::Mat_ sub_out(out_y, out_x, 0.0); - cv::Mat_ in_map = input_maps[in]; - - for (int x = 0; x < input_maps[in].cols; x += stride_x) - { - int max_x = cv::min(input_maps[in].cols, x + kernel_size_x); - int x_in_out = floor(x / stride_x); - - if (x_in_out >= out_x) - continue; - - for (int y = 0; y < input_maps[in].rows; y += stride_y) - { - int y_in_out = floor(y / stride_y); - - if (y_in_out >= out_y) - continue; - - int max_y = cv::min(input_maps[in].rows, y + kernel_size_y); - - float curr_max = -FLT_MAX; - - for (int x_in = x; x_in < max_x; ++x_in) - { - for (int y_in = y; y_in < max_y; ++y_in) - { - float curr_val = in_map.at(y_in, x_in); - if (curr_val > curr_max) - { - curr_max = curr_val; - } - } - } - sub_out.at(y_in_out, x_in_out) = curr_max; - } - } - - outputs_sub.push_back(sub_out); - - } - outputs = outputs_sub; - -} - //////////////////////////////////////////////////////////////////////////////////////////////////////// -void convolution_single_kern_fft(const vector >& input_imgs, vector >& img_dfts, const vector >& _templs, map > >& _templ_dfts, cv::Mat_& result) -{ - // Assume result is defined properly - if (result.empty()) - { - cv::Size corrSize(input_imgs[0].cols - _templs[0].cols + 1, input_imgs[0].rows - _templs[0].rows + 1); - result.create(corrSize); - } - // Our model will always be under min block size so can ignore this - //const double blockScale = 4.5; - //const int minBlockSize = 256; - - int maxDepth = CV_64F; - - cv::Size dftsize; - - dftsize.width = cv::getOptimalDFTSize(result.cols + _templs[0].cols - 1); - dftsize.height = cv::getOptimalDFTSize(result.rows + _templs[0].rows - 1); - - // Compute block size - cv::Size blocksize; - blocksize.width = dftsize.width - _templs[0].cols + 1; - blocksize.width = MIN(blocksize.width, result.cols); - blocksize.height = dftsize.height - _templs[0].rows + 1; - blocksize.height = MIN(blocksize.height, result.rows); - - vector> dftTempl; - - // if this has not been precomputed, precompute it, otherwise use it - if (_templ_dfts.find(dftsize.width) == _templ_dfts.end()) - { - dftTempl.resize(_templs.size()); - for (size_t k = 0; k < _templs.size(); ++k) - { - dftTempl[k].create(dftsize.height, dftsize.width); - - cv::Mat_ src = _templs[k]; - - cv::Mat_ dst(dftTempl[k], cv::Rect(0, 0, dftsize.width, dftsize.height)); - - cv::Mat_ dst1(dftTempl[k], cv::Rect(0, 0, _templs[k].cols, _templs[k].rows)); - - if (dst1.data != src.data) - src.convertTo(dst1, dst1.depth()); - - if (dst.cols > _templs[k].cols) - { - cv::Mat_ part(dst, cv::Range(0, _templs[k].rows), cv::Range(_templs[k].cols, dst.cols)); - part.setTo(0); - } - - // Perform DFT of the template - dft(dst, dst, 0, _templs[k].rows); - - } - _templ_dfts[dftsize.width] = dftTempl; - - } - else - { - dftTempl = _templ_dfts[dftsize.width]; - } - - cv::Size bsz(std::min(blocksize.width, result.cols), std::min(blocksize.height, result.rows)); - cv::Mat src; - - cv::Mat cdst(result, cv::Rect(0, 0, bsz.width, bsz.height)); - - vector > dftImgs; - dftImgs.resize(input_imgs.size()); - - if (img_dfts.empty()) - { - for(size_t k = 0; k < input_imgs.size(); ++k) - { - dftImgs[k].create(dftsize); - dftImgs[k].setTo(0.0); - - cv::Size dsz(bsz.width + _templs[k].cols - 1, bsz.height + _templs[k].rows - 1); - - int x2 = std::min(input_imgs[k].cols, dsz.width); - int y2 = std::min(input_imgs[k].rows, dsz.height); - - cv::Mat src0(input_imgs[k], cv::Range(0, y2), cv::Range(0, x2)); - cv::Mat dst(dftImgs[k], cv::Rect(0, 0, dsz.width, dsz.height)); - cv::Mat dst1(dftImgs[k], cv::Rect(0, 0, x2, y2)); - - src = src0; - - if (dst1.data != src.data) - src.convertTo(dst1, dst1.depth()); - - dft(dftImgs[k], dftImgs[k], 0, dsz.height); - img_dfts.push_back(dftImgs[k].clone()); - } - } - - cv::Mat_ dft_img(img_dfts[0].rows, img_dfts[0].cols, 0.0); - for (size_t k = 0; k < input_imgs.size(); ++k) - { - cv::Mat dftTempl1(dftTempl[k], cv::Rect(0, 0, dftsize.width, dftsize.height)); - if (k == 0) - { - cv::mulSpectrums(img_dfts[k], dftTempl1, dft_img, 0, true); - } - else - { - cv::mulSpectrums(img_dfts[k], dftTempl1, dftImgs[k], 0, true); - dft_img = dft_img + dftImgs[k]; - } - } - - cv::dft(dft_img, dft_img, cv::DFT_INVERSE + cv::DFT_SCALE, bsz.height); - - src = dft_img(cv::Rect(0, 0, bsz.width, bsz.height)); - - src.convertTo(cdst, CV_32F); - -} - -void im2col_t(const cv::Mat_& input, int width, int height, cv::Mat_& output) -{ - - int m = input.cols; - int n = input.rows; - - // determine how many blocks there will be with a sliding window of width x height in the input - int yB = m - height + 1; - int xB = n - width + 1; - - // Allocate the output size - if (output.rows != width * height && output.cols != xB*yB) - { - output = cv::Mat::ones(width * height, xB*yB, CV_32F); - } - - // Iterate over the whole image - for (int i = 0; i< yB; i++) - { - int rowIdx = i; - for (int j = 0; j< xB; j++) - { - //int rowIdx = i; +j*yB; - // iterate over the blocks within the image - for (unsigned int yy = 0; yy < height; ++yy) - { - // Faster iteration over the image - const float* Mi = input.ptr(j + yy); - for (unsigned int xx = 0; xx < width; ++xx) - { - int colIdx = xx*height + yy; - - output.at(colIdx, rowIdx) = Mi[i + xx]; - } - } - rowIdx += yB; - - } - } -} - -void im2col_multimap(const vector >& inputs, int width, int height, cv::Mat_& output) -{ - - int m = inputs[0].rows; - int n = inputs[0].cols; - - // determine how many blocks there will be with a sliding window of width x height in the input - int yB = m - height + 1; - int xB = n - width + 1; - - int stride = height * width; - - size_t num_maps = inputs.size(); - - // Allocate the output size - if (output.cols != width * height * inputs.size() + 1 && output.rows != xB*yB) - { - output = cv::Mat::ones(xB*yB, width * height * num_maps + 1, CV_32F); - } - - // Iterate over the whole image - for (int i = 0; i< yB; i++) - { - int rowIdx = i*xB; - for (int j = 0; j< xB; j++) - { - - float* Mo = output.ptr(rowIdx); - - // iterate over the blocks within the image - for (unsigned int yy = 0; yy < height; ++yy) - { - for (unsigned int in_maps = 0; in_maps < num_maps; ++in_maps) - { - // Faster iteration over the image - const float* Mi = inputs[in_maps].ptr(i + yy); - - for (unsigned int xx = 0; xx < width; ++xx) - { - int colIdx = xx*height + yy + in_maps * stride; - //output.at(rowIdx, colIdx) = Mi[j + xx]; //input.at(i + yy, j + xx); - Mo[colIdx] = Mi[j + xx]; - } - } - } - rowIdx++; - - } - } -} - - -void im2col(const cv::Mat_& input, int width, int height, cv::Mat_& output) -{ - - int m = input.rows; - int n = input.cols; - - // determine how many blocks there will be with a sliding window of width x height in the input - int yB = m - height + 1; - int xB = n - width + 1; - - // Allocate the output size - if (output.cols != width * height && output.rows != xB*yB) - { - output = cv::Mat::ones(xB*yB, width * height, CV_32F); - } - - // Iterate over the whole image - for (int i = 0; i< yB; i++) - { - int rowIdx = i*xB; - for (int j = 0; j< xB; j++) - { - - float* Mo = output.ptr(rowIdx); - - // iterate over the blocks within the image - for (unsigned int yy = 0; yy < height; ++yy) - { - // Faster iteration over the image - const float* Mi = input.ptr(i + yy); - - for (unsigned int xx = 0; xx < width; ++xx) - { - int colIdx = xx*height + yy; - //output.at(rowIdx, colIdx) = Mi[j + xx]; //input.at(i + yy, j + xx); - Mo[colIdx] = Mi[j + xx]; - } - } - rowIdx++; - - } - } -} - -void convolution_direct(std::vector >& outputs, const std::vector >& input_maps, const cv::Mat_& weight_matrix, const std::vector& biases, int height_k, int width_k) -{ - outputs.clear(); - - int height_in = input_maps[0].rows; - int width_n = input_maps[0].cols; - - // determine how many blocks there will be with a sliding window of width x height in the input - int yB = height_in - height_k + 1; - int xB = width_n - width_k + 1; - - cv::Mat_ input_matrix(input_maps.size() * height_k * width_k + 1.0, yB * xB, 1.0f); - - // Comibine im2col accross channels to prepare for matrix multiplication - for (size_t i = 0; i < input_maps.size(); ++i) - { - im2col_t(input_maps[i], width_k, height_k, input_matrix(cv::Rect(0, i * height_k * width_k, yB * xB, height_k * width_k))); - } - - // Actual convolution (through multiplication) - cv::Mat_ out = weight_matrix * input_matrix; - - // Move back to vectors and reshape accordingly (also add the bias) - for (size_t k = 0; k < out.rows; ++k) - { - outputs.push_back(out.row(k).reshape(1, yB)); - } - -} - -void convolution_direct_blas(std::vector >& outputs, const std::vector >& input_maps, const cv::Mat_& weight_matrix, const std::vector& biases, int height_k, int width_k) -{ - outputs.clear(); - - int height_in = input_maps[0].rows; - int width_n = input_maps[0].cols; - - // determine how many blocks there will be with a sliding window of width x height in the input - int yB = height_in - height_k + 1; - int xB = width_n - width_k + 1; - - cv::Mat_ input_matrix(input_maps.size() * height_k * width_k + 1.0, yB * xB, 1.0f); - - // Comibine im2col accross channels to prepare for matrix multiplication - for (size_t i = 0; i < input_maps.size(); ++i) - { - im2col_t(input_maps[i], width_k, height_k, input_matrix(cv::Rect(0, i * height_k * width_k, yB * xB, height_k * width_k))); - } - - float* m1 = (float*)weight_matrix.data; - float* m2 = (float*)input_matrix.data; - - cv::Mat_ out(weight_matrix.rows, input_matrix.cols, 1.0); - float* m3 = (float*)out.data; - - cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, input_matrix.cols, weight_matrix.rows, weight_matrix.cols, 1, m2, input_matrix.cols, m1, weight_matrix.cols, 0.0, m3, input_matrix.cols); - - // Move back to vectors and reshape accordingly (also add the bias) - for (size_t k = 0; k < out.rows; ++k) - { - outputs.push_back(out.row(k).reshape(1, yB)); - } - -} - -void convolution_direct_blas2(std::vector >& outputs, const std::vector >& input_maps, const cv::Mat_& weight_matrix, const std::vector& biases, int height_k, int width_k) -{ - outputs.clear(); - - int height_in = input_maps[0].rows; - int width_n = input_maps[0].cols; - - // determine how many blocks there will be with a sliding window of width x height in the input - int yB = height_in - height_k + 1; - int xB = width_n - width_k + 1; - - // TODO this could (should) be pre-allocated - cv::Mat_ input_matrix(input_maps.size() * height_k * width_k + 1.0, yB * xB, 1.0f); - //cv::Mat_ input_matrix_t(yB * xB, input_maps.size() * height_k * width_k + 1.0, 1.0f); - - // Comibine im2col accross channels to prepare for matrix multiplication - for (size_t i = 0; i < input_maps.size(); ++i) - { - im2col_t(input_maps[i], width_k, height_k, input_matrix(cv::Rect(0, i * height_k * width_k, yB * xB, height_k * width_k))); - //im2col(input_maps[i], width_k, height_k, input_matrix_t(cv::Rect(i * height_k * width_k, 0, height_k * width_k, yB * xB))); - - } - - cv::Mat_ input_matrix_mm; - im2col_multimap(input_maps, width_k, height_k, input_matrix_mm); - //input_matrix_mm = input_matrix_mm.t(); - - float* m1 = (float*)weight_matrix.data; - float* m2 = (float*)input_matrix.data; - - cv::Mat_ out(weight_matrix.rows, input_matrix.cols, 1.0); - float* m3 = (float*)out.data; - - cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, input_matrix.cols, weight_matrix.rows, weight_matrix.cols, 1, m2, input_matrix.cols, m1, weight_matrix.cols, 0.0, m3, input_matrix.cols); - - cv::Mat_ out2(weight_matrix.rows, input_matrix.cols, 1.0); - float* m31 = (float*)out2.data; - - float* m21 = (float*)input_matrix_mm.data; - cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, input_matrix_mm.cols, weight_matrix.rows, weight_matrix.cols, 1, m21, input_matrix_mm.cols, m1, weight_matrix.cols, 0.0, m31, input_matrix_mm.cols); - // TOODO call fortran directly - //sgemm_("N","N", ) - cout << cv::mean(cv::abs(out - out2))[0] << endl; - - // Move back to vectors and reshape accordingly (also add the bias) - for (size_t k = 0; k < out.rows; ++k) - { - outputs.push_back(out.row(k).reshape(1, yB)); - } - -} - -void convolution_fft2(std::vector >& outputs, const std::vector >& input_maps, const std::vector > >& kernels, const std::vector& biases, vector > > >& precomp_dfts) -{ - outputs.clear(); - - // Useful precomputed data placeholders for quick correlation (convolution) - vector > input_image_dft; - - for (size_t k = 0; k < kernels.size(); ++k) - { - - // The convolution (with precomputation) - cv::Mat_ output; - convolution_single_kern_fft(input_maps, input_image_dft, kernels[k], precomp_dfts[k], output); - - // Combining the maps - outputs.push_back(output + biases[k]); - - } -} - -void convolution_fft(std::vector >& outputs, const std::vector >& input_maps, const std::vector > >& kernels, const std::vector& biases, vector > > >& precomp_dfts) -{ - outputs.clear(); - for (size_t in = 0; in < input_maps.size(); ++in) - { - cv::Mat_ input_image = input_maps[in]; - - // Useful precomputed data placeholders for quick correlation (convolution) - cv::Mat_ input_image_dft; - cv::Mat integral_image; - cv::Mat integral_image_sq; - - for (size_t k = 0; k < kernels[in].size(); ++k) - { - cv::Mat_ kernel = kernels[in][k]; - - // The convolution (with precomputation) - cv::Mat_ output; - if (precomp_dfts[in][k].second.empty()) - { - std::map > precomputed_dft; - - LandmarkDetector::matchTemplate_m(input_image, input_image_dft, integral_image, integral_image_sq, kernel, precomputed_dft, output, CV_TM_CCORR); - - precomp_dfts[in][k].first = precomputed_dft.begin()->first; - precomp_dfts[in][k].second = precomputed_dft.begin()->second; - } - else - { - std::map > precomputed_dft; - precomputed_dft[precomp_dfts[in][k].first] = precomp_dfts[in][k].second; - LandmarkDetector::matchTemplate_m(input_image, input_image_dft, integral_image, integral_image_sq, kernel, precomputed_dft, output, CV_TM_CCORR); - } - - // Combining the maps - if (in == 0) - { - outputs.push_back(output); - } - else - { - outputs[k] = outputs[k] + output; - } - - } - - } - - for (size_t k = 0; k < biases.size(); ++k) - { - outputs[k] = outputs[k] + biases[k]; - } -} +//void im2col_multimap(const vector >& inputs, int width, int height, cv::Mat_& output) +//{ +// +// int m = inputs[0].rows; +// int n = inputs[0].cols; +// +// // determine how many blocks there will be with a sliding window of width x height in the input +// int yB = m - height + 1; +// int xB = n - width + 1; +// +// int stride = height * width; +// +// size_t num_maps = inputs.size(); +// +// // Allocate the output size +// if (output.cols != width * height * inputs.size() + 1 && output.rows != xB*yB) +// { +// output = cv::Mat::ones(xB*yB, width * height * num_maps + 1, CV_32F); +// } +// +// // Iterate over the whole image +// for (int i = 0; i< yB; i++) +// { +// int rowIdx = i*xB; +// for (int j = 0; j< xB; j++) +// { +// +// float* Mo = output.ptr(rowIdx); +// +// // iterate over the blocks within the image +// for (unsigned int yy = 0; yy < height; ++yy) +// { +// for (unsigned int in_maps = 0; in_maps < num_maps; ++in_maps) +// { +// // Faster iteration over the image +// const float* Mi = inputs[in_maps].ptr(i + yy); +// +// for (unsigned int xx = 0; xx < width; ++xx) +// { +// int colIdx = xx*height + yy + in_maps * stride; +// //output.at(rowIdx, colIdx) = Mi[j + xx]; //input.at(i + yy, j + xx); +// Mo[colIdx] = Mi[j + xx]; +// } +// } +// } +// rowIdx++; +// +// } +// } +//} +// +//void im2col(const cv::Mat_& input, int width, int height, cv::Mat_& output) +//{ +// +// int m = input.rows; +// int n = input.cols; +// +// // determine how many blocks there will be with a sliding window of width x height in the input +// int yB = m - height + 1; +// int xB = n - width + 1; +// +// // Allocate the output size +// if (output.cols != width * height && output.rows != xB*yB) +// { +// output = cv::Mat::ones(xB*yB, width * height, CV_32F); +// } +// +// // Iterate over the whole image +// for (int i = 0; i< yB; i++) +// { +// int rowIdx = i*xB; +// for (int j = 0; j< xB; j++) +// { +// +// float* Mo = output.ptr(rowIdx); +// +// // iterate over the blocks within the image +// for (unsigned int yy = 0; yy < height; ++yy) +// { +// // Faster iteration over the image +// const float* Mi = input.ptr(i + yy); +// +// for (unsigned int xx = 0; xx < width; ++xx) +// { +// int colIdx = xx*height + yy; +// //output.at(rowIdx, colIdx) = Mi[j + xx]; //input.at(i + yy, j + xx); +// Mo[colIdx] = Mi[j + xx]; +// } +// } +// rowIdx++; +// +// } +// } +//} +// +//void convolution_direct(std::vector >& outputs, const std::vector >& input_maps, const cv::Mat_& weight_matrix, const std::vector& biases, int height_k, int width_k) +//{ +// outputs.clear(); +// +// int height_in = input_maps[0].rows; +// int width_n = input_maps[0].cols; +// +// // determine how many blocks there will be with a sliding window of width x height in the input +// int yB = height_in - height_k + 1; +// int xB = width_n - width_k + 1; +// +// cv::Mat_ input_matrix(input_maps.size() * height_k * width_k + 1.0, yB * xB, 1.0f); +// +// // Comibine im2col accross channels to prepare for matrix multiplication +// for (size_t i = 0; i < input_maps.size(); ++i) +// { +// im2col_t(input_maps[i], width_k, height_k, input_matrix(cv::Rect(0, i * height_k * width_k, yB * xB, height_k * width_k))); +// } +// +// // Actual convolution (through multiplication) +// cv::Mat_ out = weight_matrix * input_matrix; +// +// // Move back to vectors and reshape accordingly (also add the bias) +// for (size_t k = 0; k < out.rows; ++k) +// { +// outputs.push_back(out.row(k).reshape(1, yB)); +// } +// +//} +// +// +//void convolution_direct_blas2(std::vector >& outputs, const std::vector >& input_maps, const cv::Mat_& weight_matrix, const std::vector& biases, int height_k, int width_k) +//{ +// outputs.clear(); +// +// int height_in = input_maps[0].rows; +// int width_n = input_maps[0].cols; +// +// // determine how many blocks there will be with a sliding window of width x height in the input +// int yB = height_in - height_k + 1; +// int xB = width_n - width_k + 1; +// +// // TODO this could (should) be pre-allocated +// cv::Mat_ input_matrix(input_maps.size() * height_k * width_k + 1.0, yB * xB, 1.0f); +// //cv::Mat_ input_matrix_t(yB * xB, input_maps.size() * height_k * width_k + 1.0, 1.0f); +// +// // Comibine im2col accross channels to prepare for matrix multiplication +// for (size_t i = 0; i < input_maps.size(); ++i) +// { +// im2col_t(input_maps[i], width_k, height_k, input_matrix(cv::Rect(0, i * height_k * width_k, yB * xB, height_k * width_k))); +// //im2col(input_maps[i], width_k, height_k, input_matrix_t(cv::Rect(i * height_k * width_k, 0, height_k * width_k, yB * xB))); +// +// } +// +// cv::Mat_ input_matrix_mm; +// im2col_multimap(input_maps, width_k, height_k, input_matrix_mm); +// //input_matrix_mm = input_matrix_mm.t(); +// +// float* m1 = (float*)weight_matrix.data; +// float* m2 = (float*)input_matrix.data; +// +// cv::Mat_ out(weight_matrix.rows, input_matrix.cols, 1.0); +// float* m3 = (float*)out.data; +// +// cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, input_matrix.cols, weight_matrix.rows, weight_matrix.cols, 1, m2, input_matrix.cols, m1, weight_matrix.cols, 0.0, m3, input_matrix.cols); +// +// cv::Mat_ out2(weight_matrix.rows, input_matrix.cols, 1.0); +// float* m31 = (float*)out2.data; +// +// float* m21 = (float*)input_matrix_mm.data; +// cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, input_matrix_mm.cols, weight_matrix.rows, weight_matrix.cols, 1, m21, input_matrix_mm.cols, m1, weight_matrix.cols, 0.0, m31, input_matrix_mm.cols); +// // TOODO call fortran directly +// //sgemm_("N","N", ) +// cout << cv::mean(cv::abs(out - out2))[0] << endl; +// +// // Move back to vectors and reshape accordingly (also add the bias) +// for (size_t k = 0; k < out.rows; ++k) +// { +// outputs.push_back(out.row(k).reshape(1, yB)); +// } +// +//} +// +//void convolution_fft(std::vector >& outputs, const std::vector >& input_maps, const std::vector > >& kernels, const std::vector& biases, vector > > >& precomp_dfts) +//{ +// outputs.clear(); +// for (size_t in = 0; in < input_maps.size(); ++in) +// { +// cv::Mat_ input_image = input_maps[in]; +// +// // Useful precomputed data placeholders for quick correlation (convolution) +// cv::Mat_ input_image_dft; +// cv::Mat integral_image; +// cv::Mat integral_image_sq; +// +// for (size_t k = 0; k < kernels[in].size(); ++k) +// { +// cv::Mat_ kernel = kernels[in][k]; +// +// // The convolution (with precomputation) +// cv::Mat_ output; +// if (precomp_dfts[in][k].second.empty()) +// { +// std::map > precomputed_dft; +// +// LandmarkDetector::matchTemplate_m(input_image, input_image_dft, integral_image, integral_image_sq, kernel, precomputed_dft, output, CV_TM_CCORR); +// +// precomp_dfts[in][k].first = precomputed_dft.begin()->first; +// precomp_dfts[in][k].second = precomputed_dft.begin()->second; +// } +// else +// { +// std::map > precomputed_dft; +// precomputed_dft[precomp_dfts[in][k].first] = precomp_dfts[in][k].second; +// LandmarkDetector::matchTemplate_m(input_image, input_image_dft, integral_image, integral_image_sq, kernel, precomputed_dft, output, CV_TM_CCORR); +// } +// +// // Combining the maps +// if (in == 0) +// { +// outputs.push_back(output); +// } +// else +// { +// outputs[k] = outputs[k] + output; +// } +// +// } +// +// } +// +// for (size_t k = 0; k < biases.size(); ++k) +// { +// outputs[k] = outputs[k] + biases[k]; +// } +//} std::vector> CNN::Inference(const cv::Mat& input_img, bool direct) {