From b71b2f1cfc2a2c1c973c3e03ef249905cc9082fa Mon Sep 17 00:00:00 2001
From: Tadas Baltrusaitis <tadyla@gmail.com>
Date: Wed, 30 Aug 2017 15:20:56 +0100
Subject: [PATCH] Another performance improvement by rearranging GEMM order.

---
 .gitignore                                    |   1 +
 .../LandmarkDetector/LandmarkDetector.vcxproj |   2 +
 .../LandmarkDetector.vcxproj.filters          |  96 +-
 .../LandmarkDetector/include/CNN_utils.h      |  64 ++
 .../LandmarkDetector/src/CEN_patch_expert.cpp | 159 +---
 lib/local/LandmarkDetector/src/CNN_utils.cpp  | 434 +++++++++
 .../src/FaceDetectorMTCNN.cpp                 | 848 +++++-------------
 7 files changed, 810 insertions(+), 794 deletions(-)
 create mode 100644 lib/local/LandmarkDetector/include/CNN_utils.h
 create mode 100644 lib/local/LandmarkDetector/src/CNN_utils.cpp
diff --git a/.gitignore b/.gitignore
index 4826b944..d886c992 100644
--- a/.gitignore
+++ b/.gitignore
@@ -53,3 +53,4 @@ matlab_runners/Demos/demo_img/
 matlab_runners/Demos/demo_vid/
 matlab_runners/Demos/output_features_seq/
 matlab_runners/Demos/output_features_vid/
+exe/FeatureExtraction/experiments/
diff --git a/lib/local/LandmarkDetector/LandmarkDetector.vcxproj b/lib/local/LandmarkDetector/LandmarkDetector.vcxproj
index c86c51f4..017aa60e 100644
--- a/lib/local/LandmarkDetector/LandmarkDetector.vcxproj
+++ b/lib/local/LandmarkDetector/LandmarkDetector.vcxproj
@@ -195,6 +195,7 @@ xcopy /I /E /Y /D "$(SolutionDir)lib\3rdParty\OpenCV3.1\classifiers" "$(OutDir)c
       <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Use</PrecompiledHeader>
     </ClCompile>
     <ClCompile Include="src\CEN_patch_expert.cpp" />
+    <ClCompile Include="src\CNN_utils.cpp" />
     <ClCompile Include="src\FaceDetectorMTCNN.cpp" />
     <ClCompile Include="src\LandmarkDetectorModel.cpp">
       <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Use</PrecompiledHeader>
@@ -255,6 +256,7 @@ xcopy /I /E /Y /D "$(SolutionDir)lib\3rdParty\OpenCV3.1\classifiers" "$(OutDir)c
   <ItemGroup>
     <ClInclude Include="include\CCNF_patch_expert.h" />
     <ClInclude Include="include\CEN_patch_expert.h" />
+    <ClInclude Include="include\CNN_utils.h" />
     <ClInclude Include="include\FaceDetectorMTCNN.h" />
     <ClInclude Include="include\LandmarkDetectorModel.h" />
     <ClInclude Include="include\LandmarkDetectorParameters.h" />
diff --git a/lib/local/LandmarkDetector/LandmarkDetector.vcxproj.filters b/lib/local/LandmarkDetector/LandmarkDetector.vcxproj.filters
index 07e70f23..fdef973c 100644
--- a/lib/local/LandmarkDetector/LandmarkDetector.vcxproj.filters
+++ b/lib/local/LandmarkDetector/LandmarkDetector.vcxproj.filters
@@ -4,6 +4,30 @@
     <ClCompile Include="src\CCNF_patch_expert.cpp">
       <Filter>source</Filter>
     </ClCompile>
+    <ClCompile Include="src\CEN_patch_expert.cpp">
+      <Filter>source</Filter>
+    </ClCompile>
+    <ClCompile Include="src\CNN_utils.cpp">
+      <Filter>source</Filter>
+    </ClCompile>
+    <ClCompile Include="src\FaceDetectorMTCNN.cpp">
+      <Filter>source</Filter>
+    </ClCompile>
+    <ClCompile Include="src\LandmarkDetectionValidator.cpp">
+      <Filter>source</Filter>
+    </ClCompile>
+    <ClCompile Include="src\LandmarkDetectorFunc.cpp">
+      <Filter>source</Filter>
+    </ClCompile>
+    <ClCompile Include="src\LandmarkDetectorModel.cpp">
+      <Filter>source</Filter>
+    </ClCompile>
+    <ClCompile Include="src\LandmarkDetectorParameters.cpp">
+      <Filter>source</Filter>
+    </ClCompile>
+    <ClCompile Include="src\LandmarkDetectorUtils.cpp">
+      <Filter>source</Filter>
+    </ClCompile>
     <ClCompile Include="src\Patch_experts.cpp">
       <Filter>source</Filter>
     </ClCompile>
@@ -19,30 +43,38 @@
     <ClCompile Include="src\SVR_patch_expert.cpp">
       <Filter>source</Filter>
     </ClCompile>
-    <ClCompile Include="src\LandmarkDetectorModel.cpp">
-      <Filter>source</Filter>
-    </ClCompile>
-    <ClCompile Include="src\LandmarkDetectorUtils.cpp">
-      <Filter>source</Filter>
-    </ClCompile>
-    <ClCompile Include="src\LandmarkDetectionValidator.cpp">
-      <Filter>source</Filter>
-    </ClCompile>
-    <ClCompile Include="src\LandmarkDetectorFunc.cpp">
-      <Filter>source</Filter>
-    </ClCompile>
-    <ClCompile Include="src\LandmarkDetectorParameters.cpp">
-      <Filter>source</Filter>
-    </ClCompile>
-    <ClCompile Include="src\CEN_patch_expert.cpp">
-    <ClCompile Include="src\FaceDetectorMTCNN.cpp">
-      <Filter>source</Filter>
-    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="include\CCNF_patch_expert.h">
       <Filter>headers</Filter>
     </ClInclude>
+    <ClInclude Include="include\CEN_patch_expert.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="include\CNN_utils.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="include\FaceDetectorMTCNN.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="include\LandmarkCoreIncludes.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="include\LandmarkDetectionValidator.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="include\LandmarkDetectorFunc.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="include\LandmarkDetectorModel.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="include\LandmarkDetectorParameters.h">
+      <Filter>headers</Filter>
+    </ClInclude>
+    <ClInclude Include="include\LandmarkDetectorUtils.h">
+      <Filter>headers</Filter>
+    </ClInclude>
     <ClInclude Include="include\Patch_experts.h">
       <Filter>headers</Filter>
     </ClInclude>
@@ -58,35 +90,13 @@
     <ClInclude Include="include\SVR_patch_expert.h">
       <Filter>headers</Filter>
     </ClInclude>
-    <ClInclude Include="include\LandmarkDetectorParameters.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-    <ClInclude Include="include\LandmarkDetectorUtils.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-    <ClInclude Include="include\LandmarkDetectorModel.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-    <ClInclude Include="include\LandmarkDetectionValidator.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-    <ClInclude Include="include\LandmarkCoreIncludes.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-    <ClInclude Include="include\LandmarkDetectorFunc.h">
-      <Filter>headers</Filter>
-    </ClInclude>
-    <ClInclude Include="include\CEN_patch_expert.h">
-    <ClInclude Include="include\FaceDetectorMTCNN.h">
-      <Filter>headers</Filter>
-    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <Filter Include="headers">
-      <UniqueIdentifier>{75157eb9-2d21-482a-a1e2-e3e4100fffd7}</UniqueIdentifier>
+      <UniqueIdentifier>{4c468912-7bbd-415e-a34e-3d8470740945}</UniqueIdentifier>
     </Filter>
     <Filter Include="source">
-      <UniqueIdentifier>{f50f893d-a6b2-4e45-8f94-529042834e49}</UniqueIdentifier>
+      <UniqueIdentifier>{55e469df-489d-4acb-86cf-bd2d915e69ad}</UniqueIdentifier>
     </Filter>
   </ItemGroup>
 </Project>
\ No newline at end of file
diff --git a/lib/local/LandmarkDetector/include/CNN_utils.h b/lib/local/LandmarkDetector/include/CNN_utils.h
new file mode 100644
index 00000000..80db6e10
--- /dev/null
+++ b/lib/local/LandmarkDetector/include/CNN_utils.h
@@ -0,0 +1,64 @@
+///////////////////////////////////////////////////////////////////////////////
+// Copyright (C) 2017, Tadas Baltrusaitis, all rights reserved.
+//
+// ACADEMIC OR NON-PROFIT ORGANIZATION NONCOMMERCIAL RESEARCH USE ONLY
+//
+// BY USING OR DOWNLOADING THE SOFTWARE, YOU ARE AGREEING TO THE TERMS OF THIS LICENSE AGREEMENT.  
+// IF YOU DO NOT AGREE WITH THESE TERMS, YOU MAY NOT USE OR DOWNLOAD THE SOFTWARE.
+//
+// License can be found in OpenFace-license.txt
+//
+//     * Any publications arising from the use of this software, including but
+//       not limited to academic journal and conference publications, technical
+//       reports and manuals, must cite at least one of the following works:
+//
+//       OpenFace: an open source facial behavior analysis toolkit
+//       Tadas Baltrušaitis, Peter Robinson, and Louis-Philippe Morency
+//       in IEEE Winter Conference on Applications of Computer Vision, 2016  
+//
+//       Rendering of Eyes for Eye-Shape Registration and Gaze Estimation
+//       Erroll Wood, Tadas Baltrušaitis, Xucong Zhang, Yusuke Sugano, Peter Robinson, and Andreas Bulling 
+//       in IEEE International. Conference on Computer Vision (ICCV),  2015 
+//
+//       Cross-dataset learning and person-speci?c normalisation for automatic Action Unit detection
+//       Tadas Baltrušaitis, Marwa Mahmoud, and Peter Robinson 
+//       in Facial Expression Recognition and Analysis Challenge, 
+//       IEEE International Conference on Automatic Face and Gesture Recognition, 2015 
+//
+//       Constrained Local Neural Fields for robust facial landmark detection in the wild.
+//       Tadas Baltrušaitis, Peter Robinson, and Louis-Philippe Morency. 
+//       in IEEE Int. Conference on Computer Vision Workshops, 300 Faces in-the-Wild Challenge, 2013.    
+//
+///////////////////////////////////////////////////////////////////////////////
+
+//  Header for all external CLNF/CLM-Z/CLM methods of interest to the user
+#ifndef __CNN_UTILS_h_
+#define __CNN_UTILS_h_
+
+// OpenCV includes
+#include <opencv2/core/core.hpp>
+
+using namespace std;
+
+namespace LandmarkDetector
+{
+	//===========================================================================	
+	// Various CNN layers
+
+	// Parametric ReLU with leaky weights (separate ones per channel)
+	void PReLU(std::vector<cv::Mat_<float> >& input_output_maps, cv::Mat_<float> prelu_weights);
+
+	// The fully connected layer
+	void fully_connected(std::vector<cv::Mat_<float> >& outputs, const std::vector<cv::Mat_<float> >& input_maps, cv::Mat_<float> weights, cv::Mat_<float> biases);
+
+	// Max pooling layer with parametrized stride and kernel sizes
+	void max_pooling(std::vector<cv::Mat_<float> >& outputs, const std::vector<cv::Mat_<float> >& input_maps, int stride_x, int stride_y, int kernel_size_x, int kernel_size_y);
+
+	// Convolution using FFT optimization rather than matrix multiplication
+	void convolution_fft2(std::vector<cv::Mat_<float> >& outputs, const std::vector<cv::Mat_<float> >& input_maps, const std::vector<std::vector<cv::Mat_<float> > >& kernels, const std::vector<float >& biases, vector<map<int, vector<cv::Mat_<double> > > >& precomp_dfts);
+	
+	// Convolution using matrix multiplication and OpenBLAS optimization
+	void convolution_direct_blas(std::vector<cv::Mat_<float> >& outputs, const std::vector<cv::Mat_<float> >& input_maps, const cv::Mat_<float>& weight_matrix, const std::vector<float >& biases, int height_k, int width_k);
+
+}
+#endif
diff --git a/lib/local/LandmarkDetector/src/CEN_patch_expert.cpp b/lib/local/LandmarkDetector/src/CEN_patch_expert.cpp
index 0e6c419c..65ee0e08 100644
--- a/lib/local/LandmarkDetector/src/CEN_patch_expert.cpp
+++ b/lib/local/LandmarkDetector/src/CEN_patch_expert.cpp
@@ -134,7 +134,7 @@ void CEN_patch_expert::Read(ifstream &stream)
 		cv::Mat_<double> weight;
 		LandmarkDetector::ReadMatBin(stream, weight);
 
-		weights[i] = weight;
+		weights[i] = weight.t();
 		biases[i] = bias;
 	}
 
@@ -251,6 +251,8 @@ void CEN_patch_expert::Response(const cv::Mat_<float> &area_of_interest, cv::Mat
 		float beta = 0.0;
 		sgemm_("N", "N", &weights[layer].cols, &response.rows, &response.cols, &alpha, m2, &weights[layer].cols, m1, &response.cols, &beta, m3, &weights[layer].cols);
 
+		// TODO correct this with weight transpose
+
 		response = resp_blas;
 
 		// TODO bias could be pre-allocated to the window size so that addition could be quicker
@@ -406,126 +408,6 @@ void im2colBiasSparse(const cv::Mat_<float>& input, int width, int height, cv::M
 	}
 }
 
-//===========================================================================
-//void CEN_patch_expert::ResponseSparse_ocv(const cv::Mat_<float> &area_of_interest, cv::Mat_<float> &response)
-//{
-//
-//	int response_height = area_of_interest.rows - height + 1;
-//	int response_width = area_of_interest.cols - width + 1;
-//
-//	cv::Mat_<float> input_col;
-//	// Extract im2col but in a sparse way
-//	im2colBiasSparse(area_of_interest, width, height, input_col);
-//
-//	// Mean and standard deviation normalization
-//	contrastNorm(input_col, response);
-//
-//	cv::Mat_<float> response_blas = response.clone();
-//
-//	for (size_t layer = 0; layer < activation_function.size(); ++layer)
-//	{
-//
-//		// We are performing response = response * weights[layers], but in OpenBLAS as that is significantly quicker than OpenCV
-//		response_blas = response.clone();
-//
-//		float* m1 = (float*)response_blas.data;
-//		float* m2 = (float*)weights[layer].data;
-//
-//		cv::Mat_<float> resp_blas(response_blas.rows, weights[layer].cols, 1.0);
-//		float* m3 = (float*)resp_blas.data;
-//
-//		cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, weights[layer].cols, response.rows, response.cols, 1, m2, weights[layer].cols, m1, response.cols, 0.0, m3, weights[layer].cols);
-//
-//		response = resp_blas;
-//
-//		// TODO bias could be pre-allocated to the window size so that addition could be quicker
-//		for (size_t y = 0; y < response.rows; ++y)
-//		{
-//			response(cv::Rect(0, y, response.cols, 1)) = response(cv::Rect(0, y, response.cols, 1)) + biases[layer];
-//		}
-//
-//		// Perform activation		
-//		if (activation_function[layer] == 0) // Sigmoid
-//		{
-//			for (cv::MatIterator_<float> p = response.begin(); p != response.end(); p++)
-//			{
-//				*p = 1.0 / (1.0 + exp(-(*p)));
-//			}
-//		}
-//		else if (activation_function[layer] == 2)// ReLU
-//		{
-//			cv::threshold(response, response, 0, 0, cv::THRESH_TOZERO);
-//		}
-//
-//	}
-//
-//	// Restructure the output with interpolation
-//	cv::Mat_<float> mapMatrix(response.rows, response_height * response_width, 0.0f);
-//
-//	// Find a mapping from indices in the computed sparse response and the original full response
-//	cv::Mat_<int> value_id_matrix(response_width, response_height, 0);
-//
-//	int ind = 0;
-//	for (int k = 0; k < value_id_matrix.rows * value_id_matrix.cols; ++k)
-//	{
-//		if (k % 2 != 0)
-//		{
-//			value_id_matrix.at<int>(k) = ind;
-//			ind++;
-//		}
-//	}
-//	value_id_matrix = value_id_matrix.t();
-//
-//	int skip_counter = 0;
-//	for (int x = 0; x < response_width; ++x)
-//	{
-//		for (int y = 0; y < response_height; ++y)
-//		{
-//			int mapping_col = x * response_height + y;
-//			skip_counter++;
-//			if (skip_counter % 2 == 0)
-//			{
-//				int val_id = value_id_matrix.at<int>(y, x);
-//				mapMatrix.at<float>(val_id, mapping_col) = 1;
-//				continue;
-//			}
-//
-//			double num_neigh = 0.0;
-//			vector<int> val_ids;
-//			if (x - 1 >= 0)
-//			{
-//				num_neigh++;
-//				val_ids.push_back(value_id_matrix.at<int>(y, x - 1));
-//			}
-//			if (y - 1 >= 0)
-//			{
-//				num_neigh++;
-//				val_ids.push_back(value_id_matrix.at<int>(y - 1, x));
-//			}
-//			if (x + 1 < response_width)
-//			{
-//				num_neigh++;
-//				val_ids.push_back(value_id_matrix.at<int>(y, x + 1));
-//			}
-//			if (y + 1 < response_height)
-//			{
-//				num_neigh++;
-//				val_ids.push_back(value_id_matrix.at<int>(y + 1, x));
-//			}
-//
-//			for (size_t k = 0; k < val_ids.size(); ++k)
-//			{
-//				mapMatrix.at<float>(val_ids[k], mapping_col) = 1.0 / num_neigh;
-//			}
-//		}
-//	}
-//
-//	response = response.t() * mapMatrix;
-//	response = response.t();
-//	response = response.reshape(1, response_height);
-//	response = response.t();
-//}
-
 // As the sparse patch expert output with interpolation, this function creates an interpolation matrix
 void LandmarkDetector::interpolationMatrix(cv::Mat_<float>& mapMatrix, int response_height, int response_width, int input_width, int input_height)
 {
@@ -610,41 +492,46 @@ void CEN_patch_expert::ResponseSparse(const cv::Mat_<float> &area_of_interest, c
 
 	// Extract im2col but in a sparse way and contrast normalize
 	im2colBiasSparseContrastNorm(area_of_interest, width, height, response);
+	response = response.t();
 
 	for (size_t layer = 0; layer < activation_function.size(); ++layer)
 	{
 
-		// We are performing response = response * weights[layers], but in OpenBLAS as that is significantly quicker than OpenCV		
-		float* m1 = (float*)response.data;
-		float* m2 = (float*)weights[layer].data;
+		// We are performing response = weights[layers] * response(t), but in OpenBLAS as that is significantly quicker than OpenCV		
+		cv::Mat_<float> resp = response;
+		float* m1 = (float*)resp.data;
+		cv::Mat_<float> weight = weights[layer];
+		float* m2 = (float*)weight.data;
 
-		cv::Mat_<float> resp_blas(response.rows, weights[layer].cols);
+		cv::Mat_<float> resp_blas(weight.rows, resp.cols);
 		float* m3 = (float*)resp_blas.data;
 
-		// Perform matrix multiplication
-		float alpha = 1.0;
-		float beta = 0.0;
-		sgemm_("N", "N", &weights[layer].cols, &response.rows, &response.cols, &alpha, m2, &weights[layer].cols, m1, &response.cols, &beta, m3, &weights[layer].cols);
+		// Perform matrix multiplication in OpenBLAS (fortran call)
+		float alpha1 = 1.0;
+		float beta1 = 0.0;
+		sgemm_("N", "N", &resp.cols, &weight.rows, &weight.cols, &alpha1, m1, &resp.cols, m2, &weight.cols, &beta1, m3, &resp.cols);
 
 		// The above is a faster version of this, by calling the fortran version directly
-		//cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, weights[layer].cols, response.rows, response.cols, 1, m2, weights[layer].cols, m1, response.cols, 0.0, m3, weights[layer].cols);
+		//cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, resp.cols, weight.rows, weight.cols, 1, m1, resp.cols, m2, weight.cols, 0.0, m3, resp.cols);
 		
 		// Adding the bias (bit ugly, but the fastest way to do this)
 		response = resp_blas;
+
 		float* data = (float*)response.data;
 		size_t height = response.rows;
 		size_t width = response.cols;
 		float* data_b = (float*)biases[layer].data;
 		for (size_t y = 0; y < height; ++y)
 		{
+			float bias = data_b[y];
 			for (size_t x = 0; x < width; ++x)
 			{
-				float in = *data;
-				*data++ = in + data_b[x];
+				float in = *data + bias;
+				*data++ = in;
 			}
 		}
 
-		// Perform activation	
+		// Perform activation and add bias at the same time	
 		if (activation_function[layer] == 0) // Sigmoid
 		{
 
@@ -662,15 +549,13 @@ void CEN_patch_expert::ResponseSparse(const cv::Mat_<float> &area_of_interest, c
 		}
 		else if (activation_function[layer] == 2)// ReLU
 		{
-			// TODO this could be spedup by iterating over it
 			cv::threshold(response, response, 0, 0, cv::THRESH_TOZERO);
 		}
 
 	}
 	
-	//response = response.t() * mapMatrix;
-	//response = response.t();
-	response = mapMatrix * response;
+	response =  response * mapMatrix;
+	response = response.t();
 	response = response.reshape(1, response_height);
 	response = response.t();
 }
diff --git a/lib/local/LandmarkDetector/src/CNN_utils.cpp b/lib/local/LandmarkDetector/src/CNN_utils.cpp
new file mode 100644
index 00000000..7128eafa
--- /dev/null
+++ b/lib/local/LandmarkDetector/src/CNN_utils.cpp
@@ -0,0 +1,434 @@
+///////////////////////////////////////////////////////////////////////////////
+// Copyright (C) 2017, Tadas Baltrusaitis, all rights reserved.
+//
+// ACADEMIC OR NON-PROFIT ORGANIZATION NONCOMMERCIAL RESEARCH USE ONLY
+//
+// BY USING OR DOWNLOADING THE SOFTWARE, YOU ARE AGREEING TO THE TERMS OF THIS LICENSE AGREEMENT.  
+// IF YOU DO NOT AGREE WITH THESE TERMS, YOU MAY NOT USE OR DOWNLOAD THE SOFTWARE.
+//
+// License can be found in OpenFace-license.txt
+//
+//     * Any publications arising from the use of this software, including but
+//       not limited to academic journal and conference publications, technical
+//       reports and manuals, must cite at least one of the following works:
+//
+//       OpenFace: an open source facial behavior analysis toolkit
+//       Tadas Baltrušaitis, Peter Robinson, and Louis-Philippe Morency
+//       in IEEE Winter Conference on Applications of Computer Vision, 2016  
+//
+//       Rendering of Eyes for Eye-Shape Registration and Gaze Estimation
+//       Erroll Wood, Tadas Baltrušaitis, Xucong Zhang, Yusuke Sugano, Peter Robinson, and Andreas Bulling 
+//       in IEEE International. Conference on Computer Vision (ICCV),  2015 
+//
+//       Cross-dataset learning and person-speci?c normalisation for automatic Action Unit detection
+//       Tadas Baltrušaitis, Marwa Mahmoud, and Peter Robinson 
+//       in Facial Expression Recognition and Analysis Challenge, 
+//       IEEE International Conference on Automatic Face and Gesture Recognition, 2015 
+//
+//       Constrained Local Neural Fields for robust facial landmark detection in the wild.
+//       Tadas Baltrušaitis, Peter Robinson, and Louis-Philippe Morency. 
+//       in IEEE Int. Conference on Computer Vision Workshops, 300 Faces in-the-Wild Challenge, 2013.    
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#include "stdafx.h"
+
+#include "CNN_utils.h"
+
+// OpenBLAS
+#include <cblas.h>
+#include <f77blas.h>
+
+using namespace std;
+
+namespace LandmarkDetector
+{
+
+	// Parametric ReLU with leaky weights (separate ones per channel)
+	void PReLU(std::vector<cv::Mat_<float> >& input_output_maps, cv::Mat_<float> prelu_weights)
+	{
+
+		if (input_output_maps.size() > 1)
+		{
+			int h = input_output_maps[0].rows;
+			int w = input_output_maps[0].cols;
+			size_t size_in = h * w;
+
+			for (size_t k = 0; k < input_output_maps.size(); ++k)
+			{
+				// Apply the PReLU
+				auto iter = input_output_maps[k].begin();
+
+				float neg_mult = prelu_weights.at<float>(k);
+
+				for (size_t i = 0; i < size_in; ++i)
+				{
+					float in_val = *iter;
+
+					// The prelu step
+					*iter++ = in_val >= 0 ? in_val : in_val * neg_mult;
+
+				}
+			}
+		}
+		else
+		{
+
+			int w = input_output_maps[0].cols;
+
+			for (size_t k = 0; k < prelu_weights.rows; ++k)
+			{
+				auto iter = input_output_maps[0].row(k).begin();
+				float neg_mult = prelu_weights.at<float>(k);
+
+				for (size_t i = 0; i < w; ++i)
+				{
+					float in_val = *iter;
+					// Apply the PReLU
+					*iter++ = in_val >= 0 ? in_val : in_val * neg_mult;
+				}
+			}
+
+		}
+
+	}
+
+	void fully_connected(std::vector<cv::Mat_<float> >& outputs, const std::vector<cv::Mat_<float> >& input_maps, cv::Mat_<float> weights, cv::Mat_<float> biases)
+	{
+		outputs.clear();
+
+		if (input_maps.size() > 1)
+		{
+			// Concatenate all the maps
+			cv::Size orig_size = input_maps[0].size();
+			cv::Mat_<float> input_concat(input_maps.size(), input_maps[0].cols * input_maps[0].rows);
+
+			for (size_t in = 0; in < input_maps.size(); ++in)
+			{
+				cv::Mat_<float> add = input_maps[in];
+
+				// Reshape if all of the data will be flattened
+				if (input_concat.rows != weights.cols)
+				{
+					add = add.t();
+				}
+
+				add = add.reshape(0, 1);
+				add.copyTo(input_concat.row(in));
+			}
+
+			// Treat the input as separate feature maps
+			if (input_concat.rows == weights.cols)
+			{
+				input_concat = weights * input_concat;
+				// Add biases
+				for (size_t k = 0; k < biases.rows; ++k)
+				{
+					input_concat.row(k) = input_concat.row(k) + biases.at<float>(k);
+				}
+
+				outputs.clear();
+				// Resize and add as output
+				for (size_t k = 0; k < biases.rows; ++k)
+				{
+					cv::Mat_<float> reshaped = input_concat.row(k).clone();
+					reshaped = reshaped.reshape(1, orig_size.height);
+					outputs.push_back(reshaped);
+				}
+			}
+			else
+			{
+				// Flatten the input
+				input_concat = input_concat.reshape(0, input_concat.rows * input_concat.cols);
+
+				input_concat = weights * input_concat + biases;
+
+				outputs.clear();
+				outputs.push_back(input_concat);
+			}
+
+		}
+		else
+		{
+			cv::Mat out = weights * input_maps[0] + biases;
+			outputs.clear();
+			outputs.push_back(out.t());
+		}
+
+	}
+
+
+	void max_pooling(std::vector<cv::Mat_<float> >& outputs, const std::vector<cv::Mat_<float> >& input_maps, int stride_x, int stride_y, int kernel_size_x, int kernel_size_y)
+	{
+		vector<cv::Mat_<float> > outputs_sub;
+
+		// Iterate over kernel height and width, based on stride
+		for (size_t in = 0; in < input_maps.size(); ++in)
+		{
+			// Help with rounding up a bit, to match caffe style output
+			int out_x = round((double)(input_maps[in].cols - kernel_size_x) / (double)stride_x) + 1;
+			int out_y = round((double)(input_maps[in].rows - kernel_size_y) / (double)stride_y) + 1;
+
+			cv::Mat_<float> sub_out(out_y, out_x, 0.0);
+			cv::Mat_<float> in_map = input_maps[in];
+
+			for (int x = 0; x < input_maps[in].cols; x += stride_x)
+			{
+				int max_x = cv::min(input_maps[in].cols, x + kernel_size_x);
+				int x_in_out = floor(x / stride_x);
+
+				if (x_in_out >= out_x)
+					continue;
+
+				for (int y = 0; y < input_maps[in].rows; y += stride_y)
+				{
+					int y_in_out = floor(y / stride_y);
+
+					if (y_in_out >= out_y)
+						continue;
+
+					int max_y = cv::min(input_maps[in].rows, y + kernel_size_y);
+
+					float curr_max = -FLT_MAX;
+
+					for (int x_in = x; x_in < max_x; ++x_in)
+					{
+						for (int y_in = y; y_in < max_y; ++y_in)
+						{
+							float curr_val = in_map.at<float>(y_in, x_in);
+							if (curr_val > curr_max)
+							{
+								curr_max = curr_val;
+							}
+						}
+					}
+					sub_out.at<float>(y_in_out, x_in_out) = curr_max;
+				}
+			}
+
+			outputs_sub.push_back(sub_out);
+
+		}
+		outputs = outputs_sub;
+
+	}
+
+	void convolution_single_kern_fft(const vector<cv::Mat_<float> >& input_imgs, vector<cv::Mat_<double> >& img_dfts, const vector<cv::Mat_<float> >&  _templs, map<int, vector<cv::Mat_<double> > >& _templ_dfts, cv::Mat_<float>& result)
+	{
+		// Assume result is defined properly
+		if (result.empty())
+		{
+			cv::Size corrSize(input_imgs[0].cols - _templs[0].cols + 1, input_imgs[0].rows - _templs[0].rows + 1);
+			result.create(corrSize);
+		}
+
+		// Our model will always be under min block size so can ignore this
+		//const double blockScale = 4.5;
+		//const int minBlockSize = 256;
+
+		int maxDepth = CV_64F;
+
+		cv::Size dftsize;
+
+		dftsize.width = cv::getOptimalDFTSize(result.cols + _templs[0].cols - 1);
+		dftsize.height = cv::getOptimalDFTSize(result.rows + _templs[0].rows - 1);
+
+		// Compute block size
+		cv::Size blocksize;
+		blocksize.width = dftsize.width - _templs[0].cols + 1;
+		blocksize.width = MIN(blocksize.width, result.cols);
+		blocksize.height = dftsize.height - _templs[0].rows + 1;
+		blocksize.height = MIN(blocksize.height, result.rows);
+
+		vector<cv::Mat_<double>> dftTempl;
+
+		// if this has not been precomputed, precompute it, otherwise use it
+		if (_templ_dfts.find(dftsize.width) == _templ_dfts.end())
+		{
+			dftTempl.resize(_templs.size());
+			for (size_t k = 0; k < _templs.size(); ++k)
+			{
+				dftTempl[k].create(dftsize.height, dftsize.width);
+
+				cv::Mat_<float> src = _templs[k];
+
+				cv::Mat_<double> dst(dftTempl[k], cv::Rect(0, 0, dftsize.width, dftsize.height));
+
+				cv::Mat_<double> dst1(dftTempl[k], cv::Rect(0, 0, _templs[k].cols, _templs[k].rows));
+
+				if (dst1.data != src.data)
+					src.convertTo(dst1, dst1.depth());
+
+				if (dst.cols > _templs[k].cols)
+				{
+					cv::Mat_<double> part(dst, cv::Range(0, _templs[k].rows), cv::Range(_templs[k].cols, dst.cols));
+					part.setTo(0);
+				}
+
+				// Perform DFT of the template
+				dft(dst, dst, 0, _templs[k].rows);
+
+			}
+			_templ_dfts[dftsize.width] = dftTempl;
+
+		}
+		else
+		{
+			dftTempl = _templ_dfts[dftsize.width];
+		}
+
+		cv::Size bsz(std::min(blocksize.width, result.cols), std::min(blocksize.height, result.rows));
+		cv::Mat src;
+
+		cv::Mat cdst(result, cv::Rect(0, 0, bsz.width, bsz.height));
+
+		vector<cv::Mat_<double> > dftImgs;
+		dftImgs.resize(input_imgs.size());
+
+		if (img_dfts.empty())
+		{
+			for (size_t k = 0; k < input_imgs.size(); ++k)
+			{
+				dftImgs[k].create(dftsize);
+				dftImgs[k].setTo(0.0);
+
+				cv::Size dsz(bsz.width + _templs[k].cols - 1, bsz.height + _templs[k].rows - 1);
+
+				int x2 = std::min(input_imgs[k].cols, dsz.width);
+				int y2 = std::min(input_imgs[k].rows, dsz.height);
+
+				cv::Mat src0(input_imgs[k], cv::Range(0, y2), cv::Range(0, x2));
+				cv::Mat dst(dftImgs[k], cv::Rect(0, 0, dsz.width, dsz.height));
+				cv::Mat dst1(dftImgs[k], cv::Rect(0, 0, x2, y2));
+
+				src = src0;
+
+				if (dst1.data != src.data)
+					src.convertTo(dst1, dst1.depth());
+
+				dft(dftImgs[k], dftImgs[k], 0, dsz.height);
+				img_dfts.push_back(dftImgs[k].clone());
+			}
+		}
+
+		cv::Mat_<double> dft_img(img_dfts[0].rows, img_dfts[0].cols, 0.0);
+		for (size_t k = 0; k < input_imgs.size(); ++k)
+		{
+			cv::Mat dftTempl1(dftTempl[k], cv::Rect(0, 0, dftsize.width, dftsize.height));
+			if (k == 0)
+			{
+				cv::mulSpectrums(img_dfts[k], dftTempl1, dft_img, 0, true);
+			}
+			else
+			{
+				cv::mulSpectrums(img_dfts[k], dftTempl1, dftImgs[k], 0, true);
+				dft_img = dft_img + dftImgs[k];
+			}
+		}
+
+		cv::dft(dft_img, dft_img, cv::DFT_INVERSE + cv::DFT_SCALE, bsz.height);
+
+		src = dft_img(cv::Rect(0, 0, bsz.width, bsz.height));
+
+		src.convertTo(cdst, CV_32F);
+
+	}
+
+	void convolution_fft2(std::vector<cv::Mat_<float> >& outputs, const std::vector<cv::Mat_<float> >& input_maps, const std::vector<std::vector<cv::Mat_<float> > >& kernels, const std::vector<float >& biases, vector<map<int, vector<cv::Mat_<double> > > >& precomp_dfts)
+	{
+		outputs.clear();
+
+		// Useful precomputed data placeholders for quick correlation (convolution)
+		vector<cv::Mat_<double> > input_image_dft;
+
+		for (size_t k = 0; k < kernels.size(); ++k)
+		{
+
+			// The convolution (with precomputation)
+			cv::Mat_<float> output;
+			convolution_single_kern_fft(input_maps, input_image_dft, kernels[k], precomp_dfts[k], output);
+
+			// Combining the maps
+			outputs.push_back(output + biases[k]);
+
+		}
+	}
+
+	void im2col_t(const cv::Mat_<float>& input, int width, int height, cv::Mat_<float>& output)
+	{
+
+		int m = input.cols;
+		int n = input.rows;
+
+		// determine how many blocks there will be with a sliding window of width x height in the input
+		int yB = m - height + 1;
+		int xB = n - width + 1;
+
+		// Allocate the output size
+		if (output.rows != width * height && output.cols != xB*yB)
+		{
+			output = cv::Mat::ones(width * height, xB*yB, CV_32F);
+		}
+
+		// Iterate over the whole image
+		for (int i = 0; i< yB; i++)
+		{
+			int rowIdx = i;
+			for (int j = 0; j< xB; j++)
+			{
+				//int rowIdx = i; +j*yB;
+				// iterate over the blocks within the image
+				for (unsigned int yy = 0; yy < height; ++yy)
+				{
+					// Faster iteration over the image
+					const float* Mi = input.ptr<float>(j + yy);
+					for (unsigned int xx = 0; xx < width; ++xx)
+					{
+						int colIdx = xx*height + yy;
+
+						output.at<float>(colIdx, rowIdx) = Mi[i + xx];
+					}
+				}
+				rowIdx += yB;
+
+			}
+		}
+	}
+
+
+	void convolution_direct_blas(std::vector<cv::Mat_<float> >& outputs, const std::vector<cv::Mat_<float> >& input_maps, const cv::Mat_<float>& weight_matrix, const std::vector<float >& biases, int height_k, int width_k)
+	{
+		outputs.clear();
+
+		int height_in = input_maps[0].rows;
+		int width_n = input_maps[0].cols;
+
+		// determine how many blocks there will be with a sliding window of width x height in the input
+		int yB = height_in - height_k + 1;
+		int xB = width_n - width_k + 1;
+
+		cv::Mat_<float> input_matrix(input_maps.size() * height_k * width_k + 1.0, yB * xB, 1.0f);
+
+		// Comibine im2col accross channels to prepare for matrix multiplication
+		for (size_t i = 0; i < input_maps.size(); ++i)
+		{
+			im2col_t(input_maps[i], width_k, height_k, input_matrix(cv::Rect(0, i * height_k * width_k, yB * xB, height_k * width_k)));
+		}
+
+		float* m1 = (float*)weight_matrix.data;
+		float* m2 = (float*)input_matrix.data;
+
+		cv::Mat_<float> out(weight_matrix.rows, input_matrix.cols, 1.0);
+		float* m3 = (float*)out.data;
+
+		cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, input_matrix.cols, weight_matrix.rows, weight_matrix.cols, 1, m2, input_matrix.cols, m1, weight_matrix.cols, 0.0, m3, input_matrix.cols);
+
+		// Move back to vectors and reshape accordingly (also add the bias)
+		for (size_t k = 0; k < out.rows; ++k)
+		{
+			outputs.push_back(out.row(k).reshape(1, yB));
+		}
+
+	}
+
+}
\ No newline at end of file
diff --git a/lib/local/LandmarkDetector/src/FaceDetectorMTCNN.cpp b/lib/local/LandmarkDetector/src/FaceDetectorMTCNN.cpp
index 6ca5ea9f..0703010b 100644
--- a/lib/local/LandmarkDetector/src/FaceDetectorMTCNN.cpp
+++ b/lib/local/LandmarkDetector/src/FaceDetectorMTCNN.cpp
@@ -85,6 +85,9 @@
 
 #include "LandmarkDetectorUtils.h"
 
+// CNN includes
+#include "CNN_utils.h"
+
 // OpenBLAS
 #include <cblas.h>
 #include <f77blas.h>
@@ -153,623 +156,240 @@ CNN::CNN(const CNN& other) : cnn_layer_types(other.cnn_layer_types), cnn_max_poo
 	}
 }
 
-void PReLU(std::vector<cv::Mat_<float> >& input_output_maps, cv::Mat_<float> prelu_weights)
-{
-
-	if (input_output_maps.size() > 1)
-	{
-		int h = input_output_maps[0].rows;
-		int w = input_output_maps[0].cols;
-		size_t size_in = h * w;
-
-		for (size_t k = 0; k < input_output_maps.size(); ++k)
-		{
-			// Apply the PReLU
-			auto iter = input_output_maps[k].begin();
-
-			float neg_mult = prelu_weights.at<float>(k);
-
-			for(size_t i = 0; i < size_in; ++i)
-			{
-				float in_val = *iter;
-
-				// The prelu step
-				*iter++ = in_val >= 0 ? in_val : in_val * neg_mult;
-
-			}
-		}
-	}
-	else
-	{
-		
-		int w = input_output_maps[0].cols;
-
-		for (size_t k = 0; k < prelu_weights.rows; ++k)
-		{
-			auto iter = input_output_maps[0].row(k).begin();
-			float neg_mult = prelu_weights.at<float>(k);
-
-			for (size_t i = 0; i < w; ++i)
-			{
-				float in_val = *iter;
-				// Apply the PReLU
-				*iter++ = in_val >= 0 ? in_val : in_val * neg_mult;
-			}
-		}
-
-	}
-
-}
-
-void fully_connected(std::vector<cv::Mat_<float> >& outputs, const std::vector<cv::Mat_<float> >& input_maps, cv::Mat_<float> weights, cv::Mat_<float> biases)
-{
-	outputs.clear();
-
-	if (input_maps.size() > 1)
-	{
-		// Concatenate all the maps
-		cv::Size orig_size = input_maps[0].size();
-		cv::Mat_<float> input_concat(input_maps.size(), input_maps[0].cols * input_maps[0].rows);
-
-		for (size_t in = 0; in < input_maps.size(); ++in)
-		{
-			cv::Mat_<float> add = input_maps[in];
-			
-			// Reshape if all of the data will be flattened
-			if (input_concat.rows != weights.cols)
-			{
-				add = add.t();
-			}
-
-			add = add.reshape(0, 1);
-			add.copyTo(input_concat.row(in));
-		}
-
-		// Treat the input as separate feature maps
-		if (input_concat.rows == weights.cols)
-		{
-			input_concat = weights * input_concat;
-			// Add biases
-			for (size_t k = 0; k < biases.rows; ++k)
-			{
-				input_concat.row(k) = input_concat.row(k) + biases.at<float>(k);
-			}
-
-			outputs.clear();
-			// Resize and add as output
-			for (size_t k = 0; k < biases.rows; ++k)
-			{
-				cv::Mat_<float> reshaped = input_concat.row(k).clone();
-				reshaped = reshaped.reshape(1, orig_size.height);
-				outputs.push_back(reshaped);
-			}
-		}
-		else
-		{
-			// Flatten the input
-			input_concat = input_concat.reshape(0, input_concat.rows * input_concat.cols);
-
-			input_concat = weights * input_concat + biases;
-
-			outputs.clear();
-			outputs.push_back(input_concat);
-		}
-
-	}
-	else
-	{
-		cv::Mat out = weights * input_maps[0] + biases;
-		outputs.clear();
-		outputs.push_back(out.t());
-	}
-
-}
-
-void max_pooling(std::vector<cv::Mat_<float> >& outputs, const std::vector<cv::Mat_<float> >& input_maps, int stride_x, int stride_y, int kernel_size_x, int kernel_size_y)
-{
-	vector<cv::Mat_<float> > outputs_sub;
-
-	// Iterate over kernel height and width, based on stride
-	for (size_t in = 0; in < input_maps.size(); ++in)
-	{
-		// Help with rounding up a bit, to match caffe style output
-		int out_x = round((double)(input_maps[in].cols - kernel_size_x) / (double)stride_x) + 1;
-		int out_y = round((double)(input_maps[in].rows - kernel_size_y) / (double)stride_y) + 1;
-
-		cv::Mat_<float> sub_out(out_y, out_x, 0.0);
-		cv::Mat_<float> in_map = input_maps[in];
-
-		for (int x = 0; x < input_maps[in].cols; x += stride_x)
-		{
-			int max_x = cv::min(input_maps[in].cols, x + kernel_size_x);
-			int x_in_out = floor(x / stride_x);
-
-			if (x_in_out >= out_x)
-				continue;
-
-			for (int y = 0; y < input_maps[in].rows; y += stride_y)
-			{
-				int y_in_out = floor(y / stride_y);
-
-				if (y_in_out >= out_y)
-					continue;
-
-				int max_y = cv::min(input_maps[in].rows, y + kernel_size_y);
-
-				float curr_max = -FLT_MAX;
-
-				for (int x_in = x; x_in < max_x; ++x_in)
-				{
-					for (int y_in = y; y_in < max_y; ++y_in)
-					{
-						float curr_val = in_map.at<float>(y_in, x_in);
-						if (curr_val > curr_max)
-						{
-							curr_max = curr_val;
-						}
-					}
-				}
-				sub_out.at<float>(y_in_out, x_in_out) = curr_max;
-			}
-		}
-
-		outputs_sub.push_back(sub_out);
-
-	}
-	outputs = outputs_sub;
-
-}
-
 ////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-void convolution_single_kern_fft(const vector<cv::Mat_<float> >& input_imgs, vector<cv::Mat_<double> >& img_dfts, const vector<cv::Mat_<float> >&  _templs, map<int, vector<cv::Mat_<double> > >& _templ_dfts, cv::Mat_<float>& result)
-{
-	// Assume result is defined properly
-	if (result.empty())
-	{
-		cv::Size corrSize(input_imgs[0].cols - _templs[0].cols + 1, input_imgs[0].rows - _templs[0].rows + 1);
-		result.create(corrSize);
-	}
 
-	// Our model will always be under min block size so can ignore this
-	//const double blockScale = 4.5;
-	//const int minBlockSize = 256;
-
-	int maxDepth = CV_64F;
-
-	cv::Size dftsize;
-
-	dftsize.width = cv::getOptimalDFTSize(result.cols + _templs[0].cols - 1);
-	dftsize.height = cv::getOptimalDFTSize(result.rows + _templs[0].rows - 1);
-
-	// Compute block size
-	cv::Size blocksize;
-	blocksize.width = dftsize.width - _templs[0].cols + 1;
-	blocksize.width = MIN(blocksize.width, result.cols);
-	blocksize.height = dftsize.height - _templs[0].rows + 1;
-	blocksize.height = MIN(blocksize.height, result.rows);
-
-	vector<cv::Mat_<double>> dftTempl;
-
-	// if this has not been precomputed, precompute it, otherwise use it
-	if (_templ_dfts.find(dftsize.width) == _templ_dfts.end())
-	{
-		dftTempl.resize(_templs.size());
-		for (size_t k = 0; k < _templs.size(); ++k)
-		{
-			dftTempl[k].create(dftsize.height, dftsize.width);
-
-			cv::Mat_<float> src = _templs[k];
-
-			cv::Mat_<double> dst(dftTempl[k], cv::Rect(0, 0, dftsize.width, dftsize.height));
-
-			cv::Mat_<double> dst1(dftTempl[k], cv::Rect(0, 0, _templs[k].cols, _templs[k].rows));
-
-			if (dst1.data != src.data)
-				src.convertTo(dst1, dst1.depth());
-
-			if (dst.cols > _templs[k].cols)
-			{
-				cv::Mat_<double> part(dst, cv::Range(0, _templs[k].rows), cv::Range(_templs[k].cols, dst.cols));
-				part.setTo(0);
-			}
-
-			// Perform DFT of the template
-			dft(dst, dst, 0, _templs[k].rows);
-
-		}
-		_templ_dfts[dftsize.width] = dftTempl;
-
-	}
-	else
-	{
-		dftTempl = _templ_dfts[dftsize.width];
-	}
-
-	cv::Size bsz(std::min(blocksize.width, result.cols), std::min(blocksize.height, result.rows));
-	cv::Mat src;
-
-	cv::Mat cdst(result, cv::Rect(0, 0, bsz.width, bsz.height));
-
-	vector<cv::Mat_<double> > dftImgs;
-	dftImgs.resize(input_imgs.size());
-
-	if (img_dfts.empty())
-	{
-		for(size_t k = 0; k < input_imgs.size(); ++k)
-		{
-			dftImgs[k].create(dftsize);
-			dftImgs[k].setTo(0.0);
-
-			cv::Size dsz(bsz.width + _templs[k].cols - 1, bsz.height + _templs[k].rows - 1);
-
-			int x2 = std::min(input_imgs[k].cols, dsz.width);
-			int y2 = std::min(input_imgs[k].rows, dsz.height);
-
-			cv::Mat src0(input_imgs[k], cv::Range(0, y2), cv::Range(0, x2));
-			cv::Mat dst(dftImgs[k], cv::Rect(0, 0, dsz.width, dsz.height));
-			cv::Mat dst1(dftImgs[k], cv::Rect(0, 0, x2, y2));
-
-			src = src0;
-
-			if (dst1.data != src.data)
-				src.convertTo(dst1, dst1.depth());
-
-			dft(dftImgs[k], dftImgs[k], 0, dsz.height);
-			img_dfts.push_back(dftImgs[k].clone());
-		}
-	}
-
-	cv::Mat_<double> dft_img(img_dfts[0].rows, img_dfts[0].cols, 0.0);
-	for (size_t k = 0; k < input_imgs.size(); ++k)
-	{
-		cv::Mat dftTempl1(dftTempl[k], cv::Rect(0, 0, dftsize.width, dftsize.height));
-		if (k == 0)
-		{
-			cv::mulSpectrums(img_dfts[k], dftTempl1, dft_img, 0, true);
-		}
-		else
-		{
-			cv::mulSpectrums(img_dfts[k], dftTempl1, dftImgs[k], 0, true);
-			dft_img = dft_img + dftImgs[k];
-		}
-	}
-
-	cv::dft(dft_img, dft_img, cv::DFT_INVERSE + cv::DFT_SCALE, bsz.height);
-
-	src = dft_img(cv::Rect(0, 0, bsz.width, bsz.height));
-
-	src.convertTo(cdst, CV_32F);
-
-}
-
-void im2col_t(const cv::Mat_<float>& input, int width, int height, cv::Mat_<float>& output)
-{
-
-	int m = input.cols;
-	int n = input.rows;
-
-	// determine how many blocks there will be with a sliding window of width x height in the input
-	int yB = m - height + 1;
-	int xB = n - width + 1;
-
-	// Allocate the output size
-	if (output.rows != width * height && output.cols != xB*yB)
-	{
-		output = cv::Mat::ones(width * height, xB*yB, CV_32F);
-	}
-
-	// Iterate over the whole image
-	for (int i = 0; i< yB; i++)
-	{
-		int rowIdx = i;
-		for (int j = 0; j< xB; j++)
-		{
-			//int rowIdx = i; +j*yB;
-			// iterate over the blocks within the image
-			for (unsigned int yy = 0; yy < height; ++yy)
-			{
-				// Faster iteration over the image
-				const float* Mi = input.ptr<float>(j + yy);
-				for (unsigned int xx = 0; xx < width; ++xx)
-				{
-					int colIdx = xx*height + yy;
-
-					output.at<float>(colIdx, rowIdx) = Mi[i + xx];
-				}
-			}
-			rowIdx += yB;
-
-		}
-	}
-}
-
-void im2col_multimap(const vector<cv::Mat_<float> >& inputs, int width, int height, cv::Mat_<float>& output)
-{
-
-	int m = inputs[0].rows;
-	int n = inputs[0].cols;
-
-	// determine how many blocks there will be with a sliding window of width x height in the input
-	int yB = m - height + 1;
-	int xB = n - width + 1;
-
-	int stride = height * width;
-
-	size_t num_maps = inputs.size();
-
-	// Allocate the output size
-	if (output.cols != width * height * inputs.size() + 1 && output.rows != xB*yB)
-	{
-		output = cv::Mat::ones(xB*yB, width * height * num_maps + 1, CV_32F);
-	}
-
-	// Iterate over the whole image
-	for (int i = 0; i< yB; i++)
-	{
-		int rowIdx = i*xB;
-		for (int j = 0; j< xB; j++)
-		{
-
-			float* Mo = output.ptr<float>(rowIdx);
-
-			// iterate over the blocks within the image
-			for (unsigned int yy = 0; yy < height; ++yy)
-			{
-				for (unsigned int in_maps = 0; in_maps < num_maps; ++in_maps)
-				{
-					// Faster iteration over the image
-					const float* Mi = inputs[in_maps].ptr<float>(i + yy);
-
-					for (unsigned int xx = 0; xx < width; ++xx)
-					{
-						int colIdx = xx*height + yy + in_maps * stride;
-						//output.at<float>(rowIdx, colIdx) = Mi[j + xx]; //input.at<float>(i + yy, j + xx);
-						Mo[colIdx] = Mi[j + xx];
-					}
-				}
-			}
-			rowIdx++;
-
-		}
-	}
-}
-
-
-void im2col(const cv::Mat_<float>& input, int width, int height, cv::Mat_<float>& output)
-{
-
-	int m = input.rows;
-	int n = input.cols;
-
-	// determine how many blocks there will be with a sliding window of width x height in the input
-	int yB = m - height + 1;
-	int xB = n - width + 1;
-
-	// Allocate the output size
-	if (output.cols != width * height && output.rows != xB*yB)
-	{
-		output = cv::Mat::ones(xB*yB, width * height, CV_32F);
-	}
-
-	// Iterate over the whole image
-	for (int i = 0; i< yB; i++)
-	{
-		int rowIdx = i*xB;
-		for (int j = 0; j< xB; j++)
-		{
-
-			float* Mo = output.ptr<float>(rowIdx);
-
-			// iterate over the blocks within the image
-			for (unsigned int yy = 0; yy < height; ++yy)
-			{
-				// Faster iteration over the image
-				const float* Mi = input.ptr<float>(i + yy);
-
-				for (unsigned int xx = 0; xx < width; ++xx)
-				{
-					int colIdx = xx*height + yy;
-					//output.at<float>(rowIdx, colIdx) = Mi[j + xx]; //input.at<float>(i + yy, j + xx);
-					Mo[colIdx] = Mi[j + xx];
-				}
-			}
-			rowIdx++;
-
-		}
-	}
-}
-
-void convolution_direct(std::vector<cv::Mat_<float> >& outputs, const std::vector<cv::Mat_<float> >& input_maps, const cv::Mat_<float>& weight_matrix, const std::vector<float >& biases, int height_k, int width_k)
-{
-	outputs.clear();
-
-	int height_in = input_maps[0].rows;
-	int width_n = input_maps[0].cols;
-
-	// determine how many blocks there will be with a sliding window of width x height in the input
-	int yB = height_in - height_k + 1;
-	int xB = width_n - width_k + 1;
-
-	cv::Mat_<float> input_matrix(input_maps.size() * height_k * width_k + 1.0, yB * xB, 1.0f);
-
-	// Comibine im2col accross channels to prepare for matrix multiplication
-	for (size_t i = 0; i < input_maps.size(); ++i)
-	{
-		im2col_t(input_maps[i], width_k, height_k, input_matrix(cv::Rect(0, i * height_k * width_k, yB * xB, height_k * width_k)));
-	}
-
-	// Actual convolution (through multiplication)
-	cv::Mat_<float> out = weight_matrix * input_matrix;
-
-	// Move back to vectors and reshape accordingly (also add the bias)
-	for (size_t k = 0; k < out.rows; ++k)
-	{
-		outputs.push_back(out.row(k).reshape(1, yB));
-	}
-
-}
-
-void convolution_direct_blas(std::vector<cv::Mat_<float> >& outputs, const std::vector<cv::Mat_<float> >& input_maps, const cv::Mat_<float>& weight_matrix, const std::vector<float >& biases, int height_k, int width_k)
-{
-	outputs.clear();
-
-	int height_in = input_maps[0].rows;
-	int width_n = input_maps[0].cols;
-
-	// determine how many blocks there will be with a sliding window of width x height in the input
-	int yB = height_in - height_k + 1;
-	int xB = width_n - width_k + 1;
-
-	cv::Mat_<float> input_matrix(input_maps.size() * height_k * width_k + 1.0, yB * xB, 1.0f);
-
-	// Comibine im2col accross channels to prepare for matrix multiplication
-	for (size_t i = 0; i < input_maps.size(); ++i)
-	{
-		im2col_t(input_maps[i], width_k, height_k, input_matrix(cv::Rect(0, i * height_k * width_k, yB * xB, height_k * width_k)));
-	}
-
-	float* m1 = (float*)weight_matrix.data;
-	float* m2 = (float*)input_matrix.data;
-
-	cv::Mat_<float> out(weight_matrix.rows, input_matrix.cols, 1.0);
-	float* m3 = (float*)out.data;
-
-	cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, input_matrix.cols, weight_matrix.rows, weight_matrix.cols, 1, m2, input_matrix.cols, m1, weight_matrix.cols, 0.0, m3, input_matrix.cols);
-
-	// Move back to vectors and reshape accordingly (also add the bias)
-	for (size_t k = 0; k < out.rows; ++k)
-	{
-		outputs.push_back(out.row(k).reshape(1, yB));
-	}
-
-}
-
-void convolution_direct_blas2(std::vector<cv::Mat_<float> >& outputs, const std::vector<cv::Mat_<float> >& input_maps, const cv::Mat_<float>& weight_matrix, const std::vector<float >& biases, int height_k, int width_k)
-{
-	outputs.clear();
-
-	int height_in = input_maps[0].rows;
-	int width_n = input_maps[0].cols;
-
-	// determine how many blocks there will be with a sliding window of width x height in the input
-	int yB = height_in - height_k + 1;
-	int xB = width_n - width_k + 1;
-
-	// TODO this could (should) be pre-allocated
-	cv::Mat_<float> input_matrix(input_maps.size() * height_k * width_k + 1.0, yB * xB, 1.0f);
-	//cv::Mat_<float> input_matrix_t(yB * xB, input_maps.size() * height_k * width_k + 1.0, 1.0f);
-
-	// Comibine im2col accross channels to prepare for matrix multiplication
-	for (size_t i = 0; i < input_maps.size(); ++i)
-	{
-		im2col_t(input_maps[i], width_k, height_k, input_matrix(cv::Rect(0, i * height_k * width_k, yB * xB, height_k * width_k)));
-		//im2col(input_maps[i], width_k, height_k, input_matrix_t(cv::Rect(i * height_k * width_k, 0, height_k * width_k, yB * xB)));
-
-	}
-
-	cv::Mat_<float> input_matrix_mm;
-	im2col_multimap(input_maps, width_k, height_k, input_matrix_mm);
-	//input_matrix_mm = input_matrix_mm.t();
-
-	float* m1 = (float*)weight_matrix.data;
-	float* m2 = (float*)input_matrix.data;
-
-	cv::Mat_<float> out(weight_matrix.rows, input_matrix.cols, 1.0);
-	float* m3 = (float*)out.data;
-
-	cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, input_matrix.cols, weight_matrix.rows, weight_matrix.cols, 1, m2, input_matrix.cols, m1, weight_matrix.cols, 0.0, m3, input_matrix.cols);
-
-	cv::Mat_<float> out2(weight_matrix.rows, input_matrix.cols, 1.0);
-	float* m31 = (float*)out2.data;
-
-	float* m21 = (float*)input_matrix_mm.data;
-	cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, input_matrix_mm.cols, weight_matrix.rows, weight_matrix.cols, 1, m21, input_matrix_mm.cols, m1, weight_matrix.cols, 0.0, m31, input_matrix_mm.cols);
-	// TOODO call fortran directly
-	//sgemm_("N","N", )
-	cout << cv::mean(cv::abs(out - out2))[0] << endl;
-
-	// Move back to vectors and reshape accordingly (also add the bias)
-	for (size_t k = 0; k < out.rows; ++k)
-	{
-		outputs.push_back(out.row(k).reshape(1, yB));
-	}
-
-}
-
-void convolution_fft2(std::vector<cv::Mat_<float> >& outputs, const std::vector<cv::Mat_<float> >& input_maps, const std::vector<std::vector<cv::Mat_<float> > >& kernels, const std::vector<float >& biases, vector<map<int, vector<cv::Mat_<double> > > >& precomp_dfts)
-{
-	outputs.clear();
-
-	// Useful precomputed data placeholders for quick correlation (convolution)
-	vector<cv::Mat_<double> > input_image_dft;
-
-	for (size_t k = 0; k < kernels.size(); ++k)
-	{
-
-		// The convolution (with precomputation)
-		cv::Mat_<float> output;
-		convolution_single_kern_fft(input_maps, input_image_dft, kernels[k], precomp_dfts[k], output);
-
-		// Combining the maps
-		outputs.push_back(output + biases[k]);
-
-	}
-}
-
-void convolution_fft(std::vector<cv::Mat_<float> >& outputs, const std::vector<cv::Mat_<float> >& input_maps, const std::vector<std::vector<cv::Mat_<float> > >& kernels, const std::vector<float >& biases, vector<vector<pair<int, cv::Mat_<double> > > >& precomp_dfts)
-{
-	outputs.clear();
-	for (size_t in = 0; in < input_maps.size(); ++in)
-	{
-		cv::Mat_<float> input_image = input_maps[in];
-
-		// Useful precomputed data placeholders for quick correlation (convolution)
-		cv::Mat_<double> input_image_dft;
-		cv::Mat integral_image;
-		cv::Mat integral_image_sq;
-
-		for (size_t k = 0; k < kernels[in].size(); ++k)
-		{
-			cv::Mat_<float> kernel = kernels[in][k];
-
-			// The convolution (with precomputation)
-			cv::Mat_<float> output;
-			if (precomp_dfts[in][k].second.empty())
-			{
-				std::map<int, cv::Mat_<double> > precomputed_dft;
-
-				LandmarkDetector::matchTemplate_m(input_image, input_image_dft, integral_image, integral_image_sq, kernel, precomputed_dft, output, CV_TM_CCORR);
-
-				precomp_dfts[in][k].first = precomputed_dft.begin()->first;
-				precomp_dfts[in][k].second = precomputed_dft.begin()->second;
-			}
-			else
-			{
-				std::map<int, cv::Mat_<double> > precomputed_dft;
-				precomputed_dft[precomp_dfts[in][k].first] = precomp_dfts[in][k].second;
-				LandmarkDetector::matchTemplate_m(input_image, input_image_dft, integral_image, integral_image_sq, kernel, precomputed_dft, output, CV_TM_CCORR);
-			}
-
-			// Combining the maps
-			if (in == 0)
-			{
-				outputs.push_back(output);
-			}
-			else
-			{
-				outputs[k] = outputs[k] + output;
-			}
-
-		}
-
-	}
-
-	for (size_t k = 0; k < biases.size(); ++k)
-	{
-		outputs[k] = outputs[k] + biases[k];
-	}
-}
+//void im2col_multimap(const vector<cv::Mat_<float> >& inputs, int width, int height, cv::Mat_<float>& output)
+//{
+//
+//	int m = inputs[0].rows;
+//	int n = inputs[0].cols;
+//
+//	// determine how many blocks there will be with a sliding window of width x height in the input
+//	int yB = m - height + 1;
+//	int xB = n - width + 1;
+//
+//	int stride = height * width;
+//
+//	size_t num_maps = inputs.size();
+//
+//	// Allocate the output size
+//	if (output.cols != width * height * inputs.size() + 1 && output.rows != xB*yB)
+//	{
+//		output = cv::Mat::ones(xB*yB, width * height * num_maps + 1, CV_32F);
+//	}
+//
+//	// Iterate over the whole image
+//	for (int i = 0; i< yB; i++)
+//	{
+//		int rowIdx = i*xB;
+//		for (int j = 0; j< xB; j++)
+//		{
+//
+//			float* Mo = output.ptr<float>(rowIdx);
+//
+//			// iterate over the blocks within the image
+//			for (unsigned int yy = 0; yy < height; ++yy)
+//			{
+//				for (unsigned int in_maps = 0; in_maps < num_maps; ++in_maps)
+//				{
+//					// Faster iteration over the image
+//					const float* Mi = inputs[in_maps].ptr<float>(i + yy);
+//
+//					for (unsigned int xx = 0; xx < width; ++xx)
+//					{
+//						int colIdx = xx*height + yy + in_maps * stride;
+//						//output.at<float>(rowIdx, colIdx) = Mi[j + xx]; //input.at<float>(i + yy, j + xx);
+//						Mo[colIdx] = Mi[j + xx];
+//					}
+//				}
+//			}
+//			rowIdx++;
+//
+//		}
+//	}
+//}
+//
+//void im2col(const cv::Mat_<float>& input, int width, int height, cv::Mat_<float>& output)
+//{
+//
+//	int m = input.rows;
+//	int n = input.cols;
+//
+//	// determine how many blocks there will be with a sliding window of width x height in the input
+//	int yB = m - height + 1;
+//	int xB = n - width + 1;
+//
+//	// Allocate the output size
+//	if (output.cols != width * height && output.rows != xB*yB)
+//	{
+//		output = cv::Mat::ones(xB*yB, width * height, CV_32F);
+//	}
+//
+//	// Iterate over the whole image
+//	for (int i = 0; i< yB; i++)
+//	{
+//		int rowIdx = i*xB;
+//		for (int j = 0; j< xB; j++)
+//		{
+//
+//			float* Mo = output.ptr<float>(rowIdx);
+//
+//			// iterate over the blocks within the image
+//			for (unsigned int yy = 0; yy < height; ++yy)
+//			{
+//				// Faster iteration over the image
+//				const float* Mi = input.ptr<float>(i + yy);
+//
+//				for (unsigned int xx = 0; xx < width; ++xx)
+//				{
+//					int colIdx = xx*height + yy;
+//					//output.at<float>(rowIdx, colIdx) = Mi[j + xx]; //input.at<float>(i + yy, j + xx);
+//					Mo[colIdx] = Mi[j + xx];
+//				}
+//			}
+//			rowIdx++;
+//
+//		}
+//	}
+//}
+//
+//void convolution_direct(std::vector<cv::Mat_<float> >& outputs, const std::vector<cv::Mat_<float> >& input_maps, const cv::Mat_<float>& weight_matrix, const std::vector<float >& biases, int height_k, int width_k)
+//{
+//	outputs.clear();
+//
+//	int height_in = input_maps[0].rows;
+//	int width_n = input_maps[0].cols;
+//
+//	// determine how many blocks there will be with a sliding window of width x height in the input
+//	int yB = height_in - height_k + 1;
+//	int xB = width_n - width_k + 1;
+//
+//	cv::Mat_<float> input_matrix(input_maps.size() * height_k * width_k + 1.0, yB * xB, 1.0f);
+//
+//	// Comibine im2col accross channels to prepare for matrix multiplication
+//	for (size_t i = 0; i < input_maps.size(); ++i)
+//	{
+//		im2col_t(input_maps[i], width_k, height_k, input_matrix(cv::Rect(0, i * height_k * width_k, yB * xB, height_k * width_k)));
+//	}
+//
+//	// Actual convolution (through multiplication)
+//	cv::Mat_<float> out = weight_matrix * input_matrix;
+//
+//	// Move back to vectors and reshape accordingly (also add the bias)
+//	for (size_t k = 0; k < out.rows; ++k)
+//	{
+//		outputs.push_back(out.row(k).reshape(1, yB));
+//	}
+//
+//}
+//
+//
+//void convolution_direct_blas2(std::vector<cv::Mat_<float> >& outputs, const std::vector<cv::Mat_<float> >& input_maps, const cv::Mat_<float>& weight_matrix, const std::vector<float >& biases, int height_k, int width_k)
+//{
+//	outputs.clear();
+//
+//	int height_in = input_maps[0].rows;
+//	int width_n = input_maps[0].cols;
+//
+//	// determine how many blocks there will be with a sliding window of width x height in the input
+//	int yB = height_in - height_k + 1;
+//	int xB = width_n - width_k + 1;
+//
+//	// TODO this could (should) be pre-allocated
+//	cv::Mat_<float> input_matrix(input_maps.size() * height_k * width_k + 1.0, yB * xB, 1.0f);
+//	//cv::Mat_<float> input_matrix_t(yB * xB, input_maps.size() * height_k * width_k + 1.0, 1.0f);
+//
+//	// Comibine im2col accross channels to prepare for matrix multiplication
+//	for (size_t i = 0; i < input_maps.size(); ++i)
+//	{
+//		im2col_t(input_maps[i], width_k, height_k, input_matrix(cv::Rect(0, i * height_k * width_k, yB * xB, height_k * width_k)));
+//		//im2col(input_maps[i], width_k, height_k, input_matrix_t(cv::Rect(i * height_k * width_k, 0, height_k * width_k, yB * xB)));
+//
+//	}
+//
+//	cv::Mat_<float> input_matrix_mm;
+//	im2col_multimap(input_maps, width_k, height_k, input_matrix_mm);
+//	//input_matrix_mm = input_matrix_mm.t();
+//
+//	float* m1 = (float*)weight_matrix.data;
+//	float* m2 = (float*)input_matrix.data;
+//
+//	cv::Mat_<float> out(weight_matrix.rows, input_matrix.cols, 1.0);
+//	float* m3 = (float*)out.data;
+//
+//	cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, input_matrix.cols, weight_matrix.rows, weight_matrix.cols, 1, m2, input_matrix.cols, m1, weight_matrix.cols, 0.0, m3, input_matrix.cols);
+//
+//	cv::Mat_<float> out2(weight_matrix.rows, input_matrix.cols, 1.0);
+//	float* m31 = (float*)out2.data;
+//
+//	float* m21 = (float*)input_matrix_mm.data;
+//	cblas_sgemm(CblasColMajor, CblasNoTrans, CblasTrans, input_matrix_mm.cols, weight_matrix.rows, weight_matrix.cols, 1, m21, input_matrix_mm.cols, m1, weight_matrix.cols, 0.0, m31, input_matrix_mm.cols);
+//	// TOODO call fortran directly
+//	//sgemm_("N","N", )
+//	cout << cv::mean(cv::abs(out - out2))[0] << endl;
+//
+//	// Move back to vectors and reshape accordingly (also add the bias)
+//	for (size_t k = 0; k < out.rows; ++k)
+//	{
+//		outputs.push_back(out.row(k).reshape(1, yB));
+//	}
+//
+//}
+//
+//void convolution_fft(std::vector<cv::Mat_<float> >& outputs, const std::vector<cv::Mat_<float> >& input_maps, const std::vector<std::vector<cv::Mat_<float> > >& kernels, const std::vector<float >& biases, vector<vector<pair<int, cv::Mat_<double> > > >& precomp_dfts)
+//{
+//	outputs.clear();
+//	for (size_t in = 0; in < input_maps.size(); ++in)
+//	{
+//		cv::Mat_<float> input_image = input_maps[in];
+//
+//		// Useful precomputed data placeholders for quick correlation (convolution)
+//		cv::Mat_<double> input_image_dft;
+//		cv::Mat integral_image;
+//		cv::Mat integral_image_sq;
+//
+//		for (size_t k = 0; k < kernels[in].size(); ++k)
+//		{
+//			cv::Mat_<float> kernel = kernels[in][k];
+//
+//			// The convolution (with precomputation)
+//			cv::Mat_<float> output;
+//			if (precomp_dfts[in][k].second.empty())
+//			{
+//				std::map<int, cv::Mat_<double> > precomputed_dft;
+//
+//				LandmarkDetector::matchTemplate_m(input_image, input_image_dft, integral_image, integral_image_sq, kernel, precomputed_dft, output, CV_TM_CCORR);
+//
+//				precomp_dfts[in][k].first = precomputed_dft.begin()->first;
+//				precomp_dfts[in][k].second = precomputed_dft.begin()->second;
+//			}
+//			else
+//			{
+//				std::map<int, cv::Mat_<double> > precomputed_dft;
+//				precomputed_dft[precomp_dfts[in][k].first] = precomp_dfts[in][k].second;
+//				LandmarkDetector::matchTemplate_m(input_image, input_image_dft, integral_image, integral_image_sq, kernel, precomputed_dft, output, CV_TM_CCORR);
+//			}
+//
+//			// Combining the maps
+//			if (in == 0)
+//			{
+//				outputs.push_back(output);
+//			}
+//			else
+//			{
+//				outputs[k] = outputs[k] + output;
+//			}
+//
+//		}
+//
+//	}
+//
+//	for (size_t k = 0; k < biases.size(); ++k)
+//	{
+//		outputs[k] = outputs[k] + biases[k];
+//	}
+//}
 
 std::vector<cv::Mat_<float>> CNN::Inference(const cv::Mat& input_img, bool direct)
 {