diff --git a/exe/FaceLandmarkImg/FaceLandmarkImg.cpp b/exe/FaceLandmarkImg/FaceLandmarkImg.cpp index 8037ee26..b1164395 100644 --- a/exe/FaceLandmarkImg/FaceLandmarkImg.cpp +++ b/exe/FaceLandmarkImg/FaceLandmarkImg.cpp @@ -35,6 +35,8 @@ #include "LandmarkCoreIncludes.h" +#include "FaceDetectorMTCNN.h" + // System includes #include @@ -306,9 +308,9 @@ int main(int argc, char **argv) // Bounding boxes for a face in each image (optional) vector > bounding_boxes; - + LandmarkDetector::get_image_input_output_params(files, output_landmark_locations, output_pose_locations, output_images, bounding_boxes, arguments); - LandmarkDetector::FaceModelParameters det_parameters(arguments); + LandmarkDetector::FaceModelParameters det_parameters(arguments); // No need to validate detections, as we're not doing tracking det_parameters.validate_detections = false; @@ -335,8 +337,9 @@ int main(int argc, char **argv) LandmarkDetector::CLNF clnf_model(det_parameters.model_location); cout << "Model loaded" << endl; - cv::CascadeClassifier classifier(det_parameters.face_detector_location); + cv::CascadeClassifier classifier(det_parameters.haar_face_detector_location); dlib::frontal_face_detector face_detector_hog = dlib::get_frontal_face_detector(); + LandmarkDetector::FaceDetectorMTCNN face_detector_mtcnn(det_parameters.mtcnn_face_detector_location); // Load facial feature extractor and AU analyser (make sure it is static) FaceAnalysis::FaceAnalyserParameters face_analysis_params(arguments); @@ -393,10 +396,15 @@ int main(int argc, char **argv) vector confidences; LandmarkDetector::DetectFacesHOG(face_detections, grayscale_image, face_detector_hog, confidences); } - else + else if (det_parameters.curr_face_detector == LandmarkDetector::FaceModelParameters::HAAR_DETECTOR) { LandmarkDetector::DetectFaces(face_detections, grayscale_image, classifier); } + else + { + vector confidences; + LandmarkDetector::DetectFacesMTCNN(face_detections, read_image, face_detector_mtcnn, confidences); + } // 
Detect landmarks around detected faces int face_det = 0; @@ -414,7 +422,7 @@ int main(int argc, char **argv) cv::Point3f gazeDirection1(0, 0, -1); if (success && det_parameters.track_gaze) - { + { GazeAnalysis::EstimateGaze(clnf_model, gazeDirection0, fx, fy, cx, cy, true); GazeAnalysis::EstimateGaze(clnf_model, gazeDirection1, fx, fy, cx, cy, false); diff --git a/exe/FaceLandmarkVid/FaceLandmarkVid.cpp b/exe/FaceLandmarkVid/FaceLandmarkVid.cpp index 4912aed4..fcb405b0 100644 --- a/exe/FaceLandmarkVid/FaceLandmarkVid.cpp +++ b/exe/FaceLandmarkVid/FaceLandmarkVid.cpp @@ -152,7 +152,7 @@ int main(int argc, char **argv) // Some initial parameters that can be overriden from command line vector files, output_video_files, out_dummy; - + // By default try webcam 0 int device = 0; @@ -281,20 +281,8 @@ int main(int argc, char **argv) while (!captured_image.empty()) { - // Reading the images - cv::Mat_ grayscale_image; - - if (captured_image.channels() == 3) - { - cv::cvtColor(captured_image, grayscale_image, CV_BGR2GRAY); - } - else - { - grayscale_image = captured_image.clone(); - } - // The actual facial landmark detection / tracking - bool detection_success = LandmarkDetector::DetectLandmarksInVideo(grayscale_image, clnf_model, det_parameters); + bool detection_success = LandmarkDetector::DetectLandmarksInVideo(captured_image, clnf_model, det_parameters); // Visualising the results // Drawing the facial landmarks on the face and the bounding box around it if tracking is successful and initialised @@ -311,7 +299,7 @@ int main(int argc, char **argv) } visualise_tracking(captured_image, clnf_model, det_parameters, gazeDirection0, gazeDirection1, frame_count, fx, fy, cx, cy); - + // output the tracked video if (!output_video_files.empty()) { diff --git a/exe/FaceLandmarkVidMulti/FaceLandmarkVidMulti.cpp b/exe/FaceLandmarkVidMulti/FaceLandmarkVidMulti.cpp index 8cf3814c..98546e21 100644 --- a/exe/FaceLandmarkVidMulti/FaceLandmarkVidMulti.cpp +++ 
b/exe/FaceLandmarkVidMulti/FaceLandmarkVidMulti.cpp @@ -120,7 +120,7 @@ int main(int argc, char **argv) // This is so that the model would not try re-initialising itself det_params.reinit_video_every = -1; - det_params.curr_face_detector = LandmarkDetector::FaceModelParameters::HOG_SVM_DETECTOR; + det_params.curr_face_detector = LandmarkDetector::FaceModelParameters::MTCNN_DETECTOR; vector det_parameters; det_parameters.push_back(det_params); @@ -139,8 +139,10 @@ int main(int argc, char **argv) int num_faces_max = 4; LandmarkDetector::CLNF clnf_model(det_parameters[0].model_location); - clnf_model.face_detector_HAAR.load(det_parameters[0].face_detector_location); - clnf_model.face_detector_location = det_parameters[0].face_detector_location; + clnf_model.face_detector_HAAR.load(det_parameters[0].haar_face_detector_location); + clnf_model.haar_face_detector_location = det_parameters[0].haar_face_detector_location; + clnf_model.face_detector_MTCNN.Read(det_parameters[0].mtcnn_face_detector_location); + clnf_model.mtcnn_face_detector_location = det_parameters[0].mtcnn_face_detector_location; clnf_models.reserve(num_faces_max); @@ -271,10 +273,15 @@ int main(int argc, char **argv) vector confidences; LandmarkDetector::DetectFacesHOG(face_detections, grayscale_image, clnf_models[0].face_detector_HOG, confidences); } - else + else if (det_parameters[0].curr_face_detector == LandmarkDetector::FaceModelParameters::HAAR_DETECTOR) { LandmarkDetector::DetectFaces(face_detections, grayscale_image, clnf_models[0].face_detector_HAAR); } + else + { + vector confidences; + LandmarkDetector::DetectFacesMTCNN(face_detections, captured_image, clnf_models[0].face_detector_MTCNN, confidences); + } } diff --git a/exe/FeatureExtraction/FeatureExtraction.cpp b/exe/FeatureExtraction/FeatureExtraction.cpp index 46e4ef4c..e27fd80b 100644 --- a/exe/FeatureExtraction/FeatureExtraction.cpp +++ b/exe/FeatureExtraction/FeatureExtraction.cpp @@ -220,7 +220,7 @@ int main(int argc, char **argv) // 
Some initial parameters that can be overriden from command line vector input_files, output_files, tracked_videos_output; - + // Get the input output file parameters // Indicates that rotation should be with respect to camera or world coordinates @@ -371,8 +371,8 @@ int main(int argc, char **argv) } // If image sequence provided, assume the fps is 30 fps_vid_in = 30; - } - + } + // If optical centers are not defined just use center of image if (cx_undefined) { @@ -450,29 +450,17 @@ int main(int argc, char **argv) // if loading images assume 30fps time_stamp = (double)frame_count * (1.0 / 30.0); } - - // Reading the images - cv::Mat_ grayscale_image; - - if (captured_image.channels() == 3) - { - cvtColor(captured_image, grayscale_image, CV_BGR2GRAY); - } - else - { - grayscale_image = captured_image.clone(); - } - + // The actual facial landmark detection / tracking bool detection_success; if (video_input || images_as_video) { - detection_success = LandmarkDetector::DetectLandmarksInVideo(grayscale_image, face_model, det_parameters); + detection_success = LandmarkDetector::DetectLandmarksInVideo(captured_image, face_model, det_parameters); } else { - detection_success = LandmarkDetector::DetectLandmarksInImage(grayscale_image, face_model, det_parameters); + detection_success = LandmarkDetector::DetectLandmarksInImage(captured_image, face_model, det_parameters); } diff --git a/lib/local/LandmarkDetector/LandmarkDetector.vcxproj b/lib/local/LandmarkDetector/LandmarkDetector.vcxproj index e542eaa5..c86c51f4 100644 --- a/lib/local/LandmarkDetector/LandmarkDetector.vcxproj +++ b/lib/local/LandmarkDetector/LandmarkDetector.vcxproj @@ -195,6 +195,7 @@ xcopy /I /E /Y /D "$(SolutionDir)lib\3rdParty\OpenCV3.1\classifiers" "$(OutDir)c Use + Use Use @@ -254,6 +255,7 @@ xcopy /I /E /Y /D "$(SolutionDir)lib\3rdParty\OpenCV3.1\classifiers" "$(OutDir)c + diff --git a/lib/local/LandmarkDetector/LandmarkDetector.vcxproj.filters 
b/lib/local/LandmarkDetector/LandmarkDetector.vcxproj.filters index 6d85a6f4..07e70f23 100644 --- a/lib/local/LandmarkDetector/LandmarkDetector.vcxproj.filters +++ b/lib/local/LandmarkDetector/LandmarkDetector.vcxproj.filters @@ -35,6 +35,7 @@ source + source @@ -76,6 +77,7 @@ headers + headers diff --git a/lib/local/LandmarkDetector/include/FaceDetectorMTCNN.h b/lib/local/LandmarkDetector/include/FaceDetectorMTCNN.h new file mode 100644 index 00000000..5ce11298 --- /dev/null +++ b/lib/local/LandmarkDetector/include/FaceDetectorMTCNN.h @@ -0,0 +1,156 @@ +/////////////////////////////////////////////////////////////////////////////// +// Copyright (C) 2016, Carnegie Mellon University and University of Cambridge, +// all rights reserved. +// +// THIS SOFTWARE IS PROVIDED “AS IS” FOR ACADEMIC USE ONLY AND ANY EXPRESS +// OR IMPLIED WARRANTIES WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY. +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. 
+// +// Notwithstanding the license granted herein, Licensee acknowledges that certain components +// of the Software may be covered by so-called “open source” software licenses (“Open Source +// Components”), which means any software licenses approved as open source licenses by the +// Open Source Initiative or any substantially similar licenses, including without limitation any +// license that, as a condition of distribution of the software licensed under such license, +// requires that the distributor make the software available in source code format. Licensor shall +// provide a list of Open Source Components for a particular version of the Software upon +// Licensee’s request. Licensee will comply with the applicable terms of such licenses and to +// the extent required by the licenses covering Open Source Components, the terms of such +// licenses will apply in lieu of the terms of this Agreement. To the extent the terms of the +// licenses applicable to Open Source Components prohibit any of the restrictions in this +// License Agreement with respect to such Open Source Component, such restrictions will not +// apply to such Open Source Component. To the extent the terms of the licenses applicable to +// Open Source Components require Licensor to make an offer to provide source code or +// related information in connection with the Software, such offer is hereby made. Any request +// for source code or related information should be directed to cl-face-tracker-distribution@lists.cam.ac.uk +// Licensee acknowledges receipt of notices for the Open Source Components for the initial +// delivery of the Software. 
+ +// * Any publications arising from the use of this software, including but +// not limited to academic journal and conference publications, technical +// reports and manuals, must cite at least one of the following works: +// +// OpenFace: an open source facial behavior analysis toolkit +// Tadas Baltrušaitis, Peter Robinson, and Louis-Philippe Morency +// in IEEE Winter Conference on Applications of Computer Vision, 2016 +// +// Rendering of Eyes for Eye-Shape Registration and Gaze Estimation +// Erroll Wood, Tadas Baltrušaitis, Xucong Zhang, Yusuke Sugano, Peter Robinson, and Andreas Bulling +// in IEEE International. Conference on Computer Vision (ICCV), 2015 +// +// Cross-dataset learning and person-speci?c normalisation for automatic Action Unit detection +// Tadas Baltrušaitis, Marwa Mahmoud, and Peter Robinson +// in Facial Expression Recognition and Analysis Challenge, +// IEEE International Conference on Automatic Face and Gesture Recognition, 2015 +// +// Constrained Local Neural Fields for robust facial landmark detection in the wild. +// Tadas Baltrušaitis, Peter Robinson, and Louis-Philippe Morency. +// in IEEE Int. Conference on Computer Vision Workshops, 300 Faces in-the-Wild Challenge, 2013. 
+// +/////////////////////////////////////////////////////////////////////////////// + +#ifndef __FACE_DETECTOR_MTCNN_h_ +#define __FACE_DETECTOR_MTCNN_h_ + +// OpenCV includes +#include + +// System includes +#include + +using namespace std; + +namespace LandmarkDetector +{ + class CNN + { + public: + + //========================================== + + // Default constructor + CNN() { ; } + + // Copy constructor + CNN(const CNN& other); + + // Given an image apply a CNN on it, the boolean direct controls if direct convolution is used (through matrix multiplication) or an FFT optimization + std::vector > Inference(const cv::Mat& input_img, bool direct = true); + + // Reading in the model + void Read(const string& location); + + // Clearing precomputed DFTs + void ClearPrecomp(); + + size_t NumberOfLayers() { return cnn_layer_types.size(); } + + private: + //========================================== + // Convolutional Neural Network + + // CNN layers + // Layer -> Weight matrix + vector > cnn_convolutional_layers_weights; + // Layer -> kernel -> input maps + vector > > > cnn_convolutional_layers; + vector > cnn_convolutional_layers_bias; + // Layer matrix + bas + vector > cnn_fully_connected_layers_weights; + vector > cnn_fully_connected_layers_biases; + vector > cnn_prelu_layer_weights; + vector > cnn_max_pooling_layers; + + // Precomputations for faster convolution + vector > > > > cnn_convolutional_layers_dft; + + // CNN: 0 - convolutional, 1 - max pooling, 2 - fully connected, 3 - prelu, 4 - sigmoid + vector cnn_layer_types; + }; + //=========================================================================== + // + // Checking if landmark detection was successful using an SVR regressor + // Using multiple validators trained add different views + // The regressor outputs -1 for ideal alignment and 1 for worst alignment + //=========================================================================== + class FaceDetectorMTCNN + { + + public: + + // Default 
constructor + FaceDetectorMTCNN() { ; } + + FaceDetectorMTCNN(const string& location); + + // Copy constructor + FaceDetectorMTCNN(const FaceDetectorMTCNN& other); + + // Given an image, orientation and detected landmarks output the result of the appropriate regressor + bool DetectFaces(vector >& o_regions, const cv::Mat& input_img, std::vector& o_confidences, int min_face = 60, double t1 = 0.6, double t2 = 0.7, double t3 = 0.7); + + // Reading in the model + void Read(const string& location); + + // Indicate if the model has been read in + bool empty() { return PNet.NumberOfLayers() == 0 || RNet.NumberOfLayers() == 0 || ONet.NumberOfLayers() == 0; }; + + private: + //========================================== + // Components of the model + + CNN PNet; + CNN RNet; + CNN ONet; + + }; + +} +#endif diff --git a/lib/local/LandmarkDetector/include/LandmarkDetectorFunc.h b/lib/local/LandmarkDetector/include/LandmarkDetectorFunc.h index 42a23ce3..9c1aa8c0 100644 --- a/lib/local/LandmarkDetector/include/LandmarkDetectorFunc.h +++ b/lib/local/LandmarkDetector/include/LandmarkDetectorFunc.h @@ -54,16 +54,16 @@ namespace LandmarkDetector // Landmark detection in videos, need to provide an image and model parameters (default values work well) // Optionally can provide a bounding box from which to start tracking //================================================================================================================ - bool DetectLandmarksInVideo(const cv::Mat_ &grayscale_image, CLNF& clnf_model, FaceModelParameters& params); - bool DetectLandmarksInVideo(const cv::Mat_ &grayscale_image, const cv::Rect_ bounding_box, CLNF& clnf_model, FaceModelParameters& params); + bool DetectLandmarksInVideo(const cv::Mat &image, CLNF& clnf_model, FaceModelParameters& params); + bool DetectLandmarksInVideo(const cv::Mat &image, const cv::Rect_ bounding_box, CLNF& clnf_model, FaceModelParameters& params); 
//================================================================================================================ // Landmark detection in image, need to provide an image and optionally CLNF model together with parameters (default values work well) // Optionally can provide a bounding box in which detection is performed (this is useful if multiple faces are to be detected in images) //================================================================================================================ - bool DetectLandmarksInImage(const cv::Mat_ &grayscale_image, CLNF& clnf_model, FaceModelParameters& params); + bool DetectLandmarksInImage(const cv::Mat &image, CLNF& clnf_model, FaceModelParameters& params); // Providing a bounding box - bool DetectLandmarksInImage(const cv::Mat_ &grayscale_image, const cv::Rect_ bounding_box, CLNF& clnf_model, FaceModelParameters& params); + bool DetectLandmarksInImage(const cv::Mat &image, const cv::Rect_ bounding_box, CLNF& clnf_model, FaceModelParameters& params); //================================================================ // Helper function for getting head pose from CLNF parameters diff --git a/lib/local/LandmarkDetector/include/LandmarkDetectorModel.h b/lib/local/LandmarkDetector/include/LandmarkDetectorModel.h index e46c1cd4..3cee3704 100644 --- a/lib/local/LandmarkDetector/include/LandmarkDetectorModel.h +++ b/lib/local/LandmarkDetector/include/LandmarkDetectorModel.h @@ -47,6 +47,7 @@ #include "Patch_experts.h" #include "LandmarkDetectionValidator.h" #include "LandmarkDetectorParameters.h" +#include "FaceDetectorMTCNN.h" using namespace std; @@ -85,13 +86,17 @@ public: //==================== Helpers for face detection and landmark detection validation ========================================= + // TODO these should be static, and loading should be made easier + // Haar cascade classifier for face detection cv::CascadeClassifier face_detector_HAAR; - string face_detector_location; - + string haar_face_detector_location; 
+ // A HOG SVM-struct based face detector dlib::frontal_face_detector face_detector_HOG; + FaceDetectorMTCNN face_detector_MTCNN; + string mtcnn_face_detector_location; // Validate if the detected landmarks are correct using an SVR regressor DetectionValidator landmark_validator; diff --git a/lib/local/LandmarkDetector/include/LandmarkDetectorParameters.h b/lib/local/LandmarkDetector/include/LandmarkDetectorParameters.h index 77ed1683..2e259fbd 100644 --- a/lib/local/LandmarkDetector/include/LandmarkDetectorParameters.h +++ b/lib/local/LandmarkDetector/include/LandmarkDetectorParameters.h @@ -88,9 +88,11 @@ struct FaceModelParameters // Determining which face detector to use for (re)initialisation, HAAR is quicker but provides more false positives and is not goot for in-the-wild conditions // Also HAAR detector can detect smaller faces while HOG SVM is only capable of detecting faces at least 70px across - enum FaceDetector{HAAR_DETECTOR, HOG_SVM_DETECTOR}; + // MTCNN detector is much more accurate that the other two, and is even suitable for profile faces, but it is somewhat slower + enum FaceDetector{HAAR_DETECTOR, HOG_SVM_DETECTOR, MTCNN_DETECTOR}; - string face_detector_location; + string haar_face_detector_location; + string mtcnn_face_detector_location; FaceDetector curr_face_detector; // Should the results be visualised and reported to console diff --git a/lib/local/LandmarkDetector/include/LandmarkDetectorUtils.h b/lib/local/LandmarkDetector/include/LandmarkDetectorUtils.h index 974a7df4..56f113bc 100644 --- a/lib/local/LandmarkDetector/include/LandmarkDetectorUtils.h +++ b/lib/local/LandmarkDetector/include/LandmarkDetectorUtils.h @@ -41,6 +41,8 @@ #include "LandmarkDetectorModel.h" +#include "FaceDetectorMTCNN.h" + using namespace std; namespace LandmarkDetector @@ -135,6 +137,11 @@ namespace LandmarkDetector // The preference point allows for disambiguation if multiple faces are present (pick the closest one), if it is not set the biggest face is chosen 
bool DetectSingleFaceHOG(cv::Rect_& o_region, const cv::Mat_& intensity, dlib::frontal_face_detector& classifier, double& confidence, const cv::Point preference = cv::Point(-1, -1), double min_width = -1, cv::Rect_ roi = cv::Rect_(0.0, 0.0, 1.0, 1.0)); + // Face detection using Multi-task Convolutional Neural Network + bool DetectFacesMTCNN(vector >& o_regions, const cv::Mat& image, LandmarkDetector::FaceDetectorMTCNN& detector, std::vector& confidences); + // The preference point allows for disambiguation if multiple faces are present (pick the closest one), if it is not set the biggest face is chosen + bool DetectSingleFaceMTCNN(cv::Rect_& o_region, const cv::Mat& image, LandmarkDetector::FaceDetectorMTCNN& detector, double& confidence, const cv::Point preference = cv::Point(-1, -1)); + //============================================================================ // Matrix reading functionality //============================================================================ diff --git a/lib/local/LandmarkDetector/model/mtcnn_detector/MTCNN_detector.txt b/lib/local/LandmarkDetector/model/mtcnn_detector/MTCNN_detector.txt new file mode 100644 index 00000000..9a4f805b --- /dev/null +++ b/lib/local/LandmarkDetector/model/mtcnn_detector/MTCNN_detector.txt @@ -0,0 +1,3 @@ +PNet PNet.dat +RNet RNet.dat +ONet ONet.dat diff --git a/lib/local/LandmarkDetector/model/mtcnn_detector/ONet.dat b/lib/local/LandmarkDetector/model/mtcnn_detector/ONet.dat new file mode 100644 index 00000000..291c4462 Binary files /dev/null and b/lib/local/LandmarkDetector/model/mtcnn_detector/ONet.dat differ diff --git a/lib/local/LandmarkDetector/model/mtcnn_detector/PNet.dat b/lib/local/LandmarkDetector/model/mtcnn_detector/PNet.dat new file mode 100644 index 00000000..9550d39a Binary files /dev/null and b/lib/local/LandmarkDetector/model/mtcnn_detector/PNet.dat differ diff --git a/lib/local/LandmarkDetector/model/mtcnn_detector/RNet.dat b/lib/local/LandmarkDetector/model/mtcnn_detector/RNet.dat 
new file mode 100644 index 00000000..864e0dd9 Binary files /dev/null and b/lib/local/LandmarkDetector/model/mtcnn_detector/RNet.dat differ diff --git a/lib/local/LandmarkDetector/src/FaceDetectorMTCNN.cpp b/lib/local/LandmarkDetector/src/FaceDetectorMTCNN.cpp new file mode 100644 index 00000000..953972ba --- /dev/null +++ b/lib/local/LandmarkDetector/src/FaceDetectorMTCNN.cpp @@ -0,0 +1,1346 @@ +/////////////////////////////////////////////////////////////////////////////// +// Copyright (C) 2016, Carnegie Mellon University and University of Cambridge, +// all rights reserved. +// +// THIS SOFTWARE IS PROVIDED “AS IS” FOR ACADEMIC USE ONLY AND ANY EXPRESS +// OR IMPLIED WARRANTIES WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY. +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Notwithstanding the license granted herein, Licensee acknowledges that certain components +// of the Software may be covered by so-called “open source” software licenses (“Open Source +// Components”), which means any software licenses approved as open source licenses by the +// Open Source Initiative or any substantially similar licenses, including without limitation any +// license that, as a condition of distribution of the software licensed under such license, +// requires that the distributor make the software available in source code format. 
Licensor shall +// provide a list of Open Source Components for a particular version of the Software upon +// Licensee’s request. Licensee will comply with the applicable terms of such licenses and to +// the extent required by the licenses covering Open Source Components, the terms of such +// licenses will apply in lieu of the terms of this Agreement. To the extent the terms of the +// licenses applicable to Open Source Components prohibit any of the restrictions in this +// License Agreement with respect to such Open Source Component, such restrictions will not +// apply to such Open Source Component. To the extent the terms of the licenses applicable to +// Open Source Components require Licensor to make an offer to provide source code or +// related information in connection with the Software, such offer is hereby made. Any request +// for source code or related information should be directed to cl-face-tracker-distribution@lists.cam.ac.uk +// Licensee acknowledges receipt of notices for the Open Source Components for the initial +// delivery of the Software. + +// * Any publications arising from the use of this software, including but +// not limited to academic journal and conference publications, technical +// reports and manuals, must cite at least one of the following works: +// +// OpenFace: an open source facial behavior analysis toolkit +// Tadas Baltrušaitis, Peter Robinson, and Louis-Philippe Morency +// in IEEE Winter Conference on Applications of Computer Vision, 2016 +// +// Rendering of Eyes for Eye-Shape Registration and Gaze Estimation +// Erroll Wood, Tadas Baltrušaitis, Xucong Zhang, Yusuke Sugano, Peter Robinson, and Andreas Bulling +// in IEEE International. 
Conference on Computer Vision (ICCV), 2015 +// +// Cross-dataset learning and person-speci?c normalisation for automatic Action Unit detection +// Tadas Baltrušaitis, Marwa Mahmoud, and Peter Robinson +// in Facial Expression Recognition and Analysis Challenge, +// IEEE International Conference on Automatic Face and Gesture Recognition, 2015 +// +// Constrained Local Neural Fields for robust facial landmark detection in the wild. +// Tadas Baltrušaitis, Peter Robinson, and Louis-Philippe Morency. +// in IEEE Int. Conference on Computer Vision Workshops, 300 Faces in-the-Wild Challenge, 2013. +// +/////////////////////////////////////////////////////////////////////////////// + +#include "stdafx.h" + +#include "FaceDetectorMTCNN.h" + +// OpenCV includes +#include +#include + +// TBB includes +#include + +// System includes +#include + +// Math includes +#define _USE_MATH_DEFINES +#include + +// Boost includes +#include +#include + + +#ifndef M_PI +#define M_PI 3.14159265358979323846 +#endif + +#include "LandmarkDetectorUtils.h" + +using namespace LandmarkDetector; + +// Constructor from model file location +FaceDetectorMTCNN::FaceDetectorMTCNN(const string& location) +{ + this->Read(location); +} +// Copy constructor +FaceDetectorMTCNN::FaceDetectorMTCNN(const FaceDetectorMTCNN& other) : PNet(other.PNet), RNet(other.RNet), ONet(other.ONet) +{ +} + +CNN::CNN(const CNN& other) : cnn_layer_types(other.cnn_layer_types), cnn_max_pooling_layers(other.cnn_max_pooling_layers), cnn_convolutional_layers_bias(other.cnn_convolutional_layers_bias) +{ + + this->cnn_convolutional_layers_weights.resize(other.cnn_convolutional_layers_weights.size()); + for (size_t l = 0; l < other.cnn_convolutional_layers_weights.size(); ++l) + { + // Make sure the matrix is copied. 
+ this->cnn_convolutional_layers_weights[l] = other.cnn_convolutional_layers_weights[l].clone(); + } + + this->cnn_convolutional_layers.resize(other.cnn_convolutional_layers.size()); + for (size_t l = 0; l < other.cnn_convolutional_layers.size(); ++l) + { + this->cnn_convolutional_layers[l].resize(other.cnn_convolutional_layers[l].size()); + + for (size_t i = 0; i < other.cnn_convolutional_layers[l].size(); ++i) + { + this->cnn_convolutional_layers[l][i].resize(other.cnn_convolutional_layers[l][i].size()); + + for (size_t k = 0; k < other.cnn_convolutional_layers[l][i].size(); ++k) + { + // Make sure the matrix is copied. + this->cnn_convolutional_layers[l][i][k] = other.cnn_convolutional_layers[l][i][k].clone(); + } + } + } + + this->cnn_fully_connected_layers_weights.resize(other.cnn_fully_connected_layers_weights.size()); + + for (size_t l = 0; l < other.cnn_fully_connected_layers_weights.size(); ++l) + { + // Make sure the matrix is copied. + this->cnn_fully_connected_layers_weights[l] = other.cnn_fully_connected_layers_weights[l].clone(); + } + + this->cnn_fully_connected_layers_biases.resize(other.cnn_fully_connected_layers_biases.size()); + + for (size_t l = 0; l < other.cnn_fully_connected_layers_biases.size(); ++l) + { + // Make sure the matrix is copied. + this->cnn_fully_connected_layers_biases[l] = other.cnn_fully_connected_layers_biases[l].clone(); + } + + this->cnn_prelu_layer_weights.resize(other.cnn_prelu_layer_weights.size()); + + for (size_t l = 0; l < other.cnn_prelu_layer_weights.size(); ++l) + { + // Make sure the matrix is copied. 
+ this->cnn_prelu_layer_weights[l] = other.cnn_prelu_layer_weights[l].clone(); + } +} + +void PReLU(std::vector >& input_output_maps, cv::Mat_ prelu_weights) +{ + + if (input_output_maps.size() > 1) + { + int h = input_output_maps[0].rows; + int w = input_output_maps[0].cols; + size_t size_in = h * w; + + for (size_t k = 0; k < input_output_maps.size(); ++k) + { + // Apply the PReLU + auto iter = input_output_maps[k].begin(); + + float neg_mult = prelu_weights.at(k); + + for(size_t i = 0; i < size_in; ++i) + { + float in_val = *iter; + + // The prelu step + *iter++ = in_val >= 0 ? in_val : in_val * neg_mult; + + } + } + } + else + { + + int w = input_output_maps[0].cols; + + for (size_t k = 0; k < prelu_weights.rows; ++k) + { + auto iter = input_output_maps[0].row(k).begin(); + float neg_mult = prelu_weights.at(k); + + for (size_t i = 0; i < w; ++i) + { + float in_val = *iter; + // Apply the PReLU + *iter++ = in_val >= 0 ? in_val : in_val * neg_mult; + } + } + + } + +} + +void fully_connected(std::vector >& outputs, const std::vector >& input_maps, cv::Mat_ weights, cv::Mat_ biases) +{ + outputs.clear(); + + if (input_maps.size() > 1) + { + // Concatenate all the maps + cv::Size orig_size = input_maps[0].size(); + cv::Mat_ input_concat(input_maps.size(), input_maps[0].cols * input_maps[0].rows); + + for (size_t in = 0; in < input_maps.size(); ++in) + { + cv::Mat_ add = input_maps[in]; + + // Reshape if all of the data will be flattened + if (input_concat.rows != weights.cols) + { + add = add.t(); + } + + add = add.reshape(0, 1); + add.copyTo(input_concat.row(in)); + } + + // Treat the input as separate feature maps + if (input_concat.rows == weights.cols) + { + input_concat = weights * input_concat; + // Add biases + for (size_t k = 0; k < biases.rows; ++k) + { + input_concat.row(k) = input_concat.row(k) + biases.at(k); + } + + outputs.clear(); + // Resize and add as output + for (size_t k = 0; k < biases.rows; ++k) + { + cv::Mat_ reshaped = 
input_concat.row(k).clone(); + reshaped = reshaped.reshape(1, orig_size.height); + outputs.push_back(reshaped); + } + } + else + { + // Flatten the input + input_concat = input_concat.reshape(0, input_concat.rows * input_concat.cols); + + input_concat = weights * input_concat + biases; + + outputs.clear(); + outputs.push_back(input_concat); + } + + } + else + { + cv::Mat out = weights * input_maps[0] + biases; + outputs.clear(); + outputs.push_back(out.t()); + } + +} + +void max_pooling(std::vector >& outputs, const std::vector >& input_maps, int stride_x, int stride_y, int kernel_size_x, int kernel_size_y) +{ + vector > outputs_sub; + + // Iterate over kernel height and width, based on stride + for (size_t in = 0; in < input_maps.size(); ++in) + { + // Help with rounding up a bit, to match caffe style output + int out_x = round((double)(input_maps[in].cols - kernel_size_x) / (double)stride_x) + 1; + int out_y = round((double)(input_maps[in].rows - kernel_size_y) / (double)stride_y) + 1; + + cv::Mat_ sub_out(out_y, out_x, 0.0); + cv::Mat_ in_map = input_maps[in]; + + for (int x = 0; x < input_maps[in].cols; x += stride_x) + { + int max_x = cv::min(input_maps[in].cols, x + kernel_size_x); + int x_in_out = floor(x / stride_x); + + if (x_in_out >= out_x) + continue; + + for (int y = 0; y < input_maps[in].rows; y += stride_y) + { + int y_in_out = floor(y / stride_y); + + if (y_in_out >= out_y) + continue; + + int max_y = cv::min(input_maps[in].rows, y + kernel_size_y); + + float curr_max = -FLT_MAX; + + for (int x_in = x; x_in < max_x; ++x_in) + { + for (int y_in = y; y_in < max_y; ++y_in) + { + float curr_val = in_map.at(y_in, x_in); + if (curr_val > curr_max) + { + curr_max = curr_val; + } + } + } + sub_out.at(y_in_out, x_in_out) = curr_max; + } + } + + outputs_sub.push_back(sub_out); + + } + outputs = outputs_sub; + +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////// + +void convolution_single_kern_fft(const 
vector >& input_imgs, vector >& img_dfts, const vector >& _templs, map > >& _templ_dfts, cv::Mat_& result) +{ + // Assume result is defined properly + if (result.empty()) + { + cv::Size corrSize(input_imgs[0].cols - _templs[0].cols + 1, input_imgs[0].rows - _templs[0].rows + 1); + result.create(corrSize); + } + + // Our model will always be under min block size so can ignore this + //const double blockScale = 4.5; + //const int minBlockSize = 256; + + int maxDepth = CV_64F; + + cv::Size dftsize; + + dftsize.width = cv::getOptimalDFTSize(result.cols + _templs[0].cols - 1); + dftsize.height = cv::getOptimalDFTSize(result.rows + _templs[0].rows - 1); + + // Compute block size + cv::Size blocksize; + blocksize.width = dftsize.width - _templs[0].cols + 1; + blocksize.width = MIN(blocksize.width, result.cols); + blocksize.height = dftsize.height - _templs[0].rows + 1; + blocksize.height = MIN(blocksize.height, result.rows); + + vector> dftTempl; + + // if this has not been precomputed, precompute it, otherwise use it + if (_templ_dfts.find(dftsize.width) == _templ_dfts.end()) + { + dftTempl.resize(_templs.size()); + for (size_t k = 0; k < _templs.size(); ++k) + { + dftTempl[k].create(dftsize.height, dftsize.width); + + cv::Mat_ src = _templs[k]; + + cv::Mat_ dst(dftTempl[k], cv::Rect(0, 0, dftsize.width, dftsize.height)); + + cv::Mat_ dst1(dftTempl[k], cv::Rect(0, 0, _templs[k].cols, _templs[k].rows)); + + if (dst1.data != src.data) + src.convertTo(dst1, dst1.depth()); + + if (dst.cols > _templs[k].cols) + { + cv::Mat_ part(dst, cv::Range(0, _templs[k].rows), cv::Range(_templs[k].cols, dst.cols)); + part.setTo(0); + } + + // Perform DFT of the template + dft(dst, dst, 0, _templs[k].rows); + + } + _templ_dfts[dftsize.width] = dftTempl; + + } + else + { + dftTempl = _templ_dfts[dftsize.width]; + } + + cv::Size bsz(std::min(blocksize.width, result.cols), std::min(blocksize.height, result.rows)); + cv::Mat src; + + cv::Mat cdst(result, cv::Rect(0, 0, bsz.width, bsz.height)); 
+ + vector > dftImgs; + dftImgs.resize(input_imgs.size()); + + if (img_dfts.empty()) + { + for(size_t k = 0; k < input_imgs.size(); ++k) + { + dftImgs[k].create(dftsize); + dftImgs[k].setTo(0.0); + + cv::Size dsz(bsz.width + _templs[k].cols - 1, bsz.height + _templs[k].rows - 1); + + int x2 = std::min(input_imgs[k].cols, dsz.width); + int y2 = std::min(input_imgs[k].rows, dsz.height); + + cv::Mat src0(input_imgs[k], cv::Range(0, y2), cv::Range(0, x2)); + cv::Mat dst(dftImgs[k], cv::Rect(0, 0, dsz.width, dsz.height)); + cv::Mat dst1(dftImgs[k], cv::Rect(0, 0, x2, y2)); + + src = src0; + + if (dst1.data != src.data) + src.convertTo(dst1, dst1.depth()); + + dft(dftImgs[k], dftImgs[k], 0, dsz.height); + img_dfts.push_back(dftImgs[k].clone()); + } + } + + cv::Mat_ dft_img(img_dfts[0].rows, img_dfts[0].cols, 0.0); + for (size_t k = 0; k < input_imgs.size(); ++k) + { + cv::Mat dftTempl1(dftTempl[k], cv::Rect(0, 0, dftsize.width, dftsize.height)); + if (k == 0) + { + cv::mulSpectrums(img_dfts[k], dftTempl1, dft_img, 0, true); + } + else + { + cv::mulSpectrums(img_dfts[k], dftTempl1, dftImgs[k], 0, true); + dft_img = dft_img + dftImgs[k]; + } + } + + cv::dft(dft_img, dft_img, cv::DFT_INVERSE + cv::DFT_SCALE, bsz.height); + + src = dft_img(cv::Rect(0, 0, bsz.width, bsz.height)); + + src.convertTo(cdst, CV_32F); + +} + +void im2col_t(const cv::Mat_& input, int width, int height, cv::Mat_& output) +{ + + int m = input.cols; + int n = input.rows; + + // determine how many blocks there will be with a sliding window of width x height in the input + int yB = m - height + 1; + int xB = n - width + 1; + + // Allocate the output size + if (output.rows != width * height && output.cols != xB*yB) + { + output = cv::Mat::ones(width * height, xB*yB, CV_32F); + } + + // Iterate over the whole image + for (int i = 0; i< yB; i++) + { + int rowIdx = i; + for (int j = 0; j< xB; j++) + { + //int rowIdx = i; +j*yB; + // iterate over the blocks within the image + for (unsigned int yy = 0; yy < 
height; ++yy) + { + // Faster iteration over the image + const float* Mi = input.ptr(j + yy); + for (unsigned int xx = 0; xx < width; ++xx) + { + int colIdx = xx*height + yy; + + output.at(colIdx, rowIdx) = Mi[i + xx]; + } + } + rowIdx += yB; + + } + } +} + +void convolution_direct(std::vector >& outputs, const std::vector >& input_maps, const cv::Mat_& weight_matrix, const std::vector& biases, int height_k, int width_k) +{ + outputs.clear(); + + int height_in = input_maps[0].rows; + int width_n = input_maps[0].cols; + + // determine how many blocks there will be with a sliding window of width x height in the input + int yB = height_in - height_k + 1; + int xB = width_n - width_k + 1; + + cv::Mat_ input_matrix(input_maps.size() * height_k * width_k + 1.0, yB * xB, 1.0f); + + // Comibine im2col accross channels to prepare for matrix multiplication + for (size_t i = 0; i < input_maps.size(); ++i) + { + im2col_t(input_maps[i], width_k, height_k, input_matrix(cv::Rect(0, i * height_k * width_k, yB * xB, height_k * width_k))); + } + + // Actual convolution (through multiplication) + cv::Mat_ out = weight_matrix * input_matrix; + + // Move back to vectors and reshape accordingly (also add the bias) + for (size_t k = 0; k < out.rows; ++k) + { + outputs.push_back(out.row(k).reshape(1, yB)); + } + +} + +void convolution_fft2(std::vector >& outputs, const std::vector >& input_maps, const std::vector > >& kernels, const std::vector& biases, vector > > >& precomp_dfts) +{ + outputs.clear(); + + // Useful precomputed data placeholders for quick correlation (convolution) + vector > input_image_dft; + + for (size_t k = 0; k < kernels.size(); ++k) + { + + // The convolution (with precomputation) + cv::Mat_ output; + convolution_single_kern_fft(input_maps, input_image_dft, kernels[k], precomp_dfts[k], output); + + // Combining the maps + outputs.push_back(output + biases[k]); + + } +} + +void convolution_fft(std::vector >& outputs, const std::vector >& input_maps, const std::vector 
> >& kernels, const std::vector& biases, vector > > >& precomp_dfts) +{ + outputs.clear(); + for (size_t in = 0; in < input_maps.size(); ++in) + { + cv::Mat_ input_image = input_maps[in]; + + // Useful precomputed data placeholders for quick correlation (convolution) + cv::Mat_ input_image_dft; + cv::Mat integral_image; + cv::Mat integral_image_sq; + + for (size_t k = 0; k < kernels[in].size(); ++k) + { + cv::Mat_ kernel = kernels[in][k]; + + // The convolution (with precomputation) + cv::Mat_ output; + if (precomp_dfts[in][k].second.empty()) + { + std::map > precomputed_dft; + + LandmarkDetector::matchTemplate_m(input_image, input_image_dft, integral_image, integral_image_sq, kernel, precomputed_dft, output, CV_TM_CCORR); + + precomp_dfts[in][k].first = precomputed_dft.begin()->first; + precomp_dfts[in][k].second = precomputed_dft.begin()->second; + } + else + { + std::map > precomputed_dft; + precomputed_dft[precomp_dfts[in][k].first] = precomp_dfts[in][k].second; + LandmarkDetector::matchTemplate_m(input_image, input_image_dft, integral_image, integral_image_sq, kernel, precomputed_dft, output, CV_TM_CCORR); + } + + // Combining the maps + if (in == 0) + { + outputs.push_back(output); + } + else + { + outputs[k] = outputs[k] + output; + } + + } + + } + + for (size_t k = 0; k < biases.size(); ++k) + { + outputs[k] = outputs[k] + biases[k]; + } +} + +std::vector> CNN::Inference(const cv::Mat& input_img, bool direct) +{ + if (input_img.channels() == 1) + { + cv::cvtColor(input_img, input_img, cv::COLOR_GRAY2BGR); + } + + int cnn_layer = 0; + int fully_connected_layer = 0; + int prelu_layer = 0; + int max_pool_layer = 0; + + // Slit a BGR image into three chnels + cv::Mat channels[3]; + cv::split(input_img, channels); + + // Flip the BGR order to RGB + vector > input_maps; + input_maps.push_back(channels[2]); + input_maps.push_back(channels[1]); + input_maps.push_back(channels[0]); + + vector > outputs; + + for (size_t layer = 0; layer < cnn_layer_types.size(); 
++layer) + { + + // Determine layer type + int layer_type = cnn_layer_types[layer]; + + // Convolutional layer + if (layer_type == 0) + { + + // Either perform direct convolution through matrix multiplication or use an FFT optimized version, which one is optimal depends on the kernel and input sizes + if (direct) + { + convolution_direct(outputs, input_maps, cnn_convolutional_layers_weights[cnn_layer], cnn_convolutional_layers_bias[cnn_layer], cnn_convolutional_layers[cnn_layer][0][0].rows, cnn_convolutional_layers[cnn_layer][0][0].cols); + } + else + { + convolution_fft2(outputs, input_maps, cnn_convolutional_layers[cnn_layer], cnn_convolutional_layers_bias[cnn_layer], cnn_convolutional_layers_dft[cnn_layer]); + } + //vector > outs; + //convolution_fft(outs, input_maps, cnn_convolutional_layers[cnn_layer], cnn_convolutional_layers_bias[cnn_layer], cnn_convolutional_layers_dft[cnn_layer]); + + + + cnn_layer++; + } + if (layer_type == 1) + { + + int stride_x = std::get<2>(cnn_max_pooling_layers[max_pool_layer]); + int stride_y = std::get<3>(cnn_max_pooling_layers[max_pool_layer]); + + int kernel_size_x = std::get<0>(cnn_max_pooling_layers[max_pool_layer]); + int kernel_size_y = std::get<1>(cnn_max_pooling_layers[max_pool_layer]); + + max_pooling(outputs, input_maps, stride_x, stride_y, kernel_size_x, kernel_size_y); + max_pool_layer++; + } + if (layer_type == 2) + { + fully_connected(outputs, input_maps, cnn_fully_connected_layers_weights[fully_connected_layer], cnn_fully_connected_layers_biases[fully_connected_layer]); + fully_connected_layer++; + } + if (layer_type == 3) // PReLU + { + // In place prelu computation + PReLU(input_maps, cnn_prelu_layer_weights[prelu_layer]); + outputs = input_maps; + prelu_layer++; + } + if (layer_type == 4) + { + outputs.clear(); + for (size_t k = 0; k < input_maps.size(); ++k) + { + // Apply the sigmoid + cv::exp(-input_maps[k], input_maps[k]); + input_maps[k] = 1.0 / (1.0 + input_maps[k]); + + outputs.push_back(input_maps[k]); + 
+ } + } + // Set the outputs of this layer to inputs of the next one + input_maps = outputs; + } + + + return outputs; + +} + +void ReadMatBin(std::ifstream& stream, cv::Mat &output_mat) +{ + // Read in the number of rows, columns and the data type + int row, col, type; + + stream.read((char*)&row, 4); + stream.read((char*)&col, 4); + stream.read((char*)&type, 4); + + output_mat = cv::Mat(row, col, type); + int size = output_mat.rows * output_mat.cols * output_mat.elemSize(); + stream.read((char *)output_mat.data, size); + +} + +void CNN::ClearPrecomp() +{ + for (size_t k1 = 0; k1 < cnn_convolutional_layers_dft.size(); ++k1) + { + for (size_t k2 = 0; k2 < cnn_convolutional_layers_dft[k1].size(); ++k2) + { + cnn_convolutional_layers_dft[k1][k2].clear(); + } + } +} + +void CNN::Read(const string& location) +{ + ifstream cnn_stream(location, ios::in | ios::binary); + if (cnn_stream.is_open()) + { + cnn_stream.seekg(0, ios::beg); + + // Reading in CNNs + + int network_depth; + cnn_stream.read((char*)&network_depth, 4); + + cnn_layer_types.resize(network_depth); + + for (int layer = 0; layer < network_depth; ++layer) + { + + int layer_type; + cnn_stream.read((char*)&layer_type, 4); + cnn_layer_types[layer] = layer_type; + + // convolutional + if (layer_type == 0) + { + + // Read the number of input maps + int num_in_maps; + cnn_stream.read((char*)&num_in_maps, 4); + + // Read the number of kernels for each input map + int num_kernels; + cnn_stream.read((char*)&num_kernels, 4); + + vector > > kernels; + + kernels.resize(num_in_maps); + + vector biases; + for (int k = 0; k < num_kernels; ++k) + { + float bias; + cnn_stream.read((char*)&bias, 4); + biases.push_back(bias); + } + + cnn_convolutional_layers_bias.push_back(biases); + + // For every input map + for (int in = 0; in < num_in_maps; ++in) + { + kernels[in].resize(num_kernels); + + // For every kernel on that input map + for (int k = 0; k < num_kernels; ++k) + { + ReadMatBin(cnn_stream, kernels[in][k]); + + } + } + 
+ // Rearrange the kernels for faster inference with FFT + vector > > kernels_rearr; + kernels_rearr.resize(num_kernels); + + // Fill up the rearranged layer + for (int k = 0; k < num_kernels; ++k) + { + for (int in = 0; in < num_in_maps; ++in) + { + kernels_rearr[k].push_back(kernels[in][k]); + } + } + + cnn_convolutional_layers.push_back(kernels_rearr); + + // Place-holders for DFT precomputation + vector > > > cnn_convolutional_layers_dft_curr_layer; + cnn_convolutional_layers_dft_curr_layer.resize(num_kernels); + cnn_convolutional_layers_dft.push_back(cnn_convolutional_layers_dft_curr_layer); + + // Rearrange the flattened kernels into weight matrices for direct convolution computation + cv::Mat_ weight_matrix(num_in_maps * kernels_rearr[0][0].rows * kernels_rearr[0][0].cols, num_kernels); + for (size_t k = 0; k < num_kernels; ++k) + { + for (size_t i = 0; i < num_in_maps; ++i) + { + // Flatten the kernel + cv::Mat_ k_flat = kernels_rearr[k][i].t(); + k_flat = k_flat.reshape(0, 1).t(); + k_flat.copyTo(weight_matrix(cv::Rect(k, i * kernels_rearr[0][0].rows * kernels_rearr[0][0].cols, 1, kernels_rearr[0][0].rows * kernels_rearr[0][0].cols))); + } + } + + // Transpose the weight matrix for more convenient computation + weight_matrix = weight_matrix.t(); + + // Add a bias term to the weight matrix for efficiency + cv::Mat_ W(weight_matrix.rows, weight_matrix.cols + 1, 1.0); + for (size_t k = 0; k < weight_matrix.rows; ++k) + { + W.at(k, weight_matrix.cols) = biases[k]; + } + weight_matrix.copyTo(W(cv::Rect(0, 0, weight_matrix.cols, weight_matrix.rows))); + + cnn_convolutional_layers_weights.push_back(W); + + } + else if (layer_type == 1) + { + int kernel_x, kernel_y, stride_x, stride_y; + cnn_stream.read((char*)&kernel_x, 4); + cnn_stream.read((char*)&kernel_y, 4); + cnn_stream.read((char*)&stride_x, 4); + cnn_stream.read((char*)&stride_y, 4); + cnn_max_pooling_layers.push_back(std::tuple(kernel_x, kernel_y, stride_x, stride_y)); + } + else if (layer_type == 2) + { 
+ cv::Mat_ biases; + ReadMatBin(cnn_stream, biases); + cnn_fully_connected_layers_biases.push_back(biases); + + // Fully connected layer + cv::Mat_ weights; + ReadMatBin(cnn_stream, weights); + cnn_fully_connected_layers_weights.push_back(weights.t()); + } + + else if (layer_type == 3) + { + cv::Mat_ weights; + ReadMatBin(cnn_stream, weights); + cnn_prelu_layer_weights.push_back(weights); + } + } + } + else + { + cout << "WARNING: Can't find the CNN location" << endl; + } +} + +//=========================================================================== +// Read in the MTCNN detector +void FaceDetectorMTCNN::Read(const string& location) +{ + + cout << "Reading the MTCNN face detector from: " << location << endl; + + ifstream locations(location.c_str(), ios_base::in); + if (!locations.is_open()) + { + cout << "Couldn't open the model file, aborting" << endl; + return; + } + string line; + + // The other module locations should be defined as relative paths from the main model + boost::filesystem::path root = boost::filesystem::path(location).parent_path(); + + // The main file contains the references to other files + while (!locations.eof()) + { + getline(locations, line); + + stringstream lineStream(line); + + string module; + string location; + + // figure out which module is to be read from which file + lineStream >> module; + + lineStream >> location; + + // remove carriage return at the end for compatibility with unix systems + if (location.size() > 0 && location.at(location.size() - 1) == '\r') + { + location = location.substr(0, location.size() - 1); + } + + // append to root + location = (root / location).string(); + if (module.compare("PNet") == 0) + { + cout << "Reading the PNet module from: " << location << endl; + PNet.Read(location); + } + else if(module.compare("RNet") == 0) + { + cout << "Reading the RNet module from: " << location << endl; + RNet.Read(location); + } + else if (module.compare("ONet") == 0) + { + cout << "Reading the ONet module from: 
" << location << endl; + ONet.Read(location); + } + } +} + +// Perform non maximum supression on proposal bounding boxes prioritizing boxes with high score/confidence +std::vector non_maximum_supression(const std::vector >& original_bb, const std::vector& scores, float thresh, bool minimum) +{ + + // Sort the input bounding boxes by the detection score, using the nice trick of multimap always being sorted internally + std::multimap idxs; + for (size_t i = 0; i < original_bb.size(); ++i) + { + idxs.insert(std::pair(scores[i], i)); + } + + std::vector output_ids; + + // keep looping while some indexes still remain in the indexes list + while (idxs.size() > 0) + { + // grab the last rectangle + auto lastElem = --std::end(idxs); + size_t curr_id = lastElem->second; + + const cv::Rect& rect1 = original_bb[curr_id]; + + idxs.erase(lastElem); + + // Iterate through remaining bounding boxes and choose which ones to remove + for (auto pos = std::begin(idxs); pos != std::end(idxs); ) + { + // grab the current rectangle + const cv::Rect& rect2 = original_bb[pos->second]; + + float intArea = (rect1 & rect2).area(); + float unionArea; + if (minimum) + { + unionArea = cv::min(rect1.area(), rect2.area()); + } + else + { + unionArea = rect1.area() + rect2.area() - intArea; + } + float overlap = intArea / unionArea; + + // Remove the bounding boxes with less confidence but with significant overlap with the current one + if (overlap > thresh) + { + pos = idxs.erase(pos); + } + else + { + ++pos; + } + } + output_ids.push_back(curr_id); + + } + + return output_ids; + +} + +// Helper function for selecting a subset of bounding boxes based on indices +void select_subset(const vector& to_keep, vector >& bounding_boxes, vector& scores, vector >& corrections) +{ + vector > bounding_boxes_tmp; + vector scores_tmp; + vector > corrections_tmp; + + for (size_t i = 0; i < to_keep.size(); ++i) + { + bounding_boxes_tmp.push_back(bounding_boxes[to_keep[i]]); + 
scores_tmp.push_back(scores[to_keep[i]]); + corrections_tmp.push_back(corrections[to_keep[i]]); + } + + bounding_boxes = bounding_boxes_tmp; + scores = scores_tmp; + corrections = corrections_tmp; +} + +// Use the heatmap generated by PNet to generate bounding boxes in the original image space, also generate the correction values and scores of the bounding boxes as well +void generate_bounding_boxes(vector >& o_bounding_boxes, vector& o_scores, vector >& o_corrections, const cv::Mat_& heatmap, const vector >& corrections, double scale, double threshold, int face_support) +{ + + // Correction for the pooling + int stride = 2; + + o_bounding_boxes.clear(); + o_scores.clear(); + o_corrections.clear(); + + int counter = 0; + for (int x = 0; x < heatmap.cols; ++x) + { + for(int y = 0; y < heatmap.rows; ++y) + { + if (heatmap.at(y, x) >= threshold) + { + float min_x = int((stride * x + 1) / scale); + float max_x = int((stride * x + face_support) / scale); + float min_y = int((stride * y + 1) / scale); + float max_y = int((stride * y + face_support) / scale); + + o_bounding_boxes.push_back(cv::Rect_(min_x, min_y, max_x - min_x, max_y - min_y)); + o_scores.push_back(heatmap.at(y, x)); + + float corr_x = corrections[0].at(y, x); + float corr_y = corrections[1].at(y, x); + float corr_width = corrections[2].at(y, x); + float corr_height = corrections[3].at(y, x); + o_corrections.push_back(cv::Rect_(corr_x, corr_y, corr_width, corr_height)); + + counter++; + } + } + } + +} + +// Converting the bounding boxes to squares +void rectify(vector >& total_bboxes) +{ + + // Apply size and location offsets + for (size_t i = 0; i < total_bboxes.size(); ++i) + { + float height = total_bboxes[i].height; + float width = total_bboxes[i].width; + + float max_side = max(width, height); + + // Correct the starts based on new size + float new_min_x = total_bboxes[i].x + 0.5 * (width - max_side); + float new_min_y = total_bboxes[i].y + 0.5 * (height - max_side); + + total_bboxes[i].x = 
(int)new_min_x; + total_bboxes[i].y = (int)new_min_y; + total_bboxes[i].width = (int)max_side; + total_bboxes[i].height = (int)max_side; + } +} + +void apply_correction(vector >& total_bboxes, const vector > corrections, bool add1) +{ + + // Apply size and location offsets + for (size_t i = 0; i < total_bboxes.size(); ++i) + { + cv::Rect curr_box = total_bboxes[i]; + if (add1) + { + curr_box.width++; + curr_box.height++; + } + + float new_min_x = curr_box.x + corrections[i].x * curr_box.width; + float new_min_y = curr_box.y + corrections[i].y * curr_box.height; + float new_max_x = curr_box.x + curr_box.width + curr_box.width * corrections[i].width; + float new_max_y = curr_box.y + curr_box.height + curr_box.height * corrections[i].height; + total_bboxes[i] = cv::Rect_(new_min_x, new_min_y, new_max_x - new_min_x, new_max_y - new_min_y); + + } + + +} + + +// The actual MTCNN face detection step +bool FaceDetectorMTCNN::DetectFaces(vector >& o_regions, const cv::Mat& img_in, std::vector& o_confidences, int min_face_size, double t1, double t2, double t3) +{ + + int height_orig = img_in.size().height; + int width_orig = img_in.size().width; + + // Size ratio of image pyramids + double pyramid_factor = 0.709; + + // Face support region is 12x12 px, so from that can work out the largest + // scale(which is 12 / min), and work down from there to smallest scale(no smaller than 12x12px) + int min_dim = std::min(height_orig, width_orig); + + int face_support = 12; + int num_scales = floor(log((double)min_face_size / (double)min_dim) / log(pyramid_factor)) + 1; + + cv::Mat input_img; + + if (img_in.channels() == 1) + { + cv::cvtColor(img_in, input_img, CV_GRAY2RGB); + } + else + { + input_img = img_in; + } + + cv::Mat img_float; + input_img.convertTo(img_float, CV_32FC3); + + vector > proposal_boxes_all; + vector scores_all; + vector > proposal_corrections_all; + + // As the scales will be done in parallel have some containers for them + vector > > 
proposal_boxes_cross_scale(num_scales); + vector > scores_cross_scale(num_scales); + vector > > proposal_corrections_cross_scale(num_scales); + + //tbb::parallel_for(0, (int)num_scales, [&](int i) { + for (int i = 0; i < num_scales; ++i) + { + double scale = ((double)face_support / (double)min_face_size)*cv::pow(pyramid_factor, i); + + int h_pyr = ceil(height_orig * scale); + int w_pyr = ceil(width_orig * scale); + + cv::Mat normalised_img; + cv::resize(img_float, normalised_img, cv::Size(w_pyr, h_pyr)); + + // Normalize the image + normalised_img = (normalised_img - 127.5) * 0.0078125; + + // Actual PNet CNN step + std::vector > pnet_out = PNet.Inference(normalised_img, true); + + // Clear the precomputations, as the image sizes will be different + PNet.ClearPrecomp(); + + // Extract the probabilities from PNet response + cv::Mat_ prob_heatmap; + cv::exp(pnet_out[0]- pnet_out[1], prob_heatmap); + prob_heatmap = 1.0 / (1.0 + prob_heatmap); + + // Extract the probabilities from PNet response + std::vector> corrections_heatmap(pnet_out.begin() + 2, pnet_out.end()); + + // Grab the detections + vector > proposal_boxes; + vector scores; + vector > proposal_corrections; + generate_bounding_boxes(proposal_boxes, scores, proposal_corrections, prob_heatmap, corrections_heatmap, scale, t1, face_support); + + proposal_boxes_cross_scale[i] = proposal_boxes; + scores_cross_scale[i] = scores; + proposal_corrections_cross_scale[i] = proposal_corrections; + } + //}); + + // Perform non-maximum supression on proposals accross scales and combine them + for (int i = 0; i < num_scales; ++i) + { + vector to_keep = non_maximum_supression(proposal_boxes_cross_scale[i], scores_cross_scale[i], 0.5, false); + select_subset(to_keep, proposal_boxes_cross_scale[i], scores_cross_scale[i], proposal_corrections_cross_scale[i]); + + proposal_boxes_all.insert(proposal_boxes_all.end(), proposal_boxes_cross_scale[i].begin(), proposal_boxes_cross_scale[i].end()); + scores_all.insert(scores_all.end(), 
scores_cross_scale[i].begin(), scores_cross_scale[i].end()); + proposal_corrections_all.insert(proposal_corrections_all.end(), proposal_corrections_cross_scale[i].begin(), proposal_corrections_cross_scale[i].end()); + } + + // Preparation for RNet step + + // Non maximum supression accross bounding boxes, and their offset correction + vector to_keep = non_maximum_supression(proposal_boxes_all, scores_all, 0.7, false); + select_subset(to_keep, proposal_boxes_all, scores_all, proposal_corrections_all); + + apply_correction(proposal_boxes_all, proposal_corrections_all, false); + + // Convert to rectangles and round + rectify(proposal_boxes_all); + + // Creating proposal images from previous step detections + vector above_thresh(proposal_boxes_all.size()); + //tbb::parallel_for(0, (int)proposal_boxes_all.size(), [&](int k) { + for (size_t k = 0; k < proposal_boxes_all.size(); ++k) + { + float width_target = proposal_boxes_all[k].width + 1; + float height_target = proposal_boxes_all[k].height + 1; + + // Work out the start and end indices in the original image + int start_x_in = cv::max((int)(proposal_boxes_all[k].x - 1), 0); + int start_y_in = cv::max((int)(proposal_boxes_all[k].y - 1), 0); + int end_x_in = cv::min((int)(proposal_boxes_all[k].x + width_target - 1), width_orig); + int end_y_in = cv::min((int)(proposal_boxes_all[k].y + height_target - 1), height_orig); + + // Work out the start and end indices in the target image + int start_x_out = cv::max((int)(-proposal_boxes_all[k].x + 1), 0); + int start_y_out = cv::max((int)(-proposal_boxes_all[k].y + 1), 0); + int end_x_out = cv::min(width_target - (proposal_boxes_all[k].x + proposal_boxes_all[k].width - width_orig), width_target); + int end_y_out = cv::min(height_target - (proposal_boxes_all[k].y + proposal_boxes_all[k].height - height_orig), height_target); + + cv::Mat tmp(height_target, width_target, CV_32FC3, cv::Scalar(0.0f,0.0f,0.0f)); + + img_float(cv::Rect(start_x_in, start_y_in, end_x_in - start_x_in, 
end_y_in - start_y_in)).copyTo( + tmp(cv::Rect(start_x_out, start_y_out, end_x_out - start_x_out, end_y_out - start_y_out))); + + cv::Mat prop_img; + cv::resize(tmp, prop_img, cv::Size(24, 24)); + + prop_img = (prop_img - 127.5) * 0.0078125; + + // Perform RNet on the proposal image + std::vector > rnet_out = RNet.Inference(prop_img, true); + + float prob = 1.0 / (1.0 + cv::exp(rnet_out[0].at(0) - rnet_out[0].at(1))); + scores_all[k] = prob; + proposal_corrections_all[k].x = rnet_out[0].at(2); + proposal_corrections_all[k].y = rnet_out[0].at(3); + proposal_corrections_all[k].width = rnet_out[0].at(4); + proposal_corrections_all[k].height = rnet_out[0].at(5); + if(prob >= t2) + { + above_thresh[k] = true; + } + else + { + above_thresh[k] = false; + } + + } + //}); + + to_keep.clear(); + for (size_t i = 0; i < above_thresh.size(); ++i) + { + if (above_thresh[i]) + { + to_keep.push_back(i); + } + } + + // Pick only the bounding boxes above the threshold + select_subset(to_keep, proposal_boxes_all, scores_all, proposal_corrections_all); + + // Non maximum supression accross bounding boxes, and their offset correction + to_keep = non_maximum_supression(proposal_boxes_all, scores_all, 0.7, false); + select_subset(to_keep, proposal_boxes_all, scores_all, proposal_corrections_all); + + apply_correction(proposal_boxes_all, proposal_corrections_all, false); + + // Convert to rectangles and round + rectify(proposal_boxes_all); + + // Preparing for the ONet stage + above_thresh.clear(); + above_thresh.resize(proposal_boxes_all.size()); + //tbb::parallel_for(0, (int)proposal_boxes_all.size(), [&](int k) { + for (size_t k = 0; k < proposal_boxes_all.size(); ++k) + { + float width_target = proposal_boxes_all[k].width + 1; + float height_target = proposal_boxes_all[k].height + 1; + + // Work out the start and end indices in the original image + int start_x_in = cv::max((int)(proposal_boxes_all[k].x - 1), 0); + int start_y_in = cv::max((int)(proposal_boxes_all[k].y - 1), 0); + int 
end_x_in = cv::min((int)(proposal_boxes_all[k].x + width_target - 1), width_orig); + int end_y_in = cv::min((int)(proposal_boxes_all[k].y + height_target - 1), height_orig); + + // Work out the start and end indices in the target image + int start_x_out = cv::max((int)(-proposal_boxes_all[k].x + 1), 0); + int start_y_out = cv::max((int)(-proposal_boxes_all[k].y + 1), 0); + int end_x_out = cv::min(width_target - (proposal_boxes_all[k].x + proposal_boxes_all[k].width - width_orig), width_target); + int end_y_out = cv::min(height_target - (proposal_boxes_all[k].y + proposal_boxes_all[k].height - height_orig), height_target); + + cv::Mat tmp(height_target, width_target, CV_32FC3, cv::Scalar(0.0f, 0.0f, 0.0f)); + + img_float(cv::Rect(start_x_in, start_y_in, end_x_in - start_x_in, end_y_in - start_y_in)).copyTo( + tmp(cv::Rect(start_x_out, start_y_out, end_x_out - start_x_out, end_y_out - start_y_out))); + + cv::Mat prop_img; + cv::resize(tmp, prop_img, cv::Size(48, 48)); + + prop_img = (prop_img - 127.5) * 0.0078125; + + // Perform RNet on the proposal image + std::vector > onet_out = ONet.Inference(prop_img, true); + + float prob = 1.0 / (1.0 + cv::exp(onet_out[0].at(0) - onet_out[0].at(1))); + scores_all[k] = prob; + proposal_corrections_all[k].x = onet_out[0].at(2); + proposal_corrections_all[k].y = onet_out[0].at(3); + proposal_corrections_all[k].width = onet_out[0].at(4); + proposal_corrections_all[k].height = onet_out[0].at(5); + if (prob >= t3) + { + above_thresh[k] = true; + } + else + { + above_thresh[k] = false; + } + } + //}); + + to_keep.clear(); + for (size_t i = 0; i < above_thresh.size(); ++i) + { + if (above_thresh[i]) + { + to_keep.push_back(i); + } + } + + // Pick only the bounding boxes above the threshold + select_subset(to_keep, proposal_boxes_all, scores_all, proposal_corrections_all); + apply_correction(proposal_boxes_all, proposal_corrections_all, true); + + // Non maximum supression accross bounding boxes, and their offset correction + to_keep = 
non_maximum_supression(proposal_boxes_all, scores_all, 0.7, true); + select_subset(to_keep, proposal_boxes_all, scores_all, proposal_corrections_all); + + // TODO rem + cv::Mat disp_img = input_img.clone(); + + // Correct the box to expectation to be tight around facial landmarks + for (size_t k = 0; k < proposal_boxes_all.size(); ++k) + { + proposal_boxes_all[k].x = proposal_boxes_all[k].width * -0.0075 + proposal_boxes_all[k].x; + proposal_boxes_all[k].y = proposal_boxes_all[k].height * 0.2459 + proposal_boxes_all[k].y; + proposal_boxes_all[k].width = 1.0323 * proposal_boxes_all[k].width; + proposal_boxes_all[k].height = 0.7751 * proposal_boxes_all[k].height; + + o_regions.push_back(cv::Rect_(proposal_boxes_all[k].x, proposal_boxes_all[k].y, proposal_boxes_all[k].width, proposal_boxes_all[k].height)); + o_confidences.push_back(scores_all[k]); + + cv::rectangle(disp_img, proposal_boxes_all[k], cv::Scalar(255, 0, 0), 3); + } + cv::imshow("detections", disp_img); + cv::waitKey(20); + + if(o_regions.size() > 0) + { + return true; + } + else + { + return false; + } +} + diff --git a/lib/local/LandmarkDetector/src/LandmarkDetectorFunc.cpp b/lib/local/LandmarkDetector/src/LandmarkDetectorFunc.cpp index 6b9b7158..ba6c9728 100644 --- a/lib/local/LandmarkDetector/src/LandmarkDetectorFunc.cpp +++ b/lib/local/LandmarkDetector/src/LandmarkDetectorFunc.cpp @@ -208,12 +208,23 @@ void CorrectGlobalParametersVideo(const cv::Mat_ &grayscale_image, CLNF& } -bool LandmarkDetector::DetectLandmarksInVideo(const cv::Mat_ &grayscale_image, CLNF& clnf_model, FaceModelParameters& params) +bool LandmarkDetector::DetectLandmarksInVideo(const cv::Mat &image, CLNF& clnf_model, FaceModelParameters& params) { // First need to decide if the landmarks should be "detected" or "tracked" // Detected means running face detection and a larger search area, tracked means initialising from previous step // and using a smaller search area + cv::Mat grayscale_image; + if (image.channels() == 3) + { + 
cv::cvtColor(image, grayscale_image, CV_BGR2GRAY); + } + else + { + grayscale_image = image.clone(); + } + + // Indicating that this is a first detection in video sequence or after restart bool initial_detection = !clnf_model.tracking_initialised; @@ -263,8 +274,13 @@ bool LandmarkDetector::DetectLandmarksInVideo(const cv::Mat_ &grayscale_i // If the face detector has not been initialised read it in if(clnf_model.face_detector_HAAR.empty()) { - clnf_model.face_detector_HAAR.load(params.face_detector_location); - clnf_model.face_detector_location = params.face_detector_location; + clnf_model.face_detector_HAAR.load(params.haar_face_detector_location); + clnf_model.haar_face_detector_location = params.haar_face_detector_location; + } + if (clnf_model.face_detector_MTCNN.empty()) + { + clnf_model.face_detector_MTCNN.Read(params.mtcnn_face_detector_location); + clnf_model.mtcnn_face_detector_location = params.haar_face_detector_location; } cv::Point preference_det(-1, -1); @@ -285,6 +301,11 @@ bool LandmarkDetector::DetectLandmarksInVideo(const cv::Mat_ &grayscale_i { face_detection_success = LandmarkDetector::DetectSingleFace(bounding_box, grayscale_image, clnf_model.face_detector_HAAR, preference_det); } + else if (params.curr_face_detector == FaceModelParameters::MTCNN_DETECTOR) + { + double confidence; + face_detection_success = LandmarkDetector::DetectSingleFaceMTCNN(bounding_box, image, clnf_model.face_detector_MTCNN, confidence, preference_det); + } // Attempt to detect landmarks using the detected face (if unseccessful the detection will be ignored) if(face_detection_success) @@ -350,7 +371,7 @@ bool LandmarkDetector::DetectLandmarksInVideo(const cv::Mat_ &grayscale_i } -bool LandmarkDetector::DetectLandmarksInVideo(const cv::Mat_ &grayscale_image, const cv::Rect_ bounding_box, CLNF& clnf_model, FaceModelParameters& params) +bool LandmarkDetector::DetectLandmarksInVideo(const cv::Mat &image, const cv::Rect_ bounding_box, CLNF& clnf_model, FaceModelParameters& 
params) { if(bounding_box.width > 0) { @@ -362,7 +383,7 @@ bool LandmarkDetector::DetectLandmarksInVideo(const cv::Mat_ &grayscale_i clnf_model.tracking_initialised = true; } - return DetectLandmarksInVideo(grayscale_image, clnf_model, params); + return DetectLandmarksInVideo(image, clnf_model, params); } @@ -621,9 +642,19 @@ bool DetectLandmarksInImageMultiHypEarlyTerm(const cv::Mat_ &grayscale_im // This is the one where the actual work gets done, other DetectLandmarksInImage calls lead to this one -bool LandmarkDetector::DetectLandmarksInImage(const cv::Mat_ &grayscale_image, const cv::Rect_ bounding_box, CLNF& clnf_model, FaceModelParameters& params) +bool LandmarkDetector::DetectLandmarksInImage(const cv::Mat &image, const cv::Rect_ bounding_box, CLNF& clnf_model, FaceModelParameters& params) { + cv::Mat grayscale_image; + if (image.channels() == 3) + { + cv::cvtColor(image, grayscale_image, CV_BGR2GRAY); + } + else + { + grayscale_image = image.clone(); + } + // Can have multiple hypotheses vector rotation_hypotheses; @@ -654,27 +685,41 @@ bool LandmarkDetector::DetectLandmarksInImage(const cv::Mat_ &grayscale_i // Either use basic multi-hypothesis testing or clever testing if early termination parameters are present if(clnf_model.patch_experts.early_term_biases.size() == 0) { - success = DetectLandmarksInImageMultiHypBasic(grayscale_image, rotation_hypotheses, bounding_box, clnf_model, params); + success = DetectLandmarksInImageMultiHypBasic(image, rotation_hypotheses, bounding_box, clnf_model, params); } else { - success = DetectLandmarksInImageMultiHypEarlyTerm(grayscale_image, rotation_hypotheses, bounding_box, clnf_model, params); + success = DetectLandmarksInImageMultiHypEarlyTerm(image, rotation_hypotheses, bounding_box, clnf_model, params); } return success; } -bool LandmarkDetector::DetectLandmarksInImage(const cv::Mat_ &grayscale_image, CLNF& clnf_model, FaceModelParameters& params) +bool LandmarkDetector::DetectLandmarksInImage(const cv::Mat 
&image, CLNF& clnf_model, FaceModelParameters& params) { + cv::Mat grayscale_image; + if (image.channels() == 3) + { + cv::cvtColor(image, grayscale_image, CV_BGR2GRAY); + } + else + { + grayscale_image = image.clone(); + } cv::Rect_ bounding_box; // If the face detector has not been initialised read it in - if(clnf_model.face_detector_HAAR.empty()) + if(clnf_model.face_detector_HAAR.empty() && params.curr_face_detector == FaceModelParameters::HAAR_DETECTOR) { - clnf_model.face_detector_HAAR.load(params.face_detector_location); - clnf_model.face_detector_location = params.face_detector_location; + clnf_model.face_detector_HAAR.load(params.haar_face_detector_location); + clnf_model.haar_face_detector_location = params.haar_face_detector_location; } - + + if (clnf_model.face_detector_MTCNN.empty() && params.curr_face_detector == FaceModelParameters::MTCNN_DETECTOR) + { + clnf_model.face_detector_MTCNN.Read(params.mtcnn_face_detector_location); + } + // Detect the face first if(params.curr_face_detector == FaceModelParameters::HOG_SVM_DETECTOR) { @@ -683,7 +728,12 @@ bool LandmarkDetector::DetectLandmarksInImage(const cv::Mat_ &grayscale_i } else if(params.curr_face_detector == FaceModelParameters::HAAR_DETECTOR) { - LandmarkDetector::DetectSingleFace(bounding_box, grayscale_image, clnf_model.face_detector_HAAR); + LandmarkDetector::DetectSingleFace(bounding_box, image, clnf_model.face_detector_HAAR); + } + else if (params.curr_face_detector == FaceModelParameters::MTCNN_DETECTOR) + { + double confidence; + LandmarkDetector::DetectSingleFaceMTCNN(bounding_box, image, clnf_model.face_detector_MTCNN, confidence); } if(bounding_box.width == 0) @@ -692,6 +742,6 @@ bool LandmarkDetector::DetectLandmarksInImage(const cv::Mat_ &grayscale_i } else { - return DetectLandmarksInImage(grayscale_image, bounding_box, clnf_model, params); + return DetectLandmarksInImage(image, bounding_box, clnf_model, params); } } diff --git 
a/lib/local/LandmarkDetector/src/LandmarkDetectorModel.cpp b/lib/local/LandmarkDetector/src/LandmarkDetectorModel.cpp index 92b4a9ef..c72d3e6e 100644 --- a/lib/local/LandmarkDetector/src/LandmarkDetectorModel.cpp +++ b/lib/local/LandmarkDetector/src/LandmarkDetectorModel.cpp @@ -67,9 +67,9 @@ CLNF::CLNF(string fname) // Copy constructor (makes a deep copy of CLNF) CLNF::CLNF(const CLNF& other): pdm(other.pdm), params_local(other.params_local.clone()), params_global(other.params_global), detected_landmarks(other.detected_landmarks.clone()), - landmark_likelihoods(other.landmark_likelihoods.clone()), patch_experts(other.patch_experts), landmark_validator(other.landmark_validator), face_detector_location(other.face_detector_location), - hierarchical_mapping(other.hierarchical_mapping), hierarchical_models(other.hierarchical_models), hierarchical_model_names(other.hierarchical_model_names), - hierarchical_params(other.hierarchical_params), eye_model(other.eye_model) + landmark_likelihoods(other.landmark_likelihoods.clone()), patch_experts(other.patch_experts), landmark_validator(other.landmark_validator), haar_face_detector_location(other.haar_face_detector_location), + mtcnn_face_detector_location(other.mtcnn_face_detector_location), hierarchical_mapping(other.hierarchical_mapping), hierarchical_models(other.hierarchical_models), hierarchical_model_names(other.hierarchical_model_names), + hierarchical_params(other.hierarchical_params), eye_model(other.eye_model), face_detector_MTCNN(other.face_detector_MTCNN) { this->detection_success = other.detection_success; this->tracking_initialised = other.tracking_initialised; @@ -78,9 +78,9 @@ CLNF::CLNF(const CLNF& other): pdm(other.pdm), params_local(other.params_local.c this->failures_in_a_row = other.failures_in_a_row; // Load the CascadeClassifier (as it does not have a proper copy constructor) - if(!face_detector_location.empty()) + if(!haar_face_detector_location.empty()) { - 
this->face_detector_HAAR.load(face_detector_location); + this->face_detector_HAAR.load(haar_face_detector_location); } // Make sure the matrices are allocated properly this->triangulations.resize(other.triangulations.size()); @@ -114,7 +114,8 @@ CLNF & CLNF::operator= (const CLNF& other) landmark_likelihoods =other.landmark_likelihoods.clone(); patch_experts = Patch_experts(other.patch_experts); landmark_validator = DetectionValidator(other.landmark_validator); - face_detector_location = other.face_detector_location; + haar_face_detector_location = other.haar_face_detector_location; + mtcnn_face_detector_location = other.mtcnn_face_detector_location; this->detection_success = other.detection_success; this->tracking_initialised = other.tracking_initialised; @@ -125,9 +126,9 @@ CLNF & CLNF::operator= (const CLNF& other) this->eye_model = other.eye_model; // Load the CascadeClassifier (as it does not have a proper copy constructor) - if(!face_detector_location.empty()) + if(!haar_face_detector_location.empty()) { - this->face_detector_HAAR.load(face_detector_location); + this->face_detector_HAAR.load(haar_face_detector_location); } // Make sure the matrices are allocated properly this->triangulations.resize(other.triangulations.size()); @@ -149,9 +150,12 @@ CLNF & CLNF::operator= (const CLNF& other) this->hierarchical_models = other.hierarchical_models; this->hierarchical_model_names = other.hierarchical_model_names; this->hierarchical_params = other.hierarchical_params; + + mtcnn_face_detector_location = other.mtcnn_face_detector_location; + face_detector_MTCNN = other.face_detector_MTCNN; } - face_detector_HOG = dlib::get_frontal_face_detector(); + face_detector_HOG = dlib::get_frontal_face_detector(); return *this; } @@ -172,7 +176,8 @@ CLNF::CLNF(const CLNF&& other) landmark_likelihoods = other.landmark_likelihoods; patch_experts = other.patch_experts; landmark_validator = other.landmark_validator; - face_detector_location = other.face_detector_location; + 
haar_face_detector_location = other.haar_face_detector_location; + mtcnn_face_detector_location = other.mtcnn_face_detector_location; face_detector_HAAR = other.face_detector_HAAR; @@ -181,6 +186,8 @@ CLNF::CLNF(const CLNF&& other) face_detector_HOG = dlib::get_frontal_face_detector(); + face_detector_MTCNN = other.face_detector_MTCNN; + // Copy over the hierarchical models this->hierarchical_mapping = other.hierarchical_mapping; this->hierarchical_models = other.hierarchical_models; @@ -207,7 +214,8 @@ CLNF & CLNF::operator= (const CLNF&& other) landmark_likelihoods = other.landmark_likelihoods; patch_experts = other.patch_experts; landmark_validator = other.landmark_validator; - face_detector_location = other.face_detector_location; + haar_face_detector_location = other.haar_face_detector_location; + mtcnn_face_detector_location = other.mtcnn_face_detector_location; face_detector_HAAR = other.face_detector_HAAR; @@ -216,6 +224,8 @@ CLNF & CLNF::operator= (const CLNF&& other) face_detector_HOG = dlib::get_frontal_face_detector(); + face_detector_MTCNN = other.face_detector_MTCNN; + // Copy over the hierarchical models this->hierarchical_mapping = other.hierarchical_mapping; this->hierarchical_models = other.hierarchical_models; diff --git a/lib/local/LandmarkDetector/src/LandmarkDetectorParameters.cpp b/lib/local/LandmarkDetector/src/LandmarkDetectorParameters.cpp index b3d805e3..2a3f7fd0 100644 --- a/lib/local/LandmarkDetector/src/LandmarkDetectorParameters.cpp +++ b/lib/local/LandmarkDetector/src/LandmarkDetectorParameters.cpp @@ -86,7 +86,7 @@ FaceModelParameters::FaceModelParameters(vector &arguments) if (arguments[i].compare("-fdloc") ==0) { string face_detector_loc = arguments[i + 1]; - face_detector_location = face_detector_loc; + haar_face_detector_location = face_detector_loc; curr_face_detector = HAAR_DETECTOR; valid[i] = false; valid[i + 1] = false; @@ -176,8 +176,10 @@ FaceModelParameters::FaceModelParameters(vector &arguments) valid[i] = false; // For 
in-the-wild images use an in-the wild detector - curr_face_detector = HOG_SVM_DETECTOR; + curr_face_detector = MTCNN_DETECTOR; + // Use multi-view hypotheses if in-the-wild setting + multi_view = true; } } @@ -216,6 +218,46 @@ FaceModelParameters::FaceModelParameters(vector &arguments) sigma = 1.5 * sigma; reg_factor = 0.9 * reg_factor; } + + // Make sure face detector location is valid + // First check working directory, then the executable's directory, then the config path set by the build process. + model_path = boost::filesystem::path(haar_face_detector_location); + if (boost::filesystem::exists(model_path)) + { + haar_face_detector_location = model_path.string(); + } + else if (boost::filesystem::exists(root / model_path)) + { + haar_face_detector_location = (root / model_path).string(); + } + else if (boost::filesystem::exists(config_path / model_path)) + { + haar_face_detector_location = (config_path / model_path).string(); + } + else + { + std::cout << "Could not find the HAAR face detector location" << std::endl; + } + + // Make sure face detector location is valid + // First check working directory, then the executable's directory, then the config path set by the build process. 
+ model_path = boost::filesystem::path(mtcnn_face_detector_location); + if (boost::filesystem::exists(model_path)) + { + mtcnn_face_detector_location = model_path.string(); + } + else if (boost::filesystem::exists(root / model_path)) + { + mtcnn_face_detector_location = (root / model_path).string(); + } + else if (boost::filesystem::exists(config_path / model_path)) + { + mtcnn_face_detector_location = (config_path / model_path).string(); + } + else + { + std::cout << "Could not find the MTCNN face detector location" << std::endl; + } } void FaceModelParameters::init() @@ -269,11 +311,12 @@ void FaceModelParameters::init() reinit_video_every = 4; // Face detection - face_detector_location = "classifiers/haarcascade_frontalface_alt.xml"; + haar_face_detector_location = "classifiers/haarcascade_frontalface_alt.xml"; + mtcnn_face_detector_location = "model/mtcnn_detector/MTCNN_detector.txt"; quiet_mode = false; - // By default use HOG SVM - curr_face_detector = HOG_SVM_DETECTOR; + // By default use MTCNN + curr_face_detector = MTCNN_DETECTOR; // The gaze tracking has to be explicitly initialised track_gaze = false; diff --git a/lib/local/LandmarkDetector/src/LandmarkDetectorUtils.cpp b/lib/local/LandmarkDetector/src/LandmarkDetectorUtils.cpp index ffba8441..34b21a6d 100644 --- a/lib/local/LandmarkDetector/src/LandmarkDetectorUtils.cpp +++ b/lib/local/LandmarkDetector/src/LandmarkDetectorUtils.cpp @@ -1560,94 +1560,168 @@ namespace LandmarkDetector return detect_success; } - //============================================================================ - // Matrix reading functionality - //============================================================================ +bool DetectFacesMTCNN(vector >& o_regions, const cv::Mat& image, LandmarkDetector::FaceDetectorMTCNN& detector, std::vector& o_confidences) +{ + detector.DetectFaces(o_regions, image, o_confidences); - // Reading in a matrix from a stream - void ReadMat(std::ifstream& stream, cv::Mat &output_mat) + return 
o_regions.size() > 0; +} + +bool DetectSingleFaceMTCNN(cv::Rect_& o_region, const cv::Mat& image, LandmarkDetector::FaceDetectorMTCNN& detector, double& confidence, cv::Point preference) +{ + // The tracker can return multiple faces + vector > face_detections; + vector confidences; + + detector.DetectFaces(face_detections, image, confidences); + + bool detect_success = face_detections.size() > 0; + if (detect_success) { - // Read in the number of rows, columns and the data type - int row, col, type; - stream >> row >> col >> type; + bool use_preferred = (preference.x != -1) && (preference.y != -1); - output_mat = cv::Mat(row, col, type); - - switch (output_mat.type()) + // keep the most confident one or the one closest to preference point if set + double best_so_far; + if (use_preferred) { - case CV_64FC1: + best_so_far = sqrt((preference.x - (face_detections[0].width / 2 + face_detections[0].x)) * (preference.x - (face_detections[0].width / 2 + face_detections[0].x)) + + (preference.y - (face_detections[0].height / 2 + face_detections[0].y)) * (preference.y - (face_detections[0].height / 2 + face_detections[0].y))); + } + else { - cv::MatIterator_ begin_it = output_mat.begin(); - cv::MatIterator_ end_it = output_mat.end(); + best_so_far = confidences[0]; + } + int bestIndex = 0; - while (begin_it != end_it) + for (size_t i = 1; i < face_detections.size(); ++i) + { + + double dist; + bool better; + + if (use_preferred) { - stream >> *begin_it++; + dist = sqrt((preference.x - (face_detections[0].width / 2 + face_detections[0].x)) * (preference.x - (face_detections[0].width / 2 + face_detections[0].x)) + + (preference.y - (face_detections[0].height / 2 + face_detections[0].y)) * (preference.y - (face_detections[0].height / 2 + face_detections[0].y))); + better = dist < best_so_far; + } + else + { + dist = confidences[i]; + better = dist > best_so_far; + } + + // Pick a closest face + if (better) + { + best_so_far = dist; + bestIndex = i; } } - break; - case CV_32FC1: 
- { - cv::MatIterator_ begin_it = output_mat.begin(); - cv::MatIterator_ end_it = output_mat.end(); - while (begin_it != end_it) - { - stream >> *begin_it++; - } - } - break; - case CV_32SC1: - { - cv::MatIterator_ begin_it = output_mat.begin(); - cv::MatIterator_ end_it = output_mat.end(); - while (begin_it != end_it) - { - stream >> *begin_it++; - } - } - break; - case CV_8UC1: - { - cv::MatIterator_ begin_it = output_mat.begin(); - cv::MatIterator_ end_it = output_mat.end(); - while (begin_it != end_it) - { - stream >> *begin_it++; - } - } - break; - default: - printf("ERROR(%s,%d) : Unsupported Matrix type %d!\n", __FILE__, __LINE__, output_mat.type()); abort(); + o_region = face_detections[bestIndex]; + confidence = confidences[bestIndex]; + } + else + { + // if not detected + o_region = cv::Rect_(0, 0, 0, 0); + // A completely unreliable detection (shouldn't really matter what is returned here) + confidence = -2; + } + return detect_success; +} +//============================================================================ +// Matrix reading functionality +//============================================================================ + +// Reading in a matrix from a stream +void ReadMat(std::ifstream& stream, cv::Mat &output_mat) +{ + // Read in the number of rows, columns and the data type + int row, col, type; + + stream >> row >> col >> type; + + output_mat = cv::Mat(row, col, type); + + switch (output_mat.type()) + { + case CV_64FC1: + { + cv::MatIterator_ begin_it = output_mat.begin(); + cv::MatIterator_ end_it = output_mat.end(); + + while (begin_it != end_it) + { + stream >> *begin_it++; } } - - void ReadMatBin(std::ifstream& stream, cv::Mat &output_mat) + break; + case CV_32FC1: { - // Read in the number of rows, columns and the data type - int row, col, type; + cv::MatIterator_ begin_it = output_mat.begin(); + cv::MatIterator_ end_it = output_mat.end(); - stream.read((char*)&row, 4); - stream.read((char*)&col, 4); - stream.read((char*)&type, 4); - - 
output_mat = cv::Mat(row, col, type); - int size = output_mat.rows * output_mat.cols * output_mat.elemSize(); - stream.read((char *)output_mat.data, size); - - } - - // Skipping lines that start with # (together with empty lines) - void SkipComments(std::ifstream& stream) - { - while (stream.peek() == '#' || stream.peek() == '\n' || stream.peek() == ' ' || stream.peek() == '\r') + while (begin_it != end_it) { - std::string skipped; - std::getline(stream, skipped); + stream >> *begin_it++; } } + break; + case CV_32SC1: + { + cv::MatIterator_ begin_it = output_mat.begin(); + cv::MatIterator_ end_it = output_mat.end(); + while (begin_it != end_it) + { + stream >> *begin_it++; + } + } + break; + case CV_8UC1: + { + cv::MatIterator_ begin_it = output_mat.begin(); + cv::MatIterator_ end_it = output_mat.end(); + while (begin_it != end_it) + { + stream >> *begin_it++; + } + } + break; + default: + printf("ERROR(%s,%d) : Unsupported Matrix type %d!\n", __FILE__, __LINE__, output_mat.type()); abort(); + + + } +} + +void ReadMatBin(std::ifstream& stream, cv::Mat &output_mat) +{ + // Read in the number of rows, columns and the data type + int row, col, type; + + stream.read((char*)&row, 4); + stream.read((char*)&col, 4); + stream.read((char*)&type, 4); + + output_mat = cv::Mat(row, col, type); + int size = output_mat.rows * output_mat.cols * output_mat.elemSize(); + stream.read((char *)output_mat.data, size); + +} + +// Skipping lines that start with # (together with empty lines) +void SkipComments(std::ifstream& stream) +{ + while (stream.peek() == '#' || stream.peek() == '\n' || stream.peek() == ' ' || stream.peek() == '\r') + { + std::string skipped; + std::getline(stream, skipped); + } +} } diff --git a/matlab_version/demo/face_image_demo.m b/matlab_version/demo/face_image_demo.m index 654b0acb..80ad8973 100644 --- a/matlab_version/demo/face_image_demo.m +++ b/matlab_version/demo/face_image_demo.m @@ -16,6 +16,10 @@ addpath('../CCNF/'); clmParams.multi_modal_types = 
patches(1).multi_modal_types; +% Dependencies for face detection (MatConvNet), remove if not present +setup_mconvnet; +addpath('../face_detection/mtcnn/'); + %% root_dir = '../../samples/'; images = dir([root_dir, '*.jpg']); @@ -25,8 +29,11 @@ verbose = true; for img=1:numel(images) image_orig = imread([root_dir images(img).name]); + % MTCNN face detector + [bboxs, det_shapes, confidences] = detect_face_mtcnn(image_orig); + % First attempt to use the Matlab one (fastest but not as accurate, if not present use yu et al.) - [bboxs, det_shapes] = detect_faces(image_orig, {'cascade', 'yu'}); + % [bboxs, det_shapes] = detect_faces(image_orig, {'cascade', 'yu'}); % Zhu and Ramanan and Yu et al. are slower, but also more accurate % and can be used when vision toolbox is unavailable % [bboxs, det_shapes] = detect_faces(image_orig, {'yu', 'zhu'}); @@ -52,28 +59,14 @@ for img=1:numel(images) hold on; end - for i=1:size(bboxs,2) + for i=1:size(bboxs,1) % Convert from the initial detected shape to CLM model parameters, % if shape is available - bbox = bboxs(:,i); - - if(~isempty(det_shapes)) - shape = det_shapes(:,:,i); - inds = [1:60,62:64,66:68]; - M = pdm.M([inds, inds+68, inds+68*2]); - E = pdm.E; - V = pdm.V([inds, inds+68, inds+68*2],:); - [ a, R, T, ~, params, err, shapeOrtho] = fit_PDM_ortho_proj_to_2D(M, E, V, shape); - g_param = [a; Rot2Euler(R)'; T]; - l_param = params; + bbox = bboxs(i,:); - % Use the initial global and local params for clm fitting in the image - [shape,~,~,lhood,lmark_lhood,view_used] = Fitting_from_bb(image, [], bbox, pdm, patches, clmParams, 'gparam', g_param, 'lparam', l_param); - else - [shape,~,~,lhood,lmark_lhood,view_used] = Fitting_from_bb(image, [], bbox, pdm, patches, clmParams); - end + [shape,~,~,lhood,lmark_lhood,view_used] = Fitting_from_bb(image, [], bbox, pdm, patches, clmParams); % shape correction for matlab format shape = shape + 1; diff --git a/matlab_version/demo/face_video_demo.m b/matlab_version/demo/face_video_demo.m index 
2241c686..ba84b2d2 100644 --- a/matlab_version/demo/face_video_demo.m +++ b/matlab_version/demo/face_video_demo.m @@ -33,6 +33,12 @@ od = cd('../face_validation/'); setup; cd(od); +% Setup the face detector (remove the setup mconvnet if not using +% MatConvNet) +setup_mconvnet; +addpath('../face_detection/mtcnn/'); + + %% for v=1:numel(vids) % load the video @@ -66,8 +72,9 @@ for v=1:numel(vids) image_orig = read(vr, i); if((~det && mod(i,4) == 0) || ~initialised) + [bboxs, det_shapes, confidences] = detect_face_mtcnn(image_orig); % First attempt to use the Matlab one (fastest but not as accurate, if not present use yu et al.) - [bboxs, det_shapes] = detect_faces(image_orig, {'cascade', 'yu'}); + % [bboxs, det_shapes] = detect_faces(image_orig, {'cascade', 'yu'}); % Zhu and Ramanan and Yu et al. are slower, but also more accurate % and can be used when vision toolbox is unavailable % [bboxs, det_shapes] = detect_faces(image_orig, {'yu', 'zhu'}); @@ -75,8 +82,8 @@ for v=1:numel(vids) if(~isempty(bboxs)) % Pick the biggest face for tracking - [~,ind] = max(bboxs(3,:) - bboxs(1,:)); - bbox = bboxs(:,ind); + [~,ind] = max(bboxs(:,3) - bboxs(:,1)); + bbox = bboxs(ind,:); % Discard overly small detections if(bbox(3) - bbox(1) > 40) @@ -84,39 +91,27 @@ for v=1:numel(vids) % Either infer the local and global shape parameters % from the detected landmarks or just using the % bounding box - if(~isempty(det_shapes)) - shape = det_shapes(:,:,ind); - - inds = [1:60,62:64,66:68]; - M = pdm.M([inds, inds+68, inds+68*2]); - E = pdm.E; - V = pdm.V([inds, inds+68, inds+68*2],:); - [ a, R, T, ~, params, err] = fit_PDM_ortho_proj_to_2D(M, E, V, shape); - g_param_n = [a; Rot2Euler(R)'; T]; - l_param_n = params; - else - num_points = numel(pdm.M) / 3; + num_points = numel(pdm.M) / 3; - M = reshape(pdm.M, num_points, 3); - width_model = max(M(:,1)) - min(M(:,1)); - height_model = max(M(:,2)) - min(M(:,2)); + M = reshape(pdm.M, num_points, 3); + width_model = max(M(:,1)) - min(M(:,1)); + 
height_model = max(M(:,2)) - min(M(:,2)); - a = (((bbox(3) - bbox(1)) / width_model) + ((bbox(4) - bbox(2))/ height_model)) / 2; + a = (((bbox(3) - bbox(1)) / width_model) + ((bbox(4) - bbox(2))/ height_model)) / 2; - tx = (bbox(3) + bbox(1))/2; - ty = (bbox(4) + bbox(2))/2; + tx = (bbox(3) + bbox(1))/2; + ty = (bbox(4) + bbox(2))/2; - % correct it so that the bounding box is just around the minimum - % and maximum point in the initialised face - tx = tx - a*(min(M(:,1)) + max(M(:,1)))/2; - ty = ty + a*(min(M(:,2)) + max(M(:,2)))/2; + % correct it so that the bounding box is just around the minimum + % and maximum point in the initialised face + tx = tx - a*(min(M(:,1)) + max(M(:,1)))/2; + ty = ty + a*(min(M(:,2)) + max(M(:,2)))/2; - % visualisation - g_param_n = [a, 0, 0, 0, tx, ty]'; + % visualisation + g_param_n = [a, 0, 0, 0, tx, ty]'; - l_param_n = zeros(size(pdm.E)); - end + l_param_n = zeros(size(pdm.E)); % If tracking has not started trust the detection if(~initialised) @@ -186,7 +181,7 @@ for v=1:numel(vids) end hold off; drawnow expose; - pause(0.05); + pause(0.01); if(record) frame = getframe; diff --git a/matlab_version/demo/setup_mconvnet.m b/matlab_version/demo/setup_mconvnet.m new file mode 100644 index 00000000..f4837f11 --- /dev/null +++ b/matlab_version/demo/setup_mconvnet.m @@ -0,0 +1,28 @@ +function setup(varargin) + +try + run D:\soft\matconvnet-master\matconvnet-master\matlab/vl_setupnn ; + addpath D:\soft\matconvnet-master\matconvnet-master\examples ; + + opts.useGpu = false ; + opts.verbose = false ; + opts = vl_argparse(opts, varargin) ; + + try + vl_nnconv(single(1),single(1),[]) ; + catch + warning('VL_NNCONV() does not seem to be compiled. 
Trying to compile it now.') ; + vl_compilenn('enableGpu', opts.useGpu, 'verbose', opts.verbose) ; + end + + if opts.useGpu + try + vl_nnconv(gpuArray(single(1)),gpuArray(single(1)),[]) ; + catch + vl_compilenn('enableGpu', opts.useGpu, 'verbose', opts.verbose) ; + warning('GPU support does not seem to be compiled in MatConvNet. Trying to compile it now') ; + end + end +catch + fprintf('Could not setup MatConvNet, face detection will be slower, install the library and set the right location for it in setup_mconvnet.m\n'); +end \ No newline at end of file diff --git a/matlab_version/face_detection/detect_faces.m b/matlab_version/face_detection/detect_faces.m index 42cdc1d0..078ac069 100644 --- a/matlab_version/face_detection/detect_faces.m +++ b/matlab_version/face_detection/detect_faces.m @@ -4,8 +4,8 @@ function [ bboxes, shapes ] = detect_faces( image, types ) % image - the image to detect the faces on % type - cell array of the face detectors to use: 'zhu', 'yu', 'cascade' % OUTPUT: -% bboxes - a set of bounding boxes describing the detected faces 4 x -% num_faces, the format is [min_x; min_y; max_x; max_y]; +% bboxes - a set of bounding boxes describing the detected faces num_faces x +% 4, the format is [min_x; min_y; max_x; max_y]; % shapes - if the face detector detects landmarks as well, output them % n_points x 2 x num_faces @@ -57,6 +57,6 @@ function [ bboxes, shapes ] = detect_faces( image, types ) if(use_zhu && isempty(bboxes)) [bboxes, shapes] = Detect_tree_based_zhu(image); end - + bboxes = bboxes'' end diff --git a/matlab_version/face_detection/mtcnn/ONet.m b/matlab_version/face_detection/mtcnn/ONet.m new file mode 100644 index 00000000..14d62325 --- /dev/null +++ b/matlab_version/face_detection/mtcnn/ONet.m @@ -0,0 +1,37 @@ +function [ out_prob, out_correction, out_lmarks ] = ONet( im_data, ONet_mlab ) +%PNET Summary of this function goes here +% Detailed explanation goes here + + % The convolutional and pooling layers + out = convolution(im_data, 
ONet_mlab.weights_conv1, ONet_mlab.biases_conv1); + out = PReLU(out, ONet_mlab.prelu_weights_1); + out = max_pooling2(out, 3, 2); + out = convolution(out, ONet_mlab.weights_conv2, ONet_mlab.biases_conv2); + out = PReLU(out, ONet_mlab.prelu_weights_2); + out = max_pooling2(out, 3, 2); + out = convolution(out, ONet_mlab.weights_conv3, ONet_mlab.biases_conv3); + out = PReLU(out, ONet_mlab.prelu_weights_3); + out = max_pooling2(out, 2, 2); + out = convolution(out, ONet_mlab.weights_conv4, ONet_mlab.biases_conv4); + out = PReLU(out, ONet_mlab.prelu_weights_4); + + % The fully connected layers + + out_fc_1 = zeros(size(out,1)*size(out,2) * size(out,3), size(out,4)); + out_fc_1(:) = out(:); + out_fc_1 = out_fc_1' * ONet_mlab.w_fc1 + ONet_mlab.b_fc1'; + out_fc_1 = PReLU(out_fc_1, ONet_mlab.prelu_fc1); + + out_fc2 = out_fc_1 * ONet_mlab.w_fc2 + ONet_mlab.b_fc2'; + out_fc2 = out_fc2'; + + % Probability of each proposal + out_prob = 1./(1+exp(out_fc2(1,:)-out_fc2(2,:))); + + % The correction of each detection + out_correction = out_fc2(3:6,:); + + % The actual detected landmarks + out_lmarks = out_fc2(7:end,:); +end + diff --git a/matlab_version/face_detection/mtcnn/ONet_mlab.mat b/matlab_version/face_detection/mtcnn/ONet_mlab.mat new file mode 100644 index 00000000..5f08c114 Binary files /dev/null and b/matlab_version/face_detection/mtcnn/ONet_mlab.mat differ diff --git a/matlab_version/face_detection/mtcnn/PNet.m b/matlab_version/face_detection/mtcnn/PNet.m new file mode 100644 index 00000000..ca22fd68 --- /dev/null +++ b/matlab_version/face_detection/mtcnn/PNet.m @@ -0,0 +1,26 @@ +function [ out_prob, out_correction ] = PNet( im_data, PNet_mlab ) +%PNET Summary of this function goes here +% Detailed explanation goes here + + % Pass through the first convolution layer + out = convolution(im_data, PNet_mlab.weights_conv1, PNet_mlab.biases_conv1); + out = PReLU(out, PNet_mlab.prelu_weights_1); + out = max_pooling2(out, 2, 2); + out = convolution(out, PNet_mlab.weights_conv2, 
PNet_mlab.biases_conv2); + out = PReLU(out, PNet_mlab.prelu_weights_2); + out = convolution(out, PNet_mlab.weights_conv3, PNet_mlab.biases_conv3); + out = PReLU(out, PNet_mlab.prelu_weights_3); + + % The fully connected layer + out_fc = zeros(size(out,1)*size(out,2), size(out,3)); + out_fc(:) = out(:); + out_fc = out_fc * PNet_mlab.w + PNet_mlab.b'; + out = reshape(out_fc, size(out,1), size(out,2), size(out_fc,2)); + + % The alignment probabilities (face heat map) + out_prob = 1./(1+exp(out(:,:,1)-out(:,:,2))); + + % The correction of the detection + out_correction = out(:,:,3:end); +end + diff --git a/matlab_version/face_detection/mtcnn/PNet_mlab.mat b/matlab_version/face_detection/mtcnn/PNet_mlab.mat new file mode 100644 index 00000000..40726a48 Binary files /dev/null and b/matlab_version/face_detection/mtcnn/PNet_mlab.mat differ diff --git a/matlab_version/face_detection/mtcnn/PReLU.m b/matlab_version/face_detection/mtcnn/PReLU.m new file mode 100644 index 00000000..4c345315 --- /dev/null +++ b/matlab_version/face_detection/mtcnn/PReLU.m @@ -0,0 +1,26 @@ +function [ out_map ] = PReLU( input_maps, PReLU_params ) +%PRELU Summary of this function goes here +% Detailed explanation goes here + + out_map = zeros(size(input_maps)); + if(numel(size(input_maps)) > 2) + for i=1:size(input_maps,3) + + % A more readable but slower version + % in_map = input_maps(:,:,i,:); + % in_map(in_map < 0) = in_map(in_map<0) * PReLU_params(i); + + % alternative +% out_map(:,:,i,:) = max(input_maps(:,:,i,:),0) + min(input_maps(:,:,i,:),0)*PReLU_params(i); + + out_map(:,:,i,:) = input_maps(:,:,i,:) .* (PReLU_params(i) + (1 - PReLU_params(i)) * (input_maps(:,:,i,:) > 0)) ; + end + else + for i=1:size(input_maps,2) + in_map = input_maps(:,i); + in_map(in_map < 0) = in_map(in_map<0) * PReLU_params(i); + out_map(:,i) = in_map; + end + end +end + diff --git a/matlab_version/face_detection/mtcnn/RNet.m b/matlab_version/face_detection/mtcnn/RNet.m new file mode 100644 index 00000000..7c95cc61 
--- /dev/null
+++ b/matlab_version/face_detection/mtcnn/RNet.m
@@ -0,0 +1,31 @@
+function [ out_prob, out_correction ] = RNet( im_data, RNet_mlab )
+%RNET Run the MTCNN refinement network (RNet) on 24x24 face proposal crops
+%   Returns per-proposal face probability and bounding box regression values
+
+    % The convolutional and pooling layers
+    out = convolution(im_data, RNet_mlab.weights_conv1, RNet_mlab.biases_conv1);
+    out = PReLU(out, RNet_mlab.prelu_weights_1);
+    out = max_pooling2(out, 3, 2);
+    out = convolution(out, RNet_mlab.weights_conv2, RNet_mlab.biases_conv2);
+    out = PReLU(out, RNet_mlab.prelu_weights_2);
+    out = max_pooling2(out, 3, 2);
+    out = convolution(out, RNet_mlab.weights_conv3, RNet_mlab.biases_conv3);
+    out = PReLU(out, RNet_mlab.prelu_weights_3);
+
+    % The fully connected layers
+
+    out_fc_1 = zeros(size(out,1)*size(out,2) * size(out,3), size(out,4));
+    out_fc_1(:) = out(:);
+    out_fc_1 = out_fc_1' * RNet_mlab.w_fc1 + RNet_mlab.b_fc1';
+    out_fc_1 = PReLU(out_fc_1, RNet_mlab.prelu_fc1);
+
+    out_fc2 = out_fc_1 * RNet_mlab.w_fc2 + RNet_mlab.b_fc2';
+    out_fc2 = out_fc2';
+
+    % Probability of each proposal
+    out_prob = 1./(1+exp(out_fc2(1,:)-out_fc2(2,:)));
+
+    % The correction of each detection
+    out_correction = out_fc2(3:end,:);
+end
+
diff --git a/matlab_version/face_detection/mtcnn/RNet_mlab.mat b/matlab_version/face_detection/mtcnn/RNet_mlab.mat
new file mode 100644
index 00000000..f7e25c2d
Binary files /dev/null and b/matlab_version/face_detection/mtcnn/RNet_mlab.mat differ
diff --git a/matlab_version/face_detection/mtcnn/apply_correction.m b/matlab_version/face_detection/mtcnn/apply_correction.m
new file mode 100644
index 00000000..c6e8a5bb
--- /dev/null
+++ b/matlab_version/face_detection/mtcnn/apply_correction.m
@@ -0,0 +1,23 @@
+function [ total_bboxes ] = apply_correction( total_bboxes, corrections, add1 )
+%APPLY_CORRECTION Shift bounding boxes by network regression offsets
+%   Offsets are relative to box width/height; add1 adds 1px to the size first
+
+    % Perform correction based on regression values
+    bbw = total_bboxes(:,3) - total_bboxes(:,1);
+    bbh = total_bboxes(:,4) - total_bboxes(:,2);
+
+    % TODO is this needed?
+    if(add1)
+        bbw = bbw + 1;
+        bbh = bbh + 1;
+    end
+
+    new_min_x = total_bboxes(:,1) + corrections(:,1) .* bbw;
+    new_min_y = total_bboxes(:,2) + corrections(:,2) .* bbh;
+    new_max_x = total_bboxes(:,3) + corrections(:,3) .* bbw;
+    new_max_y = total_bboxes(:,4) + corrections(:,4) .* bbh;
+    score = total_bboxes(:,5);
+    total_bboxes = [new_min_x, new_min_y, new_max_x, new_max_y, score];
+
+end
+
diff --git a/matlab_version/face_detection/mtcnn/convert_to_cpp/MTCNN_detector.txt b/matlab_version/face_detection/mtcnn/convert_to_cpp/MTCNN_detector.txt
new file mode 100644
index 00000000..9a4f805b
--- /dev/null
+++ b/matlab_version/face_detection/mtcnn/convert_to_cpp/MTCNN_detector.txt
@@ -0,0 +1,3 @@
+PNet PNet.dat
+RNet RNet.dat
+ONet ONet.dat
diff --git a/matlab_version/face_detection/mtcnn/convert_to_cpp/ONet.dat b/matlab_version/face_detection/mtcnn/convert_to_cpp/ONet.dat
new file mode 100644
index 00000000..291c4462
Binary files /dev/null and b/matlab_version/face_detection/mtcnn/convert_to_cpp/ONet.dat differ
diff --git a/matlab_version/face_detection/mtcnn/convert_to_cpp/PNet.dat b/matlab_version/face_detection/mtcnn/convert_to_cpp/PNet.dat
new file mode 100644
index 00000000..9550d39a
Binary files /dev/null and b/matlab_version/face_detection/mtcnn/convert_to_cpp/PNet.dat differ
diff --git a/matlab_version/face_detection/mtcnn/convert_to_cpp/RNet.dat b/matlab_version/face_detection/mtcnn/convert_to_cpp/RNet.dat
new file mode 100644
index 00000000..864e0dd9
Binary files /dev/null and b/matlab_version/face_detection/mtcnn/convert_to_cpp/RNet.dat differ
diff --git a/matlab_version/face_detection/mtcnn/convert_to_cpp/Write_CNN_to_binary.m b/matlab_version/face_detection/mtcnn/convert_to_cpp/Write_CNN_to_binary.m
new file mode 100644
index 00000000..12106e88
--- /dev/null
+++ b/matlab_version/face_detection/mtcnn/convert_to_cpp/Write_CNN_to_binary.m
@@ -0,0 +1,70 @@
+function Write_CNN_to_binary(location_binary, cnn)
+
+    addpath('../../../PDM_helpers/');
+
+    % use little-endian
+    cnn_binary_file = fopen(location_binary, 'w', 'l');
+
+    num_layers = size(cnn.layers,2);
+
+    % Get the number of layers
+    fwrite(cnn_binary_file, num_layers, 'uint'); % 4 bytes
+
+    for layers=1:num_layers
+
+        % write layer type: 0 - convolutional, 1 - max pooling, 2 -
+        % fully connected, 3 - prelu, 4 - sigmoid
+        if(strcmp(cnn.layers{layers}.type, 'conv'))
+
+            % write the type (convolutional)
+            fwrite(cnn_binary_file, 0, 'uint'); % 4 bytes
+
+            num_in_map = size(cnn.layers{layers}.weights{1},3);
+
+            % write the number of input maps
+            fwrite(cnn_binary_file, num_in_map, 'uint'); % 4 bytes
+
+            num_out_kerns = size(cnn.layers{layers}.weights{1},4);
+
+            % write the number of kernels for each output map
+            fwrite(cnn_binary_file, num_out_kerns, 'uint'); % 4 bytes
+
+            % Write output map bias terms
+            for k2=1:num_out_kerns
+                fwrite(cnn_binary_file, cnn.layers{layers}.weights{2}(k2), 'float32'); % 4 bytes
+            end
+
+            for k=1:num_in_map
+                for k2=1:num_out_kerns
+                    % Write out the kernel
+                    W = squeeze(cnn.layers{layers}.weights{1}(:,:,k,k2));
+                    writeMatrixBin(cnn_binary_file, W, 5);
+                end
+            end
+        elseif(strcmp(cnn.layers{layers}.type, 'fc'))
+
+            % This is the fully connected layer
+            fwrite(cnn_binary_file, 2, 'uint'); % 4 bytes
+
+            % the bias term
+            writeMatrixBin(cnn_binary_file, cnn.layers{layers}.weights{2}, 5);
+            % the weights
+            writeMatrixBin(cnn_binary_file, cnn.layers{layers}.weights{1}, 5);
+
+        elseif(strcmp(cnn.layers{layers}.type, 'max_pooling'))
+            fwrite(cnn_binary_file, 1, 'uint'); % 4 bytes, indicate max pooling layer
+            % params kernel and stride size
+            fwrite(cnn_binary_file, cnn.layers{layers}.kernel_size_x, 'uint'); % 4 bytes
+            fwrite(cnn_binary_file, cnn.layers{layers}.kernel_size_y, 'uint'); % 4 bytes
+            fwrite(cnn_binary_file, cnn.layers{layers}.stride_x, 'uint'); % 4 bytes
+            fwrite(cnn_binary_file, cnn.layers{layers}.stride_y, 'uint'); % 4 bytes
+
+        elseif(strcmp(cnn.layers{layers}.type, 'prelu'))
+            fwrite(cnn_binary_file, 3, 'uint'); % 4 bytes, indicate a parametric relu layer
+            writeMatrixBin(cnn_binary_file, cnn.layers{layers}.weights{1}, 5);
+        end
+    end
+
+    fclose(cnn_binary_file);
+
+end
\ No newline at end of file
diff --git a/matlab_version/face_detection/mtcnn/convert_to_cpp/Write_out_mtcnn.m b/matlab_version/face_detection/mtcnn/convert_to_cpp/Write_out_mtcnn.m
new file mode 100644
index 00000000..7afbce88
--- /dev/null
+++ b/matlab_version/face_detection/mtcnn/convert_to_cpp/Write_out_mtcnn.m
@@ -0,0 +1,184 @@
+% First writing out PNet
+load('../PNet_mlab.mat');
+
+cnn = struct;
+cnn.layers = cell(1,8);
+cnn.layers{1} = struct;
+cnn.layers{1}.type = 'conv';
+cnn.layers{1}.weights = {PNet_mlab.weights_conv1, PNet_mlab.biases_conv1};
+
+cnn.layers{2} = struct;
+cnn.layers{2}.type = 'prelu';
+cnn.layers{2}.weights = {PNet_mlab.prelu_weights_1};
+
+cnn.layers{3} = struct;
+cnn.layers{3}.type = 'max_pooling';
+cnn.layers{3}.weights = {};
+cnn.layers{3}.stride_x = 2;
+cnn.layers{3}.stride_y = 2;
+cnn.layers{3}.kernel_size_x = 2;
+cnn.layers{3}.kernel_size_y = 2;
+
+cnn.layers{4} = struct;
+cnn.layers{4}.type = 'conv';
+cnn.layers{4}.weights = {PNet_mlab.weights_conv2, PNet_mlab.biases_conv2};
+
+cnn.layers{5} = struct;
+cnn.layers{5}.type = 'prelu';
+cnn.layers{5}.weights = {PNet_mlab.prelu_weights_2};
+
+cnn.layers{6} = struct;
+cnn.layers{6}.type = 'conv';
+cnn.layers{6}.weights = {PNet_mlab.weights_conv3, PNet_mlab.biases_conv3};
+
+cnn.layers{7} = struct;
+cnn.layers{7}.type = 'prelu';
+cnn.layers{7}.weights = {PNet_mlab.prelu_weights_3};
+
+cnn.layers{8} = struct;
+cnn.layers{8}.type = 'fc';
+cnn.layers{8}.weights = {PNet_mlab.w, PNet_mlab.b};
+
+Write_CNN_to_binary('PNet.dat', cnn);
+
+%% Next writing out the RNet
+clear
+load('../RNet_mlab.mat');
+
+cnn = struct;
+cnn.layers = cell(1,11);
+cnn.layers{1} = struct;
+cnn.layers{1}.type = 'conv';
+cnn.layers{1}.weights = {RNet_mlab.weights_conv1, RNet_mlab.biases_conv1};
+
+cnn.layers{2} = struct;
+cnn.layers{2}.type = 'prelu';
+cnn.layers{2}.weights = {RNet_mlab.prelu_weights_1};
+
+cnn.layers{3} = struct;
+cnn.layers{3}.type = 'max_pooling';
+cnn.layers{3}.weights = {};
+cnn.layers{3}.stride_x = 2;
+cnn.layers{3}.stride_y = 2;
+cnn.layers{3}.kernel_size_x = 3;
+cnn.layers{3}.kernel_size_y = 3;
+
+cnn.layers{4} = struct;
+cnn.layers{4}.type = 'conv';
+cnn.layers{4}.weights = {RNet_mlab.weights_conv2, RNet_mlab.biases_conv2};
+
+cnn.layers{5} = struct;
+cnn.layers{5}.type = 'prelu';
+cnn.layers{5}.weights = {RNet_mlab.prelu_weights_2};
+
+cnn.layers{6} = struct;
+cnn.layers{6}.type = 'max_pooling';
+cnn.layers{6}.weights = {};
+cnn.layers{6}.stride_x = 2;
+cnn.layers{6}.stride_y = 2;
+cnn.layers{6}.kernel_size_x = 3;
+cnn.layers{6}.kernel_size_y = 3;
+
+cnn.layers{7} = struct;
+cnn.layers{7}.type = 'conv';
+cnn.layers{7}.weights = {RNet_mlab.weights_conv3, RNet_mlab.biases_conv3};
+
+cnn.layers{8} = struct;
+cnn.layers{8}.type = 'prelu';
+cnn.layers{8}.weights = {RNet_mlab.prelu_weights_3};
+
+cnn.layers{9} = struct;
+cnn.layers{9}.type = 'fc';
+cnn.layers{9}.weights = {RNet_mlab.w_fc1, RNet_mlab.b_fc1};
+
+cnn.layers{10} = struct;
+cnn.layers{10}.type = 'prelu';
+cnn.layers{10}.weights = {RNet_mlab.prelu_fc1};
+
+cnn.layers{11} = struct;
+cnn.layers{11}.type = 'fc';
+cnn.layers{11}.weights = {RNet_mlab.w_fc2, RNet_mlab.b_fc2};
+
+Write_CNN_to_binary('RNet.dat', cnn);
+
+%% Next writing out the ONet
+clear
+load('../ONet_mlab.mat');
+
+cnn = struct;
+cnn.layers = cell(1,14);
+cnn.layers{1} = struct;
+cnn.layers{1}.type = 'conv';
+cnn.layers{1}.weights = {ONet_mlab.weights_conv1, ONet_mlab.biases_conv1};
+
+cnn.layers{2} = struct;
+cnn.layers{2}.type = 'prelu';
+cnn.layers{2}.weights = {ONet_mlab.prelu_weights_1};
+
+cnn.layers{3} = struct;
+cnn.layers{3}.type = 'max_pooling';
+cnn.layers{3}.weights = {};
+cnn.layers{3}.stride_x = 2;
+cnn.layers{3}.stride_y = 2;
+cnn.layers{3}.kernel_size_x = 3;
+cnn.layers{3}.kernel_size_y = 3;
+
+cnn.layers{4} = struct;
+cnn.layers{4}.type = 'conv';
+cnn.layers{4}.weights = {ONet_mlab.weights_conv2, ONet_mlab.biases_conv2};
+
+cnn.layers{5} = struct;
+cnn.layers{5}.type = 'prelu';
+cnn.layers{5}.weights = {ONet_mlab.prelu_weights_2};
+
+cnn.layers{6} = struct;
+cnn.layers{6}.type = 'max_pooling';
+cnn.layers{6}.weights = {};
+cnn.layers{6}.stride_x = 2;
+cnn.layers{6}.stride_y = 2;
+cnn.layers{6}.kernel_size_x = 3;
+cnn.layers{6}.kernel_size_y = 3;
+
+cnn.layers{7} = struct;
+cnn.layers{7}.type = 'conv';
+cnn.layers{7}.weights = {ONet_mlab.weights_conv3, ONet_mlab.biases_conv3};
+
+cnn.layers{8} = struct;
+cnn.layers{8}.type = 'prelu';
+cnn.layers{8}.weights = {ONet_mlab.prelu_weights_3};
+
+cnn.layers{9} = struct;
+cnn.layers{9}.type = 'max_pooling';
+cnn.layers{9}.weights = {};
+cnn.layers{9}.stride_x = 2;
+cnn.layers{9}.stride_y = 2;
+cnn.layers{9}.kernel_size_x = 2;
+cnn.layers{9}.kernel_size_y = 2;
+
+cnn.layers{10} = struct;
+cnn.layers{10}.type = 'conv';
+cnn.layers{10}.weights = {ONet_mlab.weights_conv4, ONet_mlab.biases_conv4};
+
+cnn.layers{11} = struct;
+cnn.layers{11}.type = 'prelu';
+cnn.layers{11}.weights = {ONet_mlab.prelu_weights_4};
+
+cnn.layers{12} = struct;
+cnn.layers{12}.type = 'fc';
+cnn.layers{12}.weights = {ONet_mlab.w_fc1, ONet_mlab.b_fc1};
+
+cnn.layers{13} = struct;
+cnn.layers{13}.type = 'prelu';
+cnn.layers{13}.weights = {ONet_mlab.prelu_fc1};
+
+cnn.layers{14} = struct;
+cnn.layers{14}.type = 'fc';
+cnn.layers{14}.weights = {ONet_mlab.w_fc2, ONet_mlab.b_fc2};
+
+Write_CNN_to_binary('ONet.dat', cnn);
+
+f = fopen('MTCNN_detector.txt', 'w');
+fprintf(f, 'PNet PNet.dat\r\n');
+fprintf(f, 'RNet RNet.dat\r\n');
+fprintf(f, 'ONet ONet.dat\r\n');
+fclose(f);
\ No newline at end of file
diff --git a/matlab_version/face_detection/mtcnn/convolution.m b/matlab_version/face_detection/mtcnn/convolution.m
new file mode 100644
index 00000000..b2e5b554
--- /dev/null
+++
b/matlab_version/face_detection/mtcnn/convolution.m
@@ -0,0 +1,24 @@
+function [ output_maps ] = convolution( input_maps, kernels, biases )
+%CONVOLUTION Valid 2D convolution of input maps with a 4D filter bank + bias
+%   Fallback path pre-flips kernels so convn matches correlation-style conv
+
+    % If MatConvNet is not installed use Matlab (much slower)
+    if(exist('vl_nnconv', 'file') == 3)
+        output_maps = vl_nnconv(single(input_maps), kernels, biases);
+    else
+        n_filters = size(kernels, 4);
+
+        kernels2 = kernels(:,:,end:-1:1,:);
+        for i=1:n_filters
+            for n_in_maps=1:size(kernels,3)
+                kernels2(:,:,n_in_maps,i) = fliplr(squeeze(kernels2(:,:,n_in_maps,i)));
+                kernels2(:,:,n_in_maps,i) = flipud(squeeze(kernels2(:,:,n_in_maps,i)));
+            end
+        end
+        output_maps = [];
+        for i=1:n_filters
+            output_maps = cat(3, output_maps, convn(input_maps, kernels2(:,:,:,i), 'valid') + biases(i));
+        end
+    end
+end
+
diff --git a/matlab_version/face_detection/mtcnn/correct_bbox.m b/matlab_version/face_detection/mtcnn/correct_bbox.m
new file mode 100644
index 00000000..e75c1f86
--- /dev/null
+++ b/matlab_version/face_detection/mtcnn/correct_bbox.m
@@ -0,0 +1,36 @@
+function [ total_bboxes, to_keep ] = correct_bbox( total_bboxes, corrections, add1, rectangulate, round, type )
+%CORRECT_BBOX NMS followed by regression-based bounding box refinement
+%   Optionally squares the boxes (rectangulate) and rounds them to pixels (round)
+
+    % Non maximum suppression across bounding boxes
+    to_keep = non_maximum_supression(total_bboxes, 0.7, type);
+    total_bboxes = total_bboxes(to_keep, :);
+    corrections = corrections(to_keep, :);
+    % Perform correction based on regression values
+    bbw = total_bboxes(:,3) - total_bboxes(:,1);
+    bbh = total_bboxes(:,4) - total_bboxes(:,2);
+
+    % TODO is this needed?
+    if(add1)
+        bbw = bbw + 1;
+        bbh = bbh + 1;
+    end
+
+    new_min_x = total_bboxes(:,1) + corrections(:,1) .* bbw;
+    new_min_y = total_bboxes(:,2) + corrections(:,2) .* bbh;
+    new_max_x = total_bboxes(:,3) + corrections(:,3) .* bbw;
+    new_max_y = total_bboxes(:,4) + corrections(:,4) .* bbh;
+    score = total_bboxes(:,5);
+    total_bboxes = [new_min_x, new_min_y, new_max_x, new_max_y, score];
+
+    if(rectangulate)
+        % Convert the bounding boxes to rectangles
+        total_bboxes(:,1:4) = rectify(total_bboxes(:,1:4));
+    end
+
+    if(round)
+        % Rounding to pixels
+        total_bboxes(:,1:4) = fix(total_bboxes(:,1:4));
+    end
+end
+
diff --git a/matlab_version/face_detection/mtcnn/demo.m b/matlab_version/face_detection/mtcnn/demo.m
new file mode 100644
index 00000000..b3910706
--- /dev/null
+++ b/matlab_version/face_detection/mtcnn/demo.m
@@ -0,0 +1,10 @@
+clear;
+
+% Make sure we have the dependencies for convolution
+od = cd('../../face_validation');
+setup;
+cd(od);
+
+img = imread('test1.jpg');
+
+[bboxes, lmarks, confidences] = detect_face_mtcnn(img);
\ No newline at end of file
diff --git a/matlab_version/face_detection/mtcnn/demo_300W.m b/matlab_version/face_detection/mtcnn/demo_300W.m
new file mode 100644
index 00000000..cd46bf80
--- /dev/null
+++ b/matlab_version/face_detection/mtcnn/demo_300W.m
@@ -0,0 +1,20 @@
+clear;
+
+% Make sure we have the dependencies for convolution
+od = cd('../../face_validation');
+setup;
+cd(od);
+
+imgs = dir('D:\Datasets\300_W\AFW/*.jpg');
+for i=2:numel(imgs)
+    img = imread(['D:\Datasets\300_W\AFW/', imgs(i).name]);
+    [bboxes, lmarks, confidences] = detect_face_mtcnn(img, 60);
+    hold off
+    imshow(img);
+    hold on;
+    for d=1:size(bboxes,1)
+        rectangle('Position', [bboxes(d,1), bboxes(d,2), bboxes(d,3)-bboxes(d,1), bboxes(d,4) - bboxes(d,2)]);
+        plot(lmarks(d,1:5), lmarks(d,6:10), '.r');
+    end
+    drawnow expose
+end
\ No newline at end of file
diff --git a/matlab_version/face_detection/mtcnn/detect_face_mtcnn.m
b/matlab_version/face_detection/mtcnn/detect_face_mtcnn.m
new file mode 100644
index 00000000..714f8d2b
--- /dev/null
+++ b/matlab_version/face_detection/mtcnn/detect_face_mtcnn.m
@@ -0,0 +1,222 @@
+function [total_bboxes, lmarks, confidence] = detect_face_mtcnn(img, min_face_size)
+
+% Check if MatConvNet is installed
+if(exist('vl_nnconv', 'file') ~= 3)
+    fprintf('Warning MatConvNet is not installed or not setup, face detection will be quite slow\n');
+end
+
+height_orig = size(img,1);
+width_orig = size(img,2);
+
+% Everything is done in floats
+img = single(img);
+
+% Minimum face size
+if(nargin ==1)
+    min_face_size = 30;
+end
+
+% Image pyramid scaling factor
+factor = 0.709;
+
+% Thresholds for the PNet, RNet, and ONet stages (in that order)
+threshold=[0.6 0.7 0.7];
+
+min_dim = min([width_orig height_orig]);
+
+% Face support region is 12x12 px, so from that can work out the largest
+% scale (which is 12 / min), and work down from there to smallest scale (no smaller than
+% 12x12px)
+face_support = 12;
+num_scales = floor(log(min_face_size / min_dim) / log(factor));
+scales = (face_support / min_face_size)*factor.^(0:num_scales);
+
+load('PNet_mlab');
+load('RNet_mlab');
+load('ONet_mlab');
+
+total_bboxes = [];
+
+% First the PNet stage on image pyramid
+for s = scales
+    h_pyr = ceil(height_orig * s);
+    w_pyr = ceil(width_orig * s);
+
+    % Resize the image and normalize to what MTCNN expects it to be
+    im_data=(imresize(img, [h_pyr w_pyr],'bilinear','AntiAliasing',false)-127.5)*0.0078125;
+
+    [ out_prob, out_correction ] = PNet( im_data, PNet_mlab );
+
+    % Generate bounding boxes from the heatmap
+    bboxes = generate_bounding_boxes(out_prob, out_correction, s, threshold(1), face_support);
+
+    % TODO correct bboxes before running NMS?, as now lots of overlaping
+    % boxes are present
+
+    % Perform non maximum suppression to remove redundant bounding boxes
+    pick = non_maximum_supression(bboxes, 0.5, 'Union');
+    bboxes=bboxes(pick,:);
+    if ~isempty(bboxes)
+        total_bboxes = cat(1, total_bboxes, bboxes);
+    end
+end
+
+if ~isempty(total_bboxes)
+    % Non maximum suppression across bounding boxes, and their offset
+    % correction
+    corrections = total_bboxes(:,6:end);
+    total_bboxes = total_bboxes(:,1:5);
+
+    to_keep = non_maximum_supression(total_bboxes, 0.7, 'Union');
+    total_bboxes = total_bboxes(to_keep, :);
+    corrections = corrections(to_keep, :);
+
+    total_bboxes = apply_correction(total_bboxes, corrections, false);
+
+    % Making them into rectangles
+    total_bboxes(:,1:4) = rectify(total_bboxes(:,1:4));
+
+    % Rounding to pixels
+    total_bboxes(:,1:4) = fix(total_bboxes(:,1:4));
+end
+num_bbox = size(total_bboxes,1);
+
+% RNet stage
+if num_bbox > 0
+
+    proposal_imgs = zeros(24, 24, 3, num_bbox);
+    for k=1:num_bbox
+
+        width_target = total_bboxes(k,3) - total_bboxes(k,1) + 1;
+        height_target = total_bboxes(k,4) - total_bboxes(k,2) + 1;
+
+        % Work out the start and end indices in the original image
+        start_x_in = max(total_bboxes(k,1), 1);
+        start_y_in = max(total_bboxes(k,2), 1);
+        end_x_in = min(total_bboxes(k,3), width_orig);
+        end_y_in = min(total_bboxes(k,4), height_orig);
+
+        % Work out the start and end indices in the target image
+        start_x_out = max(-total_bboxes(k,1)+2, 1);
+        start_y_out = max(-total_bboxes(k,2)+2, 1);
+        end_x_out = min(width_target - (total_bboxes(k,3)-width_orig), width_target);
+        end_y_out = min(height_target - (total_bboxes(k,4)-height_orig), height_target);
+
+        tmp = zeros(height_target, width_target, 3);
+
+        tmp(start_y_out:end_y_out,start_x_out:end_x_out,:) = ...
+            img(start_y_in:end_y_in, start_x_in:end_x_in,:);
+
+        proposal_imgs(:,:,:,k) = imresize(tmp, [24 24], 'bilinear','AntiAliasing',false);
+    end
+
+    % Normalize the proposal images
+    proposal_imgs = (proposal_imgs - 127.5) * 0.0078125;
+
+    % Apply RNet to proposal faces
+    [ score, out_correction ] = RNet( proposal_imgs, RNet_mlab );
+    out_correction = out_correction';
+
+    % Find faces above the threshold
+    to_keep = find(score > threshold(2));
+
+    total_bboxes = [total_bboxes(to_keep,1:4) score(to_keep)'];
+    out_correction = out_correction(to_keep,:);
+
+    if ~isempty(total_bboxes)
+        % Non maximum suppression across bounding boxes, and their offset
+        % correction
+        to_keep = non_maximum_supression(total_bboxes, 0.7, 'Union');
+        total_bboxes = total_bboxes(to_keep, :);
+        out_correction = out_correction(to_keep, :);
+
+        total_bboxes = apply_correction(total_bboxes, out_correction, true);
+
+        % Making them into rectangles
+        total_bboxes(:,1:4) = rectify(total_bboxes(:,1:4));
+
+        % Rounding to pixels
+        total_bboxes(:,1:4) = fix(total_bboxes(:,1:4));
+    end
+end
+
+num_bbox = size(total_bboxes,1);
+
+% ONet stage
+if num_bbox > 0
+
+    proposal_imgs = zeros(48, 48, 3, num_bbox);
+    for k=1:num_bbox
+
+        width_target = total_bboxes(k,3) - total_bboxes(k,1) + 1;
+        height_target = total_bboxes(k,4) - total_bboxes(k,2) + 1;
+
+        % Work out the start and end indices in the original image
+        start_x_in = max(total_bboxes(k,1), 1);
+        start_y_in = max(total_bboxes(k,2), 1);
+        end_x_in = min(total_bboxes(k,3), width_orig);
+        end_y_in = min(total_bboxes(k,4), height_orig);
+
+        % Work out the start and end indices in the target image
+        start_x_out = max(-total_bboxes(k,1)+2, 1);
+        start_y_out = max(-total_bboxes(k,2)+2, 1);
+        end_x_out = min(width_target - (total_bboxes(k,3)-width_orig), width_target);
+        end_y_out = min(height_target - (total_bboxes(k,4)-height_orig), height_target);
+
+        tmp = zeros(height_target, width_target, 3);
+
+        tmp(start_y_out:end_y_out,start_x_out:end_x_out,:) = ...
+            img(start_y_in:end_y_in, start_x_in:end_x_in,:);
+
+        proposal_imgs(:,:,:,k) = imresize(tmp, [48 48], 'bilinear','AntiAliasing',false);
+    end
+
+    % Normalize the proposal images
+    proposal_imgs = (proposal_imgs - 127.5) * 0.0078125;
+
+    % Apply ONet to proposal faces
+    [ score, out_correction, lmarks ] = ONet( proposal_imgs, ONet_mlab );
+    out_correction = out_correction';
+    lmarks = lmarks';
+
+    % Pick the final faces above the threshold
+    to_keep = find(score > threshold(3));
+    lmarks = lmarks(to_keep, :);
+    out_correction = out_correction(to_keep, :);
+    total_bboxes = [total_bboxes(to_keep,1:4) score(to_keep)'];
+
+    % Correct for the landmarks
+    bbw = total_bboxes(:,3) - total_bboxes(:,1) + 1;
+    bbh = total_bboxes(:,4) - total_bboxes(:,2) + 1;
+
+    lmarks(:, 1:5) = bbw .* lmarks(:,1:5) + total_bboxes(:,1) - 1;
+    lmarks(:, 6:10) = bbh .* lmarks(:,6:10) + total_bboxes(:,2) - 1;
+
+    % Correct the bounding boxes
+    if size(total_bboxes,1)>0
+        total_bboxes = apply_correction(total_bboxes, out_correction, true);
+        to_keep = non_maximum_supression(total_bboxes, 0.7, 'Min');
+
+        lmarks = lmarks(to_keep, :);
+        confidence = total_bboxes(to_keep, 5);
+        total_bboxes = total_bboxes(to_keep, 1:4);
+    end
+
+end
+
+% Correct the bounding boxes to be around the 68 landmark points
+widths = total_bboxes(:,3) - total_bboxes(:,1);
+heights = total_bboxes(:,4) - total_bboxes(:,2);
+txs = total_bboxes(:,1);
+tys = total_bboxes(:,2);
+
+new_widths = widths * 1.0323;
+new_heights = heights * 0.7751;
+new_txs = widths * -0.0075 + txs;
+new_tys = heights * 0.2459 + tys;
+
+total_bboxes = [new_txs, new_tys, new_txs + new_widths, new_tys + new_heights];
+total_bboxes = double(total_bboxes);
+lmarks = double(lmarks);
+
+end
\ No newline at end of file
diff --git a/matlab_version/face_detection/mtcnn/generate_bounding_boxes.m b/matlab_version/face_detection/mtcnn/generate_bounding_boxes.m
new file mode 100644
index 00000000..cf6e6077
--- /dev/null
+++
b/matlab_version/face_detection/mtcnn/generate_bounding_boxes.m @@ -0,0 +1,25 @@ +function [bboxes] = generate_bounding_boxes(heatmap, correction, scale, t, face_support) + %use heatmap to generate bounding boxes in the original image space + + % Correction for the pooling + stride = 2; + + % Offsets for, x, y, width and height + dx1=correction(:,:,1); + dy1=correction(:,:,2); + dx2=correction(:,:,3); + dy2=correction(:,:,4); + + % Find the parts of a heatmap above the threshold (x, y, and indices) + [x, y]= find(heatmap >= t); + inds = find(heatmap >= t); + + % Find the corresponding scores and bbox corrections + score=heatmap(inds); + correction=[dx1(inds) dy1(inds) dx2(inds) dy2(inds)]; + + % Correcting for Matlab's format + bboxes=[y - 1 x - 1]; + bboxes=[fix((stride*(bboxes)+1)/scale) fix((stride*(bboxes)+face_support)/scale) score correction]; +end + diff --git a/matlab_version/face_detection/mtcnn/im2col_inds.m b/matlab_version/face_detection/mtcnn/im2col_inds.m new file mode 100644 index 00000000..5d4192b9 --- /dev/null +++ b/matlab_version/face_detection/mtcnn/im2col_inds.m @@ -0,0 +1,120 @@ +function ttt=im2col_inds(a, block) +%IM2COL Rearrange image blocks into columns. +% B = IM2COL(A,[M N],'distinct') rearranges each distinct +% M-by-N block in the image A into a column of B. IM2COL pads A +% with zeros, if necessary, so its size is an integer multiple +% of M-by-N. If A = [A11 A12; A21 A22], where each Aij is +% M-by-N, then B = [A11(:) A21(:) A12(:) A22(:)]. +% +% B = IM2COL(A,[M N],'sliding') converts each sliding M-by-N +% block of A into a column of B, with no zero padding. B has +% M*N rows and will contain as many columns as there are M-by-N +% neighborhoods in A. If the size of A is [MM NN], then the +% size of B is (M*N)-by-((MM-M+1)*(NN-N+1). Each column of B +% contains the neighborhoods of A reshaped as NHOOD(:), where +% NHOOD is a matrix containing an M-by-N neighborhood of +% A. 
IM2COL orders the columns of B so that they can be +% reshaped to form a matrix in the normal way. For example, +% suppose you use a function, such as SUM(B), that returns a +% scalar for each column of B. You can directly store the +% result in a matrix of size (MM-M+1)-by-(NN-N+1) using these +% calls: +% +% B = im2col(A,[M N],'sliding'); +% C = reshape(sum(B),MM-M+1,NN-N+1); +% +% B = IM2COL(A,[M N]) uses the default block type of +% 'sliding'. +% +% B = IM2COL(A,'indexed',...) processes A as an indexed image, +% padding with zeros if the class of A is uint8 or uint16, or +% ones if the class of A is double. +% +% Class Support +% ------------- +% The input image A can be numeric or logical. The output matrix +% B is of the same class as the input image. +% +% Example +% ------- +% Calculate the local mean using a [2 2] neighborhood with zero padding. +% +% A = reshape(linspace(0,1,16),[4 4])' +% B = im2col(A,[2 2]) +% M = mean(B) +% newA = col2im(M,[1 1],[3 3]) +% +% See also BLOCKPROC, COL2IM, COLFILT, NLFILTER. + +% Copyright 1993-2016 The MathWorks, Inc. + +[ma,na] = size(a); +m = block(1); n = block(2); + +if any([ma na] < [m n]) % if neighborhood is larger than image + b = zeros(m*n,0); + return +end + +% Create Hankel-like indexing sub matrix. 
+mc = block(1); nc = ma-m+1; nn = na-n+1; +cidx = (0:mc-1)'; ridx = 1:nc; +t = cidx(:,ones(nc,1)) + ridx(ones(mc,1),:); % Hankel Subscripts +tt = zeros(mc*n,nc); +rows = 1:mc; +for i=0:n-1, + tt(i*mc+rows,:) = t+ma*i; +end +ttt = zeros(mc*n,nc*nn); +cols = 1:nc; +for j=0:nn-1, + ttt(:,j*nc+cols) = tt+ma*j; +end + + +%%% +%%% Function parse_inputs +%%% +function [a, block, kind, padval] = parse_inputs(varargin) + +narginchk(2,4); + +switch nargin + case 2 + if (strcmp(varargin{2},'indexed')) + error(message('images:im2col:tooFewInputs')) + else + % IM2COL(A, [M N]) + a = varargin{1}; + block = varargin{2}; + kind = 'sliding'; + padval = 0; + end + + case 3 + if (strcmp(varargin{2},'indexed')) + % IM2COL(A, 'indexed', [M N]) + a = varargin{1}; + block = varargin{3}; + kind = 'sliding'; + padval = 1; + else + % IM2COL(A, [M N], 'kind') + a = varargin{1}; + block = varargin{2}; + kind = validatestring(varargin{3},{'sliding','distinct'},mfilename,'kind',3); + padval = 0; + end + + case 4 + % IM2COL(A, 'indexed', [M N], 'kind') + a = varargin{1}; + block = varargin{3}; + kind = validatestring(varargin{4},{'sliding','distinct'},mfilename,'kind',4); + padval = 1; + +end + +if (isa(a,'uint8') || isa(a, 'uint16')) + padval = 0; +end diff --git a/matlab_version/face_detection/mtcnn/im2col_mine.m b/matlab_version/face_detection/mtcnn/im2col_mine.m new file mode 100644 index 00000000..14ac52c5 --- /dev/null +++ b/matlab_version/face_detection/mtcnn/im2col_mine.m @@ -0,0 +1,127 @@ +function b=im2col_mine(a, block) +%IM2COL Rearrange image blocks into columns. +% B = IM2COL(A,[M N],'distinct') rearranges each distinct +% M-by-N block in the image A into a column of B. IM2COL pads A +% with zeros, if necessary, so its size is an integer multiple +% of M-by-N. If A = [A11 A12; A21 A22], where each Aij is +% M-by-N, then B = [A11(:) A21(:) A12(:) A22(:)]. +% +% B = IM2COL(A,[M N],'sliding') converts each sliding M-by-N +% block of A into a column of B, with no zero padding. 
B has +% M*N rows and will contain as many columns as there are M-by-N +% neighborhoods in A. If the size of A is [MM NN], then the +% size of B is (M*N)-by-((MM-M+1)*(NN-N+1). Each column of B +% contains the neighborhoods of A reshaped as NHOOD(:), where +% NHOOD is a matrix containing an M-by-N neighborhood of +% A. IM2COL orders the columns of B so that they can be +% reshaped to form a matrix in the normal way. For example, +% suppose you use a function, such as SUM(B), that returns a +% scalar for each column of B. You can directly store the +% result in a matrix of size (MM-M+1)-by-(NN-N+1) using these +% calls: +% +% B = im2col(A,[M N],'sliding'); +% C = reshape(sum(B),MM-M+1,NN-N+1); +% +% B = IM2COL(A,[M N]) uses the default block type of +% 'sliding'. +% +% B = IM2COL(A,'indexed',...) processes A as an indexed image, +% padding with zeros if the class of A is uint8 or uint16, or +% ones if the class of A is double. +% +% Class Support +% ------------- +% The input image A can be numeric or logical. The output matrix +% B is of the same class as the input image. +% +% Example +% ------- +% Calculate the local mean using a [2 2] neighborhood with zero padding. +% +% A = reshape(linspace(0,1,16),[4 4])' +% B = im2col(A,[2 2]) +% M = mean(B) +% newA = col2im(M,[1 1],[3 3]) +% +% See also BLOCKPROC, COL2IM, COLFILT, NLFILTER. + +% Copyright 1993-2016 The MathWorks, Inc. + +[ma,na] = size(a); +m = block(1); n = block(2); + +if any([ma na] < [m n]) % if neighborhood is larger than image + b = zeros(m*n,0); + return +end + +% Create Hankel-like indexing sub matrix. +mc = block(1); nc = ma-m+1; nn = na-n+1; +cidx = (0:mc-1)'; ridx = 1:nc; +t = cidx(:,ones(nc,1)) + ridx(ones(mc,1),:); % Hankel Subscripts +tt = zeros(mc*n,nc); +rows = 1:mc; +for i=0:n-1, + tt(i*mc+rows,:) = t+ma*i; +end +ttt = zeros(mc*n,nc*nn); +cols = 1:nc; +for j=0:nn-1, + ttt(:,j*nc+cols) = tt+ma*j; +end + +% If a is a row vector, change it to a column vector. 
This change is +% necessary when A is a row vector and [M N] = size(A). +if ndims(a) == 2 && na > 1 && ma == 1 + a = a(:); +end +b = a(ttt); + + +%%% +%%% Function parse_inputs +%%% +function [a, block, kind, padval] = parse_inputs(varargin) + +narginchk(2,4); + +switch nargin + case 2 + if (strcmp(varargin{2},'indexed')) + error(message('images:im2col:tooFewInputs')) + else + % IM2COL(A, [M N]) + a = varargin{1}; + block = varargin{2}; + kind = 'sliding'; + padval = 0; + end + + case 3 + if (strcmp(varargin{2},'indexed')) + % IM2COL(A, 'indexed', [M N]) + a = varargin{1}; + block = varargin{3}; + kind = 'sliding'; + padval = 1; + else + % IM2COL(A, [M N], 'kind') + a = varargin{1}; + block = varargin{2}; + kind = validatestring(varargin{3},{'sliding','distinct'},mfilename,'kind',3); + padval = 0; + end + + case 4 + % IM2COL(A, 'indexed', [M N], 'kind') + a = varargin{1}; + block = varargin{3}; + kind = validatestring(varargin{4},{'sliding','distinct'},mfilename,'kind',4); + padval = 1; + +end + +if (isa(a,'uint8') || isa(a, 'uint16')) + padval = 0; +end diff --git a/matlab_version/face_detection/mtcnn/max_pooling.m b/matlab_version/face_detection/mtcnn/max_pooling.m new file mode 100644 index 00000000..b49ed0c3 --- /dev/null +++ b/matlab_version/face_detection/mtcnn/max_pooling.m @@ -0,0 +1,57 @@ +function [ output_maps ] = max_pooling( input_maps) +%POOLING Summary of this function goes here +% Detailed explanation goes here + + orig_rows = size(input_maps,1); + orig_cols = size(input_maps,2); + + pooled_rows = ceil(orig_rows / 2); + pooled_cols = ceil(orig_cols / 2); + + up_to_rows_out = floor(orig_rows / 2); + up_to_cols_out = floor(orig_cols / 2); + + if(mod(orig_cols,2) == 0) + up_to_cols = orig_cols; + else + up_to_cols = orig_cols - 1; + end + + if(mod(orig_rows,2) == 0) + up_to_rows = orig_rows; + else + up_to_rows = orig_rows - 1; + end + + output_maps = zeros(pooled_rows, pooled_cols, size(input_maps,3)); + for i=1:size(input_maps,3) + temp = 
im2col(input_maps(1:up_to_rows,1:up_to_cols,i), [2,2], 'distinct'); + max_val = max(temp); + output_maps(1:up_to_rows_out,1:up_to_cols_out,i) = reshape(max_val, up_to_rows_out, up_to_cols_out); + end + + % A bit of a hack for non-even number of rows or columns + if(mod(orig_cols,2) ~= 0) + for i=1:size(input_maps,3) + temp = im2col(input_maps(1:up_to_rows,end,i), [2,1], 'distinct'); + max_val = max(temp); + output_maps(1:up_to_rows_out,end,i) = max_val; + end + end + + if(mod(orig_rows,2) ~= 0) + for i=1:size(input_maps,3) + temp = im2col(input_maps(end, 1:up_to_cols,i), [1,2], 'distinct'); + max_val = max(temp); + output_maps(end, 1:up_to_cols_out,i) = max_val; + end + end + + if(mod(orig_cols,2) ~= 0 && mod(orig_rows,2) ~= 0) + output_maps(end,end,:) = input_maps(end,end,:); + end + + + +end + diff --git a/matlab_version/face_detection/mtcnn/max_pooling2.m b/matlab_version/face_detection/mtcnn/max_pooling2.m new file mode 100644 index 00000000..e2fc1091 --- /dev/null +++ b/matlab_version/face_detection/mtcnn/max_pooling2.m @@ -0,0 +1,118 @@ +function [ output_maps ] = max_pooling2( input_maps, kernel_size, stride) +%POOLING Summary of this function goes here +% Detailed explanation goes here + + orig_rows = size(input_maps,1); + orig_cols = size(input_maps,2); + + pooled_rows = round((orig_rows - kernel_size)/stride) + 1; + pooled_cols = round((orig_cols - kernel_size)/stride) + 1; + + if(exist('vl_nnpool', 'file') == 3) + % Caffe and MatConvNet do pooling slightly differently, so need to + % counter for that + + pooled_cols_vl = floor((orig_cols - kernel_size)/stride) + 1; + pooled_rows_vl = floor((orig_rows - kernel_size)/stride) + 1; + + if(pooled_rows_vl == pooled_rows && pooled_cols_vl == pooled_cols) + output_maps = vl_nnpool(input_maps, [kernel_size, kernel_size], 'stride', stride); + else + % Else need to pad right and bottom with infinities + for x=1:kernel_size + pooled_cols_vl = floor((orig_cols + x - kernel_size)/stride) + 1; + if(pooled_cols_vl == 
function [ output_maps ] = max_pooling2( input_maps, kernel_size, stride)
%MAX_POOLING2 Max pooling with a square kernel and arbitrary stride.
%   input_maps  - rows x cols x channels x batch array
%   kernel_size - side length of the (square) pooling window
%   stride      - step between window origins
%   output_maps - pooled maps, sized round((dim-kernel)/stride)+1 per
%                 spatial dimension (Caffe-style rounding)
%
%   Uses MatConvNet's vl_nnpool when available (fast, GPU-capable);
%   otherwise falls back to a native MATLAB implementation.

    orig_rows = size(input_maps,1);
    orig_cols = size(input_maps,2);

    % Caffe-style output size (round). MatConvNet uses floor, hence the
    % compensation logic below.
    pooled_rows = round((orig_rows - kernel_size)/stride) + 1;
    pooled_cols = round((orig_cols - kernel_size)/stride) + 1;

    % exist(...) == 3 means a MEX-file is on the path.
    if(exist('vl_nnpool', 'file') == 3)
        % Caffe and MatConvNet do pooling slightly differently, so need to
        % counter for that

        pooled_cols_vl = floor((orig_cols - kernel_size)/stride) + 1;
        pooled_rows_vl = floor((orig_rows - kernel_size)/stride) + 1;

        if(pooled_rows_vl == pooled_rows && pooled_cols_vl == pooled_cols)
            % Rounding modes agree: use vl_nnpool directly.
            output_maps = vl_nnpool(input_maps, [kernel_size, kernel_size], 'stride', stride);
        else
            % Else need to pad right and bottom with infinities
            % Find the smallest right/bottom padding (x, y) that makes
            % MatConvNet's floor-based output size match the target.
            for x=1:kernel_size
                pooled_cols_vl = floor((orig_cols + x - kernel_size)/stride) + 1;
                if(pooled_cols_vl == pooled_cols)
                    break;
                end
            end
            for y=1:kernel_size
                pooled_rows_vl = floor((orig_rows +y - kernel_size)/stride) + 1;
                if(pooled_rows_vl == pooled_rows)
                    break;
                end
            end

            % Pad with -inf so padded cells can never win a max.
            input_maps_new = -inf * ones(size(input_maps,1)+y, size(input_maps,2)+x, size(input_maps,3), size(input_maps,4));
            input_maps_new(1:size(input_maps,1),1:size(input_maps,2),:,:) = input_maps;
            output_maps = vl_nnpool(input_maps_new, [kernel_size, kernel_size], 'stride', stride);
        end
    else
        % Native MATLAB fallback.

        up_to_rows_out = floor((orig_rows - kernel_size)/stride) + 1;
        up_to_cols_out = floor((orig_cols - kernel_size)/stride) + 1;

        % How many full max-pooling steps are there
        up_to_cols = kernel_size + (up_to_cols_out-1) * stride;
        up_to_rows = kernel_size + (up_to_rows_out-1) * stride;

        output_maps = zeros(pooled_rows, pooled_cols, size(input_maps,3), size(input_maps,4));

        % Pick only the striding elements
        % (keep the sliding-window columns whose origin lands on the stride grid)
        [y, x] = meshgrid(1:up_to_cols-kernel_size+1, 1:up_to_rows-kernel_size+1);
        to_keep_map = mod(y, stride) == 1 & mod(x, stride) == 1;
        to_keep = find(to_keep_map);

        % Precompute linear window indices once; they are identical for
        % every channel and batch element because only the slice size matters.
        inds_pooling = im2col_inds(input_maps(1:up_to_rows,1:up_to_cols,1,1), [kernel_size, kernel_size]);
        inds_pooling = inds_pooling(:, to_keep);
        for m=1:size(input_maps,4)
            for i=1:size(input_maps,3)
                temp = input_maps(1:up_to_rows,1:up_to_cols,i,m);
                temp = temp(inds_pooling);

                % Max over each window (column) and lay out spatially.
                max_val = max(temp);
                output_maps(1:up_to_rows_out,1:up_to_cols_out,i,m) = reshape(max_val, up_to_rows_out, up_to_cols_out);
            end
        end
        % A bit of a hack for non-even number of rows or columns
        if(orig_cols ~= up_to_cols)
            % Remaining right-edge strip not covered by full windows.
            span = orig_cols - (up_to_cols - kernel_size + stride);
            % NOTE(review): i and m here hold their final values from the
            % loop above; the slice is only used for its SIZE when building
            % indices, so the stale values are harmless — but fragile.
            inds_pooling = im2col_inds(input_maps(1:up_to_rows,end-span+1:end,i,m), [kernel_size, span]);
            inds_pooling = inds_pooling(:, 1:stride:end);
            for m=1:size(input_maps,4)
                for i=1:size(input_maps,3)
                    temp = input_maps(1:up_to_rows,end-span+1:end,i,m);
                    max_val = max(temp(inds_pooling));
                    output_maps(1:up_to_rows_out,end,i,m) = max_val;
                end
            end
        end

        if(orig_rows ~= up_to_rows)
            % Remaining bottom-edge strip not covered by full windows.
            span = orig_rows - (up_to_rows - kernel_size + stride);
            % NOTE(review): same stale i/m size-only usage as above.
            inds_pooling = im2col_inds(input_maps(end-span+1:end, 1:up_to_cols,i,m), [span, kernel_size]);
            inds_pooling = inds_pooling(:, 1:stride:end);

            for m=1:size(input_maps,4)
                for i=1:size(input_maps,3)
                    temp = input_maps(end-span+1:end, 1:up_to_cols,i,m);
                    max_val = max(temp(inds_pooling));

                    output_maps(end, 1:up_to_cols_out,i,m) = max_val;
                end
            end
        end

        if(orig_cols ~= up_to_cols && orig_rows ~= up_to_rows)
            % Bottom-right corner window, left over in both directions.
            for m=1:size(input_maps,4)
                for i=1:size(input_maps,3)
                    tmp = input_maps(up_to_rows- kernel_size + stride + 1:end,up_to_cols - kernel_size + stride+1:end,i,m);
                    output_maps(end,end,i,m) = max(tmp(:));
                end
            end
        end

    end

end
function pick = non_maximum_supression(boxes, overlap_threshold, type)
%NON_MAXIMUM_SUPRESSION Greedy non-maximum suppression of detection boxes.
%   boxes             - N x 5 matrix [x1 y1 x2 y2 score], one row per box
%   overlap_threshold - boxes overlapping a kept box by more than this are
%                       discarded
%   type              - 'Min' normalises intersection by the smaller box
%                       area; anything else uses intersection-over-union
%   pick              - indices (into boxes) of the kept boxes, highest
%                       score first

    if isempty(boxes)
        pick = [];
        return;
    end

    % Box corners and scores.
    x1 = boxes(:,1);
    y1 = boxes(:,2);
    x2 = boxes(:,3);
    y2 = boxes(:,4);
    s = boxes(:,5);
    area = (x2-x1+1) .* (y2-y1+1);

    % Sort ascending by confidence; the best box sits at the end of I.
    [~, I] = sort(s);

    % Preallocate for the worst case (no box suppressed).
    pick = zeros(numel(s),1);

    counter = 1;
    while ~isempty(I)
        % Keep the highest-scoring remaining box.
        last = length(I);
        i = I(last);
        pick(counter) = i;
        counter = counter + 1;

        % Intersection of the kept box with all other remaining boxes.
        xx1 = max(x1(i), x1(I(1:last-1)));
        yy1 = max(y1(i), y1(I(1:last-1)));
        xx2 = min(x2(i), x2(I(1:last-1)));
        yy2 = min(y2(i), y2(I(1:last-1)));
        w = max(0.0, xx2-xx1+1);
        h = max(0.0, yy2-yy1+1);
        inter = w.*h;

        if strcmp(type,'Min')
            o = inter ./ min(area(i),area(I(1:last-1)));
        else
            % Intersection over union.
            o = inter ./ (area(i) + area(I(1:last-1)) - inter);
        end

        % Retain boxes below the overlap threshold. The logical index is
        % one element shorter than I, so the current box (last slot) is
        % implicitly dropped as well.
        I = I(o <= overlap_threshold);
    end

    pick = pick(1:(counter-1));
end
function [bbox_out] = rectify(bbox_in)
%RECTIFY Expand bounding boxes to squares, preserving their centres.
%   bbox_in  - N x 4 matrix [min_x min_y max_x max_y], one row per box
%   bbox_out - N x 4 matrix of square boxes whose side is the longer of
%              the input box's width and height

    heights = bbox_in(:,4) - bbox_in(:,2);
    widths = bbox_in(:,3) - bbox_in(:,1);

    % Elementwise max of the two column vectors (clearer and safer than
    % the transpose round-trip max([widths'; heights'])').
    max_side = max(widths, heights);

    % Shift each origin so the box grows symmetrically around its centre.
    new_min_x = bbox_in(:,1) + 0.5 * (widths - max_side);
    new_min_y = bbox_in(:,2) + 0.5 * (heights - max_side);

    bbox_out = [new_min_x, new_min_y, new_min_x + max_side, new_min_y + max_side];
end