diff --git a/exe/FaceLandmarkImg/FaceLandmarkImg.cpp b/exe/FaceLandmarkImg/FaceLandmarkImg.cpp index 8037ee26..b1164395 100644 --- a/exe/FaceLandmarkImg/FaceLandmarkImg.cpp +++ b/exe/FaceLandmarkImg/FaceLandmarkImg.cpp @@ -35,6 +35,8 @@ #include "LandmarkCoreIncludes.h" +#include "FaceDetectorMTCNN.h" + // System includes #include @@ -306,9 +308,9 @@ int main(int argc, char **argv) // Bounding boxes for a face in each image (optional) vector > bounding_boxes; - + LandmarkDetector::get_image_input_output_params(files, output_landmark_locations, output_pose_locations, output_images, bounding_boxes, arguments); - LandmarkDetector::FaceModelParameters det_parameters(arguments); + LandmarkDetector::FaceModelParameters det_parameters(arguments); // No need to validate detections, as we're not doing tracking det_parameters.validate_detections = false; @@ -335,8 +337,9 @@ int main(int argc, char **argv) LandmarkDetector::CLNF clnf_model(det_parameters.model_location); cout << "Model loaded" << endl; - cv::CascadeClassifier classifier(det_parameters.face_detector_location); + cv::CascadeClassifier classifier(det_parameters.haar_face_detector_location); dlib::frontal_face_detector face_detector_hog = dlib::get_frontal_face_detector(); + LandmarkDetector::FaceDetectorMTCNN face_detector_mtcnn(det_parameters.mtcnn_face_detector_location); // Load facial feature extractor and AU analyser (make sure it is static) FaceAnalysis::FaceAnalyserParameters face_analysis_params(arguments); @@ -393,10 +396,15 @@ int main(int argc, char **argv) vector confidences; LandmarkDetector::DetectFacesHOG(face_detections, grayscale_image, face_detector_hog, confidences); } - else + else if (det_parameters.curr_face_detector == LandmarkDetector::FaceModelParameters::HAAR_DETECTOR) { LandmarkDetector::DetectFaces(face_detections, grayscale_image, classifier); } + else + { + vector confidences; + LandmarkDetector::DetectFacesMTCNN(face_detections, read_image, face_detector_mtcnn, confidences); + } // 
Detect landmarks around detected faces int face_det = 0; @@ -414,7 +422,7 @@ int main(int argc, char **argv) cv::Point3f gazeDirection1(0, 0, -1); if (success && det_parameters.track_gaze) - { + { GazeAnalysis::EstimateGaze(clnf_model, gazeDirection0, fx, fy, cx, cy, true); GazeAnalysis::EstimateGaze(clnf_model, gazeDirection1, fx, fy, cx, cy, false); diff --git a/exe/FaceLandmarkVid/FaceLandmarkVid.cpp b/exe/FaceLandmarkVid/FaceLandmarkVid.cpp index 4912aed4..fcb405b0 100644 --- a/exe/FaceLandmarkVid/FaceLandmarkVid.cpp +++ b/exe/FaceLandmarkVid/FaceLandmarkVid.cpp @@ -152,7 +152,7 @@ int main(int argc, char **argv) // Some initial parameters that can be overriden from command line vector files, output_video_files, out_dummy; - + // By default try webcam 0 int device = 0; @@ -281,20 +281,8 @@ int main(int argc, char **argv) while (!captured_image.empty()) { - // Reading the images - cv::Mat_ grayscale_image; - - if (captured_image.channels() == 3) - { - cv::cvtColor(captured_image, grayscale_image, CV_BGR2GRAY); - } - else - { - grayscale_image = captured_image.clone(); - } - // The actual facial landmark detection / tracking - bool detection_success = LandmarkDetector::DetectLandmarksInVideo(grayscale_image, clnf_model, det_parameters); + bool detection_success = LandmarkDetector::DetectLandmarksInVideo(captured_image, clnf_model, det_parameters); // Visualising the results // Drawing the facial landmarks on the face and the bounding box around it if tracking is successful and initialised @@ -311,7 +299,7 @@ int main(int argc, char **argv) } visualise_tracking(captured_image, clnf_model, det_parameters, gazeDirection0, gazeDirection1, frame_count, fx, fy, cx, cy); - + // output the tracked video if (!output_video_files.empty()) { diff --git a/exe/FaceLandmarkVidMulti/FaceLandmarkVidMulti.cpp b/exe/FaceLandmarkVidMulti/FaceLandmarkVidMulti.cpp index 8cf3814c..98546e21 100644 --- a/exe/FaceLandmarkVidMulti/FaceLandmarkVidMulti.cpp +++ 
b/exe/FaceLandmarkVidMulti/FaceLandmarkVidMulti.cpp @@ -120,7 +120,7 @@ int main(int argc, char **argv) // This is so that the model would not try re-initialising itself det_params.reinit_video_every = -1; - det_params.curr_face_detector = LandmarkDetector::FaceModelParameters::HOG_SVM_DETECTOR; + det_params.curr_face_detector = LandmarkDetector::FaceModelParameters::MTCNN_DETECTOR; vector det_parameters; det_parameters.push_back(det_params); @@ -139,8 +139,10 @@ int main(int argc, char **argv) int num_faces_max = 4; LandmarkDetector::CLNF clnf_model(det_parameters[0].model_location); - clnf_model.face_detector_HAAR.load(det_parameters[0].face_detector_location); - clnf_model.face_detector_location = det_parameters[0].face_detector_location; + clnf_model.face_detector_HAAR.load(det_parameters[0].haar_face_detector_location); + clnf_model.haar_face_detector_location = det_parameters[0].haar_face_detector_location; + clnf_model.face_detector_MTCNN.Read(det_parameters[0].mtcnn_face_detector_location); + clnf_model.mtcnn_face_detector_location = det_parameters[0].mtcnn_face_detector_location; clnf_models.reserve(num_faces_max); @@ -271,10 +273,15 @@ int main(int argc, char **argv) vector confidences; LandmarkDetector::DetectFacesHOG(face_detections, grayscale_image, clnf_models[0].face_detector_HOG, confidences); } - else + else if (det_parameters[0].curr_face_detector == LandmarkDetector::FaceModelParameters::HAAR_DETECTOR) { LandmarkDetector::DetectFaces(face_detections, grayscale_image, clnf_models[0].face_detector_HAAR); } + else + { + vector confidences; + LandmarkDetector::DetectFacesMTCNN(face_detections, captured_image, clnf_models[0].face_detector_MTCNN, confidences); + } } diff --git a/exe/FeatureExtraction/FeatureExtraction.cpp b/exe/FeatureExtraction/FeatureExtraction.cpp index 46e4ef4c..e27fd80b 100644 --- a/exe/FeatureExtraction/FeatureExtraction.cpp +++ b/exe/FeatureExtraction/FeatureExtraction.cpp @@ -220,7 +220,7 @@ int main(int argc, char **argv) // 
Some initial parameters that can be overriden from command line vector input_files, output_files, tracked_videos_output; - + // Get the input output file parameters // Indicates that rotation should be with respect to camera or world coordinates @@ -371,8 +371,8 @@ int main(int argc, char **argv) } // If image sequence provided, assume the fps is 30 fps_vid_in = 30; - } - + } + // If optical centers are not defined just use center of image if (cx_undefined) { @@ -450,29 +450,17 @@ int main(int argc, char **argv) // if loading images assume 30fps time_stamp = (double)frame_count * (1.0 / 30.0); } - - // Reading the images - cv::Mat_ grayscale_image; - - if (captured_image.channels() == 3) - { - cvtColor(captured_image, grayscale_image, CV_BGR2GRAY); - } - else - { - grayscale_image = captured_image.clone(); - } - + // The actual facial landmark detection / tracking bool detection_success; if (video_input || images_as_video) { - detection_success = LandmarkDetector::DetectLandmarksInVideo(grayscale_image, face_model, det_parameters); + detection_success = LandmarkDetector::DetectLandmarksInVideo(captured_image, face_model, det_parameters); } else { - detection_success = LandmarkDetector::DetectLandmarksInImage(grayscale_image, face_model, det_parameters); + detection_success = LandmarkDetector::DetectLandmarksInImage(captured_image, face_model, det_parameters); } diff --git a/lib/local/LandmarkDetector/LandmarkDetector.vcxproj b/lib/local/LandmarkDetector/LandmarkDetector.vcxproj index e542eaa5..c86c51f4 100644 --- a/lib/local/LandmarkDetector/LandmarkDetector.vcxproj +++ b/lib/local/LandmarkDetector/LandmarkDetector.vcxproj @@ -195,6 +195,7 @@ xcopy /I /E /Y /D "$(SolutionDir)lib\3rdParty\OpenCV3.1\classifiers" "$(OutDir)c Use + Use Use @@ -254,6 +255,7 @@ xcopy /I /E /Y /D "$(SolutionDir)lib\3rdParty\OpenCV3.1\classifiers" "$(OutDir)c + diff --git a/lib/local/LandmarkDetector/LandmarkDetector.vcxproj.filters 
b/lib/local/LandmarkDetector/LandmarkDetector.vcxproj.filters index 6d85a6f4..07e70f23 100644 --- a/lib/local/LandmarkDetector/LandmarkDetector.vcxproj.filters +++ b/lib/local/LandmarkDetector/LandmarkDetector.vcxproj.filters @@ -35,6 +35,7 @@ source + source @@ -76,6 +77,7 @@ headers + headers diff --git a/lib/local/LandmarkDetector/include/FaceDetectorMTCNN.h b/lib/local/LandmarkDetector/include/FaceDetectorMTCNN.h new file mode 100644 index 00000000..5ce11298 --- /dev/null +++ b/lib/local/LandmarkDetector/include/FaceDetectorMTCNN.h @@ -0,0 +1,156 @@ +/////////////////////////////////////////////////////////////////////////////// +// Copyright (C) 2016, Carnegie Mellon University and University of Cambridge, +// all rights reserved. +// +// THIS SOFTWARE IS PROVIDED “AS IS” FOR ACADEMIC USE ONLY AND ANY EXPRESS +// OR IMPLIED WARRANTIES WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY. +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. 
+// +// Notwithstanding the license granted herein, Licensee acknowledges that certain components +// of the Software may be covered by so-called “open source” software licenses (“Open Source +// Components”), which means any software licenses approved as open source licenses by the +// Open Source Initiative or any substantially similar licenses, including without limitation any +// license that, as a condition of distribution of the software licensed under such license, +// requires that the distributor make the software available in source code format. Licensor shall +// provide a list of Open Source Components for a particular version of the Software upon +// Licensee’s request. Licensee will comply with the applicable terms of such licenses and to +// the extent required by the licenses covering Open Source Components, the terms of such +// licenses will apply in lieu of the terms of this Agreement. To the extent the terms of the +// licenses applicable to Open Source Components prohibit any of the restrictions in this +// License Agreement with respect to such Open Source Component, such restrictions will not +// apply to such Open Source Component. To the extent the terms of the licenses applicable to +// Open Source Components require Licensor to make an offer to provide source code or +// related information in connection with the Software, such offer is hereby made. Any request +// for source code or related information should be directed to cl-face-tracker-distribution@lists.cam.ac.uk +// Licensee acknowledges receipt of notices for the Open Source Components for the initial +// delivery of the Software. 
+ +// * Any publications arising from the use of this software, including but +// not limited to academic journal and conference publications, technical +// reports and manuals, must cite at least one of the following works: +// +// OpenFace: an open source facial behavior analysis toolkit +// Tadas Baltrušaitis, Peter Robinson, and Louis-Philippe Morency +// in IEEE Winter Conference on Applications of Computer Vision, 2016 +// +// Rendering of Eyes for Eye-Shape Registration and Gaze Estimation +// Erroll Wood, Tadas Baltrušaitis, Xucong Zhang, Yusuke Sugano, Peter Robinson, and Andreas Bulling +// in IEEE International. Conference on Computer Vision (ICCV), 2015 +// +// Cross-dataset learning and person-speci?c normalisation for automatic Action Unit detection +// Tadas Baltrušaitis, Marwa Mahmoud, and Peter Robinson +// in Facial Expression Recognition and Analysis Challenge, +// IEEE International Conference on Automatic Face and Gesture Recognition, 2015 +// +// Constrained Local Neural Fields for robust facial landmark detection in the wild. +// Tadas Baltrušaitis, Peter Robinson, and Louis-Philippe Morency. +// in IEEE Int. Conference on Computer Vision Workshops, 300 Faces in-the-Wild Challenge, 2013. 
+// +/////////////////////////////////////////////////////////////////////////////// + +#ifndef __FACE_DETECTOR_MTCNN_h_ +#define __FACE_DETECTOR_MTCNN_h_ + +// OpenCV includes +#include + +// System includes +#include + +using namespace std; + +namespace LandmarkDetector +{ + class CNN + { + public: + + //========================================== + + // Default constructor + CNN() { ; } + + // Copy constructor + CNN(const CNN& other); + + // Given an image apply a CNN on it, the boolean direct controls if direct convolution is used (through matrix multiplication) or an FFT optimization + std::vector > Inference(const cv::Mat& input_img, bool direct = true); + + // Reading in the model + void Read(const string& location); + + // Clearing precomputed DFTs + void ClearPrecomp(); + + size_t NumberOfLayers() { return cnn_layer_types.size(); } + + private: + //========================================== + // Convolutional Neural Network + + // CNN layers + // Layer -> Weight matrix + vector > cnn_convolutional_layers_weights; + // Layer -> kernel -> input maps + vector > > > cnn_convolutional_layers; + vector > cnn_convolutional_layers_bias; + // Layer matrix + bas + vector > cnn_fully_connected_layers_weights; + vector > cnn_fully_connected_layers_biases; + vector > cnn_prelu_layer_weights; + vector > cnn_max_pooling_layers; + + // Precomputations for faster convolution + vector > > > > cnn_convolutional_layers_dft; + + // CNN: 0 - convolutional, 1 - max pooling, 2 - fully connected, 3 - prelu, 4 - sigmoid + vector cnn_layer_types; + }; + //=========================================================================== + // + // Checking if landmark detection was successful using an SVR regressor + // Using multiple validators trained add different views + // The regressor outputs -1 for ideal alignment and 1 for worst alignment + //=========================================================================== + class FaceDetectorMTCNN + { + + public: + + // Default 
constructor + FaceDetectorMTCNN() { ; } + + FaceDetectorMTCNN(const string& location); + + // Copy constructor + FaceDetectorMTCNN(const FaceDetectorMTCNN& other); + + // Given an image, orientation and detected landmarks output the result of the appropriate regressor + bool DetectFaces(vector >& o_regions, const cv::Mat& input_img, std::vector& o_confidences, int min_face = 60, double t1 = 0.6, double t2 = 0.7, double t3 = 0.7); + + // Reading in the model + void Read(const string& location); + + // Indicate if the model has been read in + bool empty() { return PNet.NumberOfLayers() == 0 || RNet.NumberOfLayers() == 0 || ONet.NumberOfLayers() == 0; }; + + private: + //========================================== + // Components of the model + + CNN PNet; + CNN RNet; + CNN ONet; + + }; + +} +#endif diff --git a/lib/local/LandmarkDetector/include/LandmarkDetectorFunc.h b/lib/local/LandmarkDetector/include/LandmarkDetectorFunc.h index 42a23ce3..9c1aa8c0 100644 --- a/lib/local/LandmarkDetector/include/LandmarkDetectorFunc.h +++ b/lib/local/LandmarkDetector/include/LandmarkDetectorFunc.h @@ -54,16 +54,16 @@ namespace LandmarkDetector // Landmark detection in videos, need to provide an image and model parameters (default values work well) // Optionally can provide a bounding box from which to start tracking //================================================================================================================ - bool DetectLandmarksInVideo(const cv::Mat_ &grayscale_image, CLNF& clnf_model, FaceModelParameters& params); - bool DetectLandmarksInVideo(const cv::Mat_ &grayscale_image, const cv::Rect_ bounding_box, CLNF& clnf_model, FaceModelParameters& params); + bool DetectLandmarksInVideo(const cv::Mat &image, CLNF& clnf_model, FaceModelParameters& params); + bool DetectLandmarksInVideo(const cv::Mat &image, const cv::Rect_ bounding_box, CLNF& clnf_model, FaceModelParameters& params); 
//================================================================================================================ // Landmark detection in image, need to provide an image and optionally CLNF model together with parameters (default values work well) // Optionally can provide a bounding box in which detection is performed (this is useful if multiple faces are to be detected in images) //================================================================================================================ - bool DetectLandmarksInImage(const cv::Mat_ &grayscale_image, CLNF& clnf_model, FaceModelParameters& params); + bool DetectLandmarksInImage(const cv::Mat &image, CLNF& clnf_model, FaceModelParameters& params); // Providing a bounding box - bool DetectLandmarksInImage(const cv::Mat_ &grayscale_image, const cv::Rect_ bounding_box, CLNF& clnf_model, FaceModelParameters& params); + bool DetectLandmarksInImage(const cv::Mat &image, const cv::Rect_ bounding_box, CLNF& clnf_model, FaceModelParameters& params); //================================================================ // Helper function for getting head pose from CLNF parameters diff --git a/lib/local/LandmarkDetector/include/LandmarkDetectorModel.h b/lib/local/LandmarkDetector/include/LandmarkDetectorModel.h index e46c1cd4..3cee3704 100644 --- a/lib/local/LandmarkDetector/include/LandmarkDetectorModel.h +++ b/lib/local/LandmarkDetector/include/LandmarkDetectorModel.h @@ -47,6 +47,7 @@ #include "Patch_experts.h" #include "LandmarkDetectionValidator.h" #include "LandmarkDetectorParameters.h" +#include "FaceDetectorMTCNN.h" using namespace std; @@ -85,13 +86,17 @@ public: //==================== Helpers for face detection and landmark detection validation ========================================= + // TODO these should be static, and loading should be made easier + // Haar cascade classifier for face detection cv::CascadeClassifier face_detector_HAAR; - string face_detector_location; - + string haar_face_detector_location; 
+ // A HOG SVM-struct based face detector dlib::frontal_face_detector face_detector_HOG; + FaceDetectorMTCNN face_detector_MTCNN; + string mtcnn_face_detector_location; // Validate if the detected landmarks are correct using an SVR regressor DetectionValidator landmark_validator; diff --git a/lib/local/LandmarkDetector/include/LandmarkDetectorParameters.h b/lib/local/LandmarkDetector/include/LandmarkDetectorParameters.h index 77ed1683..2e259fbd 100644 --- a/lib/local/LandmarkDetector/include/LandmarkDetectorParameters.h +++ b/lib/local/LandmarkDetector/include/LandmarkDetectorParameters.h @@ -88,9 +88,11 @@ struct FaceModelParameters // Determining which face detector to use for (re)initialisation, HAAR is quicker but provides more false positives and is not goot for in-the-wild conditions // Also HAAR detector can detect smaller faces while HOG SVM is only capable of detecting faces at least 70px across - enum FaceDetector{HAAR_DETECTOR, HOG_SVM_DETECTOR}; + // MTCNN detector is much more accurate that the other two, and is even suitable for profile faces, but it is somewhat slower + enum FaceDetector{HAAR_DETECTOR, HOG_SVM_DETECTOR, MTCNN_DETECTOR}; - string face_detector_location; + string haar_face_detector_location; + string mtcnn_face_detector_location; FaceDetector curr_face_detector; // Should the results be visualised and reported to console diff --git a/lib/local/LandmarkDetector/include/LandmarkDetectorUtils.h b/lib/local/LandmarkDetector/include/LandmarkDetectorUtils.h index 974a7df4..56f113bc 100644 --- a/lib/local/LandmarkDetector/include/LandmarkDetectorUtils.h +++ b/lib/local/LandmarkDetector/include/LandmarkDetectorUtils.h @@ -41,6 +41,8 @@ #include "LandmarkDetectorModel.h" +#include "FaceDetectorMTCNN.h" + using namespace std; namespace LandmarkDetector @@ -135,6 +137,11 @@ namespace LandmarkDetector // The preference point allows for disambiguation if multiple faces are present (pick the closest one), if it is not set the biggest face is chosen 
bool DetectSingleFaceHOG(cv::Rect_& o_region, const cv::Mat_& intensity, dlib::frontal_face_detector& classifier, double& confidence, const cv::Point preference = cv::Point(-1, -1), double min_width = -1, cv::Rect_ roi = cv::Rect_(0.0, 0.0, 1.0, 1.0)); + // Face detection using Multi-task Convolutional Neural Network + bool DetectFacesMTCNN(vector >& o_regions, const cv::Mat& image, LandmarkDetector::FaceDetectorMTCNN& detector, std::vector& confidences); + // The preference point allows for disambiguation if multiple faces are present (pick the closest one), if it is not set the biggest face is chosen + bool DetectSingleFaceMTCNN(cv::Rect_& o_region, const cv::Mat& image, LandmarkDetector::FaceDetectorMTCNN& detector, double& confidence, const cv::Point preference = cv::Point(-1, -1)); + //============================================================================ // Matrix reading functionality //============================================================================ diff --git a/lib/local/LandmarkDetector/model/mtcnn_detector/MTCNN_detector.txt b/lib/local/LandmarkDetector/model/mtcnn_detector/MTCNN_detector.txt new file mode 100644 index 00000000..9a4f805b --- /dev/null +++ b/lib/local/LandmarkDetector/model/mtcnn_detector/MTCNN_detector.txt @@ -0,0 +1,3 @@ +PNet PNet.dat +RNet RNet.dat +ONet ONet.dat diff --git a/lib/local/LandmarkDetector/model/mtcnn_detector/ONet.dat b/lib/local/LandmarkDetector/model/mtcnn_detector/ONet.dat new file mode 100644 index 00000000..291c4462 Binary files /dev/null and b/lib/local/LandmarkDetector/model/mtcnn_detector/ONet.dat differ diff --git a/lib/local/LandmarkDetector/model/mtcnn_detector/PNet.dat b/lib/local/LandmarkDetector/model/mtcnn_detector/PNet.dat new file mode 100644 index 00000000..9550d39a Binary files /dev/null and b/lib/local/LandmarkDetector/model/mtcnn_detector/PNet.dat differ diff --git a/lib/local/LandmarkDetector/model/mtcnn_detector/RNet.dat b/lib/local/LandmarkDetector/model/mtcnn_detector/RNet.dat 
new file mode 100644 index 00000000..864e0dd9 Binary files /dev/null and b/lib/local/LandmarkDetector/model/mtcnn_detector/RNet.dat differ diff --git a/lib/local/LandmarkDetector/src/FaceDetectorMTCNN.cpp b/lib/local/LandmarkDetector/src/FaceDetectorMTCNN.cpp new file mode 100644 index 00000000..953972ba --- /dev/null +++ b/lib/local/LandmarkDetector/src/FaceDetectorMTCNN.cpp @@ -0,0 +1,1346 @@ +/////////////////////////////////////////////////////////////////////////////// +// Copyright (C) 2016, Carnegie Mellon University and University of Cambridge, +// all rights reserved. +// +// THIS SOFTWARE IS PROVIDED “AS IS” FOR ACADEMIC USE ONLY AND ANY EXPRESS +// OR IMPLIED WARRANTIES WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS +// BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY. +// OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Notwithstanding the license granted herein, Licensee acknowledges that certain components +// of the Software may be covered by so-called “open source” software licenses (“Open Source +// Components”), which means any software licenses approved as open source licenses by the +// Open Source Initiative or any substantially similar licenses, including without limitation any +// license that, as a condition of distribution of the software licensed under such license, +// requires that the distributor make the software available in source code format. 
Licensor shall +// provide a list of Open Source Components for a particular version of the Software upon +// Licensee’s request. Licensee will comply with the applicable terms of such licenses and to +// the extent required by the licenses covering Open Source Components, the terms of such +// licenses will apply in lieu of the terms of this Agreement. To the extent the terms of the +// licenses applicable to Open Source Components prohibit any of the restrictions in this +// License Agreement with respect to such Open Source Component, such restrictions will not +// apply to such Open Source Component. To the extent the terms of the licenses applicable to +// Open Source Components require Licensor to make an offer to provide source code or +// related information in connection with the Software, such offer is hereby made. Any request +// for source code or related information should be directed to cl-face-tracker-distribution@lists.cam.ac.uk +// Licensee acknowledges receipt of notices for the Open Source Components for the initial +// delivery of the Software. + +// * Any publications arising from the use of this software, including but +// not limited to academic journal and conference publications, technical +// reports and manuals, must cite at least one of the following works: +// +// OpenFace: an open source facial behavior analysis toolkit +// Tadas Baltrušaitis, Peter Robinson, and Louis-Philippe Morency +// in IEEE Winter Conference on Applications of Computer Vision, 2016 +// +// Rendering of Eyes for Eye-Shape Registration and Gaze Estimation +// Erroll Wood, Tadas Baltrušaitis, Xucong Zhang, Yusuke Sugano, Peter Robinson, and Andreas Bulling +// in IEEE International. 
Conference on Computer Vision (ICCV), 2015 +// +// Cross-dataset learning and person-speci?c normalisation for automatic Action Unit detection +// Tadas Baltrušaitis, Marwa Mahmoud, and Peter Robinson +// in Facial Expression Recognition and Analysis Challenge, +// IEEE International Conference on Automatic Face and Gesture Recognition, 2015 +// +// Constrained Local Neural Fields for robust facial landmark detection in the wild. +// Tadas Baltrušaitis, Peter Robinson, and Louis-Philippe Morency. +// in IEEE Int. Conference on Computer Vision Workshops, 300 Faces in-the-Wild Challenge, 2013. +// +/////////////////////////////////////////////////////////////////////////////// + +#include "stdafx.h" + +#include "FaceDetectorMTCNN.h" + +// OpenCV includes +#include +#include + +// TBB includes +#include + +// System includes +#include + +// Math includes +#define _USE_MATH_DEFINES +#include + +// Boost includes +#include +#include + + +#ifndef M_PI +#define M_PI 3.14159265358979323846 +#endif + +#include "LandmarkDetectorUtils.h" + +using namespace LandmarkDetector; + +// Constructor from model file location +FaceDetectorMTCNN::FaceDetectorMTCNN(const string& location) +{ + this->Read(location); +} +// Copy constructor +FaceDetectorMTCNN::FaceDetectorMTCNN(const FaceDetectorMTCNN& other) : PNet(other.PNet), RNet(other.RNet), ONet(other.ONet) +{ +} + +CNN::CNN(const CNN& other) : cnn_layer_types(other.cnn_layer_types), cnn_max_pooling_layers(other.cnn_max_pooling_layers), cnn_convolutional_layers_bias(other.cnn_convolutional_layers_bias) +{ + + this->cnn_convolutional_layers_weights.resize(other.cnn_convolutional_layers_weights.size()); + for (size_t l = 0; l < other.cnn_convolutional_layers_weights.size(); ++l) + { + // Make sure the matrix is copied. 
+ this->cnn_convolutional_layers_weights[l] = other.cnn_convolutional_layers_weights[l].clone(); + } + + this->cnn_convolutional_layers.resize(other.cnn_convolutional_layers.size()); + for (size_t l = 0; l < other.cnn_convolutional_layers.size(); ++l) + { + this->cnn_convolutional_layers[l].resize(other.cnn_convolutional_layers[l].size()); + + for (size_t i = 0; i < other.cnn_convolutional_layers[l].size(); ++i) + { + this->cnn_convolutional_layers[l][i].resize(other.cnn_convolutional_layers[l][i].size()); + + for (size_t k = 0; k < other.cnn_convolutional_layers[l][i].size(); ++k) + { + // Make sure the matrix is copied. + this->cnn_convolutional_layers[l][i][k] = other.cnn_convolutional_layers[l][i][k].clone(); + } + } + } + + this->cnn_fully_connected_layers_weights.resize(other.cnn_fully_connected_layers_weights.size()); + + for (size_t l = 0; l < other.cnn_fully_connected_layers_weights.size(); ++l) + { + // Make sure the matrix is copied. + this->cnn_fully_connected_layers_weights[l] = other.cnn_fully_connected_layers_weights[l].clone(); + } + + this->cnn_fully_connected_layers_biases.resize(other.cnn_fully_connected_layers_biases.size()); + + for (size_t l = 0; l < other.cnn_fully_connected_layers_biases.size(); ++l) + { + // Make sure the matrix is copied. + this->cnn_fully_connected_layers_biases[l] = other.cnn_fully_connected_layers_biases[l].clone(); + } + + this->cnn_prelu_layer_weights.resize(other.cnn_prelu_layer_weights.size()); + + for (size_t l = 0; l < other.cnn_prelu_layer_weights.size(); ++l) + { + // Make sure the matrix is copied. 
+ this->cnn_prelu_layer_weights[l] = other.cnn_prelu_layer_weights[l].clone(); + } +} + +void PReLU(std::vector >& input_output_maps, cv::Mat_ prelu_weights) +{ + + if (input_output_maps.size() > 1) + { + int h = input_output_maps[0].rows; + int w = input_output_maps[0].cols; + size_t size_in = h * w; + + for (size_t k = 0; k < input_output_maps.size(); ++k) + { + // Apply the PReLU + auto iter = input_output_maps[k].begin(); + + float neg_mult = prelu_weights.at(k); + + for(size_t i = 0; i < size_in; ++i) + { + float in_val = *iter; + + // The prelu step + *iter++ = in_val >= 0 ? in_val : in_val * neg_mult; + + } + } + } + else + { + + int w = input_output_maps[0].cols; + + for (size_t k = 0; k < prelu_weights.rows; ++k) + { + auto iter = input_output_maps[0].row(k).begin(); + float neg_mult = prelu_weights.at(k); + + for (size_t i = 0; i < w; ++i) + { + float in_val = *iter; + // Apply the PReLU + *iter++ = in_val >= 0 ? in_val : in_val * neg_mult; + } + } + + } + +} + +void fully_connected(std::vector >& outputs, const std::vector >& input_maps, cv::Mat_ weights, cv::Mat_ biases) +{ + outputs.clear(); + + if (input_maps.size() > 1) + { + // Concatenate all the maps + cv::Size orig_size = input_maps[0].size(); + cv::Mat_ input_concat(input_maps.size(), input_maps[0].cols * input_maps[0].rows); + + for (size_t in = 0; in < input_maps.size(); ++in) + { + cv::Mat_ add = input_maps[in]; + + // Reshape if all of the data will be flattened + if (input_concat.rows != weights.cols) + { + add = add.t(); + } + + add = add.reshape(0, 1); + add.copyTo(input_concat.row(in)); + } + + // Treat the input as separate feature maps + if (input_concat.rows == weights.cols) + { + input_concat = weights * input_concat; + // Add biases + for (size_t k = 0; k < biases.rows; ++k) + { + input_concat.row(k) = input_concat.row(k) + biases.at(k); + } + + outputs.clear(); + // Resize and add as output + for (size_t k = 0; k < biases.rows; ++k) + { + cv::Mat_ reshaped = 
input_concat.row(k).clone(); + reshaped = reshaped.reshape(1, orig_size.height); + outputs.push_back(reshaped); + } + } + else + { + // Flatten the input + input_concat = input_concat.reshape(0, input_concat.rows * input_concat.cols); + + input_concat = weights * input_concat + biases; + + outputs.clear(); + outputs.push_back(input_concat); + } + + } + else + { + cv::Mat out = weights * input_maps[0] + biases; + outputs.clear(); + outputs.push_back(out.t()); + } + +} + +void max_pooling(std::vector >& outputs, const std::vector >& input_maps, int stride_x, int stride_y, int kernel_size_x, int kernel_size_y) +{ + vector > outputs_sub; + + // Iterate over kernel height and width, based on stride + for (size_t in = 0; in < input_maps.size(); ++in) + { + // Help with rounding up a bit, to match caffe style output + int out_x = round((double)(input_maps[in].cols - kernel_size_x) / (double)stride_x) + 1; + int out_y = round((double)(input_maps[in].rows - kernel_size_y) / (double)stride_y) + 1; + + cv::Mat_ sub_out(out_y, out_x, 0.0); + cv::Mat_ in_map = input_maps[in]; + + for (int x = 0; x < input_maps[in].cols; x += stride_x) + { + int max_x = cv::min(input_maps[in].cols, x + kernel_size_x); + int x_in_out = floor(x / stride_x); + + if (x_in_out >= out_x) + continue; + + for (int y = 0; y < input_maps[in].rows; y += stride_y) + { + int y_in_out = floor(y / stride_y); + + if (y_in_out >= out_y) + continue; + + int max_y = cv::min(input_maps[in].rows, y + kernel_size_y); + + float curr_max = -FLT_MAX; + + for (int x_in = x; x_in < max_x; ++x_in) + { + for (int y_in = y; y_in < max_y; ++y_in) + { + float curr_val = in_map.at(y_in, x_in); + if (curr_val > curr_max) + { + curr_max = curr_val; + } + } + } + sub_out.at(y_in_out, x_in_out) = curr_max; + } + } + + outputs_sub.push_back(sub_out); + + } + outputs = outputs_sub; + +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////// + +void convolution_single_kern_fft(const 
vector >& input_imgs, vector >& img_dfts, const vector >& _templs, map > >& _templ_dfts, cv::Mat_& result) +{ + // Assume result is defined properly + if (result.empty()) + { + cv::Size corrSize(input_imgs[0].cols - _templs[0].cols + 1, input_imgs[0].rows - _templs[0].rows + 1); + result.create(corrSize); + } + + // Our model will always be under min block size so can ignore this + //const double blockScale = 4.5; + //const int minBlockSize = 256; + + int maxDepth = CV_64F; + + cv::Size dftsize; + + dftsize.width = cv::getOptimalDFTSize(result.cols + _templs[0].cols - 1); + dftsize.height = cv::getOptimalDFTSize(result.rows + _templs[0].rows - 1); + + // Compute block size + cv::Size blocksize; + blocksize.width = dftsize.width - _templs[0].cols + 1; + blocksize.width = MIN(blocksize.width, result.cols); + blocksize.height = dftsize.height - _templs[0].rows + 1; + blocksize.height = MIN(blocksize.height, result.rows); + + vector> dftTempl; + + // if this has not been precomputed, precompute it, otherwise use it + if (_templ_dfts.find(dftsize.width) == _templ_dfts.end()) + { + dftTempl.resize(_templs.size()); + for (size_t k = 0; k < _templs.size(); ++k) + { + dftTempl[k].create(dftsize.height, dftsize.width); + + cv::Mat_ src = _templs[k]; + + cv::Mat_ dst(dftTempl[k], cv::Rect(0, 0, dftsize.width, dftsize.height)); + + cv::Mat_ dst1(dftTempl[k], cv::Rect(0, 0, _templs[k].cols, _templs[k].rows)); + + if (dst1.data != src.data) + src.convertTo(dst1, dst1.depth()); + + if (dst.cols > _templs[k].cols) + { + cv::Mat_ part(dst, cv::Range(0, _templs[k].rows), cv::Range(_templs[k].cols, dst.cols)); + part.setTo(0); + } + + // Perform DFT of the template + dft(dst, dst, 0, _templs[k].rows); + + } + _templ_dfts[dftsize.width] = dftTempl; + + } + else + { + dftTempl = _templ_dfts[dftsize.width]; + } + + cv::Size bsz(std::min(blocksize.width, result.cols), std::min(blocksize.height, result.rows)); + cv::Mat src; + + cv::Mat cdst(result, cv::Rect(0, 0, bsz.width, bsz.height)); 
+ + vector > dftImgs; + dftImgs.resize(input_imgs.size()); + + if (img_dfts.empty()) + { + for(size_t k = 0; k < input_imgs.size(); ++k) + { + dftImgs[k].create(dftsize); + dftImgs[k].setTo(0.0); + + cv::Size dsz(bsz.width + _templs[k].cols - 1, bsz.height + _templs[k].rows - 1); + + int x2 = std::min(input_imgs[k].cols, dsz.width); + int y2 = std::min(input_imgs[k].rows, dsz.height); + + cv::Mat src0(input_imgs[k], cv::Range(0, y2), cv::Range(0, x2)); + cv::Mat dst(dftImgs[k], cv::Rect(0, 0, dsz.width, dsz.height)); + cv::Mat dst1(dftImgs[k], cv::Rect(0, 0, x2, y2)); + + src = src0; + + if (dst1.data != src.data) + src.convertTo(dst1, dst1.depth()); + + dft(dftImgs[k], dftImgs[k], 0, dsz.height); + img_dfts.push_back(dftImgs[k].clone()); + } + } + + cv::Mat_ dft_img(img_dfts[0].rows, img_dfts[0].cols, 0.0); + for (size_t k = 0; k < input_imgs.size(); ++k) + { + cv::Mat dftTempl1(dftTempl[k], cv::Rect(0, 0, dftsize.width, dftsize.height)); + if (k == 0) + { + cv::mulSpectrums(img_dfts[k], dftTempl1, dft_img, 0, true); + } + else + { + cv::mulSpectrums(img_dfts[k], dftTempl1, dftImgs[k], 0, true); + dft_img = dft_img + dftImgs[k]; + } + } + + cv::dft(dft_img, dft_img, cv::DFT_INVERSE + cv::DFT_SCALE, bsz.height); + + src = dft_img(cv::Rect(0, 0, bsz.width, bsz.height)); + + src.convertTo(cdst, CV_32F); + +} + +void im2col_t(const cv::Mat_& input, int width, int height, cv::Mat_& output) +{ + + int m = input.cols; + int n = input.rows; + + // determine how many blocks there will be with a sliding window of width x height in the input + int yB = m - height + 1; + int xB = n - width + 1; + + // Allocate the output size + if (output.rows != width * height && output.cols != xB*yB) + { + output = cv::Mat::ones(width * height, xB*yB, CV_32F); + } + + // Iterate over the whole image + for (int i = 0; i< yB; i++) + { + int rowIdx = i; + for (int j = 0; j< xB; j++) + { + //int rowIdx = i; +j*yB; + // iterate over the blocks within the image + for (unsigned int yy = 0; yy < 
height; ++yy) + { + // Faster iteration over the image + const float* Mi = input.ptr(j + yy); + for (unsigned int xx = 0; xx < width; ++xx) + { + int colIdx = xx*height + yy; + + output.at(colIdx, rowIdx) = Mi[i + xx]; + } + } + rowIdx += yB; + + } + } +} + +void convolution_direct(std::vector >& outputs, const std::vector >& input_maps, const cv::Mat_& weight_matrix, const std::vector& biases, int height_k, int width_k) +{ + outputs.clear(); + + int height_in = input_maps[0].rows; + int width_n = input_maps[0].cols; + + // determine how many blocks there will be with a sliding window of width x height in the input + int yB = height_in - height_k + 1; + int xB = width_n - width_k + 1; + + cv::Mat_ input_matrix(input_maps.size() * height_k * width_k + 1.0, yB * xB, 1.0f); + + // Comibine im2col accross channels to prepare for matrix multiplication + for (size_t i = 0; i < input_maps.size(); ++i) + { + im2col_t(input_maps[i], width_k, height_k, input_matrix(cv::Rect(0, i * height_k * width_k, yB * xB, height_k * width_k))); + } + + // Actual convolution (through multiplication) + cv::Mat_ out = weight_matrix * input_matrix; + + // Move back to vectors and reshape accordingly (also add the bias) + for (size_t k = 0; k < out.rows; ++k) + { + outputs.push_back(out.row(k).reshape(1, yB)); + } + +} + +void convolution_fft2(std::vector >& outputs, const std::vector >& input_maps, const std::vector > >& kernels, const std::vector& biases, vector > > >& precomp_dfts) +{ + outputs.clear(); + + // Useful precomputed data placeholders for quick correlation (convolution) + vector > input_image_dft; + + for (size_t k = 0; k < kernels.size(); ++k) + { + + // The convolution (with precomputation) + cv::Mat_ output; + convolution_single_kern_fft(input_maps, input_image_dft, kernels[k], precomp_dfts[k], output); + + // Combining the maps + outputs.push_back(output + biases[k]); + + } +} + +void convolution_fft(std::vector >& outputs, const std::vector >& input_maps, const std::vector 
> >& kernels, const std::vector& biases, vector > > >& precomp_dfts) +{ + outputs.clear(); + for (size_t in = 0; in < input_maps.size(); ++in) + { + cv::Mat_ input_image = input_maps[in]; + + // Useful precomputed data placeholders for quick correlation (convolution) + cv::Mat_ input_image_dft; + cv::Mat integral_image; + cv::Mat integral_image_sq; + + for (size_t k = 0; k < kernels[in].size(); ++k) + { + cv::Mat_ kernel = kernels[in][k]; + + // The convolution (with precomputation) + cv::Mat_ output; + if (precomp_dfts[in][k].second.empty()) + { + std::map > precomputed_dft; + + LandmarkDetector::matchTemplate_m(input_image, input_image_dft, integral_image, integral_image_sq, kernel, precomputed_dft, output, CV_TM_CCORR); + + precomp_dfts[in][k].first = precomputed_dft.begin()->first; + precomp_dfts[in][k].second = precomputed_dft.begin()->second; + } + else + { + std::map > precomputed_dft; + precomputed_dft[precomp_dfts[in][k].first] = precomp_dfts[in][k].second; + LandmarkDetector::matchTemplate_m(input_image, input_image_dft, integral_image, integral_image_sq, kernel, precomputed_dft, output, CV_TM_CCORR); + } + + // Combining the maps + if (in == 0) + { + outputs.push_back(output); + } + else + { + outputs[k] = outputs[k] + output; + } + + } + + } + + for (size_t k = 0; k < biases.size(); ++k) + { + outputs[k] = outputs[k] + biases[k]; + } +} + +std::vector> CNN::Inference(const cv::Mat& input_img, bool direct) +{ + if (input_img.channels() == 1) + { + cv::cvtColor(input_img, input_img, cv::COLOR_GRAY2BGR); + } + + int cnn_layer = 0; + int fully_connected_layer = 0; + int prelu_layer = 0; + int max_pool_layer = 0; + + // Slit a BGR image into three chnels + cv::Mat channels[3]; + cv::split(input_img, channels); + + // Flip the BGR order to RGB + vector > input_maps; + input_maps.push_back(channels[2]); + input_maps.push_back(channels[1]); + input_maps.push_back(channels[0]); + + vector > outputs; + + for (size_t layer = 0; layer < cnn_layer_types.size(); 
++layer) + { + + // Determine layer type + int layer_type = cnn_layer_types[layer]; + + // Convolutional layer + if (layer_type == 0) + { + + // Either perform direct convolution through matrix multiplication or use an FFT optimized version, which one is optimal depends on the kernel and input sizes + if (direct) + { + convolution_direct(outputs, input_maps, cnn_convolutional_layers_weights[cnn_layer], cnn_convolutional_layers_bias[cnn_layer], cnn_convolutional_layers[cnn_layer][0][0].rows, cnn_convolutional_layers[cnn_layer][0][0].cols); + } + else + { + convolution_fft2(outputs, input_maps, cnn_convolutional_layers[cnn_layer], cnn_convolutional_layers_bias[cnn_layer], cnn_convolutional_layers_dft[cnn_layer]); + } + //vector > outs; + //convolution_fft(outs, input_maps, cnn_convolutional_layers[cnn_layer], cnn_convolutional_layers_bias[cnn_layer], cnn_convolutional_layers_dft[cnn_layer]); + + + + cnn_layer++; + } + if (layer_type == 1) + { + + int stride_x = std::get<2>(cnn_max_pooling_layers[max_pool_layer]); + int stride_y = std::get<3>(cnn_max_pooling_layers[max_pool_layer]); + + int kernel_size_x = std::get<0>(cnn_max_pooling_layers[max_pool_layer]); + int kernel_size_y = std::get<1>(cnn_max_pooling_layers[max_pool_layer]); + + max_pooling(outputs, input_maps, stride_x, stride_y, kernel_size_x, kernel_size_y); + max_pool_layer++; + } + if (layer_type == 2) + { + fully_connected(outputs, input_maps, cnn_fully_connected_layers_weights[fully_connected_layer], cnn_fully_connected_layers_biases[fully_connected_layer]); + fully_connected_layer++; + } + if (layer_type == 3) // PReLU + { + // In place prelu computation + PReLU(input_maps, cnn_prelu_layer_weights[prelu_layer]); + outputs = input_maps; + prelu_layer++; + } + if (layer_type == 4) + { + outputs.clear(); + for (size_t k = 0; k < input_maps.size(); ++k) + { + // Apply the sigmoid + cv::exp(-input_maps[k], input_maps[k]); + input_maps[k] = 1.0 / (1.0 + input_maps[k]); + + outputs.push_back(input_maps[k]); + 
+ } + } + // Set the outputs of this layer to inputs of the next one + input_maps = outputs; + } + + + return outputs; + +} + +void ReadMatBin(std::ifstream& stream, cv::Mat &output_mat) +{ + // Read in the number of rows, columns and the data type + int row, col, type; + + stream.read((char*)&row, 4); + stream.read((char*)&col, 4); + stream.read((char*)&type, 4); + + output_mat = cv::Mat(row, col, type); + int size = output_mat.rows * output_mat.cols * output_mat.elemSize(); + stream.read((char *)output_mat.data, size); + +} + +void CNN::ClearPrecomp() +{ + for (size_t k1 = 0; k1 < cnn_convolutional_layers_dft.size(); ++k1) + { + for (size_t k2 = 0; k2 < cnn_convolutional_layers_dft[k1].size(); ++k2) + { + cnn_convolutional_layers_dft[k1][k2].clear(); + } + } +} + +void CNN::Read(const string& location) +{ + ifstream cnn_stream(location, ios::in | ios::binary); + if (cnn_stream.is_open()) + { + cnn_stream.seekg(0, ios::beg); + + // Reading in CNNs + + int network_depth; + cnn_stream.read((char*)&network_depth, 4); + + cnn_layer_types.resize(network_depth); + + for (int layer = 0; layer < network_depth; ++layer) + { + + int layer_type; + cnn_stream.read((char*)&layer_type, 4); + cnn_layer_types[layer] = layer_type; + + // convolutional + if (layer_type == 0) + { + + // Read the number of input maps + int num_in_maps; + cnn_stream.read((char*)&num_in_maps, 4); + + // Read the number of kernels for each input map + int num_kernels; + cnn_stream.read((char*)&num_kernels, 4); + + vector > > kernels; + + kernels.resize(num_in_maps); + + vector biases; + for (int k = 0; k < num_kernels; ++k) + { + float bias; + cnn_stream.read((char*)&bias, 4); + biases.push_back(bias); + } + + cnn_convolutional_layers_bias.push_back(biases); + + // For every input map + for (int in = 0; in < num_in_maps; ++in) + { + kernels[in].resize(num_kernels); + + // For every kernel on that input map + for (int k = 0; k < num_kernels; ++k) + { + ReadMatBin(cnn_stream, kernels[in][k]); + + } + } + 
+ // Rearrange the kernels for faster inference with FFT + vector > > kernels_rearr; + kernels_rearr.resize(num_kernels); + + // Fill up the rearranged layer + for (int k = 0; k < num_kernels; ++k) + { + for (int in = 0; in < num_in_maps; ++in) + { + kernels_rearr[k].push_back(kernels[in][k]); + } + } + + cnn_convolutional_layers.push_back(kernels_rearr); + + // Place-holders for DFT precomputation + vector > > > cnn_convolutional_layers_dft_curr_layer; + cnn_convolutional_layers_dft_curr_layer.resize(num_kernels); + cnn_convolutional_layers_dft.push_back(cnn_convolutional_layers_dft_curr_layer); + + // Rearrange the flattened kernels into weight matrices for direct convolution computation + cv::Mat_ weight_matrix(num_in_maps * kernels_rearr[0][0].rows * kernels_rearr[0][0].cols, num_kernels); + for (size_t k = 0; k < num_kernels; ++k) + { + for (size_t i = 0; i < num_in_maps; ++i) + { + // Flatten the kernel + cv::Mat_ k_flat = kernels_rearr[k][i].t(); + k_flat = k_flat.reshape(0, 1).t(); + k_flat.copyTo(weight_matrix(cv::Rect(k, i * kernels_rearr[0][0].rows * kernels_rearr[0][0].cols, 1, kernels_rearr[0][0].rows * kernels_rearr[0][0].cols))); + } + } + + // Transpose the weight matrix for more convenient computation + weight_matrix = weight_matrix.t(); + + // Add a bias term to the weight matrix for efficiency + cv::Mat_ W(weight_matrix.rows, weight_matrix.cols + 1, 1.0); + for (size_t k = 0; k < weight_matrix.rows; ++k) + { + W.at(k, weight_matrix.cols) = biases[k]; + } + weight_matrix.copyTo(W(cv::Rect(0, 0, weight_matrix.cols, weight_matrix.rows))); + + cnn_convolutional_layers_weights.push_back(W); + + } + else if (layer_type == 1) + { + int kernel_x, kernel_y, stride_x, stride_y; + cnn_stream.read((char*)&kernel_x, 4); + cnn_stream.read((char*)&kernel_y, 4); + cnn_stream.read((char*)&stride_x, 4); + cnn_stream.read((char*)&stride_y, 4); + cnn_max_pooling_layers.push_back(std::tuple(kernel_x, kernel_y, stride_x, stride_y)); + } + else if (layer_type == 2) + { 
+ cv::Mat_ biases; + ReadMatBin(cnn_stream, biases); + cnn_fully_connected_layers_biases.push_back(biases); + + // Fully connected layer + cv::Mat_ weights; + ReadMatBin(cnn_stream, weights); + cnn_fully_connected_layers_weights.push_back(weights.t()); + } + + else if (layer_type == 3) + { + cv::Mat_ weights; + ReadMatBin(cnn_stream, weights); + cnn_prelu_layer_weights.push_back(weights); + } + } + } + else + { + cout << "WARNING: Can't find the CNN location" << endl; + } +} + +//=========================================================================== +// Read in the MTCNN detector +void FaceDetectorMTCNN::Read(const string& location) +{ + + cout << "Reading the MTCNN face detector from: " << location << endl; + + ifstream locations(location.c_str(), ios_base::in); + if (!locations.is_open()) + { + cout << "Couldn't open the model file, aborting" << endl; + return; + } + string line; + + // The other module locations should be defined as relative paths from the main model + boost::filesystem::path root = boost::filesystem::path(location).parent_path(); + + // The main file contains the references to other files + while (!locations.eof()) + { + getline(locations, line); + + stringstream lineStream(line); + + string module; + string location; + + // figure out which module is to be read from which file + lineStream >> module; + + lineStream >> location; + + // remove carriage return at the end for compatibility with unix systems + if (location.size() > 0 && location.at(location.size() - 1) == '\r') + { + location = location.substr(0, location.size() - 1); + } + + // append to root + location = (root / location).string(); + if (module.compare("PNet") == 0) + { + cout << "Reading the PNet module from: " << location << endl; + PNet.Read(location); + } + else if(module.compare("RNet") == 0) + { + cout << "Reading the RNet module from: " << location << endl; + RNet.Read(location); + } + else if (module.compare("ONet") == 0) + { + cout << "Reading the ONet module from: 
" << location << endl; + ONet.Read(location); + } + } +} + +// Perform non maximum supression on proposal bounding boxes prioritizing boxes with high score/confidence +std::vector non_maximum_supression(const std::vector >& original_bb, const std::vector& scores, float thresh, bool minimum) +{ + + // Sort the input bounding boxes by the detection score, using the nice trick of multimap always being sorted internally + std::multimap idxs; + for (size_t i = 0; i < original_bb.size(); ++i) + { + idxs.insert(std::pair(scores[i], i)); + } + + std::vector output_ids; + + // keep looping while some indexes still remain in the indexes list + while (idxs.size() > 0) + { + // grab the last rectangle + auto lastElem = --std::end(idxs); + size_t curr_id = lastElem->second; + + const cv::Rect& rect1 = original_bb[curr_id]; + + idxs.erase(lastElem); + + // Iterate through remaining bounding boxes and choose which ones to remove + for (auto pos = std::begin(idxs); pos != std::end(idxs); ) + { + // grab the current rectangle + const cv::Rect& rect2 = original_bb[pos->second]; + + float intArea = (rect1 & rect2).area(); + float unionArea; + if (minimum) + { + unionArea = cv::min(rect1.area(), rect2.area()); + } + else + { + unionArea = rect1.area() + rect2.area() - intArea; + } + float overlap = intArea / unionArea; + + // Remove the bounding boxes with less confidence but with significant overlap with the current one + if (overlap > thresh) + { + pos = idxs.erase(pos); + } + else + { + ++pos; + } + } + output_ids.push_back(curr_id); + + } + + return output_ids; + +} + +// Helper function for selecting a subset of bounding boxes based on indices +void select_subset(const vector& to_keep, vector >& bounding_boxes, vector& scores, vector >& corrections) +{ + vector > bounding_boxes_tmp; + vector scores_tmp; + vector > corrections_tmp; + + for (size_t i = 0; i < to_keep.size(); ++i) + { + bounding_boxes_tmp.push_back(bounding_boxes[to_keep[i]]); + 
scores_tmp.push_back(scores[to_keep[i]]); + corrections_tmp.push_back(corrections[to_keep[i]]); + } + + bounding_boxes = bounding_boxes_tmp; + scores = scores_tmp; + corrections = corrections_tmp; +} + +// Use the heatmap generated by PNet to generate bounding boxes in the original image space, also generate the correction values and scores of the bounding boxes as well +void generate_bounding_boxes(vector >& o_bounding_boxes, vector& o_scores, vector >& o_corrections, const cv::Mat_& heatmap, const vector >& corrections, double scale, double threshold, int face_support) +{ + + // Correction for the pooling + int stride = 2; + + o_bounding_boxes.clear(); + o_scores.clear(); + o_corrections.clear(); + + int counter = 0; + for (int x = 0; x < heatmap.cols; ++x) + { + for(int y = 0; y < heatmap.rows; ++y) + { + if (heatmap.at(y, x) >= threshold) + { + float min_x = int((stride * x + 1) / scale); + float max_x = int((stride * x + face_support) / scale); + float min_y = int((stride * y + 1) / scale); + float max_y = int((stride * y + face_support) / scale); + + o_bounding_boxes.push_back(cv::Rect_(min_x, min_y, max_x - min_x, max_y - min_y)); + o_scores.push_back(heatmap.at(y, x)); + + float corr_x = corrections[0].at(y, x); + float corr_y = corrections[1].at(y, x); + float corr_width = corrections[2].at(y, x); + float corr_height = corrections[3].at(y, x); + o_corrections.push_back(cv::Rect_(corr_x, corr_y, corr_width, corr_height)); + + counter++; + } + } + } + +} + +// Converting the bounding boxes to squares +void rectify(vector >& total_bboxes) +{ + + // Apply size and location offsets + for (size_t i = 0; i < total_bboxes.size(); ++i) + { + float height = total_bboxes[i].height; + float width = total_bboxes[i].width; + + float max_side = max(width, height); + + // Correct the starts based on new size + float new_min_x = total_bboxes[i].x + 0.5 * (width - max_side); + float new_min_y = total_bboxes[i].y + 0.5 * (height - max_side); + + total_bboxes[i].x = 
(int)new_min_x; + total_bboxes[i].y = (int)new_min_y; + total_bboxes[i].width = (int)max_side; + total_bboxes[i].height = (int)max_side; + } +} + +void apply_correction(vector >& total_bboxes, const vector > corrections, bool add1) +{ + + // Apply size and location offsets + for (size_t i = 0; i < total_bboxes.size(); ++i) + { + cv::Rect curr_box = total_bboxes[i]; + if (add1) + { + curr_box.width++; + curr_box.height++; + } + + float new_min_x = curr_box.x + corrections[i].x * curr_box.width; + float new_min_y = curr_box.y + corrections[i].y * curr_box.height; + float new_max_x = curr_box.x + curr_box.width + curr_box.width * corrections[i].width; + float new_max_y = curr_box.y + curr_box.height + curr_box.height * corrections[i].height; + total_bboxes[i] = cv::Rect_(new_min_x, new_min_y, new_max_x - new_min_x, new_max_y - new_min_y); + + } + + +} + + +// The actual MTCNN face detection step +bool FaceDetectorMTCNN::DetectFaces(vector >& o_regions, const cv::Mat& img_in, std::vector& o_confidences, int min_face_size, double t1, double t2, double t3) +{ + + int height_orig = img_in.size().height; + int width_orig = img_in.size().width; + + // Size ratio of image pyramids + double pyramid_factor = 0.709; + + // Face support region is 12x12 px, so from that can work out the largest + // scale(which is 12 / min), and work down from there to smallest scale(no smaller than 12x12px) + int min_dim = std::min(height_orig, width_orig); + + int face_support = 12; + int num_scales = floor(log((double)min_face_size / (double)min_dim) / log(pyramid_factor)) + 1; + + cv::Mat input_img; + + if (img_in.channels() == 1) + { + cv::cvtColor(img_in, input_img, CV_GRAY2RGB); + } + else + { + input_img = img_in; + } + + cv::Mat img_float; + input_img.convertTo(img_float, CV_32FC3); + + vector > proposal_boxes_all; + vector scores_all; + vector > proposal_corrections_all; + + // As the scales will be done in parallel have some containers for them + vector > > 
proposal_boxes_cross_scale(num_scales); + vector > scores_cross_scale(num_scales); + vector > > proposal_corrections_cross_scale(num_scales); + + //tbb::parallel_for(0, (int)num_scales, [&](int i) { + for (int i = 0; i < num_scales; ++i) + { + double scale = ((double)face_support / (double)min_face_size)*cv::pow(pyramid_factor, i); + + int h_pyr = ceil(height_orig * scale); + int w_pyr = ceil(width_orig * scale); + + cv::Mat normalised_img; + cv::resize(img_float, normalised_img, cv::Size(w_pyr, h_pyr)); + + // Normalize the image + normalised_img = (normalised_img - 127.5) * 0.0078125; + + // Actual PNet CNN step + std::vector > pnet_out = PNet.Inference(normalised_img, true); + + // Clear the precomputations, as the image sizes will be different + PNet.ClearPrecomp(); + + // Extract the probabilities from PNet response + cv::Mat_ prob_heatmap; + cv::exp(pnet_out[0]- pnet_out[1], prob_heatmap); + prob_heatmap = 1.0 / (1.0 + prob_heatmap); + + // Extract the probabilities from PNet response + std::vector> corrections_heatmap(pnet_out.begin() + 2, pnet_out.end()); + + // Grab the detections + vector > proposal_boxes; + vector scores; + vector > proposal_corrections; + generate_bounding_boxes(proposal_boxes, scores, proposal_corrections, prob_heatmap, corrections_heatmap, scale, t1, face_support); + + proposal_boxes_cross_scale[i] = proposal_boxes; + scores_cross_scale[i] = scores; + proposal_corrections_cross_scale[i] = proposal_corrections; + } + //}); + + // Perform non-maximum supression on proposals accross scales and combine them + for (int i = 0; i < num_scales; ++i) + { + vector to_keep = non_maximum_supression(proposal_boxes_cross_scale[i], scores_cross_scale[i], 0.5, false); + select_subset(to_keep, proposal_boxes_cross_scale[i], scores_cross_scale[i], proposal_corrections_cross_scale[i]); + + proposal_boxes_all.insert(proposal_boxes_all.end(), proposal_boxes_cross_scale[i].begin(), proposal_boxes_cross_scale[i].end()); + scores_all.insert(scores_all.end(), 
scores_cross_scale[i].begin(), scores_cross_scale[i].end()); + proposal_corrections_all.insert(proposal_corrections_all.end(), proposal_corrections_cross_scale[i].begin(), proposal_corrections_cross_scale[i].end()); + } + + // Preparation for RNet step + + // Non maximum supression accross bounding boxes, and their offset correction + vector to_keep = non_maximum_supression(proposal_boxes_all, scores_all, 0.7, false); + select_subset(to_keep, proposal_boxes_all, scores_all, proposal_corrections_all); + + apply_correction(proposal_boxes_all, proposal_corrections_all, false); + + // Convert to rectangles and round + rectify(proposal_boxes_all); + + // Creating proposal images from previous step detections + vector above_thresh(proposal_boxes_all.size()); + //tbb::parallel_for(0, (int)proposal_boxes_all.size(), [&](int k) { + for (size_t k = 0; k < proposal_boxes_all.size(); ++k) + { + float width_target = proposal_boxes_all[k].width + 1; + float height_target = proposal_boxes_all[k].height + 1; + + // Work out the start and end indices in the original image + int start_x_in = cv::max((int)(proposal_boxes_all[k].x - 1), 0); + int start_y_in = cv::max((int)(proposal_boxes_all[k].y - 1), 0); + int end_x_in = cv::min((int)(proposal_boxes_all[k].x + width_target - 1), width_orig); + int end_y_in = cv::min((int)(proposal_boxes_all[k].y + height_target - 1), height_orig); + + // Work out the start and end indices in the target image + int start_x_out = cv::max((int)(-proposal_boxes_all[k].x + 1), 0); + int start_y_out = cv::max((int)(-proposal_boxes_all[k].y + 1), 0); + int end_x_out = cv::min(width_target - (proposal_boxes_all[k].x + proposal_boxes_all[k].width - width_orig), width_target); + int end_y_out = cv::min(height_target - (proposal_boxes_all[k].y + proposal_boxes_all[k].height - height_orig), height_target); + + cv::Mat tmp(height_target, width_target, CV_32FC3, cv::Scalar(0.0f,0.0f,0.0f)); + + img_float(cv::Rect(start_x_in, start_y_in, end_x_in - start_x_in, 
end_y_in - start_y_in)).copyTo( + tmp(cv::Rect(start_x_out, start_y_out, end_x_out - start_x_out, end_y_out - start_y_out))); + + cv::Mat prop_img; + cv::resize(tmp, prop_img, cv::Size(24, 24)); + + prop_img = (prop_img - 127.5) * 0.0078125; + + // Perform RNet on the proposal image + std::vector > rnet_out = RNet.Inference(prop_img, true); + + float prob = 1.0 / (1.0 + cv::exp(rnet_out[0].at(0) - rnet_out[0].at(1))); + scores_all[k] = prob; + proposal_corrections_all[k].x = rnet_out[0].at(2); + proposal_corrections_all[k].y = rnet_out[0].at(3); + proposal_corrections_all[k].width = rnet_out[0].at(4); + proposal_corrections_all[k].height = rnet_out[0].at(5); + if(prob >= t2) + { + above_thresh[k] = true; + } + else + { + above_thresh[k] = false; + } + + } + //}); + + to_keep.clear(); + for (size_t i = 0; i < above_thresh.size(); ++i) + { + if (above_thresh[i]) + { + to_keep.push_back(i); + } + } + + // Pick only the bounding boxes above the threshold + select_subset(to_keep, proposal_boxes_all, scores_all, proposal_corrections_all); + + // Non maximum supression accross bounding boxes, and their offset correction + to_keep = non_maximum_supression(proposal_boxes_all, scores_all, 0.7, false); + select_subset(to_keep, proposal_boxes_all, scores_all, proposal_corrections_all); + + apply_correction(proposal_boxes_all, proposal_corrections_all, false); + + // Convert to rectangles and round + rectify(proposal_boxes_all); + + // Preparing for the ONet stage + above_thresh.clear(); + above_thresh.resize(proposal_boxes_all.size()); + //tbb::parallel_for(0, (int)proposal_boxes_all.size(), [&](int k) { + for (size_t k = 0; k < proposal_boxes_all.size(); ++k) + { + float width_target = proposal_boxes_all[k].width + 1; + float height_target = proposal_boxes_all[k].height + 1; + + // Work out the start and end indices in the original image + int start_x_in = cv::max((int)(proposal_boxes_all[k].x - 1), 0); + int start_y_in = cv::max((int)(proposal_boxes_all[k].y - 1), 0); + int 
end_x_in = cv::min((int)(proposal_boxes_all[k].x + width_target - 1), width_orig); + int end_y_in = cv::min((int)(proposal_boxes_all[k].y + height_target - 1), height_orig); + + // Work out the start and end indices in the target image + int start_x_out = cv::max((int)(-proposal_boxes_all[k].x + 1), 0); + int start_y_out = cv::max((int)(-proposal_boxes_all[k].y + 1), 0); + int end_x_out = cv::min(width_target - (proposal_boxes_all[k].x + proposal_boxes_all[k].width - width_orig), width_target); + int end_y_out = cv::min(height_target - (proposal_boxes_all[k].y + proposal_boxes_all[k].height - height_orig), height_target); + + cv::Mat tmp(height_target, width_target, CV_32FC3, cv::Scalar(0.0f, 0.0f, 0.0f)); + + img_float(cv::Rect(start_x_in, start_y_in, end_x_in - start_x_in, end_y_in - start_y_in)).copyTo( + tmp(cv::Rect(start_x_out, start_y_out, end_x_out - start_x_out, end_y_out - start_y_out))); + + cv::Mat prop_img; + cv::resize(tmp, prop_img, cv::Size(48, 48)); + + prop_img = (prop_img - 127.5) * 0.0078125; + + // Perform RNet on the proposal image + std::vector > onet_out = ONet.Inference(prop_img, true); + + float prob = 1.0 / (1.0 + cv::exp(onet_out[0].at(0) - onet_out[0].at(1))); + scores_all[k] = prob; + proposal_corrections_all[k].x = onet_out[0].at(2); + proposal_corrections_all[k].y = onet_out[0].at(3); + proposal_corrections_all[k].width = onet_out[0].at(4); + proposal_corrections_all[k].height = onet_out[0].at(5); + if (prob >= t3) + { + above_thresh[k] = true; + } + else + { + above_thresh[k] = false; + } + } + //}); + + to_keep.clear(); + for (size_t i = 0; i < above_thresh.size(); ++i) + { + if (above_thresh[i]) + { + to_keep.push_back(i); + } + } + + // Pick only the bounding boxes above the threshold + select_subset(to_keep, proposal_boxes_all, scores_all, proposal_corrections_all); + apply_correction(proposal_boxes_all, proposal_corrections_all, true); + + // Non maximum supression accross bounding boxes, and their offset correction + to_keep = 
non_maximum_supression(proposal_boxes_all, scores_all, 0.7, true); + select_subset(to_keep, proposal_boxes_all, scores_all, proposal_corrections_all); + + // TODO rem + cv::Mat disp_img = input_img.clone(); + + // Correct the box to expectation to be tight around facial landmarks + for (size_t k = 0; k < proposal_boxes_all.size(); ++k) + { + proposal_boxes_all[k].x = proposal_boxes_all[k].width * -0.0075 + proposal_boxes_all[k].x; + proposal_boxes_all[k].y = proposal_boxes_all[k].height * 0.2459 + proposal_boxes_all[k].y; + proposal_boxes_all[k].width = 1.0323 * proposal_boxes_all[k].width; + proposal_boxes_all[k].height = 0.7751 * proposal_boxes_all[k].height; + + o_regions.push_back(cv::Rect_(proposal_boxes_all[k].x, proposal_boxes_all[k].y, proposal_boxes_all[k].width, proposal_boxes_all[k].height)); + o_confidences.push_back(scores_all[k]); + + cv::rectangle(disp_img, proposal_boxes_all[k], cv::Scalar(255, 0, 0), 3); + } + cv::imshow("detections", disp_img); + cv::waitKey(20); + + if(o_regions.size() > 0) + { + return true; + } + else + { + return false; + } +} + diff --git a/lib/local/LandmarkDetector/src/LandmarkDetectorFunc.cpp b/lib/local/LandmarkDetector/src/LandmarkDetectorFunc.cpp index 6b9b7158..ba6c9728 100644 --- a/lib/local/LandmarkDetector/src/LandmarkDetectorFunc.cpp +++ b/lib/local/LandmarkDetector/src/LandmarkDetectorFunc.cpp @@ -208,12 +208,23 @@ void CorrectGlobalParametersVideo(const cv::Mat_ &grayscale_image, CLNF& } -bool LandmarkDetector::DetectLandmarksInVideo(const cv::Mat_ &grayscale_image, CLNF& clnf_model, FaceModelParameters& params) +bool LandmarkDetector::DetectLandmarksInVideo(const cv::Mat &image, CLNF& clnf_model, FaceModelParameters& params) { // First need to decide if the landmarks should be "detected" or "tracked" // Detected means running face detection and a larger search area, tracked means initialising from previous step // and using a smaller search area + cv::Mat grayscale_image; + if (image.channels() == 3) + { + 
cv::cvtColor(image, grayscale_image, CV_BGR2GRAY); + } + else + { + grayscale_image = image.clone(); + } + + // Indicating that this is a first detection in video sequence or after restart bool initial_detection = !clnf_model.tracking_initialised; @@ -263,8 +274,13 @@ bool LandmarkDetector::DetectLandmarksInVideo(const cv::Mat_ &grayscale_i // If the face detector has not been initialised read it in if(clnf_model.face_detector_HAAR.empty()) { - clnf_model.face_detector_HAAR.load(params.face_detector_location); - clnf_model.face_detector_location = params.face_detector_location; + clnf_model.face_detector_HAAR.load(params.haar_face_detector_location); + clnf_model.haar_face_detector_location = params.haar_face_detector_location; + } + if (clnf_model.face_detector_MTCNN.empty()) + { + clnf_model.face_detector_MTCNN.Read(params.mtcnn_face_detector_location); + clnf_model.mtcnn_face_detector_location = params.haar_face_detector_location; } cv::Point preference_det(-1, -1); @@ -285,6 +301,11 @@ bool LandmarkDetector::DetectLandmarksInVideo(const cv::Mat_ &grayscale_i { face_detection_success = LandmarkDetector::DetectSingleFace(bounding_box, grayscale_image, clnf_model.face_detector_HAAR, preference_det); } + else if (params.curr_face_detector == FaceModelParameters::MTCNN_DETECTOR) + { + double confidence; + face_detection_success = LandmarkDetector::DetectSingleFaceMTCNN(bounding_box, image, clnf_model.face_detector_MTCNN, confidence, preference_det); + } // Attempt to detect landmarks using the detected face (if unseccessful the detection will be ignored) if(face_detection_success) @@ -350,7 +371,7 @@ bool LandmarkDetector::DetectLandmarksInVideo(const cv::Mat_ &grayscale_i } -bool LandmarkDetector::DetectLandmarksInVideo(const cv::Mat_ &grayscale_image, const cv::Rect_ bounding_box, CLNF& clnf_model, FaceModelParameters& params) +bool LandmarkDetector::DetectLandmarksInVideo(const cv::Mat &image, const cv::Rect_ bounding_box, CLNF& clnf_model, FaceModelParameters& 
params) { if(bounding_box.width > 0) { @@ -362,7 +383,7 @@ bool LandmarkDetector::DetectLandmarksInVideo(const cv::Mat_ &grayscale_i clnf_model.tracking_initialised = true; } - return DetectLandmarksInVideo(grayscale_image, clnf_model, params); + return DetectLandmarksInVideo(image, clnf_model, params); } @@ -621,9 +642,19 @@ bool DetectLandmarksInImageMultiHypEarlyTerm(const cv::Mat_ &grayscale_im // This is the one where the actual work gets done, other DetectLandmarksInImage calls lead to this one -bool LandmarkDetector::DetectLandmarksInImage(const cv::Mat_ &grayscale_image, const cv::Rect_ bounding_box, CLNF& clnf_model, FaceModelParameters& params) +bool LandmarkDetector::DetectLandmarksInImage(const cv::Mat &image, const cv::Rect_ bounding_box, CLNF& clnf_model, FaceModelParameters& params) { + cv::Mat grayscale_image; + if (image.channels() == 3) + { + cv::cvtColor(image, grayscale_image, CV_BGR2GRAY); + } + else + { + grayscale_image = image.clone(); + } + // Can have multiple hypotheses vector rotation_hypotheses; @@ -654,27 +685,41 @@ bool LandmarkDetector::DetectLandmarksInImage(const cv::Mat_ &grayscale_i // Either use basic multi-hypothesis testing or clever testing if early termination parameters are present if(clnf_model.patch_experts.early_term_biases.size() == 0) { - success = DetectLandmarksInImageMultiHypBasic(grayscale_image, rotation_hypotheses, bounding_box, clnf_model, params); + success = DetectLandmarksInImageMultiHypBasic(image, rotation_hypotheses, bounding_box, clnf_model, params); } else { - success = DetectLandmarksInImageMultiHypEarlyTerm(grayscale_image, rotation_hypotheses, bounding_box, clnf_model, params); + success = DetectLandmarksInImageMultiHypEarlyTerm(image, rotation_hypotheses, bounding_box, clnf_model, params); } return success; } -bool LandmarkDetector::DetectLandmarksInImage(const cv::Mat_ &grayscale_image, CLNF& clnf_model, FaceModelParameters& params) +bool LandmarkDetector::DetectLandmarksInImage(const cv::Mat 
&image, CLNF& clnf_model, FaceModelParameters& params) { + cv::Mat grayscale_image; + if (image.channels() == 3) + { + cv::cvtColor(image, grayscale_image, CV_BGR2GRAY); + } + else + { + grayscale_image = image.clone(); + } cv::Rect_ bounding_box; // If the face detector has not been initialised read it in - if(clnf_model.face_detector_HAAR.empty()) + if(clnf_model.face_detector_HAAR.empty() && params.curr_face_detector == FaceModelParameters::HAAR_DETECTOR) { - clnf_model.face_detector_HAAR.load(params.face_detector_location); - clnf_model.face_detector_location = params.face_detector_location; + clnf_model.face_detector_HAAR.load(params.haar_face_detector_location); + clnf_model.haar_face_detector_location = params.haar_face_detector_location; } - + + if (clnf_model.face_detector_MTCNN.empty() && params.curr_face_detector == FaceModelParameters::MTCNN_DETECTOR) + { + clnf_model.face_detector_MTCNN.Read(params.mtcnn_face_detector_location); + } + // Detect the face first if(params.curr_face_detector == FaceModelParameters::HOG_SVM_DETECTOR) { @@ -683,7 +728,12 @@ bool LandmarkDetector::DetectLandmarksInImage(const cv::Mat_ &grayscale_i } else if(params.curr_face_detector == FaceModelParameters::HAAR_DETECTOR) { - LandmarkDetector::DetectSingleFace(bounding_box, grayscale_image, clnf_model.face_detector_HAAR); + LandmarkDetector::DetectSingleFace(bounding_box, image, clnf_model.face_detector_HAAR); + } + else if (params.curr_face_detector == FaceModelParameters::MTCNN_DETECTOR) + { + double confidence; + LandmarkDetector::DetectSingleFaceMTCNN(bounding_box, image, clnf_model.face_detector_MTCNN, confidence); } if(bounding_box.width == 0) @@ -692,6 +742,6 @@ bool LandmarkDetector::DetectLandmarksInImage(const cv::Mat_ &grayscale_i } else { - return DetectLandmarksInImage(grayscale_image, bounding_box, clnf_model, params); + return DetectLandmarksInImage(image, bounding_box, clnf_model, params); } } diff --git 
a/lib/local/LandmarkDetector/src/LandmarkDetectorModel.cpp b/lib/local/LandmarkDetector/src/LandmarkDetectorModel.cpp index 92b4a9ef..c72d3e6e 100644 --- a/lib/local/LandmarkDetector/src/LandmarkDetectorModel.cpp +++ b/lib/local/LandmarkDetector/src/LandmarkDetectorModel.cpp @@ -67,9 +67,9 @@ CLNF::CLNF(string fname) // Copy constructor (makes a deep copy of CLNF) CLNF::CLNF(const CLNF& other): pdm(other.pdm), params_local(other.params_local.clone()), params_global(other.params_global), detected_landmarks(other.detected_landmarks.clone()), - landmark_likelihoods(other.landmark_likelihoods.clone()), patch_experts(other.patch_experts), landmark_validator(other.landmark_validator), face_detector_location(other.face_detector_location), - hierarchical_mapping(other.hierarchical_mapping), hierarchical_models(other.hierarchical_models), hierarchical_model_names(other.hierarchical_model_names), - hierarchical_params(other.hierarchical_params), eye_model(other.eye_model) + landmark_likelihoods(other.landmark_likelihoods.clone()), patch_experts(other.patch_experts), landmark_validator(other.landmark_validator), haar_face_detector_location(other.haar_face_detector_location), + mtcnn_face_detector_location(other.mtcnn_face_detector_location), hierarchical_mapping(other.hierarchical_mapping), hierarchical_models(other.hierarchical_models), hierarchical_model_names(other.hierarchical_model_names), + hierarchical_params(other.hierarchical_params), eye_model(other.eye_model), face_detector_MTCNN(other.face_detector_MTCNN) { this->detection_success = other.detection_success; this->tracking_initialised = other.tracking_initialised; @@ -78,9 +78,9 @@ CLNF::CLNF(const CLNF& other): pdm(other.pdm), params_local(other.params_local.c this->failures_in_a_row = other.failures_in_a_row; // Load the CascadeClassifier (as it does not have a proper copy constructor) - if(!face_detector_location.empty()) + if(!haar_face_detector_location.empty()) { - 
this->face_detector_HAAR.load(face_detector_location); + this->face_detector_HAAR.load(haar_face_detector_location); } // Make sure the matrices are allocated properly this->triangulations.resize(other.triangulations.size()); @@ -114,7 +114,8 @@ CLNF & CLNF::operator= (const CLNF& other) landmark_likelihoods =other.landmark_likelihoods.clone(); patch_experts = Patch_experts(other.patch_experts); landmark_validator = DetectionValidator(other.landmark_validator); - face_detector_location = other.face_detector_location; + haar_face_detector_location = other.haar_face_detector_location; + mtcnn_face_detector_location = other.mtcnn_face_detector_location; this->detection_success = other.detection_success; this->tracking_initialised = other.tracking_initialised; @@ -125,9 +126,9 @@ CLNF & CLNF::operator= (const CLNF& other) this->eye_model = other.eye_model; // Load the CascadeClassifier (as it does not have a proper copy constructor) - if(!face_detector_location.empty()) + if(!haar_face_detector_location.empty()) { - this->face_detector_HAAR.load(face_detector_location); + this->face_detector_HAAR.load(haar_face_detector_location); } // Make sure the matrices are allocated properly this->triangulations.resize(other.triangulations.size()); @@ -149,9 +150,12 @@ CLNF & CLNF::operator= (const CLNF& other) this->hierarchical_models = other.hierarchical_models; this->hierarchical_model_names = other.hierarchical_model_names; this->hierarchical_params = other.hierarchical_params; + + mtcnn_face_detector_location = other.mtcnn_face_detector_location; + face_detector_MTCNN = other.face_detector_MTCNN; } - face_detector_HOG = dlib::get_frontal_face_detector(); + face_detector_HOG = dlib::get_frontal_face_detector(); return *this; } @@ -172,7 +176,8 @@ CLNF::CLNF(const CLNF&& other) landmark_likelihoods = other.landmark_likelihoods; patch_experts = other.patch_experts; landmark_validator = other.landmark_validator; - face_detector_location = other.face_detector_location; + 
haar_face_detector_location = other.haar_face_detector_location; + mtcnn_face_detector_location = other.mtcnn_face_detector_location; face_detector_HAAR = other.face_detector_HAAR; @@ -181,6 +186,8 @@ CLNF::CLNF(const CLNF&& other) face_detector_HOG = dlib::get_frontal_face_detector(); + face_detector_MTCNN = other.face_detector_MTCNN; + // Copy over the hierarchical models this->hierarchical_mapping = other.hierarchical_mapping; this->hierarchical_models = other.hierarchical_models; @@ -207,7 +214,8 @@ CLNF & CLNF::operator= (const CLNF&& other) landmark_likelihoods = other.landmark_likelihoods; patch_experts = other.patch_experts; landmark_validator = other.landmark_validator; - face_detector_location = other.face_detector_location; + haar_face_detector_location = other.haar_face_detector_location; + mtcnn_face_detector_location = other.mtcnn_face_detector_location; face_detector_HAAR = other.face_detector_HAAR; @@ -216,6 +224,8 @@ CLNF & CLNF::operator= (const CLNF&& other) face_detector_HOG = dlib::get_frontal_face_detector(); + face_detector_MTCNN = other.face_detector_MTCNN; + // Copy over the hierarchical models this->hierarchical_mapping = other.hierarchical_mapping; this->hierarchical_models = other.hierarchical_models; diff --git a/lib/local/LandmarkDetector/src/LandmarkDetectorParameters.cpp b/lib/local/LandmarkDetector/src/LandmarkDetectorParameters.cpp index b3d805e3..2a3f7fd0 100644 --- a/lib/local/LandmarkDetector/src/LandmarkDetectorParameters.cpp +++ b/lib/local/LandmarkDetector/src/LandmarkDetectorParameters.cpp @@ -86,7 +86,7 @@ FaceModelParameters::FaceModelParameters(vector &arguments) if (arguments[i].compare("-fdloc") ==0) { string face_detector_loc = arguments[i + 1]; - face_detector_location = face_detector_loc; + haar_face_detector_location = face_detector_loc; curr_face_detector = HAAR_DETECTOR; valid[i] = false; valid[i + 1] = false; @@ -176,8 +176,10 @@ FaceModelParameters::FaceModelParameters(vector &arguments) valid[i] = false; // For 
in-the-wild images use an in-the wild detector - curr_face_detector = HOG_SVM_DETECTOR; + curr_face_detector = MTCNN_DETECTOR; + // Use multi-view hypotheses if in-the-wild setting + multi_view = true; } } @@ -216,6 +218,46 @@ FaceModelParameters::FaceModelParameters(vector &arguments) sigma = 1.5 * sigma; reg_factor = 0.9 * reg_factor; } + + // Make sure face detector location is valid + // First check working directory, then the executable's directory, then the config path set by the build process. + model_path = boost::filesystem::path(haar_face_detector_location); + if (boost::filesystem::exists(model_path)) + { + haar_face_detector_location = model_path.string(); + } + else if (boost::filesystem::exists(root / model_path)) + { + haar_face_detector_location = (root / model_path).string(); + } + else if (boost::filesystem::exists(config_path / model_path)) + { + haar_face_detector_location = (config_path / model_path).string(); + } + else + { + std::cout << "Could not find the HAAR face detector location" << std::endl; + } + + // Make sure face detector location is valid + // First check working directory, then the executable's directory, then the config path set by the build process. 
+ model_path = boost::filesystem::path(mtcnn_face_detector_location); + if (boost::filesystem::exists(model_path)) + { + mtcnn_face_detector_location = model_path.string(); + } + else if (boost::filesystem::exists(root / model_path)) + { + mtcnn_face_detector_location = (root / model_path).string(); + } + else if (boost::filesystem::exists(config_path / model_path)) + { + mtcnn_face_detector_location = (config_path / model_path).string(); + } + else + { + std::cout << "Could not find the MTCNN face detector location" << std::endl; + } } void FaceModelParameters::init() @@ -269,11 +311,12 @@ void FaceModelParameters::init() reinit_video_every = 4; // Face detection - face_detector_location = "classifiers/haarcascade_frontalface_alt.xml"; + haar_face_detector_location = "classifiers/haarcascade_frontalface_alt.xml"; + mtcnn_face_detector_location = "model/mtcnn_detector/MTCNN_detector.txt"; quiet_mode = false; - // By default use HOG SVM - curr_face_detector = HOG_SVM_DETECTOR; + // By default use MTCNN + curr_face_detector = MTCNN_DETECTOR; // The gaze tracking has to be explicitly initialised track_gaze = false; diff --git a/lib/local/LandmarkDetector/src/LandmarkDetectorUtils.cpp b/lib/local/LandmarkDetector/src/LandmarkDetectorUtils.cpp index ffba8441..34b21a6d 100644 --- a/lib/local/LandmarkDetector/src/LandmarkDetectorUtils.cpp +++ b/lib/local/LandmarkDetector/src/LandmarkDetectorUtils.cpp @@ -1560,94 +1560,168 @@ namespace LandmarkDetector return detect_success; } - //============================================================================ - // Matrix reading functionality - //============================================================================ +bool DetectFacesMTCNN(vector >& o_regions, const cv::Mat& image, LandmarkDetector::FaceDetectorMTCNN& detector, std::vector& o_confidences) +{ + detector.DetectFaces(o_regions, image, o_confidences); - // Reading in a matrix from a stream - void ReadMat(std::ifstream& stream, cv::Mat &output_mat) + return 
o_regions.size() > 0; +} + +bool DetectSingleFaceMTCNN(cv::Rect_& o_region, const cv::Mat& image, LandmarkDetector::FaceDetectorMTCNN& detector, double& confidence, cv::Point preference) +{ + // The tracker can return multiple faces + vector > face_detections; + vector confidences; + + detector.DetectFaces(face_detections, image, confidences); + + bool detect_success = face_detections.size() > 0; + if (detect_success) { - // Read in the number of rows, columns and the data type - int row, col, type; - stream >> row >> col >> type; + bool use_preferred = (preference.x != -1) && (preference.y != -1); - output_mat = cv::Mat(row, col, type); - - switch (output_mat.type()) + // keep the most confident one or the one closest to preference point if set + double best_so_far; + if (use_preferred) { - case CV_64FC1: + best_so_far = sqrt((preference.x - (face_detections[0].width / 2 + face_detections[0].x)) * (preference.x - (face_detections[0].width / 2 + face_detections[0].x)) + + (preference.y - (face_detections[0].height / 2 + face_detections[0].y)) * (preference.y - (face_detections[0].height / 2 + face_detections[0].y))); + } + else { - cv::MatIterator_ begin_it = output_mat.begin(); - cv::MatIterator_ end_it = output_mat.end(); + best_so_far = confidences[0]; + } + int bestIndex = 0; - while (begin_it != end_it) + for (size_t i = 1; i < face_detections.size(); ++i) + { + + double dist; + bool better; + + if (use_preferred) { - stream >> *begin_it++; + dist = sqrt((preference.x - (face_detections[0].width / 2 + face_detections[0].x)) * (preference.x - (face_detections[0].width / 2 + face_detections[0].x)) + + (preference.y - (face_detections[0].height / 2 + face_detections[0].y)) * (preference.y - (face_detections[0].height / 2 + face_detections[0].y))); + better = dist < best_so_far; + } + else + { + dist = confidences[i]; + better = dist > best_so_far; + } + + // Pick a closest face + if (better) + { + best_so_far = dist; + bestIndex = i; } } - break; - case CV_32FC1: 
- { - cv::MatIterator_ begin_it = output_mat.begin(); - cv::MatIterator_ end_it = output_mat.end(); - while (begin_it != end_it) - { - stream >> *begin_it++; - } - } - break; - case CV_32SC1: - { - cv::MatIterator_ begin_it = output_mat.begin(); - cv::MatIterator_ end_it = output_mat.end(); - while (begin_it != end_it) - { - stream >> *begin_it++; - } - } - break; - case CV_8UC1: - { - cv::MatIterator_ begin_it = output_mat.begin(); - cv::MatIterator_ end_it = output_mat.end(); - while (begin_it != end_it) - { - stream >> *begin_it++; - } - } - break; - default: - printf("ERROR(%s,%d) : Unsupported Matrix type %d!\n", __FILE__, __LINE__, output_mat.type()); abort(); + o_region = face_detections[bestIndex]; + confidence = confidences[bestIndex]; + } + else + { + // if not detected + o_region = cv::Rect_(0, 0, 0, 0); + // A completely unreliable detection (shouldn't really matter what is returned here) + confidence = -2; + } + return detect_success; +} +//============================================================================ +// Matrix reading functionality +//============================================================================ + +// Reading in a matrix from a stream +void ReadMat(std::ifstream& stream, cv::Mat &output_mat) +{ + // Read in the number of rows, columns and the data type + int row, col, type; + + stream >> row >> col >> type; + + output_mat = cv::Mat(row, col, type); + + switch (output_mat.type()) + { + case CV_64FC1: + { + cv::MatIterator_ begin_it = output_mat.begin(); + cv::MatIterator_ end_it = output_mat.end(); + + while (begin_it != end_it) + { + stream >> *begin_it++; } } - - void ReadMatBin(std::ifstream& stream, cv::Mat &output_mat) + break; + case CV_32FC1: { - // Read in the number of rows, columns and the data type - int row, col, type; + cv::MatIterator_ begin_it = output_mat.begin(); + cv::MatIterator_ end_it = output_mat.end(); - stream.read((char*)&row, 4); - stream.read((char*)&col, 4); - stream.read((char*)&type, 4); - - 
output_mat = cv::Mat(row, col, type); - int size = output_mat.rows * output_mat.cols * output_mat.elemSize(); - stream.read((char *)output_mat.data, size); - - } - - // Skipping lines that start with # (together with empty lines) - void SkipComments(std::ifstream& stream) - { - while (stream.peek() == '#' || stream.peek() == '\n' || stream.peek() == ' ' || stream.peek() == '\r') + while (begin_it != end_it) { - std::string skipped; - std::getline(stream, skipped); + stream >> *begin_it++; } } + break; + case CV_32SC1: + { + cv::MatIterator_ begin_it = output_mat.begin(); + cv::MatIterator_ end_it = output_mat.end(); + while (begin_it != end_it) + { + stream >> *begin_it++; + } + } + break; + case CV_8UC1: + { + cv::MatIterator_ begin_it = output_mat.begin(); + cv::MatIterator_ end_it = output_mat.end(); + while (begin_it != end_it) + { + stream >> *begin_it++; + } + } + break; + default: + printf("ERROR(%s,%d) : Unsupported Matrix type %d!\n", __FILE__, __LINE__, output_mat.type()); abort(); + + + } +} + +void ReadMatBin(std::ifstream& stream, cv::Mat &output_mat) +{ + // Read in the number of rows, columns and the data type + int row, col, type; + + stream.read((char*)&row, 4); + stream.read((char*)&col, 4); + stream.read((char*)&type, 4); + + output_mat = cv::Mat(row, col, type); + int size = output_mat.rows * output_mat.cols * output_mat.elemSize(); + stream.read((char *)output_mat.data, size); + +} + +// Skipping lines that start with # (together with empty lines) +void SkipComments(std::ifstream& stream) +{ + while (stream.peek() == '#' || stream.peek() == '\n' || stream.peek() == ' ' || stream.peek() == '\r') + { + std::string skipped; + std::getline(stream, skipped); + } +} } diff --git a/matlab_version/demo/face_image_demo.m b/matlab_version/demo/face_image_demo.m index 654b0acb..80ad8973 100644 --- a/matlab_version/demo/face_image_demo.m +++ b/matlab_version/demo/face_image_demo.m @@ -16,6 +16,10 @@ addpath('../CCNF/'); clmParams.multi_modal_types = 
patches(1).multi_modal_types; +% Dependencies for face detection (MatConvNet), remove if not present +setup_mconvnet; +addpath('../face_detection/mtcnn/'); + %% root_dir = '../../samples/'; images = dir([root_dir, '*.jpg']); @@ -25,8 +29,11 @@ verbose = true; for img=1:numel(images) image_orig = imread([root_dir images(img).name]); + % MTCNN face detector + [bboxs, det_shapes, confidences] = detect_face_mtcnn(image_orig); + % First attempt to use the Matlab one (fastest but not as accurate, if not present use yu et al.) - [bboxs, det_shapes] = detect_faces(image_orig, {'cascade', 'yu'}); + % [bboxs, det_shapes] = detect_faces(image_orig, {'cascade', 'yu'}); % Zhu and Ramanan and Yu et al. are slower, but also more accurate % and can be used when vision toolbox is unavailable % [bboxs, det_shapes] = detect_faces(image_orig, {'yu', 'zhu'}); @@ -52,28 +59,14 @@ for img=1:numel(images) hold on; end - for i=1:size(bboxs,2) + for i=1:size(bboxs,1) % Convert from the initial detected shape to CLM model parameters, % if shape is available - bbox = bboxs(:,i); - - if(~isempty(det_shapes)) - shape = det_shapes(:,:,i); - inds = [1:60,62:64,66:68]; - M = pdm.M([inds, inds+68, inds+68*2]); - E = pdm.E; - V = pdm.V([inds, inds+68, inds+68*2],:); - [ a, R, T, ~, params, err, shapeOrtho] = fit_PDM_ortho_proj_to_2D(M, E, V, shape); - g_param = [a; Rot2Euler(R)'; T]; - l_param = params; + bbox = bboxs(i,:); - % Use the initial global and local params for clm fitting in the image - [shape,~,~,lhood,lmark_lhood,view_used] = Fitting_from_bb(image, [], bbox, pdm, patches, clmParams, 'gparam', g_param, 'lparam', l_param); - else - [shape,~,~,lhood,lmark_lhood,view_used] = Fitting_from_bb(image, [], bbox, pdm, patches, clmParams); - end + [shape,~,~,lhood,lmark_lhood,view_used] = Fitting_from_bb(image, [], bbox, pdm, patches, clmParams); % shape correction for matlab format shape = shape + 1; diff --git a/matlab_version/demo/face_video_demo.m b/matlab_version/demo/face_video_demo.m index 
2241c686..ba84b2d2 100644 --- a/matlab_version/demo/face_video_demo.m +++ b/matlab_version/demo/face_video_demo.m @@ -33,6 +33,12 @@ od = cd('../face_validation/'); setup; cd(od); +% Setup the face detector (remove the setup mconvnet if not using +% MatConvNet) +setup_mconvnet; +addpath('../face_detection/mtcnn/'); + + %% for v=1:numel(vids) % load the video @@ -66,8 +72,9 @@ for v=1:numel(vids) image_orig = read(vr, i); if((~det && mod(i,4) == 0) || ~initialised) + [bboxs, det_shapes, confidences] = detect_face_mtcnn(image_orig); % First attempt to use the Matlab one (fastest but not as accurate, if not present use yu et al.) - [bboxs, det_shapes] = detect_faces(image_orig, {'cascade', 'yu'}); + % [bboxs, det_shapes] = detect_faces(image_orig, {'cascade', 'yu'}); % Zhu and Ramanan and Yu et al. are slower, but also more accurate % and can be used when vision toolbox is unavailable % [bboxs, det_shapes] = detect_faces(image_orig, {'yu', 'zhu'}); @@ -75,8 +82,8 @@ for v=1:numel(vids) if(~isempty(bboxs)) % Pick the biggest face for tracking - [~,ind] = max(bboxs(3,:) - bboxs(1,:)); - bbox = bboxs(:,ind); + [~,ind] = max(bboxs(:,3) - bboxs(:,1)); + bbox = bboxs(ind,:); % Discard overly small detections if(bbox(3) - bbox(1) > 40) @@ -84,39 +91,27 @@ for v=1:numel(vids) % Either infer the local and global shape parameters % from the detected landmarks or just using the % bounding box - if(~isempty(det_shapes)) - shape = det_shapes(:,:,ind); - - inds = [1:60,62:64,66:68]; - M = pdm.M([inds, inds+68, inds+68*2]); - E = pdm.E; - V = pdm.V([inds, inds+68, inds+68*2],:); - [ a, R, T, ~, params, err] = fit_PDM_ortho_proj_to_2D(M, E, V, shape); - g_param_n = [a; Rot2Euler(R)'; T]; - l_param_n = params; - else - num_points = numel(pdm.M) / 3; + num_points = numel(pdm.M) / 3; - M = reshape(pdm.M, num_points, 3); - width_model = max(M(:,1)) - min(M(:,1)); - height_model = max(M(:,2)) - min(M(:,2)); + M = reshape(pdm.M, num_points, 3); + width_model = max(M(:,1)) - min(M(:,1)); + 
height_model = max(M(:,2)) - min(M(:,2)); - a = (((bbox(3) - bbox(1)) / width_model) + ((bbox(4) - bbox(2))/ height_model)) / 2; + a = (((bbox(3) - bbox(1)) / width_model) + ((bbox(4) - bbox(2))/ height_model)) / 2; - tx = (bbox(3) + bbox(1))/2; - ty = (bbox(4) + bbox(2))/2; + tx = (bbox(3) + bbox(1))/2; + ty = (bbox(4) + bbox(2))/2; - % correct it so that the bounding box is just around the minimum - % and maximum point in the initialised face - tx = tx - a*(min(M(:,1)) + max(M(:,1)))/2; - ty = ty + a*(min(M(:,2)) + max(M(:,2)))/2; + % correct it so that the bounding box is just around the minimum + % and maximum point in the initialised face + tx = tx - a*(min(M(:,1)) + max(M(:,1)))/2; + ty = ty + a*(min(M(:,2)) + max(M(:,2)))/2; - % visualisation - g_param_n = [a, 0, 0, 0, tx, ty]'; + % visualisation + g_param_n = [a, 0, 0, 0, tx, ty]'; - l_param_n = zeros(size(pdm.E)); - end + l_param_n = zeros(size(pdm.E)); % If tracking has not started trust the detection if(~initialised) @@ -186,7 +181,7 @@ for v=1:numel(vids) end hold off; drawnow expose; - pause(0.05); + pause(0.01); if(record) frame = getframe; diff --git a/matlab_version/demo/setup_mconvnet.m b/matlab_version/demo/setup_mconvnet.m new file mode 100644 index 00000000..f4837f11 --- /dev/null +++ b/matlab_version/demo/setup_mconvnet.m @@ -0,0 +1,28 @@ +function setup(varargin) + +try + run D:\soft\matconvnet-master\matconvnet-master\matlab/vl_setupnn ; + addpath D:\soft\matconvnet-master\matconvnet-master\examples ; + + opts.useGpu = false ; + opts.verbose = false ; + opts = vl_argparse(opts, varargin) ; + + try + vl_nnconv(single(1),single(1),[]) ; + catch + warning('VL_NNCONV() does not seem to be compiled. 
Trying to compile it now.') ; + vl_compilenn('enableGpu', opts.useGpu, 'verbose', opts.verbose) ; + end + + if opts.useGpu + try + vl_nnconv(gpuArray(single(1)),gpuArray(single(1)),[]) ; + catch + vl_compilenn('enableGpu', opts.useGpu, 'verbose', opts.verbose) ; + warning('GPU support does not seem to be compiled in MatConvNet. Trying to compile it now') ; + end + end +catch + fprintf('Could not setup MatConvNet, face detection will be slower, install the library and set the right location for it in setup_mconvnet.m\n'); +end \ No newline at end of file diff --git a/matlab_version/face_detection/detect_faces.m b/matlab_version/face_detection/detect_faces.m index 42cdc1d0..078ac069 100644 --- a/matlab_version/face_detection/detect_faces.m +++ b/matlab_version/face_detection/detect_faces.m @@ -4,8 +4,8 @@ function [ bboxes, shapes ] = detect_faces( image, types ) % image - the image to detect the faces on % type - cell array of the face detectors to use: 'zhu', 'yu', 'cascade' % OUTPUT: -% bboxes - a set of bounding boxes describing the detected faces 4 x -% num_faces, the format is [min_x; min_y; max_x; max_y]; +% bboxes - a set of bounding boxes describing the detected faces num_faces x +% 4, the format is [min_x; min_y; max_x; max_y]; % shapes - if the face detector detects landmarks as well, output them % n_points x 2 x num_faces @@ -57,6 +57,6 @@ function [ bboxes, shapes ] = detect_faces( image, types ) if(use_zhu && isempty(bboxes)) [bboxes, shapes] = Detect_tree_based_zhu(image); end - + bboxes = bboxes'' end diff --git a/matlab_version/face_detection/mtcnn/ONet.m b/matlab_version/face_detection/mtcnn/ONet.m new file mode 100644 index 00000000..14d62325 --- /dev/null +++ b/matlab_version/face_detection/mtcnn/ONet.m @@ -0,0 +1,37 @@ +function [ out_prob, out_correction, out_lmarks ] = ONet( im_data, ONet_mlab ) +%PNET Summary of this function goes here +% Detailed explanation goes here + + % The convolutional and pooling layers + out = convolution(im_data, 
ONet_mlab.weights_conv1, ONet_mlab.biases_conv1); + out = PReLU(out, ONet_mlab.prelu_weights_1); + out = max_pooling2(out, 3, 2); + out = convolution(out, ONet_mlab.weights_conv2, ONet_mlab.biases_conv2); + out = PReLU(out, ONet_mlab.prelu_weights_2); + out = max_pooling2(out, 3, 2); + out = convolution(out, ONet_mlab.weights_conv3, ONet_mlab.biases_conv3); + out = PReLU(out, ONet_mlab.prelu_weights_3); + out = max_pooling2(out, 2, 2); + out = convolution(out, ONet_mlab.weights_conv4, ONet_mlab.biases_conv4); + out = PReLU(out, ONet_mlab.prelu_weights_4); + + % The fully connected layers + + out_fc_1 = zeros(size(out,1)*size(out,2) * size(out,3), size(out,4)); + out_fc_1(:) = out(:); + out_fc_1 = out_fc_1' * ONet_mlab.w_fc1 + ONet_mlab.b_fc1'; + out_fc_1 = PReLU(out_fc_1, ONet_mlab.prelu_fc1); + + out_fc2 = out_fc_1 * ONet_mlab.w_fc2 + ONet_mlab.b_fc2'; + out_fc2 = out_fc2'; + + % Probability of each proposal + out_prob = 1./(1+exp(out_fc2(1,:)-out_fc2(2,:))); + + % The correction of each detection + out_correction = out_fc2(3:6,:); + + % The actual detected landmarks + out_lmarks = out_fc2(7:end,:); +end + diff --git a/matlab_version/face_detection/mtcnn/ONet_mlab.mat b/matlab_version/face_detection/mtcnn/ONet_mlab.mat new file mode 100644 index 00000000..5f08c114 Binary files /dev/null and b/matlab_version/face_detection/mtcnn/ONet_mlab.mat differ diff --git a/matlab_version/face_detection/mtcnn/PNet.m b/matlab_version/face_detection/mtcnn/PNet.m new file mode 100644 index 00000000..ca22fd68 --- /dev/null +++ b/matlab_version/face_detection/mtcnn/PNet.m @@ -0,0 +1,26 @@ +function [ out_prob, out_correction ] = PNet( im_data, PNet_mlab ) +%PNET Summary of this function goes here +% Detailed explanation goes here + + % Pass through the first convolution layer + out = convolution(im_data, PNet_mlab.weights_conv1, PNet_mlab.biases_conv1); + out = PReLU(out, PNet_mlab.prelu_weights_1); + out = max_pooling2(out, 2, 2); + out = convolution(out, PNet_mlab.weights_conv2, 
PNet_mlab.biases_conv2); + out = PReLU(out, PNet_mlab.prelu_weights_2); + out = convolution(out, PNet_mlab.weights_conv3, PNet_mlab.biases_conv3); + out = PReLU(out, PNet_mlab.prelu_weights_3); + + % The fully connected layer + out_fc = zeros(size(out,1)*size(out,2), size(out,3)); + out_fc(:) = out(:); + out_fc = out_fc * PNet_mlab.w + PNet_mlab.b'; + out = reshape(out_fc, size(out,1), size(out,2), size(out_fc,2)); + + % The alignment probabilities (face heat map) + out_prob = 1./(1+exp(out(:,:,1)-out(:,:,2))); + + % The correction of the detection + out_correction = out(:,:,3:end); +end + diff --git a/matlab_version/face_detection/mtcnn/PNet_mlab.mat b/matlab_version/face_detection/mtcnn/PNet_mlab.mat new file mode 100644 index 00000000..40726a48 Binary files /dev/null and b/matlab_version/face_detection/mtcnn/PNet_mlab.mat differ diff --git a/matlab_version/face_detection/mtcnn/PReLU.m b/matlab_version/face_detection/mtcnn/PReLU.m new file mode 100644 index 00000000..4c345315 --- /dev/null +++ b/matlab_version/face_detection/mtcnn/PReLU.m @@ -0,0 +1,26 @@ +function [ out_map ] = PReLU( input_maps, PReLU_params ) +%PRELU Summary of this function goes here +% Detailed explanation goes here + + out_map = zeros(size(input_maps)); + if(numel(size(input_maps)) > 2) + for i=1:size(input_maps,3) + + % A more readable but slower version + % in_map = input_maps(:,:,i,:); + % in_map(in_map < 0) = in_map(in_map<0) * PReLU_params(i); + + % alternative +% out_map(:,:,i,:) = max(input_maps(:,:,i,:),0) + min(input_maps(:,:,i,:),0)*PReLU_params(i); + + out_map(:,:,i,:) = input_maps(:,:,i,:) .* (PReLU_params(i) + (1 - PReLU_params(i)) * (input_maps(:,:,i,:) > 0)) ; + end + else + for i=1:size(input_maps,2) + in_map = input_maps(:,i); + in_map(in_map < 0) = in_map(in_map<0) * PReLU_params(i); + out_map(:,i) = in_map; + end + end +end + diff --git a/matlab_version/face_detection/mtcnn/RNet.m b/matlab_version/face_detection/mtcnn/RNet.m new file mode 100644 index 00000000..7c95cc61 
--- /dev/null
+++ b/matlab_version/face_detection/mtcnn/RNet.m
@@ -0,0 +1,31 @@
+function [ out_prob, out_correction ] = RNet( im_data, RNet_mlab )
+%RNET Run the MTCNN refinement network (RNet) on 24x24 face proposal crops
+%   Returns per-proposal face probability and bounding box regression values
+
+    % The convolutional and pooling layers
+    out = convolution(im_data, RNet_mlab.weights_conv1, RNet_mlab.biases_conv1);
+    out = PReLU(out, RNet_mlab.prelu_weights_1);
+    out = max_pooling2(out, 3, 2);
+    out = convolution(out, RNet_mlab.weights_conv2, RNet_mlab.biases_conv2);
+    out = PReLU(out, RNet_mlab.prelu_weights_2);
+    out = max_pooling2(out, 3, 2);
+    out = convolution(out, RNet_mlab.weights_conv3, RNet_mlab.biases_conv3);
+    out = PReLU(out, RNet_mlab.prelu_weights_3);
+
+    % The fully connected layers
+
+    out_fc_1 = zeros(size(out,1)*size(out,2) * size(out,3), size(out,4));
+    out_fc_1(:) = out(:);
+    out_fc_1 = out_fc_1' * RNet_mlab.w_fc1 + RNet_mlab.b_fc1';
+    out_fc_1 = PReLU(out_fc_1, RNet_mlab.prelu_fc1);
+
+    out_fc2 = out_fc_1 * RNet_mlab.w_fc2 + RNet_mlab.b_fc2';
+    out_fc2 = out_fc2';
+
+    % Probability of each proposal
+    out_prob = 1./(1+exp(out_fc2(1,:)-out_fc2(2,:)));
+
+    % The correction of each detection
+    out_correction = out_fc2(3:end,:);
+end
+
diff --git a/matlab_version/face_detection/mtcnn/RNet_mlab.mat b/matlab_version/face_detection/mtcnn/RNet_mlab.mat
new file mode 100644
index 00000000..f7e25c2d
Binary files /dev/null and b/matlab_version/face_detection/mtcnn/RNet_mlab.mat differ
diff --git a/matlab_version/face_detection/mtcnn/apply_correction.m b/matlab_version/face_detection/mtcnn/apply_correction.m
new file mode 100644
index 00000000..c6e8a5bb
--- /dev/null
+++ b/matlab_version/face_detection/mtcnn/apply_correction.m
@@ -0,0 +1,23 @@
+function [ total_bboxes ] = apply_correction( total_bboxes, corrections, add1 )
+%APPLY_CORRECTION Shift bounding boxes by network regression offsets
+%   Offsets are relative to box width/height; add1 adds 1px to the size first
+
+    % Perform correction based on regression values
+    bbw = total_bboxes(:,3) - total_bboxes(:,1);
+    bbh = total_bboxes(:,4) - total_bboxes(:,2);
+
+    % TODO is this needed?
+    if(add1)
+        bbw = bbw + 1;
+        bbh = bbh + 1;
+    end
+
+    new_min_x = total_bboxes(:,1) + corrections(:,1) .* bbw;
+    new_min_y = total_bboxes(:,2) + corrections(:,2) .* bbh;
+    new_max_x = total_bboxes(:,3) + corrections(:,3) .* bbw;
+    new_max_y = total_bboxes(:,4) + corrections(:,4) .* bbh;
+    score = total_bboxes(:,5);
+    total_bboxes = [new_min_x, new_min_y, new_max_x, new_max_y, score];
+
+end
+
diff --git a/matlab_version/face_detection/mtcnn/convert_to_cpp/MTCNN_detector.txt b/matlab_version/face_detection/mtcnn/convert_to_cpp/MTCNN_detector.txt
new file mode 100644
index 00000000..9a4f805b
--- /dev/null
+++ b/matlab_version/face_detection/mtcnn/convert_to_cpp/MTCNN_detector.txt
@@ -0,0 +1,3 @@
+PNet PNet.dat
+RNet RNet.dat
+ONet ONet.dat
diff --git a/matlab_version/face_detection/mtcnn/convert_to_cpp/ONet.dat b/matlab_version/face_detection/mtcnn/convert_to_cpp/ONet.dat
new file mode 100644
index 00000000..291c4462
Binary files /dev/null and b/matlab_version/face_detection/mtcnn/convert_to_cpp/ONet.dat differ
diff --git a/matlab_version/face_detection/mtcnn/convert_to_cpp/PNet.dat b/matlab_version/face_detection/mtcnn/convert_to_cpp/PNet.dat
new file mode 100644
index 00000000..9550d39a
Binary files /dev/null and b/matlab_version/face_detection/mtcnn/convert_to_cpp/PNet.dat differ
diff --git a/matlab_version/face_detection/mtcnn/convert_to_cpp/RNet.dat b/matlab_version/face_detection/mtcnn/convert_to_cpp/RNet.dat
new file mode 100644
index 00000000..864e0dd9
Binary files /dev/null and b/matlab_version/face_detection/mtcnn/convert_to_cpp/RNet.dat differ
diff --git a/matlab_version/face_detection/mtcnn/convert_to_cpp/Write_CNN_to_binary.m b/matlab_version/face_detection/mtcnn/convert_to_cpp/Write_CNN_to_binary.m
new file mode 100644
index 00000000..12106e88
--- /dev/null
+++ b/matlab_version/face_detection/mtcnn/convert_to_cpp/Write_CNN_to_binary.m
@@ -0,0 +1,70 @@
+function Write_CNN_to_binary(location_binary, cnn)
+
+    addpath('../../../PDM_helpers/');
+
+    % use little-endian
+    cnn_binary_file = fopen(location_binary, 'w', 'l');
+
+    num_layers = size(cnn.layers,2);
+
+    % Get the number of layers
+    fwrite(cnn_binary_file, num_layers, 'uint'); % 4 bytes
+
+    for layers=1:num_layers
+
+        % write layer type: 0 - convolutional, 1 - max pooling, 2 -
+        % fully connected, 3 - prelu, 4 - sigmoid
+        if(strcmp(cnn.layers{layers}.type, 'conv'))
+
+            % write the type (convolutional)
+            fwrite(cnn_binary_file, 0, 'uint'); % 4 bytes
+
+            num_in_map = size(cnn.layers{layers}.weights{1},3);
+
+            % write the number of input maps
+            fwrite(cnn_binary_file, num_in_map, 'uint'); % 4 bytes
+
+            num_out_kerns = size(cnn.layers{layers}.weights{1},4);
+
+            % write the number of kernels for each output map
+            fwrite(cnn_binary_file, num_out_kerns, 'uint'); % 4 bytes
+
+            % Write output map bias terms
+            for k2=1:num_out_kerns
+                fwrite(cnn_binary_file, cnn.layers{layers}.weights{2}(k2), 'float32'); % 4 bytes
+            end
+
+            for k=1:num_in_map
+                for k2=1:num_out_kerns
+                    % Write out the kernel
+                    W = squeeze(cnn.layers{layers}.weights{1}(:,:,k,k2));
+                    writeMatrixBin(cnn_binary_file, W, 5);
+                end
+            end
+        elseif(strcmp(cnn.layers{layers}.type, 'fc'))
+
+            % This is the fully connected layer
+            fwrite(cnn_binary_file, 2, 'uint'); % 4 bytes
+
+            % the bias term
+            writeMatrixBin(cnn_binary_file, cnn.layers{layers}.weights{2}, 5);
+            % the weights
+            writeMatrixBin(cnn_binary_file, cnn.layers{layers}.weights{1}, 5);
+
+        elseif(strcmp(cnn.layers{layers}.type, 'max_pooling'))
+            fwrite(cnn_binary_file, 1, 'uint'); % 4 bytes, indicate max pooling layer
+            % params kernel and stride size
+            fwrite(cnn_binary_file, cnn.layers{layers}.kernel_size_x, 'uint'); % 4 bytes
+            fwrite(cnn_binary_file, cnn.layers{layers}.kernel_size_y, 'uint'); % 4 bytes
+            fwrite(cnn_binary_file, cnn.layers{layers}.stride_x, 'uint'); % 4 bytes
+            fwrite(cnn_binary_file, cnn.layers{layers}.stride_y, 'uint'); % 4 bytes
+
+        elseif(strcmp(cnn.layers{layers}.type, 'prelu'))
+            fwrite(cnn_binary_file, 3, 'uint'); % 4 bytes, indicate a parametric relu layer
+            writeMatrixBin(cnn_binary_file, cnn.layers{layers}.weights{1}, 5);
+        end
+    end
+
+    fclose(cnn_binary_file);
+
+end
\ No newline at end of file
diff --git a/matlab_version/face_detection/mtcnn/convert_to_cpp/Write_out_mtcnn.m b/matlab_version/face_detection/mtcnn/convert_to_cpp/Write_out_mtcnn.m
new file mode 100644
index 00000000..7afbce88
--- /dev/null
+++ b/matlab_version/face_detection/mtcnn/convert_to_cpp/Write_out_mtcnn.m
@@ -0,0 +1,184 @@
+% First writing out PNet
+load('../PNet_mlab.mat');
+
+cnn = struct;
+cnn.layers = cell(1,8);
+cnn.layers{1} = struct;
+cnn.layers{1}.type = 'conv';
+cnn.layers{1}.weights = {PNet_mlab.weights_conv1, PNet_mlab.biases_conv1};
+
+cnn.layers{2} = struct;
+cnn.layers{2}.type = 'prelu';
+cnn.layers{2}.weights = {PNet_mlab.prelu_weights_1};
+
+cnn.layers{3} = struct;
+cnn.layers{3}.type = 'max_pooling';
+cnn.layers{3}.weights = {};
+cnn.layers{3}.stride_x = 2;
+cnn.layers{3}.stride_y = 2;
+cnn.layers{3}.kernel_size_x = 2;
+cnn.layers{3}.kernel_size_y = 2;
+
+cnn.layers{4} = struct;
+cnn.layers{4}.type = 'conv';
+cnn.layers{4}.weights = {PNet_mlab.weights_conv2, PNet_mlab.biases_conv2};
+
+cnn.layers{5} = struct;
+cnn.layers{5}.type = 'prelu';
+cnn.layers{5}.weights = {PNet_mlab.prelu_weights_2};
+
+cnn.layers{6} = struct;
+cnn.layers{6}.type = 'conv';
+cnn.layers{6}.weights = {PNet_mlab.weights_conv3, PNet_mlab.biases_conv3};
+
+cnn.layers{7} = struct;
+cnn.layers{7}.type = 'prelu';
+cnn.layers{7}.weights = {PNet_mlab.prelu_weights_3};
+
+cnn.layers{8} = struct;
+cnn.layers{8}.type = 'fc';
+cnn.layers{8}.weights = {PNet_mlab.w, PNet_mlab.b};
+
+Write_CNN_to_binary('PNet.dat', cnn);
+
+%% Next writing out the RNet
+clear
+load('../RNet_mlab.mat');
+
+cnn = struct;
+cnn.layers = cell(1,11);
+cnn.layers{1} = struct;
+cnn.layers{1}.type = 'conv';
+cnn.layers{1}.weights = {RNet_mlab.weights_conv1, RNet_mlab.biases_conv1};
+
+cnn.layers{2} = struct;
+cnn.layers{2}.type = 'prelu';
+cnn.layers{2}.weights = {RNet_mlab.prelu_weights_1};
+
+cnn.layers{3} = struct;
+cnn.layers{3}.type = 'max_pooling';
+cnn.layers{3}.weights = {};
+cnn.layers{3}.stride_x = 2;
+cnn.layers{3}.stride_y = 2;
+cnn.layers{3}.kernel_size_x = 3;
+cnn.layers{3}.kernel_size_y = 3;
+
+cnn.layers{4} = struct;
+cnn.layers{4}.type = 'conv';
+cnn.layers{4}.weights = {RNet_mlab.weights_conv2, RNet_mlab.biases_conv2};
+
+cnn.layers{5} = struct;
+cnn.layers{5}.type = 'prelu';
+cnn.layers{5}.weights = {RNet_mlab.prelu_weights_2};
+
+cnn.layers{6} = struct;
+cnn.layers{6}.type = 'max_pooling';
+cnn.layers{6}.weights = {};
+cnn.layers{6}.stride_x = 2;
+cnn.layers{6}.stride_y = 2;
+cnn.layers{6}.kernel_size_x = 3;
+cnn.layers{6}.kernel_size_y = 3;
+
+cnn.layers{7} = struct;
+cnn.layers{7}.type = 'conv';
+cnn.layers{7}.weights = {RNet_mlab.weights_conv3, RNet_mlab.biases_conv3};
+
+cnn.layers{8} = struct;
+cnn.layers{8}.type = 'prelu';
+cnn.layers{8}.weights = {RNet_mlab.prelu_weights_3};
+
+cnn.layers{9} = struct;
+cnn.layers{9}.type = 'fc';
+cnn.layers{9}.weights = {RNet_mlab.w_fc1, RNet_mlab.b_fc1};
+
+cnn.layers{10} = struct;
+cnn.layers{10}.type = 'prelu';
+cnn.layers{10}.weights = {RNet_mlab.prelu_fc1};
+
+cnn.layers{11} = struct;
+cnn.layers{11}.type = 'fc';
+cnn.layers{11}.weights = {RNet_mlab.w_fc2, RNet_mlab.b_fc2};
+
+Write_CNN_to_binary('RNet.dat', cnn);
+
+%% Next writing out the ONet
+clear
+load('../ONet_mlab.mat');
+
+cnn = struct;
+cnn.layers = cell(1,14);
+cnn.layers{1} = struct;
+cnn.layers{1}.type = 'conv';
+cnn.layers{1}.weights = {ONet_mlab.weights_conv1, ONet_mlab.biases_conv1};
+
+cnn.layers{2} = struct;
+cnn.layers{2}.type = 'prelu';
+cnn.layers{2}.weights = {ONet_mlab.prelu_weights_1};
+
+cnn.layers{3} = struct;
+cnn.layers{3}.type = 'max_pooling';
+cnn.layers{3}.weights = {};
+cnn.layers{3}.stride_x = 2;
+cnn.layers{3}.stride_y = 2;
+cnn.layers{3}.kernel_size_x = 3;
+cnn.layers{3}.kernel_size_y = 3;
+
+cnn.layers{4} = struct;
+cnn.layers{4}.type = 'conv';
+cnn.layers{4}.weights = {ONet_mlab.weights_conv2, ONet_mlab.biases_conv2};
+
+cnn.layers{5} = struct;
+cnn.layers{5}.type = 'prelu';
+cnn.layers{5}.weights = {ONet_mlab.prelu_weights_2};
+
+cnn.layers{6} = struct;
+cnn.layers{6}.type = 'max_pooling';
+cnn.layers{6}.weights = {};
+cnn.layers{6}.stride_x = 2;
+cnn.layers{6}.stride_y = 2;
+cnn.layers{6}.kernel_size_x = 3;
+cnn.layers{6}.kernel_size_y = 3;
+
+cnn.layers{7} = struct;
+cnn.layers{7}.type = 'conv';
+cnn.layers{7}.weights = {ONet_mlab.weights_conv3, ONet_mlab.biases_conv3};
+
+cnn.layers{8} = struct;
+cnn.layers{8}.type = 'prelu';
+cnn.layers{8}.weights = {ONet_mlab.prelu_weights_3};
+
+cnn.layers{9} = struct;
+cnn.layers{9}.type = 'max_pooling';
+cnn.layers{9}.weights = {};
+cnn.layers{9}.stride_x = 2;
+cnn.layers{9}.stride_y = 2;
+cnn.layers{9}.kernel_size_x = 2;
+cnn.layers{9}.kernel_size_y = 2;
+
+cnn.layers{10} = struct;
+cnn.layers{10}.type = 'conv';
+cnn.layers{10}.weights = {ONet_mlab.weights_conv4, ONet_mlab.biases_conv4};
+
+cnn.layers{11} = struct;
+cnn.layers{11}.type = 'prelu';
+cnn.layers{11}.weights = {ONet_mlab.prelu_weights_4};
+
+cnn.layers{12} = struct;
+cnn.layers{12}.type = 'fc';
+cnn.layers{12}.weights = {ONet_mlab.w_fc1, ONet_mlab.b_fc1};
+
+cnn.layers{13} = struct;
+cnn.layers{13}.type = 'prelu';
+cnn.layers{13}.weights = {ONet_mlab.prelu_fc1};
+
+cnn.layers{14} = struct;
+cnn.layers{14}.type = 'fc';
+cnn.layers{14}.weights = {ONet_mlab.w_fc2, ONet_mlab.b_fc2};
+
+Write_CNN_to_binary('ONet.dat', cnn);
+
+f = fopen('MTCNN_detector.txt', 'w');
+fprintf(f, 'PNet PNet.dat\r\n');
+fprintf(f, 'RNet RNet.dat\r\n');
+fprintf(f, 'ONet ONet.dat\r\n');
+fclose(f);
\ No newline at end of file
diff --git a/matlab_version/face_detection/mtcnn/convolution.m b/matlab_version/face_detection/mtcnn/convolution.m
new file mode 100644
index 00000000..b2e5b554
--- /dev/null
+++
b/matlab_version/face_detection/mtcnn/convolution.m
@@ -0,0 +1,24 @@
+function [ output_maps ] = convolution( input_maps, kernels, biases )
+%CONVOLUTION Valid 2D convolution of input maps with a 4D filter bank + bias
+%   Fallback path pre-flips kernels so convn matches correlation-style conv
+
+    % If MatConvNet is not installed use Matlab (much slower)
+    if(exist('vl_nnconv', 'file') == 3)
+        output_maps = vl_nnconv(single(input_maps), kernels, biases);
+    else
+        n_filters = size(kernels, 4);
+
+        kernels2 = kernels(:,:,end:-1:1,:);
+        for i=1:n_filters
+            for n_in_maps=1:size(kernels,3)
+                kernels2(:,:,n_in_maps,i) = fliplr(squeeze(kernels2(:,:,n_in_maps,i)));
+                kernels2(:,:,n_in_maps,i) = flipud(squeeze(kernels2(:,:,n_in_maps,i)));
+            end
+        end
+        output_maps = [];
+        for i=1:n_filters
+            output_maps = cat(3, output_maps, convn(input_maps, kernels2(:,:,:,i), 'valid') + biases(i));
+        end
+    end
+end
+
diff --git a/matlab_version/face_detection/mtcnn/correct_bbox.m b/matlab_version/face_detection/mtcnn/correct_bbox.m
new file mode 100644
index 00000000..e75c1f86
--- /dev/null
+++ b/matlab_version/face_detection/mtcnn/correct_bbox.m
@@ -0,0 +1,36 @@
+function [ total_bboxes, to_keep ] = correct_bbox( total_bboxes, corrections, add1, rectangulate, round, type )
+%CORRECT_BBOX NMS followed by regression-based bounding box refinement
+%   Optionally squares the boxes (rectangulate) and rounds them to pixels (round)
+
+    % Non maximum suppression across bounding boxes
+    to_keep = non_maximum_supression(total_bboxes, 0.7, type);
+    total_bboxes = total_bboxes(to_keep, :);
+    corrections = corrections(to_keep, :);
+    % Perform correction based on regression values
+    bbw = total_bboxes(:,3) - total_bboxes(:,1);
+    bbh = total_bboxes(:,4) - total_bboxes(:,2);
+
+    % TODO is this needed?
+    if(add1)
+        bbw = bbw + 1;
+        bbh = bbh + 1;
+    end
+
+    new_min_x = total_bboxes(:,1) + corrections(:,1) .* bbw;
+    new_min_y = total_bboxes(:,2) + corrections(:,2) .* bbh;
+    new_max_x = total_bboxes(:,3) + corrections(:,3) .* bbw;
+    new_max_y = total_bboxes(:,4) + corrections(:,4) .* bbh;
+    score = total_bboxes(:,5);
+    total_bboxes = [new_min_x, new_min_y, new_max_x, new_max_y, score];
+
+    if(rectangulate)
+        % Convert the bounding boxes to rectangles
+        total_bboxes(:,1:4) = rectify(total_bboxes(:,1:4));
+    end
+
+    if(round)
+        % Rounding to pixels
+        total_bboxes(:,1:4) = fix(total_bboxes(:,1:4));
+    end
+end
+
diff --git a/matlab_version/face_detection/mtcnn/demo.m b/matlab_version/face_detection/mtcnn/demo.m
new file mode 100644
index 00000000..b3910706
--- /dev/null
+++ b/matlab_version/face_detection/mtcnn/demo.m
@@ -0,0 +1,10 @@
+clear;
+
+% Make sure we have the dependencies for convolution
+od = cd('../../face_validation');
+setup;
+cd(od);
+
+img = imread('test1.jpg');
+
+[bboxes, lmarks, confidences] = detect_face_mtcnn(img);
\ No newline at end of file
diff --git a/matlab_version/face_detection/mtcnn/demo_300W.m b/matlab_version/face_detection/mtcnn/demo_300W.m
new file mode 100644
index 00000000..cd46bf80
--- /dev/null
+++ b/matlab_version/face_detection/mtcnn/demo_300W.m
@@ -0,0 +1,20 @@
+clear;
+
+% Make sure we have the dependencies for convolution
+od = cd('../../face_validation');
+setup;
+cd(od);
+
+imgs = dir('D:\Datasets\300_W\AFW/*.jpg');
+for i=2:numel(imgs)
+    img = imread(['D:\Datasets\300_W\AFW/', imgs(i).name]);
+    [bboxes, lmarks, confidences] = detect_face_mtcnn(img, 60);
+    hold off
+    imshow(img);
+    hold on;
+    for d=1:size(bboxes,1)
+        rectangle('Position', [bboxes(d,1), bboxes(d,2), bboxes(d,3)-bboxes(d,1), bboxes(d,4) - bboxes(d,2)]);
+        plot(lmarks(d,1:5), lmarks(d,6:10), '.r');
+    end
+    drawnow expose
+end
\ No newline at end of file
diff --git a/matlab_version/face_detection/mtcnn/detect_face_mtcnn.m
b/matlab_version/face_detection/mtcnn/detect_face_mtcnn.m
new file mode 100644
index 00000000..714f8d2b
--- /dev/null
+++ b/matlab_version/face_detection/mtcnn/detect_face_mtcnn.m
@@ -0,0 +1,222 @@
+function [total_bboxes, lmarks, confidence] = detect_face_mtcnn(img, min_face_size)
+
+% Check if MatConvNet is installed
+if(exist('vl_nnconv', 'file') ~= 3)
+    fprintf('Warning MatConvNet is not installed or not setup, face detection will be quite slow\n');
+end
+
+height_orig = size(img,1);
+width_orig = size(img,2);
+
+% Everything is done in floats
+img = single(img);
+
+% Minimum face size
+if(nargin ==1)
+    min_face_size = 30;
+end
+
+% Image pyramid scaling factor
+factor = 0.709;
+
+% Thresholds for the PNet, RNet, and ONet stages (in that order)
+threshold=[0.6 0.7 0.7];
+
+min_dim = min([width_orig height_orig]);
+
+% Face support region is 12x12 px, so from that can work out the largest
+% scale (which is 12 / min), and work down from there to smallest scale (no smaller than
+% 12x12px)
+face_support = 12;
+num_scales = floor(log(min_face_size / min_dim) / log(factor));
+scales = (face_support / min_face_size)*factor.^(0:num_scales);
+
+load('PNet_mlab');
+load('RNet_mlab');
+load('ONet_mlab');
+
+total_bboxes = [];
+
+% First the PNet stage on image pyramid
+for s = scales
+    h_pyr = ceil(height_orig * s);
+    w_pyr = ceil(width_orig * s);
+
+    % Resize the image and normalize to what MTCNN expects it to be
+    im_data=(imresize(img, [h_pyr w_pyr],'bilinear','AntiAliasing',false)-127.5)*0.0078125;
+
+    [ out_prob, out_correction ] = PNet( im_data, PNet_mlab );
+
+    % Generate bounding boxes from the heatmap
+    bboxes = generate_bounding_boxes(out_prob, out_correction, s, threshold(1), face_support);
+
+    % TODO correct bboxes before running NMS?, as now lots of overlaping
+    % boxes are present
+
+    % Perform non maximum suppression to remove redundant bounding boxes
+    pick = non_maximum_supression(bboxes, 0.5, 'Union');
+    bboxes=bboxes(pick,:);
+    if ~isempty(bboxes)
+        total_bboxes = cat(1, total_bboxes, bboxes);
+    end
+end
+
+if ~isempty(total_bboxes)
+    % Non maximum suppression across bounding boxes, and their offset
+    % correction
+    corrections = total_bboxes(:,6:end);
+    total_bboxes = total_bboxes(:,1:5);
+
+    to_keep = non_maximum_supression(total_bboxes, 0.7, 'Union');
+    total_bboxes = total_bboxes(to_keep, :);
+    corrections = corrections(to_keep, :);
+
+    total_bboxes = apply_correction(total_bboxes, corrections, false);
+
+    % Making them into rectangles
+    total_bboxes(:,1:4) = rectify(total_bboxes(:,1:4));
+
+    % Rounding to pixels
+    total_bboxes(:,1:4) = fix(total_bboxes(:,1:4));
+end
+num_bbox = size(total_bboxes,1);
+
+% RNet stage
+if num_bbox > 0
+
+    proposal_imgs = zeros(24, 24, 3, num_bbox);
+    for k=1:num_bbox
+
+        width_target = total_bboxes(k,3) - total_bboxes(k,1) + 1;
+        height_target = total_bboxes(k,4) - total_bboxes(k,2) + 1;
+
+        % Work out the start and end indices in the original image
+        start_x_in = max(total_bboxes(k,1), 1);
+        start_y_in = max(total_bboxes(k,2), 1);
+        end_x_in = min(total_bboxes(k,3), width_orig);
+        end_y_in = min(total_bboxes(k,4), height_orig);
+
+        % Work out the start and end indices in the target image
+        start_x_out = max(-total_bboxes(k,1)+2, 1);
+        start_y_out = max(-total_bboxes(k,2)+2, 1);
+        end_x_out = min(width_target - (total_bboxes(k,3)-width_orig), width_target);
+        end_y_out = min(height_target - (total_bboxes(k,4)-height_orig), height_target);
+
+        tmp = zeros(height_target, width_target, 3);
+
+        tmp(start_y_out:end_y_out,start_x_out:end_x_out,:) = ...
+            img(start_y_in:end_y_in, start_x_in:end_x_in,:);
+
+        proposal_imgs(:,:,:,k) = imresize(tmp, [24 24], 'bilinear','AntiAliasing',false);
+    end
+
+    % Normalize the proposal images
+    proposal_imgs = (proposal_imgs - 127.5) * 0.0078125;
+
+    % Apply RNet to proposal faces
+    [ score, out_correction ] = RNet( proposal_imgs, RNet_mlab );
+    out_correction = out_correction';
+
+    % Find faces above the threshold
+    to_keep = find(score > threshold(2));
+
+    total_bboxes = [total_bboxes(to_keep,1:4) score(to_keep)'];
+    out_correction = out_correction(to_keep,:);
+
+    if ~isempty(total_bboxes)
+        % Non maximum suppression across bounding boxes, and their offset
+        % correction
+        to_keep = non_maximum_supression(total_bboxes, 0.7, 'Union');
+        total_bboxes = total_bboxes(to_keep, :);
+        out_correction = out_correction(to_keep, :);
+
+        total_bboxes = apply_correction(total_bboxes, out_correction, true);
+
+        % Making them into rectangles
+        total_bboxes(:,1:4) = rectify(total_bboxes(:,1:4));
+
+        % Rounding to pixels
+        total_bboxes(:,1:4) = fix(total_bboxes(:,1:4));
+    end
+end
+
+num_bbox = size(total_bboxes,1);
+
+% ONet stage
+if num_bbox > 0
+
+    proposal_imgs = zeros(48, 48, 3, num_bbox);
+    for k=1:num_bbox
+
+        width_target = total_bboxes(k,3) - total_bboxes(k,1) + 1;
+        height_target = total_bboxes(k,4) - total_bboxes(k,2) + 1;
+
+        % Work out the start and end indices in the original image
+        start_x_in = max(total_bboxes(k,1), 1);
+        start_y_in = max(total_bboxes(k,2), 1);
+        end_x_in = min(total_bboxes(k,3), width_orig);
+        end_y_in = min(total_bboxes(k,4), height_orig);
+
+        % Work out the start and end indices in the target image
+        start_x_out = max(-total_bboxes(k,1)+2, 1);
+        start_y_out = max(-total_bboxes(k,2)+2, 1);
+        end_x_out = min(width_target - (total_bboxes(k,3)-width_orig), width_target);
+        end_y_out = min(height_target - (total_bboxes(k,4)-height_orig), height_target);
+
+        tmp = zeros(height_target, width_target, 3);
+
+        tmp(start_y_out:end_y_out,start_x_out:end_x_out,:) = ...
+            img(start_y_in:end_y_in, start_x_in:end_x_in,:);
+
+        proposal_imgs(:,:,:,k) = imresize(tmp, [48 48], 'bilinear','AntiAliasing',false);
+    end
+
+    % Normalize the proposal images
+    proposal_imgs = (proposal_imgs - 127.5) * 0.0078125;
+
+    % Apply ONet to proposal faces
+    [ score, out_correction, lmarks ] = ONet( proposal_imgs, ONet_mlab );
+    out_correction = out_correction';
+    lmarks = lmarks';
+
+    % Pick the final faces above the threshold
+    to_keep = find(score > threshold(3));
+    lmarks = lmarks(to_keep, :);
+    out_correction = out_correction(to_keep, :);
+    total_bboxes = [total_bboxes(to_keep,1:4) score(to_keep)'];
+
+    % Correct for the landmarks
+    bbw = total_bboxes(:,3) - total_bboxes(:,1) + 1;
+    bbh = total_bboxes(:,4) - total_bboxes(:,2) + 1;
+
+    lmarks(:, 1:5) = bbw .* lmarks(:,1:5) + total_bboxes(:,1) - 1;
+    lmarks(:, 6:10) = bbh .* lmarks(:,6:10) + total_bboxes(:,2) - 1;
+
+    % Correct the bounding boxes
+    if size(total_bboxes,1)>0
+        total_bboxes = apply_correction(total_bboxes, out_correction, true);
+        to_keep = non_maximum_supression(total_bboxes, 0.7, 'Min');
+
+        lmarks = lmarks(to_keep, :);
+        confidence = total_bboxes(to_keep, 5);
+        total_bboxes = total_bboxes(to_keep, 1:4);
+    end
+
+end
+
+% Correct the bounding boxes to be around the 68 landmark points
+widths = total_bboxes(:,3) - total_bboxes(:,1);
+heights = total_bboxes(:,4) - total_bboxes(:,2);
+txs = total_bboxes(:,1);
+tys = total_bboxes(:,2);
+
+new_widths = widths * 1.0323;
+new_heights = heights * 0.7751;
+new_txs = widths * -0.0075 + txs;
+new_tys = heights * 0.2459 + tys;
+
+total_bboxes = [new_txs, new_tys, new_txs + new_widths, new_tys + new_heights];
+total_bboxes = double(total_bboxes);
+lmarks = double(lmarks);
+
+end
\ No newline at end of file
diff --git a/matlab_version/face_detection/mtcnn/generate_bounding_boxes.m b/matlab_version/face_detection/mtcnn/generate_bounding_boxes.m
new file mode 100644
index 00000000..cf6e6077
--- /dev/null
+++
b/matlab_version/face_detection/mtcnn/generate_bounding_boxes.m @@ -0,0 +1,25 @@ +function [bboxes] = generate_bounding_boxes(heatmap, correction, scale, t, face_support) + %use heatmap to generate bounding boxes in the original image space + + % Correction for the pooling + stride = 2; + + % Offsets for, x, y, width and height + dx1=correction(:,:,1); + dy1=correction(:,:,2); + dx2=correction(:,:,3); + dy2=correction(:,:,4); + + % Find the parts of a heatmap above the threshold (x, y, and indices) + [x, y]= find(heatmap >= t); + inds = find(heatmap >= t); + + % Find the corresponding scores and bbox corrections + score=heatmap(inds); + correction=[dx1(inds) dy1(inds) dx2(inds) dy2(inds)]; + + % Correcting for Matlab's format + bboxes=[y - 1 x - 1]; + bboxes=[fix((stride*(bboxes)+1)/scale) fix((stride*(bboxes)+face_support)/scale) score correction]; +end + diff --git a/matlab_version/face_detection/mtcnn/im2col_inds.m b/matlab_version/face_detection/mtcnn/im2col_inds.m new file mode 100644 index 00000000..5d4192b9 --- /dev/null +++ b/matlab_version/face_detection/mtcnn/im2col_inds.m @@ -0,0 +1,120 @@ +function ttt=im2col_inds(a, block) +%IM2COL Rearrange image blocks into columns. +% B = IM2COL(A,[M N],'distinct') rearranges each distinct +% M-by-N block in the image A into a column of B. IM2COL pads A +% with zeros, if necessary, so its size is an integer multiple +% of M-by-N. If A = [A11 A12; A21 A22], where each Aij is +% M-by-N, then B = [A11(:) A21(:) A12(:) A22(:)]. +% +% B = IM2COL(A,[M N],'sliding') converts each sliding M-by-N +% block of A into a column of B, with no zero padding. B has +% M*N rows and will contain as many columns as there are M-by-N +% neighborhoods in A. If the size of A is [MM NN], then the +% size of B is (M*N)-by-((MM-M+1)*(NN-N+1). Each column of B +% contains the neighborhoods of A reshaped as NHOOD(:), where +% NHOOD is a matrix containing an M-by-N neighborhood of +% A. 
IM2COL orders the columns of B so that they can be +% reshaped to form a matrix in the normal way. For example, +% suppose you use a function, such as SUM(B), that returns a +% scalar for each column of B. You can directly store the +% result in a matrix of size (MM-M+1)-by-(NN-N+1) using these +% calls: +% +% B = im2col(A,[M N],'sliding'); +% C = reshape(sum(B),MM-M+1,NN-N+1); +% +% B = IM2COL(A,[M N]) uses the default block type of +% 'sliding'. +% +% B = IM2COL(A,'indexed',...) processes A as an indexed image, +% padding with zeros if the class of A is uint8 or uint16, or +% ones if the class of A is double. +% +% Class Support +% ------------- +% The input image A can be numeric or logical. The output matrix +% B is of the same class as the input image. +% +% Example +% ------- +% Calculate the local mean using a [2 2] neighborhood with zero padding. +% +% A = reshape(linspace(0,1,16),[4 4])' +% B = im2col(A,[2 2]) +% M = mean(B) +% newA = col2im(M,[1 1],[3 3]) +% +% See also BLOCKPROC, COL2IM, COLFILT, NLFILTER. + +% Copyright 1993-2016 The MathWorks, Inc. + +[ma,na] = size(a); +m = block(1); n = block(2); + +if any([ma na] < [m n]) % if neighborhood is larger than image + b = zeros(m*n,0); + return +end + +% Create Hankel-like indexing sub matrix. 
+mc = block(1); nc = ma-m+1; nn = na-n+1; +cidx = (0:mc-1)'; ridx = 1:nc; +t = cidx(:,ones(nc,1)) + ridx(ones(mc,1),:); % Hankel Subscripts +tt = zeros(mc*n,nc); +rows = 1:mc; +for i=0:n-1, + tt(i*mc+rows,:) = t+ma*i; +end +ttt = zeros(mc*n,nc*nn); +cols = 1:nc; +for j=0:nn-1, + ttt(:,j*nc+cols) = tt+ma*j; +end + + +%%% +%%% Function parse_inputs +%%% +function [a, block, kind, padval] = parse_inputs(varargin) + +narginchk(2,4); + +switch nargin + case 2 + if (strcmp(varargin{2},'indexed')) + error(message('images:im2col:tooFewInputs')) + else + % IM2COL(A, [M N]) + a = varargin{1}; + block = varargin{2}; + kind = 'sliding'; + padval = 0; + end + + case 3 + if (strcmp(varargin{2},'indexed')) + % IM2COL(A, 'indexed', [M N]) + a = varargin{1}; + block = varargin{3}; + kind = 'sliding'; + padval = 1; + else + % IM2COL(A, [M N], 'kind') + a = varargin{1}; + block = varargin{2}; + kind = validatestring(varargin{3},{'sliding','distinct'},mfilename,'kind',3); + padval = 0; + end + + case 4 + % IM2COL(A, 'indexed', [M N], 'kind') + a = varargin{1}; + block = varargin{3}; + kind = validatestring(varargin{4},{'sliding','distinct'},mfilename,'kind',4); + padval = 1; + +end + +if (isa(a,'uint8') || isa(a, 'uint16')) + padval = 0; +end diff --git a/matlab_version/face_detection/mtcnn/im2col_mine.m b/matlab_version/face_detection/mtcnn/im2col_mine.m new file mode 100644 index 00000000..14ac52c5 --- /dev/null +++ b/matlab_version/face_detection/mtcnn/im2col_mine.m @@ -0,0 +1,127 @@ +function b=im2col_mine(a, block) +%IM2COL Rearrange image blocks into columns. +% B = IM2COL(A,[M N],'distinct') rearranges each distinct +% M-by-N block in the image A into a column of B. IM2COL pads A +% with zeros, if necessary, so its size is an integer multiple +% of M-by-N. If A = [A11 A12; A21 A22], where each Aij is +% M-by-N, then B = [A11(:) A21(:) A12(:) A22(:)]. +% +% B = IM2COL(A,[M N],'sliding') converts each sliding M-by-N +% block of A into a column of B, with no zero padding. 
B has +% M*N rows and will contain as many columns as there are M-by-N +% neighborhoods in A. If the size of A is [MM NN], then the +% size of B is (M*N)-by-((MM-M+1)*(NN-N+1). Each column of B +% contains the neighborhoods of A reshaped as NHOOD(:), where +% NHOOD is a matrix containing an M-by-N neighborhood of +% A. IM2COL orders the columns of B so that they can be +% reshaped to form a matrix in the normal way. For example, +% suppose you use a function, such as SUM(B), that returns a +% scalar for each column of B. You can directly store the +% result in a matrix of size (MM-M+1)-by-(NN-N+1) using these +% calls: +% +% B = im2col(A,[M N],'sliding'); +% C = reshape(sum(B),MM-M+1,NN-N+1); +% +% B = IM2COL(A,[M N]) uses the default block type of +% 'sliding'. +% +% B = IM2COL(A,'indexed',...) processes A as an indexed image, +% padding with zeros if the class of A is uint8 or uint16, or +% ones if the class of A is double. +% +% Class Support +% ------------- +% The input image A can be numeric or logical. The output matrix +% B is of the same class as the input image. +% +% Example +% ------- +% Calculate the local mean using a [2 2] neighborhood with zero padding. +% +% A = reshape(linspace(0,1,16),[4 4])' +% B = im2col(A,[2 2]) +% M = mean(B) +% newA = col2im(M,[1 1],[3 3]) +% +% See also BLOCKPROC, COL2IM, COLFILT, NLFILTER. + +% Copyright 1993-2016 The MathWorks, Inc. + +[ma,na] = size(a); +m = block(1); n = block(2); + +if any([ma na] < [m n]) % if neighborhood is larger than image + b = zeros(m*n,0); + return +end + +% Create Hankel-like indexing sub matrix. +mc = block(1); nc = ma-m+1; nn = na-n+1; +cidx = (0:mc-1)'; ridx = 1:nc; +t = cidx(:,ones(nc,1)) + ridx(ones(mc,1),:); % Hankel Subscripts +tt = zeros(mc*n,nc); +rows = 1:mc; +for i=0:n-1, + tt(i*mc+rows,:) = t+ma*i; +end +ttt = zeros(mc*n,nc*nn); +cols = 1:nc; +for j=0:nn-1, + ttt(:,j*nc+cols) = tt+ma*j; +end + +% If a is a row vector, change it to a column vector. 
This change is +% necessary when A is a row vector and [M N] = size(A). +if ndims(a) == 2 && na > 1 && ma == 1 + a = a(:); +end +b = a(ttt); + + +%%% +%%% Function parse_inputs +%%% +function [a, block, kind, padval] = parse_inputs(varargin) + +narginchk(2,4); + +switch nargin + case 2 + if (strcmp(varargin{2},'indexed')) + error(message('images:im2col:tooFewInputs')) + else + % IM2COL(A, [M N]) + a = varargin{1}; + block = varargin{2}; + kind = 'sliding'; + padval = 0; + end + + case 3 + if (strcmp(varargin{2},'indexed')) + % IM2COL(A, 'indexed', [M N]) + a = varargin{1}; + block = varargin{3}; + kind = 'sliding'; + padval = 1; + else + % IM2COL(A, [M N], 'kind') + a = varargin{1}; + block = varargin{2}; + kind = validatestring(varargin{3},{'sliding','distinct'},mfilename,'kind',3); + padval = 0; + end + + case 4 + % IM2COL(A, 'indexed', [M N], 'kind') + a = varargin{1}; + block = varargin{3}; + kind = validatestring(varargin{4},{'sliding','distinct'},mfilename,'kind',4); + padval = 1; + +end + +if (isa(a,'uint8') || isa(a, 'uint16')) + padval = 0; +end diff --git a/matlab_version/face_detection/mtcnn/max_pooling.m b/matlab_version/face_detection/mtcnn/max_pooling.m new file mode 100644 index 00000000..b49ed0c3 --- /dev/null +++ b/matlab_version/face_detection/mtcnn/max_pooling.m @@ -0,0 +1,57 @@ +function [ output_maps ] = max_pooling( input_maps) +%POOLING Summary of this function goes here +% Detailed explanation goes here + + orig_rows = size(input_maps,1); + orig_cols = size(input_maps,2); + + pooled_rows = ceil(orig_rows / 2); + pooled_cols = ceil(orig_cols / 2); + + up_to_rows_out = floor(orig_rows / 2); + up_to_cols_out = floor(orig_cols / 2); + + if(mod(orig_cols,2) == 0) + up_to_cols = orig_cols; + else + up_to_cols = orig_cols - 1; + end + + if(mod(orig_rows,2) == 0) + up_to_rows = orig_rows; + else + up_to_rows = orig_rows - 1; + end + + output_maps = zeros(pooled_rows, pooled_cols, size(input_maps,3)); + for i=1:size(input_maps,3) + temp = 
im2col(input_maps(1:up_to_rows,1:up_to_cols,i), [2,2], 'distinct'); + max_val = max(temp); + output_maps(1:up_to_rows_out,1:up_to_cols_out,i) = reshape(max_val, up_to_rows_out, up_to_cols_out); + end + + % A bit of a hack for non-even number of rows or columns + if(mod(orig_cols,2) ~= 0) + for i=1:size(input_maps,3) + temp = im2col(input_maps(1:up_to_rows,end,i), [2,1], 'distinct'); + max_val = max(temp); + output_maps(1:up_to_rows_out,end,i) = max_val; + end + end + + if(mod(orig_rows,2) ~= 0) + for i=1:size(input_maps,3) + temp = im2col(input_maps(end, 1:up_to_cols,i), [1,2], 'distinct'); + max_val = max(temp); + output_maps(end, 1:up_to_cols_out,i) = max_val; + end + end + + if(mod(orig_cols,2) ~= 0 && mod(orig_rows,2) ~= 0) + output_maps(end,end,:) = input_maps(end,end,:); + end + + + +end + diff --git a/matlab_version/face_detection/mtcnn/max_pooling2.m b/matlab_version/face_detection/mtcnn/max_pooling2.m new file mode 100644 index 00000000..e2fc1091 --- /dev/null +++ b/matlab_version/face_detection/mtcnn/max_pooling2.m @@ -0,0 +1,118 @@ +function [ output_maps ] = max_pooling2( input_maps, kernel_size, stride) +%POOLING Summary of this function goes here +% Detailed explanation goes here + + orig_rows = size(input_maps,1); + orig_cols = size(input_maps,2); + + pooled_rows = round((orig_rows - kernel_size)/stride) + 1; + pooled_cols = round((orig_cols - kernel_size)/stride) + 1; + + if(exist('vl_nnpool', 'file') == 3) + % Caffe and MatConvNet do pooling slightly differently, so need to + % counter for that + + pooled_cols_vl = floor((orig_cols - kernel_size)/stride) + 1; + pooled_rows_vl = floor((orig_rows - kernel_size)/stride) + 1; + + if(pooled_rows_vl == pooled_rows && pooled_cols_vl == pooled_cols) + output_maps = vl_nnpool(input_maps, [kernel_size, kernel_size], 'stride', stride); + else + % Else need to pad right and bottom with infinities + for x=1:kernel_size + pooled_cols_vl = floor((orig_cols + x - kernel_size)/stride) + 1; + if(pooled_cols_vl == 
function [ output_maps ] = max_pooling2( input_maps, kernel_size, stride)
%MAX_POOLING2 Max pooling with a square kernel and arbitrary stride.
%   input_maps  - rows x cols x channels x batch array
%   kernel_size - side length of the (square) pooling window
%   stride      - step between window origins
%   output_maps - pooled maps, sized round((dim-kernel)/stride)+1 per
%                 spatial dimension (Caffe-style rounding)
%
%   Uses MatConvNet's vl_nnpool when available (fast, GPU-capable);
%   otherwise falls back to a native MATLAB implementation.

    orig_rows = size(input_maps,1);
    orig_cols = size(input_maps,2);

    % Caffe-style output size (round). MatConvNet uses floor, hence the
    % compensation logic below.
    pooled_rows = round((orig_rows - kernel_size)/stride) + 1;
    pooled_cols = round((orig_cols - kernel_size)/stride) + 1;

    % exist(...) == 3 means a MEX-file is on the path.
    if(exist('vl_nnpool', 'file') == 3)
        % Caffe and MatConvNet do pooling slightly differently, so need to
        % counter for that

        pooled_cols_vl = floor((orig_cols - kernel_size)/stride) + 1;
        pooled_rows_vl = floor((orig_rows - kernel_size)/stride) + 1;

        if(pooled_rows_vl == pooled_rows && pooled_cols_vl == pooled_cols)
            % Rounding modes agree: use vl_nnpool directly.
            output_maps = vl_nnpool(input_maps, [kernel_size, kernel_size], 'stride', stride);
        else
            % Else need to pad right and bottom with infinities
            % Find the smallest right/bottom padding (x, y) that makes
            % MatConvNet's floor-based output size match the target.
            for x=1:kernel_size
                pooled_cols_vl = floor((orig_cols + x - kernel_size)/stride) + 1;
                if(pooled_cols_vl == pooled_cols)
                    break;
                end
            end
            for y=1:kernel_size
                pooled_rows_vl = floor((orig_rows +y - kernel_size)/stride) + 1;
                if(pooled_rows_vl == pooled_rows)
                    break;
                end
            end

            % Pad with -inf so padded cells can never win a max.
            input_maps_new = -inf * ones(size(input_maps,1)+y, size(input_maps,2)+x, size(input_maps,3), size(input_maps,4));
            input_maps_new(1:size(input_maps,1),1:size(input_maps,2),:,:) = input_maps;
            output_maps = vl_nnpool(input_maps_new, [kernel_size, kernel_size], 'stride', stride);
        end
    else
        % Native MATLAB fallback.

        up_to_rows_out = floor((orig_rows - kernel_size)/stride) + 1;
        up_to_cols_out = floor((orig_cols - kernel_size)/stride) + 1;

        % How many full max-pooling steps are there
        up_to_cols = kernel_size + (up_to_cols_out-1) * stride;
        up_to_rows = kernel_size + (up_to_rows_out-1) * stride;

        output_maps = zeros(pooled_rows, pooled_cols, size(input_maps,3), size(input_maps,4));

        % Pick only the striding elements
        % (keep the sliding-window columns whose origin lands on the stride grid)
        [y, x] = meshgrid(1:up_to_cols-kernel_size+1, 1:up_to_rows-kernel_size+1);
        to_keep_map = mod(y, stride) == 1 & mod(x, stride) == 1;
        to_keep = find(to_keep_map);

        % Precompute linear window indices once; they are identical for
        % every channel and batch element because only the slice size matters.
        inds_pooling = im2col_inds(input_maps(1:up_to_rows,1:up_to_cols,1,1), [kernel_size, kernel_size]);
        inds_pooling = inds_pooling(:, to_keep);
        for m=1:size(input_maps,4)
            for i=1:size(input_maps,3)
                temp = input_maps(1:up_to_rows,1:up_to_cols,i,m);
                temp = temp(inds_pooling);

                % Max over each window (column) and lay out spatially.
                max_val = max(temp);
                output_maps(1:up_to_rows_out,1:up_to_cols_out,i,m) = reshape(max_val, up_to_rows_out, up_to_cols_out);
            end
        end
        % A bit of a hack for non-even number of rows or columns
        if(orig_cols ~= up_to_cols)
            % Remaining right-edge strip not covered by full windows.
            span = orig_cols - (up_to_cols - kernel_size + stride);
            % NOTE(review): i and m here hold their final values from the
            % loop above; the slice is only used for its SIZE when building
            % indices, so the stale values are harmless — but fragile.
            inds_pooling = im2col_inds(input_maps(1:up_to_rows,end-span+1:end,i,m), [kernel_size, span]);
            inds_pooling = inds_pooling(:, 1:stride:end);
            for m=1:size(input_maps,4)
                for i=1:size(input_maps,3)
                    temp = input_maps(1:up_to_rows,end-span+1:end,i,m);
                    max_val = max(temp(inds_pooling));
                    output_maps(1:up_to_rows_out,end,i,m) = max_val;
                end
            end
        end

        if(orig_rows ~= up_to_rows)
            % Remaining bottom-edge strip not covered by full windows.
            span = orig_rows - (up_to_rows - kernel_size + stride);
            % NOTE(review): same stale i/m size-only usage as above.
            inds_pooling = im2col_inds(input_maps(end-span+1:end, 1:up_to_cols,i,m), [span, kernel_size]);
            inds_pooling = inds_pooling(:, 1:stride:end);

            for m=1:size(input_maps,4)
                for i=1:size(input_maps,3)
                    temp = input_maps(end-span+1:end, 1:up_to_cols,i,m);
                    max_val = max(temp(inds_pooling));

                    output_maps(end, 1:up_to_cols_out,i,m) = max_val;
                end
            end
        end

        if(orig_cols ~= up_to_cols && orig_rows ~= up_to_rows)
            % Bottom-right corner window, left over in both directions.
            for m=1:size(input_maps,4)
                for i=1:size(input_maps,3)
                    tmp = input_maps(up_to_rows- kernel_size + stride + 1:end,up_to_cols - kernel_size + stride+1:end,i,m);
                    output_maps(end,end,i,m) = max(tmp(:));
                end
            end
        end

    end

end
function pick = non_maximum_supression(boxes, overlap_threshold, type)
%NON_MAXIMUM_SUPRESSION Greedy non-maximum suppression of detection boxes.
%   boxes             - N x 5 matrix [x1 y1 x2 y2 score], one row per box
%   overlap_threshold - boxes overlapping a kept box by more than this are
%                       discarded
%   type              - 'Min' normalises intersection by the smaller box
%                       area; anything else uses intersection-over-union
%   pick              - indices (into boxes) of the kept boxes, highest
%                       score first

    if isempty(boxes)
        pick = [];
        return;
    end

    % Box corners and scores.
    x1 = boxes(:,1);
    y1 = boxes(:,2);
    x2 = boxes(:,3);
    y2 = boxes(:,4);
    s = boxes(:,5);
    area = (x2-x1+1) .* (y2-y1+1);

    % Sort ascending by confidence; the best box sits at the end of I.
    [~, I] = sort(s);

    % Preallocate for the worst case (no box suppressed).
    pick = zeros(numel(s),1);

    counter = 1;
    while ~isempty(I)
        % Keep the highest-scoring remaining box.
        last = length(I);
        i = I(last);
        pick(counter) = i;
        counter = counter + 1;

        % Intersection of the kept box with all other remaining boxes.
        xx1 = max(x1(i), x1(I(1:last-1)));
        yy1 = max(y1(i), y1(I(1:last-1)));
        xx2 = min(x2(i), x2(I(1:last-1)));
        yy2 = min(y2(i), y2(I(1:last-1)));
        w = max(0.0, xx2-xx1+1);
        h = max(0.0, yy2-yy1+1);
        inter = w.*h;

        if strcmp(type,'Min')
            o = inter ./ min(area(i),area(I(1:last-1)));
        else
            % Intersection over union.
            o = inter ./ (area(i) + area(I(1:last-1)) - inter);
        end

        % Retain boxes below the overlap threshold. The logical index is
        % one element shorter than I, so the current box (last slot) is
        % implicitly dropped as well.
        I = I(o <= overlap_threshold);
    end

    pick = pick(1:(counter-1));
end
function [bbox_out] = rectify(bbox_in)
%RECTIFY Expand bounding boxes to squares, preserving their centres.
%   bbox_in  - N x 4 matrix [min_x min_y max_x max_y], one row per box
%   bbox_out - N x 4 matrix of square boxes whose side is the longer of
%              the input box's width and height

    heights = bbox_in(:,4) - bbox_in(:,2);
    widths = bbox_in(:,3) - bbox_in(:,1);

    % Elementwise max of the two column vectors (clearer and safer than
    % the transpose round-trip max([widths'; heights'])').
    max_side = max(widths, heights);

    % Shift each origin so the box grows symmetrically around its centre.
    new_min_x = bbox_in(:,1) + 0.5 * (widths - max_side);
    new_min_y = bbox_in(:,2) + 0.5 * (heights - max_side);

    bbox_out = [new_min_x, new_min_y, new_min_x + max_side, new_min_y + max_side];
end