diff --git a/exe/FaceLandmarkImg/FaceLandmarkImg.cpp b/exe/FaceLandmarkImg/FaceLandmarkImg.cpp index 9d424fe0..b5f018cc 100644 --- a/exe/FaceLandmarkImg/FaceLandmarkImg.cpp +++ b/exe/FaceLandmarkImg/FaceLandmarkImg.cpp @@ -35,6 +35,8 @@ #include "LandmarkCoreIncludes.h" +#include "FaceDetectorMTCNN.h" + // System includes #include @@ -336,6 +338,7 @@ int main (int argc, char **argv) cv::CascadeClassifier classifier(det_parameters.haar_face_detector_location); dlib::frontal_face_detector face_detector_hog = dlib::get_frontal_face_detector(); + LandmarkDetector::FaceDetectorMTCNN face_detector_mtcnn(det_parameters.mtcnn_face_detector_location); // Loading the AU prediction models string au_loc = "AU_predictors/AU_all_static.txt"; @@ -432,11 +435,15 @@ int main (int argc, char **argv) vector confidences; LandmarkDetector::DetectFacesHOG(face_detections, grayscale_image, face_detector_hog, confidences); } - else + else if(det_parameters.curr_face_detector == LandmarkDetector::FaceModelParameters::HAAR_DETECTOR) { LandmarkDetector::DetectFaces(face_detections, grayscale_image, classifier); } - + else + { + vector confidences; + LandmarkDetector::DetectFacesMTCNN(face_detections, grayscale_image, face_detector_mtcnn, confidences); + } // Detect landmarks around detected faces int face_det = 0; // perform landmark detection for every face detected diff --git a/exe/FaceLandmarkVid/FaceLandmarkVid.cpp b/exe/FaceLandmarkVid/FaceLandmarkVid.cpp index 22a2a790..a17e6f5f 100644 --- a/exe/FaceLandmarkVid/FaceLandmarkVid.cpp +++ b/exe/FaceLandmarkVid/FaceLandmarkVid.cpp @@ -37,9 +37,6 @@ #include "LandmarkCoreIncludes.h" #include "GazeEstimation.h" -// TODO rem -#include "FaceDetectorMTCNN.h" - #include #include @@ -161,14 +158,6 @@ int main (int argc, char **argv) LandmarkDetector::FaceModelParameters det_parameters(arguments); - // Testing out the MTCNN readin - cv::Mat in_img = cv::imread("C:/Users/tbaltrus/Documents/OpenFace/matlab_version/face_detection/mtcnn/test1.jpg", 1); 
- LandmarkDetector::FaceDetectorMTCNN face_detector; - face_detector.Read(det_parameters.mtcnn_face_detector_location); - std::vector > regions; - std::vector confs; - face_detector.DetectFaces(regions, in_img, confs, 30); - // Get the input output file parameters // Indicates that rotation should be with respect to world or camera coordinates diff --git a/exe/FaceLandmarkVidMulti/FaceLandmarkVidMulti.cpp b/exe/FaceLandmarkVidMulti/FaceLandmarkVidMulti.cpp index b21ec431..c735cec8 100644 --- a/exe/FaceLandmarkVidMulti/FaceLandmarkVidMulti.cpp +++ b/exe/FaceLandmarkVidMulti/FaceLandmarkVidMulti.cpp @@ -120,7 +120,7 @@ int main (int argc, char **argv) // This is so that the model would not try re-initialising itself det_params.reinit_video_every = -1; - det_params.curr_face_detector = LandmarkDetector::FaceModelParameters::HOG_SVM_DETECTOR; + det_params.curr_face_detector = LandmarkDetector::FaceModelParameters::MTCNN_DETECTOR; vector det_parameters; det_parameters.push_back(det_params); @@ -271,11 +271,15 @@ int main (int argc, char **argv) vector confidences; LandmarkDetector::DetectFacesHOG(face_detections, grayscale_image, clnf_models[0].face_detector_HOG, confidences); } - else + else if(det_parameters[0].curr_face_detector == LandmarkDetector::FaceModelParameters::HAAR_DETECTOR) { LandmarkDetector::DetectFaces(face_detections, grayscale_image, clnf_models[0].face_detector_HAAR); } - + else + { + vector confidences; + LandmarkDetector::DetectFacesMTCNN(face_detections, grayscale_image, clnf_models[0].face_detector_MTCNN, confidences); + } } // Keep only non overlapping detections (also convert to a concurrent vector diff --git a/lib/local/LandmarkDetector/include/FaceDetectorMTCNN.h b/lib/local/LandmarkDetector/include/FaceDetectorMTCNN.h index 441e0cf9..5ce11298 100644 --- a/lib/local/LandmarkDetector/include/FaceDetectorMTCNN.h +++ b/lib/local/LandmarkDetector/include/FaceDetectorMTCNN.h @@ -85,11 +85,13 @@ namespace LandmarkDetector std::vector > 
Inference(const cv::Mat& input_img, bool direct = true); // Reading in the model - void Read(string location); + void Read(const string& location); // Clearing precomputed DFTs void ClearPrecomp(); + size_t NumberOfLayers() { return cnn_layer_types.size(); } + private: //========================================== // Convolutional Neural Network @@ -126,14 +128,19 @@ namespace LandmarkDetector // Default constructor FaceDetectorMTCNN() { ; } + FaceDetectorMTCNN(const string& location); + // Copy constructor FaceDetectorMTCNN(const FaceDetectorMTCNN& other); // Given an image, orientation and detected landmarks output the result of the appropriate regressor - bool DetectFaces(vector >& o_regions, const cv::Mat& input_img, std::vector& o_confidences, int min_face = 30, double t1 = 0.6, double t2 = 0.7, double t3 = 0.7); + bool DetectFaces(vector >& o_regions, const cv::Mat& input_img, std::vector& o_confidences, int min_face = 60, double t1 = 0.6, double t2 = 0.7, double t3 = 0.7); // Reading in the model - void Read(string location); + void Read(const string& location); + + // Indicate if the model has been read in + bool empty() { return PNet.NumberOfLayers() == 0 || RNet.NumberOfLayers() == 0 || ONet.NumberOfLayers() == 0; }; private: //========================================== diff --git a/lib/local/LandmarkDetector/include/LandmarkDetectorModel.h b/lib/local/LandmarkDetector/include/LandmarkDetectorModel.h index 99f8a0e3..695f2299 100644 --- a/lib/local/LandmarkDetector/include/LandmarkDetectorModel.h +++ b/lib/local/LandmarkDetector/include/LandmarkDetectorModel.h @@ -47,6 +47,7 @@ #include "Patch_experts.h" #include "LandmarkDetectionValidator.h" #include "LandmarkDetectorParameters.h" +#include "FaceDetectorMTCNN.h" using namespace std; @@ -90,11 +91,12 @@ public: // Haar cascade classifier for face detection cv::CascadeClassifier face_detector_HAAR; string haar_face_detector_location; - string mtcnn_face_detector_location; - + // A HOG SVM-struct based face 
detector dlib::frontal_face_detector face_detector_HOG; + FaceDetectorMTCNN face_detector_MTCNN; + string mtcnn_face_detector_location; // Validate if the detected landmarks are correct using an SVR regressor DetectionValidator landmark_validator; diff --git a/lib/local/LandmarkDetector/include/LandmarkDetectorParameters.h b/lib/local/LandmarkDetector/include/LandmarkDetectorParameters.h index eda3866a..2e259fbd 100644 --- a/lib/local/LandmarkDetector/include/LandmarkDetectorParameters.h +++ b/lib/local/LandmarkDetector/include/LandmarkDetectorParameters.h @@ -88,7 +88,8 @@ struct FaceModelParameters // Determining which face detector to use for (re)initialisation, HAAR is quicker but provides more false positives and is not goot for in-the-wild conditions // Also HAAR detector can detect smaller faces while HOG SVM is only capable of detecting faces at least 70px across - enum FaceDetector{HAAR_DETECTOR, HOG_SVM_DETECTOR}; + // MTCNN detector is much more accurate that the other two, and is even suitable for profile faces, but it is somewhat slower + enum FaceDetector{HAAR_DETECTOR, HOG_SVM_DETECTOR, MTCNN_DETECTOR}; string haar_face_detector_location; string mtcnn_face_detector_location; diff --git a/lib/local/LandmarkDetector/include/LandmarkDetectorUtils.h b/lib/local/LandmarkDetector/include/LandmarkDetectorUtils.h index d4a360d8..dba711c6 100644 --- a/lib/local/LandmarkDetector/include/LandmarkDetectorUtils.h +++ b/lib/local/LandmarkDetector/include/LandmarkDetectorUtils.h @@ -41,6 +41,8 @@ #include "LandmarkDetectorModel.h" +#include "FaceDetectorMTCNN.h" + using namespace std; namespace LandmarkDetector @@ -130,6 +132,11 @@ namespace LandmarkDetector // The preference point allows for disambiguation if multiple faces are present (pick the closest one), if it is not set the biggest face is chosen bool DetectSingleFaceHOG(cv::Rect_& o_region, const cv::Mat_& intensity, dlib::frontal_face_detector& classifier, double& confidence, const cv::Point preference = 
cv::Point(-1,-1)); + // Face detection using Multi-task Convolutional Neural Network + bool DetectFacesMTCNN(vector >& o_regions, const cv::Mat& image, LandmarkDetector::FaceDetectorMTCNN& detector, std::vector& confidences); + // The preference point allows for disambiguation if multiple faces are present (pick the closest one), if it is not set the biggest face is chosen + bool DetectSingleFaceMTCNN(cv::Rect_& o_region, const cv::Mat& image, LandmarkDetector::FaceDetectorMTCNN& detector, double& confidence, const cv::Point preference = cv::Point(-1, -1)); + //============================================================================ // Matrix reading functionality //============================================================================ diff --git a/lib/local/LandmarkDetector/src/FaceDetectorMTCNN.cpp b/lib/local/LandmarkDetector/src/FaceDetectorMTCNN.cpp index 39af2f1c..67178c2e 100644 --- a/lib/local/LandmarkDetector/src/FaceDetectorMTCNN.cpp +++ b/lib/local/LandmarkDetector/src/FaceDetectorMTCNN.cpp @@ -87,6 +87,11 @@ using namespace LandmarkDetector; +// Constructor from model file location +FaceDetectorMTCNN::FaceDetectorMTCNN(const string& location) +{ + this->Read(location); +} // Copy constructor FaceDetectorMTCNN::FaceDetectorMTCNN(const FaceDetectorMTCNN& other) : PNet(other.PNet), RNet(other.RNet), ONet(other.ONet) { @@ -438,8 +443,8 @@ void im2colBias(const cv::Mat_& input, int width, int height, cv::Mat_(rowIdx, colIdx + 1) = input.at(i + yy, j + xx); @@ -712,7 +717,7 @@ void CNN::ClearPrecomp() } } -void CNN::Read(string location) +void CNN::Read(const string& location) { ifstream cnn_stream(location, ios::in | ios::binary); if (cnn_stream.is_open()) @@ -844,7 +849,7 @@ void CNN::Read(string location) //=========================================================================== // Read in the MTCNN detector -void FaceDetectorMTCNN::Read(string location) +void FaceDetectorMTCNN::Read(const string& location) { cout << "Reading the MTCNN face 
detector from: " << location << endl; @@ -1068,11 +1073,11 @@ void apply_correction(vector >& total_bboxes, const vector >& o_regions, const cv::Mat& input_img, std::vector& o_confidences, int min_face_size, double t1, double t2, double t3) +bool FaceDetectorMTCNN::DetectFaces(vector >& o_regions, const cv::Mat& img_in, std::vector& o_confidences, int min_face_size, double t1, double t2, double t3) { - int height_orig = input_img.size().height; - int width_orig = input_img.size().width; + int height_orig = img_in.size().height; + int width_orig = img_in.size().width; // Size ratio of image pyramids double pyramid_factor = 0.709; @@ -1084,9 +1089,15 @@ bool FaceDetectorMTCNN::DetectFaces(vector >& o_regions, const int face_support = 12; int num_scales = floor(log((double)min_face_size / (double)min_dim) / log(pyramid_factor)) + 1; - if (input_img.channels() == 1) + cv::Mat input_img; + + if (img_in.channels() == 1) { - cv::cvtColor(input_img, input_img, CV_GRAY2RGB); + cv::cvtColor(img_in, input_img, CV_GRAY2RGB); + } + else + { + input_img = img_in; } cv::Mat img_float; diff --git a/lib/local/LandmarkDetector/src/LandmarkDetectorFunc.cpp b/lib/local/LandmarkDetector/src/LandmarkDetectorFunc.cpp index 68238899..5d24478a 100644 --- a/lib/local/LandmarkDetector/src/LandmarkDetectorFunc.cpp +++ b/lib/local/LandmarkDetector/src/LandmarkDetectorFunc.cpp @@ -317,6 +317,11 @@ bool LandmarkDetector::DetectLandmarksInVideo(const cv::Mat_ &grayscale_i clnf_model.face_detector_HAAR.load(params.haar_face_detector_location); clnf_model.haar_face_detector_location = params.haar_face_detector_location; } + if (clnf_model.face_detector_MTCNN.empty() && params.curr_face_detector == FaceModelParameters::MTCNN_DETECTOR) + { + clnf_model.face_detector_MTCNN.Read(params.mtcnn_face_detector_location); + clnf_model.mtcnn_face_detector_location = params.mtcnn_face_detector_location; + } cv::Point preference_det(-1, -1); if(clnf_model.preference_det.x != -1 && clnf_model.preference_det.y != -1) @@ -336,6 +341,11 @@ bool
LandmarkDetector::DetectLandmarksInVideo(const cv::Mat_ &grayscale_i { face_detection_success = LandmarkDetector::DetectSingleFace(bounding_box, grayscale_image, clnf_model.face_detector_HAAR, preference_det); } + else if (params.curr_face_detector == FaceModelParameters::MTCNN_DETECTOR) + { + double confidence; + face_detection_success = LandmarkDetector::DetectSingleFaceMTCNN(bounding_box, grayscale_image, clnf_model.face_detector_MTCNN, confidence, preference_det); + } // Attempt to detect landmarks using the detected face (if unseccessful the detection will be ignored) if(face_detection_success) @@ -527,12 +537,17 @@ bool LandmarkDetector::DetectLandmarksInImage(const cv::Mat_ &grayscale_i cv::Rect_ bounding_box; // If the face detector has not been initialised read it in - if(clnf_model.face_detector_HAAR.empty()) + if(clnf_model.face_detector_HAAR.empty() && params.curr_face_detector == FaceModelParameters::HAAR_DETECTOR) { clnf_model.face_detector_HAAR.load(params.haar_face_detector_location); clnf_model.haar_face_detector_location = params.haar_face_detector_location; } - + + if (clnf_model.face_detector_MTCNN.empty() && params.curr_face_detector == FaceModelParameters::MTCNN_DETECTOR) + { + clnf_model.face_detector_MTCNN.Read(params.mtcnn_face_detector_location); + } + // Detect the face first if(params.curr_face_detector == FaceModelParameters::HOG_SVM_DETECTOR) { diff --git a/lib/local/LandmarkDetector/src/LandmarkDetectorModel.cpp b/lib/local/LandmarkDetector/src/LandmarkDetectorModel.cpp index 4900602c..7463f587 100644 --- a/lib/local/LandmarkDetector/src/LandmarkDetectorModel.cpp +++ b/lib/local/LandmarkDetector/src/LandmarkDetectorModel.cpp @@ -69,7 +69,7 @@ CLNF::CLNF(string fname) CLNF::CLNF(const CLNF& other): pdm(other.pdm), params_local(other.params_local.clone()), params_global(other.params_global), detected_landmarks(other.detected_landmarks.clone()), landmark_likelihoods(other.landmark_likelihoods.clone()), patch_experts(other.patch_experts), 
landmark_validator(other.landmark_validator), haar_face_detector_location(other.haar_face_detector_location), mtcnn_face_detector_location(other.mtcnn_face_detector_location), hierarchical_mapping(other.hierarchical_mapping), hierarchical_models(other.hierarchical_models), hierarchical_model_names(other.hierarchical_model_names), - hierarchical_params(other.hierarchical_params), eye_model(other.eye_model) + hierarchical_params(other.hierarchical_params), eye_model(other.eye_model), face_detector_MTCNN(other.face_detector_MTCNN) { this->detection_success = other.detection_success; this->tracking_initialised = other.tracking_initialised; @@ -150,9 +150,12 @@ CLNF & CLNF::operator= (const CLNF& other) this->hierarchical_models = other.hierarchical_models; this->hierarchical_model_names = other.hierarchical_model_names; this->hierarchical_params = other.hierarchical_params; + + mtcnn_face_detector_location = other.mtcnn_face_detector_location; + face_detector_MTCNN = other.face_detector_MTCNN; } - face_detector_HOG = dlib::get_frontal_face_detector(); + face_detector_HOG = dlib::get_frontal_face_detector(); return *this; } @@ -183,6 +186,8 @@ CLNF::CLNF(const CLNF&& other) face_detector_HOG = dlib::get_frontal_face_detector(); + face_detector_MTCNN = other.face_detector_MTCNN; + // Copy over the hierarchical models this->hierarchical_mapping = other.hierarchical_mapping; this->hierarchical_models = other.hierarchical_models; @@ -219,6 +224,8 @@ CLNF & CLNF::operator= (const CLNF&& other) face_detector_HOG = dlib::get_frontal_face_detector(); + face_detector_MTCNN = other.face_detector_MTCNN; + // Copy over the hierarchical models this->hierarchical_mapping = other.hierarchical_mapping; this->hierarchical_models = other.hierarchical_models; diff --git a/lib/local/LandmarkDetector/src/LandmarkDetectorParameters.cpp b/lib/local/LandmarkDetector/src/LandmarkDetectorParameters.cpp index b6a6e3f0..0ba0b329 100644 --- 
a/lib/local/LandmarkDetector/src/LandmarkDetectorParameters.cpp +++ b/lib/local/LandmarkDetector/src/LandmarkDetectorParameters.cpp @@ -176,7 +176,7 @@ FaceModelParameters::FaceModelParameters(vector &arguments) valid[i] = false; // For in-the-wild images use an in-the wild detector - curr_face_detector = HOG_SVM_DETECTOR; + curr_face_detector = MTCNN_DETECTOR; } } @@ -306,8 +306,8 @@ void FaceModelParameters::init() mtcnn_face_detector_location = "model/mtcnn_detector/MTCNN_detector.txt"; quiet_mode = false; - // By default use HOG SVM - curr_face_detector = HOG_SVM_DETECTOR; + // By default use MTCNN + curr_face_detector = MTCNN_DETECTOR; // The gaze tracking has to be explicitly initialised track_gaze = false; diff --git a/lib/local/LandmarkDetector/src/LandmarkDetectorUtils.cpp b/lib/local/LandmarkDetector/src/LandmarkDetectorUtils.cpp index 76c09548..84029f70 100644 --- a/lib/local/LandmarkDetector/src/LandmarkDetectorUtils.cpp +++ b/lib/local/LandmarkDetector/src/LandmarkDetectorUtils.cpp @@ -1474,6 +1474,80 @@ bool DetectSingleFaceHOG(cv::Rect_& o_region, const cv::Mat_& int return detect_success; } +bool DetectFacesMTCNN(vector >& o_regions, const cv::Mat& image, LandmarkDetector::FaceDetectorMTCNN& detector, std::vector& o_confidences) +{ + detector.DetectFaces(o_regions, image, o_confidences); + + return o_regions.size() > 0; +} + +bool DetectSingleFaceMTCNN(cv::Rect_& o_region, const cv::Mat& image, LandmarkDetector::FaceDetectorMTCNN& detector, double& confidence, cv::Point preference) +{ + // The detector can return multiple faces + vector > face_detections; + vector confidences; + + detector.DetectFaces(face_detections, image, confidences); + + bool detect_success = face_detections.size() > 0; + if (detect_success) + { + + bool use_preferred = (preference.x != -1) && (preference.y != -1); + + // keep the most confident one or the one closest to preference point if set + double best_so_far; + if (use_preferred) + { + best_so_far = sqrt((preference.x - 
(face_detections[0].width / 2 + face_detections[0].x)) * (preference.x - (face_detections[0].width / 2 + face_detections[0].x)) + + (preference.y - (face_detections[0].height / 2 + face_detections[0].y)) * (preference.y - (face_detections[0].height / 2 + face_detections[0].y))); + } + else + { + best_so_far = confidences[0]; + } + int bestIndex = 0; + + for (size_t i = 1; i < face_detections.size(); ++i) + { + + double dist; + bool better; + + if (use_preferred) + { + dist = sqrt((preference.x - (face_detections[i].width / 2 + face_detections[i].x)) * (preference.x - (face_detections[i].width / 2 + face_detections[i].x)) + + (preference.y - (face_detections[i].height / 2 + face_detections[i].y)) * (preference.y - (face_detections[i].height / 2 + face_detections[i].y))); + better = dist < best_so_far; + } + else + { + dist = confidences[i]; + better = dist > best_so_far; + } + + // Keep detection i if it is closer to the preference point (or more confident when no preference is set) + if (better) + { + best_so_far = dist; + bestIndex = i; + } + } + + o_region = face_detections[bestIndex]; + confidence = confidences[bestIndex]; + } + else + { + // if not detected + o_region = cv::Rect_(0, 0, 0, 0); + // A completely unreliable detection (shouldn't really matter what is returned here) + confidence = -2; + } + return detect_success; +} + + //============================================================================ // Matrix reading functionality //============================================================================