diff --git a/exe/FaceLandmarkVid/FaceLandmarkVid.cpp b/exe/FaceLandmarkVid/FaceLandmarkVid.cpp index a17e6f5f..22a2a790 100644 --- a/exe/FaceLandmarkVid/FaceLandmarkVid.cpp +++ b/exe/FaceLandmarkVid/FaceLandmarkVid.cpp @@ -37,6 +37,9 @@ #include "LandmarkCoreIncludes.h" #include "GazeEstimation.h" +// TODO rem +#include "FaceDetectorMTCNN.h" + #include #include @@ -158,6 +161,14 @@ int main (int argc, char **argv) LandmarkDetector::FaceModelParameters det_parameters(arguments); + // Testing out the MTCNN readin + cv::Mat in_img = cv::imread("C:/Users/tbaltrus/Documents/OpenFace/matlab_version/face_detection/mtcnn/test1.jpg", 1); + LandmarkDetector::FaceDetectorMTCNN face_detector; + face_detector.Read(det_parameters.mtcnn_face_detector_location); + std::vector > regions; + std::vector confs; + face_detector.DetectFaces(regions, in_img, confs, 30); + // Get the input output file parameters // Indicates that rotation should be with respect to world or camera coordinates diff --git a/lib/local/LandmarkDetector/include/FaceDetectorMTCNN.h b/lib/local/LandmarkDetector/include/FaceDetectorMTCNN.h index 3fe432b1..89aa9f3d 100644 --- a/lib/local/LandmarkDetector/include/FaceDetectorMTCNN.h +++ b/lib/local/LandmarkDetector/include/FaceDetectorMTCNN.h @@ -82,7 +82,7 @@ namespace LandmarkDetector CNN(const CNN& other); // Given an image, orientation and detected landmarks output the result of the appropriate regressor - cv::Mat_ Inference(const cv::Mat_& input_img); + std::vector > Inference(const cv::Mat& input_img); // Reading in the model void Read(string location); @@ -123,7 +123,7 @@ namespace LandmarkDetector FaceDetectorMTCNN(const FaceDetectorMTCNN& other); // Given an image, orientation and detected landmarks output the result of the appropriate regressor - bool DetectFaces(vector >& o_regions, const cv::Mat_& input_img, std::vector& o_confidences, int min_face = 30, double t1 = 0.6, double t2 = 0.7, double t3 = 0.7); + bool DetectFaces(vector >& o_regions, const 
cv::Mat& input_img, std::vector& o_confidences, int min_face = 30, double t1 = 0.6, double t2 = 0.7, double t3 = 0.7); // Reading in the model void Read(string location); diff --git a/lib/local/LandmarkDetector/src/FaceDetectorMTCNN.cpp b/lib/local/LandmarkDetector/src/FaceDetectorMTCNN.cpp index 0594aab3..5b88c660 100644 --- a/lib/local/LandmarkDetector/src/FaceDetectorMTCNN.cpp +++ b/lib/local/LandmarkDetector/src/FaceDetectorMTCNN.cpp @@ -136,7 +136,7 @@ CNN::CNN(const CNN& other) : cnn_layer_types(other.cnn_layer_types), cnn_max_poo } } -cv::Mat_ CNN::Inference(const cv::Mat_& input_img) +std::vector> CNN::Inference(const cv::Mat& input_img) { if (input_img.channels() == 1) { @@ -148,13 +148,20 @@ cv::Mat_ CNN::Inference(const cv::Mat_& input_img) int prelu_layer = 0; int max_pool_layer = 0; + // Split a BGR image into three channels + cv::Mat channels[3]; + cv::split(input_img, channels); + vector > input_maps; - input_maps.push_back(input_img); + input_maps.push_back(channels[2]); + input_maps.push_back(channels[1]); + input_maps.push_back(channels[0]); vector > outputs; for (size_t layer = 0; layer < cnn_layer_types.size(); ++layer) { + // Determine layer type int layer_type = cnn_layer_types[layer]; @@ -216,7 +223,7 @@ cv::Mat_ CNN::Inference(const cv::Mat_& input_img) } if (layer_type == 1) { - vector> outputs_sub; + vector > outputs_sub; int stride_x = std::get<2>(cnn_max_pooling_layers[max_pool_layer]); int stride_y = std::get<3>(cnn_max_pooling_layers[max_pool_layer]); @@ -270,27 +277,35 @@ cv::Mat_ CNN::Inference(const cv::Mat_& input_img) { cv::Mat_ add = input_maps[in].t(); add = add.reshape(0, 1); - cv::hconcat(input_concat, add, input_concat); + cv::vconcat(input_concat, add, input_concat); } - input_concat = input_concat * cnn_fully_connected_layers_weights[fully_connected_layer]; - input_concat = input_concat + cnn_fully_connected_layers_biases[fully_connected_layer].t(); + input_concat = input_concat.t() * 
cnn_fully_connected_layers_weights[fully_connected_layer]; + + for (size_t k = 0; k < cnn_fully_connected_layers_biases[fully_connected_layer].rows; ++k) + { + input_concat.col(k) = input_concat.col(k) + cnn_fully_connected_layers_biases[fully_connected_layer].at(k); + } outputs.clear(); outputs.push_back(input_concat); fully_connected_layer++; } - if (layer_type == 3) // PReLU, TODO + if (layer_type == 3) // PReLU { outputs.clear(); for (size_t k = 0; k < input_maps.size(); ++k) { - // Apply the ReLU - cv::threshold(input_maps[k], input_maps[k], 0, 0, cv::THRESH_TOZERO); - outputs.push_back(input_maps[k]); + // Apply the PReLU + cv::Mat_ pos; + cv::threshold(input_maps[k], pos, 0, 0, cv::THRESH_TOZERO); + cv::Mat_ neg; + cv::threshold(input_maps[k], neg, 0, 0, cv::THRESH_TOZERO_INV); + outputs.push_back(pos + neg * cnn_prelu_layer_weights[prelu_layer].at(k)); } + prelu_layer++; } if (layer_type == 4) { @@ -307,11 +322,19 @@ cv::Mat_ CNN::Inference(const cv::Mat_& input_img) } // Set the outputs of this layer to inputs of the next input_maps = outputs; + + // TODO rem + //cv::Mat to_vis = input_maps[0]; + //cout << to_vis << endl; + //double min, max; + //cv::minMaxIdx(to_vis, &min, &max); + //cv::imshow("image 1", (to_vis - min)/(max-min)); + //cv::waitKey(0); } - return outputs[0]; + return outputs; } @@ -491,11 +514,11 @@ void FaceDetectorMTCNN::Read(string location) } // The actual MTCNN face detection step -bool DetectFaces(vector >& o_regions, const cv::Mat_& input_img, std::vector& o_confidences, int min_face_size = 30, double t1 = 0.6, double t2 = 0.7, double t3 = 0.7) +bool FaceDetectorMTCNN::DetectFaces(vector >& o_regions, const cv::Mat& input_img, std::vector& o_confidences, int min_face_size, double t1, double t2, double t3) { - int height_orig = input_img.rows; - int width_orig = input_img.cols; + int height_orig = input_img.size().height; + int width_orig = input_img.size().width; // Size ratio of image pyramids double pyramid_factor = 0.709; @@ 
-505,21 +528,41 @@ bool DetectFaces(vector >& o_regions, const cv::Mat_& i int min_dim = std::min(height_orig, width_orig); int face_support = 12; - int num_scales = floor(log(min_face_size / min_dim) / log(pyramid_factor)) + 1; + int num_scales = floor(log((double)min_face_size / (double)min_dim) / log(pyramid_factor)) + 1; + + if (input_img.channels() == 1) + { + cv::cvtColor(input_img, input_img, CV_GRAY2RGB); + } + + cv::Mat img_float; + input_img.convertTo(img_float, CV_32FC3); for (int i = 0; i < num_scales; ++i) { - double scale = (face_support / min_face_size)*cv::pow(pyramid_factor, i); + double scale = ((double)face_support / (double)min_face_size)*cv::pow(pyramid_factor, i); int h_pyr = ceil(height_orig * scale); int w_pyr = ceil(width_orig * scale); - cv::Mat_ normalised_img; - cv::resize(input_img, normalised_img, cv::Size(w_pyr, h_pyr)); + cv::Mat normalised_img; + cv::resize(img_float, normalised_img, cv::Size(w_pyr, h_pyr)); normalised_img = (normalised_img - 127.5) * 0.0078125; + std::vector > pnet_out = PNet.Inference(normalised_img); + + // TODO resize appropriately the output + + cv::Mat_ out_prob; + cv::exp(pnet_out[0]- pnet_out[1], out_prob); + out_prob = 1.0 / (1.0 + out_prob); + + cv::imshow("out_map", out_prob); + cv::waitKey(0); + } + return true; }