From d062ab690dcb4f38e42d249eb006908c525954f9 Mon Sep 17 00:00:00 2001 From: Tadas Baltrusaitis Date: Tue, 15 Aug 2017 21:53:25 +0100 Subject: [PATCH] Final bit of simplification and fixing an issue with RNet and ONet. --- .../include/FaceDetectorMTCNN.h | 6 +- .../src/FaceDetectorMTCNN.cpp | 98 +++++++++++-------- 2 files changed, 59 insertions(+), 45 deletions(-) diff --git a/lib/local/LandmarkDetector/include/FaceDetectorMTCNN.h b/lib/local/LandmarkDetector/include/FaceDetectorMTCNN.h index 89aa9f3d..cfe7a656 100644 --- a/lib/local/LandmarkDetector/include/FaceDetectorMTCNN.h +++ b/lib/local/LandmarkDetector/include/FaceDetectorMTCNN.h @@ -87,15 +87,17 @@ namespace LandmarkDetector // Reading in the model void Read(string location); + // Precomputations for faster convolution + vector > > > > cnn_convolutional_layers_dft; + private: //========================================== // Convolutional Neural Network // CNN layers // layer -> input maps -> kernels - vector > > > cnn_convolutional_layers; // Bit ugly with so much nesting, but oh well - vector > > > > cnn_convolutional_layers_dft; + vector > > > cnn_convolutional_layers; vector > cnn_convolutional_layers_bias; vector > cnn_fully_connected_layers_weights; vector > cnn_fully_connected_layers_biases; diff --git a/lib/local/LandmarkDetector/src/FaceDetectorMTCNN.cpp b/lib/local/LandmarkDetector/src/FaceDetectorMTCNN.cpp index 31ab963f..6f95a188 100644 --- a/lib/local/LandmarkDetector/src/FaceDetectorMTCNN.cpp +++ b/lib/local/LandmarkDetector/src/FaceDetectorMTCNN.cpp @@ -136,8 +136,43 @@ CNN::CNN(const CNN& other) : cnn_layer_types(other.cnn_layer_types), cnn_max_poo } } +void PReLU(std::vector >& outputs, const std::vector >& input_maps, cv::Mat_ prelu_weights) +{ + outputs.clear(); + if (input_maps.size() > 1) + { + for (size_t k = 0; k < input_maps.size(); ++k) + { + // Apply the PReLU + cv::Mat_ pos; + cv::threshold(input_maps[k], pos, 0, 0, cv::THRESH_TOZERO); + cv::Mat_ neg; + cv::threshold(input_maps[k], neg, 0, 0, cv::THRESH_TOZERO_INV); + outputs.push_back(pos + neg * prelu_weights.at(k)); + + } + } + else + { + cv::Mat_ pos(input_maps[0].size(), 0.0); + cv::Mat_ neg(input_maps[0].size(), 0.0); + for (size_t k = 0; k < prelu_weights.rows; ++k) + { + // Apply the PReLU + cv::threshold(input_maps[0].row(k), pos.row(k), 0, 0, cv::THRESH_TOZERO); + cv::threshold(input_maps[0].row(k), neg.row(k), 0, 0, cv::THRESH_TOZERO_INV); + neg.row(k) = neg.row(k) * prelu_weights.at(k); + } + outputs.push_back(pos + neg); + + } + +} + void fully_connected(std::vector >& outputs, const std::vector >& input_maps, cv::Mat_ weights, cv::Mat_ biases) { + outputs.clear(); + if (input_maps.size() > 1) { // Concatenate all the maps @@ -335,7 +370,6 @@ std::vector> CNN::Inference(const cv::Mat& input_img) // Convolutional layer if (layer_type == 0) { - outputs.clear(); convolution_fft(outputs, input_maps, cnn_convolutional_layers[cnn_layer], cnn_convolutional_layers_bias[cnn_layer], cnn_convolutional_layers_dft[cnn_layer]); cnn_layer++; @@ -354,48 +388,12 @@ std::vector> CNN::Inference(const cv::Mat& input_img) } if (layer_type == 2) { - fully_connected(outputs, input_maps, cnn_fully_connected_layers_weights[fully_connected_layer], cnn_fully_connected_layers_biases[fully_connected_layer]); fully_connected_layer++; } if (layer_type == 3) // PReLU { - outputs.clear(); - if(input_maps.size() > 1) - { - for (size_t k = 0; k < input_maps.size(); ++k) - { - // Apply the PReLU - cv::Mat_ pos; - cv::threshold(input_maps[k], pos, 0, 0, cv::THRESH_TOZERO); - cv::Mat_ neg; - cv::threshold(input_maps[k], neg, 0, 0, cv::THRESH_TOZERO_INV); - outputs.push_back(pos + neg * cnn_prelu_layer_weights[prelu_layer].at(k)); - - } - } - else - { - cv::Mat_ pos(input_maps[0].size(), 0.0); - cv::Mat_ neg(input_maps[0].size(), 0.0); - for (size_t k = 0; k < cnn_prelu_layer_weights[prelu_layer].rows; ++k) - { - // Apply the PReLU - cv::threshold(input_maps[0].row(k), pos.row(k), 0, 0, cv::THRESH_TOZERO); - cv::threshold(input_maps[0].row(k), neg.row(k), 0, 0, cv::THRESH_TOZERO_INV); - neg.row(k) = neg.row(k) * cnn_prelu_layer_weights[prelu_layer].at(k); - } - outputs.push_back(pos + neg); - - } - - //float diff = 0.0; - //for (size_t k = 0; k < outs.size(); ++k) - //{ - // diff += cv::mean(cv::abs(outputs[k] - outs[k]))[0]; - //} - //cout << diff << endl; - + PReLU(outputs, input_maps, cnn_prelu_layer_weights[prelu_layer]); prelu_layer++; } if (layer_type == 4) @@ -668,7 +666,7 @@ void select_subset(const vector& to_keep, vector >& boundi scores_tmp.push_back(scores[to_keep[i]]); corrections_tmp.push_back(corrections[to_keep[i]]); } - + bounding_boxes = bounding_boxes_tmp; scores = scores_tmp; corrections = corrections_tmp; @@ -805,7 +803,19 @@ bool FaceDetectorMTCNN::DetectFaces(vector >& o_regions, const // Actual PNet CNN step std::vector > pnet_out = PNet.Inference(normalised_img); - + + // Clear the precomputations, as the image sizes will be different (TODO could be useful for videos) + for (size_t k1 = 0; k1 < PNet.cnn_convolutional_layers_dft.size(); ++k1) + { + for (size_t k2 = 0; k2 < PNet.cnn_convolutional_layers_dft[k1].size(); ++k2) + { + for (size_t k3 = 0; k3 < PNet.cnn_convolutional_layers_dft[k1][k2].size(); ++k3) + { + PNet.cnn_convolutional_layers_dft[k1][k2][k3].second = cv::Mat_(0, 0, 0.0); + } + } + } + // Extract the probabilities from PNet response cv::Mat_ prob_heatmap; cv::exp(pnet_out[0]- pnet_out[1], prob_heatmap); @@ -860,7 +870,7 @@ bool FaceDetectorMTCNN::DetectFaces(vector >& o_regions, const int end_x_out = cv::min(width_target - (proposal_boxes_all[k].x + proposal_boxes_all[k].width - width_orig), width_target); int end_y_out = cv::min(height_target - (proposal_boxes_all[k].y + proposal_boxes_all[k].height - height_orig), height_target); - cv::Mat tmp(height_target, width_target, CV_32FC3); + cv::Mat tmp(height_target, width_target, CV_32FC3, cv::Scalar(0.0f,0.0f,0.0f)); img_float(cv::Rect(start_x_in, start_y_in, end_x_in - start_x_in, end_y_in - start_y_in)).copyTo( tmp(cv::Rect(start_x_out, start_y_out, end_x_out - start_x_out, end_y_out - start_y_out))); @@ -883,6 +893,7 @@ bool FaceDetectorMTCNN::DetectFaces(vector >& o_regions, const { to_keep.push_back(k); } + } // Pick only the bounding boxes above the threshold @@ -899,6 +910,7 @@ bool FaceDetectorMTCNN::DetectFaces(vector >& o_regions, const // Preparing for the ONet stage to_keep.clear(); + for (size_t k = 0; k < proposal_boxes_all.size(); ++k) { float width_target = proposal_boxes_all[k].width + 1; @@ -916,7 +928,7 @@ bool FaceDetectorMTCNN::DetectFaces(vector >& o_regions, const int end_x_out = cv::min(width_target - (proposal_boxes_all[k].x + proposal_boxes_all[k].width - width_orig), width_target); int end_y_out = cv::min(height_target - (proposal_boxes_all[k].y + proposal_boxes_all[k].height - height_orig), height_target); - cv::Mat tmp(height_target, width_target, CV_32FC3); + cv::Mat tmp(height_target, width_target, CV_32FC3, cv::Scalar(0.0f, 0.0f, 0.0f)); img_float(cv::Rect(start_x_in, start_y_in, end_x_in - start_x_in, end_y_in - start_y_in)).copyTo( tmp(cv::Rect(start_x_out, start_y_out, end_x_out - start_x_out, end_y_out - start_y_out))); @@ -966,7 +978,7 @@ bool FaceDetectorMTCNN::DetectFaces(vector >& o_regions, const cv::rectangle(disp_img, proposal_boxes_all[k], cv::Scalar(255, 0, 0), 3); } cv::imshow("detections", disp_img); - cv::waitKey(5); + cv::waitKey(20); if(o_regions.size() > 0) {