Initial Matlab version of MTCNN.

2025-12-30 13:02:30 +00:00 · 2017-08-07 14:49:29 -04:00
parent cdd258330f
commit 6a613dffb6
10 changed files with 430 additions and 1 deletions
--- a/matlab_version/face_detection/mtcnn/PNet_mlab.mat
+++ b/matlab_version/face_detection/mtcnn/PNet_mlab.mat
--- a/matlab_version/face_detection/mtcnn/PReLU.m
+++ b/matlab_version/face_detection/mtcnn/PReLU.m
@@ -0,0 +1,20 @@
+function [ out_map ] = PReLU( input_maps, PReLU_params )
+%PRELU Parametric ReLU: multiply negative activations by a per-channel slope.
+%   Channels run along dim 3 when input_maps has more than two dimensions,
+%   and along columns otherwise; PReLU_params(c) is the slope of channel c.
+
+    % 3 for N-D stacks of maps, 2 for plain matrices
+    chan_dim = 2 + (ndims(input_maps) > 2);
+    out_map = [];
+    for c = 1:size(input_maps, chan_dim)
+        if chan_dim == 3
+            channel = input_maps(:,:,c,:);
+        else
+            channel = input_maps(:,c);
+        end
+        negatives = channel < 0;
+        channel(negatives) = channel(negatives) * PReLU_params(c);
+        out_map = cat(chan_dim, out_map, channel);
+    end
+end
+
--- a/matlab_version/face_detection/mtcnn/convolution.m
+++ b/matlab_version/face_detection/mtcnn/convolution.m
@@ -0,0 +1,22 @@
+function [ output_maps ] = convolution( input_maps, kernels, biases )
+%CONVOLUTION Convolve input maps with a bank of kernels via vl_nnconv.
+%   input_maps is cast to single; kernels and biases are passed unchanged.
+
+%     n_filters = size(kernels, 4);
+
+%     kernels2 = kernels(:,:,end:-1:1,:);
+%     for i=1:n_filters
+%         for n_in_maps=1:size(kernels,3)
+%             kernels2(:,:,n_in_maps,i) = fliplr(squeeze(kernels2(:,:,n_in_maps,i)));
+%             kernels2(:,:,n_in_maps,i) = flipud(squeeze(kernels2(:,:,n_in_maps,i)));
+%         end
+%     end
+%     output_maps_1 = [];
+%     for i=1:n_filters
+%         output_maps_1 = cat(3, output_maps_1, convn(input_maps, kernels2(:,:,:,i), 'valid') + biases(i));
+%     end
+    % Everything commented out above is a disabled convn-based variant.
+    % NOTE(review): vl_nnconv is presumably MatConvNet's - confirm it is on the path.
+    output_maps = vl_nnconv(single(input_maps), kernels, biases);
+end
+
--- a/matlab_version/face_detection/mtcnn/detect_face.m
+++ b/matlab_version/face_detection/mtcnn/detect_face.m
@@ -0,0 +1,178 @@
+clear;
+% Script: runs the PNet -> RNet -> ONet face-detection cascade on test1.jpg
+% Make sure we have the dependencies for convolution
+od = cd('../../face_validation');
+setup;
+cd(od);
+
+img = imread('test1.jpg');
+height_orig = size(img,1);
+width_orig = size(img,2);
+
+% Everything is done in floats
+img = single(img);
+
+% Minimum face size
+min_face_size = 30;
+
+% Image pyramid scaling factor
+factor = 0.709;
+
+% Thresholds for the PNet, RNet, and ONet (in that order)
+threshold=[0.6 0.7 0.7];
+
+min_dim = min([width_orig height_orig]);
+
+% Face support region is 12x12 px, so the largest scale is
+% face_support / min_face_size; each further scale is multiplied by factor,
+% stopping before the image's smaller side would drop below 12 px
+face_support = 12;
+num_scales = floor(log(min_face_size / min_dim) / log(factor));
+scales = (face_support / min_face_size)*factor.^(0:num_scales);
+
+load('PNet_mlab');
+load('RNet_mlab');
+load('ONet_mlab');
+
+total_bboxes = [];
+
+% First the PNet stage on image pyramid
+for s = scales
+    h_pyr = ceil(height_orig * s);
+    w_pyr = ceil(width_orig * s);
+
+    % Resize the image and normalize to what MTCNN expects it to be
+    im_data=(imresize(img, [h_pyr w_pyr],'bilinear')-127.5)*0.0078125;
+
+    [ out_prob, out_correction ] = PNet( im_data, PNet_mlab );
+
+    % Generate bounding boxes from the heatmap
+    bboxes = generate_bounding_boxes(out_prob, out_correction, s, threshold(1), face_support);
+
+    % TODO correct bboxes before running NMS?, as now lots of overlapping
+    % boxes are present
+    
+    % Perform non-maximum suppression to remove redundant bounding boxes
+    pick = non_maximum_supression(bboxes, 0.5, 'Union');
+    bboxes=bboxes(pick,:);
+    if ~isempty(bboxes)
+        total_bboxes = cat(1, total_bboxes, bboxes);
+    end
+end
+
+if ~isempty(total_bboxes)
+    % Non-maximum suppression across bounding boxes, and their offset
+    % correction
+    total_bboxes = correct_bbox(total_bboxes(:,1:5), total_bboxes(:,6:end), false, true, true); 
+    
+end
+num_bbox = size(total_bboxes,1);
+
+% RNet stage: crop every proposal, resize it to 24x24 and rescore it
+if num_bbox > 0
+    
+    proposal_imgs = zeros(24, 24, 3, num_bbox);
+    for k=1:num_bbox
+        
+        width_target = total_bboxes(k,3) - total_bboxes(k,1) + 1;
+        height_target = total_bboxes(k,4) - total_bboxes(k,2) + 1;
+        
+        % Work out the start and end indices in the original image
+        start_x_in = max(total_bboxes(k,1), 1);
+        start_y_in = max(total_bboxes(k,2), 1);
+        end_x_in = min(total_bboxes(k,3), width_orig);
+        end_y_in = min(total_bboxes(k,4), height_orig);
+        
+        % Work out the start and end indices in the target image
+        start_x_out = max(-total_bboxes(k,1)+2, 1);
+        start_y_out = max(-total_bboxes(k,2)+2, 1);
+        end_x_out = min(width_target - (total_bboxes(k,3)-width_orig), width_target);
+        end_y_out = min(height_target - (total_bboxes(k,4)-height_orig), height_target);
+        % Zero canvas of the full box size; parts outside the image stay 0
+        tmp = zeros(height_target, width_target, 3);
+        
+        tmp(start_y_out:end_y_out,start_x_out:end_x_out,:) = ...
+            img(start_y_in:end_y_in, start_x_in:end_x_in,:);
+        
+        proposal_imgs(:,:,:,k) = imresize(tmp, [24 24], 'bilinear');
+    end
+    
+    % Normalize the proposal images
+    proposal_imgs = (proposal_imgs - 127.5) * 0.0078125;
+    
+    % Apply RNet to proposal faces
+    [ score, out_correction ] = RNet( proposal_imgs, RNet_mlab );
+    out_correction = out_correction';
+
+    % Find faces above the threshold
+    to_keep = find(score > threshold(2));
+
+    total_bboxes = [total_bboxes(to_keep,1:4) score(to_keep)'];
+    out_correction = out_correction(to_keep,:);
+
+    if ~isempty(total_bboxes)
+        % Non-maximum suppression across bounding boxes, and their offset
+        % correction
+        total_bboxes = correct_bbox(total_bboxes, out_correction, true, true, true); 
+    end
+end
+
+num_bbox = size(total_bboxes,1);
+
+% ONet stage: crop every proposal, resize it to 48x48, rescore it and read landmarks
+if num_bbox > 0
+    
+    proposal_imgs = zeros(48, 48, 3, num_bbox);
+    for k=1:num_bbox
+        
+        width_target = total_bboxes(k,3) - total_bboxes(k,1) + 1;
+        height_target = total_bboxes(k,4) - total_bboxes(k,2) + 1;
+        
+        % Work out the start and end indices in the original image
+        start_x_in = max(total_bboxes(k,1), 1);
+        start_y_in = max(total_bboxes(k,2), 1);
+        end_x_in = min(total_bboxes(k,3), width_orig);
+        end_y_in = min(total_bboxes(k,4), height_orig);
+        
+        % Work out the start and end indices in the target image
+        start_x_out = max(-total_bboxes(k,1)+2, 1);
+        start_y_out = max(-total_bboxes(k,2)+2, 1);
+        end_x_out = min(width_target - (total_bboxes(k,3)-width_orig), width_target);
+        end_y_out = min(height_target - (total_bboxes(k,4)-height_orig), height_target);
+        % Zero canvas of the full box size; parts outside the image stay 0
+        tmp = zeros(height_target, width_target, 3);
+        
+        tmp(start_y_out:end_y_out,start_x_out:end_x_out,:) = ...
+            img(start_y_in:end_y_in, start_x_in:end_x_in,:);
+        
+        proposal_imgs(:,:,:,k) = imresize(tmp, [48 48], 'bilinear');
+    end
+    
+    % Normalize the proposal images
+    proposal_imgs = (proposal_imgs - 127.5) * 0.0078125;
+    
+    % Apply ONet to proposal faces
+    [ score, out_correction, lmarks ] = ONet( proposal_imgs, ONet_mlab );
+    out_correction = out_correction';
+    lmarks = lmarks';
+    
+    % Pick the final faces above the threshold
+    to_keep = find(score > threshold(3));    
+    lmarks = lmarks(to_keep, :);
+    out_correction = out_correction(to_keep, :);
+    total_bboxes = [total_bboxes(to_keep,1:4) score(to_keep)'];
+    
+    % Map landmarks from box-relative values to image coordinates, using the
+    % boxes as they are before the correction below
+    bbw = total_bboxes(:,3) - total_bboxes(:,1) + 1;
+    bbh = total_bboxes(:,4) - total_bboxes(:,2) + 1;
+    % Columns 1:5 scale with box width (x), columns 6:10 with box height (y)
+    lmarks(:, 1:5) = bbw .* lmarks(:,1:5) + total_bboxes(:,1) - 1;
+    lmarks(:, 6:10) = bbh .* lmarks(:,6:10) + total_bboxes(:,2) - 1;
+    
+    % Correct the bounding boxes; keep landmarks only for surviving boxes
+    if size(total_bboxes,1)>0				
+        [total_bboxes, to_keep] = correct_bbox(total_bboxes, out_correction, true, false, false);
+        lmarks = lmarks(to_keep, :);
+    end
+    
+end
--- a/matlab_version/face_detection/mtcnn/generate_bounding_boxes.m
+++ b/matlab_version/face_detection/mtcnn/generate_bounding_boxes.m
@@ -0,0 +1,25 @@
+function [bboxes] = generate_bounding_boxes(heatmap, correction, scale, t, face_support)
+    %GENERATE_BOUNDING_BOXES Turn heatmap cells scoring >= t into boxes in original image space.
+    %   correction(:,:,1..4) hold dx1, dy1, dx2, dy2 per cell, scale is the pyramid
+    %   scale of the heatmap, face_support the window side (px) behind each cell.
+    %   Returns K-by-9 rows [x1 y1 x2 y2 score dx1 dy1 dx2 dy2].
+
+    stride = 2;  % Correction for the pooling
+    % Indexing a single-row heatmap yields ROW vectors, which would turn the
+    % horizontal concatenations below into one long row instead of K rows,
+    % so every per-cell vector is forced into a column.
+    as_column = @(v) reshape(v, [], 1);
+    inds = as_column(find(heatmap >= t));
+    [row, col] = ind2sub(size(heatmap), inds);
+
+    % Scores and the four offset planes sampled at those cells
+    score = as_column(heatmap(inds));
+    dx1 = correction(:,:,1); dy1 = correction(:,:,2);
+    dx2 = correction(:,:,3); dy2 = correction(:,:,4);
+    correction = [as_column(dx1(inds)) as_column(dy1(inds)) as_column(dx2(inds)) as_column(dy2(inds))];
+
+    % Zero-based (x, y) cell positions, mapped back through stride and scale
+    cells = [col - 1, row - 1];
+    bboxes = [fix((stride*cells+1)/scale) fix((stride*cells+face_support)/scale) score correction];
+end
+
--- a/matlab_version/face_detection/mtcnn/max_pooling.m
+++ b/matlab_version/face_detection/mtcnn/max_pooling.m
@@ -0,0 +1,57 @@
+function [ output_maps ] = max_pooling( input_maps)
+%MAX_POOLING 2x2, stride-2 max pooling over every map of a 3-D stack.
+%   Each slice input_maps(:,:,k) is pooled over non-overlapping 2x2
+%   windows. When a dimension is odd, the leftover last row/column is
+%   pooled over a partial window (2x1, 1x2, or the single corner value), so
+%   the result is ceil(rows/2) x ceil(cols/2) x size(input_maps,3).
+%   Windows are gathered with im2col(..., 'distinct').
+%   The output is always double, because it is allocated with zeros.
+
+    n_rows = size(input_maps,1);
+    n_cols = size(input_maps,2);
+    n_maps = size(input_maps,3);
+
+    has_odd_rows = mod(n_rows,2) ~= 0;
+    has_odd_cols = mod(n_cols,2) ~= 0;
+
+    % Number of complete 2x2 windows along each dimension
+    full_rows_out = floor(n_rows / 2);
+    full_cols_out = floor(n_cols / 2);
+
+    % Extent of the input covered by complete windows (largest even size)
+    even_rows = n_rows - has_odd_rows;
+    even_cols = n_cols - has_odd_cols;
+
+    output_maps = zeros(ceil(n_rows / 2), ceil(n_cols / 2), n_maps);
+
+    % Complete windows: im2col puts each 2x2 block in its own column
+    for k = 1:n_maps
+        blocks = im2col(input_maps(1:even_rows,1:even_cols,k), [2,2], 'distinct');
+        output_maps(1:full_rows_out,1:full_cols_out,k) = ...
+            reshape(max(blocks), full_rows_out, full_cols_out);
+    end
+
+    % Leftover last column: pool it in 2x1 pieces
+    if has_odd_cols
+        for k = 1:n_maps
+            blocks = im2col(input_maps(1:even_rows,end,k), [2,1], 'distinct');
+            output_maps(1:full_rows_out,end,k) = max(blocks);
+        end
+    end
+
+    % Leftover last row: pool it in 1x2 pieces
+    if has_odd_rows
+        for k = 1:n_maps
+            blocks = im2col(input_maps(end,1:even_cols,k), [1,2], 'distinct');
+            output_maps(end,1:full_cols_out,k) = max(blocks);
+        end
+    end
+
+    % Both odd: the bottom-right element is its own window
+    if has_odd_cols && has_odd_rows
+        output_maps(end,end,:) = input_maps(end,end,:);
+    end
+
+
+end
+
--- a/matlab_version/face_detection/mtcnn/max_pooling2.m
+++ b/matlab_version/face_detection/mtcnn/max_pooling2.m
@@ -0,0 +1,66 @@
+function [ output_maps ] = max_pooling2( input_maps, kernel_size, stride)
+%MAX_POOLING2 Max pooling with a square kernel_size window and given stride.
+%   Pools every 2-D slice input_maps(:,:,i,m). Trailing rows/columns that do
+%   not fill a whole window are pooled over a partial window, giving
+%   ceil((size - kernel_size)/stride) + 1 outputs per spatial dimension.
+%   NOTE(review): the partial-window code assumes stride <= kernel_size.
+    orig_rows = size(input_maps,1);
+    orig_cols = size(input_maps,2);
+
+    % ceil, not round: round undercounts when the remainder is below half a
+    % stride (possible for stride >= 3), and the partial window would then
+    % overwrite the last full one.
+    pooled_rows = ceil((orig_rows - kernel_size)/stride) + 1;
+    pooled_cols = ceil((orig_cols - kernel_size)/stride) + 1;
+
+    % Number of full windows, and the input extent they cover
+    up_to_rows_out = floor((orig_rows - kernel_size)/stride) + 1;
+    up_to_cols_out = floor((orig_cols - kernel_size)/stride) + 1;
+    up_to_cols = kernel_size + (up_to_cols_out-1) * stride;
+    up_to_rows = kernel_size + (up_to_rows_out-1) * stride;
+
+    output_maps = zeros(pooled_rows, pooled_cols, size(input_maps,3), size(input_maps,4));
+    % im2col 'sliding' yields every stride-1 window; keep those whose origin
+    % is on the stride grid. mod(p - 1, stride) == 0 also works for stride 1,
+    % where mod(p, stride) == 1 can never hold.
+    [y, x] = meshgrid(1:up_to_cols-kernel_size+1, 1:up_to_rows-kernel_size+1);
+    to_keep = find(mod(y - 1, stride) == 0 & mod(x - 1, stride) == 0);
+
+    for m=1:size(input_maps,4)
+        for i=1:size(input_maps,3)
+            temp = im2col(input_maps(1:up_to_rows,1:up_to_cols,i,m), [kernel_size, kernel_size], 'sliding');
+            % Explicit dim 1 so a one-element window (row vector) is not collapsed
+            max_val = max(temp(:,to_keep), [], 1);
+            output_maps(1:up_to_rows_out,1:up_to_cols_out,i,m) = reshape(max_val, up_to_rows_out, up_to_cols_out);
+        end
+    end
+    % Leftover columns right of the last full window: pool a narrower window
+    if(orig_cols ~= up_to_cols)
+        span = orig_cols - (up_to_cols - kernel_size + stride);
+        for m=1:size(input_maps,4)
+            for i=1:size(input_maps,3)
+                temp = im2col(input_maps(1:up_to_rows,end-span+1:end,i,m), [kernel_size, span], 'sliding');
+                output_maps(1:up_to_rows_out,end,i,m) = max(temp(:,1:stride:end), [], 1);
+            end
+        end
+    end
+    % Leftover rows below the last full window: pool a shorter window
+    if(orig_rows ~= up_to_rows)
+        span = orig_rows - (up_to_rows - kernel_size + stride);
+        for m=1:size(input_maps,4)
+            for i=1:size(input_maps,3)
+                temp = im2col(input_maps(end-span+1:end, 1:up_to_cols,i,m), [span, kernel_size], 'sliding');
+                output_maps(end, 1:up_to_cols_out,i,m) = max(temp(:,1:stride:end), [], 1);
+            end
+        end
+    end
+    % Bottom-right corner, covered by neither strip above
+    if(orig_cols ~= up_to_cols && orig_rows ~= up_to_rows)
+        for m=1:size(input_maps,4)
+            for i=1:size(input_maps,3)
+                tmp = input_maps(up_to_rows- kernel_size + stride + 1:end,up_to_cols - kernel_size + stride+1:end,i,m);
+                output_maps(end,end,i,m) = max(tmp(:));
+            end
+        end
+    end
+end
+
--- a/matlab_version/face_detection/mtcnn/non_maximum_supression.m
+++ b/matlab_version/face_detection/mtcnn/non_maximum_supression.m
@@ -0,0 +1,46 @@
+function pick = non_maximum_supression(boxes, overlap_threshold,type)
+    %NON_MAXIMUM_SUPRESSION Greedy non-maximum suppression of scored boxes.
+    %   boxes             - rows of [x1 y1 x2 y2 score ...]
+    %   overlap_threshold - a box is dropped when its overlap with an
+    %                       already picked box exceeds this value
+    %   type              - 'Min': intersection / smaller area;
+    %                       anything else: intersection / union
+    %   pick              - indices of the kept rows, best score first
+    if isempty(boxes)
+        pick = [];
+        return;
+    end
+
+    % Corners, scores and inclusive-pixel areas
+    x_lo = boxes(:,1);
+    y_lo = boxes(:,2);
+    x_hi = boxes(:,3);
+    y_hi = boxes(:,4);
+    scores = boxes(:,5);
+    box_area = (x_hi-x_lo+1) .* (y_hi-y_lo+1);
+
+    % Ascending sort: the best remaining candidate is always the last entry
+    [~, order] = sort(scores);
+    use_min = strcmp(type,'Min');
+
+    pick = zeros(numel(scores),1);
+    n_picked = 0;
+    while ~isempty(order)
+        best = order(end);
+        rest = order(1:end-1);
+        n_picked = n_picked + 1;
+        pick(n_picked) = best;
+        % Intersection of the best box with every remaining candidate
+        iw = max(0.0, min(x_hi(best), x_hi(rest)) - max(x_lo(best), x_lo(rest)) + 1);
+        ih = max(0.0, min(y_hi(best), y_hi(rest)) - max(y_lo(best), y_lo(rest)) + 1);
+        inter = iw .* ih;
+        if use_min
+            overlap = inter ./ min(box_area(best), box_area(rest));
+        else
+            overlap = inter ./ (box_area(best) + box_area(rest) - inter);
+        end
+        % Keep only candidates that do not overlap the picked box too much
+        order = rest(overlap <= overlap_threshold);
+    end
+    pick = pick(1:n_picked);
+end
--- a/matlab_version/face_detection/mtcnn/rectify.m
+++ b/matlab_version/face_detection/mtcnn/rectify.m
@@ -0,0 +1,15 @@
+function [bbox_out] = rectify(bbox_in)
+    %RECTIFY Expand each box to a square around its own centre.
+    %   Rows of bbox_in start with [x1 y1 x2 y2]; the shorter side grows to
+    %   the longer one. Only the four corner columns are returned.
+    x_extent = bbox_in(:,3) - bbox_in(:,1);
+    y_extent = bbox_in(:,4) - bbox_in(:,2);
+    side = max(x_extent, y_extent);
+
+    % Shift each origin back by half of the growth so the centre is kept
+    x_start = bbox_in(:,1) + 0.5 * (x_extent - side);
+    y_start = bbox_in(:,2) + 0.5 * (y_extent - side);
+
+    bbox_out = [x_start, y_start, x_start + side, y_start + side];
+end
+
--- a/matlab_version/face_validation/WriteOutFaceCheckersCNNbinary.m
+++ b/matlab_version/face_validation/WriteOutFaceCheckersCNNbinary.m
@@ -83,7 +83,7 @@ function WriteOutFaceCheckersCNNbinary(locationTxt, faceCheckers)

                    for k=1:num_in_map                                        
                        for k2=1:num_out_kerns
-                            % Write out the bias term                                                
+                            % Write out the kernel                              
                            W = squeeze(cnn.layers{layers}.weights{1}(:,:,k,k2));
                            writeMatrixBin(faceCheckerFile, W, 5);                
                        end