# Copyright 2025 Yakhyokhuja Valikhujaev
# Author: Yakhyokhuja Valikhujaev
# GitHub: https://github.com/yakhyo

import numpy as np

from typing import Tuple, List, Literal, Dict, Any

from uniface.log import Logger
from uniface.model_store import verify_model_weights
from uniface.constants import RetinaFaceWeights
from uniface.onnx_utils import create_onnx_session

from .base import BaseDetector
from .utils import (
    non_max_supression,
    resize_image,
    decode_boxes,
    generate_anchors,
    decode_landmarks
)


class RetinaFace(BaseDetector):
"""
|
|
Face detector based on the RetinaFace architecture.
|
|
|
|
Title: "RetinaFace: Single-stage Dense Face Localisation in the Wild"
|
|
Paper: https://arxiv.org/abs/1905.00641
|
|
|
|
Args:
|
|
**kwargs: Keyword arguments passed to BaseDetector and RetinaFace. Supported keys include:
|
|
model_name (RetinaFaceWeights, optional): Model weights to use. Defaults to `RetinaFaceWeights.MNET_V2`.
|
|
conf_thresh (float, optional): Confidence threshold for filtering detections. Defaults to 0.5.
|
|
nms_thresh (float, optional): Non-maximum suppression (NMS) IoU threshold. Defaults to 0.4.
|
|
pre_nms_topk (int, optional): Number of top-scoring boxes considered before NMS. Defaults to 5000.
|
|
post_nms_topk (int, optional): Max number of detections kept after NMS. Defaults to 750.
|
|
dynamic_size (bool, optional): If True, generate anchors dynamically per input image. Defaults to False.
|
|
input_size (Tuple[int, int], optional): Fixed input size (width, height) if `dynamic_size=False`. Defaults to (640, 640).
|
|
|
|
Attributes:
|
|
model_name (RetinaFaceWeights): Selected model variant.
|
|
conf_thresh (float): Threshold for confidence-based filtering.
|
|
nms_thresh (float): IoU threshold used for NMS.
|
|
pre_nms_topk (int): Limit on proposals before applying NMS.
|
|
post_nms_topk (int): Limit on retained detections after NMS.
|
|
dynamic_size (bool): Flag indicating dynamic or static input sizing.
|
|
input_size (Tuple[int, int]): Static input size if `dynamic_size=False`.
|
|
_model_path (str): Absolute path to the verified model weights.
|
|
_priors (np.ndarray): Precomputed anchor boxes (if static size).
|
|
_supports_landmarks (bool): Indicates landmark prediction support.
|
|
|
|
Raises:
|
|
ValueError: If the model weights are invalid or not found.
|
|
RuntimeError: If the ONNX model fails to load or initialize.
|
|
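
    Example:
        Minimal usage sketch (illustrative; assumes a BGR image loaded with
        OpenCV, and the detection format documented in `detect` below):

        >>> import cv2
        >>> detector = RetinaFace(conf_thresh=0.6)
        >>> image = cv2.imread("photo.jpg")  # hypothetical input file
        >>> faces = detector.detect(image)
        >>> faces[0]['bbox']        # [x1, y1, x2, y2]
        >>> faces[0]['confidence']  # float score
        >>> faces[0]['landmarks']   # five (x, y) points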
"""
|
|
|
|
def __init__(self, **kwargs) -> None:
|
|
super().__init__(**kwargs)
|
|
self._supports_landmarks = True # RetinaFace supports landmarks
|
|
|
|
self.model_name = kwargs.get('model_name', RetinaFaceWeights.MNET_V2)
|
|
self.conf_thresh = kwargs.get('conf_thresh', 0.5)
|
|
self.nms_thresh = kwargs.get('nms_thresh', 0.4)
|
|
self.pre_nms_topk = kwargs.get('pre_nms_topk', 5000)
|
|
self.post_nms_topk = kwargs.get('post_nms_topk', 750)
|
|
self.dynamic_size = kwargs.get('dynamic_size', False)
|
|
self.input_size = kwargs.get('input_size', (640, 640))
|
|
|
|
Logger.info(
|
|
f"Initializing RetinaFace with model={self.model_name}, conf_thresh={self.conf_thresh}, nms_thresh={self.nms_thresh}, "
|
|
f"input_size={self.input_size}"
|
|
)
|
|
|
|
# Get path to model weights
|
|
self._model_path = verify_model_weights(self.model_name)
|
|
Logger.info(f"Verified model weights located at: {self._model_path}")
|
|
|
|
# Precompute anchors if using static size
|
|
if not self.dynamic_size and self.input_size is not None:
|
|
self._priors = generate_anchors(image_size=self.input_size)
|
|
Logger.debug("Generated anchors for static input size.")
|
|
|
|
# Initialize model
|
|
self._initialize_model(self._model_path)
|
|
|
|
def _initialize_model(self, model_path: str) -> None:
|
|
"""
|
|
Initializes an ONNX model session from the given path.
|
|
|
|
Args:
|
|
model_path (str): The file path to the ONNX model.
|
|
|
|
Raises:
|
|
RuntimeError: If the model fails to load, logs an error and raises an exception.
|
|
"""
|
|
try:
|
|
self.session = create_onnx_session(model_path)
|
|
self.input_names = self.session.get_inputs()[0].name
|
|
self.output_names = [x.name for x in self.session.get_outputs()]
|
|
Logger.info(f"Successfully initialized the model from {model_path}")
|
|
except Exception as e:
|
|
Logger.error(f"Failed to load model from '{model_path}': {e}", exc_info=True)
|
|
raise RuntimeError(f"Failed to initialize model session for '{model_path}'") from e
|
|
|
|
def preprocess(self, image: np.ndarray) -> np.ndarray:
|
|
"""Preprocess input image for model inference.
|
|
|
|
Args:
|
|
image (np.ndarray): Input image.
|
|
|
|
Returns:
|
|
np.ndarray: Preprocessed image tensor with shape (1, C, H, W)
|
|
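
        Example (illustrative shapes only):
            >>> x = np.zeros((640, 640, 3), dtype=np.uint8)
            >>> detector.preprocess(x).shape
            (1, 3, 640, 640)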
"""
|
|
image = np.float32(image) - np.array([104, 117, 123], dtype=np.float32)
|
|
image = image.transpose(2, 0, 1) # HWC to CHW
|
|
image = np.expand_dims(image, axis=0) # Add batch dimension (1, C, H, W)
|
|
return image
|
|
|
|
def inference(self, input_tensor: np.ndarray) -> List[np.ndarray]:
|
|
"""Perform model inference on the preprocessed image tensor.
|
|
|
|
Args:
|
|
input_tensor (np.ndarray): Preprocessed input tensor.
|
|
|
|
Returns:
|
|
Tuple[np.ndarray, np.ndarray]: Raw model outputs.
|
|
"""
|
|
return self.session.run(self.output_names, {self.input_names: input_tensor})
|
|
|
|
def detect(
|
|
self,
|
|
image: np.ndarray,
|
|
max_num: int = 0,
|
|
metric: Literal["default", "max"] = "max",
|
|
center_weight: float = 2.0
|
|
) -> List[Dict[str, Any]]:
|
|
"""
|
|
Perform face detection on an input image and return bounding boxes and facial landmarks.
|
|
|
|
Args:
|
|
image (np.ndarray): Input image as a NumPy array of shape (H, W, C).
|
|
max_num (int): Maximum number of detections to return. Use 0 to return all detections. Defaults to 0.
|
|
metric (Literal["default", "max"]): Metric for ranking detections when `max_num` is limited.
|
|
- "default": Prioritize detections closer to the image center.
|
|
- "max": Prioritize detections with larger bounding box areas.
|
|
center_weight (float): Weight for penalizing detections farther from the image center
|
|
when using the "default" metric. Defaults to 2.0.
|
|
|
|
Returns:
|
|
List[Dict[str, Any]]: List of face detection dictionaries, each containing:
|
|
- 'bbox': [x1, y1, x2, y2] - Bounding box coordinates
|
|
- 'confidence': float - Detection confidence score
|
|
- 'landmarks': [[x1, y1], [x2, y2], [x3, y3], [x4, y4], [x5, y5]] - 5-point facial landmarks
|
|
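
        Example (illustrative):
            >>> faces = detector.detect(frame, max_num=1)
            >>> if faces:
            ...     x1, y1, x2, y2 = faces[0]['bbox']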
"""
|
|
|
|
original_height, original_width = image.shape[:2]
|
|
|
|
if self.dynamic_size:
|
|
height, width, _ = image.shape
|
|
self._priors = generate_anchors(image_size=(height, width)) # generate anchors for each input image
|
|
resize_factor = 1.0 # No resizing
|
|
else:
|
|
image, resize_factor = resize_image(image, target_shape=self.input_size)
|
|
|
|
height, width, _ = image.shape
|
|
image_tensor = self.preprocess(image)
|
|
|
|
# ONNXRuntime inference
|
|
outputs = self.inference(image_tensor)
|
|
|
|
# Postprocessing
|
|
detections, landmarks = self.postprocess(outputs, resize_factor, shape=(width, height))
|
|
|
|
if max_num > 0 and detections.shape[0] > max_num:
|
|
# Calculate area of detections
|
|
areas = (detections[:, 2] - detections[:, 0]) * (detections[:, 3] - detections[:, 1])
|
|
|
|
# Calculate offsets from image center
|
|
center = (original_height // 2, original_width // 2)
|
|
offsets = np.vstack([
|
|
(detections[:, 0] + detections[:, 2]) / 2 - center[1],
|
|
(detections[:, 1] + detections[:, 3]) / 2 - center[0]
|
|
])
|
|
offset_dist_squared = np.sum(np.power(offsets, 2.0), axis=0)
|
|
|
|
# Calculate scores based on the chosen metric
|
|
if metric == 'max':
|
|
scores = areas
|
|
else:
|
|
scores = areas - offset_dist_squared * center_weight
|
|
|
|
# Sort by scores and select top `max_num`
|
|
sorted_indices = np.argsort(scores)[::-1][:max_num]
|
|
|
|
detections = detections[sorted_indices]
|
|
landmarks = landmarks[sorted_indices]
|
|
|
|
faces = []
|
|
for i in range(detections.shape[0]):
|
|
face_dict = {
|
|
'bbox': detections[i, :4].astype(float).tolist(),
|
|
'confidence': detections[i, 4].item(),
|
|
'landmarks': landmarks[i].astype(float).tolist()
|
|
}
|
|
faces.append(face_dict)
|
|
|
|
return faces
|
|
|
|
def postprocess(self, outputs: List[np.ndarray], resize_factor: float, shape: Tuple[int, int]) -> Tuple[np.ndarray, np.ndarray]:
|
|
"""
|
|
Process the model outputs into final detection results.
|
|
|
|
Args:
|
|
outputs (List[np.ndarray]): Raw outputs from the detection model.
|
|
- outputs[0]: Location predictions (bounding box coordinates).
|
|
- outputs[1]: Class confidence scores.
|
|
- outputs[2]: Landmark predictions.
|
|
resize_factor (float): Factor used to resize the input image during preprocessing.
|
|
shape (Tuple[int, int]): Original shape of the image as (height, width).
|
|
|
|
Returns:
|
|
Tuple[np.ndarray, np.ndarray]: Processed results containing:
|
|
- detections (np.ndarray): Array of detected bounding boxes with confidence scores.
|
|
Shape: (num_detections, 5), where each row is [x_min, y_min, x_max, y_max, score].
|
|
- landmarks (np.ndarray): Array of detected facial landmarks.
|
|
Shape: (num_detections, 5, 2), where each row contains 5 landmark points (x, y).
|
|
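
        Example (illustrative shapes, for some number N of surviving detections):
            >>> detections, landmarks = self.postprocess(outputs, 1.0, (640, 640))
            >>> detections.shape, landmarks.shape   # ((N, 5), (N, 5, 2))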
"""
|
|
loc, conf, landmarks = outputs[0].squeeze(0), outputs[1].squeeze(0), outputs[2].squeeze(0)
|
|
|
|
# Decode boxes and landmarks
|
|
boxes = decode_boxes(loc, self._priors)
|
|
landmarks = decode_landmarks(landmarks, self._priors)
|
|
|
|
boxes, landmarks = self._scale_detections(boxes, landmarks, resize_factor, shape=(shape[0], shape[1]))
|
|
|
|
# Extract confidence scores for the face class
|
|
scores = conf[:, 1]
|
|
mask = scores > self.conf_thresh
|
|
|
|
# Filter by confidence threshold
|
|
boxes, landmarks, scores = boxes[mask], landmarks[mask], scores[mask]
|
|
|
|
# Sort by scores
|
|
order = scores.argsort()[::-1][:self.pre_nms_topk]
|
|
boxes, landmarks, scores = boxes[order], landmarks[order], scores[order]
|
|
|
|
# Apply NMS
|
|
detections = np.hstack((boxes, scores[:, np.newaxis])).astype(np.float32, copy=False)
|
|
keep = non_max_supression(detections, self.nms_thresh)
|
|
detections, landmarks = detections[keep], landmarks[keep]
|
|
|
|
# Keep top-k detections
|
|
detections, landmarks = detections[:self.post_nms_topk], landmarks[:self.post_nms_topk]
|
|
|
|
landmarks = landmarks.reshape(-1, 5, 2).astype(np.int32)
|
|
|
|
return detections, landmarks
|
|
|
|
def _scale_detections(self, boxes: np.ndarray, landmarks: np.ndarray, resize_factor: float, shape: Tuple[int, int]) -> Tuple[np.ndarray, np.ndarray]:
|
|
# Scale bounding boxes and landmarks to the original image size.
|
|
bbox_scale = np.array([shape[0], shape[1]] * 2)
|
|
boxes = boxes * bbox_scale / resize_factor
|
|
|
|
landmark_scale = np.array([shape[0], shape[1]] * 5)
|
|
landmarks = landmarks * landmark_scale / resize_factor
|
|
|
|
return boxes, landmarks
|
|
|
|
|
|
# TODO: below is only for testing, remove it later
|
|
def draw_bbox(frame, bbox, score, color=(0, 255, 0), thickness=2):
|
|
x1, y1, x2, y2 = map(int, bbox) # Unpack 4 bbox values
|
|
cv2.rectangle(frame, (x1, y1), (x2, y2), color, thickness)
|
|
cv2.putText(frame, f"{score:.2f}", (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 1)
|
|
|
|
|
|
def draw_keypoints(frame, points, color=(0, 0, 255), radius=2):
|
|
for (x, y) in points.astype(np.int32):
|
|
cv2.circle(frame, (int(x), int(y)), radius, color, -1)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
import cv2
|
|
detector = RetinaFace(model_name=RetinaFaceWeights.MNET_050)
|
|
print(detector.get_info())
|
|
cap = cv2.VideoCapture(0)
|
|
|
|
if not cap.isOpened():
|
|
print("❌ Failed to open webcam.")
|
|
exit()
|
|
|
|
print("📷 Webcam started. Press 'q' to exit.")
|
|
|
|
while True:
|
|
ret, frame = cap.read()
|
|
if not ret:
|
|
print("❌ Failed to read frame.")
|
|
break
|
|
|
|
# Get face detections as list of dictionaries
|
|
faces = detector.detect(frame)
|
|
|
|
# Process each detected face
|
|
for face in faces:
|
|
# Extract bbox and landmarks from dictionary
|
|
bbox = face['bbox'] # [x1, y1, x2, y2]
|
|
landmarks = face['landmarks'] # [[x1, y1], [x2, y2], ...]
|
|
confidence = face['confidence']
|
|
|
|
# Pass bbox and confidence separately
|
|
draw_bbox(frame, bbox, confidence)
|
|
|
|
# Convert landmarks to numpy array format if needed
|
|
if landmarks is not None and len(landmarks) > 0:
|
|
# Convert list of [x, y] pairs to numpy array
|
|
points = np.array(landmarks, dtype=np.float32) # Shape: (5, 2)
|
|
draw_keypoints(frame, points)
|
|
|
|
# Display face count
|
|
cv2.putText(frame, f"Faces: {len(faces)}", (10, 30),
|
|
cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 255, 255), 2)
|
|
|
|
cv2.imshow("FaceDetection", frame)
|
|
if cv2.waitKey(1) & 0xFF == ord("q"):
|
|
break
|
|
|
|
cap.release()
|
|
cv2.destroyAllWindows()
|