From fb29a919b11b7591ccedb278564c4c3db10d65e2 Mon Sep 17 00:00:00 2001 From: yakhyo Date: Thu, 8 May 2025 17:11:13 +0900 Subject: [PATCH] ref: Update some modules and remove redundant parts --- scripts/search_face.py | 1 - uniface/attribute/age_gender.py | 147 +++++++++++++++++++----------- uniface/attribute/emotion.py | 129 ++++++++++++++------------ uniface/detection/scrfd.py | 8 +- uniface/landmark/model.py | 154 +++++++++++++++++++------------- uniface/recognition/base.py | 112 +++++++++++------------ 6 files changed, 323 insertions(+), 228 deletions(-) diff --git a/scripts/search_face.py b/scripts/search_face.py index b27188d..e183baa 100644 --- a/scripts/search_face.py +++ b/scripts/search_face.py @@ -18,7 +18,6 @@ def extract_reference_embedding(detector, recognizer, image_path): raise RuntimeError("No faces found in reference image.") embedding = recognizer.get_embedding(image, landmarks[0]) - print(f"Reference embedding extracted (L2 norm = {np.linalg.norm(embedding):.4f})") return embedding diff --git a/uniface/attribute/age_gender.py b/uniface/attribute/age_gender.py index d6af83a..faab4fd 100644 --- a/uniface/attribute/age_gender.py +++ b/uniface/attribute/age_gender.py @@ -4,12 +4,10 @@ import onnxruntime as ort from typing import Tuple from uniface.log import Logger +from uniface.constants import AgeGenderWeights from uniface.face_utils import bbox_center_alignment from uniface.model_store import verify_model_weights -from uniface.constants import AgeGenderWeights -from uniface.detection import RetinaFace -from uniface.constants import RetinaFaceWeights __all__ = ["AgeGender"] @@ -17,109 +15,156 @@ __all__ = ["AgeGender"] class AgeGender: """ Age and Gender Prediction Model. + + This model predicts both a person's gender (male/female) and age from a facial image. + Gender is returned as an integer (0: female, 1: male) and age as years. 
""" - def __init__(self, model_name: AgeGenderWeights = AgeGenderWeights.DEFAULT, input_size: Tuple[int, int] = (112, 112)) -> None: + def __init__( + self, + model_name: AgeGenderWeights = AgeGenderWeights.DEFAULT, + input_size: Tuple[int, int] = (112, 112) + ) -> None: """ - Initializes the Attribute model for inference. + Initializes the Age and Gender prediction model. Args: - model_path (str): Path to the ONNX file. + model_name: Model weights enum to use + input_size: Input resolution for the model (width, height) """ - Logger.info( f"Initializing AgeGender with model={model_name}, " f"input_size={input_size}" ) + # Model configuration self.input_size = input_size self.input_std = 1.0 self.input_mean = 0.0 # Get path to model weights - self._model_path = verify_model_weights(model_name) - Logger.info(f"Verfied model weights located at: {self._model_path}") + self.model_path = verify_model_weights(model_name) + Logger.info(f"Verified model weights located at: {self.model_path}") # Initialize model - self._initialize_model(model_path=self._model_path) + self._initialize_model() - def _initialize_model(self, model_path: str): - """Initialize the model from the given path. + def _initialize_model(self): + """ + Initialize the ONNX model for inference. - Args: - model_path (str): Path to .onnx model. + Raises: + RuntimeError: If the model fails to load or initialize. 
""" try: + # Initialize session with available providers self.session = ort.InferenceSession( - model_path, + self.model_path, providers=["CUDAExecutionProvider", "CPUExecutionProvider"] - ) - # Get model info - metadata = self.session.get_inputs()[0] - input_shape = metadata.shape - self.input_size = tuple(input_shape[2:4][::-1]) + # Extract model metadata + input_metadata = self.session.get_inputs()[0] + input_shape = input_metadata.shape + self.input_size = tuple(input_shape[2:4][::-1]) # Update from model (width, height) - self.input_names = [x.name for x in self.session.get_inputs()] - self.output_names = [x.name for x in self.session.get_outputs()] + # Get input/output names + self.input_names = [input.name for input in self.session.get_inputs()] + self.output_names = [output.name for output in self.session.get_outputs()] + + Logger.info(f"Successfully initialized AgeGender model") except Exception as e: - print(f"Failed to load the model: {e}") - raise + Logger.error(f"Failed to load AgeGender model from '{self.model_path}'", exc_info=True) + raise RuntimeError(f"Failed to initialize AgeGender model: {e}") - def preprocess(self, image: np.ndarray, bbox: np.ndarray): - """Preprocessing + def preprocess(self, image: np.ndarray, bbox: np.ndarray) -> np.ndarray: + """ + Preprocess the input image and face bounding box for inference. 
Args: - image (np.ndarray): Numpy image - bbox (np.ndarray): Bounding box coordinates: [x1, y1, x2, y2] + image: Input image in BGR format + bbox: Face bounding box coordinates [x1, y1, x2, y2] Returns: - np.ndarray: Transformed image + Preprocessed image blob ready for inference """ + # Calculate face dimensions and center width, height = bbox[2] - bbox[0], bbox[3] - bbox[1] center = (bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2 + + # Determine scale to fit face with margin scale = self.input_size[0] / (max(width, height) * 1.5) rotation = 0.0 - transformed_image, M = bbox_center_alignment(image, center, self.input_size[0], scale, rotation) - - input_size = tuple(transformed_image.shape[0:2][::-1]) - - blob = cv2.dnn.blobFromImage( - transformed_image, - 1.0/self.input_std, - input_size, - (self.input_mean, self.input_mean, self.input_mean), - swapRB=True + # Align face based on bounding box + aligned_face, _ = bbox_center_alignment( + image, center, self.input_size[0], scale, rotation ) - return blob - def postprocess(self, predictions: np.ndarray) -> Tuple[np.int64, int]: - """Postprocessing + # Convert to blob format for network input + face_blob = cv2.dnn.blobFromImage( + aligned_face, + 1.0 / self.input_std, + self.input_size, + (self.input_mean, self.input_mean, self.input_mean), + swapRB=True # Convert BGR to RGB + ) + + return face_blob + + def postprocess(self, predictions: np.ndarray) -> Tuple[int, int]: + """ + Process model predictions to extract gender and age. 
Args: - predictions (np.ndarray): Model predictions, shape: [1, 3] + predictions: Raw model output, shape [1, 3] where: + - First two elements represent gender logits + - Third element represents normalized age Returns: - Tuple[np.int64, int]: Gender and Age values + Tuple containing: + - Gender (0: female, 1: male) + - Age in years """ - gender = np.argmax(predictions[:2]) - age = int(np.round(predictions[2]*100)) - return gender, age + # First two values are gender logits (female/male) + gender = int(np.argmax(predictions[:2])) - def predict(self, image: np.ndarray, bbox: np.ndarray) -> Tuple[np.int64, int]: - blob = self.preprocess(image, bbox) - predictions = self.session.run(self.output_names, {self.input_names[0]: blob})[0][0] - gender, age = self.postprocess(predictions) + # Third value is normalized age that needs scaling + age = int(np.round(predictions[2] * 100)) return gender, age + def predict(self, image: np.ndarray, bbox: np.ndarray) -> Tuple[int, int]: + """ + Predict age and gender for a face in the image. 
+ + Args: + image: Input image in BGR format + bbox: Face bounding box [x1, y1, x2, y2] + + Returns: + - 'gender_id': Gender as integer (0: female, 1: male) + - 'age': Age in years + """ + # Preprocess and run inference + face_blob = self.preprocess(image, bbox) + predictions = self.session.run( + self.output_names, + {self.input_names[0]: face_blob} + )[0][0] + + # Extract gender and age from predictions + gender_id, age = self.postprocess(predictions) + + return gender_id, age + # TODO: For testing purposes only, remove later def main(): + from uniface.detection import RetinaFace + from uniface.constants import RetinaFaceWeights face_detector = RetinaFace( model_name=RetinaFaceWeights.MNET_V2, diff --git a/uniface/attribute/emotion.py b/uniface/attribute/emotion.py index 6a3befb..8d49993 100644 --- a/uniface/attribute/emotion.py +++ b/uniface/attribute/emotion.py @@ -10,10 +10,9 @@ from PIL import Image from typing import Tuple, Union from uniface.log import Logger -from uniface import RetinaFace +from uniface.constants import DDAMFNWeights from uniface.face_utils import face_alignment from uniface.model_store import verify_model_weights -from uniface.constants import RetinaFaceWeights, DDAMFNWeights class Emotion: @@ -21,10 +20,11 @@ class Emotion: Emotion recognition using a TorchScript model. Args: - model_name (DDAMFNWeights): Pretrained model enum. Defaults to AFFECNET7. + model_weights (DDAMFNWeights): Pretrained model weights enum. Defaults to AFFECNET7. + input_size (Tuple[int, int]): Size of input images. Defaults to (112, 112). Attributes: - emotions (List[str]): Emotion label list. + emotion_labels (List[str]): List of emotion labels the model can predict. device (torch.device): Inference device (CPU or CUDA). model (torch.jit.ScriptModule): Loaded TorchScript model. @@ -33,122 +33,133 @@ class Emotion: RuntimeError: If model loading fails. 
""" - def __init__(self, model_name: DDAMFNWeights = DDAMFNWeights.AFFECNET7, input_size: Tuple[int, int] = (112, 112)) -> None: + def __init__( + self, + model_weights: DDAMFNWeights = DDAMFNWeights.AFFECNET7, + input_size: Tuple[int, int] = (112, 112) + ) -> None: """ Initialize the emotion detector with a TorchScript model """ - self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - - self.emotions = [ + self.emotion_labels = [ "Neutral", "Happy", "Sad", "Surprise", "Fear", "Disgust", "Angry" ] - if model_name == DDAMFNWeights.AFFECNET8: - self.emotions.append("Contempt") + # Add contempt for AFFECNET8 model + if model_weights == DDAMFNWeights.AFFECNET8: + self.emotion_labels.append("Contempt") + + # Initialize image preprocessing parameters self.input_size = input_size - self.input_std = [0.229, 0.224, 0.225] - self.input_mean = [0.485, 0.456, 0.406] + self.normalization_std = [0.229, 0.224, 0.225] + self.normalization_mean = [0.485, 0.456, 0.406] Logger.info( - f"Initialized Emotion class with model={model_name.name}, " + f"Initialized Emotion class with model={model_weights.name}, " f"device={'cuda' if torch.cuda.is_available() else 'cpu'}, " - f"num_classes={len(self.emotions)}, input_size={self.input_size}" + f"num_classes={len(self.emotion_labels)}, input_size={self.input_size}" ) - # Get path to model weights - self._model_path = verify_model_weights(model_name) - Logger.info(f"Verified model weights located at: {self._model_path}") + # Get path to model weights and initialize model + self.model_path = verify_model_weights(model_weights) + Logger.info(f"Verified model weights located at: {self.model_path}") + self._load_model() - # Initialize model - self._initialize_model(model_path=self._model_path) - - def _initialize_model(self, model_path: str) -> None: + def _load_model(self) -> None: """ - Initializes a TorchScript model for emotion inference. + Loads and initializes a TorchScript model for emotion inference. 
- Args: - model_path (str): Path to the TorchScript (.pt) model. + Raises: + RuntimeError: If loading the model fails. """ try: - self.model = torch.jit.load(model_path, map_location=self.device) + self.model = torch.jit.load(self.model_path, map_location=self.device) self.model.eval() - Logger.info(f"TorchScript model successfully loaded from: {model_path}") + Logger.info(f"TorchScript model successfully loaded from: {self.model_path}") - # Warm-up - dummy = torch.randn(1, 3, 112, 112).to(self.device) + # Warm-up with dummy input + dummy_input = torch.randn(1, 3, *self.input_size).to(self.device) with torch.no_grad(): - _ = self.model(dummy) + _ = self.model(dummy_input) Logger.info("Emotion model warmed up with dummy input.") except Exception as e: - Logger.error(f"Failed to load TorchScript model from {model_path}: {e}") - raise + Logger.error(f"Failed to load TorchScript model from {self.model_path}: {e}") + raise RuntimeError(f"Model loading failed: {str(e)}") def preprocess(self, image: np.ndarray) -> torch.Tensor: """ - Resize, normalize and convert image to tensor manually without torchvision. + Preprocess image for model inference: resize, normalize and convert to tensor. 
Args: image (np.ndarray): BGR image (H, W, 3) - Returns: - torch.Tensor: Preprocessed image tensor of shape (1, 3, 112, 112) - """ - image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # BGR -> RGB - # Resize to (112, 112) - image = cv2.resize(image, self.input_size).astype(np.float32) / 255.0 + Returns: + torch.Tensor: Preprocessed image tensor of shape (1, 3, H, W) + """ + # Convert BGR to RGB + rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + + # Resize to target input size + resized_image = cv2.resize(rgb_image, self.input_size).astype(np.float32) / 255.0 # Normalize with mean and std - mean = np.array(self.input_mean, dtype=np.float32) - std = np.array(self.input_std, dtype=np.float32) - image_normalized = (image - mean) / std + mean_array = np.array(self.normalization_mean, dtype=np.float32) + std_array = np.array(self.normalization_std, dtype=np.float32) + normalized_image = (resized_image - mean_array) / std_array - # HWC to CHW - image_transposed = image_normalized.transpose((2, 0, 1)) + # Convert from HWC to CHW format + transposed_image = normalized_image.transpose((2, 0, 1)) # Convert to torch tensor and add batch dimension - tensor = torch.from_numpy(image_transposed).unsqueeze(0).to(self.device) - + tensor = torch.from_numpy(transposed_image).unsqueeze(0).to(self.device) return tensor def predict(self, image: np.ndarray, landmark: np.ndarray) -> Tuple[Union[str, None], Union[float, None]]: """ - Predict the emotion from an BGR face image. + Predict the emotion from a face image. Args: - image (np.ndarray): Input face image in RGB format. + image (np.ndarray): Input face image in BGR format. landmark (np.ndarray): Facial five point landmark. Returns: Tuple[str, float]: (Predicted emotion label, Confidence score) + Returns (None, None) if prediction fails. Raises: - RuntimeError: If the input is invalid or inference fails internally. + ValueError: If the input is not a valid BGR image. 
""" + # Validate input if not isinstance(image, np.ndarray): Logger.error("Input must be a NumPy ndarray.") - raise ValueError("Input must be a NumPy ndarray (RGB image).") + raise ValueError("Input must be a NumPy ndarray (BGR image).") if image.ndim != 3 or image.shape[2] != 3: - Logger.error(f"Invalid image shape: {image.shape}. Expected HxWx3 RGB image.") - raise ValueError("Input image must be in RGB format with shape (H, W, 3).") + Logger.error(f"Invalid image shape: {image.shape}. Expected HxWx3 image.") + raise ValueError("Input image must have shape (H, W, 3).") try: - image, _ = face_alignment(image, landmark) - tensor = self.preprocess(image) + # Align face using landmarks + aligned_image, _ = face_alignment(image, landmark) + + # Preprocess and run inference + input_tensor = self.preprocess(aligned_image) with torch.no_grad(): - output = self.model(tensor) + output = self.model(input_tensor) + # Handle case where model returns a tuple if isinstance(output, tuple): output = output[0] - probs = torch.nn.functional.softmax(output, dim=1).squeeze(0).cpu().numpy() - pred_idx = int(np.argmax(probs)) - confidence = round(float(probs[pred_idx]), 2) + # Get probabilities and prediction + probabilities = torch.nn.functional.softmax(output, dim=1).squeeze(0).cpu().numpy() + predicted_index = int(np.argmax(probabilities)) + confidence_score = round(float(probabilities[predicted_index]), 2) - return self.emotions[pred_idx], confidence + return self.emotion_labels[predicted_index], confidence_score except Exception as e: Logger.error(f"Emotion inference failed: {e}") @@ -158,6 +169,8 @@ class Emotion: # TODO: For testing purposes only, remove later def main(): + from uniface import RetinaFace + from uniface.constants import RetinaFaceWeights face_detector = RetinaFace( model_name=RetinaFaceWeights.MNET_V2, diff --git a/uniface/detection/scrfd.py b/uniface/detection/scrfd.py index 135ed7a..610ba79 100644 --- a/uniface/detection/scrfd.py +++ 
b/uniface/detection/scrfd.py @@ -11,8 +11,9 @@ import onnxruntime as ort from typing import Tuple, List, Literal from uniface.log import Logger -from uniface.model_store import verify_model_weights from uniface.constants import SCRFDWeights +from uniface.model_store import verify_model_weights + from .utils import non_max_supression, distance2bbox, distance2kps, resize_image __all__ = ['SCRFD'] @@ -248,13 +249,12 @@ class SCRFD: sorted_indices = np.argsort(values)[::-1][:max_num] det = det[sorted_indices] landmarks = landmarks[sorted_indices] - - return det, landmarks # TODO: below is only for testing, remove it later + def draw_bbox(frame, bbox, color=(0, 255, 0), thickness=2): x1, y1, x2, y2 = bbox[:4].astype(np.int32) cv2.rectangle(frame, (x1, y1), (x2, y2), color, thickness) @@ -267,6 +267,8 @@ def draw_keypoints(frame, points, color=(0, 0, 255), radius=2): cv2.circle(frame, (x, y), radius, color, -1) +# TODO: Remove later, just for testing + if __name__ == "__main__": detector = SCRFD(model_name=SCRFDWeights.SCRFD_500M_KPS) cap = cv2.VideoCapture(0) diff --git a/uniface/landmark/model.py b/uniface/landmark/model.py index 9759d1b..955c7eb 100644 --- a/uniface/landmark/model.py +++ b/uniface/landmark/model.py @@ -6,131 +6,163 @@ import numpy as np from typing import Tuple from uniface.log import Logger -from uniface.face_utils import bbox_center_alignment, transform_points_2d +from uniface.constants import LandmarkWeights from uniface.model_store import verify_model_weights - -from uniface.detection import RetinaFace -from uniface.constants import RetinaFaceWeights, LandmarkWeights +from uniface.face_utils import bbox_center_alignment, transform_points_2d __all__ = ['Landmark'] class Landmark: - def __init__(self, model_name: LandmarkWeights = LandmarkWeights.DEFAULT, input_size: Tuple[int, int] = (192, 192)) -> None: + """ + Facial landmark detection model for predicting facial keypoints. 
+ """ + + def __init__( + self, + model_name: LandmarkWeights = LandmarkWeights.DEFAULT, + input_size: Tuple[int, int] = (192, 192) + ) -> None: """ Initializes the Facial Landmark model for inference. Args: - model_path (str): Path to the ONNX file. + model_name: Enum specifying which landmark model weights to use + input_size: Input resolution for the model (width, height) """ - Logger.info( f"Initializing Facial Landmark with model={model_name}, " f"input_size={input_size}" ) + # Initialize configuration self.input_size = input_size self.input_std = 1.0 self.input_mean = 0.0 # Get path to model weights - self._model_path = verify_model_weights(model_name) - Logger.info(f"Verfied model weights located at: {self._model_path}") + self.model_path = verify_model_weights(model_name) + Logger.info(f"Verified model weights located at: {self.model_path}") # Initialize model - self._initialize_model(model_path=self._model_path) + self._initialize_model() - def _initialize_model(self, model_path: str): - """ Initialize the model from the given path. - Args: - model_path (str): Path to .onnx model. + def _initialize_model(self): + """ + Initialize the ONNX model from the stored model path. + + Raises: + RuntimeError: If the model fails to load or initialize. 
""" try: self.session = ort.InferenceSession( - model_path, + self.model_path, providers=["CUDAExecutionProvider", "CPUExecutionProvider"] ) - metadata = self.session.get_inputs()[0] - input_shape = metadata.shape - self.input_size = tuple(input_shape[2:4][::-1]) + # Get input configuration + input_metadata = self.session.get_inputs()[0] + input_shape = input_metadata.shape + self.input_size = tuple(input_shape[2:4][::-1]) # Update input size from model - self.input_names = [x.name for x in self.session.get_inputs()] - self.output_names = [x.name for x in self.session.get_outputs()] + # Get input/output names + self.input_names = [input.name for input in self.session.get_inputs()] + self.output_names = [output.name for output in self.session.get_outputs()] - outputs = self.session.get_outputs() - output_shape = outputs[0].shape - self.lmk_dim = 2 - self.lmk_num = output_shape[1] // self.lmk_dim + # Determine landmark dimensions from output shape + output_shape = self.session.get_outputs()[0].shape + self.lmk_dim = 2 # x,y coordinates + self.lmk_num = output_shape[1] // self.lmk_dim # Number of landmarks + + Logger.info(f"Model initialized with {self.lmk_num} landmarks") except Exception as e: - print(f"Failed to load the model: {e}") - raise + Logger.error(f"Failed to load landmark model from '{self.model_path}'", exc_info=True) + raise RuntimeError(f"Failed to initialize landmark model: {e}") def preprocess(self, image: np.ndarray, bbox: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: """ - Preprocess the input image and bbox for inference. + Preprocess the input image and bounding box for inference. Args: - image (np.ndarray): Input image. - bbox (np.ndarray): Bounding box [x1, y1, x2, y2]. + image: Input image in BGR format + bbox: Bounding box coordinates [x1, y1, x2, y2] Returns: - Tuple[np.ndarray, np.ndarray]: Preprocessed blob and transformation matrix. 
+ Tuple containing: + - Preprocessed image blob ready for inference + - Transformation matrix for mapping predictions back to original image """ + # Calculate face dimensions and center width, height = bbox[2] - bbox[0], bbox[3] - bbox[1] center = (bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2 + + # Determine scale to fit face with some margin scale = self.input_size[0] / (max(width, height) * 1.5) rotation = 0.0 - transformed_image, M = bbox_center_alignment(image, center, self.input_size[0], scale, rotation) - input_size = tuple(transformed_image.shape[0:2][::-1]) - - blob = cv2.dnn.blobFromImage( - transformed_image, - 1.0/self.input_std, - input_size, - (self.input_mean, self.input_mean, self.input_mean), - swapRB=True + # Align face using center, scale and rotation + aligned_face, transform_matrix = bbox_center_alignment( + image, center, self.input_size[0], scale, rotation ) - return blob, M + + # Convert to blob format for inference + face_blob = cv2.dnn.blobFromImage( + aligned_face, + 1.0 / self.input_std, + self.input_size, + (self.input_mean, self.input_mean, self.input_mean), + swapRB=True # Convert BGR to RGB + ) + + return face_blob, transform_matrix - def postprocess(self, predictions: np.ndarray, M: np.ndarray) -> np.ndarray: + def postprocess(self, predictions: np.ndarray, transform_matrix: np.ndarray) -> np.ndarray: """ - Postprocess model outputs to get landmarks. + Convert raw model predictions to image coordinates. Args: - predictions (np.ndarray): Raw model predictions. - M (np.ndarray): Affine transformation matrix. + predictions: Raw landmark coordinates from model output + transform_matrix: Affine transformation matrix from preprocessing Returns: - np.ndarray: Transformed landmarks. 
+ Landmarks in original image coordinates """ + # Reshape to pairs of x,y coordinates + landmarks = predictions.reshape((-1, 2)) - predictions = predictions.reshape((-1, 2)) + # Denormalize coordinates to pixel space + landmarks[:, 0:2] += 1 # Shift from [-1,1] to [0,2] range + landmarks[:, 0:2] *= (self.input_size[0] // 2) # Scale to pixel coordinates - predictions[:, 0:2] += 1 - predictions[:, 0:2] *= (self.input_size[0] // 2) + # Invert the transformation to map back to original image + inverse_matrix = cv2.invertAffineTransform(transform_matrix) + landmarks = transform_points_2d(landmarks, inverse_matrix) - IM = cv2.invertAffineTransform(M) - predictions = transform_points_2d(predictions, IM) - - return predictions + return landmarks def predict(self, image: np.ndarray, bbox: np.ndarray) -> np.ndarray: """ - Predict facial landmarks for the given image and bounding box. + Predict facial landmarks for the given image and face bounding box. Args: - image (np.ndarray): Input image. - bbox (np.ndarray): Bounding box [x1, y1, x2, y2]. + image: Input image in BGR format + bbox: Face bounding box [x1, y1, x2, y2] Returns: - np.ndarray: Predicted landmarks. 
+ Array of facial landmarks in original image coordinates """ - blob, M = self.preprocess(image, bbox) - preds = self.session.run(self.output_names, {self.input_names[0]: blob})[0][0] - landmarks = self.postprocess(preds, M) + # Preprocess image + face_blob, transform_matrix = self.preprocess(image, bbox) + + # Run inference + raw_predictions = self.session.run( + self.output_names, + {self.input_names[0]: face_blob} + )[0][0] + + # Postprocess to get landmarks in original image space + landmarks = self.postprocess(raw_predictions, transform_matrix) return landmarks @@ -138,7 +170,9 @@ class Landmark: if __name__ == "__main__": - + from uniface.detection import RetinaFace + from uniface.constants import RetinaFaceWeights + face_detector = RetinaFace( model_name=RetinaFaceWeights.MNET_V2, conf_thresh=0.5, diff --git a/uniface/recognition/base.py b/uniface/recognition/base.py index c82cc4e..b804cdb 100644 --- a/uniface/recognition/base.py +++ b/uniface/recognition/base.py @@ -7,7 +7,6 @@ import os import cv2 import numpy as np import onnxruntime as ort - from typing import Tuple, Optional, Union, List from dataclasses import dataclass @@ -37,100 +36,99 @@ class BaseFaceEncoder: def __init__( self, - model_name: SphereFaceWeights | MobileFaceWeights | ArcFaceWeights = MobileFaceWeights.MNET_V2, + model_name: Union[SphereFaceWeights, MobileFaceWeights, ArcFaceWeights] = MobileFaceWeights.MNET_V2, preprocessing: PreprocessConfig = PreprocessConfig(), ) -> None: """ Initializes the FaceEncoder model for inference. Args: - model_name (SphereFaceWeights | MobileFaceWeights | ArcFaceWeights): Selected model weight enum. - preprocessing (PreprocessConfig): Configuration for input normalization and resizing. + model_name: Selected model weight enum. + preprocessing: Configuration for input normalization and resizing. 
""" + # Store preprocessing parameters self.input_mean = preprocessing.input_mean self.input_std = preprocessing.input_std self.input_size = preprocessing.input_size Logger.info( f"Initializing Face Recognition with model={model_name}, " - f"input_mean={self.input_mean}, input_std={self.input_std}, input_size={self.input_size}" + f"input_mean={self.input_mean}, input_std={self.input_std}, " + f"input_size={self.input_size}" ) - # Get path to model weights - self._model_path = verify_model_weights(model_name) - Logger.info(f"Verfied model weights located at: {self._model_path}") + # Get path to model weights and initialize model + self.model_path = verify_model_weights(model_name) + Logger.info(f"Verified model weights located at: {self.model_path}") - # Initialize model - self._initialize_model(self._model_path) + self._initialize_model() - def _initialize_model(self, model_path: str) -> None: + def _initialize_model(self) -> None: """ Loads the ONNX model and prepares it for inference. - Args: - model_path (str): Path to the ONNX model file. - Raises: RuntimeError: If the model fails to load or initialize. 
""" try: + # Initialize model session with available providers self.session = ort.InferenceSession( - model_path, + self.model_path, providers=["CUDAExecutionProvider", "CPUExecutionProvider"] ) - self._setup_model() - Logger.info(f"Successfully initialized face encoder from {model_path}") + + # Extract input configuration + input_cfg = self.session.get_inputs()[0] + self.input_name = input_cfg.name + + # Verify input dimensions match our configuration + input_shape = input_cfg.shape + model_input_size = tuple(input_shape[2:4][::-1]) # (width, height) + if model_input_size != self.input_size: + Logger.warning(f"Model input size {model_input_size} differs from configured size {self.input_size}") + + # Extract output configuration + self.output_names = [output.name for output in self.session.get_outputs()] + self.output_shape = self.session.get_outputs()[0].shape + + assert len(self.output_names) == 1, "Expected only one output node." + Logger.info(f"Successfully initialized face encoder from {self.model_path}") + except Exception as e: - Logger.error(f"Failed to load face encoder model from '{model_path}'", exc_info=True) - raise RuntimeError(f"Failed to initialize model session for '{model_path}'") from e + Logger.error(f"Failed to load face encoder model from '{self.model_path}'", exc_info=True) + raise RuntimeError(f"Failed to initialize model session for '{self.model_path}'") from e - def _setup_model(self) -> None: - """ - Extracts input/output configuration from the ONNX model session. 
- """ - input_cfg = self.session.get_inputs()[0] - input_shape = input_cfg.shape - model_input_size = tuple(input_shape[2:4][::-1]) # (width, height) - - if model_input_size != self.input_size: - Logger.warning(f"Model input size {model_input_size} differs from configured size {self.input_size}") - - self.input_name = input_cfg.name - self.output_names = [output.name for output in self.session.get_outputs()] - self.output_shape = self.session.get_outputs()[0].shape - - assert len(self.output_names) == 1, "Expected only one output node." - - def preprocess(self, image: np.ndarray) -> np.ndarray: + def preprocess(self, face_img: np.ndarray) -> np.ndarray: """ Preprocess the image: resize, normalize, and convert it to a blob. Args: - image (np.ndarray): Input image in BGR format. + face_img: Input image in BGR format. Returns: - np.ndarray: Preprocessed image as a NumPy array ready for inference. + Preprocessed image as a NumPy array ready for inference. """ - image = cv2.resize(image, self.input_size) # Resize to (112, 112) - if isinstance(self.input_std, (list, tuple)): - # if self.input_std is a list, we assume it's per-channel std - image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32) + resized_img = cv2.resize(face_img, self.input_size) - image -= np.array(self.input_mean, dtype=np.float32) - image /= np.array(self.input_std, dtype=np.float32) + if isinstance(self.input_std, (list, tuple)): + # Per-channel normalization + rgb_img = cv2.cvtColor(resized_img, cv2.COLOR_BGR2RGB).astype(np.float32) + normalized_img = (rgb_img - np.array(self.input_mean, dtype=np.float32)) / \ + np.array(self.input_std, dtype=np.float32) # Change to NCHW (batch, channels, height, width) - blob = np.transpose(image, (2, 0, 1)) # CHW + blob = np.transpose(normalized_img, (2, 0, 1)) # CHW blob = np.expand_dims(blob, axis=0) # NCHW else: - # cv2.dnn.blobFromImage does not support per-channel std so we use a single value here + # Single-value normalization blob = 
cv2.dnn.blobFromImage( - image, + resized_img, scalefactor=1.0 / self.input_std, size=self.input_size, mean=(self.input_mean, self.input_mean, self.input_mean), swapRB=True # Convert BGR to RGB ) + return blob def get_embedding(self, image: np.ndarray, landmarks: np.ndarray) -> np.ndarray: @@ -138,13 +136,17 @@ class BaseFaceEncoder: Extracts face embedding from an aligned image. Args: - image (np.ndarray): Input face image (BGR format). - landmarks (np.ndarray): Facial landmarks (5 points for alignment). + image: Input face image (BGR format). + landmarks: Facial landmarks (5 points for alignment). Returns: - np.ndarray: 512-dimensional face embedding. + Face embedding vector (typically 512-dimensional). """ - aligned_face, _ = face_alignment(image, landmarks) # Use your function for alignment - blob = self.preprocess(aligned_face) # Convert to blob - embedding = self.session.run(self.output_names, {self.input_name: blob})[0] - return embedding # Return the 512-D feature vector + # Align face using landmarks + aligned_face, _ = face_alignment(image, landmarks) + + # Generate embedding from aligned face + face_blob = self.preprocess(aligned_face) + embedding = self.session.run(self.output_names, {self.input_name: face_blob})[0] + + return embedding