ref: Update some modules and remove redundant parts

Author: yakhyo
Date:   2025-05-08 17:11:13 +09:00
parent  b35b1a3f7c
commit  fb29a919b1
6 changed files with 323 additions and 228 deletions

View File

@@ -18,7 +18,6 @@ def extract_reference_embedding(detector, recognizer, image_path):
raise RuntimeError("No faces found in reference image.")
embedding = recognizer.get_embedding(image, landmarks[0])
print(f"Reference embedding extracted (L2 norm = {np.linalg.norm(embedding):.4f})")
return embedding
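# Usage sketch for extract_reference_embedding. The constructor defaults and the
# BaseFaceEncoder import path are assumptions based on the other files in this
# commit, not something this hunk shows.
from uniface.detection import RetinaFace
from uniface.constants import RetinaFaceWeights
from uniface import BaseFaceEncoder  # import path assumed

detector = RetinaFace(model_name=RetinaFaceWeights.MNET_V2)   # assumed defaults
recognizer = BaseFaceEncoder()                                # encoder from the recognition module below

reference_embedding = extract_reference_embedding(detector, recognizer, "reference.jpg")
# The printed L2 norm is a quick sanity check that the embedding is non-degenerate.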

View File

@@ -4,12 +4,10 @@ import onnxruntime as ort
from typing import Tuple
from uniface.log import Logger
from uniface.constants import AgeGenderWeights
from uniface.face_utils import bbox_center_alignment
from uniface.model_store import verify_model_weights
from uniface.constants import AgeGenderWeights
from uniface.detection import RetinaFace
from uniface.constants import RetinaFaceWeights
__all__ = ["AgeGender"]
@@ -17,109 +15,156 @@ __all__ = ["AgeGender"]
class AgeGender:
"""
Age and Gender Prediction Model.
This model predicts both a person's gender (male/female) and age from a facial image.
Gender is returned as an integer (0: female, 1: male) and age as years.
"""
def __init__(self, model_name: AgeGenderWeights = AgeGenderWeights.DEFAULT, input_size: Tuple[int, int] = (112, 112)) -> None:
def __init__(
self,
model_name: AgeGenderWeights = AgeGenderWeights.DEFAULT,
input_size: Tuple[int, int] = (112, 112)
) -> None:
"""
Initializes the Attribute model for inference.
Initializes the Age and Gender prediction model.
Args:
model_path (str): Path to the ONNX file.
model_name: Model weights enum to use
input_size: Input resolution for the model (width, height)
"""
Logger.info(
f"Initializing AgeGender with model={model_name}, "
f"input_size={input_size}"
)
# Model configuration
self.input_size = input_size
self.input_std = 1.0
self.input_mean = 0.0
# Get path to model weights
self._model_path = verify_model_weights(model_name)
Logger.info(f"Verfied model weights located at: {self._model_path}")
self.model_path = verify_model_weights(model_name)
Logger.info(f"Verified model weights located at: {self.model_path}")
# Initialize model
self._initialize_model(model_path=self._model_path)
self._initialize_model()
def _initialize_model(self, model_path: str):
"""Initialize the model from the given path.
def _initialize_model(self):
"""
Initialize the ONNX model for inference.
Args:
model_path (str): Path to .onnx model.
Raises:
RuntimeError: If the model fails to load or initialize.
"""
try:
# Initialize session with available providers
self.session = ort.InferenceSession(
model_path,
self.model_path,
providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
)
# Get model info
metadata = self.session.get_inputs()[0]
input_shape = metadata.shape
self.input_size = tuple(input_shape[2:4][::-1])
# Extract model metadata
input_metadata = self.session.get_inputs()[0]
input_shape = input_metadata.shape
self.input_size = tuple(input_shape[2:4][::-1]) # Update from model (width, height)
self.input_names = [x.name for x in self.session.get_inputs()]
self.output_names = [x.name for x in self.session.get_outputs()]
# Get input/output names
self.input_names = [input.name for input in self.session.get_inputs()]
self.output_names = [output.name for output in self.session.get_outputs()]
Logger.info(f"Successfully initialized AgeGender model")
except Exception as e:
print(f"Failed to load the model: {e}")
raise
Logger.error(f"Failed to load AgeGender model from '{self.model_path}'", exc_info=True)
raise RuntimeError(f"Failed to initialize AgeGender model: {e}")
def preprocess(self, image: np.ndarray, bbox: np.ndarray):
"""Preprocessing
def preprocess(self, image: np.ndarray, bbox: np.ndarray) -> np.ndarray:
"""
Preprocess the input image and face bounding box for inference.
Args:
image (np.ndarray): Numpy image
bbox (np.ndarray): Bounding box coordinates: [x1, y1, x2, y2]
image: Input image in BGR format
bbox: Face bounding box coordinates [x1, y1, x2, y2]
Returns:
np.ndarray: Transformed image
Preprocessed image blob ready for inference
"""
# Calculate face dimensions and center
width, height = bbox[2] - bbox[0], bbox[3] - bbox[1]
center = (bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2
# Determine scale to fit face with margin
scale = self.input_size[0] / (max(width, height) * 1.5)
rotation = 0.0
transformed_image, M = bbox_center_alignment(image, center, self.input_size[0], scale, rotation)
input_size = tuple(transformed_image.shape[0:2][::-1])
blob = cv2.dnn.blobFromImage(
transformed_image,
1.0/self.input_std,
input_size,
(self.input_mean, self.input_mean, self.input_mean),
swapRB=True
# Align face based on bounding box
aligned_face, _ = bbox_center_alignment(
image, center, self.input_size[0], scale, rotation
)
return blob
def postprocess(self, predictions: np.ndarray) -> Tuple[np.int64, int]:
"""Postprocessing
# Convert to blob format for network input
face_blob = cv2.dnn.blobFromImage(
aligned_face,
1.0 / self.input_std,
self.input_size,
(self.input_mean, self.input_mean, self.input_mean),
swapRB=True # Convert BGR to RGB
)
return face_blob
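# Worked example of the margin logic above (values are illustrative):
# bbox = [100, 120, 260, 320] -> width = 160, height = 200, center = (180, 220)
# scale = 112 / (200 * 1.5) ≈ 0.373, so the larger face side occupies roughly
# 200 * 0.373 ≈ 75 px of the 112 px crop, leaving context around the face.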
def postprocess(self, predictions: np.ndarray) -> Tuple[int, int]:
"""
Process model predictions to extract gender and age.
Args:
predictions (np.ndarray): Model predictions, shape: [1, 3]
predictions: Raw model output, shape [1, 3] where:
- First two elements represent gender logits
- Third element represents normalized age
Returns:
Tuple[np.int64, int]: Gender and Age values
Tuple containing:
- Gender (0: female, 1: male)
- Age in years
"""
gender = np.argmax(predictions[:2])
age = int(np.round(predictions[2]*100))
return gender, age
# First two values are gender logits (female/male)
gender = int(np.argmax(predictions[:2]))
def predict(self, image: np.ndarray, bbox: np.ndarray) -> Tuple[np.int64, int]:
blob = self.preprocess(image, bbox)
predictions = self.session.run(self.output_names, {self.input_names[0]: blob})[0][0]
gender, age = self.postprocess(predictions)
# Third value is normalized age that needs scaling
age = int(np.round(predictions[2] * 100))
return gender, age
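# Illustrative decoding of a raw prediction vector (assumed layout
# [female_logit, male_logit, normalized_age], as described above):
# predictions = [0.12, 0.88, 0.31] -> gender = argmax([0.12, 0.88]) = 1 (male)
#                                     age    = round(0.31 * 100)    = 31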
def predict(self, image: np.ndarray, bbox: np.ndarray) -> Tuple[int, int]:
"""
Predict age and gender for a face in the image.
Args:
image: Input image in BGR format
bbox: Face bounding box [x1, y1, x2, y2]
Returns:
- 'gender_id': Gender as integer (0: female, 1: male)
- 'age': Age in years
"""
# Preprocess and run inference
face_blob = self.preprocess(image, bbox)
predictions = self.session.run(
self.output_names,
{self.input_names[0]: face_blob}
)[0][0]
# Extract gender and age from predictions
gender_id, age = self.postprocess(predictions)
return gender_id, age
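# End-to-end sketch for AgeGender.predict. The detector call below is an
# assumption (the RetinaFace detection API is not shown in this diff); only the
# AgeGender usage mirrors the code above.
import cv2
from uniface.detection import RetinaFace
from uniface.constants import RetinaFaceWeights

image = cv2.imread("face.jpg")
detector = RetinaFace(model_name=RetinaFaceWeights.MNET_V2)
boxes, _ = detector.detect(image)      # hypothetical method name
age_gender = AgeGender()
gender_id, age = age_gender.predict(image, boxes[0])
print("female" if gender_id == 0 else "male", age)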
# TODO: For testing purposes only, remove later
def main():
from uniface.detection import RetinaFace
from uniface.constants import RetinaFaceWeights
face_detector = RetinaFace(
model_name=RetinaFaceWeights.MNET_V2,

View File

@@ -10,10 +10,9 @@ from PIL import Image
from typing import Tuple, Union
from uniface.log import Logger
from uniface import RetinaFace
from uniface.constants import DDAMFNWeights
from uniface.face_utils import face_alignment
from uniface.model_store import verify_model_weights
from uniface.constants import RetinaFaceWeights, DDAMFNWeights
class Emotion:
@@ -21,10 +20,11 @@ class Emotion:
Emotion recognition using a TorchScript model.
Args:
model_name (DDAMFNWeights): Pretrained model enum. Defaults to AFFECNET7.
model_weights (DDAMFNWeights): Pretrained model weights enum. Defaults to AFFECNET7.
input_size (Tuple[int, int]): Size of input images. Defaults to (112, 112).
Attributes:
emotions (List[str]): Emotion label list.
emotion_labels (List[str]): List of emotion labels the model can predict.
device (torch.device): Inference device (CPU or CUDA).
model (torch.jit.ScriptModule): Loaded TorchScript model.
@@ -33,122 +33,133 @@ class Emotion:
RuntimeError: If model loading fails.
"""
def __init__(self, model_name: DDAMFNWeights = DDAMFNWeights.AFFECNET7, input_size: Tuple[int, int] = (112, 112)) -> None:
def __init__(
self,
model_weights: DDAMFNWeights = DDAMFNWeights.AFFECNET7,
input_size: Tuple[int, int] = (112, 112)
) -> None:
"""
Initialize the emotion detector with a TorchScript model
"""
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.emotions = [
self.emotion_labels = [
"Neutral", "Happy", "Sad", "Surprise", "Fear", "Disgust", "Angry"
]
if model_name == DDAMFNWeights.AFFECNET8:
self.emotions.append("Contempt")
# Add contempt for AFFECNET8 model
if model_weights == DDAMFNWeights.AFFECNET8:
self.emotion_labels.append("Contempt")
# Initialize image preprocessing parameters
self.input_size = input_size
self.input_std = [0.229, 0.224, 0.225]
self.input_mean = [0.485, 0.456, 0.406]
self.normalization_std = [0.229, 0.224, 0.225]
self.normalization_mean = [0.485, 0.456, 0.406]
Logger.info(
f"Initialized Emotion class with model={model_name.name}, "
f"Initialized Emotion class with model={model_weights.name}, "
f"device={'cuda' if torch.cuda.is_available() else 'cpu'}, "
f"num_classes={len(self.emotions)}, input_size={self.input_size}"
f"num_classes={len(self.emotion_labels)}, input_size={self.input_size}"
)
# Get path to model weights
self._model_path = verify_model_weights(model_name)
Logger.info(f"Verified model weights located at: {self._model_path}")
# Get path to model weights and initialize model
self.model_path = verify_model_weights(model_weights)
Logger.info(f"Verified model weights located at: {self.model_path}")
self._load_model()
# Initialize model
self._initialize_model(model_path=self._model_path)
def _initialize_model(self, model_path: str) -> None:
def _load_model(self) -> None:
"""
Initializes a TorchScript model for emotion inference.
Loads and initializes a TorchScript model for emotion inference.
Args:
model_path (str): Path to the TorchScript (.pt) model.
Raises:
RuntimeError: If loading the model fails.
"""
try:
self.model = torch.jit.load(model_path, map_location=self.device)
self.model = torch.jit.load(self.model_path, map_location=self.device)
self.model.eval()
Logger.info(f"TorchScript model successfully loaded from: {model_path}")
Logger.info(f"TorchScript model successfully loaded from: {self.model_path}")
# Warm-up
dummy = torch.randn(1, 3, 112, 112).to(self.device)
# Warm-up with dummy input
dummy_input = torch.randn(1, 3, *self.input_size).to(self.device)
with torch.no_grad():
_ = self.model(dummy)
_ = self.model(dummy_input)
Logger.info("Emotion model warmed up with dummy input.")
except Exception as e:
Logger.error(f"Failed to load TorchScript model from {model_path}: {e}")
raise
Logger.error(f"Failed to load TorchScript model from {self.model_path}: {e}")
raise RuntimeError(f"Model loading failed: {str(e)}")
def preprocess(self, image: np.ndarray) -> torch.Tensor:
"""
Resize, normalize and convert image to tensor manually without torchvision.
Preprocess image for model inference: resize, normalize and convert to tensor.
Args:
image (np.ndarray): BGR image (H, W, 3)
Returns:
torch.Tensor: Preprocessed image tensor of shape (1, 3, 112, 112)
"""
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) # BGR -> RGB
# Resize to (112, 112)
image = cv2.resize(image, self.input_size).astype(np.float32) / 255.0
Returns:
torch.Tensor: Preprocessed image tensor of shape (1, 3, H, W)
"""
# Convert BGR to RGB
rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
# Resize to target input size
resized_image = cv2.resize(rgb_image, self.input_size).astype(np.float32) / 255.0
# Normalize with mean and std
mean = np.array(self.input_mean, dtype=np.float32)
std = np.array(self.input_std, dtype=np.float32)
image_normalized = (image - mean) / std
mean_array = np.array(self.normalization_mean, dtype=np.float32)
std_array = np.array(self.normalization_std, dtype=np.float32)
normalized_image = (resized_image - mean_array) / std_array
# HWC to CHW
image_transposed = image_normalized.transpose((2, 0, 1))
# Convert from HWC to CHW format
transposed_image = normalized_image.transpose((2, 0, 1))
# Convert to torch tensor and add batch dimension
tensor = torch.from_numpy(image_transposed).unsqueeze(0).to(self.device)
tensor = torch.from_numpy(transposed_image).unsqueeze(0).to(self.device)
return tensor
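# Worked example of the normalization above for a single red-channel pixel
# (ImageNet statistics, values illustrative):
# pixel 128 -> 128 / 255 ≈ 0.502 -> (0.502 - 0.485) / 0.229 ≈ 0.074
# Output tensor shape: (1, 3, 112, 112) for the default input_size.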
def predict(self, image: np.ndarray, landmark: np.ndarray) -> Tuple[Union[str, None], Union[float, None]]:
"""
Predict the emotion from an BGR face image.
Predict the emotion from a face image.
Args:
image (np.ndarray): Input face image in RGB format.
image (np.ndarray): Input face image in BGR format.
landmark (np.ndarray): Facial five point landmark.
Returns:
Tuple[str, float]: (Predicted emotion label, Confidence score)
Returns (None, None) if prediction fails.
Raises:
RuntimeError: If the input is invalid or inference fails internally.
ValueError: If the input is not a valid BGR image.
"""
# Validate input
if not isinstance(image, np.ndarray):
Logger.error("Input must be a NumPy ndarray.")
raise ValueError("Input must be a NumPy ndarray (RGB image).")
raise ValueError("Input must be a NumPy ndarray (BGR image).")
if image.ndim != 3 or image.shape[2] != 3:
Logger.error(f"Invalid image shape: {image.shape}. Expected HxWx3 RGB image.")
raise ValueError("Input image must be in RGB format with shape (H, W, 3).")
Logger.error(f"Invalid image shape: {image.shape}. Expected HxWx3 image.")
raise ValueError("Input image must have shape (H, W, 3).")
try:
image, _ = face_alignment(image, landmark)
tensor = self.preprocess(image)
# Align face using landmarks
aligned_image, _ = face_alignment(image, landmark)
# Preprocess and run inference
input_tensor = self.preprocess(aligned_image)
with torch.no_grad():
output = self.model(tensor)
output = self.model(input_tensor)
# Handle case where model returns a tuple
if isinstance(output, tuple):
output = output[0]
probs = torch.nn.functional.softmax(output, dim=1).squeeze(0).cpu().numpy()
pred_idx = int(np.argmax(probs))
confidence = round(float(probs[pred_idx]), 2)
# Get probabilities and prediction
probabilities = torch.nn.functional.softmax(output, dim=1).squeeze(0).cpu().numpy()
predicted_index = int(np.argmax(probabilities))
confidence_score = round(float(probabilities[predicted_index]), 2)
return self.emotions[pred_idx], confidence
return self.emotion_labels[predicted_index], confidence_score
except Exception as e:
Logger.error(f"Emotion inference failed: {e}")
@@ -158,6 +169,8 @@ class Emotion:
# TODO: For testing purposes only, remove later
def main():
from uniface import RetinaFace
from uniface.constants import RetinaFaceWeights
face_detector = RetinaFace(
model_name=RetinaFaceWeights.MNET_V2,

View File

@@ -11,8 +11,9 @@ import onnxruntime as ort
from typing import Tuple, List, Literal
from uniface.log import Logger
from uniface.model_store import verify_model_weights
from uniface.constants import SCRFDWeights
from uniface.model_store import verify_model_weights
from .utils import non_max_supression, distance2bbox, distance2kps, resize_image
__all__ = ['SCRFD']
@@ -248,13 +249,12 @@ class SCRFD:
sorted_indices = np.argsort(values)[::-1][:max_num]
det = det[sorted_indices]
landmarks = landmarks[sorted_indices]
return det, landmarks
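# Small illustration of the max_num selection above: detections are sorted by
# the score/area values in descending order and only the top max_num are kept.
import numpy as np

values = np.array([0.7, 0.9, 0.3])
order = np.argsort(values)[::-1][:2]   # -> array([1, 0]): the two best detections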
# TODO: below is only for testing, remove it later
def draw_bbox(frame, bbox, color=(0, 255, 0), thickness=2):
x1, y1, x2, y2 = bbox[:4].astype(np.int32)
cv2.rectangle(frame, (x1, y1), (x2, y2), color, thickness)
@@ -267,6 +267,8 @@ def draw_keypoints(frame, points, color=(0, 0, 255), radius=2):
cv2.circle(frame, (x, y), radius, color, -1)
# TODO: Remove late, just for testing
if __name__ == "__main__":
detector = SCRFD(model_name=SCRFDWeights.SCRFD_500M_KPS)
cap = cv2.VideoCapture(0)

View File

@@ -6,131 +6,163 @@ import numpy as np
from typing import Tuple
from uniface.log import Logger
from uniface.face_utils import bbox_center_alignment, transform_points_2d
from uniface.constants import LandmarkWeights
from uniface.model_store import verify_model_weights
from uniface.detection import RetinaFace
from uniface.constants import RetinaFaceWeights, LandmarkWeights
from uniface.face_utils import bbox_center_alignment, transform_points_2d
__all__ = ['Landmark']
class Landmark:
def __init__(self, model_name: LandmarkWeights = LandmarkWeights.DEFAULT, input_size: Tuple[int, int] = (192, 192)) -> None:
"""
Facial landmark detection model for predicting facial keypoints.
"""
def __init__(
self,
model_name: LandmarkWeights = LandmarkWeights.DEFAULT,
input_size: Tuple[int, int] = (192, 192)
) -> None:
"""
Initializes the Facial Landmark model for inference.
Args:
model_path (str): Path to the ONNX file.
model_name: Enum specifying which landmark model weights to use
input_size: Input resolution for the model (width, height)
"""
Logger.info(
f"Initializing Facial Landmark with model={model_name}, "
f"input_size={input_size}"
)
# Initialize configuration
self.input_size = input_size
self.input_std = 1.0
self.input_mean = 0.0
# Get path to model weights
self._model_path = verify_model_weights(model_name)
Logger.info(f"Verfied model weights located at: {self._model_path}")
self.model_path = verify_model_weights(model_name)
Logger.info(f"Verified model weights located at: {self.model_path}")
# Initialize model
self._initialize_model(model_path=self._model_path)
self._initialize_model()
def _initialize_model(self, model_path: str):
""" Initialize the model from the given path.
Args:
model_path (str): Path to .onnx model.
def _initialize_model(self):
"""
Initialize the ONNX model from the stored model path.
Raises:
RuntimeError: If the model fails to load or initialize.
"""
try:
self.session = ort.InferenceSession(
model_path,
self.model_path,
providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
)
metadata = self.session.get_inputs()[0]
input_shape = metadata.shape
self.input_size = tuple(input_shape[2:4][::-1])
# Get input configuration
input_metadata = self.session.get_inputs()[0]
input_shape = input_metadata.shape
self.input_size = tuple(input_shape[2:4][::-1]) # Update input size from model
self.input_names = [x.name for x in self.session.get_inputs()]
self.output_names = [x.name for x in self.session.get_outputs()]
# Get input/output names
self.input_names = [input.name for input in self.session.get_inputs()]
self.output_names = [output.name for output in self.session.get_outputs()]
outputs = self.session.get_outputs()
output_shape = outputs[0].shape
self.lmk_dim = 2
self.lmk_num = output_shape[1] // self.lmk_dim
# Determine landmark dimensions from output shape
output_shape = self.session.get_outputs()[0].shape
self.lmk_dim = 2 # x,y coordinates
self.lmk_num = output_shape[1] // self.lmk_dim # Number of landmarks
Logger.info(f"Model initialized with {self.lmk_num} landmarks")
except Exception as e:
print(f"Failed to load the model: {e}")
raise
Logger.error(f"Failed to load landmark model from '{self.model_path}'", exc_info=True)
raise RuntimeError(f"Failed to initialize landmark model: {e}")
def preprocess(self, image: np.ndarray, bbox: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
"""
Preprocess the input image and bbox for inference.
Preprocess the input image and bounding box for inference.
Args:
image (np.ndarray): Input image.
bbox (np.ndarray): Bounding box [x1, y1, x2, y2].
image: Input image in BGR format
bbox: Bounding box coordinates [x1, y1, x2, y2]
Returns:
Tuple[np.ndarray, np.ndarray]: Preprocessed blob and transformation matrix.
Tuple containing:
- Preprocessed image blob ready for inference
- Transformation matrix for mapping predictions back to original image
"""
# Calculate face dimensions and center
width, height = bbox[2] - bbox[0], bbox[3] - bbox[1]
center = (bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2
# Determine scale to fit face with some margin
scale = self.input_size[0] / (max(width, height) * 1.5)
rotation = 0.0
transformed_image, M = bbox_center_alignment(image, center, self.input_size[0], scale, rotation)
input_size = tuple(transformed_image.shape[0:2][::-1])
blob = cv2.dnn.blobFromImage(
transformed_image,
1.0/self.input_std,
input_size,
(self.input_mean, self.input_mean, self.input_mean),
swapRB=True
# Align face using center, scale and rotation
aligned_face, transform_matrix = bbox_center_alignment(
image, center, self.input_size[0], scale, rotation
)
return blob, M
# Convert to blob format for inference
face_blob = cv2.dnn.blobFromImage(
aligned_face,
1.0 / self.input_std,
self.input_size,
(self.input_mean, self.input_mean, self.input_mean),
swapRB=True # Convert BGR to RGB
)
return face_blob, transform_matrix
def postprocess(self, predictions: np.ndarray, M: np.ndarray) -> np.ndarray:
def postprocess(self, predictions: np.ndarray, transform_matrix: np.ndarray) -> np.ndarray:
"""
Postprocess model outputs to get landmarks.
Convert raw model predictions to image coordinates.
Args:
predictions (np.ndarray): Raw model predictions.
M (np.ndarray): Affine transformation matrix.
predictions: Raw landmark coordinates from model output
transform_matrix: Affine transformation matrix from preprocessing
Returns:
np.ndarray: Transformed landmarks.
Landmarks in original image coordinates
"""
# Reshape to pairs of x,y coordinates
landmarks = predictions.reshape((-1, 2))
predictions = predictions.reshape((-1, 2))
# Denormalize coordinates to pixel space
landmarks[:, 0:2] += 1 # Shift from [-1,1] to [0,2] range
landmarks[:, 0:2] *= (self.input_size[0] // 2) # Scale to pixel coordinates
predictions[:, 0:2] += 1
predictions[:, 0:2] *= (self.input_size[0] // 2)
# Invert the transformation to map back to original image
inverse_matrix = cv2.invertAffineTransform(transform_matrix)
landmarks = transform_points_2d(landmarks, inverse_matrix)
IM = cv2.invertAffineTransform(M)
predictions = transform_points_2d(predictions, IM)
return predictions
return landmarks
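# Worked example of the denormalization above for a 192 x 192 model input:
# a raw coordinate of -0.5 becomes (-0.5 + 1) * (192 // 2) = 48 px in the
# aligned crop, which is then mapped back to the original image through the
# inverse affine transform.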
def predict(self, image: np.ndarray, bbox: np.ndarray) -> np.ndarray:
"""
Predict facial landmarks for the given image and bounding box.
Predict facial landmarks for the given image and face bounding box.
Args:
image (np.ndarray): Input image.
bbox (np.ndarray): Bounding box [x1, y1, x2, y2].
image: Input image in BGR format
bbox: Face bounding box [x1, y1, x2, y2]
Returns:
np.ndarray: Predicted landmarks.
Array of facial landmarks in original image coordinates
"""
blob, M = self.preprocess(image, bbox)
preds = self.session.run(self.output_names, {self.input_names[0]: blob})[0][0]
landmarks = self.postprocess(preds, M)
# Preprocess image
face_blob, transform_matrix = self.preprocess(image, bbox)
# Run inference
raw_predictions = self.session.run(
self.output_names,
{self.input_names[0]: face_blob}
)[0][0]
# Postprocess to get landmarks in original image space
landmarks = self.postprocess(raw_predictions, transform_matrix)
return landmarks
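# Usage sketch for Landmark.predict, mirroring the test block below. The
# detector call is an assumption (its API is not shown in this diff); only the
# Landmark usage follows the code above.
import cv2
from uniface.detection import RetinaFace
from uniface.constants import RetinaFaceWeights

image = cv2.imread("face.jpg")
detector = RetinaFace(model_name=RetinaFaceWeights.MNET_V2, conf_thresh=0.5)
boxes, _ = detector.detect(image)           # hypothetical method name
landmark_model = Landmark()
points = landmark_model.predict(image, boxes[0])   # (num_landmarks, 2) array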
@@ -138,7 +170,9 @@ class Landmark:
if __name__ == "__main__":
from uniface.detection import RetinaFace
from uniface.constants import RetinaFaceWeights
face_detector = RetinaFace(
model_name=RetinaFaceWeights.MNET_V2,
conf_thresh=0.5,

View File

@@ -7,7 +7,6 @@ import os
import cv2
import numpy as np
import onnxruntime as ort
from typing import Tuple, Optional, Union, List
from dataclasses import dataclass
@@ -37,100 +36,99 @@ class BaseFaceEncoder:
def __init__(
self,
model_name: SphereFaceWeights | MobileFaceWeights | ArcFaceWeights = MobileFaceWeights.MNET_V2,
model_name: Union[SphereFaceWeights, MobileFaceWeights, ArcFaceWeights] = MobileFaceWeights.MNET_V2,
preprocessing: PreprocessConfig = PreprocessConfig(),
) -> None:
"""
Initializes the FaceEncoder model for inference.
Args:
model_name (SphereFaceWeights | MobileFaceWeights | ArcFaceWeights): Selected model weight enum.
preprocessing (PreprocessConfig): Configuration for input normalization and resizing.
model_name: Selected model weight enum.
preprocessing: Configuration for input normalization and resizing.
"""
# Store preprocessing parameters
self.input_mean = preprocessing.input_mean
self.input_std = preprocessing.input_std
self.input_size = preprocessing.input_size
Logger.info(
f"Initializing Face Recognition with model={model_name}, "
f"input_mean={self.input_mean}, input_std={self.input_std}, input_size={self.input_size}"
f"input_mean={self.input_mean}, input_std={self.input_std}, "
f"input_size={self.input_size}"
)
# Get path to model weights
self._model_path = verify_model_weights(model_name)
Logger.info(f"Verfied model weights located at: {self._model_path}")
# Get path to model weights and initialize model
self.model_path = verify_model_weights(model_name)
Logger.info(f"Verified model weights located at: {self.model_path}")
# Initialize model
self._initialize_model(self._model_path)
self._initialize_model()
def _initialize_model(self, model_path: str) -> None:
def _initialize_model(self) -> None:
"""
Loads the ONNX model and prepares it for inference.
Args:
model_path (str): Path to the ONNX model file.
Raises:
RuntimeError: If the model fails to load or initialize.
"""
try:
# Initialize model session with available providers
self.session = ort.InferenceSession(
model_path,
self.model_path,
providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
)
self._setup_model()
Logger.info(f"Successfully initialized face encoder from {model_path}")
# Extract input configuration
input_cfg = self.session.get_inputs()[0]
self.input_name = input_cfg.name
# Verify input dimensions match our configuration
input_shape = input_cfg.shape
model_input_size = tuple(input_shape[2:4][::-1]) # (width, height)
if model_input_size != self.input_size:
Logger.warning(f"Model input size {model_input_size} differs from configured size {self.input_size}")
# Extract output configuration
self.output_names = [output.name for output in self.session.get_outputs()]
self.output_shape = self.session.get_outputs()[0].shape
assert len(self.output_names) == 1, "Expected only one output node."
Logger.info(f"Successfully initialized face encoder from {self.model_path}")
except Exception as e:
Logger.error(f"Failed to load face encoder model from '{model_path}'", exc_info=True)
raise RuntimeError(f"Failed to initialize model session for '{model_path}'") from e
Logger.error(f"Failed to load face encoder model from '{self.model_path}'", exc_info=True)
raise RuntimeError(f"Failed to initialize model session for '{self.model_path}'") from e
def _setup_model(self) -> None:
"""
Extracts input/output configuration from the ONNX model session.
"""
input_cfg = self.session.get_inputs()[0]
input_shape = input_cfg.shape
model_input_size = tuple(input_shape[2:4][::-1]) # (width, height)
if model_input_size != self.input_size:
Logger.warning(f"Model input size {model_input_size} differs from configured size {self.input_size}")
self.input_name = input_cfg.name
self.output_names = [output.name for output in self.session.get_outputs()]
self.output_shape = self.session.get_outputs()[0].shape
assert len(self.output_names) == 1, "Expected only one output node."
def preprocess(self, image: np.ndarray) -> np.ndarray:
def preprocess(self, face_img: np.ndarray) -> np.ndarray:
"""
Preprocess the image: resize, normalize, and convert it to a blob.
Args:
image (np.ndarray): Input image in BGR format.
face_img: Input image in BGR format.
Returns:
np.ndarray: Preprocessed image as a NumPy array ready for inference.
Preprocessed image as a NumPy array ready for inference.
"""
image = cv2.resize(image, self.input_size) # Resize to (112, 112)
if isinstance(self.input_std, (list, tuple)):
# if self.input_std is a list, we assume it's per-channel std
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
resized_img = cv2.resize(face_img, self.input_size)
image -= np.array(self.input_mean, dtype=np.float32)
image /= np.array(self.input_std, dtype=np.float32)
if isinstance(self.input_std, (list, tuple)):
# Per-channel normalization
rgb_img = cv2.cvtColor(resized_img, cv2.COLOR_BGR2RGB).astype(np.float32)
normalized_img = (rgb_img - np.array(self.input_mean, dtype=np.float32)) / \
np.array(self.input_std, dtype=np.float32)
# Change to NCHW (batch, channels, height, width)
blob = np.transpose(image, (2, 0, 1)) # CHW
blob = np.transpose(normalized_img, (2, 0, 1)) # CHW
blob = np.expand_dims(blob, axis=0) # NCHW
else:
# cv2.dnn.blobFromImage does not support per-channel std so we use a single value here
# Single-value normalization
blob = cv2.dnn.blobFromImage(
image,
resized_img,
scalefactor=1.0 / self.input_std,
size=self.input_size,
mean=(self.input_mean, self.input_mean, self.input_mean),
swapRB=True # Convert BGR to RGB
)
return blob
def get_embedding(self, image: np.ndarray, landmarks: np.ndarray) -> np.ndarray:
@@ -138,13 +136,17 @@ class BaseFaceEncoder:
Extracts face embedding from an aligned image.
Args:
image (np.ndarray): Input face image (BGR format).
landmarks (np.ndarray): Facial landmarks (5 points for alignment).
image: Input face image (BGR format).
landmarks: Facial landmarks (5 points for alignment).
Returns:
np.ndarray: 512-dimensional face embedding.
Face embedding vector (typically 512-dimensional).
"""
aligned_face, _ = face_alignment(image, landmarks) # Use your function for alignment
blob = self.preprocess(aligned_face) # Convert to blob
embedding = self.session.run(self.output_names, {self.input_name: blob})[0]
return embedding # Return the 512-D feature vector
# Align face using landmarks
aligned_face, _ = face_alignment(image, landmarks)
# Generate embedding from aligned face
face_blob = self.preprocess(aligned_face)
embedding = self.session.run(self.output_names, {self.input_name: face_blob})[0]
return embedding
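# Sketch of comparing two faces with BaseFaceEncoder.get_embedding. The cosine
# similarity step is a common convention, not something this diff prescribes;
# image_a/image_b and their landmarks are placeholders for detector output.
import numpy as np

encoder = BaseFaceEncoder()                             # defaults to MobileFaceWeights.MNET_V2
emb_a = encoder.get_embedding(image_a, landmarks_a)     # landmarks from a face detector
emb_b = encoder.get_embedding(image_b, landmarks_b)

emb_a, emb_b = emb_a.ravel(), emb_b.ravel()
similarity = float(np.dot(emb_a, emb_b) / (np.linalg.norm(emb_a) * np.linalg.norm(emb_b)))
print(f"cosine similarity: {similarity:.4f}")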