ref: Update some modules and remove redundant parts

yakhyo
2025-05-08 17:11:13 +09:00
parent b35b1a3f7c
commit fb29a919b1
6 changed files with 323 additions and 228 deletions

View File

@@ -18,7 +18,6 @@ def extract_reference_embedding(detector, recognizer, image_path):
        raise RuntimeError("No faces found in reference image.")
    embedding = recognizer.get_embedding(image, landmarks[0])
-    print(f"Reference embedding extracted (L2 norm = {np.linalg.norm(embedding):.4f})")
    return embedding

View File

@@ -4,12 +4,10 @@ import onnxruntime as ort
from typing import Tuple
from uniface.log import Logger
+from uniface.constants import AgeGenderWeights
from uniface.face_utils import bbox_center_alignment
from uniface.model_store import verify_model_weights
-from uniface.constants import AgeGenderWeights
-from uniface.detection import RetinaFace
-from uniface.constants import RetinaFaceWeights

__all__ = ["AgeGender"]
@@ -17,109 +15,156 @@ __all__ = ["AgeGender"]
class AgeGender:
    """
    Age and Gender Prediction Model.
+    This model predicts both a person's gender (male/female) and age from a facial image.
+    Gender is returned as an integer (0: female, 1: male) and age as years.
    """

-    def __init__(self, model_name: AgeGenderWeights = AgeGenderWeights.DEFAULT, input_size: Tuple[int, int] = (112, 112)) -> None:
+    def __init__(
+        self,
+        model_name: AgeGenderWeights = AgeGenderWeights.DEFAULT,
+        input_size: Tuple[int, int] = (112, 112)
+    ) -> None:
        """
-        Initializes the Attribute model for inference.
+        Initializes the Age and Gender prediction model.
        Args:
-            model_path (str): Path to the ONNX file.
+            model_name: Model weights enum to use
+            input_size: Input resolution for the model (width, height)
        """
        Logger.info(
            f"Initializing AgeGender with model={model_name}, "
            f"input_size={input_size}"
        )
+        # Model configuration
        self.input_size = input_size
        self.input_std = 1.0
        self.input_mean = 0.0

        # Get path to model weights
-        self._model_path = verify_model_weights(model_name)
-        Logger.info(f"Verfied model weights located at: {self._model_path}")
+        self.model_path = verify_model_weights(model_name)
+        Logger.info(f"Verified model weights located at: {self.model_path}")

        # Initialize model
-        self._initialize_model(model_path=self._model_path)
+        self._initialize_model()

-    def _initialize_model(self, model_path: str):
-        """Initialize the model from the given path.
-        Args:
-            model_path (str): Path to .onnx model.
+    def _initialize_model(self):
+        """
+        Initialize the ONNX model for inference.
+        Raises:
+            RuntimeError: If the model fails to load or initialize.
        """
        try:
+            # Initialize session with available providers
            self.session = ort.InferenceSession(
-                model_path,
+                self.model_path,
                providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
            )
-            # Get model info
-            metadata = self.session.get_inputs()[0]
-            input_shape = metadata.shape
-            self.input_size = tuple(input_shape[2:4][::-1])
-            self.input_names = [x.name for x in self.session.get_inputs()]
-            self.output_names = [x.name for x in self.session.get_outputs()]
+            # Extract model metadata
+            input_metadata = self.session.get_inputs()[0]
+            input_shape = input_metadata.shape
+            self.input_size = tuple(input_shape[2:4][::-1])  # Update from model (width, height)
+            # Get input/output names
+            self.input_names = [input.name for input in self.session.get_inputs()]
+            self.output_names = [output.name for output in self.session.get_outputs()]
+            Logger.info(f"Successfully initialized AgeGender model")
        except Exception as e:
-            print(f"Failed to load the model: {e}")
-            raise
+            Logger.error(f"Failed to load AgeGender model from '{self.model_path}'", exc_info=True)
+            raise RuntimeError(f"Failed to initialize AgeGender model: {e}")

-    def preprocess(self, image: np.ndarray, bbox: np.ndarray):
-        """Preprocessing
+    def preprocess(self, image: np.ndarray, bbox: np.ndarray) -> np.ndarray:
+        """
+        Preprocess the input image and face bounding box for inference.
        Args:
-            image (np.ndarray): Numpy image
-            bbox (np.ndarray): Bounding box coordinates: [x1, y1, x2, y2]
+            image: Input image in BGR format
+            bbox: Face bounding box coordinates [x1, y1, x2, y2]
        Returns:
-            np.ndarray: Transformed image
+            Preprocessed image blob ready for inference
        """
+        # Calculate face dimensions and center
        width, height = bbox[2] - bbox[0], bbox[3] - bbox[1]
        center = (bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2
+        # Determine scale to fit face with margin
        scale = self.input_size[0] / (max(width, height) * 1.5)
        rotation = 0.0

-        transformed_image, M = bbox_center_alignment(image, center, self.input_size[0], scale, rotation)
-        input_size = tuple(transformed_image.shape[0:2][::-1])
-        blob = cv2.dnn.blobFromImage(
-            transformed_image,
-            1.0/self.input_std,
-            input_size,
-            (self.input_mean, self.input_mean, self.input_mean),
-            swapRB=True
-        )
-        return blob
+        # Align face based on bounding box
+        aligned_face, _ = bbox_center_alignment(
+            image, center, self.input_size[0], scale, rotation
+        )
+        # Convert to blob format for network input
+        face_blob = cv2.dnn.blobFromImage(
+            aligned_face,
+            1.0 / self.input_std,
+            self.input_size,
+            (self.input_mean, self.input_mean, self.input_mean),
+            swapRB=True  # Convert BGR to RGB
+        )
+        return face_blob

-    def postprocess(self, predictions: np.ndarray) -> Tuple[np.int64, int]:
-        """Postprocessing
+    def postprocess(self, predictions: np.ndarray) -> Tuple[int, int]:
+        """
+        Process model predictions to extract gender and age.
        Args:
-            predictions (np.ndarray): Model predictions, shape: [1, 3]
+            predictions: Raw model output, shape [1, 3] where:
+                - First two elements represent gender logits
+                - Third element represents normalized age
        Returns:
-            Tuple[np.int64, int]: Gender and Age values
+            Tuple containing:
+                - Gender (0: female, 1: male)
+                - Age in years
        """
-        gender = np.argmax(predictions[:2])
-        age = int(np.round(predictions[2]*100))
+        # First two values are gender logits (female/male)
+        gender = int(np.argmax(predictions[:2]))
+        # Third value is normalized age that needs scaling
+        age = int(np.round(predictions[2] * 100))
        return gender, age

-    def predict(self, image: np.ndarray, bbox: np.ndarray) -> Tuple[np.int64, int]:
-        blob = self.preprocess(image, bbox)
-        predictions = self.session.run(self.output_names, {self.input_names[0]: blob})[0][0]
-        gender, age = self.postprocess(predictions)
-        return gender, age
+    def predict(self, image: np.ndarray, bbox: np.ndarray) -> Tuple[int, int]:
+        """
+        Predict age and gender for a face in the image.
+        Args:
+            image: Input image in BGR format
+            bbox: Face bounding box [x1, y1, x2, y2]
+        Returns:
+            - 'gender_id': Gender as integer (0: female, 1: male)
+            - 'age': Age in years
+        """
+        # Preprocess and run inference
+        face_blob = self.preprocess(image, bbox)
+        predictions = self.session.run(
+            self.output_names,
+            {self.input_names[0]: face_blob}
+        )[0][0]
+        # Extract gender and age from predictions
+        gender_id, age = self.postprocess(predictions)
+        return gender_id, age

# TODO: For testing purposes only, remove later
def main():
+    from uniface.detection import RetinaFace
+    from uniface.constants import RetinaFaceWeights

    face_detector = RetinaFace(
        model_name=RetinaFaceWeights.MNET_V2,

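A minimal usage sketch for the updated AgeGender API. The AgeGender import path and the detector's detect() call (returning bounding boxes and five-point landmarks) are assumptions not shown in this diff:

import cv2
from uniface.detection import RetinaFace
from uniface.constants import RetinaFaceWeights, AgeGenderWeights
from uniface.age_gender import AgeGender  # module path assumed

detector = RetinaFace(model_name=RetinaFaceWeights.MNET_V2)
age_gender = AgeGender(model_name=AgeGenderWeights.DEFAULT)

image = cv2.imread("face.jpg")            # BGR image
boxes, landmarks = detector.detect(image)  # assumed detection signature
if len(boxes) > 0:
    gender_id, age = age_gender.predict(image, boxes[0])
    print("female" if gender_id == 0 else "male", age)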
View File

@@ -10,10 +10,9 @@ from PIL import Image
from typing import Tuple, Union
from uniface.log import Logger
-from uniface import RetinaFace
+from uniface.constants import DDAMFNWeights
from uniface.face_utils import face_alignment
from uniface.model_store import verify_model_weights
-from uniface.constants import RetinaFaceWeights, DDAMFNWeights

class Emotion:
@@ -21,10 +20,11 @@ class Emotion:
    Emotion recognition using a TorchScript model.

    Args:
-        model_name (DDAMFNWeights): Pretrained model enum. Defaults to AFFECNET7.
+        model_weights (DDAMFNWeights): Pretrained model weights enum. Defaults to AFFECNET7.
+        input_size (Tuple[int, int]): Size of input images. Defaults to (112, 112).

    Attributes:
-        emotions (List[str]): Emotion label list.
+        emotion_labels (List[str]): List of emotion labels the model can predict.
        device (torch.device): Inference device (CPU or CUDA).
        model (torch.jit.ScriptModule): Loaded TorchScript model.
@@ -33,122 +33,133 @@ class Emotion:
        RuntimeError: If model loading fails.
    """

-    def __init__(self, model_name: DDAMFNWeights = DDAMFNWeights.AFFECNET7, input_size: Tuple[int, int] = (112, 112)) -> None:
+    def __init__(
+        self,
+        model_weights: DDAMFNWeights = DDAMFNWeights.AFFECNET7,
+        input_size: Tuple[int, int] = (112, 112)
+    ) -> None:
        """
        Initialize the emotion detector with a TorchScript model
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        self.emotions = [
+        self.emotion_labels = [
            "Neutral", "Happy", "Sad", "Surprise", "Fear", "Disgust", "Angry"
        ]
-        if model_name == DDAMFNWeights.AFFECNET8:
-            self.emotions.append("Contempt")
+        # Add contempt for AFFECNET8 model
+        if model_weights == DDAMFNWeights.AFFECNET8:
+            self.emotion_labels.append("Contempt")

+        # Initialize image preprocessing parameters
        self.input_size = input_size
-        self.input_std = [0.229, 0.224, 0.225]
-        self.input_mean = [0.485, 0.456, 0.406]
+        self.normalization_std = [0.229, 0.224, 0.225]
+        self.normalization_mean = [0.485, 0.456, 0.406]

        Logger.info(
-            f"Initialized Emotion class with model={model_name.name}, "
+            f"Initialized Emotion class with model={model_weights.name}, "
            f"device={'cuda' if torch.cuda.is_available() else 'cpu'}, "
-            f"num_classes={len(self.emotions)}, input_size={self.input_size}"
+            f"num_classes={len(self.emotion_labels)}, input_size={self.input_size}"
        )

-        # Get path to model weights
-        self._model_path = verify_model_weights(model_name)
-        Logger.info(f"Verified model weights located at: {self._model_path}")
+        # Get path to model weights and initialize model
+        self.model_path = verify_model_weights(model_weights)
+        Logger.info(f"Verified model weights located at: {self.model_path}")
+        self._load_model()

-        # Initialize model
-        self._initialize_model(model_path=self._model_path)

-    def _initialize_model(self, model_path: str) -> None:
+    def _load_model(self) -> None:
        """
-        Initializes a TorchScript model for emotion inference.
-        Args:
-            model_path (str): Path to the TorchScript (.pt) model.
+        Loads and initializes a TorchScript model for emotion inference.
+        Raises:
+            RuntimeError: If loading the model fails.
        """
        try:
-            self.model = torch.jit.load(model_path, map_location=self.device)
+            self.model = torch.jit.load(self.model_path, map_location=self.device)
            self.model.eval()
-            Logger.info(f"TorchScript model successfully loaded from: {model_path}")
+            Logger.info(f"TorchScript model successfully loaded from: {self.model_path}")

-            # Warm-up
-            dummy = torch.randn(1, 3, 112, 112).to(self.device)
+            # Warm-up with dummy input
+            dummy_input = torch.randn(1, 3, *self.input_size).to(self.device)
            with torch.no_grad():
-                _ = self.model(dummy)
+                _ = self.model(dummy_input)
            Logger.info("Emotion model warmed up with dummy input.")
        except Exception as e:
-            Logger.error(f"Failed to load TorchScript model from {model_path}: {e}")
-            raise
+            Logger.error(f"Failed to load TorchScript model from {self.model_path}: {e}")
+            raise RuntimeError(f"Model loading failed: {str(e)}")

    def preprocess(self, image: np.ndarray) -> torch.Tensor:
        """
-        Resize, normalize and convert image to tensor manually without torchvision.
+        Preprocess image for model inference: resize, normalize and convert to tensor.
        Args:
            image (np.ndarray): BGR image (H, W, 3)
        Returns:
-            torch.Tensor: Preprocessed image tensor of shape (1, 3, 112, 112)
+            torch.Tensor: Preprocessed image tensor of shape (1, 3, H, W)
        """
-        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # BGR -> RGB
-        # Resize to (112, 112)
-        image = cv2.resize(image, self.input_size).astype(np.float32) / 255.0
+        # Convert BGR to RGB
+        rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+        # Resize to target input size
+        resized_image = cv2.resize(rgb_image, self.input_size).astype(np.float32) / 255.0

        # Normalize with mean and std
-        mean = np.array(self.input_mean, dtype=np.float32)
-        std = np.array(self.input_std, dtype=np.float32)
-        image_normalized = (image - mean) / std
+        mean_array = np.array(self.normalization_mean, dtype=np.float32)
+        std_array = np.array(self.normalization_std, dtype=np.float32)
+        normalized_image = (resized_image - mean_array) / std_array

-        # HWC to CHW
-        image_transposed = image_normalized.transpose((2, 0, 1))
+        # Convert from HWC to CHW format
+        transposed_image = normalized_image.transpose((2, 0, 1))

        # Convert to torch tensor and add batch dimension
-        tensor = torch.from_numpy(image_transposed).unsqueeze(0).to(self.device)
+        tensor = torch.from_numpy(transposed_image).unsqueeze(0).to(self.device)
        return tensor

    def predict(self, image: np.ndarray, landmark: np.ndarray) -> Tuple[Union[str, None], Union[float, None]]:
        """
-        Predict the emotion from an BGR face image.
+        Predict the emotion from a face image.
        Args:
-            image (np.ndarray): Input face image in RGB format.
+            image (np.ndarray): Input face image in BGR format.
            landmark (np.ndarray): Facial five point landmark.
        Returns:
            Tuple[str, float]: (Predicted emotion label, Confidence score)
+            Returns (None, None) if prediction fails.
        Raises:
-            RuntimeError: If the input is invalid or inference fails internally.
+            ValueError: If the input is not a valid BGR image.
        """
+        # Validate input
        if not isinstance(image, np.ndarray):
            Logger.error("Input must be a NumPy ndarray.")
-            raise ValueError("Input must be a NumPy ndarray (RGB image).")
+            raise ValueError("Input must be a NumPy ndarray (BGR image).")
        if image.ndim != 3 or image.shape[2] != 3:
-            Logger.error(f"Invalid image shape: {image.shape}. Expected HxWx3 RGB image.")
-            raise ValueError("Input image must be in RGB format with shape (H, W, 3).")
+            Logger.error(f"Invalid image shape: {image.shape}. Expected HxWx3 image.")
+            raise ValueError("Input image must have shape (H, W, 3).")

        try:
-            image, _ = face_alignment(image, landmark)
-            tensor = self.preprocess(image)
+            # Align face using landmarks
+            aligned_image, _ = face_alignment(image, landmark)
+            # Preprocess and run inference
+            input_tensor = self.preprocess(aligned_image)
            with torch.no_grad():
-                output = self.model(tensor)
+                output = self.model(input_tensor)
+            # Handle case where model returns a tuple
            if isinstance(output, tuple):
                output = output[0]

-            probs = torch.nn.functional.softmax(output, dim=1).squeeze(0).cpu().numpy()
-            pred_idx = int(np.argmax(probs))
-            confidence = round(float(probs[pred_idx]), 2)
+            # Get probabilities and prediction
+            probabilities = torch.nn.functional.softmax(output, dim=1).squeeze(0).cpu().numpy()
+            predicted_index = int(np.argmax(probabilities))
+            confidence_score = round(float(probabilities[predicted_index]), 2)

-            return self.emotions[pred_idx], confidence
+            return self.emotion_labels[predicted_index], confidence_score
        except Exception as e:
            Logger.error(f"Emotion inference failed: {e}")
@@ -158,6 +169,8 @@ class Emotion:
# TODO: For testing purposes only, remove later
def main():
+    from uniface import RetinaFace
+    from uniface.constants import RetinaFaceWeights

    face_detector = RetinaFace(
        model_name=RetinaFaceWeights.MNET_V2,

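A short usage sketch for the reworked Emotion class. The Emotion import path and the detector call returning five-point landmarks are assumptions, not part of this diff:

import cv2
from uniface import RetinaFace
from uniface.constants import RetinaFaceWeights, DDAMFNWeights
from uniface.emotion import Emotion  # module path assumed

detector = RetinaFace(model_name=RetinaFaceWeights.MNET_V2)
emotion_model = Emotion(model_weights=DDAMFNWeights.AFFECNET7)

frame = cv2.imread("face.jpg")             # BGR image
boxes, landmarks = detector.detect(frame)  # assumed detection signature
if len(landmarks) > 0:
    label, score = emotion_model.predict(frame, landmarks[0])
    print(label, score)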
View File

@@ -11,8 +11,9 @@ import onnxruntime as ort
from typing import Tuple, List, Literal
from uniface.log import Logger
-from uniface.model_store import verify_model_weights
from uniface.constants import SCRFDWeights
+from uniface.model_store import verify_model_weights
from .utils import non_max_supression, distance2bbox, distance2kps, resize_image

__all__ = ['SCRFD']
@@ -248,13 +249,12 @@
        sorted_indices = np.argsort(values)[::-1][:max_num]
        det = det[sorted_indices]
        landmarks = landmarks[sorted_indices]

        return det, landmarks

# TODO: below is only for testing, remove it later
def draw_bbox(frame, bbox, color=(0, 255, 0), thickness=2):
    x1, y1, x2, y2 = bbox[:4].astype(np.int32)
    cv2.rectangle(frame, (x1, y1), (x2, y2), color, thickness)
@@ -267,6 +267,8 @@ def draw_keypoints(frame, points, color=(0, 0, 255), radius=2):
        cv2.circle(frame, (x, y), radius, color, -1)

+# TODO: Remove late, just for testing
if __name__ == "__main__":
    detector = SCRFD(model_name=SCRFDWeights.SCRFD_500M_KPS)
    cap = cv2.VideoCapture(0)

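A hedged sketch of how the __main__ webcam loop hinted at above might continue, using the SCRFD class and the draw_bbox/draw_keypoints helpers from this file; the detector's inference method name and its return format are assumptions:

import cv2
from uniface.constants import SCRFDWeights

detector = SCRFD(model_name=SCRFDWeights.SCRFD_500M_KPS)
cap = cv2.VideoCapture(0)
while True:
    ret, frame = cap.read()
    if not ret:
        break
    detections, keypoints = detector.detect(frame)  # assumed method name and return values
    for det, kps in zip(detections, keypoints):
        draw_bbox(frame, det)
        draw_keypoints(frame, kps)
    cv2.imshow("SCRFD", frame)
    if cv2.waitKey(1) & 0xFF == ord("q"):
        break
cap.release()
cv2.destroyAllWindows()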
View File

@@ -6,131 +6,163 @@ import numpy as np
from typing import Tuple
from uniface.log import Logger
-from uniface.face_utils import bbox_center_alignment, transform_points_2d
+from uniface.constants import LandmarkWeights
from uniface.model_store import verify_model_weights
-from uniface.detection import RetinaFace
-from uniface.constants import RetinaFaceWeights, LandmarkWeights
+from uniface.face_utils import bbox_center_alignment, transform_points_2d

__all__ = ['Landmark']

class Landmark:
-    def __init__(self, model_name: LandmarkWeights = LandmarkWeights.DEFAULT, input_size: Tuple[int, int] = (192, 192)) -> None:
+    """
+    Facial landmark detection model for predicting facial keypoints.
+    """
+
+    def __init__(
+        self,
+        model_name: LandmarkWeights = LandmarkWeights.DEFAULT,
+        input_size: Tuple[int, int] = (192, 192)
+    ) -> None:
        """
        Initializes the Facial Landmark model for inference.
        Args:
-            model_path (str): Path to the ONNX file.
+            model_name: Enum specifying which landmark model weights to use
+            input_size: Input resolution for the model (width, height)
        """
        Logger.info(
            f"Initializing Facial Landmark with model={model_name}, "
            f"input_size={input_size}"
        )
+        # Initialize configuration
        self.input_size = input_size
        self.input_std = 1.0
        self.input_mean = 0.0

        # Get path to model weights
-        self._model_path = verify_model_weights(model_name)
-        Logger.info(f"Verfied model weights located at: {self._model_path}")
+        self.model_path = verify_model_weights(model_name)
+        Logger.info(f"Verified model weights located at: {self.model_path}")

        # Initialize model
-        self._initialize_model(model_path=self._model_path)
+        self._initialize_model()

-    def _initialize_model(self, model_path: str):
-        """ Initialize the model from the given path.
-        Args:
-            model_path (str): Path to .onnx model.
+    def _initialize_model(self):
+        """
+        Initialize the ONNX model from the stored model path.
+        Raises:
+            RuntimeError: If the model fails to load or initialize.
        """
        try:
            self.session = ort.InferenceSession(
-                model_path,
+                self.model_path,
                providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
            )
-            metadata = self.session.get_inputs()[0]
-            input_shape = metadata.shape
-            self.input_size = tuple(input_shape[2:4][::-1])
-            self.input_names = [x.name for x in self.session.get_inputs()]
-            self.output_names = [x.name for x in self.session.get_outputs()]
-            outputs = self.session.get_outputs()
-            output_shape = outputs[0].shape
-            self.lmk_dim = 2
-            self.lmk_num = output_shape[1] // self.lmk_dim
+            # Get input configuration
+            input_metadata = self.session.get_inputs()[0]
+            input_shape = input_metadata.shape
+            self.input_size = tuple(input_shape[2:4][::-1])  # Update input size from model
+            # Get input/output names
+            self.input_names = [input.name for input in self.session.get_inputs()]
+            self.output_names = [output.name for output in self.session.get_outputs()]
+            # Determine landmark dimensions from output shape
+            output_shape = self.session.get_outputs()[0].shape
+            self.lmk_dim = 2  # x,y coordinates
+            self.lmk_num = output_shape[1] // self.lmk_dim  # Number of landmarks
+            Logger.info(f"Model initialized with {self.lmk_num} landmarks")
        except Exception as e:
-            print(f"Failed to load the model: {e}")
-            raise
+            Logger.error(f"Failed to load landmark model from '{self.model_path}'", exc_info=True)
+            raise RuntimeError(f"Failed to initialize landmark model: {e}")

    def preprocess(self, image: np.ndarray, bbox: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """
-        Preprocess the input image and bbox for inference.
+        Preprocess the input image and bounding box for inference.
        Args:
-            image (np.ndarray): Input image.
-            bbox (np.ndarray): Bounding box [x1, y1, x2, y2].
+            image: Input image in BGR format
+            bbox: Bounding box coordinates [x1, y1, x2, y2]
        Returns:
-            Tuple[np.ndarray, np.ndarray]: Preprocessed blob and transformation matrix.
+            Tuple containing:
+                - Preprocessed image blob ready for inference
+                - Transformation matrix for mapping predictions back to original image
        """
+        # Calculate face dimensions and center
        width, height = bbox[2] - bbox[0], bbox[3] - bbox[1]
        center = (bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2
+        # Determine scale to fit face with some margin
        scale = self.input_size[0] / (max(width, height) * 1.5)
        rotation = 0.0

-        transformed_image, M = bbox_center_alignment(image, center, self.input_size[0], scale, rotation)
-        input_size = tuple(transformed_image.shape[0:2][::-1])
-        blob = cv2.dnn.blobFromImage(
-            transformed_image,
-            1.0/self.input_std,
-            input_size,
-            (self.input_mean, self.input_mean, self.input_mean),
-            swapRB=True
-        )
-        return blob, M
+        # Align face using center, scale and rotation
+        aligned_face, transform_matrix = bbox_center_alignment(
+            image, center, self.input_size[0], scale, rotation
+        )
+        # Convert to blob format for inference
+        face_blob = cv2.dnn.blobFromImage(
+            aligned_face,
+            1.0 / self.input_std,
+            self.input_size,
+            (self.input_mean, self.input_mean, self.input_mean),
+            swapRB=True  # Convert BGR to RGB
+        )
+        return face_blob, transform_matrix

-    def postprocess(self, predictions: np.ndarray, M: np.ndarray) -> np.ndarray:
+    def postprocess(self, predictions: np.ndarray, transform_matrix: np.ndarray) -> np.ndarray:
        """
-        Postprocess model outputs to get landmarks.
+        Convert raw model predictions to image coordinates.
        Args:
-            predictions (np.ndarray): Raw model predictions.
-            M (np.ndarray): Affine transformation matrix.
+            predictions: Raw landmark coordinates from model output
+            transform_matrix: Affine transformation matrix from preprocessing
        Returns:
-            np.ndarray: Transformed landmarks.
+            Landmarks in original image coordinates
        """
-        predictions = predictions.reshape((-1, 2))
-        predictions[:, 0:2] += 1
-        predictions[:, 0:2] *= (self.input_size[0] // 2)
-        IM = cv2.invertAffineTransform(M)
-        predictions = transform_points_2d(predictions, IM)
-        return predictions
+        # Reshape to pairs of x,y coordinates
+        landmarks = predictions.reshape((-1, 2))
+        # Denormalize coordinates to pixel space
+        landmarks[:, 0:2] += 1  # Shift from [-1,1] to [0,2] range
+        landmarks[:, 0:2] *= (self.input_size[0] // 2)  # Scale to pixel coordinates
+        # Invert the transformation to map back to original image
+        inverse_matrix = cv2.invertAffineTransform(transform_matrix)
+        landmarks = transform_points_2d(landmarks, inverse_matrix)
+        return landmarks

    def predict(self, image: np.ndarray, bbox: np.ndarray) -> np.ndarray:
        """
-        Predict facial landmarks for the given image and bounding box.
+        Predict facial landmarks for the given image and face bounding box.
        Args:
-            image (np.ndarray): Input image.
-            bbox (np.ndarray): Bounding box [x1, y1, x2, y2].
+            image: Input image in BGR format
+            bbox: Face bounding box [x1, y1, x2, y2]
        Returns:
-            np.ndarray: Predicted landmarks.
+            Array of facial landmarks in original image coordinates
        """
-        blob, M = self.preprocess(image, bbox)
-        preds = self.session.run(self.output_names, {self.input_names[0]: blob})[0][0]
-        landmarks = self.postprocess(preds, M)
+        # Preprocess image
+        face_blob, transform_matrix = self.preprocess(image, bbox)
+        # Run inference
+        raw_predictions = self.session.run(
+            self.output_names,
+            {self.input_names[0]: face_blob}
+        )[0][0]
+        # Postprocess to get landmarks in original image space
+        landmarks = self.postprocess(raw_predictions, transform_matrix)
        return landmarks
@@ -138,7 +170,9 @@ class Landmark:
if __name__ == "__main__":
+    from uniface.detection import RetinaFace
+    from uniface.constants import RetinaFaceWeights

    face_detector = RetinaFace(
        model_name=RetinaFaceWeights.MNET_V2,
        conf_thresh=0.5,

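A minimal sketch of the updated Landmark flow. The Landmark import path and the detector call returning bounding boxes are assumptions, not part of this diff:

import cv2
from uniface.detection import RetinaFace
from uniface.constants import RetinaFaceWeights, LandmarkWeights
from uniface.landmark import Landmark  # module path assumed

detector = RetinaFace(model_name=RetinaFaceWeights.MNET_V2, conf_thresh=0.5)
landmark_model = Landmark(model_name=LandmarkWeights.DEFAULT)

image = cv2.imread("face.jpg")     # BGR image
boxes, _ = detector.detect(image)  # assumed detection signature
for box in boxes:
    points = landmark_model.predict(image, box)  # (N, 2) points in original image coordinates
    for x, y in points.astype(int):
        cv2.circle(image, (x, y), 1, (0, 255, 0), -1)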
View File

@@ -7,7 +7,6 @@ import os
import cv2
import numpy as np
import onnxruntime as ort

from typing import Tuple, Optional, Union, List
from dataclasses import dataclass
@@ -37,100 +36,99 @@ class BaseFaceEncoder:
    def __init__(
        self,
-        model_name: SphereFaceWeights | MobileFaceWeights | ArcFaceWeights = MobileFaceWeights.MNET_V2,
+        model_name: Union[SphereFaceWeights, MobileFaceWeights, ArcFaceWeights] = MobileFaceWeights.MNET_V2,
        preprocessing: PreprocessConfig = PreprocessConfig(),
    ) -> None:
        """
        Initializes the FaceEncoder model for inference.

        Args:
-            model_name (SphereFaceWeights | MobileFaceWeights | ArcFaceWeights): Selected model weight enum.
-            preprocessing (PreprocessConfig): Configuration for input normalization and resizing.
+            model_name: Selected model weight enum.
+            preprocessing: Configuration for input normalization and resizing.
        """
+        # Store preprocessing parameters
        self.input_mean = preprocessing.input_mean
        self.input_std = preprocessing.input_std
        self.input_size = preprocessing.input_size

        Logger.info(
            f"Initializing Face Recognition with model={model_name}, "
-            f"input_mean={self.input_mean}, input_std={self.input_std}, input_size={self.input_size}"
+            f"input_mean={self.input_mean}, input_std={self.input_std}, "
+            f"input_size={self.input_size}"
        )

-        # Get path to model weights
-        self._model_path = verify_model_weights(model_name)
-        Logger.info(f"Verfied model weights located at: {self._model_path}")
+        # Get path to model weights and initialize model
+        self.model_path = verify_model_weights(model_name)
+        Logger.info(f"Verified model weights located at: {self.model_path}")
+        self._initialize_model()

-        # Initialize model
-        self._initialize_model(self._model_path)

-    def _initialize_model(self, model_path: str) -> None:
+    def _initialize_model(self) -> None:
        """
        Loads the ONNX model and prepares it for inference.
-        Args:
-            model_path (str): Path to the ONNX model file.
        Raises:
            RuntimeError: If the model fails to load or initialize.
        """
        try:
+            # Initialize model session with available providers
            self.session = ort.InferenceSession(
-                model_path,
+                self.model_path,
                providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
            )
-            self._setup_model()
-            Logger.info(f"Successfully initialized face encoder from {model_path}")
+            # Extract input configuration
+            input_cfg = self.session.get_inputs()[0]
+            self.input_name = input_cfg.name
+            # Verify input dimensions match our configuration
+            input_shape = input_cfg.shape
+            model_input_size = tuple(input_shape[2:4][::-1])  # (width, height)
+            if model_input_size != self.input_size:
+                Logger.warning(f"Model input size {model_input_size} differs from configured size {self.input_size}")
+            # Extract output configuration
+            self.output_names = [output.name for output in self.session.get_outputs()]
+            self.output_shape = self.session.get_outputs()[0].shape
+            assert len(self.output_names) == 1, "Expected only one output node."
+            Logger.info(f"Successfully initialized face encoder from {self.model_path}")
        except Exception as e:
-            Logger.error(f"Failed to load face encoder model from '{model_path}'", exc_info=True)
-            raise RuntimeError(f"Failed to initialize model session for '{model_path}'") from e
+            Logger.error(f"Failed to load face encoder model from '{self.model_path}'", exc_info=True)
+            raise RuntimeError(f"Failed to initialize model session for '{self.model_path}'") from e

-    def _setup_model(self) -> None:
-        """
-        Extracts input/output configuration from the ONNX model session.
-        """
-        input_cfg = self.session.get_inputs()[0]
-        input_shape = input_cfg.shape
-        model_input_size = tuple(input_shape[2:4][::-1])  # (width, height)
-        if model_input_size != self.input_size:
-            Logger.warning(f"Model input size {model_input_size} differs from configured size {self.input_size}")
-        self.input_name = input_cfg.name
-        self.output_names = [output.name for output in self.session.get_outputs()]
-        self.output_shape = self.session.get_outputs()[0].shape
-        assert len(self.output_names) == 1, "Expected only one output node."

-    def preprocess(self, image: np.ndarray) -> np.ndarray:
+    def preprocess(self, face_img: np.ndarray) -> np.ndarray:
        """
        Preprocess the image: resize, normalize, and convert it to a blob.
        Args:
-            image (np.ndarray): Input image in BGR format.
+            face_img: Input image in BGR format.
        Returns:
-            np.ndarray: Preprocessed image as a NumPy array ready for inference.
+            Preprocessed image as a NumPy array ready for inference.
        """
-        image = cv2.resize(image, self.input_size)  # Resize to (112, 112)
+        resized_img = cv2.resize(face_img, self.input_size)

        if isinstance(self.input_std, (list, tuple)):
-            # if self.input_std is a list, we assume it's per-channel std
-            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
-            image -= np.array(self.input_mean, dtype=np.float32)
-            image /= np.array(self.input_std, dtype=np.float32)
+            # Per-channel normalization
+            rgb_img = cv2.cvtColor(resized_img, cv2.COLOR_BGR2RGB).astype(np.float32)
+            normalized_img = (rgb_img - np.array(self.input_mean, dtype=np.float32)) / \
+                np.array(self.input_std, dtype=np.float32)
            # Change to NCHW (batch, channels, height, width)
-            blob = np.transpose(image, (2, 0, 1))  # CHW
+            blob = np.transpose(normalized_img, (2, 0, 1))  # CHW
            blob = np.expand_dims(blob, axis=0)  # NCHW
        else:
-            # cv2.dnn.blobFromImage does not support per-channel std so we use a single value here
+            # Single-value normalization
            blob = cv2.dnn.blobFromImage(
-                image,
+                resized_img,
                scalefactor=1.0 / self.input_std,
                size=self.input_size,
                mean=(self.input_mean, self.input_mean, self.input_mean),
                swapRB=True  # Convert BGR to RGB
            )
        return blob

    def get_embedding(self, image: np.ndarray, landmarks: np.ndarray) -> np.ndarray:
@@ -138,13 +136,17 @@ class BaseFaceEncoder:
        Extracts face embedding from an aligned image.
        Args:
-            image (np.ndarray): Input face image (BGR format).
-            landmarks (np.ndarray): Facial landmarks (5 points for alignment).
+            image: Input face image (BGR format).
+            landmarks: Facial landmarks (5 points for alignment).
        Returns:
-            np.ndarray: 512-dimensional face embedding.
+            Face embedding vector (typically 512-dimensional).
        """
-        aligned_face, _ = face_alignment(image, landmarks)  # Use your function for alignment
-        blob = self.preprocess(aligned_face)  # Convert to blob
-        embedding = self.session.run(self.output_names, {self.input_name: blob})[0]
-        return embedding  # Return the 512-D feature vector
+        # Align face using landmarks
+        aligned_face, _ = face_alignment(image, landmarks)

+        # Generate embedding from aligned face
+        face_blob = self.preprocess(aligned_face)
+        embedding = self.session.run(self.output_names, {self.input_name: face_blob})[0]
+        return embedding
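
Embeddings returned by get_embedding can be compared with cosine similarity; a small sketch assuming two aligned BGR face crops and their five-point landmarks are already available as placeholders, with the encoder import path assumed:

import numpy as np
from uniface.constants import MobileFaceWeights
from uniface.recognition import BaseFaceEncoder  # module path assumed

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    # Flatten and compare the two embedding vectors
    a, b = a.ravel(), b.ravel()
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

encoder = BaseFaceEncoder(model_name=MobileFaceWeights.MNET_V2)
# image_a/image_b are BGR face images, lmk_a/lmk_b their five-point landmarks (placeholders)
emb_a = encoder.get_embedding(image_a, lmk_a)
emb_b = encoder.get_embedding(image_b, lmk_b)
print(f"cosine similarity = {cosine_similarity(emb_a, emb_b):.4f}")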