Mirror of https://github.com/yakhyo/uniface.git (synced 2025-12-30 09:02:25 +00:00)

Commit: Update some modules and remove redundant parts
@@ -18,7 +18,6 @@ def extract_reference_embedding(detector, recognizer, image_path):
raise RuntimeError("No faces found in reference image.")

embedding = recognizer.get_embedding(image, landmarks[0])
print(f"Reference embedding extracted (L2 norm = {np.linalg.norm(embedding):.4f})")
return embedding
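A minimal sketch (not part of this commit) of how a reference embedding like the one extracted above could be compared against a probe embedding; the cosine_similarity helper below is illustrative only:

# Illustrative only -- assumes two embeddings produced by
# recognizer.get_embedding(), as in extract_reference_embedding() above.
import numpy as np

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    # Flatten in case the embeddings carry a leading batch dimension.
    a, b = a.ravel(), b.ravel()
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# similarity = cosine_similarity(reference_embedding, probe_embedding)
# print(f"Cosine similarity: {similarity:.4f}")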
@@ -4,12 +4,10 @@ import onnxruntime as ort
from typing import Tuple

from uniface.log import Logger
from uniface.constants import AgeGenderWeights
from uniface.face_utils import bbox_center_alignment
from uniface.model_store import verify_model_weights
from uniface.constants import AgeGenderWeights

from uniface.detection import RetinaFace
from uniface.constants import RetinaFaceWeights

__all__ = ["AgeGender"]
@@ -17,109 +15,156 @@ __all__ = ["AgeGender"]
class AgeGender:
"""
Age and Gender Prediction Model.

This model predicts both a person's gender (male/female) and age from a facial image.
Gender is returned as an integer (0: female, 1: male) and age as years.
"""

def __init__(self, model_name: AgeGenderWeights = AgeGenderWeights.DEFAULT, input_size: Tuple[int, int] = (112, 112)) -> None:
def __init__(
self,
model_name: AgeGenderWeights = AgeGenderWeights.DEFAULT,
input_size: Tuple[int, int] = (112, 112)
) -> None:
"""
Initializes the Attribute model for inference.
Initializes the Age and Gender prediction model.

Args:
model_path (str): Path to the ONNX file.
model_name: Model weights enum to use
input_size: Input resolution for the model (width, height)
"""

Logger.info(
f"Initializing AgeGender with model={model_name}, "
f"input_size={input_size}"
)

# Model configuration
self.input_size = input_size
self.input_std = 1.0
self.input_mean = 0.0

# Get path to model weights
self._model_path = verify_model_weights(model_name)
Logger.info(f"Verfied model weights located at: {self._model_path}")
self.model_path = verify_model_weights(model_name)
Logger.info(f"Verified model weights located at: {self.model_path}")

# Initialize model
self._initialize_model(model_path=self._model_path)
self._initialize_model()

def _initialize_model(self, model_path: str):
"""Initialize the model from the given path.
def _initialize_model(self):
"""
Initialize the ONNX model for inference.

Args:
model_path (str): Path to .onnx model.
Raises:
RuntimeError: If the model fails to load or initialize.
"""
try:
# Initialize session with available providers
self.session = ort.InferenceSession(
model_path,
self.model_path,
providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
)

# Get model info
metadata = self.session.get_inputs()[0]
input_shape = metadata.shape
self.input_size = tuple(input_shape[2:4][::-1])
# Extract model metadata
input_metadata = self.session.get_inputs()[0]
input_shape = input_metadata.shape
self.input_size = tuple(input_shape[2:4][::-1])  # Update from model (width, height)

self.input_names = [x.name for x in self.session.get_inputs()]
self.output_names = [x.name for x in self.session.get_outputs()]
# Get input/output names
self.input_names = [input.name for input in self.session.get_inputs()]
self.output_names = [output.name for output in self.session.get_outputs()]

Logger.info(f"Successfully initialized AgeGender model")

except Exception as e:
print(f"Failed to load the model: {e}")
raise
Logger.error(f"Failed to load AgeGender model from '{self.model_path}'", exc_info=True)
raise RuntimeError(f"Failed to initialize AgeGender model: {e}")

def preprocess(self, image: np.ndarray, bbox: np.ndarray):
"""Preprocessing
def preprocess(self, image: np.ndarray, bbox: np.ndarray) -> np.ndarray:
"""
Preprocess the input image and face bounding box for inference.

Args:
image (np.ndarray): Numpy image
bbox (np.ndarray): Bounding box coordinates: [x1, y1, x2, y2]
image: Input image in BGR format
bbox: Face bounding box coordinates [x1, y1, x2, y2]

Returns:
np.ndarray: Transformed image
Preprocessed image blob ready for inference
"""
# Calculate face dimensions and center
width, height = bbox[2] - bbox[0], bbox[3] - bbox[1]
center = (bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2

# Determine scale to fit face with margin
scale = self.input_size[0] / (max(width, height) * 1.5)
rotation = 0.0

transformed_image, M = bbox_center_alignment(image, center, self.input_size[0], scale, rotation)

input_size = tuple(transformed_image.shape[0:2][::-1])

blob = cv2.dnn.blobFromImage(
transformed_image,
1.0/self.input_std,
input_size,
(self.input_mean, self.input_mean, self.input_mean),
swapRB=True
# Align face based on bounding box
aligned_face, _ = bbox_center_alignment(
image, center, self.input_size[0], scale, rotation
)
return blob

def postprocess(self, predictions: np.ndarray) -> Tuple[np.int64, int]:
"""Postprocessing
# Convert to blob format for network input
face_blob = cv2.dnn.blobFromImage(
aligned_face,
1.0 / self.input_std,
self.input_size,
(self.input_mean, self.input_mean, self.input_mean),
swapRB=True  # Convert BGR to RGB
)

return face_blob

def postprocess(self, predictions: np.ndarray) -> Tuple[int, int]:
"""
Process model predictions to extract gender and age.

Args:
predictions (np.ndarray): Model predictions, shape: [1, 3]
predictions: Raw model output, shape [1, 3] where:
- First two elements represent gender logits
- Third element represents normalized age

Returns:
Tuple[np.int64, int]: Gender and Age values
Tuple containing:
- Gender (0: female, 1: male)
- Age in years
"""
gender = np.argmax(predictions[:2])
# First two values are gender logits (female/male)
gender = int(np.argmax(predictions[:2]))

# Third value is normalized age that needs scaling
age = int(np.round(predictions[2] * 100))
return gender, age

def predict(self, image: np.ndarray, bbox: np.ndarray) -> Tuple[np.int64, int]:
blob = self.preprocess(image, bbox)
predictions = self.session.run(self.output_names, {self.input_names[0]: blob})[0][0]
gender, age = self.postprocess(predictions)

return gender, age

def predict(self, image: np.ndarray, bbox: np.ndarray) -> Tuple[int, int]:
"""
Predict age and gender for a face in the image.

Args:
image: Input image in BGR format
bbox: Face bounding box [x1, y1, x2, y2]

Returns:
- 'gender_id': Gender as integer (0: female, 1: male)
- 'age': Age in years
"""
# Preprocess and run inference
face_blob = self.preprocess(image, bbox)
predictions = self.session.run(
self.output_names,
{self.input_names[0]: face_blob}
)[0][0]

# Extract gender and age from predictions
gender_id, age = self.postprocess(predictions)

return gender_id, age

# TODO: For testing purposes only, remove later

def main():
from uniface.detection import RetinaFace
from uniface.constants import RetinaFaceWeights

face_detector = RetinaFace(
model_name=RetinaFaceWeights.MNET_V2,
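A minimal usage sketch (not part of this commit) for the updated AgeGender API above; the AgeGender import path and the detector's detect() call are assumptions, not confirmed by this diff:

# Illustrative only -- import path for AgeGender and detect() signature assumed.
import cv2

from uniface.detection import RetinaFace
from uniface.constants import RetinaFaceWeights, AgeGenderWeights
from uniface.attribute import AgeGender  # hypothetical import path

detector = RetinaFace(model_name=RetinaFaceWeights.MNET_V2)
age_gender = AgeGender(model_name=AgeGenderWeights.DEFAULT)

image = cv2.imread("face.jpg")
boxes, landmarks = detector.detect(image)  # assumed detector API

for box in boxes:
    # predict() returns (gender_id, age) as shown in the diff above.
    gender_id, age = age_gender.predict(image, box[:4])
    print("male" if gender_id == 1 else "female", age)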
@@ -10,10 +10,9 @@ from PIL import Image
from typing import Tuple, Union

from uniface.log import Logger
from uniface import RetinaFace
from uniface.constants import DDAMFNWeights
from uniface.face_utils import face_alignment
from uniface.model_store import verify_model_weights
from uniface.constants import RetinaFaceWeights, DDAMFNWeights


class Emotion:
@@ -21,10 +20,11 @@ class Emotion:
Emotion recognition using a TorchScript model.

Args:
model_name (DDAMFNWeights): Pretrained model enum. Defaults to AFFECNET7.
model_weights (DDAMFNWeights): Pretrained model weights enum. Defaults to AFFECNET7.
input_size (Tuple[int, int]): Size of input images. Defaults to (112, 112).

Attributes:
emotions (List[str]): Emotion label list.
emotion_labels (List[str]): List of emotion labels the model can predict.
device (torch.device): Inference device (CPU or CUDA).
model (torch.jit.ScriptModule): Loaded TorchScript model.
@@ -33,122 +33,133 @@ class Emotion:
RuntimeError: If model loading fails.
"""

def __init__(self, model_name: DDAMFNWeights = DDAMFNWeights.AFFECNET7, input_size: Tuple[int, int] = (112, 112)) -> None:
def __init__(
self,
model_weights: DDAMFNWeights = DDAMFNWeights.AFFECNET7,
input_size: Tuple[int, int] = (112, 112)
) -> None:
"""
Initialize the emotion detector with a TorchScript model
"""

self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

self.emotions = [
self.emotion_labels = [
"Neutral", "Happy", "Sad", "Surprise", "Fear", "Disgust", "Angry"
]
if model_name == DDAMFNWeights.AFFECNET8:
self.emotions.append("Contempt")

# Add contempt for AFFECNET8 model
if model_weights == DDAMFNWeights.AFFECNET8:
self.emotion_labels.append("Contempt")

# Initialize image preprocessing parameters
self.input_size = input_size
self.input_std = [0.229, 0.224, 0.225]
self.input_mean = [0.485, 0.456, 0.406]
self.normalization_std = [0.229, 0.224, 0.225]
self.normalization_mean = [0.485, 0.456, 0.406]

Logger.info(
f"Initialized Emotion class with model={model_name.name}, "
f"Initialized Emotion class with model={model_weights.name}, "
f"device={'cuda' if torch.cuda.is_available() else 'cpu'}, "
f"num_classes={len(self.emotions)}, input_size={self.input_size}"
f"num_classes={len(self.emotion_labels)}, input_size={self.input_size}"
)

# Get path to model weights
self._model_path = verify_model_weights(model_name)
Logger.info(f"Verified model weights located at: {self._model_path}")
# Get path to model weights and initialize model
self.model_path = verify_model_weights(model_weights)
Logger.info(f"Verified model weights located at: {self.model_path}")
self._load_model()

# Initialize model
self._initialize_model(model_path=self._model_path)

def _initialize_model(self, model_path: str) -> None:
def _load_model(self) -> None:
"""
Initializes a TorchScript model for emotion inference.
Loads and initializes a TorchScript model for emotion inference.

Args:
model_path (str): Path to the TorchScript (.pt) model.
Raises:
RuntimeError: If loading the model fails.
"""
try:
self.model = torch.jit.load(model_path, map_location=self.device)
self.model = torch.jit.load(self.model_path, map_location=self.device)
self.model.eval()
Logger.info(f"TorchScript model successfully loaded from: {model_path}")
Logger.info(f"TorchScript model successfully loaded from: {self.model_path}")

# Warm-up
dummy = torch.randn(1, 3, 112, 112).to(self.device)
# Warm-up with dummy input
dummy_input = torch.randn(1, 3, *self.input_size).to(self.device)
with torch.no_grad():
_ = self.model(dummy)
_ = self.model(dummy_input)
Logger.info("Emotion model warmed up with dummy input.")

except Exception as e:
Logger.error(f"Failed to load TorchScript model from {model_path}: {e}")
raise
Logger.error(f"Failed to load TorchScript model from {self.model_path}: {e}")
raise RuntimeError(f"Model loading failed: {str(e)}")

def preprocess(self, image: np.ndarray) -> torch.Tensor:
"""
Resize, normalize and convert image to tensor manually without torchvision.
Preprocess image for model inference: resize, normalize and convert to tensor.

Args:
image (np.ndarray): BGR image (H, W, 3)
Returns:
torch.Tensor: Preprocessed image tensor of shape (1, 3, 112, 112)
"""
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # BGR -> RGB

# Resize to (112, 112)
image = cv2.resize(image, self.input_size).astype(np.float32) / 255.0
Returns:
torch.Tensor: Preprocessed image tensor of shape (1, 3, H, W)
"""
# Convert BGR to RGB
rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# Resize to target input size
resized_image = cv2.resize(rgb_image, self.input_size).astype(np.float32) / 255.0

# Normalize with mean and std
mean = np.array(self.input_mean, dtype=np.float32)
std = np.array(self.input_std, dtype=np.float32)
image_normalized = (image - mean) / std
mean_array = np.array(self.normalization_mean, dtype=np.float32)
std_array = np.array(self.normalization_std, dtype=np.float32)
normalized_image = (resized_image - mean_array) / std_array

# HWC to CHW
image_transposed = image_normalized.transpose((2, 0, 1))
# Convert from HWC to CHW format
transposed_image = normalized_image.transpose((2, 0, 1))

# Convert to torch tensor and add batch dimension
tensor = torch.from_numpy(image_transposed).unsqueeze(0).to(self.device)

tensor = torch.from_numpy(transposed_image).unsqueeze(0).to(self.device)
return tensor

def predict(self, image: np.ndarray, landmark: np.ndarray) -> Tuple[Union[str, None], Union[float, None]]:
"""
Predict the emotion from an BGR face image.
Predict the emotion from a face image.

Args:
image (np.ndarray): Input face image in RGB format.
image (np.ndarray): Input face image in BGR format.
landmark (np.ndarray): Facial five point landmark.

Returns:
Tuple[str, float]: (Predicted emotion label, Confidence score)
Returns (None, None) if prediction fails.

Raises:
RuntimeError: If the input is invalid or inference fails internally.
ValueError: If the input is not a valid BGR image.
"""
# Validate input
if not isinstance(image, np.ndarray):
Logger.error("Input must be a NumPy ndarray.")
raise ValueError("Input must be a NumPy ndarray (RGB image).")
raise ValueError("Input must be a NumPy ndarray (BGR image).")

if image.ndim != 3 or image.shape[2] != 3:
Logger.error(f"Invalid image shape: {image.shape}. Expected HxWx3 RGB image.")
raise ValueError("Input image must be in RGB format with shape (H, W, 3).")
Logger.error(f"Invalid image shape: {image.shape}. Expected HxWx3 image.")
raise ValueError("Input image must have shape (H, W, 3).")

try:
image, _ = face_alignment(image, landmark)
tensor = self.preprocess(image)
# Align face using landmarks
aligned_image, _ = face_alignment(image, landmark)

# Preprocess and run inference
input_tensor = self.preprocess(aligned_image)

with torch.no_grad():
output = self.model(tensor)
output = self.model(input_tensor)

# Handle case where model returns a tuple
if isinstance(output, tuple):
output = output[0]

probs = torch.nn.functional.softmax(output, dim=1).squeeze(0).cpu().numpy()
pred_idx = int(np.argmax(probs))
confidence = round(float(probs[pred_idx]), 2)
# Get probabilities and prediction
probabilities = torch.nn.functional.softmax(output, dim=1).squeeze(0).cpu().numpy()
predicted_index = int(np.argmax(probabilities))
confidence_score = round(float(probabilities[predicted_index]), 2)

return self.emotions[pred_idx], confidence
return self.emotion_labels[predicted_index], confidence_score

except Exception as e:
Logger.error(f"Emotion inference failed: {e}")
@@ -158,6 +169,8 @@ class Emotion:
# TODO: For testing purposes only, remove later

def main():
from uniface import RetinaFace
from uniface.constants import RetinaFaceWeights

face_detector = RetinaFace(
model_name=RetinaFaceWeights.MNET_V2,
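A minimal usage sketch (not part of this commit) for the updated Emotion API above; the Emotion import path and the detector's detect() call are assumptions:

# Illustrative only -- import path for Emotion and detect() signature assumed.
import cv2

from uniface import RetinaFace
from uniface.constants import RetinaFaceWeights, DDAMFNWeights
from uniface.emotion import Emotion  # hypothetical import path

detector = RetinaFace(model_name=RetinaFaceWeights.MNET_V2)
emotion = Emotion(model_weights=DDAMFNWeights.AFFECNET7)

image = cv2.imread("face.jpg")
boxes, landmarks = detector.detect(image)  # assumed detector API

if len(boxes) > 0:
    # predict() expects the BGR image and a 5-point landmark, per the diff above.
    label, confidence = emotion.predict(image, landmarks[0])
    print(label, confidence)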
@@ -11,8 +11,9 @@ import onnxruntime as ort
from typing import Tuple, List, Literal

from uniface.log import Logger
from uniface.model_store import verify_model_weights
from uniface.constants import SCRFDWeights
from uniface.model_store import verify_model_weights

from .utils import non_max_supression, distance2bbox, distance2kps, resize_image

__all__ = ['SCRFD']
@@ -249,12 +250,11 @@ class SCRFD:
det = det[sorted_indices]
landmarks = landmarks[sorted_indices]

return det, landmarks

# TODO: below is only for testing, remove it later

def draw_bbox(frame, bbox, color=(0, 255, 0), thickness=2):
x1, y1, x2, y2 = bbox[:4].astype(np.int32)
cv2.rectangle(frame, (x1, y1), (x2, y2), color, thickness)
@@ -267,6 +267,8 @@ def draw_keypoints(frame, points, color=(0, 0, 255), radius=2):
cv2.circle(frame, (x, y), radius, color, -1)

# TODO: Remove late, just for testing

if __name__ == "__main__":
detector = SCRFD(model_name=SCRFDWeights.SCRFD_500M_KPS)
cap = cv2.VideoCapture(0)
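A sketch (not part of this commit) of how the webcam block above could continue, using the draw helpers from the same file; the detector.detect() call is an assumption about the SCRFD API:

# Illustrative continuation of the __main__ block above; detect() is assumed.
while True:
    ret, frame = cap.read()
    if not ret:
        break
    boxes, keypoints = detector.detect(frame)  # assumed SCRFD API
    for box, points in zip(boxes, keypoints):
        draw_bbox(frame, box)
        draw_keypoints(frame, points)
    cv2.imshow("SCRFD", frame)
    if cv2.waitKey(1) & 0xFF == ord("q"):
        break
cap.release()
cv2.destroyAllWindows()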
@@ -6,131 +6,163 @@ import numpy as np
from typing import Tuple

from uniface.log import Logger
from uniface.face_utils import bbox_center_alignment, transform_points_2d
from uniface.constants import LandmarkWeights
from uniface.model_store import verify_model_weights

from uniface.detection import RetinaFace
from uniface.constants import RetinaFaceWeights, LandmarkWeights
from uniface.face_utils import bbox_center_alignment, transform_points_2d

__all__ = ['Landmark']


class Landmark:
def __init__(self, model_name: LandmarkWeights = LandmarkWeights.DEFAULT, input_size: Tuple[int, int] = (192, 192)) -> None:
"""
Facial landmark detection model for predicting facial keypoints.
"""

def __init__(
self,
model_name: LandmarkWeights = LandmarkWeights.DEFAULT,
input_size: Tuple[int, int] = (192, 192)
) -> None:
"""
Initializes the Facial Landmark model for inference.

Args:
model_path (str): Path to the ONNX file.
model_name: Enum specifying which landmark model weights to use
input_size: Input resolution for the model (width, height)
"""

Logger.info(
f"Initializing Facial Landmark with model={model_name}, "
f"input_size={input_size}"
)

# Initialize configuration
self.input_size = input_size
self.input_std = 1.0
self.input_mean = 0.0

# Get path to model weights
self._model_path = verify_model_weights(model_name)
Logger.info(f"Verfied model weights located at: {self._model_path}")
self.model_path = verify_model_weights(model_name)
Logger.info(f"Verified model weights located at: {self.model_path}")

# Initialize model
self._initialize_model(model_path=self._model_path)
self._initialize_model()

def _initialize_model(self, model_path: str):
""" Initialize the model from the given path.
Args:
model_path (str): Path to .onnx model.
def _initialize_model(self):
"""
Initialize the ONNX model from the stored model path.

Raises:
RuntimeError: If the model fails to load or initialize.
"""
try:
self.session = ort.InferenceSession(
model_path,
self.model_path,
providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
)

metadata = self.session.get_inputs()[0]
input_shape = metadata.shape
self.input_size = tuple(input_shape[2:4][::-1])
# Get input configuration
input_metadata = self.session.get_inputs()[0]
input_shape = input_metadata.shape
self.input_size = tuple(input_shape[2:4][::-1])  # Update input size from model

self.input_names = [x.name for x in self.session.get_inputs()]
self.output_names = [x.name for x in self.session.get_outputs()]
# Get input/output names
self.input_names = [input.name for input in self.session.get_inputs()]
self.output_names = [output.name for output in self.session.get_outputs()]

outputs = self.session.get_outputs()
output_shape = outputs[0].shape
self.lmk_dim = 2
self.lmk_num = output_shape[1] // self.lmk_dim
# Determine landmark dimensions from output shape
output_shape = self.session.get_outputs()[0].shape
self.lmk_dim = 2  # x,y coordinates
self.lmk_num = output_shape[1] // self.lmk_dim  # Number of landmarks

Logger.info(f"Model initialized with {self.lmk_num} landmarks")

except Exception as e:
print(f"Failed to load the model: {e}")
raise
Logger.error(f"Failed to load landmark model from '{self.model_path}'", exc_info=True)
raise RuntimeError(f"Failed to initialize landmark model: {e}")

def preprocess(self, image: np.ndarray, bbox: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
"""
Preprocess the input image and bbox for inference.
Preprocess the input image and bounding box for inference.

Args:
image (np.ndarray): Input image.
bbox (np.ndarray): Bounding box [x1, y1, x2, y2].
image: Input image in BGR format
bbox: Bounding box coordinates [x1, y1, x2, y2]

Returns:
Tuple[np.ndarray, np.ndarray]: Preprocessed blob and transformation matrix.
Tuple containing:
- Preprocessed image blob ready for inference
- Transformation matrix for mapping predictions back to original image
"""
# Calculate face dimensions and center
width, height = bbox[2] - bbox[0], bbox[3] - bbox[1]
center = (bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2

# Determine scale to fit face with some margin
scale = self.input_size[0] / (max(width, height) * 1.5)
rotation = 0.0

transformed_image, M = bbox_center_alignment(image, center, self.input_size[0], scale, rotation)
input_size = tuple(transformed_image.shape[0:2][::-1])

blob = cv2.dnn.blobFromImage(
transformed_image,
1.0/self.input_std,
input_size,
(self.input_mean, self.input_mean, self.input_mean),
swapRB=True
# Align face using center, scale and rotation
aligned_face, transform_matrix = bbox_center_alignment(
image, center, self.input_size[0], scale, rotation
)
return blob, M

def postprocess(self, predictions: np.ndarray, M: np.ndarray) -> np.ndarray:
# Convert to blob format for inference
face_blob = cv2.dnn.blobFromImage(
aligned_face,
1.0 / self.input_std,
self.input_size,
(self.input_mean, self.input_mean, self.input_mean),
swapRB=True  # Convert BGR to RGB
)

return face_blob, transform_matrix

def postprocess(self, predictions: np.ndarray, transform_matrix: np.ndarray) -> np.ndarray:
"""
Postprocess model outputs to get landmarks.
Convert raw model predictions to image coordinates.

Args:
predictions (np.ndarray): Raw model predictions.
M (np.ndarray): Affine transformation matrix.
predictions: Raw landmark coordinates from model output
transform_matrix: Affine transformation matrix from preprocessing

Returns:
np.ndarray: Transformed landmarks.
Landmarks in original image coordinates
"""
# Reshape to pairs of x,y coordinates
landmarks = predictions.reshape((-1, 2))

predictions = predictions.reshape((-1, 2))
# Denormalize coordinates to pixel space
landmarks[:, 0:2] += 1  # Shift from [-1,1] to [0,2] range
landmarks[:, 0:2] *= (self.input_size[0] // 2)  # Scale to pixel coordinates

predictions[:, 0:2] += 1
predictions[:, 0:2] *= (self.input_size[0] // 2)
# Invert the transformation to map back to original image
inverse_matrix = cv2.invertAffineTransform(transform_matrix)
landmarks = transform_points_2d(landmarks, inverse_matrix)

IM = cv2.invertAffineTransform(M)
predictions = transform_points_2d(predictions, IM)

return predictions
return landmarks

def predict(self, image: np.ndarray, bbox: np.ndarray) -> np.ndarray:
"""
Predict facial landmarks for the given image and bounding box.
Predict facial landmarks for the given image and face bounding box.

Args:
image (np.ndarray): Input image.
bbox (np.ndarray): Bounding box [x1, y1, x2, y2].
image: Input image in BGR format
bbox: Face bounding box [x1, y1, x2, y2]

Returns:
np.ndarray: Predicted landmarks.
Array of facial landmarks in original image coordinates
"""
blob, M = self.preprocess(image, bbox)
preds = self.session.run(self.output_names, {self.input_names[0]: blob})[0][0]
landmarks = self.postprocess(preds, M)
# Preprocess image
face_blob, transform_matrix = self.preprocess(image, bbox)

# Run inference
raw_predictions = self.session.run(
self.output_names,
{self.input_names[0]: face_blob}
)[0][0]

# Postprocess to get landmarks in original image space
landmarks = self.postprocess(raw_predictions, transform_matrix)

return landmarks
@@ -138,6 +170,8 @@ class Landmark:


if __name__ == "__main__":
from uniface.detection import RetinaFace
from uniface.constants import RetinaFaceWeights

face_detector = RetinaFace(
model_name=RetinaFaceWeights.MNET_V2,
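A minimal usage sketch (not part of this commit) for the updated Landmark API above; the Landmark import path and the detector's detect() call are assumptions:

# Illustrative only -- import path for Landmark and detect() signature assumed.
import cv2

from uniface.detection import RetinaFace
from uniface.constants import RetinaFaceWeights, LandmarkWeights
from uniface.landmark import Landmark  # hypothetical import path

detector = RetinaFace(model_name=RetinaFaceWeights.MNET_V2)
landmarker = Landmark(model_name=LandmarkWeights.DEFAULT)

image = cv2.imread("face.jpg")
boxes, _ = detector.detect(image)  # assumed detector API

if len(boxes) > 0:
    # predict() returns landmarks mapped back to original image coordinates.
    points = landmarker.predict(image, boxes[0][:4])
    print(points.shape)  # (num_landmarks, 2)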
@@ -7,7 +7,6 @@ import os
import cv2
import numpy as np
import onnxruntime as ort

from typing import Tuple, Optional, Union, List
from dataclasses import dataclass
@@ -37,100 +36,99 @@ class BaseFaceEncoder:

def __init__(
self,
model_name: SphereFaceWeights | MobileFaceWeights | ArcFaceWeights = MobileFaceWeights.MNET_V2,
model_name: Union[SphereFaceWeights, MobileFaceWeights, ArcFaceWeights] = MobileFaceWeights.MNET_V2,
preprocessing: PreprocessConfig = PreprocessConfig(),
) -> None:
"""
Initializes the FaceEncoder model for inference.

Args:
model_name (SphereFaceWeights | MobileFaceWeights | ArcFaceWeights): Selected model weight enum.
preprocessing (PreprocessConfig): Configuration for input normalization and resizing.
model_name: Selected model weight enum.
preprocessing: Configuration for input normalization and resizing.
"""
# Store preprocessing parameters
self.input_mean = preprocessing.input_mean
self.input_std = preprocessing.input_std
self.input_size = preprocessing.input_size

Logger.info(
f"Initializing Face Recognition with model={model_name}, "
f"input_mean={self.input_mean}, input_std={self.input_std}, input_size={self.input_size}"
f"input_mean={self.input_mean}, input_std={self.input_std}, "
f"input_size={self.input_size}"
)

# Get path to model weights
self._model_path = verify_model_weights(model_name)
Logger.info(f"Verfied model weights located at: {self._model_path}")
# Get path to model weights and initialize model
self.model_path = verify_model_weights(model_name)
Logger.info(f"Verified model weights located at: {self.model_path}")

# Initialize model
self._initialize_model(self._model_path)
self._initialize_model()

def _initialize_model(self, model_path: str) -> None:
def _initialize_model(self) -> None:
"""
Loads the ONNX model and prepares it for inference.

Args:
model_path (str): Path to the ONNX model file.

Raises:
RuntimeError: If the model fails to load or initialize.
"""
try:
# Initialize model session with available providers
self.session = ort.InferenceSession(
model_path,
self.model_path,
providers=["CUDAExecutionProvider", "CPUExecutionProvider"]
)
self._setup_model()
Logger.info(f"Successfully initialized face encoder from {model_path}")
except Exception as e:
Logger.error(f"Failed to load face encoder model from '{model_path}'", exc_info=True)
raise RuntimeError(f"Failed to initialize model session for '{model_path}'") from e

def _setup_model(self) -> None:
"""
Extracts input/output configuration from the ONNX model session.
"""
# Extract input configuration
input_cfg = self.session.get_inputs()[0]
self.input_name = input_cfg.name

# Verify input dimensions match our configuration
input_shape = input_cfg.shape
model_input_size = tuple(input_shape[2:4][::-1])  # (width, height)

if model_input_size != self.input_size:
Logger.warning(f"Model input size {model_input_size} differs from configured size {self.input_size}")

self.input_name = input_cfg.name
# Extract output configuration
self.output_names = [output.name for output in self.session.get_outputs()]
self.output_shape = self.session.get_outputs()[0].shape

assert len(self.output_names) == 1, "Expected only one output node."
Logger.info(f"Successfully initialized face encoder from {self.model_path}")

def preprocess(self, image: np.ndarray) -> np.ndarray:
except Exception as e:
Logger.error(f"Failed to load face encoder model from '{self.model_path}'", exc_info=True)
raise RuntimeError(f"Failed to initialize model session for '{self.model_path}'") from e

def preprocess(self, face_img: np.ndarray) -> np.ndarray:
"""
Preprocess the image: resize, normalize, and convert it to a blob.

Args:
image (np.ndarray): Input image in BGR format.
face_img: Input image in BGR format.

Returns:
np.ndarray: Preprocessed image as a NumPy array ready for inference.
Preprocessed image as a NumPy array ready for inference.
"""
image = cv2.resize(image, self.input_size)  # Resize to (112, 112)
if isinstance(self.input_std, (list, tuple)):
# if self.input_std is a list, we assume it's per-channel std
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
resized_img = cv2.resize(face_img, self.input_size)

image -= np.array(self.input_mean, dtype=np.float32)
image /= np.array(self.input_std, dtype=np.float32)
if isinstance(self.input_std, (list, tuple)):
# Per-channel normalization
rgb_img = cv2.cvtColor(resized_img, cv2.COLOR_BGR2RGB).astype(np.float32)
normalized_img = (rgb_img - np.array(self.input_mean, dtype=np.float32)) / \
np.array(self.input_std, dtype=np.float32)

# Change to NCHW (batch, channels, height, width)
blob = np.transpose(image, (2, 0, 1))  # CHW
blob = np.transpose(normalized_img, (2, 0, 1))  # CHW
blob = np.expand_dims(blob, axis=0)  # NCHW
else:
# cv2.dnn.blobFromImage does not support per-channel std so we use a single value here
# Single-value normalization
blob = cv2.dnn.blobFromImage(
image,
resized_img,
scalefactor=1.0 / self.input_std,
size=self.input_size,
mean=(self.input_mean, self.input_mean, self.input_mean),
swapRB=True  # Convert BGR to RGB
)

return blob

def get_embedding(self, image: np.ndarray, landmarks: np.ndarray) -> np.ndarray:
@@ -138,13 +136,17 @@
Extracts face embedding from an aligned image.

Args:
image (np.ndarray): Input face image (BGR format).
landmarks (np.ndarray): Facial landmarks (5 points for alignment).
image: Input face image (BGR format).
landmarks: Facial landmarks (5 points for alignment).

Returns:
np.ndarray: 512-dimensional face embedding.
Face embedding vector (typically 512-dimensional).
"""
aligned_face, _ = face_alignment(image, landmarks)  # Use your function for alignment
blob = self.preprocess(aligned_face)  # Convert to blob
embedding = self.session.run(self.output_names, {self.input_name: blob})[0]
return embedding  # Return the 512-D feature vector
# Align face using landmarks
aligned_face, _ = face_alignment(image, landmarks)

# Generate embedding from aligned face
face_blob = self.preprocess(aligned_face)
embedding = self.session.run(self.output_names, {self.input_name: face_blob})[0]

return embedding
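A minimal usage sketch (not part of this commit) for the updated BaseFaceEncoder API above; the encoder import path and the detector's detect() call are assumptions:

# Illustrative only -- import path for BaseFaceEncoder and detect() signature assumed.
import cv2
import numpy as np

from uniface import RetinaFace
from uniface.constants import RetinaFaceWeights, MobileFaceWeights
from uniface.recognition import BaseFaceEncoder  # hypothetical import path

detector = RetinaFace(model_name=RetinaFaceWeights.MNET_V2)
encoder = BaseFaceEncoder(model_name=MobileFaceWeights.MNET_V2)

image = cv2.imread("face.jpg")
boxes, landmarks = detector.detect(image)  # assumed detector API

if len(boxes) > 0:
    # get_embedding() aligns the face with the 5-point landmarks, then runs the encoder.
    embedding = encoder.get_embedding(image, landmarks[0])
    print(np.linalg.norm(embedding))  # L2 norm of the embedding vector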