feat: Add 2D Gaze estimation models (#34)

* feat: Add Gaze Estimation, update docs and Add example notebook, inference code * docs: Update README.md
2025-12-30 09:02:25 +00:00 · 2025-12-14 14:07:46 +09:00
parent da8a5cf35b
commit 4d1921e531
16 changed files with 1004 additions and 7 deletions
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -9,6 +9,7 @@ Scripts for testing UniFace features.
 | `run_detection.py` | Face detection on image or webcam |
 | `run_age_gender.py` | Age and gender prediction |
 | `run_emotion.py` | Emotion detection (7 or 8 emotions) |
+| `run_gaze_estimation.py` | Gaze direction estimation |
 | `run_landmarks.py` | 106-point facial landmark detection |
 | `run_recognition.py` | Face embedding extraction and comparison |
 | `run_face_analyzer.py` | Complete face analysis (detection + recognition + attributes) |
@@ -33,6 +34,10 @@ python scripts/run_age_gender.py --webcam
 python scripts/run_emotion.py --image assets/test.jpg
 python scripts/run_emotion.py --webcam

+# Gaze estimation
+python scripts/run_gaze_estimation.py --image assets/test.jpg
+python scripts/run_gaze_estimation.py --webcam
+
 # Landmarks
 python scripts/run_landmarks.py --image assets/test.jpg
 python scripts/run_landmarks.py --webcam
--- a/scripts/run_age_gender.py
+++ b/scripts/run_age_gender.py
@@ -79,7 +79,9 @@ def run_webcam(detector, age_gender, threshold: float = 0.6):
        bboxes = [f['bbox'] for f in faces]
        scores = [f['confidence'] for f in faces]
        landmarks = [f['landmarks'] for f in faces]
-        draw_detections(frame, bboxes, scores, landmarks, vis_threshold=threshold)
+        draw_detections(
+            image=frame, bboxes=bboxes, scores=scores, landmarks=landmarks, vis_threshold=threshold, fancy_bbox=True
+        )

        for face in faces:
            gender_id, age = age_gender.predict(frame, face['bbox'])  # predict per face
--- a/scripts/run_detection.py
+++ b/scripts/run_detection.py
@@ -98,7 +98,7 @@ def main():
    else:
        from uniface.constants import YOLOv5FaceWeights

-        detector = YOLOv5Face(model_name=YOLOv5FaceWeights.YOLOV5N)
+        detector = YOLOv5Face(model_name=YOLOv5FaceWeights.YOLOV5M)

    if args.webcam:
        run_webcam(detector, args.threshold)
--- a/scripts/run_gaze_estimation.py
+++ b/scripts/run_gaze_estimation.py
@@ -0,0 +1,104 @@
+# Gaze estimation on detected faces
+# Usage: python run_gaze_estimation.py --image path/to/image.jpg
+#        python run_gaze_estimation.py --webcam
+
+import argparse
+import os
+from pathlib import Path
+
+import cv2
+import numpy as np
+
+from uniface import RetinaFace
+from uniface.gaze import MobileGaze
+from uniface.visualization import draw_gaze
+
+
+def process_image(detector, gaze_estimator, image_path: str, save_dir: str = 'outputs'):
+    image = cv2.imread(image_path)
+    if image is None:
+        print(f"Error: Failed to load image from '{image_path}'")
+        return
+
+    faces = detector.detect(image)
+    print(f'Detected {len(faces)} face(s)')
+
+    for i, face in enumerate(faces):
+        bbox = face['bbox']
+        x1, y1, x2, y2 = map(int, bbox[:4])
+        face_crop = image[y1:y2, x1:x2]
+
+        if face_crop.size == 0:
+            continue
+
+        pitch, yaw = gaze_estimator.estimate(face_crop)
+        print(f'  Face {i + 1}: pitch={np.degrees(pitch):.1f}°, yaw={np.degrees(yaw):.1f}°')
+
+        # Draw both bbox and gaze arrow with angle text
+        draw_gaze(image, bbox, pitch, yaw, draw_angles=True)
+
+    os.makedirs(save_dir, exist_ok=True)
+    output_path = os.path.join(save_dir, f'{Path(image_path).stem}_gaze.jpg')
+    cv2.imwrite(output_path, image)
+    print(f'Output saved: {output_path}')
+
+
+def run_webcam(detector, gaze_estimator):
+    cap = cv2.VideoCapture(0)
+    if not cap.isOpened():
+        print('Cannot open webcam')
+        return
+
+    print("Press 'q' to quit")
+
+    while True:
+        ret, frame = cap.read()
+        if not ret:
+            break
+
+        frame = cv2.flip(frame, 1)
+        faces = detector.detect(frame)
+
+        for face in faces:
+            bbox = face['bbox']
+            x1, y1, x2, y2 = map(int, bbox[:4])
+            face_crop = frame[y1:y2, x1:x2]
+
+            if face_crop.size == 0:
+                continue
+
+            pitch, yaw = gaze_estimator.estimate(face_crop)
+            # Draw both bbox and gaze arrow
+            draw_gaze(frame, bbox, pitch, yaw)
+
+        cv2.putText(frame, f'Faces: {len(faces)}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
+        cv2.imshow('Gaze Estimation', frame)
+
+        if cv2.waitKey(1) & 0xFF == ord('q'):
+            break
+
+    cap.release()
+    cv2.destroyAllWindows()
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Run gaze estimation')
+    parser.add_argument('--image', type=str, help='Path to input image')
+    parser.add_argument('--webcam', action='store_true', help='Use webcam')
+    parser.add_argument('--save_dir', type=str, default='outputs')
+    args = parser.parse_args()
+
+    if not args.image and not args.webcam:
+        parser.error('Either --image or --webcam must be specified')
+
+    detector = RetinaFace()
+    gaze_estimator = MobileGaze()
+
+    if args.webcam:
+        run_webcam(detector, gaze_estimator)
+    else:
+        process_image(detector, gaze_estimator, args.image, args.save_dir)
+
+
+if __name__ == '__main__':
+    main()