# uniface/scripts/run_face_parsing.py

# Face parsing on detected faces
# Usage: python run_face_parsing.py --image path/to/image.jpg
#        python run_face_parsing.py --webcam

import argparse
import os
from pathlib import Path

import cv2
import numpy as np

from uniface import RetinaFace
from uniface.constants import ParsingWeights
from uniface.parsing import BiSeNet
from uniface.visualization import vis_parsing_maps


def expand_bbox(
    bbox: np.ndarray,
    image_shape: tuple[int, int],
    expand_ratio: float = 0.2,
    expand_top_ratio: float = 0.4,
) -> tuple[int, int, int, int]:
    """
    Expand bounding box to include full head region for face parsing.

    Face detection typically returns tight face boxes, but face parsing
    requires the full head including hair, ears, and neck.

    Every returned coordinate is clamped into the image on both sides, so the
    result is always safe to use as a slice: a box lying entirely outside the
    image collapses to an empty region instead of producing negative indices
    (which NumPy slicing would interpret as "count from the end").

    Args:
        bbox: Original bounding box [x1, y1, x2, y2]. Only the first four values
            are read and each is truncated to int.
        image_shape: Image dimensions as (height, width). Extra trailing entries
            (e.g. a channel count) are ignored.
        expand_ratio: Expansion ratio for left, right, and bottom (default: 0.2 = 20%).
        expand_top_ratio: Expansion ratio for top to capture hair/forehead (default: 0.4 = 40%).

    Returns:
        Tuple[int, int, int, int]: Expanded bbox (x1, y1, x2, y2) clamped to image bounds.
    """
    x1, y1, x2, y2 = map(int, bbox[:4])
    height, width = image_shape[:2]

    # Calculate face dimensions
    face_width = x2 - x1
    face_height = y2 - y1

    # Calculate expansion amounts
    expand_x = int(face_width * expand_ratio)
    expand_y_bottom = int(face_height * expand_ratio)
    expand_y_top = int(face_height * expand_top_ratio)

    def clamp(value: int, upper: int) -> int:
        # Two-sided clamp: a one-sided max()/min() lets an out-of-frame box
        # leak negative or oversized coordinates into the result.
        return max(0, min(upper, value))

    # Expand and clamp to image boundaries
    new_x1 = clamp(x1 - expand_x, width)
    new_y1 = clamp(y1 - expand_y_top, height)
    new_x2 = clamp(x2 + expand_x, width)
    new_y2 = clamp(y2 + expand_y_bottom, height)

    return new_x1, new_y1, new_x2, new_y2


def process_image(detector, parser, image_path: str, save_dir: str = 'outputs', expand_ratio: float = 0.2):
    """
    Detect faces in a single image, overlay a parsing visualization on each
    expanded face region, and write the annotated image to ``save_dir``.

    Args:
        detector: Object whose ``detect(image)`` returns a sized iterable of
            faces, each exposing a ``bbox`` attribute.
        parser: Object whose ``parse(crop)`` returns the segmentation mask for
            the crop.
        image_path: Path of the image to read with ``cv2.imread``.
        save_dir: Directory for the output file; created if missing.
        expand_ratio: Left/right/bottom expansion ratio forwarded to ``expand_bbox``.

    Returns:
        None. Progress and errors are reported on stdout; the function returns
        early if the image cannot be loaded.
    """
    image = cv2.imread(image_path)
    if image is None:
        print(f"Error: Failed to load image from '{image_path}'")
        return

    faces = detector.detect(image)
    print(f'Detected {len(faces)} face(s)')

    # Draw on a copy so every crop is taken from untouched source pixels,
    # even when expanded boxes of neighbouring faces overlap.
    result_image = image.copy()

    for i, face in enumerate(faces):
        # Expand bbox to include full head for parsing
        x1, y1, x2, y2 = expand_bbox(face.bbox, image.shape, expand_ratio=expand_ratio)
        face_crop = image[y1:y2, x1:x2]

        if face_crop.size == 0:
            continue

        # Parse the face
        mask = parser.parse(face_crop)
        # np.unique counts distinct labels in native code rather than building
        # a Python set pixel by pixel (assumes the mask is array-like).
        print(f'  Face {i + 1}: parsed with {np.unique(mask).size} unique classes')

        # Visualize the parsing result
        face_crop_rgb = cv2.cvtColor(face_crop, cv2.COLOR_BGR2RGB)
        vis_result = vis_parsing_maps(face_crop_rgb, mask, save_image=False)

        # Place the visualization back on the original image
        # NOTE(review): assumes vis_parsing_maps returns an array shaped like the crop — confirm.
        result_image[y1:y2, x1:x2] = vis_result

        # Draw expanded bounding box
        cv2.rectangle(result_image, (x1, y1), (x2, y2), (0, 255, 0), 2)

    os.makedirs(save_dir, exist_ok=True)
    output_path = os.path.join(save_dir, f'{Path(image_path).stem}_parsing.jpg')

    # cv2.imwrite reports failure through its return value instead of raising,
    # so check it before announcing success.
    if not cv2.imwrite(output_path, result_image):
        print(f"Error: Failed to save output to '{output_path}'")
        return
    print(f'Output saved: {output_path}')


def run_webcam(detector, parser, expand_ratio: float = 0.2):
    """
    Run face parsing on frames from the default webcam until 'q' is pressed.

    Each frame is mirrored, faces are detected, and a parsing visualization is
    overlaid on every expanded face region before the frame is shown.

    Args:
        detector: Object whose ``detect(frame)`` returns a sized iterable of
            faces, each exposing a ``bbox`` attribute.
        parser: Object whose ``parse(crop)`` returns the segmentation mask for
            the crop.
        expand_ratio: Left/right/bottom expansion ratio forwarded to ``expand_bbox``.

    Returns:
        None. Returns immediately if the webcam cannot be opened.
    """
    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        print('Cannot open webcam')
        return

    print("Press 'q' to quit")

    # try/finally guarantees the camera and the window are released even if
    # detection/parsing raises or the user interrupts with Ctrl+C.
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            frame = cv2.flip(frame, 1)
            faces = detector.detect(frame)

            # Draw on a copy so crops always come from the raw frame; otherwise
            # an overlapping face would be parsed from already-painted pixels.
            display = frame.copy()

            for face in faces:
                # Expand bbox to include full head for parsing
                x1, y1, x2, y2 = expand_bbox(face.bbox, frame.shape, expand_ratio=expand_ratio)
                face_crop = frame[y1:y2, x1:x2]

                if face_crop.size == 0:
                    continue

                # Parse the face
                mask = parser.parse(face_crop)

                # Visualize the parsing result
                face_crop_rgb = cv2.cvtColor(face_crop, cv2.COLOR_BGR2RGB)
                vis_result = vis_parsing_maps(face_crop_rgb, mask, save_image=False)

                # Place the visualization back on the displayed frame
                display[y1:y2, x1:x2] = vis_result

                # Draw expanded bounding box
                cv2.rectangle(display, (x1, y1), (x2, y2), (0, 255, 0), 2)

            cv2.putText(display, f'Faces: {len(faces)}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
            cv2.imshow('Face Parsing', display)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        cap.release()
        cv2.destroyAllWindows()


def main():
    """
    Command-line entry point: parse arguments, build the detector and the
    parsing model, then run either the webcam loop or single-image processing.

    ``--webcam`` takes precedence when both ``--image`` and ``--webcam`` are given.
    """
    model_choices = [ParsingWeights.RESNET18, ParsingWeights.RESNET34]

    parser_arg = argparse.ArgumentParser(description='Run face parsing')
    parser_arg.add_argument('--image', type=str, help='Path to input image')
    parser_arg.add_argument('--webcam', action='store_true', help='Use webcam')
    parser_arg.add_argument('--save_dir', type=str, default='outputs')
    parser_arg.add_argument(
        '--model',
        type=str,
        default=ParsingWeights.RESNET18,
        choices=model_choices,
        help='Parsing model weights to load',
    )
    parser_arg.add_argument(
        '--expand-ratio',
        type=float,
        default=0.2,
        help='Bbox expansion ratio for full head coverage (default: 0.2 = 20%%)',
    )
    args = parser_arg.parse_args()

    if not args.image and not args.webcam:
        parser_arg.error('Either --image or --webcam must be specified')

    # Honour --model instead of a hard-coded weight. argparse passes string
    # values (including a string-like default) through ``type=str``, so the
    # parsed value may no longer be the original constant; map it back by
    # comparing against each choice and its str() form.
    # NOTE(review): ParsingWeights is defined outside this file — confirm its
    # members compare equal to their command-line spelling.
    model_name = next(
        (choice for choice in model_choices if args.model in (choice, str(choice))),
        None,
    )
    if model_name is None:
        parser_arg.error(f'Unsupported --model value: {args.model!r}')

    detector = RetinaFace()
    parser = BiSeNet(model_name=model_name)

    if args.webcam:
        run_webcam(detector, parser, expand_ratio=args.expand_ratio)
    else:
        process_image(detector, parser, args.image, args.save_dir, expand_ratio=args.expand_ratio)


# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()