# Source code for concepts.language.gpt_vlm_query.gpt_image_query_utils

#! /usr/bin/env python3
# -*- coding: utf-8 -*-
# File   : gpt_image_query_utils.py
# Author : Jiayuan Mao
# Email  : maojiayuan@gmail.com
# Date   : 09/12/2024
#
# This file is part of Project Concepts.
# Distributed under terms of the MIT license.

"""Utility functions for preparing image queries for GPT models, such as drawing bounding boxes or drawing grids on images."""

from typing import Optional, Tuple, List

import cv2
import seaborn as sns
import numpy as np



[docs]
def resize_to(img: np.ndarray, target_max_dim: int) -> np.ndarray:
    """Resize an image so that its longer side is scaled to ``target_max_dim``, keeping the aspect ratio.

    Args:
        img: the image array; its first two axes are read as (height, width).
        target_max_dim: the target length, in pixels, of the longer side.

    Returns:
        the resized array produced by ``cv2.resize`` with its default interpolation.
    """
    height, width = img.shape[:2]
    scale = target_max_dim / max(height, width)
    # Truncation can collapse the short side of a very elongated image to 0, which
    # cv2.resize rejects, so keep every side at least one pixel long.
    target_size = (max(1, int(width * scale)), max(1, int(height * scale)))
    return cv2.resize(img, target_size)




[docs]
def draw_text_inplace(
    img_: np.ndarray, text: str, x: int, y: int,
    font_scale: float = 2, font_thickness: int = 3, text_color: Tuple[int, ...] = (0, 0, 0), bg_color: Optional[Tuple[int, ...]] = None, font = cv2.FONT_HERSHEY_PLAIN
) -> None:
    """Draw a text label directly onto ``img_``; the array is modified in place.

    Args:
        img_: the image to draw on.
        text: the string to render.
        x: the x coordinate of the top-left corner of the label.
        y: the y coordinate of the top-left corner of the label.
        font_scale: the scale factor handed to OpenCV for the font.
        font_thickness: the stroke thickness of the glyphs.
        text_color: the color of the text.
        bg_color: if not None, a filled rectangle of this color, sized to the measured
            text extent, is drawn behind the text.
        font: the OpenCV font face.
    """
    (text_w, text_h), _ = cv2.getTextSize(text, font, font_scale, font_thickness)

    if bg_color is not None:
        cv2.rectangle(img_, (x, y), (x + text_w, y + text_h), bg_color, -1)

    # cv2.putText anchors the text at its bottom-left corner and only accepts integer
    # coordinates. font_scale may be fractional, so round the computed baseline; for
    # integer scales this yields exactly the same position as before.
    baseline_y = int(round(y + text_h + font_scale - 1))
    cv2.putText(img_, text, (x, baseline_y), font, font_scale, text_color, font_thickness)




[docs]
def draw_text(img: np.ndarray, text: str, x: int, y: int, font_scale: float = 2, font_thickness: int = 3, text_color: Tuple[int, ...] = (0, 0, 0), bg_color: Optional[Tuple[int, ...]] = None, font = cv2.FONT_HERSHEY_PLAIN) -> np.ndarray:
    """Return a copy of ``img`` with ``text`` drawn at ``(x, y)``; the input array is left untouched.

    All arguments are forwarded unchanged to :func:`draw_text_inplace`, which does the drawing.
    """
    canvas = img.copy()
    draw_text_inplace(
        canvas, text, x, y,
        font_scale=font_scale,
        font_thickness=font_thickness,
        text_color=text_color,
        bg_color=bg_color,
        font=font,
    )
    return canvas




[docs]
def draw_grid(img: np.ndarray, nr_vertical: int, nr_horizontal: int, resize_to_max_dim: int = 0) -> np.ndarray:
    """Overlay a white grid on a copy of the image and number every cell.

    The image is divided into ``nr_vertical`` columns and ``nr_horizontal`` rows, i.e.
    ``nr_vertical - 1`` vertical and ``nr_horizontal - 1`` horizontal separator lines are drawn.
    Cells are numbered starting from 1 in row-major order (left to right, then top to bottom).
    Each number is drawn in black on a white background, with the top-left corner of the label
    placed at the center of its cell.

    Args:
        img: the input image; it is not modified.
        nr_vertical: the number of columns of the grid.
        nr_horizontal: the number of rows of the grid.
        resize_to_max_dim: if positive, the image is first resized with :func:`resize_to`
            to this maximum dimension.

    Returns:
        a new image with the grid and the cell numbers drawn on it.
    """

    if resize_to_max_dim > 0:
        img = resize_to(img, resize_to_max_dim)

    img = img.copy()
    h, w = img.shape[:2]
    line_color = (255, 255, 255)
    line_width = 3

    # Separator lines between columns, then between rows.
    for col in range(1, nr_vertical):
        x = col * w // nr_vertical
        cv2.line(img, (x, 0), (x, h), line_color, line_width)
    for row in range(1, nr_horizontal):
        y = row * h // nr_horizontal
        cv2.line(img, (0, y), (w, y), line_color, line_width)

    font = cv2.FONT_HERSHEY_PLAIN
    font_scale = 2
    font_thickness = 3
    text_color = (0, 0, 0)
    text_color_bg = (255, 255, 255)

    for row in range(nr_horizontal):
        # The vertical anchor only depends on the row, so compute it once per row.
        y = row * h // nr_horizontal + h // nr_horizontal // 2
        for col in range(nr_vertical):
            x = col * w // nr_vertical + w // nr_vertical // 2
            label = f"{row * nr_vertical + col + 1}"
            draw_text_inplace(img, label, x, y, font_scale, font_thickness, text_color, text_color_bg, font)

    return img




[docs]
def draw_masks(img: np.ndarray, masks: List[np.ndarray], alpha: float = 0.5, contour_width: int = 2, mode: str = 'mask+bbox', resize_to_max_dim: int = 0) -> np.ndarray:
    """Overlay a list of masks on a copy of the image and label each one with its index.

    Every mask gets its own color from the seaborn ``'bright'`` palette. Masks without any
    positive pixel are skipped; the remaining masks keep their original list index as label.

    Args:
        img: the input image; it is not modified.
        masks: the masks to draw; pixels with a value greater than 0 are treated as foreground.
            NOTE(review): each mask is assumed to have the same height and width as ``img`` -- confirm with callers.
        alpha: the weight of the original pixel when blending; the mask color gets ``1 - alpha``.
        contour_width: the line thickness used for contours and bounding boxes.
        mode: selects what to draw by substring match: ``'mask'`` (blended fill), ``'contour'``
            (outline of the first contour whose area exceeds 50) and ``'bbox'`` (bounding
            rectangle). They can be combined, e.g. ``'mask+bbox'``.
        resize_to_max_dim: if positive, the image and every mask are first resized with
            :func:`resize_to` to this maximum dimension.

    Returns:
        a new image with the requested overlays. The index label is placed near the mask
        centroid (or the contour centroid in contour mode); in bbox-only mode no label is drawn.
    """

    if resize_to_max_dim > 0:
        img = resize_to(img, resize_to_max_dim)

    draw_mask = 'mask' in mode
    draw_contour = 'contour' in mode
    draw_bbox = 'bbox' in mode

    img = img.copy()
    colors = sns.color_palette('bright', n_colors=len(masks))
    for i, mask in enumerate(masks):
        if resize_to_max_dim > 0:
            # cv2.resize does not accept boolean arrays, so convert them to 0/1 first.
            if mask.dtype == np.bool_:
                mask = mask.astype(np.uint8)
            mask = (resize_to(mask, resize_to_max_dim) > 0.5)

        foreground = mask > 0
        if not foreground.any():
            # Nothing to draw: the centroid of an empty mask is undefined (nan) and its
            # bounding rectangle is degenerate, so skip it instead of failing.
            continue

        text_pos = None
        contour_color = tuple(map(int, np.array(colors[i][:3]) * 255))

        if draw_mask:
            # Blend in float and convert back to uint8 so that the cv2 drawing calls below work.
            img = img.astype(np.float32)
            img[foreground] = img[foreground] * alpha + np.array(contour_color) / np.array(contour_color).max() * (1 - alpha) * 255
            img = img.astype(np.uint8)
            indices = np.nonzero(foreground)
            ys, xs = indices[0], indices[1]
            text_pos = (int(xs.mean()) + 5, int(ys.mean()) + 5)

        if draw_contour:
            # findContours returns (contours, hierarchy) in OpenCV 4 and
            # (image, contours, hierarchy) in OpenCV 3; [-2] is the contour list in both.
            contours = cv2.findContours(mask.astype(np.uint8).copy(), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE)[-2]
            # Use the first contour that is large enough to not be noise.
            contour = next((c for c in contours if cv2.contourArea(c) > 50), None)

            if contour is not None:
                # drawContours expects a list of contours, hence the wrapping list.
                cv2.drawContours(img, [contour], -1, contour_color, thickness=contour_width)
                M = cv2.moments(contour)
                text_pos = (round(M['m10'] / M['m00']), round(M['m01'] / M['m00']))

        if draw_bbox:
            x, y, w, h = cv2.boundingRect(mask.astype(np.uint8))
            cv2.rectangle(img, (x, y), (x + w, y + h), contour_color, contour_width)

        if text_pos is not None:
            draw_text_inplace(img, str(i), text_pos[0], text_pos[1], font_scale=1, font_thickness=1, text_color=(0, 0, 0), bg_color=(255, 255, 255))

    return img