Shortcuts

Source code for agentlego.tools.image_editing.expansion

import math

import numpy as np
from PIL import Image, ImageOps

from agentlego.types import Annotated, ImageIO, Info
from agentlego.utils import load_or_build_object, parse_multi_float, require
from ..base import BaseTool
from .replace import Inpainting


def blend_gt2pt(old_image, new_image, sigma=0.15, steps=100):
    """Blend the ground truth image with the predicted image.

    The old image is composited onto the centre of the new image through a
    feathered weight mask: weight 1 in the interior, a linear 0 -> 1 ramp
    along each of the four edges, and a normalized Gaussian fall-off in the
    four ``steps x steps`` corners.

    This function is adapted from `TaskMatrix/visual_chatgpt.py
    <https://github.com/microsoft/TaskMatrix/blob/main/visual_chatgpt.py>`_.

    Args:
        old_image (PIL.Image.Image): The ground truth image.
        new_image (PIL.Image.Image): The predicted image. It is expected to
            be at least as large as ``old_image`` in both dimensions.
        sigma (float): The sigma of the Gaussian kernel, as a fraction of
            the old image's side length.
        steps (int): The width (in pixels) of the feathered border. It is
            clamped to half of the shorter side of ``old_image`` so that
            small images do not make the border regions overlap.

    Returns:
        PIL.Image.Image: The blended image.
    """
    import cv2

    new_w, new_h = new_image.size
    old_w, old_h = old_image.size
    easy_img = np.array(new_image)
    gt_img_array = np.array(old_image)
    # Top-left corner of the old image inside the new image (centred).
    pos_w = (new_w - old_w) // 2
    pos_h = (new_h - old_h) // 2

    # The original code built edge ramps of length ``side - 2 * steps`` and
    # crashed in ``np.repeat`` with a negative count when a side was shorter
    # than ``2 * steps``. Clamp so the two borders on each axis never
    # overlap; larger images are unaffected.
    steps = max(0, min(steps, old_w // 2, old_h // 2))

    if steps == 0:
        # Too small to feather (a side of 0 or 1 px): keep the old image
        # as is.
        # NOTE(review): assumes 3-channel input, like the rest of this
        # function.
        kernel = np.ones((old_h, old_w, 3))
    else:
        kernel_h = cv2.getGaussianKernel(old_h, old_h * sigma)
        kernel_w = cv2.getGaussianKernel(old_w, old_w * sigma)
        # Outer product -> 2-D Gaussian of shape (old_h, old_w).
        kernel = np.multiply(kernel_h, np.transpose(kernel_w))

        # Interior keeps the old image untouched.
        kernel[steps:-steps, steps:-steps] = 1
        # Normalize each corner by its innermost element so that the weight
        # reaches exactly 1 where the corner meets the interior.
        kernel[:steps, :steps] = \
            kernel[:steps, :steps] / kernel[steps - 1, steps - 1]
        kernel[:steps, -steps:] = \
            kernel[:steps, -steps:] / kernel[steps - 1, -steps]
        kernel[-steps:, :steps] = \
            kernel[-steps:, :steps] / kernel[-steps, steps - 1]
        kernel[-steps:, -steps:] = \
            kernel[-steps:, -steps:] / kernel[-steps, -steps]

        # Replicate the mask over 3 channels -> (old_h, old_w, 3).
        kernel = np.repeat(np.expand_dims(kernel, 2), 3, 2)

        # Linear ramps for the four edges; broadcasting fills the edge
        # strips (which may be empty when a side equals ``2 * steps``).
        rising = np.linspace(0, 1, steps)
        falling = np.linspace(1, 0, steps)
        kernel[:steps, steps:-steps] = rising[:, None, None]    # top
        kernel[-steps:, steps:-steps] = falling[:, None, None]  # bottom
        kernel[steps:-steps, :steps] = rising[None, :, None]    # left
        kernel[steps:-steps, -steps:] = falling[None, :, None]  # right

    # Region of the predicted image that the old image will be blended into.
    pt_gt_img = easy_img[pos_h:pos_h + old_h, pos_w:pos_w + old_w]
    gaussian_gt_img = kernel * gt_img_array + (1 - kernel) * pt_gt_img
    # Truncate to integers before writing back into the image array.
    gaussian_gt_img = gaussian_gt_img.astype(np.int64)
    easy_img[pos_h:pos_h + old_h, pos_w:pos_w + old_w] = gaussian_gt_img
    return Image.fromarray(easy_img)


class ImageExpansion(BaseTool):
    """A tool to expand the given image.

    The image is grown outwards round by round: each round captions the
    current image, pastes it onto a larger white canvas, inpaints the
    masked surroundings using the caption as the prompt, and blends the
    original pixels back over the generated result.

    Args:
        caption_model (str): The model name used to inference. Which can be
            found in the ``MMPreTrain`` repository.
            Defaults to ``blip-base_3rdparty_caption``.
        device (str): The device to load the model. Defaults to 'cuda'.
        toolmeta (None | dict | ToolMeta): The additional info of the tool.
            Defaults to None.
    """

    default_desc = ('This tool can expand the peripheral area of an image '
                    'based on its content, thus obtaining a larger image.')

    @require('mmpretrain')
    @require('diffusers')
    def __init__(self,
                 caption_model: str = 'blip-base_3rdparty_caption',
                 device: str = 'cuda',
                 toolmeta=None):
        super().__init__(toolmeta=toolmeta)
        self.caption_model_name = caption_model
        self.device = device

    def setup(self):
        """Build (or fetch) the caption and inpainting inferencers."""
        from mmpretrain.apis import ImageCaptionInferencer
        self.caption_inferencer = load_or_build_object(
            ImageCaptionInferencer,
            model=self.caption_model_name,
            device=self.device)
        self.inpainting_inferencer = load_or_build_object(
            Inpainting, device=self.device)

    def apply(
        self,
        image: ImageIO,
        scale: Annotated[str,
                         Info('expand ratio, can be a float number or two '
                              'float number for width and height ratio.')],
    ) -> ImageIO:
        """Expand ``image`` to ``scale`` times its original size.

        Args:
            image (ImageIO): The image to expand.
            scale (str): The expand ratio, parsed by ``parse_multi_float``
                into a width ratio and a height ratio.

        Returns:
            ImageIO: The expanded image.
        """
        # NOTE(review): ratios well below 1 can make the canvas smaller than
        # the current image, which ``blend_gt2pt`` does not appear to
        # support - confirm whether shrinking is meant to be allowed.
        old_img = image.to_pil().convert('RGB')
        expand_ratio = 4  # maximum expand ratio for a single round.
        scale_w, scale_h = parse_multi_float(scale, 2)
        target_w = int(old_img.size[0] * scale_w)
        target_h = int(old_img.size[1] * scale_h)

        while old_img.size != (target_w, target_h):
            caption = self.get_caption(old_img)

            # Crop some of the border so that it gets re-generated. Only
            # done along axes that still need to grow and are large enough.
            crop_w = 15 if (old_img.width != target_w
                            and old_img.width > 100) else 0
            crop_h = 15 if (old_img.height != target_h
                            and old_img.height > 100) else 0
            old_img = ImageOps.crop(old_img, (crop_w, crop_h, crop_w, crop_h))

            canvas_w = min(expand_ratio * old_img.width, target_w)
            canvas_h = min(expand_ratio * old_img.height, target_h)
            canvas = Image.new('RGB', (canvas_w, canvas_h), color='white')
            mask = Image.new('L', (canvas_w, canvas_h), color='white')

            # Paste the old image into the center of the canvas and black
            # out the same rectangle in the (otherwise white) mask.
            x = (canvas.width - old_img.width) // 2
            y = (canvas.height - old_img.height) // 2
            canvas.paste(old_img, (x, y))
            mask.paste(0, (x, y, x + old_img.width, y + old_img.height))

            # Resize the canvas into a proper size (about 1000x1000) to
            # generate more details.
            resized_canvas = self.resize_image(canvas)
            resized_mask = self.resize_image(mask)
            generated = self.inpainting_inferencer(
                prompt=caption,
                image=resized_canvas,
                mask_image=resized_mask,
                height=resized_canvas.height,
                width=resized_canvas.width,
                num_inference_steps=10)

            # Resize the generated image into the canvas size and blend
            # with the old image. ``Image.ANTIALIAS`` was removed in
            # Pillow 10; ``Image.LANCZOS`` is the same filter.
            generated = generated.resize((canvas.width, canvas.height),
                                         Image.LANCZOS)
            old_img = blend_gt2pt(old_img, generated)

        return ImageIO(old_img)

    def get_caption(self, image: Image.Image):
        """Return the predicted caption of ``image``.

        The channel axis is reversed before the array is handed to the
        caption inferencer.
        """
        image = np.array(image)[:, :, ::-1]
        return self.caption_inferencer(image)[0]['pred_caption']

    def resize_image(self, image, max_size=1000000, multiple=8):
        """Resize ``image`` to roughly ``max_size`` pixels in area.

        The aspect ratio is approximately preserved, and both sides are
        rounded down to a multiple of ``multiple``.

        Args:
            image (PIL.Image.Image): The image to resize.
            max_size (int): The target area in pixels. Defaults to 1000000.
            multiple (int): Both output sides are multiples of this value.
                Defaults to 8.

        Returns:
            PIL.Image.Image: The resized image.
        """
        aspect_ratio = image.size[0] / image.size[1]
        new_width = int(math.sqrt(max_size * aspect_ratio))
        new_height = int(new_width / aspect_ratio)
        new_width = new_width - (new_width % multiple)
        new_height = new_height - (new_height % multiple)
        return image.resize((new_width, new_height))