# Source code for agentlego.tools.image_editing.expansion
import math
import numpy as np
from PIL import Image, ImageOps
from agentlego.types import Annotated, ImageIO, Info
from agentlego.utils import load_or_build_object, parse_multi_float, require
from ..base import BaseTool
from .replace import Inpainting
def blend_gt2pt(old_image, new_image, sigma=0.15, steps=100):
    """Blend the ground truth image into the center of the predicted image.

    The old (ground truth) image is pasted onto the center of the new
    (predicted) image. Within a border of ``steps`` pixels the two images
    are mixed with a weight that ramps between 0 and 1, and the interior
    of the old image is copied unchanged.

    This function is adapted from `TaskMatrix/visual_chatgpt.py
    <https://github.com/microsoft/TaskMatrix/blob/main/visual_chatgpt.py>`_.

    Args:
        old_image (PIL.Image.Image): The ground truth image. The blending
            weights are built with three channels, so a three-channel
            image is expected.
        new_image (PIL.Image.Image): The predicted image. It is expected
            to be at least as large as ``old_image``.
        sigma (float): The sigma of the Gaussian kernel, relative to the
            size of the old image. Defaults to 0.15.
        steps (int): The width (in pixels) of the blending border. It is
            clamped to half of the shorter side of ``old_image`` so that
            small images can be blended as well. Defaults to 100.

    Returns:
        PIL.Image.Image: The blended image, with the size of ``new_image``.
    """
    import cv2

    new_w, new_h = new_image.size
    old_w, old_h = old_image.size
    easy_img = np.array(new_image)
    gt_img_array = np.array(old_image)

    # Top-left corner of the old image inside the new image.
    pos_w = (new_w - old_w) // 2
    pos_h = (new_h - old_h) // 2

    # The four border bands must not overlap: without this clamp, an old
    # image smaller than ``2 * steps`` on either side raises an error
    # (negative repeat count / out-of-range index).
    steps = min(steps, old_w // 2, old_h // 2)

    if steps < 1:
        # No room for a blending border at all: keep the old image as is.
        kernel = np.ones((old_h, old_w, 3))
    else:
        kernel_h = cv2.getGaussianKernel(old_h, old_h * sigma)
        kernel_w = cv2.getGaussianKernel(old_w, old_w * sigma)
        kernel = np.multiply(kernel_h, np.transpose(kernel_w))

        # Interior: take the old image unchanged.
        kernel[steps:-steps, steps:-steps] = 1

        # Corners: Gaussian fall-off, normalized by the value at each
        # corner block's innermost pixel.
        kernel[:steps, :steps] = \
            kernel[:steps, :steps] / kernel[steps - 1, steps - 1]
        kernel[:steps, -steps:] = \
            kernel[:steps, -steps:] / kernel[steps - 1, -steps]
        kernel[-steps:, :steps] = \
            kernel[-steps:, :steps] / kernel[-steps, steps - 1]
        kernel[-steps:, -steps:] = \
            kernel[-steps:, -steps:] / kernel[-steps, -steps]

        # Same weight for each of the three channels.
        kernel = np.repeat(np.expand_dims(kernel, 2), 3, 2)

        # Edges: linear ramps from 0 at the outer border to 1 inside.
        ramp_up = np.linspace(0, 1, steps)
        ramp_down = np.linspace(1, 0, steps)
        kernel[:steps, steps:-steps] = ramp_up[:, None, None]
        kernel[-steps:, steps:-steps] = ramp_down[:, None, None]
        kernel[steps:-steps, :steps] = ramp_up[None, :, None]
        kernel[steps:-steps, -steps:] = ramp_down[None, :, None]

    pt_gt_img = easy_img[pos_h:pos_h + old_h, pos_w:pos_w + old_w]
    gaussian_gt_img = kernel * gt_img_array + (1 - kernel) * pt_gt_img
    # Truncate to integers before writing back into the image array.
    gaussian_gt_img = gaussian_gt_img.astype(np.int64)
    easy_img[pos_h:pos_h + old_h, pos_w:pos_w + old_w] = gaussian_gt_img
    return Image.fromarray(easy_img)
class ImageExpansion(BaseTool):
    """A tool to expand the given image.

    The image is captioned, pasted onto the center of a larger white
    canvas, and the surrounding area is filled by an inpainting model
    prompted with the caption.

    Args:
        caption_model (str): The model name used to inference. Which can be
            found in the ``MMPreTrain`` repository.
            Defaults to ``blip-base_3rdparty_caption``.
        device (str): The device to load the model. Defaults to 'cuda'.
        toolmeta (None | dict | ToolMeta): The additional info of the tool.
            Defaults to None.
    """

    default_desc = ('This tool can expand the peripheral area of an image '
                    'based on its content, thus obtaining a larger image.')

    @require('mmpretrain')
    @require('diffusers')
    def __init__(self,
                 caption_model: str = 'blip-base_3rdparty_caption',
                 device: str = 'cuda',
                 toolmeta=None):
        super().__init__(toolmeta=toolmeta)
        self.caption_model_name = caption_model
        self.device = device

    def setup(self):
        """Load (or reuse) the captioning and inpainting models."""
        from mmpretrain.apis import ImageCaptionInferencer

        self.caption_inferencer = load_or_build_object(
            ImageCaptionInferencer,
            model=self.caption_model_name,
            device=self.device)
        self.inpainting_inferencer = load_or_build_object(
            Inpainting, device=self.device)

    def apply(
        self,
        image: ImageIO,
        scale: Annotated[str,
                         Info('expand ratio, can be a float number or two '
                              'float number for width and height ratio.')],
    ) -> ImageIO:
        """Expand the image to ``scale`` times its original size.

        Args:
            image (ImageIO): The image to expand.
            scale (str): The expand ratio, parsed by ``parse_multi_float``
                into a width ratio and a height ratio.

        Returns:
            ImageIO: The expanded image.
        """
        old_img = image.to_pil().convert('RGB')
        expand_ratio = 4  # maximum expand ratio for a single round.
        scale_w, scale_h = parse_multi_float(scale, 2)
        target_w = int(old_img.size[0] * scale_w)
        target_h = int(old_img.size[1] * scale_h)

        # Expand round by round until the target size is reached; each
        # round enlarges the image by at most ``expand_ratio``.
        while old_img.size != (target_w, target_h):
            caption = self.get_caption(old_img)

            # Crop some of the border so that it gets re-generated. Only
            # done along the axes that still need to grow and are larger
            # than 100 pixels.
            crop_w = 15 if (old_img.width != target_w
                            and old_img.width > 100) else 0
            crop_h = 15 if (old_img.height != target_h
                            and old_img.height > 100) else 0
            old_img = ImageOps.crop(old_img, (crop_w, crop_h, crop_w, crop_h))

            canvas_w = min(expand_ratio * old_img.width, target_w)
            canvas_h = min(expand_ratio * old_img.height, target_h)
            canvas = Image.new('RGB', (canvas_w, canvas_h), color='white')
            mask = Image.new('L', (canvas_w, canvas_h), color='white')

            # Paste the old image into the center of the canvas and black
            # out the same region of the mask.
            x = (canvas.width - old_img.width) // 2
            y = (canvas.height - old_img.height) // 2
            canvas.paste(old_img, (x, y))
            mask.paste(0, (x, y, x + old_img.width, y + old_img.height))

            # Resize the canvas into a proper size (about 1000x1000) to
            # generate more details.
            resized_canvas = self.resize_image(canvas)
            resized_mask = self.resize_image(mask)
            # NOTE(review): the inferencer is assumed to return a PIL
            # image, since ``.resize`` is called on the result below.
            generated = self.inpainting_inferencer(
                prompt=caption,
                image=resized_canvas,
                mask_image=resized_mask,
                height=resized_canvas.height,
                width=resized_canvas.width,
                num_inference_steps=10)

            # Resize the generated image into the canvas size and blend it
            # with the old image. ``Image.ANTIALIAS`` was removed in
            # Pillow 10; ``Image.LANCZOS`` is the same filter.
            generated = generated.resize((canvas.width, canvas.height),
                                         Image.LANCZOS)
            old_img = blend_gt2pt(old_img, generated)

        return ImageIO(old_img)

    def get_caption(self, image: Image.Image):
        """Return the caption predicted for ``image``.

        The channel order of the image is reversed before it is passed to
        the caption inferencer.
        """
        image = np.array(image)[:, :, ::-1]
        return self.caption_inferencer(image)[0]['pred_caption']

    def resize_image(self, image, max_size=1000000, multiple=8):
        """Resize ``image`` to about ``max_size`` pixels in total.

        The aspect ratio is kept (up to rounding) and both sides are
        rounded down to a multiple of ``multiple``.

        Args:
            image (PIL.Image.Image): The image to resize.
            max_size (int): The approximate number of pixels of the
                output. Defaults to 1000000.
            multiple (int): Both output sides are multiples of this
                number. Defaults to 8.

        Returns:
            PIL.Image.Image: The resized image.
        """
        aspect_ratio = image.size[0] / image.size[1]
        new_width = int(math.sqrt(max_size * aspect_ratio))
        new_height = int(new_width / aspect_ratio)
        new_width = new_width - (new_width % multiple)
        new_height = new_height - (new_height % multiple)
        return image.resize((new_width, new_height))