Source code for agentlego.tools.imagebind.anything_to_image

from agentlego.types import AudioIO, ImageIO
from agentlego.utils import is_package_available, load_or_build_object, require
from ..base import BaseTool

if is_package_available('torch'):
    import torch


class AnythingToImage:

    @require(['diffusers', 'ftfy', 'iopath', 'timm'])
    def __init__(self, device):
        from diffusers import StableUnCLIPImg2ImgPipeline

        from .models.imagebind_model import imagebind_huge

        pipe = load_or_build_object(
            StableUnCLIPImg2ImgPipeline.from_pretrained,
            pretrained_model_name_or_path='stabilityai/'
            'stable-diffusion-2-1-unclip',
            torch_dtype=torch.float16,
            variant='fp16')

        self.device = device
        self.pipe = pipe.to(device)
        self.pipe.enable_vae_slicing()
        self.model = imagebind_huge(pretrained=True).to(self.device)
        self.model.eval()


[docs]class AudioToImage(BaseTool):
    """A tool to generate image from an audio.

    Args:
        device (str): The device to load the model. Defaults to 'cpu'.
        toolmeta (None | dict | ToolMeta): The additional info of the tool.
            Defaults to None.
    """

    default_desc = ('This tool can generate an image '
                    'according to the input audio.')

    @require(['diffusers', 'ftfy', 'iopath', 'timm', 'pytorchvideo'])
    def __init__(self, device: str = 'cpu', toolmeta=None):
        super().__init__(toolmeta=toolmeta)
        self.device = device

    def setup(self):
        self._inferencer = load_or_build_object(AnythingToImage, device=self.device)

    def apply(self, audio: AudioIO) -> ImageIO:
        from .data import load_and_transform_audio_data
        from .models.imagebind_model import ModalityType

        audio_paths = [audio.to_path()]
        audio_data = load_and_transform_audio_data(audio_paths, self.device)
        embeddings = self._inferencer.model.forward({ModalityType.AUDIO: audio_data})
        embeddings = embeddings[ModalityType.AUDIO]
        images = self._inferencer.pipe(
            image_embeds=embeddings.half(), width=512, height=512).images
        output_image = images[0]

        return ImageIO(output_image)


[docs]class ThermalToImage(BaseTool):
    """A tool to generate image from an thermal image.

    Args:
        device (str): The device to load the model. Defaults to 'cpu'.
        toolmeta (None | dict | ToolMeta): The additional info of the tool.
            Defaults to None.
    """

    default_desc = ('This tool can generate an image '
                    'according to the input thermal image.')

    @require(['diffusers', 'ftfy', 'iopath', 'timm'])
    def __init__(self, device: str = 'cpu', toolmeta=None):
        super().__init__(toolmeta=toolmeta)
        self.device = device

    def setup(self):
        self._inferencer = load_or_build_object(AnythingToImage, device=self.device)

    def apply(self, thermal: ImageIO) -> ImageIO:
        from .data import load_and_transform_thermal_data
        from .models.imagebind_model import ModalityType

        thermal_paths = [thermal.to_path()]
        thermal_data = load_and_transform_thermal_data(thermal_paths, self.device)
        embeddings = self._inferencer.model.forward({ModalityType.THERMAL: thermal_data})
        embeddings = embeddings[ModalityType.THERMAL]
        images = self._inferencer.pipe(
            image_embeds=embeddings.half(), width=512, height=512).images
        output_image = images[0]

        return ImageIO(output_image)


[docs]class AudioImageToImage(BaseTool):
    """A tool to generate image from an audio and an image.

    Args:
        device (str): The device to load the model. Defaults to 'cpu'.
        toolmeta (None | dict | ToolMeta): The additional info of the tool.
            Defaults to None.
    """

    default_desc = ('This tool can generate an image according to '
                    'the input reference image and the input audio.')

    @require(['diffusers', 'ftfy', 'iopath', 'timm', 'pytorchvideo'])
    def __init__(self, device: str = 'cpu', toolmeta=None):
        super().__init__(toolmeta=toolmeta)
        self.device = device

    def setup(self):
        self._inferencer = load_or_build_object(AnythingToImage, device=self.device)

    def apply(self, image: ImageIO, audio: AudioIO) -> ImageIO:
        from .data import load_and_transform_audio_data, load_and_transform_vision_data
        from .models.imagebind_model import ModalityType

        # process image data
        vision_data = load_and_transform_vision_data([image.to_path()], self.device)
        embeddings = self._inferencer.model.forward({ModalityType.VISION: vision_data},
                                                    normalize=False)
        img_embeddings = embeddings[ModalityType.VISION]

        # process audio data
        audio_data = load_and_transform_audio_data([audio.to_path()], self.device)
        embeddings = self._inferencer.model.forward({
            ModalityType.AUDIO: audio_data,
        })
        audio_embeddings = embeddings[ModalityType.AUDIO]

        embeddings = (img_embeddings + audio_embeddings) / 2
        images = self._inferencer.pipe(
            image_embeds=embeddings.half(), width=512, height=512).images
        output_image = images[0]

        return ImageIO(output_image)


[docs]class AudioTextToImage(BaseTool):
    """A tool to generate image from an audio and texts.

    Args:
        device (str): The device to load the model. Defaults to 'cpu'.
        toolmeta (None | dict | ToolMeta): The additional info of the tool.
            Defaults to None.
    """

    default_desc = ('This tool can generate an image according to '
                    'the input audio and the input description.')

    @require(['diffusers', 'ftfy', 'iopath', 'timm', 'pytorchvideo'])
    def __init__(self, device: str = 'cpu', toolmeta=None):
        super().__init__(toolmeta=toolmeta)
        self.device = device

    def setup(self):
        self._inferencer = load_or_build_object(AnythingToImage, device=self.device)

    def apply(self, audio: AudioIO, prompt: str) -> ImageIO:
        from .data import load_and_transform_audio_data, load_and_transform_text
        from .models.imagebind_model import ModalityType

        audio_paths = [audio.to_path()]
        text = load_and_transform_text([prompt], self.device)
        embeddings = self._inferencer.model.forward({ModalityType.TEXT: text},
                                                    normalize=False)
        text_embeddings = embeddings[ModalityType.TEXT]

        audio_data = load_and_transform_audio_data(audio_paths, self.device)
        embeddings = self._inferencer.model.forward({
            ModalityType.AUDIO: audio_data,
        })
        audio_embeddings = embeddings[ModalityType.AUDIO]
        embeddings = text_embeddings * 0.5 + audio_embeddings * 0.5
        images = self._inferencer.pipe(
            image_embeds=embeddings.half(), width=512, height=512).images
        output_image = images[0]

        return ImageIO(output_image)