Shortcuts

Source code for agentlego.tools.image_text.image_to_text

from agentlego.types import ImageIO
from agentlego.utils import load_or_build_object, require
from ..base import BaseTool


class ImageDescription(BaseTool):
    """A tool to describe an image.

    The tool wraps MMPreTrain's ``ImageCaptionInferencer`` and returns the
    caption predicted for the input image.

    Args:
        model (str): The model name used to inference. Which can be found
            in the ``MMPreTrain`` repository.
            Defaults to ``blip-base_3rdparty_caption``.
        device (str): The device to load the model. Defaults to 'cuda'.
        toolmeta (None | dict | ToolMeta): The additional info of the tool.
            Defaults to None.
    """

    default_desc = ('A useful tool that returns a brief '
                    'description of the input image.')

    @require('mmpretrain')
    def __init__(self,
                 model: str = 'blip-base_3rdparty_caption',
                 device: str = 'cuda',
                 toolmeta=None):
        """Store the configuration; the model itself is built in ``setup``."""
        super().__init__(toolmeta=toolmeta)
        self.model = model
        self.device = device

    def setup(self):
        """Build the caption inferencer and keep it on ``self._inferencer``."""
        # Imported here rather than at module level so that importing this
        # module does not itself require mmengine / mmpretrain.
        from mmengine.registry import DefaultScope
        from mmpretrain.apis import ImageCaptionInferencer

        # Construct the inferencer while the default registry scope is
        # temporarily overwritten to 'mmpretrain'.
        with DefaultScope.overwrite_default_scope('mmpretrain'):
            self._inferencer = load_or_build_object(
                ImageCaptionInferencer,
                model=self.model,
                device=self.device,
            )

    def apply(self, image: ImageIO) -> str:
        """Return the caption predicted for ``image``.

        Args:
            image (ImageIO): The image to describe.

        Returns:
            str: The ``pred_caption`` field of the first inference result.
        """
        # Reverse the last axis of the array before inference.
        # NOTE(review): presumably this converts RGB to BGR channel order
        # for the inferencer -- confirm against ImageIO.to_array().
        image = image.to_array()[:, :, ::-1]
        return self._inferencer(image)[0]['pred_caption']