# Source code for agentlego.tools.vqa.visual_question_answering

from agentlego.types import Annotated, ImageIO, Info
from agentlego.utils import load_or_build_object, require
from ..base import BaseTool


class VQA(BaseTool):
    """A tool to answer the question about an image.

    Args:
        model (str): The model name passed to MMPreTrain's
            ``VisualQuestionAnsweringInferencer``.
            Defaults to 'ofa-base_3rdparty-zeroshot_vqa'.
        device (str): The device to load the model. Defaults to 'cuda'.
        toolmeta (None | dict | ToolMeta): The additional info of the tool.
            Defaults to None.
    """

    default_desc = ('This tool can answer the input question based on the '
                    'input image.')

    def __init__(self,
                 model: str = 'ofa-base_3rdparty-zeroshot_vqa',
                 device: str = 'cuda',
                 toolmeta=None):
        super().__init__(toolmeta)
        # Only the configuration is recorded here; the inferencer itself is
        # created later in ``setup``.
        self.device = device
        self.model = model

    @require('mmpretrain')
    def setup(self):
        """Create the VQA inferencer and keep it on ``self._inferencer``."""
        # Imported lazily so the module can be imported without these
        # packages being loaded up front.
        from mmengine.registry import DefaultScope
        from mmpretrain.apis import VisualQuestionAnsweringInferencer

        # The inferencer is obtained while 'mmpretrain' is the default
        # registry scope.
        with DefaultScope.overwrite_default_scope('mmpretrain'):
            self._inferencer = load_or_build_object(
                VisualQuestionAnsweringInferencer,
                model=self.model,
                device=self.device)

    def apply(
        self,
        image: ImageIO,
        question: Annotated[str, Info('The question should be in English.')],
    ) -> str:
        # Answer ``question`` about ``image`` and return the predicted answer.
        #
        # The last axis of the array is reversed before inference
        # (presumably RGB -> BGR channel order for the inferencer -- TODO
        # confirm against ``ImageIO.to_array``).
        image = image.to_array()[:, :, ::-1]
        # One image/question pair goes in; the first result's 'pred_answer'
        # field is what the tool returns.
        return self._inferencer(image, question)[0]['pred_answer']