Source code for agentlego.tools.vqa.visual_question_answering
from agentlego.types import Annotated, ImageIO, Info
from agentlego.utils import load_or_build_object, require
from ..base import BaseTool
class VQA(BaseTool):
    """A tool to answer the question about an image.

    Args:
        model (str): The model name handed to mmpretrain's
            ``VisualQuestionAnsweringInferencer``.
            Defaults to 'ofa-base_3rdparty-zeroshot_vqa'.
        device (str): The device to load the model. Defaults to 'cuda'.
        toolmeta (None | dict | ToolMeta): The additional info of the tool.
            Defaults to None.
    """

    default_desc = ('This tool can answer the input question based on the '
                    'input image.')

    def __init__(self,
                 model: str = 'ofa-base_3rdparty-zeroshot_vqa',
                 device: str = 'cuda',
                 toolmeta=None):
        super().__init__(toolmeta)
        # Only the configuration is recorded here; the inferencer itself is
        # created later in ``setup``.
        self.device = device
        self.model = model

    @require('mmpretrain')
    def setup(self):
        """Create the VQA inferencer and store it on ``self._inferencer``."""
        # Imported inside the method so that mmengine / mmpretrain are only
        # needed once the tool is actually set up.
        from mmengine.registry import DefaultScope
        from mmpretrain.apis import VisualQuestionAnsweringInferencer

        # Build the inferencer while the default registry scope is
        # temporarily switched to 'mmpretrain'.
        # NOTE(review): ``load_or_build_object`` presumably reuses an
        # already-built object for identical arguments - confirm in
        # agentlego.utils.
        with DefaultScope.overwrite_default_scope('mmpretrain'):
            self._inferencer = load_or_build_object(
                VisualQuestionAnsweringInferencer,
                model=self.model,
                device=self.device)

    def apply(
        self,
        image: ImageIO,
        question: Annotated[str, Info('The question should be in English.')],
    ) -> str:
        # Reverse the last (channel) axis of the image array before
        # inference.
        # NOTE(review): presumably an RGB -> BGR conversion for the
        # mmpretrain pipeline - confirm against ``ImageIO.to_array``.
        image = image.to_array()[:, :, ::-1]
        # The inferencer is indexed as a sequence of per-sample dicts; the
        # answer text of the first (only) sample is returned.
        return self._inferencer(image, question)[0]['pred_answer']