Source code for agentlego.tools.ocr.ocr
from typing import Sequence, Tuple, Union
from agentlego.types import Annotated, ImageIO, Info
from agentlego.utils import load_or_build_object, require
from ..base import BaseTool
[docs]class OCR(BaseTool):
"""A tool to recognize the optical characters on an image.
Args:
lang (str | Sequence[str]): The language to be recognized.
Defaults to 'en'.
line_group_tolerance (int): The line group tolerance threshold.
Defaults to -1, which means to disable the line group method.
device (str | bool): The device to load the model. Defaults to True,
which means automatically select device.
**read_args: Other keyword arguments for read text. Please check the
`EasyOCR docs <https://www.jaided.ai/easyocr/documentation/>`_.
toolmeta (None | dict | ToolMeta): The additional info of the tool.
Defaults to None.
"""
default_desc = 'This tool can recognize all text on the input image.'
@require('easyocr')
def __init__(self,
lang: Union[str, Sequence[str]] = 'en',
line_group_tolerance: int = -1,
device: Union[bool, str] = True,
toolmeta=None,
**read_args):
super().__init__(toolmeta=toolmeta)
if isinstance(lang, str):
lang = [lang]
self.lang = list(lang)
self.read_args = read_args
self.device = device
self.line_group_tolerance = line_group_tolerance
read_args.setdefault('decoder', 'beamsearch')
if line_group_tolerance >= 0:
read_args.setdefault('paragraph', False)
else:
read_args.setdefault('paragraph', True)
def setup(self):
import easyocr
self._reader: easyocr.Reader = load_or_build_object(
easyocr.Reader, self.lang, gpu=self.device)
def apply(
self,
image: ImageIO,
) -> Annotated[str,
Info('OCR results, include bbox in x1, y1, x2, y2 format '
'and the recognized text.')]:
image = image.to_array()
results = self._reader.readtext(image, detail=1, **self.read_args)
results = [(self.extract_bbox(item[0]), item[1]) for item in results]
if self.line_group_tolerance >= 0:
results.sort(key=lambda x: x[0][1])
groups = []
group = []
for item in results:
if not group:
group.append(item)
continue
if abs(item[0][1] - group[-1][0][1]) <= self.line_group_tolerance:
group.append(item)
else:
groups.append(group)
group = [item]
groups.append(group)
results = []
for group in groups:
# For each line, sort the elements by their left x-coordinate and join their texts
line = sorted(group, key=lambda x: x[0][0])
bboxes = [item[0] for item in line]
text = ' '.join(item[1] for item in line)
results.append((self.extract_bbox(bboxes), text))
outputs = []
for item in results:
outputs.append('({}, {}, {}, {}) {}'.format(*item[0], item[1]))
outputs = '\n'.join(outputs)
return outputs
@staticmethod
def extract_bbox(char_boxes) -> Tuple[int, int, int, int]:
xs = [int(box[0]) for box in char_boxes]
ys = [int(box[1]) for box in char_boxes]
return min(xs), min(ys), max(xs), max(ys)