# Source code for agentlego.tools.speech_text.text_to_speech
from io import BytesIO
from typing import Union
import requests
from agentlego.types import Annotated, AudioIO, Info
from agentlego.utils import is_package_available, require
from ..base import BaseTool
if is_package_available('torch'):
import torch
# Mapping from the language codes accepted by the tool to human-readable
# language names. The keys are iterated when building the tool description
# and when restricting automatic language detection.
LANG_CODES = {
    'zh-cn': 'Chinese',
    'en': 'English',
    'es': 'Spanish',
    'fr': 'French',
    'de': 'German',
    'it': 'Italian',
    'tr': 'Turkish',
    'ru': 'Russian',
    'ar': 'Arabic',
    'ja': 'Japanese',
    'ko': 'Korean',
    # Not enabled (kept commented out in the original table):
    #   'pt' (Portuguese), 'pl' (Polish), 'nl' (Dutch),
    #   'cs' (Czech), 'hu' (Hungarian), 'hi' (Hindi)
}
class TextToSpeech(BaseTool):
    """A tool to convert input text to speech audio.

    Args:
        model (str): The model name used for inference. Available models
            can be found in the coqui-ai TTS project
            (https://github.com/coqui-ai/TTS).
            Defaults to ``tts_models/multilingual/multi-dataset/xtts_v2``.
        speaker_embeddings (str | dict): The speaker embedding of the TTS
            model. A string is treated as a URL: it is downloaded and
            deserialized with ``torch.load``. A dict is used as-is. In both
            cases the result is unpacked as keyword arguments into the
            model's ``inference`` call. Defaults to ``SPEAKER_EMBEDDING``.
        device (str): The device to load the model. Defaults to 'cuda'.
        toolmeta (None | dict | ToolMeta): The additional info of the tool.
            Defaults to None.

    Raises:
        requests.HTTPError: If ``speaker_embeddings`` is a URL and the
            server answers with an error status.
    """

    SPEAKER_EMBEDDING = ('http://download.openmmlab.com/agentlego/default_voice.pth')
    default_desc = ('The tool can speak the input text into audio. The language code '
                    'should be one of ' +
                    ', '.join(f"'{k}' ({v})" for k, v in LANG_CODES.items()) + '.')

    # Seconds to wait on the embedding download before giving up. ``requests``
    # applies this per socket operation, not to the whole transfer.
    _DOWNLOAD_TIMEOUT = 60

    # Punctuation rewritten before synthesis: each mark becomes an ASCII mark
    # followed by a space. Applied in a single pass with ``str.translate``.
    # NOTE(review): the ',', '?' and '!' keys are ASCII here while '。' and '、'
    # are CJK marks; they may have been full-width originally - confirm.
    _PUNCTUATION = str.maketrans({
        ',': ', ',
        '。': '. ',
        '?': '? ',
        '!': '! ',
        '、': ', ',
    })

    @require('TTS', 'langid')
    def __init__(self,
                 model: str = 'tts_models/multilingual/multi-dataset/xtts_v2',
                 speaker_embeddings: Union[str, dict] = SPEAKER_EMBEDDING,
                 device='cuda',
                 toolmeta=None):
        super().__init__(toolmeta=toolmeta)
        self.model_name = model

        if isinstance(speaker_embeddings, str):
            # Fail fast on a stalled server or an HTTP error instead of
            # hanging forever or handing an error page to ``torch.load``.
            response = requests.get(
                speaker_embeddings, timeout=self._DOWNLOAD_TIMEOUT)
            response.raise_for_status()
            # SECURITY NOTE: ``torch.load`` unpickles the downloaded bytes,
            # which can execute arbitrary code. Only pass URLs you trust
            # (the default URL is plain HTTP).
            with BytesIO(response.content) as f:
                speaker_embeddings = torch.load(f, map_location=device)

        self.speaker_embeddings = speaker_embeddings
        self.device = device

    def setup(self) -> None:
        """Load the TTS model onto ``self.device`` and keep its core model."""
        from TTS.api import TTS
        from TTS.tts.models.xtts import Xtts

        self.model: Xtts = TTS(
            self.model_name).to(self.device).synthesizer.tts_model

    def apply(
        self,
        text: str,
        lang: Annotated[str, Info('The language code of text.')] = 'auto',
    ) -> AudioIO:
        """Synthesize ``text`` into audio.

        Args:
            text (str): The text to speak.
            lang (str): The language code of the text. With ``'auto'`` the
                language is detected with ``langid``, restricted to the
                languages in ``LANG_CODES``. Defaults to ``'auto'``.

        Returns:
            AudioIO: The generated waveform with a leading dimension added,
            tagged with a sampling rate of 24000.
        """
        if lang == 'auto':
            import langid

            # The detector is given 'zh' in place of this tool's 'zh-cn';
            # the result is mapped back after classification.
            langid.set_languages(
                ['zh' if code == 'zh-cn' else code for code in LANG_CODES])
            detected = langid.classify(text)[0]
            lang = 'zh-cn' if detected == 'zh' else detected

        text = text.translate(self._PUNCTUATION).strip()

        out = self.model.inference(
            text,
            language=lang,
            do_sample=False,
            enable_text_splitting=len(text) > 72,  # Split text if too long.
            **self.speaker_embeddings,
        )
        # NOTE(review): 24000 is hard-coded; presumably it matches the
        # model's output rate - confirm for non-default models.
        return AudioIO(torch.tensor(out['wav']).unsqueeze(0), sampling_rate=24000)