|
| 1 | +import base64 |
| 2 | +from collections.abc import AsyncIterator |
| 3 | +from typing import override |
| 4 | + |
| 5 | +from agentle.tts.audio_format import AudioFormat |
| 6 | +from agentle.tts.output_format_type import OutputFormatType |
| 7 | +from agentle.tts.speech_config import SpeechConfig |
| 8 | +from agentle.tts.speech_result import SpeechResult |
| 9 | +from agentle.tts.tts_provider import TtsProvider |
| 10 | +from agentle.utils.needs import needs |
| 11 | + |
| 12 | + |
class ElevenLabsTtsProvider(TtsProvider):
    """TTS provider that produces speech through the ElevenLabs async SDK."""

    @override
    @needs("elevenlabs")
    async def synthesize(self, text: str, config: SpeechConfig) -> SpeechResult:
        """Synthesize ``text`` and return the audio base64-encoded.

        Args:
            text: The text to convert to speech.
            config: Supplies the voice id, model id, output format, language
                code and (optionally) voice settings passed to ElevenLabs.

        Returns:
            A ``SpeechResult`` holding the base64 text of the complete audio,
            the MIME type derived from ``config.output_format`` and the
            output format itself.
        """
        # Imported lazily so the module can be loaded without the SDK;
        # presumably ``@needs`` guards the optional dependency — confirm.
        from elevenlabs import AsyncElevenLabs
        from elevenlabs.types.voice_settings import (
            VoiceSettings as ElevenLabsVoiceSettings,
        )

        client = AsyncElevenLabs()

        # Translate our voice settings into the SDK's model only when the
        # caller supplied some; otherwise pass ``None`` through.
        requested_settings = config.voice_settings
        if requested_settings:
            sdk_voice_settings = ElevenLabsVoiceSettings(
                stability=requested_settings.stability,
                use_speaker_boost=requested_settings.use_speaker_boost,
                similarity_boost=requested_settings.similarity_boost,
                style=requested_settings.style,
                speed=requested_settings.speed,
            )
        else:
            sdk_voice_settings = None

        stream: AsyncIterator[bytes] = client.text_to_speech.convert(
            text=text,
            voice_id=config.voice_id,
            model_id=config.model_id,
            output_format=config.output_format,
            voice_settings=sdk_voice_settings,
            language_code=config.language_code,
        )

        # Drain the stream fully, then encode the whole payload in one go.
        payload = b"".join([piece async for piece in stream])
        encoded_audio = base64.b64encode(payload).decode("utf-8")

        return SpeechResult(
            audio=encoded_audio,
            mime_type=self._get_mime_type(config.output_format),
            format=config.output_format,
        )

    def _get_mime_type(self, output_format: OutputFormatType) -> AudioFormat:
        """Map an ElevenLabs output format name to a MIME type.

        The format is matched by prefix (``mp3_``, ``pcm_``, ``ulaw_``,
        ``alaw_``, ``opus_``); anything else yields
        ``"application/octet-stream"``.
        """
        # Checked in order; the first entry with a matching prefix wins.
        # NOTE(review): "audio/wav" for pcm_* could instead be "audio/pcm"
        # depending on the use case — confirm what consumers expect.
        mime_by_prefix: tuple[tuple[tuple[str, ...], AudioFormat], ...] = (
            (("mp3_",), "audio/mpeg"),
            (("pcm_",), "audio/wav"),
            (("ulaw_", "alaw_"), "audio/basic"),
            (("opus_",), "audio/opus"),
        )
        for prefixes, mime_type in mime_by_prefix:
            if output_format.startswith(prefixes):
                return mime_type
        return "application/octet-stream"  # fallback
0 commit comments