Skip to content

Commit 7239bd2

Browse files
committed
feat(whatsapp): Add text-to-speech (TTS) integration for WhatsApp bot
- Implement speech configuration in WhatsAppBotConfig
- Add speech_play_chance and speech_config fields to control TTS behavior
- Create send_audio_message method in WhatsAppProvider base class
- Implement send_audio_message for EvolutionAPI provider
- Add documentation and example for WhatsApp TTS integration
- Enhance WhatsApp bot to support optional audio message generation

This change enables flexible text-to-speech capabilities for WhatsApp bots, allowing dynamic audio response generation with configurable probability and provider settings.
1 parent 94feaf2 commit 7239bd2

File tree

7 files changed

+612
-2
lines changed

7 files changed

+612
-2
lines changed

agentle/agents/whatsapp/models/whatsapp_bot_config.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
from rsb.models.base_model import BaseModel
77
from rsb.models.field import Field
88

9+
from agentle.tts.speech_config import SpeechConfig
10+
911

1012
class WhatsAppBotConfig(BaseModel):
1113
"""Configuration for WhatsApp bot behavior with simplified constructors and better organization."""
@@ -85,6 +87,18 @@ class WhatsAppBotConfig(BaseModel):
8587
default=10.0, description="Threshold for logging slow responses"
8688
)
8789

90+
# === Text-to-Speech (TTS) ===
91+
speech_play_chance: float = Field(
92+
default=0.0,
93+
ge=0.0,
94+
le=1.0,
95+
description="Probability (0.0-1.0) of sending audio response instead of text",
96+
)
97+
speech_config: SpeechConfig | None = Field(
98+
default=None,
99+
description="Optional SpeechConfig for TTS provider customization",
100+
)
101+
88102
# === Error Handling ===
89103
retry_failed_messages: bool = Field(
90104
default=True, description="Retry processing failed messages"
@@ -153,6 +167,9 @@ def with_overrides(
153167
retry_failed_messages: bool | None = None,
154168
max_retry_attempts: int | None = None,
155169
retry_delay_seconds: float | None = None,
170+
# Text-to-Speech
171+
speech_play_chance: float | None = None,
172+
speech_config: SpeechConfig | None = None,
156173
) -> "WhatsAppBotConfig":
157174
"""
158175
Create a new configuration instance with specified parameters overridden.
@@ -259,6 +276,12 @@ def with_overrides(
259276
if retry_delay_seconds is not None:
260277
overrides["retry_delay_seconds"] = retry_delay_seconds
261278

279+
# Text-to-Speech
280+
if speech_play_chance is not None:
281+
overrides["speech_play_chance"] = speech_play_chance
282+
if speech_config is not None:
283+
overrides["speech_config"] = speech_config
284+
262285
# Update configuration with overrides
263286
current_config.update(overrides)
264287

agentle/agents/whatsapp/providers/base/whatsapp_provider.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,26 @@ async def send_media_message(
8181
"""
8282
pass
8383

84+
@abstractmethod
async def send_audio_message(
    self,
    to: str,
    audio_base64: str,
    quoted_message_id: str | None = None,
) -> WhatsAppMediaMessage:
    """
    Send an audio message (optimized for voice/TTS).

    Concrete providers must implement this coroutine; the base class
    supplies no behavior of its own.

    Args:
        to: Recipient phone number
        audio_base64: Audio payload, encoded as a base64 string
        quoted_message_id: ID of the message being quoted/replied to,
            or None to send without a quote

    Returns:
        The sent audio message
    """
    pass
103+
84104
@abstractmethod
85105
async def send_typing_indicator(self, to: str, duration: int = 3) -> None:
86106
"""

agentle/agents/whatsapp/providers/evolution/evolution_api_provider.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -947,6 +947,85 @@ async def send_media_message(
947947

948948
# [Continue with remaining methods using enhanced patterns...]
949949

950+
async def send_audio_message(
    self,
    to: str,
    audio_base64: str,
    quoted_message_id: str | None = None,
) -> WhatsAppMediaMessage:
    """Send an audio message via Evolution API with enhanced error handling.

    The recipient is addressed by the ``remote_jid`` stored in the session's
    ``context_data`` when one exists; otherwise ``to`` is normalized with
    ``_normalize_phone``. The audio is POSTed to the instance's
    ``sendWhatsAppAudio`` endpoint and a 201 response is expected.

    Args:
        to: Recipient phone number.
        audio_base64: Base64-encoded audio data.
        quoted_message_id: Optional ID of the message to quote/reply to.

    Returns:
        A ``WhatsAppAudioMessage`` with status ``SENT`` and
        ``is_voice_note=True``, whose ID and sender JID are read from the
        ``key`` object of the API response.

    Raises:
        EvolutionAPIError: Re-raised unchanged when raised inside the send
            path; any other exception is wrapped in an
            ``EvolutionAPIError`` with the original set as its cause.
    """
    logger.info(f"Sending audio message to {to}")
    if quoted_message_id:
        logger.debug(f"Audio message is quoting message ID: {quoted_message_id}")

    try:
        # Prefer a remoteJid previously stored for this contact over a
        # freshly normalized phone number.
        session = await self.get_session(to)
        remote_jid = session.context_data.get("remote_jid") if session else None

        if remote_jid:
            logger.info(
                f"🔑 Using stored remoteJid for audio to {to}: {remote_jid}"
            )
            normalized_to = remote_jid
        else:
            normalized_to = self._normalize_phone(to)
            logger.debug(f"Normalized phone number: {to} -> {normalized_to}")

        payload: MutableMapping[str, Any] = {
            "number": normalized_to,
            "audio": audio_base64,
        }

        if quoted_message_id:
            payload["quoted"] = {"key": {"id": quoted_message_id}}

        url = self._build_url(f"sendWhatsAppAudio/{self.config.instance_name}")
        response_data = await self._make_request_with_resilience(
            "POST", url, payload, expected_status=201
        )

        # A response without the expected "key" fields raises here and is
        # wrapped by the generic handler below.
        message_id = response_data["key"]["id"]
        from_jid = response_data["key"]["remoteJid"]

        message = WhatsAppAudioMessage(
            id=message_id,
            from_number=from_jid,
            to_number=to,
            timestamp=datetime.now(),
            status=WhatsAppMessageStatus.SENT,
            media_url="",  # Base64 audio doesn't have a URL
            # NOTE(review): MIME type is hard-coded; the supplied audio may
            # be in another format (e.g. MP3/WAV from a TTS provider) -
            # confirm whether this should be passed in by the caller.
            media_mime_type="audio/ogg",
            quoted_message_id=quoted_message_id,
            is_voice_note=True,
        )

        logger.info(
            f"Audio message sent successfully to {to}: {message_id}",
            extra={
                "message_id": message_id,
                "to_number": to,
                "normalized_to": normalized_to,
                "from_jid": from_jid,
                "has_quote": quoted_message_id is not None,
            },
        )
        return message

    except EvolutionAPIError:
        # Already a provider-level error: log and propagate untouched.
        logger.error(f"Evolution API error while sending audio message to {to}")
        raise
    except Exception as e:
        logger.error(
            f"Failed to send audio message to {to}: {type(e).__name__}: {e}",
            extra={
                "to_number": to,
                "error_type": type(e).__name__,
                "has_quote": quoted_message_id is not None,
            },
        )
        # Chain explicitly so the original exception is preserved as
        # __cause__ and the traceback reads as a deliberate translation.
        raise EvolutionAPIError(f"Failed to send audio message: {e}") from e
1028+
9501029
async def send_typing_indicator(self, to: str, duration: int = 3) -> None:
9511030
"""Send typing indicator via Evolution API."""
9521031
logger.debug(f"Sending typing indicator to {to} for {duration}s")

agentle/agents/whatsapp/whatsapp_bot.py

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@
5454
from agentle.generations.models.messages.user_message import UserMessage
5555
from agentle.generations.tools.tool import Tool
5656
from agentle.generations.tools.tool_execution_result import ToolExecutionResult
57-
57+
from agentle.tts.tts_provider import TtsProvider
5858

5959
if TYPE_CHECKING:
6060
from blacksheep import Application
@@ -135,6 +135,7 @@ class WhatsAppBot(BaseModel):
135135

136136
agent: Agent[Any]
137137
provider: WhatsAppProvider
138+
tts_provider: TtsProvider | None = Field(default=None)
138139
config: WhatsAppBotConfig = Field(default_factory=WhatsAppBotConfig)
139140

140141
# REMOVED: context_manager field - no longer needed
@@ -2080,6 +2081,59 @@ async def _send_response(
20802081
f"[SEND_RESPONSE] Sending response to {to} (length: {len(response_text)}, reply_to: {reply_to})"
20812082
)
20822083

2084+
# Check if we should send audio via TTS
2085+
if (
2086+
self.tts_provider
2087+
and self.config.speech_config
2088+
and self.config.speech_play_chance > 0
2089+
):
2090+
import random
2091+
2092+
# Determine if we should play speech based on chance
2093+
should_play_speech = random.random() < self.config.speech_play_chance
2094+
2095+
if should_play_speech:
2096+
logger.info(
2097+
f"[TTS] Attempting to send audio response to {to} (chance: {self.config.speech_play_chance * 100}%)"
2098+
)
2099+
try:
2100+
# Synthesize speech
2101+
speech_result = await self.tts_provider.synthesize(
2102+
response_text, config=self.config.speech_config
2103+
)
2104+
2105+
# Send audio message
2106+
await self.provider.send_audio_message(
2107+
to=to,
2108+
audio_base64=speech_result.audio,
2109+
quoted_message_id=reply_to
2110+
if self.config.quote_messages
2111+
else None,
2112+
)
2113+
2114+
logger.info(
2115+
f"[TTS] Successfully sent audio response to {to}",
2116+
extra={
2117+
"to_number": to,
2118+
"text_length": len(response_text),
2119+
"mime_type": str(speech_result.mime_type),
2120+
"format": str(speech_result.format),
2121+
},
2122+
)
2123+
# Audio sent successfully, return early
2124+
return
2125+
2126+
except Exception as e:
2127+
logger.warning(
2128+
f"[TTS] Failed to send audio response to {to}, falling back to text: {e}",
2129+
extra={
2130+
"to_number": to,
2131+
"error_type": type(e).__name__,
2132+
"error": str(e),
2133+
},
2134+
)
2135+
# Fall through to send text message instead
2136+
20832137
# Split messages by line breaks and length
20842138
messages = self._split_message_by_line_breaks(response_text)
20852139
logger.info(f"[SEND_RESPONSE] Split response into {len(messages)} parts")

agentle/tts/speech_result.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
class SpeechResult(BaseModel):
88
audio: str = Field(...)
9-
"""the raw audio bytes in base-64 format"""
9+
"""The speech in base-64 format"""
1010

1111
mime_type: AudioFormat = Field(...)
1212
"""`audio/mpeg`, `audio/wav`, `audio/opus`"""

0 commit comments

Comments
 (0)