feat(whatsapp): Add text-to-speech (TTS) integration for WhatsApp bot

arthurbrenno · arthurbrenno · commit 7239bd252f8a · 2025-10-25T13:42:59.000-03:00
- Implement speech configuration in WhatsAppBotConfig
- Add speech_play_chance and speech_config fields to control TTS behavior
- Create send_audio_message method in WhatsAppProvider base class
- Implement send_audio_message for EvolutionAPI provider
- Add documentation and example for WhatsApp TTS integration
- Enhance WhatsApp bot with an optional tts_provider field to support audio message generation

This change adds optional text-to-speech to WhatsApp bots: with a configurable probability, a response is synthesized using the configured TTS provider and speech settings and sent as an audio message instead of text, falling back to text if synthesis or sending fails.
diff --git a/agentle/agents/whatsapp/models/whatsapp_bot_config.py b/agentle/agents/whatsapp/models/whatsapp_bot_config.py
@@ -6,6 +6,8 @@
 from rsb.models.base_model import BaseModel
 from rsb.models.field import Field
 
+from agentle.tts.speech_config import SpeechConfig
+
 
 class WhatsAppBotConfig(BaseModel):
     """Configuration for WhatsApp bot behavior with simplified constructors and better organization."""
@@ -85,6 +87,18 @@ class WhatsAppBotConfig(BaseModel):
         default=10.0, description="Threshold for logging slow responses"
     )
 
+    # === Text-to-Speech (TTS) ===
+    speech_play_chance: float = Field(
+        default=0.0,
+        ge=0.0,
+        le=1.0,
+        description="Probability (0.0-1.0) of sending audio response instead of text",
+    )
+    speech_config: SpeechConfig | None = Field(
+        default=None,
+        description="Optional SpeechConfig for TTS provider customization",
+    )
+
     # === Error Handling ===
     retry_failed_messages: bool = Field(
         default=True, description="Retry processing failed messages"
@@ -153,6 +167,9 @@ def with_overrides(
         retry_failed_messages: bool | None = None,
         max_retry_attempts: int | None = None,
         retry_delay_seconds: float | None = None,
+        # Text-to-Speech
+        speech_play_chance: float | None = None,
+        speech_config: SpeechConfig | None = None,
     ) -> "WhatsAppBotConfig":
         """
         Create a new configuration instance with specified parameters overridden.
@@ -259,6 +276,12 @@ def with_overrides(
         if retry_delay_seconds is not None:
             overrides["retry_delay_seconds"] = retry_delay_seconds
 
+        # Text-to-Speech
+        if speech_play_chance is not None:
+            overrides["speech_play_chance"] = speech_play_chance
+        if speech_config is not None:
+            overrides["speech_config"] = speech_config
+
         # Update configuration with overrides
         current_config.update(overrides)
 
diff --git a/agentle/agents/whatsapp/providers/base/whatsapp_provider.py b/agentle/agents/whatsapp/providers/base/whatsapp_provider.py
@@ -81,6 +81,26 @@ async def send_media_message(
         """
         pass
 
+    @abstractmethod
+    async def send_audio_message(
+        self,
+        to: str,
+        audio_base64: str,
+        quoted_message_id: str | None = None,
+    ) -> WhatsAppMediaMessage:
+        """
+        Send a base64-encoded audio message (intended for voice/TTS replies).
+
+        Args:
+            to: Recipient phone number
+            audio_base64: Base64-encoded audio data (e.g. synthesized speech)
+            quoted_message_id: Optional ID of message to quote/reply to
+
+        Returns:
+            The sent audio message, as a WhatsAppMediaMessage
+        """
+        pass
+
     @abstractmethod
     async def send_typing_indicator(self, to: str, duration: int = 3) -> None:
         """
diff --git a/agentle/agents/whatsapp/providers/evolution/evolution_api_provider.py b/agentle/agents/whatsapp/providers/evolution/evolution_api_provider.py
@@ -947,6 +947,104 @@ async def send_media_message(
 
     # [Continue with remaining methods using enhanced patterns...]
 
+    async def send_audio_message(
+        self,
+        to: str,
+        audio_base64: str,
+        quoted_message_id: str | None = None,
+    ) -> WhatsAppMediaMessage:
+        """Send a base64-encoded audio message via Evolution API.
+
+        The recipient is the remoteJid stored in the contact's session when one
+        exists; otherwise the phone number is normalized before sending.
+
+        Args:
+            to: Recipient phone number
+            audio_base64: Base64-encoded audio data
+            quoted_message_id: Optional ID of message to quote/reply to
+
+        Returns:
+            The sent audio message, with status SENT and marked as a voice note
+
+        Raises:
+            EvolutionAPIError: Re-raised unchanged when raised while sending; any
+                other exception is logged and wrapped in an EvolutionAPIError
+        """
+        logger.info(f"Sending audio message to {to}")
+        if quoted_message_id:
+            logger.debug(f"Audio message is quoting message ID: {quoted_message_id}")
+
+        try:
+            # Prefer a remoteJid already stored in the contact's session over a
+            # freshly normalized phone number
+            session = await self.get_session(to)
+            remote_jid = session.context_data.get("remote_jid") if session else None
+
+            if remote_jid:
+                logger.info(
+                    f"🔑 Using stored remoteJid for audio to {to}: {remote_jid}"
+                )
+                normalized_to = remote_jid
+            else:
+                normalized_to = self._normalize_phone(to)
+                logger.debug(f"Normalized phone number: {to} -> {normalized_to}")
+
+            payload: MutableMapping[str, Any] = {
+                "number": normalized_to,
+                "audio": audio_base64,
+            }
+
+            if quoted_message_id:
+                payload["quoted"] = {"key": {"id": quoted_message_id}}
+
+            url = self._build_url(f"sendWhatsAppAudio/{self.config.instance_name}")
+            response_data = await self._make_request_with_resilience(
+                "POST", url, payload, expected_status=201
+            )
+
+            message_id = response_data["key"]["id"]
+            from_jid = response_data["key"]["remoteJid"]
+
+            message = WhatsAppAudioMessage(
+                id=message_id,
+                from_number=from_jid,
+                to_number=to,
+                timestamp=datetime.now(),
+                status=WhatsAppMessageStatus.SENT,
+                media_url="",  # Base64 audio doesn't have a URL
+                # NOTE(review): hard-coded; audio_base64's real format is not checked
+                media_mime_type="audio/ogg",
+                quoted_message_id=quoted_message_id,
+                is_voice_note=True,
+            )
+
+            logger.info(
+                f"Audio message sent successfully to {to}: {message_id}",
+                extra={
+                    "message_id": message_id,
+                    "to_number": to,
+                    "normalized_to": normalized_to,
+                    "from_jid": from_jid,
+                    "has_quote": quoted_message_id is not None,
+                },
+            )
+            return message
+
+        except EvolutionAPIError:
+            logger.error(f"Evolution API error while sending audio message to {to}")
+            raise
+        except Exception as e:
+            logger.error(
+                f"Failed to send audio message to {to}: {type(e).__name__}: {e}",
+                extra={
+                    "to_number": to,
+                    "error_type": type(e).__name__,
+                    "has_quote": quoted_message_id is not None,
+                },
+            )
+            # Chain the original exception so its traceback is kept as the cause
+            raise EvolutionAPIError(f"Failed to send audio message: {e}") from e
+
     async def send_typing_indicator(self, to: str, duration: int = 3) -> None:
         """Send typing indicator via Evolution API."""
         logger.debug(f"Sending typing indicator to {to} for {duration}s")
diff --git a/agentle/agents/whatsapp/whatsapp_bot.py b/agentle/agents/whatsapp/whatsapp_bot.py
@@ -54,7 +54,7 @@
 from agentle.generations.models.messages.user_message import UserMessage
 from agentle.generations.tools.tool import Tool
 from agentle.generations.tools.tool_execution_result import ToolExecutionResult
-
+from agentle.tts.tts_provider import TtsProvider
 
 if TYPE_CHECKING:
     from blacksheep import Application
@@ -135,6 +135,7 @@ class WhatsAppBot(BaseModel):
 
     agent: Agent[Any]
     provider: WhatsAppProvider
+    tts_provider: TtsProvider | None = Field(default=None)
     config: WhatsAppBotConfig = Field(default_factory=WhatsAppBotConfig)
 
     # REMOVED: context_manager field - no longer needed
@@ -2080,6 +2081,59 @@ async def _send_response(
             f"[SEND_RESPONSE] Sending response to {to} (length: {len(response_text)}, reply_to: {reply_to})"
         )
 
+        # Check if we should send audio via TTS
+        if (
+            self.tts_provider
+            and self.config.speech_config
+            and self.config.speech_play_chance > 0
+        ):
+            import random
+
+            # Determine if we should play speech based on chance
+            should_play_speech = random.random() < self.config.speech_play_chance
+
+            if should_play_speech:
+                logger.info(
+                    f"[TTS] Attempting to send audio response to {to} (chance: {self.config.speech_play_chance * 100}%)"
+                )
+                try:
+                    # Synthesize speech
+                    speech_result = await self.tts_provider.synthesize(
+                        response_text, config=self.config.speech_config
+                    )
+
+                    # Send audio message
+                    await self.provider.send_audio_message(
+                        to=to,
+                        audio_base64=speech_result.audio,
+                        quoted_message_id=reply_to
+                        if self.config.quote_messages
+                        else None,
+                    )
+
+                    logger.info(
+                        f"[TTS] Successfully sent audio response to {to}",
+                        extra={
+                            "to_number": to,
+                            "text_length": len(response_text),
+                            "mime_type": str(speech_result.mime_type),
+                            "format": str(speech_result.format),
+                        },
+                    )
+                    # Audio sent successfully, return early
+                    return
+
+                except Exception as e:
+                    logger.warning(
+                        f"[TTS] Failed to send audio response to {to}, falling back to text: {e}",
+                        extra={
+                            "to_number": to,
+                            "error_type": type(e).__name__,
+                            "error": str(e),
+                        },
+                    )
+                    # Fall through to send text message instead
+
         # Split messages by line breaks and length
         messages = self._split_message_by_line_breaks(response_text)
         logger.info(f"[SEND_RESPONSE] Split response into {len(messages)} parts")
diff --git a/agentle/tts/speech_result.py b/agentle/tts/speech_result.py
@@ -6,7 +6,7 @@
 
 class SpeechResult(BaseModel):
     audio: str = Field(...)
-    """the raw audio bytes in base-64 format"""
+    """The speech in base-64 format"""
 
     mime_type: AudioFormat = Field(...)
     """`audio/mpeg`, `audio/wav`, `audio/opus`"""
diff --git a/docs/whatsapp_tts_integration.md b/docs/whatsapp_tts_integration.md
diff --git a/examples/whatsapp_tts_example.py b/examples/whatsapp_tts_example.py