Skip to content

Commit 6fa8f90

Browse files
committed
feat(whatsapp): Enhance audio message handling with URL support
- Introduce the ability to send audio messages via URL in WhatsApp providers
- Implement file storage management for TTS audio uploads
- Refactor audio message sending logic to prioritize the URL method over base64
- Add a new method for audio extension retrieval based on the TTS format
- Update dependencies in pyproject.toml for an improved development environment
1 parent 5eaeb47 commit 6fa8f90

File tree

10 files changed

+683
-10
lines changed

10 files changed

+683
-10
lines changed

agentle/agents/whatsapp/providers/base/whatsapp_provider.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,26 @@ async def send_audio_message(
101101
"""
102102
pass
103103

104+
@abstractmethod
async def send_audio_message_by_url(
    self,
    to: str,
    audio_url: str,
    quoted_message_id: str | None = None,
) -> WhatsAppMediaMessage:
    """
    Send an audio message whose content is referenced by a URL.

    Concrete providers must implement this method; the abstract
    version has no behavior of its own.

    Args:
        to: Recipient phone number
        audio_url: URL of the audio file
        quoted_message_id: Optional ID of message to quote/reply to

    Returns:
        The sent audio message
    """
    ...
123+
104124
@abstractmethod
105125
async def send_typing_indicator(self, to: str, duration: int = 3) -> None:
106126
"""

agentle/agents/whatsapp/providers/evolution/evolution_api_provider.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1029,6 +1029,89 @@ async def send_audio_message(
10291029
)
10301030
raise EvolutionAPIError(f"Failed to send audio message: {e}")
10311031

1032+
async def send_audio_message_by_url(
    self,
    to: str,
    audio_url: str,
    quoted_message_id: str | None = None,
) -> WhatsAppMediaMessage:
    """Send an audio message via URL using Evolution API.

    Args:
        to: Recipient phone number. If the contact's session holds a
            ``remote_jid`` in its context data, that value is used as the
            destination instead of the normalized phone number.
        audio_url: URL of the audio file; sent to the API in place of a
            base64 payload.
        quoted_message_id: Optional ID of the message to quote/reply to.

    Returns:
        A ``WhatsAppAudioMessage`` whose ID and sender JID are taken from
        the ``key`` object of the API response.

    Raises:
        EvolutionAPIError: Re-raised unchanged when the request raises it;
            any other failure is wrapped in a new ``EvolutionAPIError``
            with the original exception chained as its cause.
    """
    logger.info(f"Sending audio message via URL to {to}: {audio_url}")
    if quoted_message_id:
        logger.debug(f"Audio message is quoting message ID: {quoted_message_id}")

    try:
        # CRITICAL FIX: Check if there's a stored remoteJid for this contact
        session = await self.get_session(to)
        remote_jid = session.context_data.get("remote_jid") if session else None

        if remote_jid:
            logger.info(
                f"🔑 Using stored remoteJid for audio URL to {to}: {remote_jid}"
            )
            normalized_to = remote_jid
        else:
            normalized_to = self._normalize_phone(to)
            logger.debug(f"Normalized phone number: {to} -> {normalized_to}")

        # NOTE(review): the public Evolution API docs for sendWhatsAppAudio
        # appear to name this field "audio" (URL or base64) — TODO confirm
        # that "audioUrl" is accepted by the targeted server version.
        payload: MutableMapping[str, Any] = {
            "number": normalized_to,
            "audioUrl": audio_url,  # Use URL instead of base64
        }

        if quoted_message_id:
            payload["quoted"] = {"key": {"id": quoted_message_id}}

        url = self._build_url(f"sendWhatsAppAudio/{self.config.instance_name}")
        response_data = await self._make_request_with_resilience(
            "POST", url, payload, expected_status=[200, 201]
        )

        message_id = response_data["key"]["id"]
        from_jid = response_data["key"]["remoteJid"]

        message = WhatsAppAudioMessage(
            id=message_id,
            from_number=from_jid,
            to_number=to,
            timestamp=datetime.now(),
            status=WhatsAppMessageStatus.SENT,
            media_url=audio_url,  # Store the URL
            media_mime_type="audio/ogg",
            quoted_message_id=quoted_message_id,
            is_voice_note=True,
        )

        logger.info(
            f"Audio message sent successfully via URL to {to}: {message_id}",
            extra={
                "message_id": message_id,
                "to_number": to,
                "normalized_to": normalized_to,
                "from_jid": from_jid,
                "audio_url": audio_url,
                "has_quote": quoted_message_id is not None,
            },
        )
        return message

    except EvolutionAPIError:
        # Already a provider-level error: log and propagate it untouched.
        logger.error(
            f"Evolution API error while sending audio message via URL to {to}"
        )
        raise
    except Exception as e:
        logger.error(
            f"Failed to send audio message via URL to {to}: {type(e).__name__}: {e}",
            extra={
                "to_number": to,
                "audio_url": audio_url,
                "error_type": type(e).__name__,
                "has_quote": quoted_message_id is not None,
            },
        )
        # Chain the original exception so its traceback is kept as __cause__.
        raise EvolutionAPIError(
            f"Failed to send audio message via URL: {e}"
        ) from e
1114+
10321115
async def send_typing_indicator(self, to: str, duration: int = 3) -> None:
10331116
"""Send typing indicator via Evolution API."""
10341117
logger.debug(f"Sending typing indicator to {to} for {duration}s")

agentle/agents/whatsapp/providers/meta/meta_whatsapp_provider.py

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -837,3 +837,129 @@ def get_stats(self) -> Mapping[str, Any]:
837837
base_stats["session_stats"] = session_stats
838838

839839
return base_stats
840+
841+
async def send_audio_message(
    self,
    to: str,
    audio_base64: str,
    quoted_message_id: str | None = None,
) -> WhatsAppMediaMessage:
    """Send an audio message via Meta WhatsApp Business API.

    The base64 audio is first uploaded through ``_upload_audio_base64``;
    the returned media ID is then referenced in the outgoing message.

    Args:
        to: Recipient phone number (normalized before sending).
        audio_base64: Base64-encoded audio content.
        quoted_message_id: Optional ID of the message to reply to; sent
            as the ``context.message_id`` field of the payload.

    Returns:
        A ``WhatsAppAudioMessage`` whose ``media_url`` holds the Meta
        media ID returned by the upload.

    Raises:
        MetaWhatsAppError: If the upload or the send fails; the original
            exception is chained as the cause.
    """
    logger.info(f"Sending audio message to {to}")

    try:
        # Upload audio to Meta first
        media_id = await self._upload_audio_base64(audio_base64)

        # Send audio message
        payload: dict[str, Any] = {
            "messaging_product": "whatsapp",
            "to": self._normalize_phone(to),
            "type": "audio",
            "audio": {"id": media_id},
        }

        if quoted_message_id:
            payload["context"] = {"message_id": quoted_message_id}

        url = self._build_url(f"{self.config.phone_number_id}/messages")
        response_data = await self._make_request("POST", url, payload)

        message_id = response_data["messages"][0]["id"]

        return WhatsAppAudioMessage(
            id=message_id,
            from_number=self.config.phone_number_id,
            to_number=to,
            timestamp=datetime.now(),
            status=WhatsAppMessageStatus.SENT,
            media_url=media_id,  # the media ID is stored in the URL field
            media_mime_type="audio/ogg",
            quoted_message_id=quoted_message_id,
            is_voice_note=True,
        )

    except Exception as e:
        logger.error(f"Failed to send audio message: {e}")
        # Chain the original exception so its traceback is kept as __cause__.
        raise MetaWhatsAppError(f"Failed to send audio message: {e}") from e
885+
886+
async def send_audio_message_by_url(
    self,
    to: str,
    audio_url: str,
    quoted_message_id: str | None = None,
) -> WhatsAppMediaMessage:
    """Send an audio message via URL using Meta WhatsApp Business API.

    The URL is handed to ``_upload_media`` to obtain a media ID, and the
    message is then sent referencing that ID.

    Args:
        to: Recipient phone number (normalized before sending).
        audio_url: URL of the audio file.
        quoted_message_id: Optional ID of the message to reply to; sent
            as the ``context.message_id`` field of the payload.

    Returns:
        A ``WhatsAppAudioMessage`` whose ``media_url`` is ``audio_url``.

    Raises:
        MetaWhatsAppError: If the upload or the send fails; the original
            exception is chained as the cause.
    """
    logger.info(f"Sending audio message via URL to {to}: {audio_url}")

    try:
        # Upload audio from URL to Meta
        # NOTE(review): presumably _upload_media fetches the URL and
        # re-uploads it to Meta — confirm against the helper.
        media_id = await self._upload_media(audio_url, "audio")

        # Send audio message
        payload: dict[str, Any] = {
            "messaging_product": "whatsapp",
            "to": self._normalize_phone(to),
            "type": "audio",
            "audio": {"id": media_id},
        }

        if quoted_message_id:
            payload["context"] = {"message_id": quoted_message_id}

        url = self._build_url(f"{self.config.phone_number_id}/messages")
        response_data = await self._make_request("POST", url, payload)

        message_id = response_data["messages"][0]["id"]

        return WhatsAppAudioMessage(
            id=message_id,
            from_number=self.config.phone_number_id,
            to_number=to,
            timestamp=datetime.now(),
            status=WhatsAppMessageStatus.SENT,
            media_url=audio_url,
            media_mime_type="audio/ogg",
            quoted_message_id=quoted_message_id,
            is_voice_note=True,
        )

    except Exception as e:
        logger.error(f"Failed to send audio message via URL: {e}")
        # Chain the original exception so its traceback is kept as __cause__.
        raise MetaWhatsAppError(
            f"Failed to send audio message via URL: {e}"
        ) from e
930+
931+
async def _upload_audio_base64(self, audio_base64: str) -> str:
    """Upload base64 audio to Meta and return media ID.

    Args:
        audio_base64: Base64-encoded audio content.

    Returns:
        The ``id`` field of the upload response.

    Raises:
        MetaWhatsAppError: If decoding or uploading fails; the original
            exception is chained as the cause.
    """
    import base64

    try:
        # Decode base64 to bytes
        audio_data = base64.b64decode(audio_base64)

        # Upload to Meta
        upload_url = self._build_url(f"{self.config.phone_number_id}/media")

        # NOTE(review): the file part is always labelled audio.ogg /
        # audio/ogg regardless of the actual encoding of the bytes —
        # confirm callers only pass OGG audio.
        form_data = aiohttp.FormData()
        form_data.add_field("messaging_product", "whatsapp")
        form_data.add_field("type", "audio")
        form_data.add_field(
            "file",
            audio_data,
            filename="audio.ogg",
            content_type="audio/ogg",
        )

        # Create a separate session for file upload
        headers = {"Authorization": f"Bearer {self.config.access_token}"}
        timeout = aiohttp.ClientTimeout(total=self.config.timeout)

        async with aiohttp.ClientSession(
            headers=headers, timeout=timeout
        ) as upload_session:
            async with upload_session.post(upload_url, data=form_data) as response:
                response_data = await self._handle_response(response, 200)
                return response_data["id"]

    except Exception as e:
        logger.error(f"Failed to upload audio base64: {e}")
        # Chain the original exception so its traceback is kept as __cause__.
        raise MetaWhatsAppError(f"Failed to upload audio base64: {e}") from e

agentle/agents/whatsapp/whatsapp_bot.py

Lines changed: 79 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@
5454
from agentle.generations.models.messages.user_message import UserMessage
5555
from agentle.generations.tools.tool import Tool
5656
from agentle.generations.tools.tool_execution_result import ToolExecutionResult
57+
from agentle.storage.file_storage_manager import FileStorageManager
5758
from agentle.tts.tts_provider import TtsProvider
5859

5960
if TYPE_CHECKING:
@@ -136,6 +137,7 @@ class WhatsAppBot(BaseModel):
136137
agent: Agent[Any]
137138
provider: WhatsAppProvider
138139
tts_provider: TtsProvider | None = Field(default=None)
140+
file_storage_manager: FileStorageManager | None = Field(default=None)
139141
config: WhatsAppBotConfig = Field(default_factory=WhatsAppBotConfig)
140142

141143
# REMOVED: context_manager field - no longer needed
@@ -2116,14 +2118,71 @@ async def _send_response(
21162118
response_text, config=self.config.speech_config
21172119
)
21182120

2119-
# Send audio message
2120-
await self.provider.send_audio_message(
2121-
to=to,
2122-
audio_base64=speech_result.audio,
2123-
quoted_message_id=reply_to
2124-
if self.config.quote_messages
2125-
else None,
2126-
)
2121+
# Try to upload to file storage if available
2122+
audio_url = None
2123+
if self.file_storage_manager:
2124+
try:
2125+
import base64
2126+
import time
2127+
2128+
# Decode base64 to bytes
2129+
audio_bytes = base64.b64decode(speech_result.audio)
2130+
2131+
# Generate unique filename
2132+
timestamp = int(time.time())
2133+
extension = self._get_audio_extension(speech_result.format)
2134+
filename = f"tts_{timestamp}.{extension}"
2135+
2136+
# Upload to storage
2137+
audio_url = await self.file_storage_manager.upload_file(
2138+
file_data=audio_bytes,
2139+
filename=filename,
2140+
mime_type=str(speech_result.mime_type),
2141+
)
2142+
2143+
logger.info(f"[TTS] Audio uploaded to storage: {audio_url}")
2144+
2145+
except Exception as e:
2146+
logger.warning(
2147+
f"[TTS] Failed to upload to storage, falling back to base64: {e}"
2148+
)
2149+
audio_url = None
2150+
2151+
# Send audio message (URL or base64)
2152+
if audio_url:
2153+
# Try URL method first
2154+
try:
2155+
await self.provider.send_audio_message_by_url(
2156+
to=to,
2157+
audio_url=audio_url,
2158+
quoted_message_id=reply_to
2159+
if self.config.quote_messages
2160+
else None,
2161+
)
2162+
logger.info(f"[TTS] Audio sent via URL to {to}")
2163+
except Exception as e:
2164+
logger.warning(
2165+
f"[TTS] URL method failed, falling back to base64: {e}"
2166+
)
2167+
# Fallback to base64
2168+
await self.provider.send_audio_message(
2169+
to=to,
2170+
audio_base64=speech_result.audio,
2171+
quoted_message_id=reply_to
2172+
if self.config.quote_messages
2173+
else None,
2174+
)
2175+
logger.info(f"[TTS] Audio sent via base64 to {to}")
2176+
else:
2177+
# Use base64 method (current behavior)
2178+
await self.provider.send_audio_message(
2179+
to=to,
2180+
audio_base64=speech_result.audio,
2181+
quoted_message_id=reply_to
2182+
if self.config.quote_messages
2183+
else None,
2184+
)
2185+
logger.info(f"[TTS] Audio sent via base64 to {to}")
21272186

21282187
logger.info(
21292188
f"[TTS] Successfully sent audio response to {to}",
@@ -2319,6 +2378,18 @@ def _validate_tts_configuration(self) -> bool:
23192378
)
23202379
return False
23212380

2381+
def _get_audio_extension(self, format_type: Any) -> str:
    """Get file extension from TTS format.

    The format is converted with ``str()`` and lowercased before
    matching, so values such as ``"MP3"`` or an enum rendered as
    ``"AudioFormat.WAV"`` are recognized as well.

    Args:
        format_type: TTS output format; any object whose string form
            contains ``mp3``, ``wav`` or ``ogg`` (in any letter case).

    Returns:
        ``"mp3"``, ``"wav"`` or ``"ogg"``; ``"mp3"`` when none of them
        is found in the format string.
    """
    format_str = str(format_type).lower()
    # Checked in this order so a string containing several names keeps
    # the original precedence.
    for extension in ("mp3", "wav", "ogg"):
        if extension in format_str:
            return extension
    return "mp3"  # default
2392+
23222393
def _split_message_by_line_breaks(self, text: str) -> Sequence[str]:
23232394
"""Split message by line breaks first, then by length if needed with enhanced validation."""
23242395
if not text or not text.strip():

agentle/storage/__init__.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
"""Storage module for file management."""
2+
3+
from agentle.storage.file_storage_manager import FileStorageManager
4+
from agentle.storage.local_file_storage_manager import LocalFileStorageManager
5+
from agentle.storage.s3_file_storage_manager import S3FileStorageManager
6+
7+
__all__ = [
8+
"FileStorageManager",
9+
"LocalFileStorageManager",
10+
"S3FileStorageManager",
11+
]

0 commit comments

Comments
 (0)