Skip to content

Commit 19fb3ee

Browse files
authored
Merge pull request #3466 from pipecat-ai/pk/fix-aws-nova-sonic-rtvi-bot-output
Fix realtime (speech-to-speech) services' RTVI event compatibility
2 parents b292b32 + ce99924 commit 19fb3ee

5 files changed

Lines changed: 91 additions & 20 deletions

File tree

changelog/3446.fixed.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+ - Fixed an issue where the "bot-llm-text" RTVI event would not fire for realtime (speech-to-speech) services:
2+
3+   - `AWSNovaSonicLLMService`
4+   - `GeminiLiveLLMService`
5+   - `OpenAIRealtimeLLMService`
6+   - `GrokRealtimeLLMService`
7+
8+   The issue was that these services weren't pushing `LLMTextFrame`s. Now they do.

src/pipecat/services/aws/nova_sonic/llm.py

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
LLMContextFrame,
3939
LLMFullResponseEndFrame,
4040
LLMFullResponseStartFrame,
41+
LLMTextFrame,
4142
StartFrame,
4243
TranscriptionFrame,
4344
TTSAudioRawFrame,
@@ -1077,9 +1078,7 @@ async def _report_assistant_response_text_added(self, text):
10771078
logger.debug(f"Assistant response text added: {text}")
10781079

10791080
# Report the text of the assistant response.
1080-
frame = TTSTextFrame(text, aggregated_by=AggregationType.SENTENCE)
1081-
frame.includes_inter_frame_spaces = True
1082-
await self.push_frame(frame)
1081+
await self._push_assistant_response_text_frames(text)
10831082

10841083
# HACK: here we're also buffering the assistant text ourselves as a
10851084
# backup rather than relying solely on the assistant context aggregator
@@ -1112,11 +1111,7 @@ async def _report_assistant_response_ended(self):
11121111
# TTSTextFrame would be ignored otherwise (the interruption frame
11131112
# would have cleared the assistant aggregator state).
11141113
await self.push_frame(LLMFullResponseStartFrame())
1115-
frame = TTSTextFrame(
1116-
self._assistant_text_buffer, aggregated_by=AggregationType.SENTENCE
1117-
)
1118-
frame.includes_inter_frame_spaces = True
1119-
await self.push_frame(frame)
1114+
await self._push_assistant_response_text_frames(self._assistant_text_buffer)
11201115
self._may_need_repush_assistant_text = False
11211116

11221117
# Report the end of the assistant response.
@@ -1128,6 +1123,25 @@ async def _report_assistant_response_ended(self):
11281123
# Clear out the buffered assistant text
11291124
self._assistant_text_buffer = ""
11301125

1126+
async def _push_assistant_response_text_frames(self, text: str):
1127+
# In a typical "cascade" LLM + TTS setup, LLMTextFrames would not
1128+
# proceed beyond the TTS service. Therefore, since a speech-to-speech
1129+
# service like Nova Sonic combines both LLM and TTS functionality, you
1130+
# would think we wouldn't need to push LLMTextFrames at all. However,
1131+
# RTVI relies on LLMTextFrames being pushed to trigger its
1132+
# "bot-llm-text" event. So here we push an LLMTextFrame, too, but avoid
1133+
# appending it to context to avoid context message duplication.
1134+
1135+
# Push LLMTextFrame
1136+
llm_text_frame = LLMTextFrame(text)
1137+
llm_text_frame.append_to_context = False
1138+
await self.push_frame(llm_text_frame)
1139+
1140+
# Push TTSTextFrame
1141+
tts_text_frame = TTSTextFrame(text, aggregated_by=AggregationType.SENTENCE)
1142+
tts_text_frame.includes_inter_frame_spaces = True
1143+
await self.push_frame(tts_text_frame)
1144+
11311145
#
11321146
# user transcription reporting
11331147
#

src/pipecat/services/google/gemini_live/llm.py

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1710,11 +1710,26 @@ async def _handle_msg_output_transcription(self, message: LiveServerMessage):
17101710
await self.push_frame(TTSStartedFrame())
17111711
await self.push_frame(LLMFullResponseStartFrame())
17121712

1713-
frame = TTSTextFrame(text=text, aggregated_by=AggregationType.SENTENCE)
1714-
# Gemini Live text already includes any necessary inter-chunk spaces
1715-
frame.includes_inter_frame_spaces = True
1716-
1717-
await self.push_frame(frame)
1713+
await self._push_output_transcription_text_frames(text)
1714+
1715+
async def _push_output_transcription_text_frames(self, text: str):
1716+
# In a typical "cascade" LLM + TTS setup, LLMTextFrames would not
1717+
# proceed beyond the TTS service. Therefore, since a speech-to-speech
1718+
# service like Gemini Live combines both LLM and TTS functionality, you
1719+
# might think we wouldn't need to push LLMTextFrames at all. However,
1720+
# RTVI relies on LLMTextFrames being pushed to trigger its
1721+
# "bot-llm-text" event. So here we push an LLMTextFrame, too, but avoid
1722+
# appending it to context to avoid context message duplication.
1723+
1724+
# Push LLMTextFrame
1725+
llm_text_frame = LLMTextFrame(text)
1726+
llm_text_frame.append_to_context = False
1727+
await self.push_frame(llm_text_frame)
1728+
1729+
# Push TTSTextFrame
1730+
tts_text_frame = TTSTextFrame(text, aggregated_by=AggregationType.SENTENCE)
1731+
tts_text_frame.includes_inter_frame_spaces = True
1732+
await self.push_frame(tts_text_frame)
17181733

17191734
async def _handle_msg_grounding_metadata(self, message: LiveServerMessage):
17201735
"""Handle dedicated grounding metadata messages."""

src/pipecat/services/grok/realtime/llm.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
LLMFullResponseStartFrame,
3434
LLMMessagesAppendFrame,
3535
LLMSetToolsFrame,
36+
LLMTextFrame,
3637
LLMUpdateSettingsFrame,
3738
StartFrame,
3839
TranscriptionFrame,
@@ -619,9 +620,26 @@ async def _handle_evt_response_done(self, evt):
619620
async def _handle_evt_audio_transcript_delta(self, evt):
620621
"""Handle audio transcript delta event."""
621622
if evt.delta:
622-
frame = TTSTextFrame(evt.delta, aggregated_by=AggregationType.SENTENCE)
623-
frame.includes_inter_frame_spaces = True
624-
await self.push_frame(frame)
623+
await self._push_output_transcript_text_frames(evt.delta)
624+
625+
async def _push_output_transcript_text_frames(self, text: str):
626+
# In a typical "cascade" LLM + TTS setup, LLMTextFrames would not
627+
# proceed beyond the TTS service. Therefore, since a speech-to-speech
628+
# service like Grok Realtime combines both LLM and TTS functionality,
629+
# you might think we wouldn't need to push LLMTextFrames at all.
630+
# However, RTVI relies on LLMTextFrames being pushed to trigger its
631+
# "bot-llm-text" event. So here we push an LLMTextFrame, too, but avoid
632+
# appending it to context to avoid context message duplication.
633+
634+
# Push LLMTextFrame
635+
llm_text_frame = LLMTextFrame(text)
636+
llm_text_frame.append_to_context = False
637+
await self.push_frame(llm_text_frame)
638+
639+
# Push TTSTextFrame
640+
tts_text_frame = TTSTextFrame(text, aggregated_by=AggregationType.SENTENCE)
641+
tts_text_frame.includes_inter_frame_spaces = True
642+
await self.push_frame(tts_text_frame)
625643

626644
async def _handle_evt_function_call_arguments_done(self, evt):
627645
"""Handle function call arguments done event."""

src/pipecat/services/openai/realtime/llm.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -724,10 +724,26 @@ async def _handle_evt_audio_transcript_delta(self, evt):
724724
# We receive audio transcript deltas (as opposed to text deltas) when
725725
# the output modality is "audio" (the default)
726726
if evt.delta:
727-
frame = TTSTextFrame(evt.delta, aggregated_by=AggregationType.SENTENCE)
728-
# OpenAI Realtime text already includes any necessary inter-chunk spaces
729-
frame.includes_inter_frame_spaces = True
730-
await self.push_frame(frame)
727+
await self._push_output_transcript_text_frames(evt.delta)
728+
729+
async def _push_output_transcript_text_frames(self, text: str):
730+
# In a typical "cascade" LLM + TTS setup, LLMTextFrames would not
731+
# proceed beyond the TTS service. Therefore, since a speech-to-speech
732+
# service like OpenAI Realtime combines both LLM and TTS functionality,
733+
# you might think we wouldn't need to push LLMTextFrames at all.
734+
# However, RTVI relies on LLMTextFrames being pushed to trigger its
735+
# "bot-llm-text" event. So here we push an LLMTextFrame, too, but avoid
736+
# appending it to context to avoid context message duplication.
737+
738+
# Push LLMTextFrame
739+
llm_text_frame = LLMTextFrame(text)
740+
llm_text_frame.append_to_context = False
741+
await self.push_frame(llm_text_frame)
742+
743+
# Push TTSTextFrame
744+
tts_text_frame = TTSTextFrame(text, aggregated_by=AggregationType.SENTENCE)
745+
tts_text_frame.includes_inter_frame_spaces = True
746+
await self.push_frame(tts_text_frame)
731747

732748
async def _handle_evt_function_call_arguments_done(self, evt):
733749
"""Handle completion of function call arguments.

0 commit comments

Comments (0)