3838 LLMContextFrame ,
3939 LLMFullResponseEndFrame ,
4040 LLMFullResponseStartFrame ,
41+ LLMTextFrame ,
4142 StartFrame ,
4243 TranscriptionFrame ,
4344 TTSAudioRawFrame ,
@@ -1077,9 +1078,7 @@ async def _report_assistant_response_text_added(self, text):
10771078 logger .debug (f"Assistant response text added: { text } " )
10781079
10791080 # Report the text of the assistant response.
1080- frame = TTSTextFrame (text , aggregated_by = AggregationType .SENTENCE )
1081- frame .includes_inter_frame_spaces = True
1082- await self .push_frame (frame )
1081+ await self ._push_assistant_response_text_frames (text )
10831082
10841083 # HACK: here we're also buffering the assistant text ourselves as a
10851084 # backup rather than relying solely on the assistant context aggregator
@@ -1112,11 +1111,7 @@ async def _report_assistant_response_ended(self):
11121111 # TTSTextFrame would be ignored otherwise (the interruption frame
11131112 # would have cleared the assistant aggregator state).
11141113 await self .push_frame (LLMFullResponseStartFrame ())
1115- frame = TTSTextFrame (
1116- self ._assistant_text_buffer , aggregated_by = AggregationType .SENTENCE
1117- )
1118- frame .includes_inter_frame_spaces = True
1119- await self .push_frame (frame )
1114+ await self ._push_assistant_response_text_frames (self ._assistant_text_buffer )
11201115 self ._may_need_repush_assistant_text = False
11211116
11221117 # Report the end of the assistant response.
@@ -1128,6 +1123,25 @@ async def _report_assistant_response_ended(self):
11281123 # Clear out the buffered assistant text
11291124 self ._assistant_text_buffer = ""
11301125
async def _push_assistant_response_text_frames(self, text: str):
    """Report assistant response text as an LLMTextFrame and a TTSTextFrame.

    In a typical "cascade" LLM + TTS setup, LLMTextFrames do not travel
    beyond the TTS service. A speech-to-speech service such as Nova Sonic
    combines LLM and TTS functionality, so pushing LLMTextFrames might seem
    unnecessary. However, RTVI relies on LLMTextFrames being pushed to
    trigger its "bot-llm-text" event, so one is pushed here as well. It is
    flagged not to be appended to context, to avoid duplicating context
    messages.

    Args:
        text: The assistant response text to report.
    """
    # The LLM text goes first, excluded from context.
    llm_frame = LLMTextFrame(text)
    llm_frame.append_to_context = False
    await self.push_frame(llm_frame)

    # The TTS text follows: sentence-aggregated, with the flag for
    # inter-frame spaces set.
    tts_frame = TTSTextFrame(text, aggregated_by=AggregationType.SENTENCE)
    tts_frame.includes_inter_frame_spaces = True
    await self.push_frame(tts_frame)
1144+
11311145 #
11321146 # user transcription reporting
11331147 #
0 commit comments