NvidiaTTSService: process incoming audio frames right away

aconchillo · aconchillo · commit a787fd9cd81c · 2026-01-20T15:41:05.000-08:00
Process each audio response as soon as we receive it from the generator.
Previously, we read from the generator and added every element to a queue until
there was no more data, and only then processed the queue.
diff --git a/changelog/3509.fixed.2.md b/changelog/3509.fixed.2.md
@@ -0,0 +1 @@
+- Optimized `NVIDIATTSService` to process incoming audio frames immediately.
diff --git a/src/pipecat/services/nvidia/tts.py b/src/pipecat/services/nvidia/tts.py
@@ -12,7 +12,7 @@
 
 import asyncio
 import os
-from typing import AsyncGenerator, Mapping, Optional
+from typing import AsyncGenerator, AsyncIterable, Generator, Mapping, Optional
 
 from pipecat.utils.tracing.service_decorators import traced_tts
 
@@ -35,14 +35,12 @@
 
 try:
     import riva.client
-
+    import riva.client.proto.riva_tts_pb2 as rtts
 except ModuleNotFoundError as e:
     logger.error(f"Exception: {e}")
     logger.error("In order to use NVIDIA Riva TTS, you need to `pip install pipecat-ai[nvidia]`.")
     raise Exception(f"Missing module: {e}")
 
-NVIDIA_TTS_TIMEOUT_SECS = 5
-
 
 class NvidiaTTSService(TTSService):
     """NVIDIA Riva text-to-speech service.
@@ -165,26 +163,30 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
             Frame: Audio frames containing the synthesized speech data.
         """
 
-        def read_audio_responses(queue: asyncio.Queue):
-            def add_response(r):
-                asyncio.run_coroutine_threadsafe(queue.put(r), self.get_event_loop())
-
+        def read_audio_responses() -> Generator[rtts.SynthesizeSpeechResponse, None, None]:
+            responses = self._service.synthesize_online(
+                text,
+                self._voice_id,
+                self._language_code,
+                sample_rate_hz=self.sample_rate,
+                zero_shot_audio_prompt_file=None,
+                zero_shot_quality=self._quality,
+                custom_dictionary={},
+            )
+            return responses
+
+        def async_next(it):
             try:
-                responses = self._service.synthesize_online(
-                    text,
-                    self._voice_id,
-                    self._language_code,
-                    sample_rate_hz=self.sample_rate,
-                    zero_shot_audio_prompt_file=None,
-                    zero_shot_quality=self._quality,
-                    custom_dictionary={},
-                )
-                for r in responses:
-                    add_response(r)
-                add_response(None)
-            except Exception as e:
-                logger.error(f"{self} exception: {e}")
-                add_response(None)
+                return next(it)
+            except StopIteration:
+                return None
+
+        async def async_iterator(iterator) -> AsyncIterable[rtts.SynthesizeSpeechResponse]:
+            while True:
+                item = await asyncio.to_thread(async_next, iterator)
+                if item is None:
+                    return
+                yield item
 
         try:
             assert self._service is not None, "TTS service not initialized"
@@ -195,20 +197,16 @@ def add_response(r):
 
             logger.debug(f"{self}: Generating TTS [{text}]")
 
-            queue = asyncio.Queue()
-            await asyncio.to_thread(read_audio_responses, queue)
+            responses = await asyncio.to_thread(read_audio_responses)
 
-            # Wait for the thread to start.
-            resp = await asyncio.wait_for(queue.get(), timeout=NVIDIA_TTS_TIMEOUT_SECS)
-            while resp:
+            async for resp in async_iterator(responses):
                 await self.stop_ttfb_metrics()
                 frame = TTSAudioRawFrame(
                     audio=resp.audio,
                     sample_rate=self.sample_rate,
                     num_channels=1,
                 )
                 yield frame
-                resp = await asyncio.wait_for(queue.get(), timeout=NVIDIA_TTS_TIMEOUT_SECS)
 
             await self.start_tts_usage_metrics(text)
             yield TTSStoppedFrame()

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+- Optimized `NVIDIATTSService` to process incoming audio frames immediately.