Merge pull request pipecat-ai#3504 from pipecat-ai/aleix/nvidia-stt-tts-error-handling

aconchillo · web-flow · commit bf43032652cd · 2026-01-20T09:41:08.000-08:00
NVIDIA STT/TTS error handling
diff --git a/changelog/3504.fixed.md b/changelog/3504.fixed.md
@@ -0,0 +1 @@
+- Moved `NvidiaTTSService` and `NvidiaSTTService` client initialization from constructor to `start()` for better error handling.
diff --git a/src/pipecat/services/nvidia/stt.py b/src/pipecat/services/nvidia/stt.py
@@ -134,6 +134,7 @@ def __init__(
 
         params = params or NvidiaSTTService.InputParams()
 
+        self._server = server
         self._api_key = api_key
         self._use_ssl = use_ssl
         self._profanity_filter = False
@@ -162,18 +163,54 @@ def __init__(
 
         self.set_model_name(model_function_map.get("model_name"))
 
+        self._asr_service = None
+        self._queue = None
+        self._config = None
+        self._thread_task = None
+        self._response_task = None
+
+    def _initialize_client(self):
         metadata = [
             ["function-id", self._function_id],
-            ["authorization", f"Bearer {api_key}"],
+            ["authorization", f"Bearer {self._api_key}"],
         ]
-        auth = riva.client.Auth(None, self._use_ssl, server, metadata)
+        auth = riva.client.Auth(None, self._use_ssl, self._server, metadata)
 
         self._asr_service = riva.client.ASRService(auth)
 
-        self._queue = None
-        self._config = None
-        self._thread_task = None
-        self._response_task = None
+    def _create_recognition_config(self):
+        """Create the NVIDIA Riva ASR recognition configuration."""
+        config = riva.client.StreamingRecognitionConfig(
+            config=riva.client.RecognitionConfig(
+                encoding=riva.client.AudioEncoding.LINEAR_PCM,
+                language_code=self._language_code,
+                model="",
+                max_alternatives=1,
+                profanity_filter=self._profanity_filter,
+                enable_automatic_punctuation=self._automatic_punctuation,
+                verbatim_transcripts=not self._no_verbatim_transcripts,
+                sample_rate_hertz=self.sample_rate,
+                audio_channel_count=1,
+            ),
+            interim_results=True,
+        )
+
+        riva.client.add_word_boosting_to_config(
+            config, self._boosted_lm_words, self._boosted_lm_score
+        )
+
+        riva.client.add_endpoint_parameters_to_config(
+            config,
+            self._start_history,
+            self._start_threshold,
+            self._stop_history,
+            self._stop_history_eou,
+            self._stop_threshold,
+            self._stop_threshold_eou,
+        )
+        riva.client.add_custom_configuration_to_config(config, self._custom_configuration)
+
+        return config
 
     def can_generate_metrics(self) -> bool:
         """Check if this service can generate processing metrics.
@@ -206,41 +243,9 @@ async def start(self, frame: StartFrame):
             frame: StartFrame indicating pipeline start.
         """
         await super().start(frame)
+        self._initialize_client()
+        self._config = self._create_recognition_config()
 
-        if self._config:
-            return
-
-        config = riva.client.StreamingRecognitionConfig(
-            config=riva.client.RecognitionConfig(
-                encoding=riva.client.AudioEncoding.LINEAR_PCM,
-                language_code=self._language_code,
-                model="",
-                max_alternatives=1,
-                profanity_filter=self._profanity_filter,
-                enable_automatic_punctuation=self._automatic_punctuation,
-                verbatim_transcripts=not self._no_verbatim_transcripts,
-                sample_rate_hertz=self.sample_rate,
-                audio_channel_count=1,
-            ),
-            interim_results=True,
-        )
-
-        riva.client.add_word_boosting_to_config(
-            config, self._boosted_lm_words, self._boosted_lm_score
-        )
-
-        riva.client.add_endpoint_parameters_to_config(
-            config,
-            self._start_history,
-            self._start_threshold,
-            self._stop_history,
-            self._stop_history_eou,
-            self._stop_threshold,
-            self._stop_threshold_eou,
-        )
-        riva.client.add_custom_configuration_to_config(config, self._custom_configuration)
-
-        self._config = config
         self._queue = asyncio.Queue()
 
         if not self._thread_task:
@@ -250,6 +255,8 @@ async def start(self, frame: StartFrame):
             self._response_queue = asyncio.Queue()
             self._response_task = self.create_task(self._response_task_handler())
 
+        logger.debug(f"Initialized NvidiaSTTService with model: {self.model_name}")
+
     async def stop(self, frame: EndFrame):
         """Stop the NVIDIA Riva STT service and clean up resources.
 
@@ -503,8 +510,6 @@ def _initialize_client(self):
         auth = riva.client.Auth(None, self._use_ssl, self._server, metadata)
         self._asr_service = riva.client.ASRService(auth)
 
-        logger.info(f"Initialized NvidiaSegmentedSTTService with model: {self.model_name}")
-
     def _create_recognition_config(self):
         """Create the NVIDIA Riva ASR recognition configuration."""
         # Create base configuration
@@ -572,6 +577,7 @@ async def start(self, frame: StartFrame):
         await super().start(frame)
         self._initialize_client()
         self._config = self._create_recognition_config()
+        logger.debug(f"Initialized NvidiaSegmentedSTTService with model: {self.model_name}")
 
     async def set_language(self, language: Language):
         """Set the language for the STT service.
@@ -605,65 +611,53 @@ async def run_stt(self, audio: bytes) -> AsyncGenerator[Frame, None]:
             Frame: TranscriptionFrame containing the transcribed text.
         """
         try:
-            await self.start_processing_metrics()
-            await self.start_ttfb_metrics()
-
-            # Make sure the client is initialized
-            if self._asr_service is None:
-                self._initialize_client()
-
-            # Make sure the config is created
-            if self._config is None:
-                self._config = self._create_recognition_config()
-
-            # Type assertion to satisfy the IDE
             assert self._asr_service is not None, "ASR service not initialized"
             assert self._config is not None, "Recognition config not created"
 
+            await self.start_processing_metrics()
+            await self.start_ttfb_metrics()
+
             # Process audio with NVIDIA Riva ASR - explicitly request non-future response
             raw_response = self._asr_service.offline_recognize(audio, self._config, future=False)
 
             await self.stop_ttfb_metrics()
             await self.stop_processing_metrics()
 
             # Process the response - handle different possible return types
-            try:
-                # If it's a future-like object, get the result
-                if hasattr(raw_response, "result"):
-                    response = raw_response.result()
-                else:
-                    response = raw_response
-
-                # Process transcription results
-                transcription_found = False
-
-                # Now we can safely check results
-                # Type hint for the IDE
-                results = getattr(response, "results", [])
-
-                for result in results:
-                    alternatives = getattr(result, "alternatives", [])
-                    if alternatives:
-                        text = alternatives[0].transcript.strip()
-                        if text:
-                            logger.debug(f"Transcription: [{text}]")
-                            yield TranscriptionFrame(
-                                text,
-                                self._user_id,
-                                time_now_iso8601(),
-                                self._language_enum,
-                            )
-                            transcription_found = True
-
-                            await self._handle_transcription(text, True, self._language_enum)
-
-                if not transcription_found:
-                    logger.debug("No transcription results found in NVIDIA Riva response")
-
-            except AttributeError as ae:
-                logger.error(f"Unexpected response structure from NVIDIA Riva: {ae}")
-                yield ErrorFrame(f"Unexpected NVIDIA Riva response format: {str(ae)}")
+            # If it's a future-like object, get the result
+            if hasattr(raw_response, "result"):
+                response = raw_response.result()
+            else:
+                response = raw_response
+
+            # Process transcription results
+            transcription_found = False
+
+            # Now we can safely check results
+            # Type hint for the IDE
+            results = getattr(response, "results", [])
+
+            for result in results:
+                alternatives = getattr(result, "alternatives", [])
+                if alternatives:
+                    text = alternatives[0].transcript.strip()
+                    if text:
+                        logger.debug(f"Transcription: [{text}]")
+                        yield TranscriptionFrame(
+                            text,
+                            self._user_id,
+                            time_now_iso8601(),
+                            self._language_enum,
+                        )
+                        transcription_found = True
+
+                        await self._handle_transcription(text, True, self._language_enum)
 
+            if not transcription_found:
+                logger.debug(f"{self}: No transcription results found in NVIDIA Riva response")
+        except AttributeError as ae:
+            logger.error(f"{self}: Unexpected response structure from NVIDIA Riva: {ae}")
+            yield ErrorFrame(f"{self}: Unexpected NVIDIA Riva response format: {str(ae)}")
         except Exception as e:
             logger.error(f"{self} exception: {e}")
             yield ErrorFrame(error=f"{self} error: {e}")
diff --git a/src/pipecat/services/nvidia/tts.py b/src/pipecat/services/nvidia/tts.py
@@ -25,6 +25,7 @@
 from pipecat.frames.frames import (
     ErrorFrame,
     Frame,
+    StartFrame,
     TTSAudioRawFrame,
     TTSStartedFrame,
     TTSStoppedFrame,
@@ -93,6 +94,7 @@ def __init__(
 
         params = params or NvidiaTTSService.InputParams()
 
+        self._server = server
         self._api_key = api_key
         self._voice_id = voice_id
         self._language_code = params.language
@@ -102,18 +104,8 @@ def __init__(
         self.set_model_name(model_function_map.get("model_name"))
         self.set_voice(voice_id)
 
-        metadata = [
-            ["function-id", self._function_id],
-            ["authorization", f"Bearer {api_key}"],
-        ]
-        auth = riva.client.Auth(None, self._use_ssl, server, metadata)
-
-        self._service = riva.client.SpeechSynthesisService(auth)
-
-        # warm up the service
-        config_response = self._service.stub.GetRivaSynthesisConfig(
-            riva.client.proto.riva_tts_pb2.RivaSynthesisConfigRequest()
-        )
+        self._service = None
+        self._config = None
 
     async def set_model(self, model: str):
         """Attempt to set the TTS model.
@@ -129,6 +121,39 @@ async def set_model(self, model: str):
             f"{self.__class__.__name__}(api_key=<api_key>, model_function_map={example})"
         )
 
+    def _initialize_client(self):
+        if self._service is not None:
+            return
+
+        metadata = [
+            ["function-id", self._function_id],
+            ["authorization", f"Bearer {self._api_key}"],
+        ]
+        auth = riva.client.Auth(None, self._use_ssl, self._server, metadata)
+
+        self._service = riva.client.SpeechSynthesisService(auth)
+
+    def _create_synthesis_config(self):
+        if not self._service:
+            return
+
+        # warm up the service
+        config = self._service.stub.GetRivaSynthesisConfig(
+            riva.client.proto.riva_tts_pb2.RivaSynthesisConfigRequest()
+        )
+        return config
+
+    async def start(self, frame: StartFrame):
+        """Start the NVIDIA Riva TTS service.
+
+        Args:
+            frame: The start frame containing initialization parameters.
+        """
+        await super().start(frame)
+        self._initialize_client()
+        self._config = self._create_synthesis_config()
+        logger.debug(f"Initialized NvidiaTTSService with model: {self.model_name}")
+
     @traced_tts
     async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
         """Generate speech from text using NVIDIA Riva TTS.
@@ -161,12 +186,15 @@ def add_response(r):
                 logger.error(f"{self} exception: {e}")
                 add_response(None)
 
-        await self.start_ttfb_metrics()
-        yield TTSStartedFrame()
+        try:
+            assert self._service is not None, "TTS service not initialized"
+            assert self._config is not None, "Synthesis configuration not created"
+
+            await self.start_ttfb_metrics()
+            yield TTSStartedFrame()
 
-        logger.debug(f"{self}: Generating TTS [{text}]")
+            logger.debug(f"{self}: Generating TTS [{text}]")
 
-        try:
             queue = asyncio.Queue()
             await asyncio.to_thread(read_audio_responses, queue)
 
@@ -181,9 +209,12 @@ def add_response(r):
                 )
                 yield frame
                 resp = await asyncio.wait_for(queue.get(), timeout=NVIDIA_TTS_TIMEOUT_SECS)
+
+            await self.start_tts_usage_metrics(text)
+            yield TTSStoppedFrame()
         except asyncio.TimeoutError:
             logger.error(f"{self} timeout waiting for audio response")
             yield ErrorFrame(error=f"{self} error: {e}")
-
-        await self.start_tts_usage_metrics(text)
-        yield TTSStoppedFrame()
+        except Exception as e:
+            logger.error(f"{self} exception: {e}")
+            yield ErrorFrame(error=f"{self} error: {e}")

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+- Moved `NvidiaTTSService` and `NvidiaSTTService` client initialization from constructor to `start()` for better error handling.