Merge pull request pipecat-ai#3404 from poseneror/feature/gladia-vad-events

markbackman · web-flow · commit ae59b3ba364d · 2026-01-13T11:26:56.000-05:00
feat(gladia): add VAD events support
diff --git a/src/pipecat/services/gladia/config.py b/src/pipecat/services/gladia/config.py
@@ -169,6 +169,9 @@ class GladiaInputParams(BaseModel):
         pre_processing: Audio pre-processing options
         realtime_processing: Real-time processing features
         messages_config: WebSocket message filtering options
+        enable_vad: Enable Gladia VAD to trigger end-of-utterance detection. Use this
+            without any other VAD enabled in the agent; the service then emits the user
+            started and stopped speaking frames. Defaults to False.
     """
 
     encoding: Optional[str] = "wav/pcm"
@@ -182,3 +185,4 @@ class GladiaInputParams(BaseModel):
     pre_processing: Optional[PreProcessingConfig] = None
     realtime_processing: Optional[RealtimeProcessingConfig] = None
     messages_config: Optional[MessagesConfig] = None
+    enable_vad: bool = False
diff --git a/src/pipecat/services/gladia/stt.py b/src/pipecat/services/gladia/stt.py
@@ -28,6 +28,8 @@
     StartFrame,
     TranscriptionFrame,
     TranslationFrame,
+    UserStartedSpeakingFrame,
+    UserStoppedSpeakingFrame,
 )
 from pipecat.services.gladia.config import GladiaInputParams
 from pipecat.services.stt_service import WebsocketSTTService
@@ -202,6 +204,7 @@ def __init__(
         model: str = "solaria-1",
         params: Optional[GladiaInputParams] = None,
         max_buffer_size: int = 1024 * 1024 * 20,  # 20MB default buffer
+        should_interrupt: bool = True,
         **kwargs,
     ):
         """Initialize the Gladia STT service.
@@ -220,6 +223,8 @@ def __init__(
             model: Model to use for transcription. Defaults to "solaria-1".
             params: Additional configuration parameters for Gladia service.
             max_buffer_size: Maximum size of audio buffer in bytes. Defaults to 20MB.
+            should_interrupt: Whether the bot should be interrupted when Gladia VAD
+                detects user speech. Defaults to True.
             **kwargs: Additional arguments passed to the STTService parent class.
         """
         super().__init__(sample_rate=sample_rate, **kwargs)
@@ -266,6 +271,10 @@ def __init__(
         self._max_buffer_size = max_buffer_size
         self._buffer_lock = asyncio.Lock()
 
+        # VAD state tracking
+        self._is_speaking = False
+        self._should_interrupt = should_interrupt
+
     def __str__(self):
         return f"{self.name} [{self._session_id}]"
 
@@ -507,6 +516,33 @@ async def _handle_transcription(
         await self.stop_ttfb_metrics()
         await self.stop_processing_metrics()
 
+    async def _on_speech_started(self):
+        """Handle a ``speech_start`` event from Gladia.
+
+        Ignored unless ``enable_vad`` is set and the user is not already speaking. Broadcasts
+        UserStartedSpeakingFrame, then pushes an interruption if ``should_interrupt`` is set.
+        """
+        if not self._params.enable_vad or self._is_speaking:
+            return
+
+        logger.debug(f"{self} User started speaking")
+        self._is_speaking = True  # checked by the guard above to drop duplicate start events
+
+        await self.broadcast_frame(UserStartedSpeakingFrame)
+        if self._should_interrupt:
+            await self.push_interruption_task_frame_and_wait()
+
+    async def _on_speech_ended(self):
+        """Handle a ``speech_end`` event from Gladia.
+
+        Broadcasts UserStoppedSpeakingFrame only if VAD is enabled and a start was recorded.
+        """
+        if not self._params.enable_vad or not self._is_speaking:
+            return
+        self._is_speaking = False  # re-arm so the next speech_start event is handled
+        await self.broadcast_frame(UserStoppedSpeakingFrame)
+        logger.debug(f"{self} User stopped speaking")
+
     async def _send_audio(self, audio: bytes):
         """Send audio chunk with proper message format."""
         if self._websocket and self._websocket.state is State.OPEN:
@@ -599,6 +635,10 @@ async def _receive_messages(self):
                                 translation, "", time_now_iso8601(), translated_language
                             )
                         )
+                elif content["type"] == "speech_start":
+                    await self._on_speech_started()
+                elif content["type"] == "speech_end":
+                    await self._on_speech_ended()
             except json.JSONDecodeError:
                 logger.warning(f"{self} Received non-JSON message: {message}")