1515"""
1616
1717import os
18+ import time
1819from typing import Optional , Tuple
1920
2021import numpy as np
2627 int_to_krisp_sample_rate ,
2728)
2829from pipecat .audio .turn .base_turn_analyzer import BaseTurnAnalyzer , BaseTurnParams , EndOfTurnState
29- from pipecat .metrics .metrics import MetricsData
30+ from pipecat .metrics .metrics import MetricsData , TurnMetricsData
3031
3132try :
3233 import krisp_audio
@@ -63,6 +64,7 @@ def __init__(
6364 model_path : Optional [str ] = None ,
6465 sample_rate : Optional [int ] = None ,
6566 params : Optional [KrispTurnParams ] = None ,
67+ api_key : str = "" ,
6668 ) -> None :
6769 """Initialize the Krisp turn analyzer.
6870
@@ -72,6 +74,8 @@ def __init__(
7274 sample_rate: Optional initial sample rate for audio processing.
7375 If provided, this will be used as the fixed sample rate.
7476 params: Configuration parameters for turn analysis behavior.
77+ api_key: Krisp SDK API key. If empty, falls back to
78+ the KRISP_VIVA_API_KEY environment variable.
7579
7680 Raises:
7781 ValueError: If model_path is not provided and KRISP_VIVA_TURN_MODEL_PATH is not set.
@@ -83,7 +87,7 @@ def __init__(
8387
8488 # Acquire SDK reference (will initialize on first call)
8589 try :
86- KrispVivaSDKManager .acquire ()
90+ KrispVivaSDKManager .acquire (api_key = api_key )
8791 self ._sdk_acquired = True
8892 except Exception as e :
8993 self ._sdk_acquired = False
@@ -115,6 +119,9 @@ def __init__(
115119 self ._last_probability = None
116120 self ._frame_probabilities = []
117121 self ._last_state = EndOfTurnState .INCOMPLETE
122+ self ._speech_stopped_time : Optional [float ] = None
123+ self ._e2e_processing_time_ms : Optional [float ] = None
124+ self ._last_metrics : Optional [TurnMetricsData ] = None
118125
119126 # Create session with provided sample rate or default to 16000 Hz
120127 # This preloads the model to improve latency when set_sample_rate is called later
@@ -288,7 +295,14 @@ def append_audio(self, buffer: bytes, is_speech: bool) -> EndOfTurnState:
288295 # Track speech start time
289296 if not self ._speech_triggered :
290297 logger .trace ("Speech detected, turn analysis started" )
298+ self ._e2e_processing_time_ms = None
291299 self ._speech_triggered = True
300+ # Reset speech stopped time when speech resumes
301+ self ._speech_stopped_time = None
302+ else :
303+ # Record the moment speech transitions to non-speech
304+ if self ._speech_triggered and self ._speech_stopped_time is None :
305+ self ._speech_stopped_time = time .perf_counter ()
292306 # Note: We don't immediately mark as complete on silence detection.
293307 # Instead, we wait for the model's probability check below to confirm
294308 # end-of-turn based on the threshold.
@@ -308,6 +322,18 @@ def append_audio(self, buffer: bytes, is_speech: bool) -> EndOfTurnState:
308322 # Only mark as complete if we've detected speech and the model
309323 # confirms with sufficient confidence
310324 if self ._speech_triggered and prob >= self ._params .threshold :
325+ # Calculate e2e processing time: time from speech stop to threshold crossing
326+ if self ._speech_stopped_time is not None :
327+ self ._e2e_processing_time_ms = (
328+ time .perf_counter () - self ._speech_stopped_time
329+ ) * 1000
330+ self ._last_metrics = TurnMetricsData (
331+ processor = "KrispVivaTurn" ,
332+ is_complete = True ,
333+ probability = prob ,
334+ e2e_processing_time_ms = self ._e2e_processing_time_ms ,
335+ )
336+ logger .debug (f"Krisp turn complete" )
311337 state = EndOfTurnState .COMPLETE
312338 self .clear ()
313339 break
@@ -329,12 +355,15 @@ async def analyze_end_of_turn(self) -> Tuple[EndOfTurnState, Optional[MetricsDat
329355 Tuple containing the end-of-turn state and optional metrics data.
330356 Returns the last state determined by append_audio().
331357 """
332- # For real-time processing, the state is determined in append_audio
333- # Return the last state that was computed
334- return self ._last_state , None
358+ # For real-time processing, the state is determined in append_audio.
359+ # Consume metrics so they aren't pushed twice.
360+ metrics = self ._last_metrics
361+ self ._last_metrics = None
362+ return self ._last_state , metrics
335363
def clear(self):
    """Put the analyzer back into its pre-speech state.

    Empties the buffered audio, forgets that speech was detected and when
    it stopped, and sets the last reported state back to INCOMPLETE.
    """
    self._audio_buffer.clear()
    self._speech_triggered = False
    # A new turn has no "speech stopped" moment yet.
    self._speech_stopped_time = None
    # Pending metrics are not touched here; analyze_end_of_turn() consumes them.
    self._last_state = EndOfTurnState.INCOMPLETE
0 commit comments