Skip to content

Commit 3e6c59c

Browse files
authored
Merge pull request #3809 from pipecat-ai/mb/krisp-viva-result
Add Krisp API key support and debug logging
2 parents 2f60074 + 0ca8c85 commit 3e6c59c

14 files changed

Lines changed: 144 additions & 55 deletions

File tree

changelog/3809.added.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
- Added `TurnMetricsData` as a generic metrics class for turn detection, with e2e processing time measurement. `KrispVivaTurn` now emits `TurnMetricsData` with `e2e_processing_time_ms` tracking the interval from VAD speech-to-silence transition to turn completion.

changelog/3809.changed.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
- Added `api_key` parameter to `KrispVivaSDKManager`, `KrispVivaTurn`, and `KrispVivaFilter` for Krisp SDK v1.6.1+ licensing. Falls back to `KRISP_VIVA_API_KEY` environment variable.

changelog/3809.deprecated.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
- Deprecated `SmartTurnMetricsData` in favor of `TurnMetricsData`. `BaseSmartTurn` now emits `TurnMetricsData` directly.

env.example

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ INWORLD_API_KEY=...
104104
KRISP_MODEL_PATH=...
105105

106106
# Krisp Viva
107+
KRISP_VIVA_API_KEY=...
107108
KRISP_VIVA_FILTER_MODEL_PATH=...
108109
KRISP_VIVA_TURN_MODEL_PATH=...
109110

examples/foundational/07p-interruptible-krisp-viva.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@
3131
from pipecat.audio.turn.krisp_viva_turn import KrispVivaTurn
3232
from pipecat.audio.vad.silero import SileroVADAnalyzer
3333
from pipecat.frames.frames import LLMRunFrame
34+
from pipecat.metrics.metrics import TurnMetricsData
35+
from pipecat.observers.loggers.metrics_log_observer import MetricsLogObserver
3436
from pipecat.pipeline.pipeline import Pipeline
3537
from pipecat.pipeline.runner import PipelineRunner
3638
from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -41,32 +43,37 @@
4143
)
4244
from pipecat.runner.types import RunnerArguments
4345
from pipecat.runner.utils import create_transport
46+
from pipecat.services.cartesia.tts import CartesiaTTSService
4447
from pipecat.services.deepgram.stt import DeepgramSTTService
45-
from pipecat.services.deepgram.tts import DeepgramTTSService
4648
from pipecat.services.openai.llm import OpenAILLMService
4749
from pipecat.transports.base_transport import BaseTransport, TransportParams
4850
from pipecat.transports.daily.transport import DailyParams
4951
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
52+
from pipecat.turns.user_stop import TurnAnalyzerUserTurnStopStrategy
53+
from pipecat.turns.user_turn_strategies import UserTurnStrategies
5054

5155
load_dotenv(override=True)
5256

5357
# We use lambdas to defer transport parameter creation until the transport
5458
# type is selected at runtime.
59+
60+
krisp_viva_filter = KrispVivaFilter()
61+
5562
transport_params = {
5663
"daily": lambda: DailyParams(
5764
audio_in_enabled=True,
5865
audio_out_enabled=True,
59-
audio_in_filter=KrispVivaFilter(),
66+
audio_in_filter=krisp_viva_filter,
6067
),
6168
"twilio": lambda: FastAPIWebsocketParams(
6269
audio_in_enabled=True,
6370
audio_out_enabled=True,
64-
audio_in_filter=KrispVivaFilter(),
71+
audio_in_filter=krisp_viva_filter,
6572
),
6673
"webrtc": lambda: TransportParams(
6774
audio_in_enabled=True,
6875
audio_out_enabled=True,
69-
audio_in_filter=KrispVivaFilter(),
76+
audio_in_filter=krisp_viva_filter,
7077
),
7178
}
7279

@@ -76,7 +83,9 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
7683

7784
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
7885

79-
tts = DeepgramTTSService(api_key=os.getenv("DEEPGRAM_API_KEY"), voice="aura-helios-en")
86+
tts = CartesiaTTSService(
87+
api_key=os.getenv("CARTESIA_API_KEY"), voice_id="71a7ad14-091c-4e8e-a314-022ece01c121"
88+
)
8089

8190
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
8291

@@ -117,6 +126,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
117126
enable_usage_metrics=True,
118127
),
119128
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
129+
observers=[MetricsLogObserver(include_metrics={TurnMetricsData})],
120130
)
121131

122132
@transport.event_handler("on_client_connected")

examples/foundational/38b-smart-turn-local.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212

1313
from pipecat.audio.vad.silero import SileroVADAnalyzer
1414
from pipecat.frames.frames import LLMRunFrame
15+
from pipecat.metrics.metrics import TurnMetricsData
16+
from pipecat.observers.loggers.metrics_log_observer import MetricsLogObserver
1517
from pipecat.pipeline.pipeline import Pipeline
1618
from pipecat.pipeline.runner import PipelineRunner
1719
from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -77,7 +79,6 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
7779
pipeline = Pipeline(
7880
[
7981
transport.input(), # Transport user input
80-
rtvi,
8182
stt,
8283
user_aggregator, # User responses
8384
llm, # LLM
@@ -94,17 +95,15 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
9495
enable_usage_metrics=True,
9596
),
9697
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
98+
observers=[MetricsLogObserver(include_metrics={TurnMetricsData})],
9799
)
98100

99-
@task.rtvi.event_handler("on_client_ready")
100-
async def on_client_ready(rtvi):
101-
# Kick off the conversation
102-
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
103-
await task.queue_frames([LLMRunFrame()])
104-
105101
@transport.event_handler("on_client_connected")
106102
async def on_client_connected(transport, client):
107103
logger.info(f"Client connected")
104+
# Kick off the conversation
105+
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
106+
await task.queue_frames([LLMRunFrame()])
108107

109108
@transport.event_handler("on_client_disconnected")
110109
async def on_client_disconnected(transport, client):

scripts/evals/run-release-evals.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@ def EVAL_VISION_IMAGE(*, eval_speaks_first: bool = False):
123123
("07n-interruptible-google.py", EVAL_SIMPLE_MATH),
124124
("07n-interruptible-google-http.py", EVAL_SIMPLE_MATH),
125125
("07o-interruptible-assemblyai.py", EVAL_SIMPLE_MATH),
126+
("07p-interruptible-krisp-viva.py", EVAL_SIMPLE_MATH),
126127
("07q-interruptible-rime.py", EVAL_SIMPLE_MATH),
127128
("07q-interruptible-rime-http.py", EVAL_SIMPLE_MATH),
128129
("07r-interruptible-nvidia.py", EVAL_SIMPLE_MATH),
@@ -148,8 +149,6 @@ def EVAL_VISION_IMAGE(*, eval_speaks_first: bool = False):
148149
("07zj-interruptible-kokoro.py", EVAL_SIMPLE_MATH),
149150
# Needs a local XTTS docker instance running.
150151
# ("07i-interruptible-xtts.py", EVAL_SIMPLE_MATH),
151-
# Needs a Krisp license.
152-
# ("07p-interruptible-krisp.py", EVAL_SIMPLE_MATH),
153152
]
154153

155154
TESTS_12 = [

src/pipecat/audio/filters/krisp_viva_filter.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,11 @@ class KrispVivaFilter(BaseAudioFilter):
3939
"""
4040

4141
def __init__(
42-
self, model_path: str = None, frame_duration: int = 10, noise_suppression_level: int = 100
42+
self,
43+
model_path: str = None,
44+
frame_duration: int = 10,
45+
noise_suppression_level: int = 100,
46+
api_key: str = "",
4347
) -> None:
4448
"""Initialize the Krisp noise reduction filter.
4549
@@ -48,6 +52,8 @@ def __init__(
4852
If None, uses KRISP_VIVA_FILTER_MODEL_PATH environment variable.
4953
frame_duration: Frame duration in milliseconds.
5054
noise_suppression_level: Noise suppression level.
55+
api_key: Krisp SDK API key. If empty, falls back to
56+
the KRISP_VIVA_API_KEY environment variable.
5157
5258
Raises:
5359
ValueError: If model_path is not provided and KRISP_VIVA_FILTER_MODEL_PATH is not set.
@@ -57,6 +63,8 @@ def __init__(
5763
"""
5864
super().__init__()
5965

66+
self._api_key = api_key
67+
6068
try:
6169
# Set model path, checking environment if not specified
6270
if model_path:
@@ -132,7 +140,7 @@ async def start(self, sample_rate: int):
132140
"""
133141
try:
134142
# Acquire SDK reference (will initialize on first call)
135-
KrispVivaSDKManager.acquire()
143+
KrispVivaSDKManager.acquire(api_key=self._api_key)
136144
self._session = self._create_session(sample_rate, self._frame_duration_ms)
137145
except Exception as e:
138146
logger.error(f"Failed to start Krisp session: {e}", exc_info=True)

src/pipecat/audio/krisp_instance.py

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
"""Krisp Instance manager for pipecat audio."""
88

99
import atexit
10+
import os
1011
from threading import Lock
1112

1213
from loguru import logger
@@ -88,25 +89,46 @@ class KrispVivaSDKManager:
8889
_lock = Lock()
8990
_reference_count = 0
9091

92+
@staticmethod
93+
def _license_callback(error, error_message):
94+
"""Callback for Krisp SDK licensing errors."""
95+
logger.error(f"Krisp licensing error: {error} - {error_message}")
96+
9197
@staticmethod
9298
def _log_callback(log_message, log_level):
9399
"""Thread-safe callback for Krisp SDK logging."""
94100
logger.info(f"[{log_level}] {log_message}")
95101

96102
@classmethod
97-
def acquire(cls):
103+
def acquire(cls, api_key: str = ""):
98104
"""Acquire a reference to the SDK (initializes if needed).
99105
100106
Call this when creating a filter instance.
101107
108+
Args:
109+
api_key: Krisp SDK API key. If empty, falls back to the
110+
KRISP_VIVA_API_KEY environment variable.
111+
102112
Raises:
103113
Exception: If SDK initialization fails (propagated from krisp_audio)
104114
"""
105115
with cls._lock:
106116
# Initialize SDK on first acquire
107117
if cls._reference_count == 0:
108118
try:
109-
krisp_audio.globalInit("", cls._log_callback, krisp_audio.LogLevel.Off)
119+
key = api_key or os.environ.get("KRISP_VIVA_API_KEY", "")
120+
try:
121+
# New SDK signature (requires license key)
122+
krisp_audio.globalInit(
123+
"",
124+
key,
125+
cls._license_callback,
126+
cls._log_callback,
127+
krisp_audio.LogLevel.Off,
128+
)
129+
except TypeError:
130+
# Old SDK signature (no license key)
131+
krisp_audio.globalInit("", cls._log_callback, krisp_audio.LogLevel.Off)
110132

111133
cls._initialized = True
112134

src/pipecat/audio/turn/krisp_viva_turn.py

Lines changed: 34 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
"""
1616

1717
import os
18+
import time
1819
from typing import Optional, Tuple
1920

2021
import numpy as np
@@ -26,7 +27,7 @@
2627
int_to_krisp_sample_rate,
2728
)
2829
from pipecat.audio.turn.base_turn_analyzer import BaseTurnAnalyzer, BaseTurnParams, EndOfTurnState
29-
from pipecat.metrics.metrics import MetricsData
30+
from pipecat.metrics.metrics import MetricsData, TurnMetricsData
3031

3132
try:
3233
import krisp_audio
@@ -63,6 +64,7 @@ def __init__(
6364
model_path: Optional[str] = None,
6465
sample_rate: Optional[int] = None,
6566
params: Optional[KrispTurnParams] = None,
67+
api_key: str = "",
6668
) -> None:
6769
"""Initialize the Krisp turn analyzer.
6870
@@ -72,6 +74,8 @@ def __init__(
7274
sample_rate: Optional initial sample rate for audio processing.
7375
If provided, this will be used as the fixed sample rate.
7476
params: Configuration parameters for turn analysis behavior.
77+
api_key: Krisp SDK API key. If empty, falls back to
78+
the KRISP_VIVA_API_KEY environment variable.
7579
7680
Raises:
7781
ValueError: If model_path is not provided and KRISP_VIVA_TURN_MODEL_PATH is not set.
@@ -83,7 +87,7 @@ def __init__(
8387

8488
# Acquire SDK reference (will initialize on first call)
8589
try:
86-
KrispVivaSDKManager.acquire()
90+
KrispVivaSDKManager.acquire(api_key=api_key)
8791
self._sdk_acquired = True
8892
except Exception as e:
8993
self._sdk_acquired = False
@@ -115,6 +119,9 @@ def __init__(
115119
self._last_probability = None
116120
self._frame_probabilities = []
117121
self._last_state = EndOfTurnState.INCOMPLETE
122+
self._speech_stopped_time: Optional[float] = None
123+
self._e2e_processing_time_ms: Optional[float] = None
124+
self._last_metrics: Optional[TurnMetricsData] = None
118125

119126
# Create session with provided sample rate or default to 16000 Hz
120127
# This preloads the model to improve latency when set_sample_rate is called later
@@ -288,7 +295,14 @@ def append_audio(self, buffer: bytes, is_speech: bool) -> EndOfTurnState:
288295
# Track speech start time
289296
if not self._speech_triggered:
290297
logger.trace("Speech detected, turn analysis started")
298+
self._e2e_processing_time_ms = None
291299
self._speech_triggered = True
300+
# Reset speech stopped time when speech resumes
301+
self._speech_stopped_time = None
302+
else:
303+
# Record the moment speech transitions to non-speech
304+
if self._speech_triggered and self._speech_stopped_time is None:
305+
self._speech_stopped_time = time.perf_counter()
292306
# Note: We don't immediately mark as complete on silence detection.
293307
# Instead, we wait for the model's probability check below to confirm
294308
# end-of-turn based on the threshold.
@@ -308,6 +322,18 @@ def append_audio(self, buffer: bytes, is_speech: bool) -> EndOfTurnState:
308322
# Only mark as complete if we've detected speech and the model
309323
# confirms with sufficient confidence
310324
if self._speech_triggered and prob >= self._params.threshold:
325+
# Calculate e2e processing time: time from speech stop to threshold crossing
326+
if self._speech_stopped_time is not None:
327+
self._e2e_processing_time_ms = (
328+
time.perf_counter() - self._speech_stopped_time
329+
) * 1000
330+
self._last_metrics = TurnMetricsData(
331+
processor="KrispVivaTurn",
332+
is_complete=True,
333+
probability=prob,
334+
e2e_processing_time_ms=self._e2e_processing_time_ms,
335+
)
336+
logger.debug(f"Krisp turn complete")
311337
state = EndOfTurnState.COMPLETE
312338
self.clear()
313339
break
@@ -329,12 +355,15 @@ async def analyze_end_of_turn(self) -> Tuple[EndOfTurnState, Optional[MetricsDat
329355
Tuple containing the end-of-turn state and optional metrics data.
330356
Returns the last state determined by append_audio().
331357
"""
332-
# For real-time processing, the state is determined in append_audio
333-
# Return the last state that was computed
334-
return self._last_state, None
358+
# For real-time processing, the state is determined in append_audio.
359+
# Consume metrics so they aren't pushed twice.
360+
metrics = self._last_metrics
361+
self._last_metrics = None
362+
return self._last_state, metrics
335363

336364
def clear(self):
337365
"""Reset the turn analyzer to its initial state."""
338366
self._speech_triggered = False
339367
self._audio_buffer.clear()
340368
self._last_state = EndOfTurnState.INCOMPLETE
369+
self._speech_stopped_time = None

0 commit comments

Comments (0)