44# SPDX-License-Identifier: BSD 2-Clause License
55#
66
7- """Camb.ai MARS TTS example with local audio (microphone/speakers).
7+ """Camb.ai TTS example with local audio (microphone/speakers).
88
99This example demonstrates:
10- - Basic TTS synthesis with Camb.ai MARS
10+ - Camb.ai MARS TTS with streaming audio
1111- Local audio input/output (no WebRTC or Daily needed)
12- - Handling interruptions
12+ - TTFB metrics tracking
13+ - End-to-end latency measurement (user speech → AI response)
1314
1415Requirements:
1516- CAMB_API_KEY environment variable
1617- OPENAI_API_KEY environment variable (for LLM)
1718- DEEPGRAM_API_KEY environment variable (for STT)
1819
1920Usage:
20- export CAMB_API_KEY=your_camb_api_key
21- export OPENAI_API_KEY=your_openai_api_key
22- export DEEPGRAM_API_KEY=your_deepgram_api_key
23- python 07zb-interruptible-camb-local.py [--voice-id VOICE_ID]
21+ python 07zb-interruptible-camb-local.py
22+ python 07zb-interruptible-camb-local.py --voice-id 147320
2423"""
2524
2625import argparse
2726import asyncio
2827import os
2928import sys
29+ import time
3030
3131from dotenv import load_dotenv
3232from loguru import logger
3333
3434from pipecat .audio .vad .silero import SileroVADAnalyzer
3535from pipecat .audio .vad .vad_analyzer import VADParams
36- from pipecat .frames .frames import LLMRunFrame
36+ from pipecat .frames .frames import (
37+ BotStartedSpeakingFrame ,
38+ Frame ,
39+ LLMFullResponseStartFrame ,
40+ LLMRunFrame ,
41+ TTSStartedFrame ,
42+ UserStoppedSpeakingFrame ,
43+ )
3744from pipecat .metrics .metrics import TTFBMetricsData
3845from pipecat .observers .loggers .metrics_log_observer import MetricsLogObserver
3946from pipecat .pipeline .pipeline import Pipeline
4350from pipecat .processors .aggregators .llm_response_universal import (
4451 LLMContextAggregatorPair ,
4552)
53+ from pipecat .processors .frame_processor import FrameDirection , FrameProcessor
4654from pipecat .services .camb .tts import CambTTSService
4755from pipecat .services .deepgram .stt import DeepgramSTTService
4856from pipecat .services .openai .llm import OpenAILLMService
4957from pipecat .transports .local .audio import LocalAudioTransport , LocalAudioTransportParams
5058
59+
class LatencyTracker(FrameProcessor):
    """Tracks end-to-end latency from user speech to AI audio response.

    Placed in the pipeline after STT, it logs three timings per turn:

    - STT latency: user stopped speaking -> LLM response starts
    - LLM TTFB: LLM response starts -> TTS starts
    - TTS TTFB and total end-to-end latency: ... -> bot audio starts

    Timestamps are taken with ``time.monotonic()`` so interval math is
    immune to wall-clock adjustments (NTP, DST), which would corrupt
    ``time.time()``-based measurements.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Per-turn timestamps; 0 means "not observed yet this turn".
        self._user_stopped_time: float = 0
        self._llm_start_time: float = 0
        self._tts_start_time: float = 0

    def _reset(self) -> None:
        """Clear per-turn timestamps so the next turn starts fresh."""
        self._user_stopped_time = 0
        self._llm_start_time = 0
        self._tts_start_time = 0

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        await super().process_frame(frame, direction)

        if isinstance(frame, UserStoppedSpeakingFrame):
            self._user_stopped_time = time.monotonic()
            logger.info("⏱️ User stopped speaking - timer started")

        elif isinstance(frame, LLMFullResponseStartFrame):
            self._llm_start_time = time.monotonic()
            if self._user_stopped_time > 0:
                stt_latency = (self._llm_start_time - self._user_stopped_time) * 1000
                logger.info(f"⏱️ STT latency: {stt_latency:.0f}ms")

        elif isinstance(frame, TTSStartedFrame):
            self._tts_start_time = time.monotonic()
            if self._llm_start_time > 0:
                llm_latency = (self._tts_start_time - self._llm_start_time) * 1000
                logger.info(f"⏱️ LLM TTFB: {llm_latency:.0f}ms")

        elif isinstance(frame, BotStartedSpeakingFrame):
            # Use a single timestamp for both figures so they are consistent.
            now = time.monotonic()
            if self._user_stopped_time > 0:
                total_latency = (now - self._user_stopped_time) * 1000
                tts_latency = (now - self._tts_start_time) * 1000 if self._tts_start_time > 0 else 0
                logger.info(f"⏱️ TTS TTFB: {tts_latency:.0f}ms")
                logger.info(f"⏱️ ✨ TOTAL END-TO-END LATENCY: {total_latency:.0f}ms")
            # Reset unconditionally: even on a bot turn with no preceding
            # user speech (e.g. the opening greeting), stale LLM/TTS
            # timestamps must not leak into the next turn's measurements.
            self._reset()

        await self.push_frame(frame, direction)
100+
# Load API keys (CAMB_API_KEY, OPENAI_API_KEY, DEEPGRAM_API_KEY) from a
# .env file; override=True lets .env values win over the existing environment.
load_dotenv(override=True)

# Replace loguru's default handler with a DEBUG-level sink on stderr.
logger.remove(0)
logger.add(sys.stderr, level="DEBUG")

# Default Camb.ai voice used when --voice-id is not given on the CLI.
DEFAULT_VOICE_ID = 147320
108+
56109
57110async def main (voice_id : int ):
111+ sample_rate = 48000
112+
58113 # Local audio transport - uses your microphone and speakers
114+ # Increase audio_out_10ms_chunks for larger buffer (default is 4 = 40ms)
59115 transport = LocalAudioTransport (
60116 LocalAudioTransportParams (
61117 audio_in_enabled = True ,
62118 audio_out_enabled = True ,
119+ audio_out_10ms_chunks = 10 , # 100ms buffer for smoother playback
63120 vad_analyzer = SileroVADAnalyzer (params = VADParams (stop_secs = 0.2 )),
64121 )
65122 )
66123
67124 # Deepgram STT for speech recognition
68125 stt = DeepgramSTTService (api_key = os .getenv ("DEEPGRAM_API_KEY" ))
69126
70- # Camb.ai TTS with MARS-flash model (uses official SDK )
127+ # Camb.ai TTS (48kHz output )
71128 tts = CambTTSService (
72129 api_key = os .getenv ("CAMB_API_KEY" ),
73130 voice_id = voice_id ,
@@ -81,7 +138,7 @@ async def main(voice_id: int):
81138 messages = [
82139 {
83140 "role" : "system" ,
84- "content" : """You are a helpful voice assistant powered by Camb.ai's MARS
141+ "content" : """You are a helpful voice assistant powered by Camb.ai
85142text-to-speech technology. Keep your responses concise and conversational since
86143they will be spoken aloud. Avoid special characters, emojis, or bullet points.""" ,
87144 },
@@ -91,26 +148,28 @@ async def main(voice_id: int):
91148 context = LLMContext (messages )
92149 context_aggregator = LLMContextAggregatorPair (context )
93150
151+ # Latency tracker for end-to-end timing
152+ latency_tracker = LatencyTracker ()
153+
94154 # Build the pipeline
95155 pipeline = Pipeline (
96156 [
97157 transport .input (), # Microphone input
98158 stt , # Speech-to-text
159+ latency_tracker , # Track latency at various stages
99160 context_aggregator .user (), # User context
100161 llm , # Language model
101- tts , # Camb.ai TTS
162+ tts , # TTS
102163 transport .output (), # Speaker output
103164 context_aggregator .assistant (), # Assistant context
104165 ]
105166 )
106167
107- # Create pipeline task
108- # Use 24kHz sample rate to match Camb.ai TTS output
109- # Add MetricsLogObserver to track TTFB metrics
168+ # Create pipeline task with TTFB tracking
110169 task = PipelineTask (
111170 pipeline ,
112171 params = PipelineParams (
113- audio_out_sample_rate = 24000 ,
172+ audio_out_sample_rate = sample_rate ,
114173 enable_metrics = True ,
115174 enable_usage_metrics = True ,
116175 ),
@@ -136,12 +195,12 @@ async def on_pipeline_started(task, frame):
136195
137196
if __name__ == "__main__":
    # CLI entry point: the only option is which Camb.ai voice to synthesize with.
    parser = argparse.ArgumentParser(description="Camb.ai TTS with local audio")
    parser.add_argument(
        "--voice-id",
        type=int,
        default=DEFAULT_VOICE_ID,
        help=f"Camb.ai voice ID (default: {DEFAULT_VOICE_ID})",
    )
    args = parser.parse_args()
    # main() is a coroutine; asyncio.run drives the whole pipeline to completion.
    asyncio.run(main(args.voice_id))
0 commit comments