Skip to content

Commit e76a3d0

Browse files
committed
Update Camb TTS to 48kHz sample rate
1 parent 641d170 commit e76a3d0

3 files changed

Lines changed: 86 additions & 25 deletions

File tree

examples/foundational/07zb-interruptible-camb-local.py

Lines changed: 77 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -4,36 +4,43 @@
44
# SPDX-License-Identifier: BSD 2-Clause License
55
#
66

7-
"""Camb.ai MARS TTS example with local audio (microphone/speakers).
7+
"""Camb.ai TTS example with local audio (microphone/speakers).
88
99
This example demonstrates:
10-
- Basic TTS synthesis with Camb.ai MARS
10+
- Camb.ai MARS TTS with streaming audio
1111
- Local audio input/output (no WebRTC or Daily needed)
12-
- Handling interruptions
12+
- TTFB metrics tracking
13+
- End-to-end latency measurement (user speech → AI response)
1314
1415
Requirements:
1516
- CAMB_API_KEY environment variable
1617
- OPENAI_API_KEY environment variable (for LLM)
1718
- DEEPGRAM_API_KEY environment variable (for STT)
1819
1920
Usage:
20-
export CAMB_API_KEY=your_camb_api_key
21-
export OPENAI_API_KEY=your_openai_api_key
22-
export DEEPGRAM_API_KEY=your_deepgram_api_key
23-
python 07zb-interruptible-camb-local.py [--voice-id VOICE_ID]
21+
python 07zb-interruptible-camb-local.py
22+
python 07zb-interruptible-camb-local.py --voice-id 147320
2423
"""
2524

2625
import argparse
2726
import asyncio
2827
import os
2928
import sys
29+
import time
3030

3131
from dotenv import load_dotenv
3232
from loguru import logger
3333

3434
from pipecat.audio.vad.silero import SileroVADAnalyzer
3535
from pipecat.audio.vad.vad_analyzer import VADParams
36-
from pipecat.frames.frames import LLMRunFrame
36+
from pipecat.frames.frames import (
37+
BotStartedSpeakingFrame,
38+
Frame,
39+
LLMFullResponseStartFrame,
40+
LLMRunFrame,
41+
TTSStartedFrame,
42+
UserStoppedSpeakingFrame,
43+
)
3744
from pipecat.metrics.metrics import TTFBMetricsData
3845
from pipecat.observers.loggers.metrics_log_observer import MetricsLogObserver
3946
from pipecat.pipeline.pipeline import Pipeline
@@ -43,31 +50,81 @@
4350
from pipecat.processors.aggregators.llm_response_universal import (
4451
LLMContextAggregatorPair,
4552
)
53+
from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
4654
from pipecat.services.camb.tts import CambTTSService
4755
from pipecat.services.deepgram.stt import DeepgramSTTService
4856
from pipecat.services.openai.llm import OpenAILLMService
4957
from pipecat.transports.local.audio import LocalAudioTransport, LocalAudioTransportParams
5058

59+
60+
class LatencyTracker(FrameProcessor):
    """Log per-stage and end-to-end latency for each conversational turn.

    The tracker timestamps four frame types as they pass through it and logs
    the elapsed time between consecutive stages:

    - ``UserStoppedSpeakingFrame``: starts the timer for a new turn.
    - ``LLMFullResponseStartFrame``: logs the time since the user stopped
      speaking (labelled "STT latency").
    - ``TTSStartedFrame``: logs the time since the LLM response started
      (labelled "LLM TTFB").
    - ``BotStartedSpeakingFrame``: logs the time since TTS started
      (labelled "TTS TTFB") plus the total end-to-end latency, then clears
      the state for the next turn.

    Every frame is forwarded unchanged in the direction it arrived.

    Intervals are measured with ``time.monotonic()`` so that wall-clock
    adjustments cannot produce negative or inflated readings.

    NOTE(review): the tracker can only time frames that are actually routed
    through it. Confirm that its position in the pipeline lets the LLM and
    TTS frames reach it; otherwise only the total latency will be reported.
    """

    def __init__(self, **kwargs):
        """Initialize the tracker with no turn in progress.

        Args:
            **kwargs: Forwarded unchanged to ``FrameProcessor.__init__``.
        """
        super().__init__(**kwargs)
        # ``None`` means "not observed during the current turn".
        self._user_stopped_time: float | None = None
        self._llm_start_time: float | None = None
        self._tts_start_time: float | None = None

    @staticmethod
    def _elapsed_ms(start: float, end: float) -> float:
        """Return the interval between two monotonic timestamps in ms."""
        return (end - start) * 1000

    def _clear_stage_times(self) -> None:
        """Forget the LLM/TTS timestamps recorded for the current turn."""
        self._llm_start_time = None
        self._tts_start_time = None

    async def process_frame(self, frame: Frame, direction: FrameDirection):
        """Record timing for latency-relevant frames and forward the frame.

        Args:
            frame: The frame travelling through the pipeline.
            direction: The direction in which the frame is travelling.
        """
        await super().process_frame(frame, direction)

        if isinstance(frame, UserStoppedSpeakingFrame):
            # A new turn begins: drop stage timestamps left over from a turn
            # that never reached BotStartedSpeakingFrame (e.g. interrupted).
            self._clear_stage_times()
            self._user_stopped_time = time.monotonic()
            logger.info("⏱️ User stopped speaking - timer started")

        elif isinstance(frame, LLMFullResponseStartFrame):
            self._llm_start_time = time.monotonic()
            if self._user_stopped_time is not None:
                stt_latency = self._elapsed_ms(self._user_stopped_time, self._llm_start_time)
                logger.info(f"⏱️ STT latency: {stt_latency:.0f}ms")

        elif isinstance(frame, TTSStartedFrame):
            self._tts_start_time = time.monotonic()
            if self._llm_start_time is not None:
                llm_latency = self._elapsed_ms(self._llm_start_time, self._tts_start_time)
                logger.info(f"⏱️ LLM TTFB: {llm_latency:.0f}ms")

        elif isinstance(frame, BotStartedSpeakingFrame):
            if self._user_stopped_time is not None:
                # Take one timestamp so both figures refer to the same instant.
                now = time.monotonic()
                if self._tts_start_time is not None:
                    # Only report TTS TTFB when the TTS start was observed;
                    # a fabricated "0ms" would be misleading.
                    tts_latency = self._elapsed_ms(self._tts_start_time, now)
                    logger.info(f"⏱️ TTS TTFB: {tts_latency:.0f}ms")
                total_latency = self._elapsed_ms(self._user_stopped_time, now)
                logger.info(f"⏱️ ✨ TOTAL END-TO-END LATENCY: {total_latency:.0f}ms")
                # Reset for next turn
                self._user_stopped_time = None
                self._clear_stage_times()

        await self.push_frame(frame, direction)
100+
51101
load_dotenv(override=True)
52102

53103
logger.remove(0)
54104
logger.add(sys.stderr, level="DEBUG")
55105

106+
# Default voice
107+
DEFAULT_VOICE_ID = 147320
108+
56109

57110
async def main(voice_id: int):
111+
sample_rate = 48000
112+
58113
# Local audio transport - uses your microphone and speakers
114+
# Increase audio_out_10ms_chunks for larger buffer (default is 4 = 40ms)
59115
transport = LocalAudioTransport(
60116
LocalAudioTransportParams(
61117
audio_in_enabled=True,
62118
audio_out_enabled=True,
119+
audio_out_10ms_chunks=10, # 100ms buffer for smoother playback
63120
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
64121
)
65122
)
66123

67124
# Deepgram STT for speech recognition
68125
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
69126

70-
# Camb.ai TTS with MARS-flash model (uses official SDK)
127+
# Camb.ai TTS (48kHz output)
71128
tts = CambTTSService(
72129
api_key=os.getenv("CAMB_API_KEY"),
73130
voice_id=voice_id,
@@ -81,7 +138,7 @@ async def main(voice_id: int):
81138
messages = [
82139
{
83140
"role": "system",
84-
"content": """You are a helpful voice assistant powered by Camb.ai's MARS
141+
"content": """You are a helpful voice assistant powered by Camb.ai
85142
text-to-speech technology. Keep your responses concise and conversational since
86143
they will be spoken aloud. Avoid special characters, emojis, or bullet points.""",
87144
},
@@ -91,26 +148,28 @@ async def main(voice_id: int):
91148
context = LLMContext(messages)
92149
context_aggregator = LLMContextAggregatorPair(context)
93150

151+
# Latency tracker for end-to-end timing
152+
latency_tracker = LatencyTracker()
153+
94154
# Build the pipeline
95155
pipeline = Pipeline(
96156
[
97157
transport.input(), # Microphone input
98158
stt, # Speech-to-text
159+
latency_tracker, # Track latency at various stages
99160
context_aggregator.user(), # User context
100161
llm, # Language model
101-
tts, # Camb.ai TTS
162+
tts, # TTS
102163
transport.output(), # Speaker output
103164
context_aggregator.assistant(), # Assistant context
104165
]
105166
)
106167

107-
# Create pipeline task
108-
# Use 24kHz sample rate to match Camb.ai TTS output
109-
# Add MetricsLogObserver to track TTFB metrics
168+
# Create pipeline task with TTFB tracking
110169
task = PipelineTask(
111170
pipeline,
112171
params=PipelineParams(
113-
audio_out_sample_rate=24000,
172+
audio_out_sample_rate=sample_rate,
114173
enable_metrics=True,
115174
enable_usage_metrics=True,
116175
),
@@ -136,12 +195,12 @@ async def on_pipeline_started(task, frame):
136195

137196

138197
if __name__ == "__main__":
139-
parser = argparse.ArgumentParser(description="Camb.ai TTS example with local audio")
198+
parser = argparse.ArgumentParser(description="Camb.ai TTS with local audio")
140199
parser.add_argument(
141200
"--voice-id",
142201
type=int,
143-
default=147320,
144-
help="Camb.ai voice ID to use (default: 147320)",
202+
default=DEFAULT_VOICE_ID,
203+
help=f"Camb.ai voice ID (default: {DEFAULT_VOICE_ID})",
145204
)
146205
args = parser.parse_args()
147206
asyncio.run(main(args.voice_id))

src/pipecat/services/camb/tts.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
- MARS models: mars-flash, mars-pro, mars-instruct
1414
- 140+ languages supported
1515
- Real-time streaming via official SDK
16-
- 24kHz audio output
16+
- 48kHz audio output
1717
- Voice customization (instructions for mars-instruct)
1818
"""
1919

@@ -41,7 +41,7 @@
4141
DEFAULT_VOICE_ID = 147320
4242
DEFAULT_LANGUAGE = "en-us"
4343
DEFAULT_MODEL = "mars-flash" # Faster inference
44-
DEFAULT_SAMPLE_RATE = 24000 # 24kHz
44+
DEFAULT_SAMPLE_RATE = 48000 # 48kHz
4545
DEFAULT_TIMEOUT = 60.0 # Seconds (minimum recommended by Camb.ai)
4646
MIN_TEXT_LENGTH = 3
4747
MAX_TEXT_LENGTH = 3000
@@ -133,6 +133,8 @@ class CambTTSService(TTSService):
133133
Converts text to speech using Camb.ai's MARS TTS models with support for
134134
multiple languages. Provides custom instructions support for the mars-instruct model.
135135
136+
All models output 48kHz audio.
137+
136138
Example::
137139
138140
# Basic usage with defaults
@@ -145,13 +147,13 @@ class CambTTSService(TTSService):
145147
model="mars-pro",
146148
)
147149
148-
# For mars-instruct with custom instructions:
150+
# mars-instruct with custom instructions
149151
tts = CambTTSService(
150152
api_key="your-api-key",
151153
model="mars-instruct",
152154
params=CambTTSService.InputParams(
153155
user_instructions="Speak with excitement and energy"
154-
)
156+
),
155157
)
156158
"""
157159

@@ -191,7 +193,7 @@ def __init__(
191193
model: TTS model to use. Options: "mars-flash", "mars-pro", "mars-instruct".
192194
Defaults to DEFAULT_MODEL (mars-flash, fastest).
193195
timeout: Request timeout in seconds. Defaults to DEFAULT_TIMEOUT (60s).
194-
sample_rate: Audio sample rate in Hz. If None, uses DEFAULT_SAMPLE_RATE (24kHz).
196+
sample_rate: Audio sample rate in Hz. If None, uses DEFAULT_SAMPLE_RATE (48kHz).
195197
params: Additional voice parameters. If None, uses defaults.
196198
**kwargs: Additional arguments passed to parent TTSService.
197199
"""
@@ -241,7 +243,7 @@ async def start(self, frame: StartFrame):
241243
frame: The start frame containing initialization parameters.
242244
"""
243245
await super().start(frame)
244-
# Use Camb.ai's native sample rate if not specified
246+
# Use 48kHz sample rate if not explicitly specified
245247
if not self._init_sample_rate:
246248
self._sample_rate = DEFAULT_SAMPLE_RATE
247249
self._settings["sample_rate"] = self._sample_rate

tests/test_camb_tts.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ async def test_run_camb_tts_success():
7575
audio_frames = [f for f in frames if isinstance(f, TTSAudioRawFrame)]
7676
assert len(audio_frames) > 0, "Should have at least one audio frame"
7777

78-
# Verify sample rate matches Camb.ai's output
78+
# Verify sample rate matches 48kHz output
7979
for a_frame in audio_frames:
8080
assert a_frame.sample_rate == DEFAULT_SAMPLE_RATE
8181
assert a_frame.num_channels == 1, "Should be mono audio"

0 commit comments

Comments
 (0)