Update Camb TTS to 48kHz sample rate

eRuaro · eRuaro · commit e76a3d04f0cf · 2026-01-16T01:18:37.000+08:00
diff --git a/examples/foundational/07zb-interruptible-camb-local.py b/examples/foundational/07zb-interruptible-camb-local.py
@@ -4,36 +4,43 @@
 # SPDX-License-Identifier: BSD 2-Clause License
 #
 
-"""Camb.ai MARS TTS example with local audio (microphone/speakers).
+"""Camb.ai TTS example with local audio (microphone/speakers).
 
 This example demonstrates:
-- Basic TTS synthesis with Camb.ai MARS
+- Camb.ai MARS TTS with streaming audio
 - Local audio input/output (no WebRTC or Daily needed)
-- Handling interruptions
+- TTFB metrics tracking
+- End-to-end latency measurement (user speech → AI response)
 
 Requirements:
 - CAMB_API_KEY environment variable
 - OPENAI_API_KEY environment variable (for LLM)
 - DEEPGRAM_API_KEY environment variable (for STT)
 
 Usage:
-    export CAMB_API_KEY=your_camb_api_key
-    export OPENAI_API_KEY=your_openai_api_key
-    export DEEPGRAM_API_KEY=your_deepgram_api_key
-    python 07zb-interruptible-camb-local.py [--voice-id VOICE_ID]
+    python 07zb-interruptible-camb-local.py
+    python 07zb-interruptible-camb-local.py --voice-id 147320
 """
 
 import argparse
 import asyncio
 import os
 import sys
+import time
 
 from dotenv import load_dotenv
 from loguru import logger
 
 from pipecat.audio.vad.silero import SileroVADAnalyzer
 from pipecat.audio.vad.vad_analyzer import VADParams
-from pipecat.frames.frames import LLMRunFrame
+from pipecat.frames.frames import (
+    BotStartedSpeakingFrame,
+    Frame,
+    LLMFullResponseStartFrame,
+    LLMRunFrame,
+    TTSStartedFrame,
+    UserStoppedSpeakingFrame,
+)
 from pipecat.metrics.metrics import TTFBMetricsData
 from pipecat.observers.loggers.metrics_log_observer import MetricsLogObserver
 from pipecat.pipeline.pipeline import Pipeline
@@ -43,31 +50,116 @@
 from pipecat.processors.aggregators.llm_response_universal import (
     LLMContextAggregatorPair,
 )
+from pipecat.processors.frame_processor import FrameDirection, FrameProcessor
 from pipecat.services.camb.tts import CambTTSService
 from pipecat.services.deepgram.stt import DeepgramSTTService
 from pipecat.services.openai.llm import OpenAILLMService
 from pipecat.transports.local.audio import LocalAudioTransport, LocalAudioTransportParams
 
+
+class LatencyTracker(FrameProcessor):
+    """Logs per-stage and end-to-end latency for one conversational turn.
+
+    A turn starts at ``UserStoppedSpeakingFrame`` and ends at the next
+    ``BotStartedSpeakingFrame``, after which the stored timestamps are cleared.
+    Intervals use ``time.monotonic()`` so that wall-clock adjustments (NTP,
+    manual changes) cannot distort or negate the readings.
+
+    NOTE(review): in the pipeline below this processor sits between the STT
+    service and the user context aggregator, i.e. ahead of the LLM and TTS.
+    ``LLMFullResponseStartFrame`` and ``TTSStartedFrame`` are presumably emitted
+    downstream of that position; if so, only the end-to-end figure is ever
+    populated -- confirm the frame directions.
+    """
+
+    def __init__(self, **kwargs):
+        """Create a tracker with no turn in progress.
+
+        Args:
+            **kwargs: Forwarded unchanged to ``FrameProcessor``.
+        """
+        super().__init__(**kwargs)
+        # None means "not observed yet during the current turn".
+        self._user_stopped_time: float | None = None
+        self._llm_start_time: float | None = None
+        self._tts_start_time: float | None = None
+
+    async def process_frame(self, frame: Frame, direction: FrameDirection):
+        """Record stage timestamps, log derived latencies, and forward the frame.
+
+        Args:
+            frame: The frame passing through this processor.
+            direction: Direction of travel; the frame is pushed on unchanged
+                in this same direction.
+        """
+        await super().process_frame(frame, direction)
+
+        # Single clock read so every figure logged for this frame agrees.
+        now = time.monotonic()
+
+        if isinstance(frame, UserStoppedSpeakingFrame):
+            self._user_stopped_time = now
+            logger.info("⏱️  User stopped speaking - timer started")
+
+        elif isinstance(frame, LLMFullResponseStartFrame):
+            self._llm_start_time = now
+            if self._user_stopped_time is not None:
+                # Spans end of user speech to the start of the LLM response.
+                stt_latency = (now - self._user_stopped_time) * 1000
+                logger.info(f"⏱️  STT latency: {stt_latency:.0f}ms")
+
+        elif isinstance(frame, TTSStartedFrame):
+            self._tts_start_time = now
+            if self._llm_start_time is not None:
+                # Spans start of the LLM response to the start of TTS.
+                llm_latency = (now - self._llm_start_time) * 1000
+                logger.info(f"⏱️  LLM TTFB: {llm_latency:.0f}ms")
+
+        elif isinstance(frame, BotStartedSpeakingFrame):
+            if self._user_stopped_time is not None:
+                total_latency = (now - self._user_stopped_time) * 1000
+                # Reported as 0 when no TTSStartedFrame was seen this turn.
+                tts_latency = (
+                    (now - self._tts_start_time) * 1000
+                    if self._tts_start_time is not None
+                    else 0
+                )
+                logger.info(f"⏱️  TTS TTFB: {tts_latency:.0f}ms")
+                logger.info(f"⏱️  ✨ TOTAL END-TO-END LATENCY: {total_latency:.0f}ms")
+                # Reset for next turn
+                self._user_stopped_time = None
+                self._llm_start_time = None
+                self._tts_start_time = None
+
+        await self.push_frame(frame, direction)
+
 load_dotenv(override=True)
 
 logger.remove(0)
 logger.add(sys.stderr, level="DEBUG")
 
+# Default voice
+DEFAULT_VOICE_ID = 147320
+
 
 async def main(voice_id: int):
+    sample_rate = 48000
+
     # Local audio transport - uses your microphone and speakers
+    # Increase audio_out_10ms_chunks for larger buffer (default is 4 = 40ms)
     transport = LocalAudioTransport(
         LocalAudioTransportParams(
             audio_in_enabled=True,
             audio_out_enabled=True,
+            audio_out_10ms_chunks=10,  # 100ms buffer for smoother playback
             vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
         )
     )
 
     # Deepgram STT for speech recognition
     stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
 
-    # Camb.ai TTS with MARS-flash model (uses official SDK)
+    # Camb.ai TTS (48kHz output)
     tts = CambTTSService(
         api_key=os.getenv("CAMB_API_KEY"),
         voice_id=voice_id,
@@ -81,7 +138,7 @@ async def main(voice_id: int):
     messages = [
         {
             "role": "system",
-            "content": """You are a helpful voice assistant powered by Camb.ai's MARS
+            "content": """You are a helpful voice assistant powered by Camb.ai
 text-to-speech technology. Keep your responses concise and conversational since
 they will be spoken aloud. Avoid special characters, emojis, or bullet points.""",
         },
@@ -91,26 +148,28 @@ async def main(voice_id: int):
     context = LLMContext(messages)
     context_aggregator = LLMContextAggregatorPair(context)
 
+    # Latency tracker for end-to-end timing
+    latency_tracker = LatencyTracker()
+
     # Build the pipeline
     pipeline = Pipeline(
         [
             transport.input(),  # Microphone input
             stt,  # Speech-to-text
+            latency_tracker,  # Track latency at various stages
             context_aggregator.user(),  # User context
             llm,  # Language model
-            tts,  # Camb.ai TTS
+            tts,  # TTS
             transport.output(),  # Speaker output
             context_aggregator.assistant(),  # Assistant context
         ]
     )
 
-    # Create pipeline task
-    # Use 24kHz sample rate to match Camb.ai TTS output
-    # Add MetricsLogObserver to track TTFB metrics
+    # Create pipeline task with TTFB tracking
     task = PipelineTask(
         pipeline,
         params=PipelineParams(
-            audio_out_sample_rate=24000,
+            audio_out_sample_rate=sample_rate,
             enable_metrics=True,
             enable_usage_metrics=True,
         ),
@@ -136,12 +195,12 @@ async def on_pipeline_started(task, frame):
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Camb.ai TTS example with local audio")
+    parser = argparse.ArgumentParser(description="Camb.ai TTS with local audio")
     parser.add_argument(
         "--voice-id",
         type=int,
-        default=147320,
-        help="Camb.ai voice ID to use (default: 147320)",
+        default=DEFAULT_VOICE_ID,
+        help=f"Camb.ai voice ID (default: {DEFAULT_VOICE_ID})",
     )
     args = parser.parse_args()
     asyncio.run(main(args.voice_id))
diff --git a/src/pipecat/services/camb/tts.py b/src/pipecat/services/camb/tts.py
@@ -13,7 +13,7 @@
     - MARS models: mars-flash, mars-pro, mars-instruct
     - 140+ languages supported
     - Real-time streaming via official SDK
-    - 24kHz audio output
+    - 48kHz audio output
     - Voice customization (instructions for mars-instruct)
 """
 
@@ -41,7 +41,7 @@
 DEFAULT_VOICE_ID = 147320
 DEFAULT_LANGUAGE = "en-us"
 DEFAULT_MODEL = "mars-flash"  # Faster inference
-DEFAULT_SAMPLE_RATE = 24000  # 24kHz
+DEFAULT_SAMPLE_RATE = 48000  # 48kHz
 DEFAULT_TIMEOUT = 60.0  # Seconds (minimum recommended by Camb.ai)
 MIN_TEXT_LENGTH = 3
 MAX_TEXT_LENGTH = 3000
@@ -133,6 +133,8 @@ class CambTTSService(TTSService):
     Converts text to speech using Camb.ai's MARS TTS models with support for
     multiple languages. Provides custom instructions support for the mars-instruct model.
 
+    All models output 48kHz audio.
+
     Example::
 
         # Basic usage with defaults
@@ -145,13 +147,13 @@ class CambTTSService(TTSService):
             model="mars-pro",
         )
 
-        # For mars-instruct with custom instructions:
+        # mars-instruct with custom instructions
         tts = CambTTSService(
             api_key="your-api-key",
             model="mars-instruct",
             params=CambTTSService.InputParams(
                 user_instructions="Speak with excitement and energy"
-            )
+            ),
         )
     """
 
@@ -191,7 +193,7 @@ def __init__(
             model: TTS model to use. Options: "mars-flash", "mars-pro", "mars-instruct".
                 Defaults to DEFAULT_MODEL (mars-flash, fastest).
             timeout: Request timeout in seconds. Defaults to DEFAULT_TIMEOUT (60s).
-            sample_rate: Audio sample rate in Hz. If None, uses DEFAULT_SAMPLE_RATE (24kHz).
+            sample_rate: Audio sample rate in Hz. If None, uses DEFAULT_SAMPLE_RATE (48kHz).
             params: Additional voice parameters. If None, uses defaults.
             **kwargs: Additional arguments passed to parent TTSService.
         """
@@ -241,7 +243,7 @@ async def start(self, frame: StartFrame):
             frame: The start frame containing initialization parameters.
         """
         await super().start(frame)
-        # Use Camb.ai's native sample rate if not specified
+        # Use 48kHz sample rate if not explicitly specified
         if not self._init_sample_rate:
             self._sample_rate = DEFAULT_SAMPLE_RATE
         self._settings["sample_rate"] = self._sample_rate
diff --git a/tests/test_camb_tts.py b/tests/test_camb_tts.py
@@ -75,7 +75,7 @@ async def test_run_camb_tts_success():
         audio_frames = [f for f in frames if isinstance(f, TTSAudioRawFrame)]
         assert len(audio_frames) > 0, "Should have at least one audio frame"
 
-        # Verify sample rate matches Camb.ai's output
+        # Verify sample rate matches 48kHz output
         for a_frame in audio_frames:
             assert a_frame.sample_rate == DEFAULT_SAMPLE_RATE
             assert a_frame.num_channels == 1, "Should be mono audio"