Skip to content

Commit f3b72e9

Browse files
authored
Merge pull request #3585 from pipecat-ai/aleix/improve-piper-tts-support
improve Piper TTS support
2 parents b77a50d + bd00587 commit f3b72e9

12 files changed

Lines changed: 349 additions & 32 deletions

File tree

.github/workflows/coverage.yaml

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,14 @@ jobs:
3333
3434
- name: Install dependencies
3535
run: |
36-
uv sync --group dev --extra anthropic --extra aws --extra google --extra langchain --extra livekit --extra websocket
36+
uv sync --group dev \
37+
--extra anthropic \
38+
--extra aws \
39+
--extra google \
40+
--extra langchain \
41+
--extra livekit \
42+
--extra piper \
43+
--extra websocket
3744
3845
- name: Run tests with coverage
3946
run: |

.github/workflows/tests.yaml

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,14 @@ jobs:
3737
3838
- name: Install dependencies
3939
run: |
40-
uv sync --group dev --extra anthropic --extra aws --extra google --extra langchain --extra livekit --extra websocket
40+
uv sync --group dev \
41+
--extra anthropic \
42+
--extra aws \
43+
--extra google \
44+
--extra langchain \
45+
--extra livekit \
46+
--extra piper \
47+
--extra websocket
4148
4249
- name: Test with pytest
4350
run: |

changelog/3585.added.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
- Added local `PiperTTSService` for offline text-to-speech using Piper voice models. The existing HTTP-based service has been renamed to `PiperHttpTTSService`.

changelog/3585.fixed.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
- Fixed `PiperHttpTTSService` (formerly `PiperTTSService`) to resample audio output based on the model's sample rate parsed from the WAV header.

examples/foundational/01-say-one-thing-piper.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from pipecat.pipeline.task import PipelineTask
1717
from pipecat.runner.types import RunnerArguments
1818
from pipecat.runner.utils import create_transport
19-
from pipecat.services.piper.tts import PiperTTSService
19+
from pipecat.services.piper.tts import PiperHttpTTSService
2020
from pipecat.transports.base_transport import BaseTransport, TransportParams
2121
from pipecat.transports.daily.transport import DailyParams
2222
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
@@ -39,7 +39,7 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
3939

4040
# Create an HTTP session
4141
async with aiohttp.ClientSession() as session:
42-
tts = PiperTTSService(
42+
tts = PiperHttpTTSService(
4343
base_url=os.getenv("PIPER_BASE_URL"), aiohttp_session=session, sample_rate=24000
4444
)
4545

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
#
2+
# Copyright (c) 2024-2026, Daily
3+
#
4+
# SPDX-License-Identifier: BSD 2-Clause License
5+
#
6+
7+
import os
8+
9+
from dotenv import load_dotenv
10+
from loguru import logger
11+
12+
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
13+
from pipecat.audio.vad.silero import SileroVADAnalyzer
14+
from pipecat.audio.vad.vad_analyzer import VADParams
15+
from pipecat.frames.frames import LLMRunFrame
16+
from pipecat.pipeline.pipeline import Pipeline
17+
from pipecat.pipeline.runner import PipelineRunner
18+
from pipecat.pipeline.task import PipelineParams, PipelineTask
19+
from pipecat.processors.aggregators.llm_context import LLMContext
20+
from pipecat.processors.aggregators.llm_response_universal import (
21+
LLMContextAggregatorPair,
22+
LLMUserAggregatorParams,
23+
)
24+
from pipecat.runner.types import RunnerArguments
25+
from pipecat.runner.utils import create_transport
26+
from pipecat.services.deepgram.stt import DeepgramSTTService
27+
from pipecat.services.openai.llm import OpenAILLMService
28+
from pipecat.services.piper.tts import PiperTTSService
29+
from pipecat.transports.base_transport import BaseTransport, TransportParams
30+
from pipecat.transports.daily.transport import DailyParams
31+
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
32+
from pipecat.turns.user_stop import TurnAnalyzerUserTurnStopStrategy
33+
from pipecat.turns.user_turn_strategies import UserTurnStrategies
34+
35+
load_dotenv(override=True)
36+
37+
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
38+
# instantiated. The function will be called when the desired transport gets
39+
# selected.
40+
transport_params = {
41+
"daily": lambda: DailyParams(
42+
audio_in_enabled=True,
43+
audio_out_enabled=True,
44+
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
45+
),
46+
"twilio": lambda: FastAPIWebsocketParams(
47+
audio_in_enabled=True,
48+
audio_out_enabled=True,
49+
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
50+
),
51+
"webrtc": lambda: TransportParams(
52+
audio_in_enabled=True,
53+
audio_out_enabled=True,
54+
vad_analyzer=SileroVADAnalyzer(params=VADParams(stop_secs=0.2)),
55+
),
56+
}
57+
58+
59+
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
60+
logger.info(f"Starting bot")
61+
62+
stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))
63+
64+
tts = PiperTTSService(voice_id="en_US-ryan-high")
65+
66+
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
67+
68+
messages = [
69+
{
70+
"role": "system",
71+
"content": "You are a helpful LLM in a WebRTC call. Your goal is to demonstrate your capabilities in a succinct way. Your output will be spoken aloud, so avoid special characters that can't easily be spoken, such as emojis or bullet points. Respond to what the user said in a creative and helpful way.",
72+
},
73+
]
74+
75+
context = LLMContext(messages)
76+
user_aggregator, assistant_aggregator = LLMContextAggregatorPair(
77+
context,
78+
user_params=LLMUserAggregatorParams(
79+
user_turn_strategies=UserTurnStrategies(
80+
stop=[TurnAnalyzerUserTurnStopStrategy(turn_analyzer=LocalSmartTurnAnalyzerV3())]
81+
),
82+
),
83+
)
84+
85+
pipeline = Pipeline(
86+
[
87+
transport.input(), # Transport user input
88+
stt,
89+
user_aggregator, # User responses
90+
llm, # LLM
91+
tts, # TTS
92+
transport.output(), # Transport bot output
93+
assistant_aggregator, # Assistant spoken responses
94+
]
95+
)
96+
97+
task = PipelineTask(
98+
pipeline,
99+
params=PipelineParams(
100+
enable_metrics=True,
101+
enable_usage_metrics=True,
102+
),
103+
idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
104+
)
105+
106+
@transport.event_handler("on_client_connected")
107+
async def on_client_connected(transport, client):
108+
logger.info(f"Client connected")
109+
# Kick off the conversation.
110+
messages.append({"role": "system", "content": "Please introduce yourself to the user."})
111+
await task.queue_frames([LLMRunFrame()])
112+
113+
@transport.event_handler("on_client_disconnected")
114+
async def on_client_disconnected(transport, client):
115+
logger.info(f"Client disconnected")
116+
await task.cancel()
117+
118+
runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)
119+
120+
await runner.run(task)
121+
122+
123+
async def bot(runner_args: RunnerArguments):
124+
"""Main bot entry point compatible with Pipecat Cloud."""
125+
transport = await create_transport(runner_args, transport_params)
126+
await run_bot(transport, runner_args)
127+
128+
129+
if __name__ == "__main__":
130+
from pipecat.runner.run import main
131+
132+
main()

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ rnnoise = [ "pyrnnoise~=0.4.1" ]
9595
openpipe = [ "openpipe>=4.50.0,<6" ]
9696
openrouter = []
9797
perplexity = []
98+
piper = [ "piper-tts>=1.3.0,<2" ]
9899
playht = [ "pipecat-ai[websockets-base]" ]
99100
qwen = []
100101
remote-smart-turn = []

scripts/evals/run-release-evals.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,7 @@ def EVAL_VISION_IMAGE(*, eval_speaks_first: bool = False):
138138
("07zf-interruptible-gradium.py", EVAL_SIMPLE_MATH),
139139
("07zg-interruptible-camb.py", EVAL_SIMPLE_MATH),
140140
("07zh-interruptible-hathora.py", EVAL_SIMPLE_MATH),
141+
("07zi-interruptible-piper.py", EVAL_SIMPLE_MATH),
141142
# Needs a local XTTS docker instance running.
142143
# ("07i-interruptible-xtts.py", EVAL_SIMPLE_MATH),
143144
# Needs a Krisp license.

src/pipecat/services/piper/tts.py

Lines changed: 131 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,9 @@
66

77
"""Piper TTS service implementation."""
88

9-
from typing import AsyncGenerator, Optional
9+
import asyncio
10+
from pathlib import Path
11+
from typing import AsyncGenerator, AsyncIterator, Optional
1012

1113
import aiohttp
1214
from loguru import logger
@@ -20,11 +22,128 @@
2022
from pipecat.services.tts_service import TTSService
2123
from pipecat.utils.tracing.service_decorators import traced_tts
2224

25+
try:
26+
from piper import PiperVoice
27+
from piper.download_voices import download_voice
28+
except ModuleNotFoundError as e:
29+
logger.error(f"Exception: {e}")
30+
logger.error("In order to use Piper, you need to `pip install pipecat-ai[piper]`.")
31+
raise Exception(f"Missing module: {e}")
32+
2333

24-
# This assumes a running TTS service: https://github.com/OHF-Voice/piper1-gpl/blob/main/docs/API_HTTP.md
2534
class PiperTTSService(TTSService):
2635
"""Piper TTS service implementation.
2736
37+
Provides local text-to-speech synthesis using Piper voice models. Automatically
38+
downloads voice models if not already present and resamples audio output to
39+
match the configured sample rate.
40+
"""
41+
42+
def __init__(
43+
self,
44+
*,
45+
voice_id: str,
46+
download_dir: Optional[Path] = None,
47+
force_redownload: bool = False,
48+
use_cuda: bool = False,
49+
**kwargs,
50+
):
51+
"""Initialize the Piper TTS service.
52+
53+
Args:
54+
voice_id: Piper voice model identifier (e.g. `en_US-ryan-high`).
55+
download_dir: Directory for storing voice model files. Defaults to
56+
the current working directory.
57+
force_redownload: Re-download the voice model even if it already exists.
58+
use_cuda: Use CUDA for GPU-accelerated inference.
59+
**kwargs: Additional arguments passed to the parent `TTSService`.
60+
"""
61+
super().__init__(**kwargs)
62+
63+
self._voice_id = voice_id
64+
65+
download_dir = download_dir or Path.cwd()
66+
67+
model_file = f"{voice_id}.onnx"
68+
model_path = Path(download_dir) / model_file
69+
70+
if not model_path.exists():
71+
logger.debug(f"Downloading Piper '{voice_id}' model")
72+
download_voice(voice_id, download_dir, force_redownload=force_redownload)
73+
74+
logger.debug(f"Loading Piper '{voice_id}' model from {model_path}")
75+
76+
self._voice = PiperVoice.load(model_path, use_cuda=use_cuda)
77+
78+
logger.debug(f"Loaded Piper '{voice_id}' model")
79+
80+
def can_generate_metrics(self) -> bool:
81+
"""Check if this service can generate processing metrics.
82+
83+
Returns:
84+
True, as Piper service supports metrics generation.
85+
"""
86+
return True
87+
88+
@traced_tts
89+
async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
90+
"""Generate speech from text using Piper.
91+
92+
Args:
93+
text: The text to convert to speech.
94+
95+
Yields:
96+
Frame: Audio frames containing the synthesized speech and status frames.
97+
"""
98+
99+
def async_next(it):
100+
try:
101+
return next(it)
102+
except StopIteration:
103+
return None
104+
105+
async def async_iterator(iterator) -> AsyncIterator[bytes]:
106+
while True:
107+
item = await asyncio.to_thread(async_next, iterator)
108+
if item is None:
109+
return
110+
yield item.audio_int16_bytes
111+
112+
logger.debug(f"{self}: Generating TTS [{text}]")
113+
114+
try:
115+
await self.start_ttfb_metrics()
116+
117+
await self.start_tts_usage_metrics(text)
118+
119+
yield TTSStartedFrame()
120+
121+
async for frame in self._stream_audio_frames_from_iterator(
122+
async_iterator(self._voice.synthesize(text)),
123+
in_sample_rate=self._voice.config.sample_rate,
124+
):
125+
await self.stop_ttfb_metrics()
126+
yield frame
127+
except Exception as e:
128+
logger.error(f"{self} exception: {e}")
129+
yield ErrorFrame(error=f"Unknown error occurred: {e}")
130+
finally:
131+
logger.debug(f"{self}: Finished TTS [{text}]")
132+
await self.stop_ttfb_metrics()
133+
yield TTSStoppedFrame()
134+
135+
136+
# This assumes a running TTS service:
137+
# https://github.com/OHF-Voice/piper1-gpl/blob/main/docs/API_HTTP.md
138+
#
139+
# Usage:
140+
#
141+
# $ uv pip install "piper-tts[http]"
142+
# $ uv run python -m piper.http_server -m en_US-ryan-high
143+
#
144+
class PiperHttpTTSService(TTSService):
145+
"""Piper HTTP TTS service implementation.
146+
28147
Provides integration with Piper's HTTP TTS server for text-to-speech
29148
synthesis. Supports streaming audio generation with configurable sample
30149
rates and automatic WAV header removal.
@@ -35,28 +154,26 @@ def __init__(
35154
*,
36155
base_url: str,
37156
aiohttp_session: aiohttp.ClientSession,
38-
# When using Piper, the sample rate of the generated audio depends on the
39-
# voice model being used.
40-
sample_rate: Optional[int] = None,
157+
voice_id: Optional[str] = None,
41158
**kwargs,
42159
):
43160
"""Initialize the Piper TTS service.
44161
45162
Args:
46163
base_url: Base URL for the Piper TTS HTTP server.
47164
aiohttp_session: aiohttp ClientSession for making HTTP requests.
48-
sample_rate: Output sample rate. If None, uses the voice model's native rate.
165+
voice_id: Piper voice model identifier (e.g. `en_US-ryan-high`).
49166
**kwargs: Additional arguments passed to the parent TTSService.
50167
"""
51-
super().__init__(sample_rate=sample_rate, **kwargs)
168+
super().__init__(**kwargs)
52169

53170
if base_url.endswith("/"):
54171
logger.warning("Base URL ends with a slash, this is not allowed.")
55172
base_url = base_url[:-1]
56173

57174
self._base_url = base_url
58175
self._session = aiohttp_session
59-
self._settings = {"base_url": base_url}
176+
self._model_id = voice_id
60177

61178
def can_generate_metrics(self) -> bool:
62179
"""Check if this service can generate processing metrics.
@@ -83,9 +200,12 @@ async def run_tts(self, text: str) -> AsyncGenerator[Frame, None]:
83200
try:
84201
await self.start_ttfb_metrics()
85202

86-
async with self._session.post(
87-
self._base_url, json={"text": text}, headers=headers
88-
) as response:
203+
data = {
204+
"text": text,
205+
"voice": self._model_id,
206+
}
207+
208+
async with self._session.post(self._base_url, json=data, headers=headers) as response:
89209
if response.status != 200:
90210
error = await response.text()
91211
yield ErrorFrame(

0 commit comments

Comments
 (0)