Skip to content

Commit ad4c22c

Browse files
authored
Merge pull request pipecat-ai#3316 from pipecat-ai/aleix/llm-user-aggreagtor-enable-interruptions
turns(user): add support for enabling/disabling interruptions
2 parents e22a6c9 + 9fe99ed commit ad4c22c

15 files changed

Lines changed: 227 additions & 65 deletions

changelog/3297.deprecated.md

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,12 @@
1-
- `PipelineParams.allow_interruptions` is now deprecated, use `LLMUserAggregator`'s new parameter `user_mute_strategies` instead.
1+
- `PipelineParams.allow_interruptions` is now deprecated, use `LLMUserAggregator`'s new parameter `turn_start_strategies` instead. For example, to disable interruptions but still get user turns you can do:
2+
3+
```python
4+
context_aggregator = LLMContextAggregatorPair(
5+
context,
6+
user_params=LLMUserAggregatorParams(
7+
turn_start_strategies=TurnStartStrategies(
8+
user=[TranscriptionUserTurnStartStrategy(enable_interruptions=False)],
9+
),
10+
),
11+
)
12+
```

changelog/3316.added.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
- Added `enable_interruptions` constructor argument to all user turn strategies. This tells the `LLMUserAggregator` to push or not push an `InterruptionFrame`.

changelog/3316.other.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
- Added `52-live-transcription.py` foundational example demonstrating live transcription and translation from English to Spanish. In this example, the bot is not interruptible: as the user continues speaking, English transcriptions are queued, and the bot continuously translates and speaks each queued sentence in Spanish without being interrupted by new user speech.
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
#
2+
# Copyright (c) 2024–2025, Daily
3+
#
4+
# SPDX-License-Identifier: BSD 2-Clause License
5+
#
6+
7+
8+
import os
9+
10+
from dotenv import load_dotenv
11+
from loguru import logger
12+
13+
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
14+
from pipecat.audio.vad.silero import SileroVADAnalyzer
15+
from pipecat.audio.vad.vad_analyzer import VADParams
16+
from pipecat.pipeline.pipeline import Pipeline
17+
from pipecat.pipeline.runner import PipelineRunner
18+
from pipecat.pipeline.task import PipelineParams, PipelineTask
19+
from pipecat.processors.aggregators.llm_context import LLMContext
20+
from pipecat.processors.aggregators.llm_response_universal import (
21+
LLMContextAggregatorPair,
22+
LLMUserAggregatorParams,
23+
)
24+
from pipecat.runner.types import RunnerArguments
25+
from pipecat.runner.utils import create_transport
26+
from pipecat.services.cartesia.tts import CartesiaTTSService
27+
from pipecat.services.deepgram.stt import DeepgramSTTService
28+
from pipecat.services.openai.llm import OpenAILLMService
29+
from pipecat.transports.base_transport import BaseTransport, TransportParams
30+
from pipecat.transports.daily.transport import DailyParams
31+
from pipecat.transports.websocket.fastapi import FastAPIWebsocketParams
32+
from pipecat.turns.bot import TurnAnalyzerBotTurnStartStrategy
33+
from pipecat.turns.turn_start_strategies import TurnStartStrategies
34+
from pipecat.turns.user import TranscriptionUserTurnStartStrategy
35+
36+
load_dotenv(override=True)
37+
38+
39+
# Transport parameter factories. Each entry is a zero-argument callable so
# that heavyweight objects (e.g. SileroVADAnalyzer) are only instantiated
# for the transport that is actually selected at runtime.
def _make_vad_analyzer():
    # All transports share the same VAD configuration: a short 0.2s stop
    # threshold so user speech segments are detected quickly.
    return SileroVADAnalyzer(params=VADParams(stop_secs=0.2))


transport_params = {
    "daily": lambda: DailyParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
        vad_analyzer=_make_vad_analyzer(),
    ),
    "twilio": lambda: FastAPIWebsocketParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
        vad_analyzer=_make_vad_analyzer(),
    ),
    "webrtc": lambda: TransportParams(
        audio_in_enabled=True,
        audio_out_enabled=True,
        vad_analyzer=_make_vad_analyzer(),
    ),
}
59+
60+
61+
async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
    """Run the live English-to-Spanish transcription/translation bot.

    Builds a pipeline of STT -> user context aggregation -> LLM -> TTS.
    Interruptions are disabled on the user turn strategy, so while the user
    keeps speaking, English transcriptions queue up and the bot continues
    translating and speaking each queued sentence in Spanish without being
    cut off by new user speech.

    Args:
        transport: Audio input/output transport (Daily, Twilio, or WebRTC).
        runner_args: Runner configuration (idle timeout, signal handling).
    """
    logger.info("Starting bot")

    stt = DeepgramSTTService(api_key=os.getenv("DEEPGRAM_API_KEY"))

    tts = CartesiaTTSService(
        api_key=os.getenv("CARTESIA_API_KEY"),
        voice_id="d4db5fb9-f44b-4bd1-85fa-192e0f0d75f9",  # Spanish-speaking Lady
    )

    llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))

    messages = [
        {
            "role": "system",
            "content": "You are a live translation assistant. Your sole purpose is to translate English text into Spanish. When you receive English text from the user, immediately translate it into natural, fluent Spanish. Do not add explanations, commentary, or extra information—only provide the Spanish translation of the text you receive.",
        },
    ]

    context = LLMContext(messages)

    # We use the TranscriptionUserTurnStartStrategy to start a new user turn
    # every time a transcription is received. We disable interruptions, so the
    # user can continue speaking while the bot is transcribing, without
    # interrupting the bot.
    context_aggregator = LLMContextAggregatorPair(
        context,
        user_params=LLMUserAggregatorParams(
            turn_start_strategies=TurnStartStrategies(
                user=[TranscriptionUserTurnStartStrategy(enable_interruptions=False)],
                bot=[TurnAnalyzerBotTurnStartStrategy(turn_analyzer=LocalSmartTurnAnalyzerV3())],
            ),
        ),
    )

    pipeline = Pipeline(
        [
            transport.input(),  # Transport user input
            stt,  # STT
            context_aggregator.user(),  # User responses
            llm,  # LLM
            tts,  # TTS (bot will speak the chosen language)
            transport.output(),  # Transport bot output
            context_aggregator.assistant(),  # Assistant spoken responses
        ]
    )

    task = PipelineTask(
        pipeline,
        params=PipelineParams(
            enable_metrics=True,
            enable_usage_metrics=True,
        ),
        idle_timeout_secs=runner_args.pipeline_idle_timeout_secs,
    )

    @transport.event_handler("on_client_connected")
    async def on_client_connected(transport, client):
        logger.info("Client connected")

    @transport.event_handler("on_client_disconnected")
    async def on_client_disconnected(transport, client):
        logger.info("Client disconnected")
        # Tear down the pipeline once the client leaves.
        await task.cancel()

    runner = PipelineRunner(handle_sigint=runner_args.handle_sigint)

    await runner.run(task)
129+
130+
131+
async def bot(runner_args: RunnerArguments):
    """Main bot entry point compatible with Pipecat Cloud.

    Resolves a transport from the runner arguments and hands control to
    ``run_bot``.
    """
    selected_transport = await create_transport(runner_args, transport_params)
    await run_bot(selected_transport, runner_args)
135+
136+
137+
if __name__ == "__main__":
    # Import locally so the runner dependency is only loaded when this file
    # is executed as a script.
    from pipecat.runner.run import main

    main()

src/pipecat/pipeline/task.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ class PipelineParams(BaseModel):
107107
allow_interruptions: Whether to allow pipeline interruptions.
108108
109109
.. deprecated:: 0.0.99
110-
Use `LLMUserAggregator`'s new `user_mute_strategies` parameter instead.
110+
Use `LLMUserAggregator`'s new `turn_start_strategies` parameter instead.
111111
112112
audio_in_sample_rate: Input audio sample rate in Hz.
113113
audio_out_sample_rate: Output audio sample rate in Hz.

src/pipecat/processors/aggregators/llm_response_universal.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -561,8 +561,11 @@ async def _trigger_user_turn_start(
561561
await s.reset()
562562

563563
if params.enable_user_speaking_frames:
564-
# TODO(aleix): These frames should really come from the top of the pipeline.
564+
# TODO(aleix): This frame should really come from the top of the pipeline.
565565
await self.broadcast_frame(UserStartedSpeakingFrame)
566+
567+
if params.enable_interruptions:
568+
# TODO(aleix): This frame should really come from the top of the pipeline.
566569
await self.broadcast_frame(InterruptionFrame)
567570

568571
await self._call_event_handler("on_user_turn_started", strategy)

src/pipecat/turns/bot/external_bot_turn_start_strategy.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,14 +29,15 @@ class ExternalBotTurnStartStrategy(BaseBotTurnStartStrategy):
2929
3030
"""
3131

32-
def __init__(self, *, timeout: float = 0.5):
32+
def __init__(self, *, timeout: float = 0.5, **kwargs):
3333
"""Initialize the external bot turn start strategy.
3434
3535
Args:
3636
timeout: A short delay used internally to handle consecutive or
3737
slightly delayed transcriptions.
38+
**kwargs: Additional keyword arguments.
3839
"""
39-
super().__init__(enable_user_speaking_frames=False)
40+
super().__init__(enable_user_speaking_frames=False, **kwargs)
4041
self._timeout = timeout
4142
self._text = ""
4243
self._user_speaking = False

src/pipecat/turns/bot/transcription_bot_turn_start_strategy.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,14 +28,15 @@ class TranscriptionBotTurnStartStrategy(BaseBotTurnStartStrategy):
2828
multiple or delayed transcription frames gracefully.
2929
"""
3030

31-
def __init__(self, *, timeout: float = 0.5):
31+
def __init__(self, *, timeout: float = 0.5, **kwargs):
3232
"""Initialize the transcription-based bot turn start strategy.
3333
3434
Args:
3535
timeout: A short delay used internally to handle consecutive or
3636
slightly delayed transcriptions.
37+
**kwargs: Additional keyword arguments.
3738
"""
38-
super().__init__()
39+
super().__init__(**kwargs)
3940
self._timeout = timeout
4041
self._text = ""
4142
self._vad_user_speaking = False

src/pipecat/turns/bot/turn_analyzer_bot_turn_start_strategy.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,14 +35,15 @@ class TurnAnalyzerBotTurnStartStrategy(BaseBotTurnStartStrategy):
3535
3636
"""
3737

38-
def __init__(self, *, turn_analyzer: BaseTurnAnalyzer, timeout: float = 0.5):
38+
def __init__(self, *, turn_analyzer: BaseTurnAnalyzer, timeout: float = 0.5, **kwargs):
3939
"""Initialize the bot turn start strategy.
4040
4141
Args:
4242
turn_analyzer: The turn detection analyzer instance to detect end of user turn.
4343
timeout: Short delay used internally to handle frame timing and event triggering.
44+
**kwargs: Additional keyword arguments.
4445
"""
45-
super().__init__()
46+
super().__init__(**kwargs)
4647
self._turn_analyzer = turn_analyzer
4748
self._timeout = timeout
4849
self._text = ""

src/pipecat/turns/user/base_user_turn_start_strategy.py

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ class UserTurnStartedParams:
3232
3333
"""
3434

35+
enable_interruptions: bool
3536
enable_user_speaking_frames: bool
3637

3738

@@ -49,18 +50,27 @@ class BaseUserTurnStartStrategy(BaseObject):
4950
- `on_user_turn_started`: Signals that a user turn has started.
5051
"""
5152

52-
def __init__(self, *, enable_user_speaking_frames: bool = True, **kwargs):
53+
def __init__(
54+
self,
55+
*,
56+
enable_interruptions: bool = True,
57+
enable_user_speaking_frames: bool = True,
58+
**kwargs,
59+
):
5360
"""Initialize the base user turn start strategy.
5461
5562
Args:
56-
enable_user_speaking_frames: If True, the aggregator will emit frames
57-
indicating when the user starts speaking, as well as interruption
58-
frames. This is enabled by default, but you may want to disable it
59-
if another component (e.g., an STT service) is already generating
60-
these frames.
63+
enable_interruptions: If True, the user aggregator will emit an
64+
interruption frame when the user turn starts.
65+
enable_user_speaking_frames: If True, the user aggregator will emit
66+
frames indicating when the user starts speaking, as well as
67+
interruption frames. This is enabled by default, but you may want
68+
to disable it if another component (e.g., an STT service) is
69+
already generating these frames.
6170
**kwargs: Additional keyword arguments.
6271
"""
6372
super().__init__(**kwargs)
73+
self._enable_interruptions = enable_interruptions
6474
self._enable_user_speaking_frames = enable_user_speaking_frames
6575
self._task_manager: Optional[BaseTaskManager] = None
6676
self._register_event_handler("on_push_frame", sync=True)
@@ -123,5 +133,8 @@ async def trigger_user_turn_started(self):
123133
"""Trigger the `on_user_turn_started` event."""
124134
await self._call_event_handler(
125135
"on_user_turn_started",
126-
UserTurnStartedParams(enable_user_speaking_frames=self._enable_user_speaking_frames),
136+
UserTurnStartedParams(
137+
enable_interruptions=self._enable_interruptions,
138+
enable_user_speaking_frames=self._enable_user_speaking_frames,
139+
),
127140
)

0 commit comments

Comments
 (0)