Skip to content

Commit 84c7e97

Browse files
authored
Merge pull request #3483 from pipecat-ai/aleix/throttle-user-speaking-frame
throttle user speaking frame
2 parents b11150f + ac3fa7f commit 84c7e97

3 files changed

Lines changed: 17 additions & 4 deletions

File tree

changelog/3483.changed.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
- Throttle `UserSpeakingFrame` to broadcast at most every 200ms instead of on every audio chunk, reducing frame processing overhead during user speech.

src/pipecat/transports/base_input.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
"""
1212

1313
import asyncio
14+
import time
1415
from typing import Optional
1516

1617
from loguru import logger
@@ -77,6 +78,11 @@ def __init__(self, params: TransportParams, **kwargs):
7778

7879
# Track user speaking state for interruption logic
7980
self._user_speaking = False
81+
# Last time a UserSpeakingFrame was pushed.
82+
self._user_speaking_frame_time = 0
83+
# How often a UserSpeakingFrame should be pushed (the period should be
84+
# greater than the audio chunk duration for the throttle to have any effect).
85+
self._user_speaking_frame_period = 0.2
8086

8187
# Task to process incoming audio (VAD) and push audio frames downstream
8288
# if passthrough is enabled.
@@ -423,7 +429,7 @@ async def _audio_task_handler(self):
423429
await self._deprecated_run_turn_analyzer(frame, vad_state, previous_vad_state)
424430

425431
if vad_state == VADState.SPEAKING:
426-
await self.broadcast_frame(UserSpeakingFrame)
432+
await self._user_currently_speaking()
427433

428434
# Push audio downstream if passthrough is set.
429435
if self._params.audio_in_passthrough:
@@ -444,6 +450,13 @@ async def _audio_task_handler(self):
444450
else:
445451
await self.push_frame(VADUserStoppedSpeakingFrame())
446452

453+
async def _user_currently_speaking(self):
    """Broadcast a throttled ``UserSpeakingFrame`` while the user speaks.

    Invoked for every audio chunk in which VAD reports the user speaking,
    but only re-broadcasts once ``self._user_speaking_frame_period`` seconds
    have elapsed since the previous broadcast, so downstream processors are
    not flooded on every chunk.
    """
    elapsed = time.time() - self._user_speaking_frame_time
    if elapsed < self._user_speaking_frame_period:
        # Still inside the throttle window — nothing to push yet.
        return
    await self.broadcast_frame(UserSpeakingFrame)
    self._user_speaking_frame_time = time.time()
459+
447460
#
448461
# DEPRECATED.
449462
#

src/pipecat/transports/base_output.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -403,7 +403,7 @@ def __init__(
403403
# Last time a BotSpeakingFrame was pushed.
404404
self._bot_speaking_frame_time = 0
405405
# How often a BotSpeakingFrame should be pushed (value should be
406-
# lower than the audio chunks).
406+
# greater than the audio chunk duration for the throttle to have any effect).
407407
self._bot_speaking_frame_period = 0.2
408408
# Last time the bot actually spoke.
409409
self._bot_speech_last_time = 0
@@ -644,8 +644,7 @@ async def _bot_currently_speaking(self):
644644

645645
diff_time = time.time() - self._bot_speaking_frame_time
646646
if diff_time >= self._bot_speaking_frame_period:
647-
await self._transport.push_frame(BotSpeakingFrame())
648-
await self._transport.push_frame(BotSpeakingFrame(), FrameDirection.UPSTREAM)
647+
await self._transport.broadcast_frame(BotSpeakingFrame)
649648
self._bot_speaking_frame_time = time.time()
650649

651650
self._bot_speech_last_time = time.time()

0 commit comments

Comments (0)