speechmatics
diff --git a/‎changelog/3430.fixed.md‎
Lines changed: 1 addition & 0 deletions b/‎changelog/3430.fixed.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎examples/foundational/14d-function-calling-anthropic-video.py‎
Lines changed: 13 additions & 3 deletions b/‎examples/foundational/14d-function-calling-anthropic-video.py‎
Lines changed: 13 additions & 3 deletions
diff --git a/‎examples/foundational/14d-function-calling-aws-video.py‎
Lines changed: 13 additions & 3 deletions b/‎examples/foundational/14d-function-calling-aws-video.py‎
Lines changed: 13 additions & 3 deletions
diff --git a/‎examples/foundational/14d-function-calling-gemini-flash-video.py‎
Lines changed: 13 additions & 3 deletions b/‎examples/foundational/14d-function-calling-gemini-flash-video.py‎
Lines changed: 13 additions & 3 deletions
diff --git a/‎examples/foundational/14d-function-calling-moondream-video.py‎
Lines changed: 13 additions & 2 deletions b/‎examples/foundational/14d-function-calling-moondream-video.py‎
Lines changed: 13 additions & 2 deletions
diff --git a/‎examples/foundational/14d-function-calling-openai-video.py‎
Lines changed: 13 additions & 3 deletions b/‎examples/foundational/14d-function-calling-openai-video.py‎
Lines changed: 13 additions & 3 deletions
diff --git a/‎examples/foundational/14e-function-calling-google.py‎
Lines changed: 41 additions & 26 deletions b/‎examples/foundational/14e-function-calling-google.py‎
Lines changed: 41 additions & 26 deletions
@@ -0,0 +1 @@
+- Fixed `request_image_frame` (for backwards compatibility) and restored the function-call–related fields in `UserImageRequestFrame` and `UserImageRawFrame`. This prevents a case where adding a non-LLM message to the context could trigger duplicate LLM inferences (one on image arrival and one on the function-call result), potentially causing an infinite inference loop.
@@ -14,7 +14,7 @@
 from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
 from pipecat.audio.vad.silero import SileroVADAnalyzer
 from pipecat.audio.vad.vad_analyzer import VADParams
-from pipecat.frames.frames import LLMRunFrame, UserImageRequestFrame
+from pipecat.frames.frames import LLMRunFrame, TTSSpeakFrame, UserImageRequestFrame
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -55,9 +55,15 @@ async def fetch_user_image(params: FunctionCallParams):
     logger.debug(f"Requesting image with user_id={user_id}, question={question}")
 
     # Request a user image frame and indicate that it should be added to the
-    # context.
+    # context. Also associate it with the function call.
     await params.llm.push_frame(
-        UserImageRequestFrame(user_id=user_id, text=question, append_to_context=True),
+        UserImageRequestFrame(
+            user_id=user_id,
+            text=question,
+            append_to_context=True,
+            function_name=params.function_name,
+            tool_call_id=params.tool_call_id,
+        ),
         FrameDirection.UPSTREAM,
     )
 
@@ -101,6 +107,10 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
     llm = AnthropicLLMService(api_key=os.getenv("ANTHROPIC_API_KEY"))
     llm.register_function("fetch_user_image", fetch_user_image)
 
+    @llm.event_handler("on_function_calls_started")
+    async def on_function_calls_started(service, function_calls):
+        await tts.queue_frame(TTSSpeakFrame("Let me check on that."))
+
     fetch_image_function = FunctionSchema(
         name="fetch_user_image",
         description="Called when the user requests a description of their camera feed",
 
@@ -14,7 +14,7 @@
 from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
 from pipecat.audio.vad.silero import SileroVADAnalyzer
 from pipecat.audio.vad.vad_analyzer import VADParams
-from pipecat.frames.frames import LLMRunFrame, UserImageRequestFrame
+from pipecat.frames.frames import LLMRunFrame, TTSSpeakFrame, UserImageRequestFrame
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -55,9 +55,15 @@ async def fetch_user_image(params: FunctionCallParams):
     logger.debug(f"Requesting image with user_id={user_id}, question={question}")
 
     # Request a user image frame and indicate that it should be added to the
-    # context.
+    # context. Also associate it with the function call.
     await params.llm.push_frame(
-        UserImageRequestFrame(user_id=user_id, text=question, append_to_context=True),
+        UserImageRequestFrame(
+            user_id=user_id,
+            text=question,
+            append_to_context=True,
+            function_name=params.function_name,
+            tool_call_id=params.tool_call_id,
+        ),
         FrameDirection.UPSTREAM,
     )
 
@@ -108,6 +114,10 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
     )
     llm.register_function("fetch_user_image", fetch_user_image)
 
+    @llm.event_handler("on_function_calls_started")
+    async def on_function_calls_started(service, function_calls):
+        await tts.queue_frame(TTSSpeakFrame("Let me check on that."))
+
     fetch_image_function = FunctionSchema(
         name="fetch_user_image",
         description="Called when the user requests a description of their camera feed",
 
@@ -14,7 +14,7 @@
 from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
 from pipecat.audio.vad.silero import SileroVADAnalyzer
 from pipecat.audio.vad.vad_analyzer import VADParams
-from pipecat.frames.frames import LLMRunFrame, UserImageRequestFrame
+from pipecat.frames.frames import LLMRunFrame, TTSSpeakFrame, UserImageRequestFrame
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -55,9 +55,15 @@ async def fetch_user_image(params: FunctionCallParams):
     logger.debug(f"Requesting image with user_id={user_id}, question={question}")
 
     # Request a user image frame and indicate that it should be added to the
-    # context.
+    # context. Also associate it with the function call.
     await params.llm.push_frame(
-        UserImageRequestFrame(user_id=user_id, text=question, append_to_context=True),
+        UserImageRequestFrame(
+            user_id=user_id,
+            text=question,
+            append_to_context=True,
+            function_name=params.function_name,
+            tool_call_id=params.tool_call_id,
+        ),
         FrameDirection.UPSTREAM,
     )
 
@@ -101,6 +107,10 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
     llm = GoogleLLMService(api_key=os.getenv("GOOGLE_API_KEY"))
     llm.register_function("fetch_user_image", fetch_user_image)
 
+    @llm.event_handler("on_function_calls_started")
+    async def on_function_calls_started(service, function_calls):
+        await tts.queue_frame(TTSSpeakFrame("Let me check on that."))
+
     fetch_image_function = FunctionSchema(
         name="fetch_user_image",
         description="Called when the user requests a description of their camera feed",
 
@@ -20,6 +20,7 @@
     LLMFullResponseStartFrame,
     LLMRunFrame,
     TextFrame,
+    TTSSpeakFrame,
     UserImageRequestFrame,
 )
 from pipecat.pipeline.parallel_pipeline import ParallelPipeline
@@ -64,9 +65,15 @@ async def fetch_user_image(params: FunctionCallParams):
 
     # Request a user image frame. In this case, we don't want the requested
     # image to be added to the context because we will process it with
-    # Moondream.
+    # Moondream. Also associate it with the function call.
     await params.llm.push_frame(
-        UserImageRequestFrame(user_id=user_id, text=question, append_to_context=False),
+        UserImageRequestFrame(
+            user_id=user_id,
+            text=question,
+            append_to_context=False,
+            function_name=params.function_name,
+            tool_call_id=params.tool_call_id,
+        ),
         FrameDirection.UPSTREAM,
     )
 
@@ -130,6 +137,10 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
     llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
     llm.register_function("fetch_user_image", fetch_user_image)
 
+    @llm.event_handler("on_function_calls_started")
+    async def on_function_calls_started(service, function_calls):
+        await tts.queue_frame(TTSSpeakFrame("Let me check on that."))
+
     fetch_image_function = FunctionSchema(
         name="fetch_user_image",
         description="Called when the user requests a description of their camera feed",
 
@@ -15,7 +15,7 @@
 from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
 from pipecat.audio.vad.silero import SileroVADAnalyzer
 from pipecat.audio.vad.vad_analyzer import VADParams
-from pipecat.frames.frames import LLMRunFrame, UserImageRequestFrame
+from pipecat.frames.frames import LLMRunFrame, TTSSpeakFrame, UserImageRequestFrame
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -56,9 +56,15 @@ async def fetch_user_image(params: FunctionCallParams):
     logger.debug(f"Requesting image with user_id={user_id}, question={question}")
 
     # Request a user image frame and indicate that it should be added to the
-    # context.
+    # context. Also associate it with the function call.
     await params.llm.push_frame(
-        UserImageRequestFrame(user_id=user_id, text=question, append_to_context=True),
+        UserImageRequestFrame(
+            user_id=user_id,
+            text=question,
+            append_to_context=True,
+            function_name=params.function_name,
+            tool_call_id=params.tool_call_id,
+        ),
         FrameDirection.UPSTREAM,
     )
 
@@ -101,6 +107,10 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
     llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
     llm.register_function("fetch_user_image", fetch_user_image)
 
+    @llm.event_handler("on_function_calls_started")
+    async def on_function_calls_started(service, function_calls):
+        await tts.queue_frame(TTSSpeakFrame("Let me check on that."))
+
     fetch_image_function = FunctionSchema(
         name="fetch_user_image",
         description="Called when the user requests a description of their camera feed",
 
@@ -5,7 +5,6 @@
 #
 
 
-import asyncio
 import os
 
 from dotenv import load_dotenv
@@ -16,7 +15,7 @@
 from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
 from pipecat.audio.vad.silero import SileroVADAnalyzer
 from pipecat.audio.vad.vad_analyzer import VADParams
-from pipecat.frames.frames import LLMRunFrame, TTSSpeakFrame
+from pipecat.frames.frames import LLMRunFrame, TTSSpeakFrame, UserImageRequestFrame
 from pipecat.pipeline.pipeline import Pipeline
 from pipecat.pipeline.runner import PipelineRunner
 from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -25,6 +24,7 @@
     LLMContextAggregatorPair,
     LLMUserAggregatorParams,
 )
+from pipecat.processors.frame_processor import FrameDirection
 from pipecat.runner.types import RunnerArguments
 from pipecat.runner.utils import (
     create_transport,
@@ -43,10 +43,6 @@
 load_dotenv(override=True)
 
 
-# Global variable to store the client ID
-client_id = ""
-
-
 async def get_weather(params: FunctionCallParams):
     location = params.arguments["location"]
     await params.result_callback(f"The weather in {location} is currently 72 degrees and sunny.")
@@ -57,24 +53,35 @@ async def fetch_restaurant_recommendation(params: FunctionCallParams):
 
 
 async def get_image(params: FunctionCallParams):
+    """Fetch the user image and push it to the LLM.
+
+    When called, this function pushes a UserImageRequestFrame upstream to the
+    transport. As a result, the transport will request the user image and push a
+    UserImageRawFrame downstream, which will be added to the context by the LLM
+    assistant aggregator.
+    """
+    user_id = params.arguments["user_id"]
     question = params.arguments["question"]
-    logger.debug(f"Requesting image with user_id={client_id}, question={question}")
-
-    # Request the image frame
-    await params.llm.request_image_frame(
-        user_id=client_id,
-        function_name=params.function_name,
-        tool_call_id=params.tool_call_id,
-        text_content=question,
+    logger.debug(f"Requesting image with user_id={user_id}, question={question}")
+
+    # Request a user image frame and indicate that it should be added to the
+    # context. Also associate it with the function call.
+    await params.llm.push_frame(
+        UserImageRequestFrame(
+            user_id=user_id,
+            text=question,
+            append_to_context=True,
+            function_name=params.function_name,
+            tool_call_id=params.tool_call_id,
+        ),
+        FrameDirection.UPSTREAM,
     )
 
-    # Wait a short time for the frame to be processed
-    await asyncio.sleep(0.5)
+    await params.result_callback(None)
 
-    # Return a result to complete the function call
-    await params.result_callback(
-        f"I've captured an image from your camera and I'm analyzing what you asked about: {question}"
-    )
+    # Instead of None, it's also possible to provide a tool call result that
+    # tells the LLM we are grabbing the image to analyze.
+    # await params.result_callback({"result": "Image is being captured."})
 
 
 # We store functions so objects (e.g. SileroVADAnalyzer) don't get
@@ -144,14 +151,18 @@ async def on_function_calls_started(service, function_calls):
     )
     get_image_function = FunctionSchema(
         name="get_image",
-        description="Get an image from the video stream.",
+        description="Called when the user requests a description of their camera feed",
         properties={
+            "user_id": {
+                "type": "string",
+                "description": "The ID of the user to grab the image from",
+            },
             "question": {
                 "type": "string",
-                "description": "The question that the user is asking about the image.",
-            }
+                "description": "The question that the user is asking about the image",
+            },
         },
-        required=["question"],
+        required=["user_id", "question"],
     )
     tools = ToolsSchema(standard_tools=[weather_function, get_image_function, restaurant_function])
 
@@ -175,7 +186,6 @@ async def on_function_calls_started(service, function_calls):
 """
     messages = [
         {"role": "system", "content": system_prompt},
-        {"role": "user", "content": "Say hello."},
     ]
 
     context = LLMContext(messages, tools)
@@ -215,10 +225,15 @@ async def on_client_connected(transport, client):
 
         await maybe_capture_participant_camera(transport, client)
 
-        global client_id
         client_id = get_transport_client_id(transport, client)
 
         # Kick off the conversation.
+        messages.append(
+            {
+                "role": "system",
+                "content": f"Please introduce yourself to the user. Use '{client_id}' as the user ID during function calls.",
+            }
+        )
         await task.queue_frames([LLMRunFrame()])
 
     @transport.event_handler("on_client_disconnected")
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	+- Fixed `request_image_frame` (for backwards compatibility) and restored function-call–related fields in `UserImageRequestFrame` and `UserImageRawFrame`, preventing a case where adding a non-LLM message to the context could trigger duplicate LLM inferences (on image arrival and on function-call result), potentially causing an infinite inference loop.