Skip to content

Commit 027e544

Browse files
committed
examples(foundational): associate image requests to function calls
1 parent e268c73 commit 027e544

7 files changed

Lines changed: 114 additions & 46 deletions

examples/foundational/14d-function-calling-anthropic-video.py

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -55,9 +55,15 @@ async def fetch_user_image(params: FunctionCallParams):
5555
logger.debug(f"Requesting image with user_id={user_id}, question={question}")
5656

5757
# Request a user image frame and indicate that it should be added to the
58-
# context.
58+
# context. Also associate it to the function call.
5959
await params.llm.push_frame(
60-
UserImageRequestFrame(user_id=user_id, text=question, append_to_context=True),
60+
UserImageRequestFrame(
61+
user_id=user_id,
62+
text=question,
63+
append_to_context=True,
64+
function_name=params.function_name,
65+
tool_call_id=params.tool_call_id,
66+
),
6167
FrameDirection.UPSTREAM,
6268
)
6369

examples/foundational/14d-function-calling-aws-video.py

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -55,9 +55,15 @@ async def fetch_user_image(params: FunctionCallParams):
5555
logger.debug(f"Requesting image with user_id={user_id}, question={question}")
5656

5757
# Request a user image frame and indicate that it should be added to the
58-
# context.
58+
# context. Also associate it to the function call.
5959
await params.llm.push_frame(
60-
UserImageRequestFrame(user_id=user_id, text=question, append_to_context=True),
60+
UserImageRequestFrame(
61+
user_id=user_id,
62+
text=question,
63+
append_to_context=True,
64+
function_name=params.function_name,
65+
tool_call_id=params.tool_call_id,
66+
),
6167
FrameDirection.UPSTREAM,
6268
)
6369

examples/foundational/14d-function-calling-gemini-flash-video.py

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -55,9 +55,15 @@ async def fetch_user_image(params: FunctionCallParams):
5555
logger.debug(f"Requesting image with user_id={user_id}, question={question}")
5656

5757
# Request a user image frame and indicate that it should be added to the
58-
# context.
58+
# context. Also associate it to the function call.
5959
await params.llm.push_frame(
60-
UserImageRequestFrame(user_id=user_id, text=question, append_to_context=True),
60+
UserImageRequestFrame(
61+
user_id=user_id,
62+
text=question,
63+
append_to_context=True,
64+
function_name=params.function_name,
65+
tool_call_id=params.tool_call_id,
66+
),
6167
FrameDirection.UPSTREAM,
6268
)
6369

examples/foundational/14d-function-calling-moondream-video.py

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -64,9 +64,15 @@ async def fetch_user_image(params: FunctionCallParams):
6464

6565
# Request a user image frame. In this case, we don't want the requested
6666
# image to be added to the context because we will process it with
67-
# Moondream.
67+
# Moondream. Also associate it to the function call.
6868
await params.llm.push_frame(
69-
UserImageRequestFrame(user_id=user_id, text=question, append_to_context=False),
69+
UserImageRequestFrame(
70+
user_id=user_id,
71+
text=question,
72+
append_to_context=False,
73+
function_name=params.function_name,
74+
tool_call_id=params.tool_call_id,
75+
),
7076
FrameDirection.UPSTREAM,
7177
)
7278

examples/foundational/14d-function-calling-openai-video.py

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -56,9 +56,15 @@ async def fetch_user_image(params: FunctionCallParams):
5656
logger.debug(f"Requesting image with user_id={user_id}, question={question}")
5757

5858
# Request a user image frame and indicate that it should be added to the
59-
# context.
59+
# context. Also associate it to the function call.
6060
await params.llm.push_frame(
61-
UserImageRequestFrame(user_id=user_id, text=question, append_to_context=True),
61+
UserImageRequestFrame(
62+
user_id=user_id,
63+
text=question,
64+
append_to_context=True,
65+
function_name=params.function_name,
66+
tool_call_id=params.tool_call_id,
67+
),
6268
FrameDirection.UPSTREAM,
6369
)
6470

examples/foundational/14e-function-calling-google.py

Lines changed: 41 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,6 @@
55
#
66

77

8-
import asyncio
98
import os
109

1110
from dotenv import load_dotenv
@@ -16,7 +15,7 @@
1615
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
1716
from pipecat.audio.vad.silero import SileroVADAnalyzer
1817
from pipecat.audio.vad.vad_analyzer import VADParams
19-
from pipecat.frames.frames import LLMRunFrame, TTSSpeakFrame
18+
from pipecat.frames.frames import LLMRunFrame, TTSSpeakFrame, UserImageRequestFrame
2019
from pipecat.pipeline.pipeline import Pipeline
2120
from pipecat.pipeline.runner import PipelineRunner
2221
from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -25,6 +24,7 @@
2524
LLMContextAggregatorPair,
2625
LLMUserAggregatorParams,
2726
)
27+
from pipecat.processors.frame_processor import FrameDirection
2828
from pipecat.runner.types import RunnerArguments
2929
from pipecat.runner.utils import (
3030
create_transport,
@@ -43,10 +43,6 @@
4343
load_dotenv(override=True)
4444

4545

46-
# Global variable to store the client ID
47-
client_id = ""
48-
49-
5046
async def get_weather(params: FunctionCallParams):
5147
location = params.arguments["location"]
5248
await params.result_callback(f"The weather in {location} is currently 72 degrees and sunny.")
@@ -57,19 +53,36 @@ async def fetch_restaurant_recommendation(params: FunctionCallParams):
5753

5854

5955
async def get_image(params: FunctionCallParams):
56+
"""Fetch the user image and push it to the LLM.
57+
58+
When called, this function pushes a UserImageRequestFrame upstream to the
59+
transport. As a result, the transport will request the user image and push a
60+
UserImageRawFrame downstream which will be added to the context by the LLM
61+
assistant aggregator.
62+
"""
63+
user_id = params.arguments["user_id"]
6064
question = params.arguments["question"]
61-
logger.debug(f"Requesting image with user_id={client_id}, question={question}")
62-
63-
# Request the image frame
64-
await params.llm.request_image_frame(
65-
user_id=client_id,
66-
function_name=params.function_name,
67-
tool_call_id=params.tool_call_id,
68-
text_content=question,
65+
logger.debug(f"Requesting image with user_id={user_id}, question={question}")
66+
67+
# Request a user image frame and indicate that it should be added to the
68+
# context. Also associate it to the function call.
69+
await params.llm.push_frame(
70+
UserImageRequestFrame(
71+
user_id=user_id,
72+
text=question,
73+
append_to_context=True,
74+
function_name=params.function_name,
75+
tool_call_id=params.tool_call_id,
76+
),
77+
FrameDirection.UPSTREAM,
6978
)
7079

7180
await params.result_callback(None)
7281

82+
# Instead of None, it's possible to also provide a tool call answer to
83+
# tell the LLM that we are grabbing the image to analyze.
84+
# await params.result_callback({"result": "Image is being captured."})
85+
7386

7487
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
7588
# instantiated. The function will be called when the desired transport gets
@@ -138,14 +151,18 @@ async def on_function_calls_started(service, function_calls):
138151
)
139152
get_image_function = FunctionSchema(
140153
name="get_image",
141-
description="Get an image from the video stream.",
154+
description="Called when the user requests a description of their camera feed",
142155
properties={
156+
"user_id": {
157+
"type": "string",
158+
"description": "The ID of the user to grab the image from",
159+
},
143160
"question": {
144161
"type": "string",
145-
"description": "The question that the user is asking about the image.",
146-
}
162+
"description": "The question that the user is asking about the image",
163+
},
147164
},
148-
required=["question"],
165+
required=["user_id", "question"],
149166
)
150167
tools = ToolsSchema(standard_tools=[weather_function, get_image_function, restaurant_function])
151168

@@ -169,7 +186,6 @@ async def on_function_calls_started(service, function_calls):
169186
"""
170187
messages = [
171188
{"role": "system", "content": system_prompt},
172-
{"role": "user", "content": "Say hello."},
173189
]
174190

175191
context = LLMContext(messages, tools)
@@ -209,10 +225,15 @@ async def on_client_connected(transport, client):
209225

210226
await maybe_capture_participant_camera(transport, client)
211227

212-
global client_id
213228
client_id = get_transport_client_id(transport, client)
214229

215230
# Kick off the conversation.
231+
messages.append(
232+
{
233+
"role": "system",
234+
"content": f"Please introduce yourself to the user. Use '{client_id}' as the user ID during function calls.",
235+
}
236+
)
216237
await task.queue_frames([LLMRunFrame()])
217238

218239
@transport.event_handler("on_client_disconnected")

examples/foundational/20d-persistent-context-gemini.py

Lines changed: 33 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@
1717
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
1818
from pipecat.audio.vad.silero import SileroVADAnalyzer
1919
from pipecat.audio.vad.vad_analyzer import VADParams
20-
from pipecat.frames.frames import LLMRunFrame
20+
from pipecat.frames.frames import LLMRunFrame, TTSSpeakFrame, UserImageRequestFrame
2121
from pipecat.pipeline.pipeline import Pipeline
2222
from pipecat.pipeline.runner import PipelineRunner
2323
from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -26,6 +26,7 @@
2626
LLMContextAggregatorPair,
2727
LLMUserAggregatorParams,
2828
)
29+
from pipecat.processors.frame_processor import FrameDirection
2930
from pipecat.runner.types import RunnerArguments
3031
from pipecat.runner.utils import (
3132
create_transport,
@@ -46,9 +47,6 @@
4647

4748
BASE_FILENAME = "/tmp/pipecat_conversation_"
4849

49-
# Global variable to store the client ID
50-
client_id = ""
51-
5250

5351
async def fetch_weather_from_api(params: FunctionCallParams):
5452
temperature = 75 if params.arguments["format"] == "fahrenheit" else 24
@@ -63,19 +61,29 @@ async def fetch_weather_from_api(params: FunctionCallParams):
6361

6462

6563
async def get_image(params: FunctionCallParams):
64+
user_id = params.arguments["user_id"]
6665
question = params.arguments["question"]
67-
logger.debug(f"Requesting image with user_id={client_id}, question={question}")
68-
69-
# Request the image frame
70-
await params.llm.request_image_frame(
71-
user_id=client_id,
72-
function_name=params.function_name,
73-
tool_call_id=params.tool_call_id,
74-
text_content=question,
66+
logger.debug(f"Requesting image with user_id={user_id}, question={question}")
67+
68+
# Request a user image frame and indicate that it should be added to the
69+
# context. Also associate it to the function call.
70+
await params.llm.push_frame(
71+
UserImageRequestFrame(
72+
user_id=user_id,
73+
text=question,
74+
append_to_context=True,
75+
function_name=params.function_name,
76+
tool_call_id=params.tool_call_id,
77+
),
78+
FrameDirection.UPSTREAM,
7579
)
7680

7781
await params.result_callback(None)
7882

83+
# Instead of None, it's possible to also provide a tool call answer to
84+
# tell the LLM that we are grabbing the image to analyze.
85+
# await params.result_callback({"result": "Image is being captured."})
86+
7987

8088
async def get_saved_conversation_filenames(params: FunctionCallParams):
8189
# Construct the full pattern including the BASE_FILENAME
@@ -209,14 +217,18 @@ async def load_conversation(params: FunctionCallParams):
209217

210218
get_image_function = FunctionSchema(
211219
name="get_image",
212-
description="Get and image from the camera or video stream.",
220+
description="Called when the user requests a description of their camera feed",
213221
properties={
222+
"user_id": {
223+
"type": "string",
224+
"description": "The ID of the user to grab the image from",
225+
},
214226
"question": {
215227
"type": "string",
216-
"description": "The question to to use when running inference on the acquired image.",
228+
"description": "The question that the user is asking about the image",
217229
},
218230
},
219-
required=["question"],
231+
required=["user_id", "question"],
220232
)
221233

222234
tools = ToolsSchema(
@@ -306,10 +318,15 @@ async def on_client_connected(transport, client):
306318

307319
await maybe_capture_participant_camera(transport, client)
308320

309-
global client_id
310321
client_id = get_transport_client_id(transport, client)
311322

312323
# Kick off the conversation.
324+
messages.append(
325+
{
326+
"role": "system",
327+
"content": f"Please introduce yourself to the user. Use '{client_id}' as the user ID during function calls.",
328+
}
329+
)
313330
await task.queue_frames([LLMRunFrame()])
314331

315332
@transport.event_handler("on_client_disconnected")

0 commit comments

Comments (0)