Skip to content

Commit 027e544

Browse files
committed
examples(foundational): associate image requests to function calls
1 parent e268c73 commit 027e544

7 files changed

Lines changed: 114 additions & 46 deletions

examples/foundational/14d-function-calling-anthropic-video.py

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -55,9 +55,15 @@ async def fetch_user_image(params: FunctionCallParams):
5555
logger.debug(f"Requesting image with user_id={user_id}, question={question}")
5656

5757
# Request a user image frame and indicate that it should be added to the
58-
# context.
58+
# context. Also associate it to the function call.
5959
await params.llm.push_frame(
60-
UserImageRequestFrame(user_id=user_id, text=question, append_to_context=True),
60+
UserImageRequestFrame(
61+
user_id=user_id,
62+
text=question,
63+
append_to_context=True,
64+
function_name=params.function_name,
65+
tool_call_id=params.tool_call_id,
66+
),
6167
FrameDirection.UPSTREAM,
6268
)
6369

examples/foundational/14d-function-calling-aws-video.py

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -55,9 +55,15 @@ async def fetch_user_image(params: FunctionCallParams):
5555
logger.debug(f"Requesting image with user_id={user_id}, question={question}")
5656

5757
# Request a user image frame and indicate that it should be added to the
58-
# context.
58+
# context. Also associate it to the function call.
5959
await params.llm.push_frame(
60-
UserImageRequestFrame(user_id=user_id, text=question, append_to_context=True),
60+
UserImageRequestFrame(
61+
user_id=user_id,
62+
text=question,
63+
append_to_context=True,
64+
function_name=params.function_name,
65+
tool_call_id=params.tool_call_id,
66+
),
6167
FrameDirection.UPSTREAM,
6268
)
6369

examples/foundational/14d-function-calling-gemini-flash-video.py

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -55,9 +55,15 @@ async def fetch_user_image(params: FunctionCallParams):
5555
logger.debug(f"Requesting image with user_id={user_id}, question={question}")
5656

5757
# Request a user image frame and indicate that it should be added to the
58-
# context.
58+
# context. Also associate it to the function call.
5959
await params.llm.push_frame(
60-
UserImageRequestFrame(user_id=user_id, text=question, append_to_context=True),
60+
UserImageRequestFrame(
61+
user_id=user_id,
62+
text=question,
63+
append_to_context=True,
64+
function_name=params.function_name,
65+
tool_call_id=params.tool_call_id,
66+
),
6167
FrameDirection.UPSTREAM,
6268
)
6369

examples/foundational/14d-function-calling-moondream-video.py

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -64,9 +64,15 @@ async def fetch_user_image(params: FunctionCallParams):
6464

6565
# Request a user image frame. In this case, we don't want the requested
6666
# image to be added to the context because we will process it with
67-
# Moondream.
67+
# Moondream. Also associate it to the function call.
6868
await params.llm.push_frame(
69-
UserImageRequestFrame(user_id=user_id, text=question, append_to_context=False),
69+
UserImageRequestFrame(
70+
user_id=user_id,
71+
text=question,
72+
append_to_context=False,
73+
function_name=params.function_name,
74+
tool_call_id=params.tool_call_id,
75+
),
7076
FrameDirection.UPSTREAM,
7177
)
7278

examples/foundational/14d-function-calling-openai-video.py

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -56,9 +56,15 @@ async def fetch_user_image(params: FunctionCallParams):
5656
logger.debug(f"Requesting image with user_id={user_id}, question={question}")
5757

5858
# Request a user image frame and indicate that it should be added to the
59-
# context.
59+
# context. Also associate it to the function call.
6060
await params.llm.push_frame(
61-
UserImageRequestFrame(user_id=user_id, text=question, append_to_context=True),
61+
UserImageRequestFrame(
62+
user_id=user_id,
63+
text=question,
64+
append_to_context=True,
65+
function_name=params.function_name,
66+
tool_call_id=params.tool_call_id,
67+
),
6268
FrameDirection.UPSTREAM,
6369
)
6470

examples/foundational/14e-function-calling-google.py

Lines changed: 41 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,6 @@
55
#
66

77

8-
import asyncio
98
import os
109

1110
from dotenv import load_dotenv
@@ -16,7 +15,7 @@
1615
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
1716
from pipecat.audio.vad.silero import SileroVADAnalyzer
1817
from pipecat.audio.vad.vad_analyzer import VADParams
19-
from pipecat.frames.frames import LLMRunFrame, TTSSpeakFrame
18+
from pipecat.frames.frames import LLMRunFrame, TTSSpeakFrame, UserImageRequestFrame
2019
from pipecat.pipeline.pipeline import Pipeline
2120
from pipecat.pipeline.runner import PipelineRunner
2221
from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -25,6 +24,7 @@
2524
LLMContextAggregatorPair,
2625
LLMUserAggregatorParams,
2726
)
27+
from pipecat.processors.frame_processor import FrameDirection
2828
from pipecat.runner.types import RunnerArguments
2929
from pipecat.runner.utils import (
3030
create_transport,
@@ -43,10 +43,6 @@
4343
load_dotenv(override=True)
4444

4545

46-
# Global variable to store the client ID
47-
client_id = ""
48-
49-
5046
async def get_weather(params: FunctionCallParams):
5147
location = params.arguments["location"]
5248
await params.result_callback(f"The weather in {location} is currently 72 degrees and sunny.")
@@ -57,19 +53,36 @@ async def fetch_restaurant_recommendation(params: FunctionCallParams):
5753

5854

5955
async def get_image(params: FunctionCallParams):
56+
"""Fetch the user image and push it to the LLM.
57+
58+
When called, this function pushes a UserImageRequestFrame upstream to the
59+
transport. As a result, the transport will request the user image and push a
60+
UserImageRawFrame downstream which will be added to the context by the LLM
61+
assistant aggregator.
62+
"""
63+
user_id = params.arguments["user_id"]
6064
question = params.arguments["question"]
61-
logger.debug(f"Requesting image with user_id={client_id}, question={question}")
62-
63-
# Request the image frame
64-
await params.llm.request_image_frame(
65-
user_id=client_id,
66-
function_name=params.function_name,
67-
tool_call_id=params.tool_call_id,
68-
text_content=question,
65+
logger.debug(f"Requesting image with user_id={user_id}, question={question}")
66+
67+
# Request a user image frame and indicate that it should be added to the
68+
# context. Also associate it to the function call.
69+
await params.llm.push_frame(
70+
UserImageRequestFrame(
71+
user_id=user_id,
72+
text=question,
73+
append_to_context=True,
74+
function_name=params.function_name,
75+
tool_call_id=params.tool_call_id,
76+
),
77+
FrameDirection.UPSTREAM,
6978
)
7079

7180
await params.result_callback(None)
7281

82+
# Instead of None, it's possible to also provide a tool call answer to
83+
# tell the LLM that we are grabbing the image to analyze.
84+
# await params.result_callback({"result": "Image is being captured."})
85+
7386

7487
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
7588
# instantiated. The function will be called when the desired transport gets
@@ -138,14 +151,18 @@ async def on_function_calls_started(service, function_calls):
138151
)
139152
get_image_function = FunctionSchema(
140153
name="get_image",
141-
description="Get an image from the video stream.",
154+
description="Called when the user requests a description of their camera feed",
142155
properties={
156+
"user_id": {
157+
"type": "string",
158+
"description": "The ID of the user to grab the image from",
159+
},
143160
"question": {
144161
"type": "string",
145-
"description": "The question that the user is asking about the image.",
146-
}
162+
"description": "The question that the user is asking about the image",
163+
},
147164
},
148-
required=["question"],
165+
required=["user_id", "question"],
149166
)
150167
tools = ToolsSchema(standard_tools=[weather_function, get_image_function, restaurant_function])
151168

@@ -169,7 +186,6 @@ async def on_function_calls_started(service, function_calls):
169186
"""
170187
messages = [
171188
{"role": "system", "content": system_prompt},
172-
{"role": "user", "content": "Say hello."},
173189
]
174190

175191
context = LLMContext(messages, tools)
@@ -209,10 +225,15 @@ async def on_client_connected(transport, client):
209225

210226
await maybe_capture_participant_camera(transport, client)
211227

212-
global client_id
213228
client_id = get_transport_client_id(transport, client)
214229

215230
# Kick off the conversation.
231+
messages.append(
232+
{
233+
"role": "system",
234+
"content": f"Please introduce yourself to the user. Use '{client_id}' as the user ID during function calls.",
235+
}
236+
)
216237
await task.queue_frames([LLMRunFrame()])
217238

218239
@transport.event_handler("on_client_disconnected")

examples/foundational/20d-persistent-context-gemini.py

Lines changed: 33 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,7 @@
1717
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
1818
from pipecat.audio.vad.silero import SileroVADAnalyzer
1919
from pipecat.audio.vad.vad_analyzer import VADParams
20-
from pipecat.frames.frames import LLMRunFrame
20+
from pipecat.frames.frames import LLMRunFrame, TTSSpeakFrame, UserImageRequestFrame
2121
from pipecat.pipeline.pipeline import Pipeline
2222
from pipecat.pipeline.runner import PipelineRunner
2323
from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -26,6 +26,7 @@
2626
LLMContextAggregatorPair,
2727
LLMUserAggregatorParams,
2828
)
29+
from pipecat.processors.frame_processor import FrameDirection
2930
from pipecat.runner.types import RunnerArguments
3031
from pipecat.runner.utils import (
3132
create_transport,
@@ -46,9 +47,6 @@
4647

4748
BASE_FILENAME = "/tmp/pipecat_conversation_"
4849

49-
# Global variable to store the client ID
50-
client_id = ""
51-
5250

5351
async def fetch_weather_from_api(params: FunctionCallParams):
5452
temperature = 75 if params.arguments["format"] == "fahrenheit" else 24
@@ -63,19 +61,29 @@ async def fetch_weather_from_api(params: FunctionCallParams):
6361

6462

6563
async def get_image(params: FunctionCallParams):
64+
user_id = params.arguments["user_id"]
6665
question = params.arguments["question"]
67-
logger.debug(f"Requesting image with user_id={client_id}, question={question}")
68-
69-
# Request the image frame
70-
await params.llm.request_image_frame(
71-
user_id=client_id,
72-
function_name=params.function_name,
73-
tool_call_id=params.tool_call_id,
74-
text_content=question,
66+
logger.debug(f"Requesting image with user_id={user_id}, question={question}")
67+
68+
# Request a user image frame and indicate that it should be added to the
69+
# context. Also associate it to the function call.
70+
await params.llm.push_frame(
71+
UserImageRequestFrame(
72+
user_id=user_id,
73+
text=question,
74+
append_to_context=True,
75+
function_name=params.function_name,
76+
tool_call_id=params.tool_call_id,
77+
),
78+
FrameDirection.UPSTREAM,
7579
)
7680

7781
await params.result_callback(None)
7882

83+
# Instead of None, it's possible to also provide a tool call answer to
84+
# tell the LLM that we are grabbing the image to analyze.
85+
# await params.result_callback({"result": "Image is being captured."})
86+
7987

8088
async def get_saved_conversation_filenames(params: FunctionCallParams):
8189
# Construct the full pattern including the BASE_FILENAME
@@ -209,14 +217,18 @@ async def load_conversation(params: FunctionCallParams):
209217

210218
get_image_function = FunctionSchema(
211219
name="get_image",
212-
description="Get and image from the camera or video stream.",
220+
description="Called when the user requests a description of their camera feed",
213221
properties={
222+
"user_id": {
223+
"type": "string",
224+
"description": "The ID of the user to grab the image from",
225+
},
214226
"question": {
215227
"type": "string",
216-
"description": "The question to to use when running inference on the acquired image.",
228+
"description": "The question that the user is asking about the image",
217229
},
218230
},
219-
required=["question"],
231+
required=["user_id", "question"],
220232
)
221233

222234
tools = ToolsSchema(
@@ -306,10 +318,15 @@ async def on_client_connected(transport, client):
306318

307319
await maybe_capture_participant_camera(transport, client)
308320

309-
global client_id
310321
client_id = get_transport_client_id(transport, client)
311322

312323
# Kick off the conversation.
324+
messages.append(
325+
{
326+
"role": "system",
327+
"content": f"Please introduce yourself to the user. Use '{client_id}' as the user ID during function calls.",
328+
}
329+
)
313330
await task.queue_frames([LLMRunFrame()])
314331

315332
@transport.event_handler("on_client_disconnected")

0 commit comments

Comments (0)