Skip to content

Commit f6d09e1

Browse files
authored
Merge pull request pipecat-ai#3430 from pipecat-ai/pk/request-image-frame-fixes
Fix request_image_frame and usage
2 parents b8e48de + 21534f7 commit f6d09e1

13 files changed

Lines changed: 202 additions & 104 deletions

changelog/3430.fixed.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
- Fixed `request_image_frame` (for backwards compatibility) and restored the function-call-related fields in `UserImageRequestFrame` and `UserImageRawFrame`. This prevents a case where adding a non-LLM message to the context could trigger duplicate LLM inferences (one on image arrival and another on the function-call result), potentially causing an infinite inference loop.

examples/foundational/14d-function-calling-anthropic-video.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
1515
from pipecat.audio.vad.silero import SileroVADAnalyzer
1616
from pipecat.audio.vad.vad_analyzer import VADParams
17-
from pipecat.frames.frames import LLMRunFrame, UserImageRequestFrame
17+
from pipecat.frames.frames import LLMRunFrame, TTSSpeakFrame, UserImageRequestFrame
1818
from pipecat.pipeline.pipeline import Pipeline
1919
from pipecat.pipeline.runner import PipelineRunner
2020
from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -55,9 +55,15 @@ async def fetch_user_image(params: FunctionCallParams):
5555
logger.debug(f"Requesting image with user_id={user_id}, question={question}")
5656

5757
# Request a user image frame and indicate that it should be added to the
58-
# context.
58+
# context. Also associate it with the function call.
5959
await params.llm.push_frame(
60-
UserImageRequestFrame(user_id=user_id, text=question, append_to_context=True),
60+
UserImageRequestFrame(
61+
user_id=user_id,
62+
text=question,
63+
append_to_context=True,
64+
function_name=params.function_name,
65+
tool_call_id=params.tool_call_id,
66+
),
6167
FrameDirection.UPSTREAM,
6268
)
6369

@@ -101,6 +107,10 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
101107
llm = AnthropicLLMService(api_key=os.getenv("ANTHROPIC_API_KEY"))
102108
llm.register_function("fetch_user_image", fetch_user_image)
103109

110+
@llm.event_handler("on_function_calls_started")
111+
async def on_function_calls_started(service, function_calls):
112+
await tts.queue_frame(TTSSpeakFrame("Let me check on that."))
113+
104114
fetch_image_function = FunctionSchema(
105115
name="fetch_user_image",
106116
description="Called when the user requests a description of their camera feed",

examples/foundational/14d-function-calling-aws-video.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
1515
from pipecat.audio.vad.silero import SileroVADAnalyzer
1616
from pipecat.audio.vad.vad_analyzer import VADParams
17-
from pipecat.frames.frames import LLMRunFrame, UserImageRequestFrame
17+
from pipecat.frames.frames import LLMRunFrame, TTSSpeakFrame, UserImageRequestFrame
1818
from pipecat.pipeline.pipeline import Pipeline
1919
from pipecat.pipeline.runner import PipelineRunner
2020
from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -55,9 +55,15 @@ async def fetch_user_image(params: FunctionCallParams):
5555
logger.debug(f"Requesting image with user_id={user_id}, question={question}")
5656

5757
# Request a user image frame and indicate that it should be added to the
58-
# context.
58+
# context. Also associate it with the function call.
5959
await params.llm.push_frame(
60-
UserImageRequestFrame(user_id=user_id, text=question, append_to_context=True),
60+
UserImageRequestFrame(
61+
user_id=user_id,
62+
text=question,
63+
append_to_context=True,
64+
function_name=params.function_name,
65+
tool_call_id=params.tool_call_id,
66+
),
6167
FrameDirection.UPSTREAM,
6268
)
6369

@@ -108,6 +114,10 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
108114
)
109115
llm.register_function("fetch_user_image", fetch_user_image)
110116

117+
@llm.event_handler("on_function_calls_started")
118+
async def on_function_calls_started(service, function_calls):
119+
await tts.queue_frame(TTSSpeakFrame("Let me check on that."))
120+
111121
fetch_image_function = FunctionSchema(
112122
name="fetch_user_image",
113123
description="Called when the user requests a description of their camera feed",

examples/foundational/14d-function-calling-gemini-flash-video.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
1515
from pipecat.audio.vad.silero import SileroVADAnalyzer
1616
from pipecat.audio.vad.vad_analyzer import VADParams
17-
from pipecat.frames.frames import LLMRunFrame, UserImageRequestFrame
17+
from pipecat.frames.frames import LLMRunFrame, TTSSpeakFrame, UserImageRequestFrame
1818
from pipecat.pipeline.pipeline import Pipeline
1919
from pipecat.pipeline.runner import PipelineRunner
2020
from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -55,9 +55,15 @@ async def fetch_user_image(params: FunctionCallParams):
5555
logger.debug(f"Requesting image with user_id={user_id}, question={question}")
5656

5757
# Request a user image frame and indicate that it should be added to the
58-
# context.
58+
# context. Also associate it with the function call.
5959
await params.llm.push_frame(
60-
UserImageRequestFrame(user_id=user_id, text=question, append_to_context=True),
60+
UserImageRequestFrame(
61+
user_id=user_id,
62+
text=question,
63+
append_to_context=True,
64+
function_name=params.function_name,
65+
tool_call_id=params.tool_call_id,
66+
),
6167
FrameDirection.UPSTREAM,
6268
)
6369

@@ -101,6 +107,10 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
101107
llm = GoogleLLMService(api_key=os.getenv("GOOGLE_API_KEY"))
102108
llm.register_function("fetch_user_image", fetch_user_image)
103109

110+
@llm.event_handler("on_function_calls_started")
111+
async def on_function_calls_started(service, function_calls):
112+
await tts.queue_frame(TTSSpeakFrame("Let me check on that."))
113+
104114
fetch_image_function = FunctionSchema(
105115
name="fetch_user_image",
106116
description="Called when the user requests a description of their camera feed",

examples/foundational/14d-function-calling-moondream-video.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
LLMFullResponseStartFrame,
2121
LLMRunFrame,
2222
TextFrame,
23+
TTSSpeakFrame,
2324
UserImageRequestFrame,
2425
)
2526
from pipecat.pipeline.parallel_pipeline import ParallelPipeline
@@ -64,9 +65,15 @@ async def fetch_user_image(params: FunctionCallParams):
6465

6566
# Request a user image frame. In this case, we don't want the requested
6667
# image to be added to the context because we will process it with
67-
# Moondream.
68+
# Moondream. Also associate it with the function call.
6869
await params.llm.push_frame(
69-
UserImageRequestFrame(user_id=user_id, text=question, append_to_context=False),
70+
UserImageRequestFrame(
71+
user_id=user_id,
72+
text=question,
73+
append_to_context=False,
74+
function_name=params.function_name,
75+
tool_call_id=params.tool_call_id,
76+
),
7077
FrameDirection.UPSTREAM,
7178
)
7279

@@ -130,6 +137,10 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
130137
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
131138
llm.register_function("fetch_user_image", fetch_user_image)
132139

140+
@llm.event_handler("on_function_calls_started")
141+
async def on_function_calls_started(service, function_calls):
142+
await tts.queue_frame(TTSSpeakFrame("Let me check on that."))
143+
133144
fetch_image_function = FunctionSchema(
134145
name="fetch_user_image",
135146
description="Called when the user requests a description of their camera feed",

examples/foundational/14d-function-calling-openai-video.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
1616
from pipecat.audio.vad.silero import SileroVADAnalyzer
1717
from pipecat.audio.vad.vad_analyzer import VADParams
18-
from pipecat.frames.frames import LLMRunFrame, UserImageRequestFrame
18+
from pipecat.frames.frames import LLMRunFrame, TTSSpeakFrame, UserImageRequestFrame
1919
from pipecat.pipeline.pipeline import Pipeline
2020
from pipecat.pipeline.runner import PipelineRunner
2121
from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -56,9 +56,15 @@ async def fetch_user_image(params: FunctionCallParams):
5656
logger.debug(f"Requesting image with user_id={user_id}, question={question}")
5757

5858
# Request a user image frame and indicate that it should be added to the
59-
# context.
59+
# context. Also associate it with the function call.
6060
await params.llm.push_frame(
61-
UserImageRequestFrame(user_id=user_id, text=question, append_to_context=True),
61+
UserImageRequestFrame(
62+
user_id=user_id,
63+
text=question,
64+
append_to_context=True,
65+
function_name=params.function_name,
66+
tool_call_id=params.tool_call_id,
67+
),
6268
FrameDirection.UPSTREAM,
6369
)
6470

@@ -101,6 +107,10 @@ async def run_bot(transport: BaseTransport, runner_args: RunnerArguments):
101107
llm = OpenAILLMService(api_key=os.getenv("OPENAI_API_KEY"))
102108
llm.register_function("fetch_user_image", fetch_user_image)
103109

110+
@llm.event_handler("on_function_calls_started")
111+
async def on_function_calls_started(service, function_calls):
112+
await tts.queue_frame(TTSSpeakFrame("Let me check on that."))
113+
104114
fetch_image_function = FunctionSchema(
105115
name="fetch_user_image",
106116
description="Called when the user requests a description of their camera feed",

examples/foundational/14e-function-calling-google.py

Lines changed: 41 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
#
66

77

8-
import asyncio
98
import os
109

1110
from dotenv import load_dotenv
@@ -16,7 +15,7 @@
1615
from pipecat.audio.turn.smart_turn.local_smart_turn_v3 import LocalSmartTurnAnalyzerV3
1716
from pipecat.audio.vad.silero import SileroVADAnalyzer
1817
from pipecat.audio.vad.vad_analyzer import VADParams
19-
from pipecat.frames.frames import LLMRunFrame, TTSSpeakFrame
18+
from pipecat.frames.frames import LLMRunFrame, TTSSpeakFrame, UserImageRequestFrame
2019
from pipecat.pipeline.pipeline import Pipeline
2120
from pipecat.pipeline.runner import PipelineRunner
2221
from pipecat.pipeline.task import PipelineParams, PipelineTask
@@ -25,6 +24,7 @@
2524
LLMContextAggregatorPair,
2625
LLMUserAggregatorParams,
2726
)
27+
from pipecat.processors.frame_processor import FrameDirection
2828
from pipecat.runner.types import RunnerArguments
2929
from pipecat.runner.utils import (
3030
create_transport,
@@ -43,10 +43,6 @@
4343
load_dotenv(override=True)
4444

4545

46-
# Global variable to store the client ID
47-
client_id = ""
48-
49-
5046
async def get_weather(params: FunctionCallParams):
5147
location = params.arguments["location"]
5248
await params.result_callback(f"The weather in {location} is currently 72 degrees and sunny.")
@@ -57,24 +53,35 @@ async def fetch_restaurant_recommendation(params: FunctionCallParams):
5753

5854

5955
async def get_image(params: FunctionCallParams):
56+
"""Fetch the user image and push it to the LLM.
57+
58+
When called, this function pushes a UserImageRequestFrame upstream to the
59+
transport. As a result, the transport will request the user image and push a
60+
UserImageRawFrame downstream which will be added to the context by the LLM
61+
assistant aggregator.
62+
"""
63+
user_id = params.arguments["user_id"]
6064
question = params.arguments["question"]
61-
logger.debug(f"Requesting image with user_id={client_id}, question={question}")
62-
63-
# Request the image frame
64-
await params.llm.request_image_frame(
65-
user_id=client_id,
66-
function_name=params.function_name,
67-
tool_call_id=params.tool_call_id,
68-
text_content=question,
65+
logger.debug(f"Requesting image with user_id={user_id}, question={question}")
66+
67+
# Request a user image frame and indicate that it should be added to the
68+
# context. Also associate it with the function call.
69+
await params.llm.push_frame(
70+
UserImageRequestFrame(
71+
user_id=user_id,
72+
text=question,
73+
append_to_context=True,
74+
function_name=params.function_name,
75+
tool_call_id=params.tool_call_id,
76+
),
77+
FrameDirection.UPSTREAM,
6978
)
7079

71-
# Wait a short time for the frame to be processed
72-
await asyncio.sleep(0.5)
80+
await params.result_callback(None)
7381

74-
# Return a result to complete the function call
75-
await params.result_callback(
76-
f"I've captured an image from your camera and I'm analyzing what you asked about: {question}"
77-
)
82+
# Instead of None, it's possible to also provide a tool call answer to
83+
# tell the LLM that we are grabbing the image to analyze.
84+
# await params.result_callback({"result": "Image is being captured."})
7885

7986

8087
# We store functions so objects (e.g. SileroVADAnalyzer) don't get
@@ -144,14 +151,18 @@ async def on_function_calls_started(service, function_calls):
144151
)
145152
get_image_function = FunctionSchema(
146153
name="get_image",
147-
description="Get an image from the video stream.",
154+
description="Called when the user requests a description of their camera feed",
148155
properties={
156+
"user_id": {
157+
"type": "string",
158+
"description": "The ID of the user to grab the image from",
159+
},
149160
"question": {
150161
"type": "string",
151-
"description": "The question that the user is asking about the image.",
152-
}
162+
"description": "The question that the user is asking about the image",
163+
},
153164
},
154-
required=["question"],
165+
required=["user_id", "question"],
155166
)
156167
tools = ToolsSchema(standard_tools=[weather_function, get_image_function, restaurant_function])
157168

@@ -175,7 +186,6 @@ async def on_function_calls_started(service, function_calls):
175186
"""
176187
messages = [
177188
{"role": "system", "content": system_prompt},
178-
{"role": "user", "content": "Say hello."},
179189
]
180190

181191
context = LLMContext(messages, tools)
@@ -215,10 +225,15 @@ async def on_client_connected(transport, client):
215225

216226
await maybe_capture_participant_camera(transport, client)
217227

218-
global client_id
219228
client_id = get_transport_client_id(transport, client)
220229

221230
# Kick off the conversation.
231+
messages.append(
232+
{
233+
"role": "system",
234+
"content": f"Please introduce yourself to the user. Use '{client_id}' as the user ID during function calls.",
235+
}
236+
)
222237
await task.queue_frames([LLMRunFrame()])
223238

224239
@transport.event_handler("on_client_disconnected")

0 commit comments

Comments
 (0)