Skip to content

Commit 292f80d

Browse files
committed
Add evals for parallel function calling
1 parent a0d64c4 commit 292f80d

1 file changed

Lines changed: 13 additions & 1 deletion

File tree

scripts/evals/run-release-evals.py

Lines changed: 13 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -30,10 +30,15 @@
3030
)
3131

3232
EVAL_WEATHER = EvalConfig(
33-
prompt="What's the weather in San Francisco? Temperature should be in fahrenheits.",
33+
prompt="What's the weather in San Francisco? Temperature should be in Fahrenheit.",
3434
eval="The user talks about the weather in San Francisco, including the degrees.",
3535
)
3636

37+
EVAL_WEATHER_AND_RESTAURANT = EvalConfig(
38+
prompt="What's the weather in San Francisco, and what's a good restaurant there? Temperature should be in Fahrenheit.",
39+
eval="The user talks about the weather in San Francisco, including the degrees, and provides a restaurant recommendation.",
40+
)
41+
3742
EVAL_ONLINE_SEARCH = EvalConfig(
3843
prompt="What's the current date in UTC?",
3944
eval=f"Current date in UTC is {datetime.now(timezone.utc).strftime('%A, %B %d, %Y')}.",
@@ -145,10 +150,16 @@ def EVAL_VISION_IMAGE(*, eval_speaks_first: bool = False):
145150
("12d-describe-image-moondream.py", EVAL_VISION_IMAGE()),
146151
]
147152

153+
# For a few major services, we also test parallel function calling.
154+
# (We don't bother doing this with every single service, as it's expensive and
155+
# most rely on the same OpenAI-compatible implementation.)
148156
TESTS_14 = [
149157
("14-function-calling.py", EVAL_WEATHER),
158+
("14-function-calling.py", EVAL_WEATHER_AND_RESTAURANT),
150159
("14a-function-calling-anthropic.py", EVAL_WEATHER),
160+
("14a-function-calling-anthropic.py", EVAL_WEATHER_AND_RESTAURANT),
151161
("14e-function-calling-google.py", EVAL_WEATHER),
162+
("14e-function-calling-google.py", EVAL_WEATHER_AND_RESTAURANT),
152163
("14f-function-calling-groq.py", EVAL_WEATHER),
153164
("14g-function-calling-grok.py", EVAL_WEATHER),
154165
("14h-function-calling-azure.py", EVAL_WEATHER),
@@ -160,6 +171,7 @@ def EVAL_VISION_IMAGE(*, eval_speaks_first: bool = False):
160171
("14p-function-calling-gemini-vertex-ai.py", EVAL_WEATHER),
161172
("14q-function-calling-qwen.py", EVAL_WEATHER),
162173
("14r-function-calling-aws.py", EVAL_WEATHER),
174+
("14r-function-calling-aws.py", EVAL_WEATHER_AND_RESTAURANT),
163175
("14v-function-calling-openai.py", EVAL_WEATHER),
164176
("14w-function-calling-mistral.py", EVAL_WEATHER),
165177
("14x-function-calling-openpipe.py", EVAL_WEATHER),

0 commit comments

Comments
 (0)