Skip to content

Commit 8eba90f

Browse files
andreasronge and claude
committed
feat: add completion_mode: :auto for simplified multi-turn semantics
Adds an opt-in auto-return mode where println presence controls the multi-turn loop: println = exploration (continue), no println = answer (return last expression). This eliminates the return/fail interaction rules that were the #1 model failure mode in benchmarks.

Key design decisions:
- completion_mode: :explicit (default) preserves all existing behavior
- completion_mode: :auto uses a simpler prompt (33% smaller) with one rule instead of three
- Auto-return is disabled when a plan is present — plan agents use the regular multi_turn_journal prompt with explicit return/step-done
- Plan always auto-enables journaling for progress checklist rendering
- return/fail still work as escape hatches in auto mode

Benchmark results (gemini-3.1-flash-lite-preview, 3 runs of 25 tests):
- auto_return: 97.3% avg pass rate, 2214 avg tokens
- multi_turn: 94.7% avg pass rate, 2794 avg tokens
- 21% fewer tokens, 15% faster, same or better accuracy

Also includes coordinator + worker delegation scripts for testing Claude Code-style agent composition patterns.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 124191d commit 8eba90f

File tree

18 files changed

+736
-49
lines changed

18 files changed

+736
-49
lines changed

demo/lib/ptc_demo/agent.ex

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -547,6 +547,14 @@ defmodule PtcDemo.Agent do
547547
thinking: thinking
548548
]
549549

550+
# Add completion_mode for auto_return prompt profile
551+
base_opts =
552+
if prompt_profile == :auto_return do
553+
Keyword.put(base_opts, :completion_mode, :auto)
554+
else
555+
base_opts
556+
end
557+
550558
# Add plan and enable journaling when plan is present
551559
base_opts =
552560
if plan do

demo/lib/ptc_demo/prompts.ex

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,8 @@ defmodule PtcDemo.Prompts do
5353
def get(profile \\ :single_shot)
5454

5555
# Delegate standard prompts to the library
56-
def get(profile) when profile in [:single_shot, :multi_turn, :base, :addon_memory] do
56+
def get(profile)
57+
when profile in [:single_shot, :multi_turn, :auto_return, :base, :addon_memory] do
5758
LibLanguageSpec.get(profile)
5859
end
5960

@@ -75,7 +76,8 @@ defmodule PtcDemo.Prompts do
7576
def list do
7677
[
7778
{:single_shot, "Base language reference for single-turn queries"},
78-
{:multi_turn, "Base + memory addon for multi-turn conversations"}
79+
{:multi_turn, "Base + memory addon for multi-turn conversations"},
80+
{:auto_return, "Base + auto-return (println to explore, last expr to answer)"}
7981
]
8082
end
8183

@@ -102,7 +104,7 @@ defmodule PtcDemo.Prompts do
102104
{:ok, :single_shot}
103105
104106
iex> PtcDemo.Prompts.validate_profile("invalid")
105-
{:error, "Unknown prompt profile 'invalid'. Valid: single_shot, multi_turn"}
107+
{:error, "Unknown prompt profile 'invalid'. Valid: single_shot, multi_turn, auto_return"}
106108
107109
"""
108110
@spec validate_profile(String.t()) :: {:ok, atom()} | {:error, String.t()}

demo/scripts/coordinator_test.exs

Lines changed: 35 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,17 @@
1-
# Coordinator + Worker delegation test
1+
# Coordinator + Worker delegation test (Claude Code style)
22
#
3-
# Tests whether a coordinator agent can decompose a complex task
4-
# and delegate sub-questions to a worker agent — similar to how
5-
# Claude Code spawns sub-agents.
3+
# The coordinator has NO data — only an analyst tool (worker sub-agent).
4+
# It must: 1) decide what to ask, 2) call the analyst, 3) inspect results,
5+
# 4) decide if more info is needed or assemble the answer.
66
#
7-
# The coordinator has NO data — it can only delegate to the worker.
8-
# The worker has all datasets and answers focused questions.
7+
# Uses completion_mode: :auto — println means "exploring", no println means "done".
98
#
109
# Usage:
1110
# cd demo && mix run scripts/coordinator_test.exs
1211
#
1312
# Set OPENROUTER_API_KEY in .env or environment.
1413

15-
alias PtcDemo.{CLIBase, SampleData, SearchTool}
14+
alias PtcDemo.{CLIBase, SampleData}
1615
alias PtcRunner.SubAgent
1716

1817
CLIBase.load_dotenv()
@@ -21,7 +20,7 @@ CLIBase.ensure_api_key!()
2120
model = System.get_env("COORDINATOR_MODEL") || "openrouter:google/gemini-3.1-flash-lite-preview"
2221
timeout = 60_000
2322

24-
IO.puts("=== Coordinator + Worker Delegation Test ===")
23+
IO.puts("=== Coordinator + Worker Test (auto-return) ===")
2524
IO.puts("Model: #{model}\n")
2625

2726
# --- LLM callback ---
@@ -41,7 +40,7 @@ llm = fn %{system: system, messages: messages} ->
4140
end
4241
end
4342

44-
# --- Datasets (only for the worker) ---
43+
# --- Datasets (only the worker sees these) ---
4544

4645
datasets = %{
4746
"products" => SampleData.products(),
@@ -50,19 +49,15 @@ datasets = %{
5049
"expenses" => SampleData.expenses()
5150
}
5251

53-
# --- Worker: a function tool that internally runs a SubAgent ---
54-
# This is the key pattern: the worker is a plain function tool from
55-
# the coordinator's perspective, but internally spawns a full SubAgent
56-
# with its own LLM call and data access.
52+
# --- Worker: function tool that spawns a SubAgent ---
5753

5854
worker_agent =
5955
SubAgent.new(
6056
prompt: "{{question}}",
6157
signature: "(question :string) -> :any",
6258
context_descriptions: SampleData.context_descriptions(),
6359
system_prompt: %{
64-
prefix:
65-
"You are a data analyst. Answer the question precisely using the datasets provided.",
60+
prefix: "You are a data analyst. Answer the question precisely using the datasets.",
6661
language_spec: :single_shot
6762
},
6863
max_turns: 1
@@ -78,35 +73,32 @@ analyst_tool = fn %{"question" => question} ->
7873
end
7974
end
8075

81-
# --- Coordinator agent: decomposes and delegates ---
82-
# The coordinator has NO datasets — it can only call the analyst tool.
83-
# It must break the problem into sub-questions and combine results.
76+
# --- Coordinator: auto-return mode, no data, only the analyst tool ---
8477

8578
coordinator =
8679
SubAgent.new(
8780
prompt: "{{mission}}",
8881
signature: "(mission :string) -> :map",
82+
completion_mode: :auto,
8983
tools: %{
9084
"analyst" =>
9185
{analyst_tool,
9286
signature: "(question :string) -> :any",
9387
description:
94-
"Answers a data analysis question. Delegates to a sub-agent with full dataset access. " <>
95-
"Available datasets: employees (200 records with id, department, salary, remote, level), " <>
96-
"expenses (800 records with employee_id, amount, category, status), " <>
97-
"orders (1000 records with customer_id, total, created_at, status), " <>
98-
"products (500 records with category, price, stock). " <>
88+
"Answers a data analysis question using datasets not available to you. " <>
89+
"Datasets: employees (id, department, salary, remote, level), " <>
90+
"expenses (employee_id, amount, category, status), " <>
91+
"orders (customer_id, total, created_at, status), " <>
92+
"products (category, price, stock). " <>
9993
"Ask focused questions that return simple values (numbers, lists, maps)."}
10094
},
10195
system_prompt: %{
10296
prefix: """
103-
You are a coordinator that breaks down complex data analysis tasks.
104-
You have an analyst tool that can query datasets and return results.
105-
Break the mission into focused sub-questions, call the analyst for each,
106-
then combine the results into the final answer.
107-
You do NOT have direct access to data — you must use the analyst tool.
97+
You are a coordinator. You have NO direct data access.
98+
Use the analyst tool to query datasets. Use println to inspect results.
99+
When you have all the data you need, write your final answer as the last expression (no println).
108100
""",
109-
language_spec: :multi_turn
101+
language_spec: :auto_return
110102
},
111103
max_turns: 6,
112104
timeout: 120_000,
@@ -120,9 +112,7 @@ tests = [
120112
name: "Remote vs Office expenses",
121113
mission:
122114
"Compare average expense amounts between remote and office employees. " <>
123-
"Ask the analyst for the average expense amount for remote employees, " <>
124-
"then ask for the average expense amount for office employees. " <>
125-
"Return a map with :remote_avg, :office_avg, and :remote_higher (boolean).",
115+
"Return a map with :remote_avg (number), :office_avg (number), and :remote_higher (boolean).",
126116
check: fn result ->
127117
is_map(result) and
128118
Map.has_key?(result, :remote_avg) and
@@ -142,6 +132,17 @@ tests = [
142132
Map.has_key?(result, :silver) and
143133
Map.has_key?(result, :gold)
144134
end
135+
},
136+
%{
137+
name: "Department with highest avg salary",
138+
mission:
139+
"Find which department has the highest average salary. " <>
140+
"Return a map with :department (string) and :avg_salary (number).",
141+
check: fn result ->
142+
is_map(result) and
143+
Map.has_key?(result, :department) and
144+
Map.has_key?(result, :avg_salary)
145+
end
145146
}
146147
]
147148

@@ -159,10 +160,12 @@ for test <- tests do
159160
{:ok, step} ->
160161
SubAgent.Debug.print_trace(step, raw: true, usage: true)
161162
result = step.return
163+
turns = length(step.turns)
162164

163165
passed = test.check.(result)
164166
status = if passed, do: "PASS", else: "FAIL"
165167
IO.puts("\nResult: #{inspect(result, limit: 10, pretty: true)}")
168+
IO.puts("Turns: #{turns}")
166169
IO.puts("#{status}\n")
167170

168171
{:error, step} ->

0 commit comments

Comments
 (0)