From 9a8eba4eb7143b42e7c23118de0aafa6ff3761ae Mon Sep 17 00:00:00 2001 From: Ron Forrester Date: Fri, 17 Apr 2026 21:53:06 -0700 Subject: [PATCH] fix(runtime): self-heal orphaned tool_result blocks on load + compact (#5813) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Orphaned tool_result messages (whose paired assistant tool_use was lost to compaction or a crash) bricked Signal-channel sessions with repeated Anthropic 400 "unexpected tool_use_id in tool_result blocks" errors. - Load paths (CLI session file + orchestrator channel hydration) now run remove_orphaned_tool_messages so a corrupt persisted session heals instead of requiring manual file deletion. - Compaction's repair pass now delegates to the canonical remove_orphaned_tool_messages instead of a weak adjacency heuristic. - Orphan detection parses the assistant's structured tool_calls array instead of substring-matching content — summaries that preserve the orphan's id in prose no longer fool the check. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../zeroclaw-channels/src/orchestrator/mod.rs | 6 ++ .../src/agent/context_compressor.rs | 75 ++++++++++-------- crates/zeroclaw-runtime/src/agent/history.rs | 8 ++ .../src/agent/history_pruner.rs | 77 +++++++++++++++---- crates/zeroclaw-runtime/src/agent/loop_.rs | 31 ++++++++ 5 files changed, 146 insertions(+), 51 deletions(-) diff --git a/crates/zeroclaw-channels/src/orchestrator/mod.rs b/crates/zeroclaw-channels/src/orchestrator/mod.rs index 39c243bcd4..f64f13d69a 100644 --- a/crates/zeroclaw-channels/src/orchestrator/mod.rs +++ b/crates/zeroclaw-channels/src/orchestrator/mod.rs @@ -5600,6 +5600,12 @@ pub async fn start_channels(config: Config) -> Result<()> { if msgs.len() > MAX_CHANNEL_HISTORY { msgs.drain(..msgs.len() - MAX_CHANNEL_HISTORY); } + // Self-heal: strip orphaned tool_result messages left by a + // prior compaction that dropped the assistant tool_use without + // its paired tool_result. 
Without this, the session is bricked + // until the file is deleted because every API call fails with + // 400 "unexpected tool_use_id in tool_result blocks". See #5813. + zeroclaw_runtime::agent::history_pruner::remove_orphaned_tool_messages(&mut msgs); // Close orphaned user turns from crashed sessions. if msgs.last().is_some_and(|m| m.role == "user") { let closure = diff --git a/crates/zeroclaw-runtime/src/agent/context_compressor.rs b/crates/zeroclaw-runtime/src/agent/context_compressor.rs index dfd8781369..13ddb2dd65 100644 --- a/crates/zeroclaw-runtime/src/agent/context_compressor.rs +++ b/crates/zeroclaw-runtime/src/agent/context_compressor.rs @@ -409,39 +409,16 @@ fn align_boundary_backward(messages: &[ChatMessage], idx: usize) -> usize { // Tool pair repair // --------------------------------------------------------------------------- -/// Remove orphaned tool_results and add stubs for orphaned tool_calls. +/// Remove orphaned tool_result messages whose assistant (tool_use) +/// counterpart was summarized away. /// -/// After compression, some tool results may reference tool_calls that were -/// summarized away, and vice versa. This function cleans up the history -/// so every tool_result has a matching assistant message and every -/// tool_call-bearing assistant message has results. +/// Delegates to `history_pruner::remove_orphaned_tool_messages`, which +/// matches on the structured `tool_call_id` payload — catching orphans +/// regardless of where they sit relative to the [CONTEXT SUMMARY] marker. +/// Without this, a compaction that trims a `tool_use` but leaves its +/// `tool_result` bricks the session with a 400 from Anthropic. See #5813. fn repair_tool_pairs(messages: &mut Vec<ChatMessage>) { - // Heuristic: tool messages whose content references a call ID that no longer - // exists in any assistant message should be removed. 
Since ChatMessage is a - // simple role+content struct (no structured tool_call_id field), we use a - // simpler approach: remove any "tool" message that immediately follows the - // [CONTEXT SUMMARY] message (it's orphaned by definition). - let mut i = 0; - while i < messages.len() { - if messages[i].content.contains("[CONTEXT SUMMARY") { - // Remove any immediately following orphaned tool results - while i + 1 < messages.len() && messages[i + 1].role == "tool" { - messages.remove(i + 1); - } - } - i += 1; - } - - // Also check for tool results at the very start (after system prompt) that - // are orphaned because their assistant message was compressed. - let start = if messages.first().is_some_and(|m| m.role == "system") { - 1 - } else { - 0 - }; - while start < messages.len() && messages[start].role == "tool" { - messages.remove(start); - } + crate::agent::history_pruner::remove_orphaned_tool_messages(messages); } // --------------------------------------------------------------------------- @@ -587,12 +564,42 @@ mod tests { let mut messages = vec![ msg("system", "sys"), msg("user", "q"), - msg("assistant", "calling tool"), - msg("tool", "result"), + msg( + "assistant", + r#"{"content":"calling","tool_calls":[{"id":"toolu_abc","name":"shell","arguments":"{}"}]}"#, + ), + msg("tool", r#"{"tool_call_id":"toolu_abc","content":"result"}"#), msg("user", "thanks"), ]; repair_tool_pairs(&mut messages); - assert_eq!(messages.len(), 5); // no change + assert_eq!(messages.len(), 5); // no change — pairing intact + } + + /// Regression test for #5813. The compact pass must remove orphaned + /// tool_result messages even when they sit after additional turns — + /// not just immediately after the [CONTEXT SUMMARY] marker. Otherwise + /// a post-compaction Anthropic call fails with 400 "unexpected + /// tool_use_id found in tool_result blocks". 
+ #[test] + fn test_repair_tool_pairs_removes_orphan_after_intermediate_user() { + let mut messages = vec![ + msg("system", "sys"), + msg( + "assistant", + "[CONTEXT SUMMARY \u{2014} 4 earlier messages compressed]", + ), + msg("user", "follow-up"), + msg( + "tool", + r#"{"tool_call_id":"toolu_GONE","content":"stale result"}"#, + ), + msg("user", "next"), + ]; + repair_tool_pairs(&mut messages); + assert!( + !messages.iter().any(|m| m.role == "tool"), + "orphaned tool whose tool_use was summarized must be removed" + ); } #[test] diff --git a/crates/zeroclaw-runtime/src/agent/history.rs b/crates/zeroclaw-runtime/src/agent/history.rs index c9f1d90d5c..9938888b15 100644 --- a/crates/zeroclaw-runtime/src/agent/history.rs +++ b/crates/zeroclaw-runtime/src/agent/history.rs @@ -195,6 +195,14 @@ pub fn load_interactive_session_history( state.history.insert(0, ChatMessage::system(system_prompt)); } + // Self-heal persisted sessions that were written with orphaned + // tool_result messages (e.g. a crash mid-compaction, or a trim that + // dropped the assistant tool_use block but left its tool_result). + // Without this the next API call fails with 400 "unexpected tool_use_id + // found in tool_result blocks" and the session stays bricked until the + // file is deleted. See #5813. 
+ remove_orphaned_tool_messages(&mut state.history); + Ok(state.history) } diff --git a/crates/zeroclaw-runtime/src/agent/history_pruner.rs b/crates/zeroclaw-runtime/src/agent/history_pruner.rs index 2be24b4ebd..b9594385a2 100644 --- a/crates/zeroclaw-runtime/src/agent/history_pruner.rs +++ b/crates/zeroclaw-runtime/src/agent/history_pruner.rs @@ -70,18 +70,17 @@ pub fn remove_orphaned_tool_messages(messages: &mut Vec<ChatMessage>) -> usize { let mut i = 0; while i < messages.len() { if messages[i].role == "assistant" - && messages[i].content.contains("tool_calls") + && extract_assistant_tool_call_ids(&messages[i].content).is_some() && i > 0 && messages[i - 1].role == "assistant" { - // Collect tool_call_ids from this assistant to find matching tool_results. - let doomed_content = messages[i].content.clone(); + let doomed_ids = + extract_assistant_tool_call_ids(&messages[i].content).unwrap_or_default(); messages.remove(i); removed += 1; - // Remove following tool messages that reference this assistant. while i < messages.len() && messages[i].role == "tool" { let dominated = match extract_tool_call_id(&messages[i].content) { - Some(id) => doomed_content.contains(&id), + Some(id) => doomed_ids.iter().any(|d| d == &id), None => true, }; if dominated { @@ -97,7 +96,11 @@ pub fn remove_orphaned_tool_messages(messages: &mut Vec<ChatMessage>) -> usize { } // Pass 2: Remove remaining orphan tool messages whose tool_call_id - // doesn't appear in the immediately preceding assistant. + // is not in the preceding assistant's structured tool_calls array. + // A substring match on the assistant's *text* is NOT sufficient — + // compaction summaries are instructed to preserve identifiers, so an + // id can appear in prose without an actual tool_use block backing it + // (see #5813). 
i = 0; while i < messages.len() { if messages[i].role != "tool" { @@ -112,17 +115,13 @@ pub fn remove_orphaned_tool_messages(messages: &mut Vec<ChatMessage>) -> usize { let is_orphan = match assistant_idx { None => true, - Some(idx) => { - let assistant_content = &messages[idx].content; - if assistant_content.contains("tool_calls") { - match extract_tool_call_id(&messages[i].content) { - Some(tool_call_id) => !assistant_content.contains(&tool_call_id), - None => false, - } - } else { - true - } - } + Some(idx) => match extract_assistant_tool_call_ids(&messages[idx].content) { + None => true, + Some(ids) => match extract_tool_call_id(&messages[i].content) { + Some(tool_call_id) => !ids.iter().any(|id| id == &tool_call_id), + None => false, + }, + }, }; if is_orphan { @@ -154,6 +153,20 @@ fn extract_tool_call_id(content: &str) -> Option<String> { .map(|s| s.to_string()) } +/// Extract the list of structured tool-call IDs an assistant message +/// is claiming to have invoked, if any. Returns `None` when the content +/// does not parse as a JSON object with a `tool_calls` array — meaning the +/// assistant has no native tool_use blocks backing any tool_results. +fn extract_assistant_tool_call_ids(content: &str) -> Option<Vec<String>> { + let value: serde_json::Value = serde_json::from_str(content).ok()?; + let arr = value.get("tool_calls")?.as_array()?; + let ids: Vec<String> = arr + .iter() + .filter_map(|call| call.get("id").and_then(|v| v.as_str()).map(str::to_owned)) + .collect(); + if ids.is_empty() { None } else { Some(ids) } +} + // --------------------------------------------------------------------------- // Public entry point // --------------------------------------------------------------------------- @@ -755,6 +768,36 @@ mod tests { ); } + /// Regression test for issue #5813: a compaction summary preserves + /// identifiers by design (UUIDs, tokens, tool_call_ids). That means the + /// summary text may contain the tool_call_id of a tool_result whose + /// tool_use was dropped. 
The orphan detector must not be fooled by a + /// substring match on the summary — it must confirm the id appears in + /// a structured tool_calls array. + #[test] + fn orphan_tool_not_fooled_by_id_in_summary_text() { + let summary = format!( + "[CONTEXT SUMMARY \u{2014} 4 messages compressed]\n\ + Earlier turns invoked shell with tool_calls id toolu_01Orphan \ + and returned ok." + ); + let mut messages = vec![ + msg("system", "sys"), + msg("assistant", &summary), + msg( + "tool", + r#"{"tool_call_id":"toolu_01Orphan","content":"stale"}"#, + ), + msg("user", "new question"), + ]; + let removed = remove_orphaned_tool_messages(&mut messages); + assert_eq!( + removed, 1, + "orphan must be removed even if its id is mentioned in summary text" + ); + assert!(!messages.iter().any(|m| m.role == "tool")); + } + /// Regression test for issue #5743: MiniMax rejects orphaned tool-role /// messages whose assistant (with `tool_calls`) was trimmed by the /// channel orchestrator's proactive history trimming. diff --git a/crates/zeroclaw-runtime/src/agent/loop_.rs b/crates/zeroclaw-runtime/src/agent/loop_.rs index 82076a853a..16e56d9073 100644 --- a/crates/zeroclaw-runtime/src/agent/loop_.rs +++ b/crates/zeroclaw-runtime/src/agent/loop_.rs @@ -3696,6 +3696,37 @@ mod tests { assert_eq!(restored[1].content, "orphan"); } + /// Regression test for issue #5813: a persisted session whose assistant + /// (tool_use) was lost to compaction must self-heal on load so the next + /// API call doesn't fail with "unexpected tool_use_id found in tool_result + /// blocks". 
+ #[test] + fn load_interactive_session_heals_orphaned_tool_result() { + let dir = tempdir().unwrap(); + let path = dir.path().join("session.json"); + let orphan_tool = ChatMessage::tool( + r#"{"tool_call_id":"toolu_01OrphanFromCompaction","content":"stale result"}"#, + ); + let payload = serde_json::to_string_pretty(&InteractiveSessionState { + version: 1, + history: vec![ + ChatMessage::system("sys"), + orphan_tool, + ChatMessage::user("next question"), + ], + }) + .unwrap(); + std::fs::write(&path, payload).unwrap(); + + let restored = load_interactive_session_history(&path, "fallback").unwrap(); + + assert!( + !restored.iter().any(|m| m.role == "tool"), + "orphaned tool_result should be removed on load; got roles {:?}", + restored.iter().map(|m| &m.role).collect::<Vec<_>>() + ); + } + use super::*; use async_trait::async_trait; use base64::{Engine as _, engine::general_purpose::STANDARD};