Skip to content

Commit 460d2db

Browse files
kyan12RationallyPrime
authored andcommitted
fix(gateway): preserve resume marker on interrupted restart
1 parent e0bb712 commit 460d2db

2 files changed

Lines changed: 49 additions & 1 deletion

File tree

gateway/run.py

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -987,6 +987,26 @@ def _normalize_empty_agent_response(
987987
return response
988988

989989

990+
def _should_clear_resume_pending_after_turn(agent_result: dict) -> bool:
991+
"""Return True only when a gateway turn really completed successfully.
992+
993+
Restart recovery uses ``resume_pending`` as a durable marker for sessions
994+
interrupted during gateway drain. A soft interrupt can still bubble out as
995+
a syntactically normal agent result with an empty final response; clearing
996+
the marker in that case loses the recovery signal and startup auto-resume
997+
has nothing to schedule.
998+
"""
999+
if not isinstance(agent_result, dict):
1000+
return False
1001+
if agent_result.get("interrupted"):
1002+
return False
1003+
if agent_result.get("failed") or agent_result.get("partial") or agent_result.get("error"):
1004+
return False
1005+
if agent_result.get("completed") is False:
1006+
return False
1007+
return True
1008+
1009+
9901010
class GatewayRunner:
9911011
"""
9921012
Main gateway controller.
@@ -6589,7 +6609,7 @@ async def _handle_message_with_agent(self, event, source, _quick_key: str, run_g
65896609
# shutdown) — the turn ran to completion, so recovery
65906610
# succeeded and subsequent messages should no longer receive
65916611
# the restart-interruption system note.
6592-
if session_key:
6612+
if session_key and _should_clear_resume_pending_after_turn(agent_result):
65936613
self._clear_restart_failure_count(session_key)
65946614
try:
65956615
self.session_store.clear_resume_pending(session_key)
@@ -14068,6 +14088,11 @@ def _approval_notify_sync(approval_data: dict) -> None:
1406814088
"messages": result.get("messages", []),
1406914089
"api_calls": result.get("api_calls", 0),
1407014090
"failed": result.get("failed", False),
14091+
"partial": result.get("partial", False),
14092+
"completed": result.get("completed"),
14093+
"interrupted": result.get("interrupted", False),
14094+
"interrupt_message": result.get("interrupt_message"),
14095+
"error": result.get("error"),
1407114096
"compression_exhausted": result.get("compression_exhausted", False),
1407214097
"tools": tools_holder[0] or [],
1407314098
"history_offset": len(agent_history),
@@ -14183,6 +14208,11 @@ def _approval_notify_sync(approval_data: dict) -> None:
1418314208
"last_reasoning": result.get("last_reasoning"),
1418414209
"messages": result_holder[0].get("messages", []) if result_holder[0] else [],
1418514210
"api_calls": result_holder[0].get("api_calls", 0) if result_holder[0] else 0,
14211+
"completed": result_holder[0].get("completed") if result_holder[0] else None,
14212+
"interrupted": result_holder[0].get("interrupted", False) if result_holder[0] else False,
14213+
"partial": result_holder[0].get("partial", False) if result_holder[0] else False,
14214+
"error": result_holder[0].get("error") if result_holder[0] else None,
14215+
"interrupt_message": result_holder[0].get("interrupt_message") if result_holder[0] else None,
1418614216
"tools": tools_holder[0] or [],
1418714217
"history_offset": _effective_history_offset,
1418814218
"last_prompt_tokens": _last_prompt_toks,

tests/gateway/test_restart_resume_pending.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
_coerce_gateway_timestamp,
4040
_is_fresh_gateway_interruption,
4141
_last_transcript_timestamp,
42+
_should_clear_resume_pending_after_turn,
4243
)
4344
from gateway.session import SessionEntry, SessionSource, SessionStore
4445
from tests.gateway.restart_test_helpers import (
@@ -52,6 +53,23 @@
5253
# ---------------------------------------------------------------------------
5354

5455

56+
def test_resume_pending_is_cleared_only_after_successful_turn():
57+
"""Interrupted/failed drain results must keep the restart recovery marker.
58+
59+
Regression for dogfood failure: during gateway restart the interrupted run
60+
returned an empty final response and was normalized into a user-facing
61+
fallback, but the gateway cleared ``resume_pending`` before startup could
62+
auto-resume it.
63+
"""
64+
assert _should_clear_resume_pending_after_turn({"final_response": "done"}) is True
65+
assert _should_clear_resume_pending_after_turn({"completed": True}) is True
66+
assert _should_clear_resume_pending_after_turn({"interrupted": True}) is False
67+
assert _should_clear_resume_pending_after_turn({"completed": False}) is False
68+
assert _should_clear_resume_pending_after_turn({"failed": True}) is False
69+
assert _should_clear_resume_pending_after_turn({"partial": True}) is False
70+
assert _should_clear_resume_pending_after_turn({"error": "boom"}) is False
71+
72+
5573
def _make_source(platform=Platform.TELEGRAM, chat_id="123", user_id="u1"):
5674
return SessionSource(platform=platform, chat_id=chat_id, user_id=user_id)
5775

0 commit comments

Comments
 (0)