Skip to content

Commit 8cc3ceb

Browse files
benbarclayteknium1
authored andcommitted
fix(mcp): add half-open state to circuit breaker
The MCP circuit breaker previously had no path back to the closed state: once _server_error_counts[srv] reached _CIRCUIT_BREAKER_THRESHOLD the gate short-circuited every subsequent call, so the only reset path (on successful call) was unreachable. A single transient 3-failure blip (bad network, server restart, expired token) permanently disabled every tool on that MCP server for the rest of the agent session. Introduce a classic closed/open/half-open state machine: - Track a per-server breaker-open timestamp in _server_breaker_opened_at alongside the existing failure count. - Add _CIRCUIT_BREAKER_COOLDOWN_SEC (60s). Once the count reaches threshold, calls short-circuit for the cooldown window. - After the cooldown elapses, the *next* call falls through as a half-open probe that actually hits the session. Success resets the breaker via _reset_server_error; failure re-bumps the count via _bump_server_error, which re-stamps the open timestamp and re-arms the cooldown. The error message now includes the live failure count and an "Auto-retry available in ~Ns" hint so the model knows the breaker will self-heal rather than giving up on the tool for the whole session. Covers tests 1 (half-opens after cooldown) and 2 (reopens on probe failure); test 3 (cleared on reconnect) still fails pending fix NousResearch#2.
1 parent 724377c commit 8cc3ceb

1 file changed

Lines changed: 68 additions & 17 deletions

File tree

tools/mcp_tool.py

Lines changed: 68 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1249,9 +1249,47 @@ async def shutdown(self):
12491249
# _CIRCUIT_BREAKER_THRESHOLD consecutive failures, the handler returns
12501250
# a "server unreachable" message that tells the model to stop retrying,
12511251
# preventing the 90-iteration burn loop described in #10447.
1252-
# Reset to 0 on any successful call.
1252+
#
1253+
# State machine:
1254+
# closed — error count below threshold; all calls go through.
1255+
# open — threshold reached; calls short-circuit until the
1256+
# cooldown elapses.
1257+
# half-open — cooldown elapsed; the next call is a probe that
1258+
# actually hits the session. Probe success → closed.
1259+
# Probe failure → reopens (cooldown re-armed).
1260+
#
1261+
# ``_server_breaker_opened_at`` records the monotonic timestamp when
1262+
# the breaker most recently transitioned into the open state. Use the
1263+
# ``_bump_server_error`` / ``_reset_server_error`` helpers to mutate
1264+
# this state — they keep the count and timestamp in sync.
12531265
_server_error_counts: Dict[str, int] = {}
1266+
_server_breaker_opened_at: Dict[str, float] = {}
12541267
_CIRCUIT_BREAKER_THRESHOLD = 3
1268+
_CIRCUIT_BREAKER_COOLDOWN_SEC = 60.0
1269+
1270+
1271+
def _bump_server_error(server_name: str) -> None:
1272+
"""Increment the consecutive-failure count for ``server_name``.
1273+
1274+
When the count crosses :data:`_CIRCUIT_BREAKER_THRESHOLD`, stamp the
1275+
breaker-open timestamp so the cooldown clock starts (or re-starts,
1276+
for probe failures in the half-open state).
1277+
"""
1278+
n = _server_error_counts.get(server_name, 0) + 1
1279+
_server_error_counts[server_name] = n
1280+
if n >= _CIRCUIT_BREAKER_THRESHOLD:
1281+
_server_breaker_opened_at[server_name] = time.monotonic()
1282+
1283+
1284+
def _reset_server_error(server_name: str) -> None:
1285+
"""Fully close the breaker for ``server_name``.
1286+
1287+
Clears both the failure count and the breaker-open timestamp. Call
1288+
this on any unambiguous success signal (successful tool call,
1289+
successful reconnect, manual /mcp refresh).
1290+
"""
1291+
_server_error_counts[server_name] = 0
1292+
_server_breaker_opened_at.pop(server_name, None)
12551293

12561294
# ---------------------------------------------------------------------------
12571295
# Auth-failure detection helpers (Task 6 of MCP OAuth consolidation)
@@ -1396,10 +1434,10 @@ async def _recover():
13961434
try:
13971435
parsed = json.loads(result)
13981436
if "error" not in parsed:
1399-
_server_error_counts[server_name] = 0
1437+
_reset_server_error(server_name)
14001438
return result
14011439
except (json.JSONDecodeError, TypeError):
1402-
_server_error_counts[server_name] = 0
1440+
_reset_server_error(server_name)
14031441
return result
14041442
except Exception as retry_exc:
14051443
logger.warning(
@@ -1410,7 +1448,7 @@ async def _recover():
14101448
# No recovery available, or retry also failed: surface a structured
14111449
# needs_reauth error. Bumps the circuit breaker so the model stops
14121450
# retrying the tool.
1413-
_server_error_counts[server_name] = _server_error_counts.get(server_name, 0) + 1
1451+
_bump_server_error(server_name)
14141452
return json.dumps({
14151453
"error": (
14161454
f"MCP server '{server_name}' requires re-authentication. "
@@ -1614,20 +1652,33 @@ def _handler(args: dict, **kwargs) -> str:
16141652
# Circuit breaker: if this server has failed too many times
16151653
# consecutively, short-circuit with a clear message so the model
16161654
# stops retrying and uses alternative approaches (#10447).
1655+
#
1656+
# Once the cooldown elapses, the breaker transitions to
1657+
# half-open: we let the *next* call through as a probe. On
1658+
# success the success-path below resets the breaker; on
1659+
# failure the error paths below bump the count again, which
1660+
# re-stamps the open-time via _bump_server_error (re-arming
1661+
# the cooldown).
16171662
if _server_error_counts.get(server_name, 0) >= _CIRCUIT_BREAKER_THRESHOLD:
1618-
return json.dumps({
1619-
"error": (
1620-
f"MCP server '{server_name}' is unreachable after "
1621-
f"{_CIRCUIT_BREAKER_THRESHOLD} consecutive failures. "
1622-
f"Do NOT retry this tool — use alternative approaches "
1623-
f"or ask the user to check the MCP server."
1624-
)
1625-
}, ensure_ascii=False)
1663+
opened_at = _server_breaker_opened_at.get(server_name, 0.0)
1664+
age = time.monotonic() - opened_at
1665+
if age < _CIRCUIT_BREAKER_COOLDOWN_SEC:
1666+
remaining = max(1, int(_CIRCUIT_BREAKER_COOLDOWN_SEC - age))
1667+
return json.dumps({
1668+
"error": (
1669+
f"MCP server '{server_name}' is unreachable after "
1670+
f"{_server_error_counts[server_name]} consecutive "
1671+
f"failures. Auto-retry available in ~{remaining}s. "
1672+
f"Do NOT retry this tool yet — use alternative "
1673+
f"approaches or ask the user to check the MCP server."
1674+
)
1675+
}, ensure_ascii=False)
1676+
# Cooldown elapsed → fall through as a half-open probe.
16261677

16271678
with _lock:
16281679
server = _servers.get(server_name)
16291680
if not server or not server.session:
1630-
_server_error_counts[server_name] = _server_error_counts.get(server_name, 0) + 1
1681+
_bump_server_error(server_name)
16311682
return json.dumps({
16321683
"error": f"MCP server '{server_name}' is not connected"
16331684
}, ensure_ascii=False)
@@ -1676,11 +1727,11 @@ def _call_once():
16761727
try:
16771728
parsed = json.loads(result)
16781729
if "error" in parsed:
1679-
_server_error_counts[server_name] = _server_error_counts.get(server_name, 0) + 1
1730+
_bump_server_error(server_name)
16801731
else:
1681-
_server_error_counts[server_name] = 0 # success — reset
1732+
_reset_server_error(server_name) # success — reset
16821733
except (json.JSONDecodeError, TypeError):
1683-
_server_error_counts[server_name] = 0 # non-JSON = success
1734+
_reset_server_error(server_name) # non-JSON = success
16841735
return result
16851736
except InterruptedError:
16861737
return _interrupted_call_result()
@@ -1695,7 +1746,7 @@ def _call_once():
16951746
if recovered is not None:
16961747
return recovered
16971748

1698-
_server_error_counts[server_name] = _server_error_counts.get(server_name, 0) + 1
1749+
_bump_server_error(server_name)
16991750
logger.error(
17001751
"MCP tool %s/%s call failed: %s",
17011752
server_name, tool_name, exc,

0 commit comments

Comments
 (0)