Skip to content

Commit f25d3ec

Browse files
Brecht-H and claude
authored and committed
fix(kanban): suppress dispatcher stuck-warn when ready queue holds only non-spawnable assignees
After PR #20105 (dispatcher skips ready tasks whose assignee fails ``profile_exists()`` to prevent the orion-cc/orion-research crash loop), the gateway and CLI emit a spurious "kanban dispatcher stuck: ready queue non-empty for N consecutive ticks but 0 workers spawned" warning every 5 minutes on multi-lane setups where the queue is steadily full of human-pulled work assigned to terminal lanes.

The warning is intended to catch real failure modes (broken PATH, missing venv, credential loss for a real Hermes profile). On a multi-lane host it fires forever even though everything is healthy: the dispatcher correctly chose not to spawn, and there is nothing for the operator to fix.

Changes:

* ``DispatchResult`` gains a ``skipped_nonspawnable`` field (separate from ``skipped_unassigned``) so callers can distinguish "task missing an owner — operator should route it" from "task owned by a control-plane lane — terminal will pull it".
* ``dispatch_once`` routes the ``not profile_exists(assignee)`` skip into the new bucket (was lumped into ``skipped_unassigned``).
* New helper ``has_spawnable_ready(conn)`` returns True iff at least one ready+assigned+unclaimed task in the DB has an assignee that maps to a real Hermes profile. Falls back to legacy "any ready+assigned" when ``profile_exists`` is unimportable so degraded installs still surface the original warning.
* The gateway dispatcher (``gateway/run.py``) and the CLI standalone daemon (``hermes_cli/kanban.py``) both swap their cheap ``ready_nonempty`` probe to use ``has_spawnable_ready``. The stuck-warn now fires only when there is genuine spawnable work the dispatcher failed to start.
* CLI dispatch output prints ``Skipped (non-spawnable assignee — terminal lane, OK)`` for visibility without alarm.

Tests:

* New ``has_spawnable_ready`` cases (empty queue, terminal-lane only, mixed real+terminal).
* New ``test_dispatch_skips_nonspawnable_into_separate_bucket`` verifies the bucketing change.
* Updated ``test_dispatch_skips_unassigned`` to assert no cross-leak.
* Added ``all_assignees_spawnable`` fixture in ``tests/hermes_cli/conftest.py`` and threaded it through dispatcher tests that use synthetic assignees ("alice", "bob"). PR #20105 (the parent commit) silently broke 8 such tests by routing those assignees into ``skipped_nonspawnable`` instead of spawning; this PR repairs them as part of the same code area.

Verified locally: 246/246 kanban-suite tests pass.

Stacks on top of fix/kanban-dispatcher-skip-missing-profile-2026-05-05 (PR #20105). Reviewer: this PR is meant to merge AFTER #20105.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent ca5595f commit f25d3ec

6 files changed

Lines changed: 152 additions & 25 deletions

File tree

gateway/run.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3906,7 +3906,17 @@ def _tick_once() -> "list[tuple[str, Optional[object]]]":
39063906
return out
39073907

39083908
def _ready_nonempty() -> bool:
3909-
"""Cheap probe: is there a ready+assigned+unclaimed task on ANY board?"""
3909+
"""Cheap probe: is there at least one ready+assigned+unclaimed
3910+
task on ANY board whose assignee maps to a real Hermes profile
3911+
(i.e. one the dispatcher would actually spawn for)?
3912+
3913+
Tasks assigned to control-plane lanes (e.g. ``orion-cc``,
3914+
``orion-research``) are pulled by terminals via
3915+
``claim_task`` directly and never spawnable, so a queue full
3916+
of those is "correctly idle", not "stuck". Filtering them out
3917+
here keeps the stuck-warn fire only on real failures (broken
3918+
PATH, missing venv, credential loss for a real Hermes profile).
3919+
"""
39103920
try:
39113921
boards = _kb.list_boards(include_archived=False)
39123922
except Exception:
@@ -3916,12 +3926,7 @@ def _ready_nonempty() -> bool:
39163926
conn = None
39173927
try:
39183928
conn = _kb.connect(board=slug)
3919-
row = conn.execute(
3920-
"SELECT 1 FROM tasks "
3921-
"WHERE status = 'ready' AND assignee IS NOT NULL "
3922-
" AND claim_lock IS NULL LIMIT 1"
3923-
).fetchone()
3924-
if row is not None:
3929+
if _kb.has_spawnable_ready(conn):
39253930
return True
39263931
except Exception:
39273932
continue

hermes_cli/kanban.py

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1274,6 +1274,7 @@ def _cmd_dispatch(args: argparse.Namespace) -> int:
12741274
for (tid, who, ws) in res.spawned
12751275
],
12761276
"skipped_unassigned": res.skipped_unassigned,
1277+
"skipped_nonspawnable": res.skipped_nonspawnable,
12771278
}, indent=2))
12781279
return 0
12791280
print(f"Reclaimed: {res.reclaimed}")
@@ -1293,6 +1294,11 @@ def _cmd_dispatch(args: argparse.Namespace) -> int:
12931294
print(f" - {tid} -> {who} @ {ws or '-'}{tag}")
12941295
if res.skipped_unassigned:
12951296
print(f"Skipped (unassigned): {', '.join(res.skipped_unassigned)}")
1297+
if res.skipped_nonspawnable:
1298+
print(
1299+
f"Skipped (non-spawnable assignee — terminal lane, OK): "
1300+
f"{', '.join(res.skipped_nonspawnable)}"
1301+
)
12961302
return 0
12971303

12981304

@@ -1404,16 +1410,18 @@ def _on_tick(res):
14041410
)
14051411

14061412
def _ready_queue_nonempty() -> bool:
1407-
"""Cheap SELECT — just asks whether there's at least one ready
1408-
task with an assignee that the dispatcher could have picked up."""
1413+
"""Cheap probe — is there at least one ready+assigned+unclaimed
1414+
task whose assignee maps to a real Hermes profile (i.e. one the
1415+
dispatcher would actually try to spawn for)?
1416+
1417+
Filters out tasks assigned to control-plane lanes
1418+
(e.g. ``orion-cc``, ``orion-research``) that are pulled by
1419+
terminals via ``claim_task`` directly — those are correctly idle
1420+
from the dispatcher's perspective, not stuck.
1421+
"""
14091422
try:
14101423
with kb.connect() as conn:
1411-
row = conn.execute(
1412-
"SELECT 1 FROM tasks "
1413-
"WHERE status = 'ready' AND assignee IS NOT NULL "
1414-
" AND claim_lock IS NULL LIMIT 1"
1415-
).fetchone()
1416-
return row is not None
1424+
return kb.has_spawnable_ready(conn)
14171425
except Exception:
14181426
return False
14191427

hermes_cli/kanban_db.py

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2118,6 +2118,15 @@ class DispatchResult:
21182118
spawned: list[tuple[str, str, str]] = field(default_factory=list)
21192119
"""List of ``(task_id, assignee, workspace_path)`` triples."""
21202120
skipped_unassigned: list[str] = field(default_factory=list)
2121+
"""Ready task ids skipped because they have no assignee at all.
2122+
Operator-actionable — usually a misfiled task waiting for routing."""
2123+
skipped_nonspawnable: list[str] = field(default_factory=list)
2124+
"""Ready task ids skipped because their assignee names a control-plane
2125+
lane (a Claude Code terminal like ``orion-cc``) rather than a Hermes
2126+
profile. Expected steady-state on multi-lane setups; NOT an
2127+
operator-actionable failure. Tracked separately so health telemetry
2128+
can distinguish "real stuck" (nothing spawned but spawnable work
2129+
available) from "correctly idle" (nothing spawnable in the queue)."""
21212130
crashed: list[str] = field(default_factory=list)
21222131
"""Task ids reclaimed because their worker PID disappeared."""
21232132
auto_blocked: list[str] = field(default_factory=list)
@@ -2459,6 +2468,38 @@ def _clear_spawn_failures(conn: sqlite3.Connection, task_id: str) -> None:
24592468
)
24602469

24612470

2471+
def has_spawnable_ready(conn: sqlite3.Connection) -> bool:
    """Tell whether the ready queue holds work the dispatcher could spawn.

    The answer is True when at least one task that is ready, assigned and
    not claimed has an assignee for which ``profile_exists`` is truthy.
    The gateway- and CLI-embedded dispatchers use this for their health
    telemetry: ``0 spawned`` only counts as "stuck" when such work is
    waiting, and as "correctly idle" when the queue holds nothing but
    control-plane lanes (``orion-cc`` / ``orion-research``) whose
    terminals pull tasks via ``claim_task`` directly.

    When ``profile_exists`` cannot be imported (e.g. partial install) the
    function degrades to the legacy "any ready+assigned task" answer, so
    the warning keeps firing in degraded environments.

    Args:
        conn: Open connection to a board database. Rows must support
            lookup by column name (``row["assignee"]``).

    Returns:
        True if spawnable ready work exists (or cannot be ruled out),
        otherwise False.
    """
    candidates = conn.execute(
        "SELECT DISTINCT assignee FROM tasks "
        "WHERE status = 'ready' AND assignee IS NOT NULL "
        " AND claim_lock IS NULL"
    ).fetchall()
    if not candidates:
        # Nothing ready+assigned+unclaimed at all: trivially idle.
        return False

    try:
        # Deferred to call time so this module avoids an import cycle.
        from hermes_cli.profiles import profile_exists
    except Exception:
        # Profiles cannot be inspected; assume the work is spawnable so
        # the legacy warning behavior is preserved.
        return True

    # Short-circuits on the first assignee backed by a real profile.
    return any(profile_exists(entry["assignee"]) for entry in candidates)
2501+
2502+
24622503
def dispatch_once(
24632504
conn: sqlite3.Connection,
24642505
*,
@@ -2521,7 +2562,13 @@ def dispatch_once(
25212562
except Exception:
25222563
profile_exists = None # type: ignore[assignment]
25232564
if profile_exists is not None and not profile_exists(row["assignee"]):
2524-
result.skipped_unassigned.append(row["id"])
2565+
# Bucket separately from skipped_unassigned: the operator
2566+
# cannot fix this by assigning a profile (the assignee IS the
2567+
# intended owner — a terminal lane). Health telemetry uses
2568+
# this distinction to suppress spurious "stuck" warnings on
2569+
# multi-lane setups where the ready queue is steadily full
2570+
# of human-pulled work.
2571+
result.skipped_nonspawnable.append(row["id"])
25252572
continue
25262573
if dry_run:
25272574
result.spawned.append((row["id"], row["assignee"], ""))

tests/hermes_cli/conftest.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
"""Fixtures shared across hermes_cli kanban tests."""
2+
3+
from __future__ import annotations
4+
5+
import pytest
6+
7+
8+
@pytest.fixture
def all_assignees_spawnable(monkeypatch):
    """Make ``profile_exists`` answer True for every assignee name.

    Dispatcher tests mostly use made-up assignees ("alice", "bob") with
    no profile directory on disk. With the profile-exists guard from
    PR #20105 in place, such tasks would be filed under
    ``skipped_nonspawnable`` rather than spawned, breaking any test that
    asserts spawn behavior. Requesting this fixture disables the guard
    for the duration of the test.
    """
    from hermes_cli import profiles

    def _always_exists(name):
        return True

    monkeypatch.setattr(profiles, "profile_exists", _always_exists)

tests/hermes_cli/test_kanban_core_functionality.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ def test_no_idempotency_key_never_collides(kanban_home):
8080
# Spawn-failure circuit breaker
8181
# ---------------------------------------------------------------------------
8282

83-
def test_spawn_failure_auto_blocks_after_limit(kanban_home):
83+
def test_spawn_failure_auto_blocks_after_limit(kanban_home, all_assignees_spawnable):
8484
"""N consecutive spawn failures on the same task → auto_blocked."""
8585
def _bad_spawn(task, ws):
8686
raise RuntimeError("no PATH")
@@ -109,7 +109,7 @@ def _bad_spawn(task, ws):
109109
conn.close()
110110

111111

112-
def test_successful_spawn_resets_failure_counter(kanban_home):
112+
def test_successful_spawn_resets_failure_counter(kanban_home, all_assignees_spawnable):
113113
"""A successful spawn clears the counter so past failures don't count
114114
against future retries of the same task."""
115115
calls = [0]
@@ -138,7 +138,7 @@ def _flaky_spawn(task, ws):
138138
conn.close()
139139

140140

141-
def test_workspace_resolution_failure_also_counts(kanban_home):
141+
def test_workspace_resolution_failure_also_counts(kanban_home, all_assignees_spawnable):
142142
"""`dir:` workspace with no path should fail workspace resolution AND
143143
count against the failure budget — not just crash the tick."""
144144
conn = kb.connect()
@@ -824,7 +824,7 @@ def test_recompute_ready_emits_promoted_not_ready(kanban_home):
824824
conn.close()
825825

826826

827-
def test_spawn_failure_circuit_breaker_emits_gave_up(kanban_home):
827+
def test_spawn_failure_circuit_breaker_emits_gave_up(kanban_home, all_assignees_spawnable):
828828
def _bad(task, ws):
829829
raise RuntimeError("nope")
830830
conn = kb.connect()
@@ -840,7 +840,7 @@ def _bad(task, ws):
840840
conn.close()
841841

842842

843-
def test_spawned_event_emitted_with_pid(kanban_home):
843+
def test_spawned_event_emitted_with_pid(kanban_home, all_assignees_spawnable):
844844
"""Successful spawn must append a ``spawned`` event with the pid in
845845
the payload so humans tailing events see pid tracking."""
846846
def _spawn_returns_pid(task, ws):
@@ -1154,7 +1154,7 @@ def test_run_on_block_with_reason(kanban_home):
11541154
conn.close()
11551155

11561156

1157-
def test_run_on_spawn_failure_records_failed_runs(kanban_home):
1157+
def test_run_on_spawn_failure_records_failed_runs(kanban_home, all_assignees_spawnable):
11581158
"""Each spawn_failed event closes a run with outcome='spawn_failed',
11591159
and the Nth failure closes a run with outcome='gave_up'."""
11601160
def _bad(task, ws):

tests/hermes_cli/test_kanban_db.py

Lines changed: 51 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -327,7 +327,7 @@ def test_worker_context_includes_parent_results_and_comments(kanban_home):
327327
# Dispatcher
328328
# ---------------------------------------------------------------------------
329329

330-
def test_dispatch_dry_run_does_not_claim(kanban_home):
330+
def test_dispatch_dry_run_does_not_claim(kanban_home, all_assignees_spawnable):
331331
with kb.connect() as conn:
332332
t1 = kb.create_task(conn, title="a", assignee="alice")
333333
t2 = kb.create_task(conn, title="b", assignee="bob")
@@ -344,10 +344,58 @@ def test_dispatch_skips_unassigned(kanban_home):
344344
t = kb.create_task(conn, title="floater")
345345
res = kb.dispatch_once(conn, dry_run=True)
346346
assert t in res.skipped_unassigned
347+
assert t not in res.skipped_nonspawnable
347348
assert not res.spawned
348349

349350

350-
def test_dispatch_promotes_ready_and_spawns(kanban_home):
351+
def test_dispatch_skips_nonspawnable_into_separate_bucket(kanban_home, monkeypatch):
    """A task whose assignee fails ``profile_exists()`` is reported in
    ``skipped_nonspawnable`` and never in the operator-actionable
    ``skipped_unassigned`` bucket, so health telemetry can suppress
    false-positive "stuck" warnings."""
    from hermes_cli import profiles

    def _never_exists(name):
        return False

    monkeypatch.setattr(profiles, "profile_exists", _never_exists)
    with kb.connect() as conn:
        task_id = kb.create_task(conn, title="for-terminal", assignee="orion-cc")
        outcome = kb.dispatch_once(conn, dry_run=True)
    assert task_id in outcome.skipped_nonspawnable
    assert task_id not in outcome.skipped_unassigned
    assert not outcome.spawned
364+
365+
366+
def test_has_spawnable_ready_false_when_only_terminal_lanes(kanban_home, monkeypatch):
    """With every ready task assigned to a control-plane lane,
    ``has_spawnable_ready`` must be False; the gateway/CLI dispatchers
    rely on this to silence the stuck-warn while terminals still have
    queued work."""
    from hermes_cli import profiles

    def _never_exists(name):
        return False

    monkeypatch.setattr(profiles, "profile_exists", _never_exists)
    with kb.connect() as conn:
        for title, lane in (("t1", "orion-cc"), ("t2", "orion-research")):
            kb.create_task(conn, title=title, assignee=lane)
        assert kb.has_spawnable_ready(conn) is False
376+
377+
378+
def test_has_spawnable_ready_true_when_real_profile_present(kanban_home, monkeypatch):
    """One ready task owned by a real Hermes profile is enough for
    ``has_spawnable_ready`` to be True, which keeps the genuine "stuck"
    signal alive when a daily/agent task is queued next to terminal
    work."""
    from hermes_cli import profiles

    def _only_daily_exists(name):
        return name == "daily"

    monkeypatch.setattr(profiles, "profile_exists", _only_daily_exists)
    with kb.connect() as conn:
        kb.create_task(conn, title="terminal-task", assignee="orion-cc")
        kb.create_task(conn, title="hermes-task", assignee="daily")
        assert kb.has_spawnable_ready(conn) is True
390+
391+
392+
def test_has_spawnable_ready_false_on_empty_queue(kanban_home):
    """A board with no tasks at all is the trivial False case."""
    with kb.connect() as conn:
        probe = kb.has_spawnable_ready(conn)
        assert probe is False
396+
397+
398+
def test_dispatch_promotes_ready_and_spawns(kanban_home, all_assignees_spawnable):
351399
spawns = []
352400

353401
def fake_spawn(task, workspace):
@@ -368,7 +416,7 @@ def fake_spawn(task, workspace):
368416
assert kb.get_task(conn, c).status == "running"
369417

370418

371-
def test_dispatch_spawn_failure_releases_claim(kanban_home):
419+
def test_dispatch_spawn_failure_releases_claim(kanban_home, all_assignees_spawnable):
372420
def boom(task, workspace):
373421
raise RuntimeError("spawn failed")
374422

0 commit comments

Comments
 (0)