Skip to content

Commit 4cc0dff

Browse files
authored
fix(update): bypass systemd RestartSec after graceful drain (NousResearch#22101)
After a clean SIGUSR1 drain, cmd_update passively polled for systemd's auto-restart to fire. Our unit file sets RestartSec=60 (a crash-loop guard), so the voluntary-restart path waited a full minute of dead air before the gateway came back — the user saw 'draining (up to 75s)...' and stared at it. Change: after the drain exits with code 75, call 'reset-failed' + 'start' explicitly. Manual start bypasses RestartSec entirely (RestartSec only governs systemd's own auto-restart logic). Takes about as long as the gateway needs to come up (~1-3s on a warm box) instead of ~60s. The RestartSec=60 default stays — it's the right crash-loop guard for actual crashes. This only short-circuits the voluntary-restart path. Matches the pattern already used in 'hermes gateway restart' (systemd_restart() in hermes_cli/gateway.py, PR NousResearch#20949). Tests: - tests/hermes_cli/test_update_gateway_restart.py: new test_update_bypasses_restartsec_after_graceful_drain asserts both 'reset-failed hermes-gateway' AND 'start hermes-gateway' (NOT 'restart') are issued after a successful graceful drain. - All existing tests in the affected classes still pass (TestCmdUpdateLaunchdRestart, TestCmdUpdateResetFailedBeforeRestart are green; one pre-existing flake in the latter is unrelated).
1 parent 08f6345 commit 4cc0dff

2 files changed

Lines changed: 121 additions & 8 deletions

File tree

hermes_cli/main.py

Lines changed: 50 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7756,14 +7756,56 @@ def _service_restart_sec(
77567756
)
77577757

77587758
if _graceful_ok:
7759-
# Gateway exited 75; systemd should relaunch
7760-
# via Restart=on-failure. The unit's
7761-
# RestartSec (default 30s on ours) gates the
7762-
# respawn — poll past that + slack so we
7763-
# don't give up mid-cooldown and falsely
7764-
# print "drained but didn't relaunch". For
7765-
# units without RestartSec set we fall back
7766-
# to the original 10s budget.
7759+
# Gateway exited 75. ``Restart=always`` +
7760+
# ``RestartForceExitStatus=75`` means systemd
7761+
# WILL respawn the unit — but only after
7762+
# ``RestartSec`` (default 60s on our unit
7763+
# file). That 60s wait is a crash-loop guard,
7764+
# and is the right default when the gateway
7765+
# dies unexpectedly. For a voluntary restart
7766+
# on update, it's dead time the user watches.
7767+
#
7768+
# Shortcut it: ``reset-failed`` + ``start``
7769+
# skips RestartSec entirely (we're manually
7770+
# initiating the unit, not waiting for
7771+
# systemd's auto-restart logic). Takes about
7772+
# as long as the process takes to come up
7773+
# (~1-3s on a warm box).
7774+
#
7775+
# If the unit is already active because
7776+
# RestartSec elapsed while we were draining,
7777+
# ``start`` is a no-op and we fall through to
7778+
# the poll below. Either way we collapse the
7779+
# 60s+ delay to a ~5s one.
7780+
subprocess.run(
7781+
scope_cmd + ["reset-failed", svc_name],
7782+
capture_output=True,
7783+
text=True,
7784+
timeout=10,
7785+
)
7786+
subprocess.run(
7787+
scope_cmd + ["start", svc_name],
7788+
capture_output=True,
7789+
text=True,
7790+
timeout=15,
7791+
)
7792+
# Short poll: the gateway should be up within
7793+
# a few seconds now that we bypassed
7794+
# RestartSec. Fall back to the longer
7795+
# RestartSec + slack budget ONLY if the
7796+
# explicit start failed and we need to rely
7797+
# on systemd's auto-restart.
7798+
if _wait_for_service_active(
7799+
scope_cmd,
7800+
svc_name,
7801+
timeout=10.0,
7802+
):
7803+
restarted_services.append(svc_name)
7804+
continue
7805+
# Explicit start didn't take. Fall back to
7806+
# the original passive poll (systemd's
7807+
# auto-restart WILL fire after RestartSec
7808+
# regardless).
77677809
_restart_sec = _service_restart_sec(
77687810
scope_cmd,
77697811
svc_name,

tests/hermes_cli/test_update_gateway_restart.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -653,6 +653,77 @@ def wrapped(cmd, **kwargs):
653653
"Drain path failed; expected fallback `systemctl restart`."
654654
)
655655

656+
@patch("shutil.which", return_value=None)
657+
@patch("subprocess.run")
658+
def test_update_bypasses_restartsec_after_graceful_drain(
659+
self, mock_run, _mock_which, mock_args, capsys, monkeypatch,
660+
):
661+
"""After a graceful SIGUSR1 drain, cmd_update must issue
662+
``reset-failed`` + ``start`` to bypass the unit's ``RestartSec``
663+
cooldown (default 60s on our unit file) rather than passively
664+
waiting for systemd's auto-restart. Collapses the post-drain delay
665+
from ~60s to ~5s on a voluntary restart.
666+
"""
667+
monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
668+
monkeypatch.setattr(gateway_cli, "supports_systemd_services", lambda: True)
669+
monkeypatch.setattr(gateway_cli, "is_termux", lambda: False)
670+
671+
def side_effect(cmd, **kwargs):
672+
joined = " ".join(str(c) for c in cmd)
673+
if "rev-parse" in joined and "--abbrev-ref" in joined:
674+
return subprocess.CompletedProcess(cmd, 0, stdout="main\n", stderr="")
675+
if "rev-parse" in joined and "--verify" in joined:
676+
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
677+
if "rev-list" in joined:
678+
return subprocess.CompletedProcess(cmd, 0, stdout="3\n", stderr="")
679+
if "systemctl" in joined and "list-units" in joined:
680+
if "--user" in joined:
681+
return subprocess.CompletedProcess(
682+
cmd, 0,
683+
stdout="hermes-gateway.service loaded active running\n",
684+
stderr="",
685+
)
686+
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
687+
if "systemctl" in joined and "is-active" in joined:
688+
return subprocess.CompletedProcess(cmd, 0, stdout="active\n", stderr="")
689+
if "systemctl" in joined and "show" in joined and "MainPID" in joined:
690+
return subprocess.CompletedProcess(cmd, 0, stdout="4242\n", stderr="")
691+
return subprocess.CompletedProcess(cmd, 0, stdout="", stderr="")
692+
693+
mock_run.side_effect = side_effect
694+
695+
# Simulate a successful graceful drain so cmd_update reaches the
696+
# post-drain restart bypass.
697+
monkeypatch.setattr(
698+
"hermes_cli.gateway._graceful_restart_via_sigusr1",
699+
lambda pid, drain_timeout: True,
700+
)
701+
702+
with patch.object(gateway_cli, "find_gateway_pids", return_value=[]):
703+
cmd_update(mock_args)
704+
705+
calls = [
706+
" ".join(str(a) for a in c.args[0])
707+
for c in mock_run.call_args_list
708+
if "systemctl" in " ".join(str(a) for a in c.args[0])
709+
]
710+
711+
# Must have called ``reset-failed hermes-gateway`` AND ``start
712+
# hermes-gateway`` explicitly so systemd bypasses RestartSec.
713+
reset_calls = [c for c in calls if "reset-failed" in c and "hermes-gateway" in c]
714+
start_calls = [
715+
c for c in calls
716+
if "start" in c and "hermes-gateway" in c and "restart" not in c
717+
]
718+
assert reset_calls, (
719+
f"Expected explicit `reset-failed hermes-gateway` after graceful drain; "
720+
f"systemctl calls were: {calls}"
721+
)
722+
assert start_calls, (
723+
f"Expected explicit `start hermes-gateway` after graceful drain to "
724+
f"bypass RestartSec; systemctl calls were: {calls}"
725+
)
726+
656727
@patch("shutil.which", return_value=None)
657728
@patch("subprocess.run")
658729
def test_update_no_gateway_running_skips_restart(

0 commit comments

Comments
 (0)