Skip to content

Commit b632290

Browse files
helix4uteknium1
authored andcommitted
fix(gateway): handle planned service stops
1 parent 20428f5 commit b632290

5 files changed

Lines changed: 282 additions & 68 deletions

File tree

gateway/run.py

Lines changed: 29 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15003,15 +15003,14 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool =
1500315003

1500415004
runner = GatewayRunner(config)
1500515005

15006-
# Track whether a signal initiated the shutdown (vs. internal request).
15007-
# When an unexpected SIGTERM kills the gateway, we exit non-zero so
15008-
# systemd's Restart=on-failure revives the process. systemctl stop
15009-
# is safe: systemd tracks stop-requested state independently of exit
15010-
# code, so Restart= never fires for a deliberate stop.
15006+
# Track whether an unexpected signal initiated the shutdown. When an
15007+
# unexpected SIGTERM kills the gateway, we exit non-zero so service
15008+
# managers can revive the process. Planned stop paths write a marker
15009+
# before signalling us so they can exit cleanly instead.
1501115010
_signal_initiated_shutdown = False
1501215011

1501315012
# Set up signal handlers
15014-
def shutdown_signal_handler():
15013+
def shutdown_signal_handler(received_signal=None):
1501515014
nonlocal _signal_initiated_shutdown
1501615015
# Planned --replace takeover check: when a sibling gateway is
1501715016
# taking over via --replace, it wrote a marker naming this PID
@@ -15027,10 +15026,28 @@ def shutdown_signal_handler():
1502715026
except Exception as e:
1502815027
logger.debug("Takeover marker check failed: %s", e)
1502915028

15029+
# Planned stop check: service managers and `hermes gateway stop`
15030+
# also send SIGTERM, which is indistinguishable from an unexpected
15031+
# external kill unless the CLI marks it first. SIGINT comes from an
15032+
# interactive Ctrl+C and is likewise an intentional foreground stop.
15033+
planned_stop = False
15034+
if received_signal == signal.SIGINT:
15035+
planned_stop = True
15036+
elif not planned_takeover:
15037+
try:
15038+
from gateway.status import consume_planned_stop_marker_for_self
15039+
planned_stop = consume_planned_stop_marker_for_self()
15040+
except Exception as e:
15041+
logger.debug("Planned stop marker check failed: %s", e)
15042+
1503015043
if planned_takeover:
1503115044
logger.info(
1503215045
"Received SIGTERM as a planned --replace takeover — exiting cleanly"
1503315046
)
15047+
elif planned_stop:
15048+
logger.info(
15049+
"Received SIGTERM/SIGINT as a planned gateway stop — exiting cleanly"
15050+
)
1503415051
else:
1503515052
_signal_initiated_shutdown = True
1503615053
logger.info("Received SIGTERM/SIGINT — initiating shutdown")
@@ -15066,7 +15083,7 @@ def restart_signal_handler():
1506615083
if threading.current_thread() is threading.main_thread():
1506715084
for sig in (signal.SIGINT, signal.SIGTERM):
1506815085
try:
15069-
loop.add_signal_handler(sig, shutdown_signal_handler)
15086+
loop.add_signal_handler(sig, shutdown_signal_handler, sig)
1507015087
except NotImplementedError:
1507115088
pass
1507215089
if hasattr(signal, "SIGUSR1"):
@@ -15164,14 +15181,14 @@ def restart_signal_handler():
1516415181
if runner.exit_code is not None:
1516515182
raise SystemExit(runner.exit_code)
1516615183

15167-
# When a signal (SIGTERM/SIGINT) caused the shutdown and it wasn't a
15168-
# planned restart (/restart, /update, SIGUSR1), exit non-zero so
15169-
# systemd's Restart=on-failure revives the process. This covers:
15184+
# When an unexpected SIGTERM caused the shutdown and it wasn't a planned
15185+
# restart (/restart, /update, SIGUSR1), exit non-zero so systemd's
15186+
# Restart=on-failure revives the process. This covers:
1517015187
# - hermes update killing the gateway mid-work
1517115188
# - External kill commands
1517215189
# - WSL2/container runtime sending unexpected signals
15173-
# systemctl stop is safe: systemd tracks "stop requested" state
15174-
# independently of exit code, so Restart= never fires for it.
15190+
# `hermes gateway stop` and interactive Ctrl+C are handled above as
15191+
# planned stops and should not trigger service-manager revival.
1517515192
if _signal_initiated_shutdown and not runner._restart_requested:
1517615193
logger.info(
1517715194
"Exiting with code 1 (signal-initiated shutdown without restart "

gateway/status.py

Lines changed: 103 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -637,6 +637,8 @@ def release_all_scoped_locks(
637637

638638
_TAKEOVER_MARKER_FILENAME = ".gateway-takeover.json"
639639
_TAKEOVER_MARKER_TTL_S = 60 # Marker older than this is treated as stale
640+
_PLANNED_STOP_MARKER_FILENAME = ".gateway-planned-stop.json"
641+
_PLANNED_STOP_MARKER_TTL_S = 60
640642

641643

642644
def _get_takeover_marker_path() -> Path:
@@ -645,6 +647,67 @@ def _get_takeover_marker_path() -> Path:
645647
return home / _TAKEOVER_MARKER_FILENAME
646648

647649

650+
def _get_planned_stop_marker_path() -> Path:
651+
"""Return the path to the intentional gateway stop marker file."""
652+
home = get_hermes_home()
653+
return home / _PLANNED_STOP_MARKER_FILENAME
654+
655+
656+
def _marker_is_stale(written_at: str, ttl_s: int) -> bool:
657+
try:
658+
written_dt = datetime.fromisoformat(written_at)
659+
age = (datetime.now(timezone.utc) - written_dt).total_seconds()
660+
return age > ttl_s
661+
except (TypeError, ValueError):
662+
return True
663+
664+
665+
def _consume_pid_marker_for_self(
666+
path: Path,
667+
*,
668+
pid_field: str,
669+
start_time_field: str,
670+
ttl_s: int,
671+
) -> bool:
672+
record = _read_json_file(path)
673+
if not record:
674+
return False
675+
676+
try:
677+
target_pid = int(record[pid_field])
678+
target_start_time = record.get(start_time_field)
679+
written_at = record.get("written_at") or ""
680+
except (KeyError, TypeError, ValueError):
681+
try:
682+
path.unlink(missing_ok=True)
683+
except OSError:
684+
pass
685+
return False
686+
687+
if _marker_is_stale(written_at, ttl_s):
688+
try:
689+
path.unlink(missing_ok=True)
690+
except OSError:
691+
pass
692+
return False
693+
694+
our_pid = os.getpid()
695+
our_start_time = _get_process_start_time(our_pid)
696+
matches = (
697+
target_pid == our_pid
698+
and target_start_time is not None
699+
and our_start_time is not None
700+
and target_start_time == our_start_time
701+
)
702+
703+
try:
704+
path.unlink(missing_ok=True)
705+
except OSError:
706+
pass
707+
708+
return matches
709+
710+
648711
def write_takeover_marker(target_pid: int) -> bool:
649712
"""Record that ``target_pid`` is being replaced by the current process.
650713
@@ -681,64 +744,57 @@ def consume_takeover_marker_for_self() -> bool:
681744
Always unlinks the marker on match (and on detected staleness) so
682745
subsequent unrelated signals don't re-trigger.
683746
"""
684-
path = _get_takeover_marker_path()
685-
record = _read_json_file(path)
686-
if not record:
687-
return False
747+
return _consume_pid_marker_for_self(
748+
_get_takeover_marker_path(),
749+
pid_field="target_pid",
750+
start_time_field="target_start_time",
751+
ttl_s=_TAKEOVER_MARKER_TTL_S,
752+
)
688753

689-
# Any malformed or stale marker → drop it and return False
690-
try:
691-
target_pid = int(record["target_pid"])
692-
target_start_time = record.get("target_start_time")
693-
written_at = record.get("written_at") or ""
694-
except (KeyError, TypeError, ValueError):
695-
try:
696-
path.unlink(missing_ok=True)
697-
except OSError:
698-
pass
699-
return False
700754

701-
# TTL guard: a stale marker older than _TAKEOVER_MARKER_TTL_S is ignored.
702-
stale = False
755+
def clear_takeover_marker() -> None:
756+
"""Remove the takeover marker unconditionally. Safe to call repeatedly."""
703757
try:
704-
written_dt = datetime.fromisoformat(written_at)
705-
age = (datetime.now(timezone.utc) - written_dt).total_seconds()
706-
if age > _TAKEOVER_MARKER_TTL_S:
707-
stale = True
708-
except (TypeError, ValueError):
709-
stale = True # Unparseable timestamp — treat as stale
758+
_get_takeover_marker_path().unlink(missing_ok=True)
759+
except OSError:
760+
pass
710761

711-
if stale:
712-
try:
713-
path.unlink(missing_ok=True)
714-
except OSError:
715-
pass
716-
return False
717762

718-
# Does the marker name THIS process?
719-
our_pid = os.getpid()
720-
our_start_time = _get_process_start_time(our_pid)
721-
matches = (
722-
target_pid == our_pid
723-
and target_start_time is not None
724-
and our_start_time is not None
725-
and target_start_time == our_start_time
726-
)
763+
def write_planned_stop_marker(target_pid: int) -> bool:
764+
"""Record that ``target_pid`` is being stopped intentionally.
727765
728-
# Consume the marker whether it matched or not — a marker that doesn't
729-
# match our identity is stale-for-us anyway.
766+
The gateway exits non-zero for unexpected SIGTERM so service managers can
767+
revive it. Service stop commands send the same SIGTERM, so the CLI writes
768+
this short-lived marker first to let the target process exit cleanly.
769+
"""
730770
try:
731-
path.unlink(missing_ok=True)
732-
except OSError:
733-
pass
771+
target_start_time = _get_process_start_time(target_pid)
772+
record = {
773+
"target_pid": target_pid,
774+
"target_start_time": target_start_time,
775+
"stopper_pid": os.getpid(),
776+
"written_at": _utc_now_iso(),
777+
}
778+
_write_json_file(_get_planned_stop_marker_path(), record)
779+
return True
780+
except (OSError, PermissionError):
781+
return False
734782

735-
return matches
736783

784+
def consume_planned_stop_marker_for_self() -> bool:
785+
"""Return True when the current process is being intentionally stopped."""
786+
return _consume_pid_marker_for_self(
787+
_get_planned_stop_marker_path(),
788+
pid_field="target_pid",
789+
start_time_field="target_start_time",
790+
ttl_s=_PLANNED_STOP_MARKER_TTL_S,
791+
)
737792

738-
def clear_takeover_marker() -> None:
739-
"""Remove the takeover marker unconditionally. Safe to call repeatedly."""
793+
794+
def clear_planned_stop_marker() -> None:
795+
"""Remove the planned-stop marker unconditionally."""
740796
try:
741-
_get_takeover_marker_path().unlink(missing_ok=True)
797+
_get_planned_stop_marker_path().unlink(missing_ok=True)
742798
except OSError:
743799
pass
744800

hermes_cli/gateway.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -785,6 +785,12 @@ def stop_profile_gateway() -> bool:
785785
if pid is None:
786786
return False
787787

788+
try:
789+
from gateway.status import write_planned_stop_marker
790+
write_planned_stop_marker(pid)
791+
except Exception:
792+
pass
793+
788794
try:
789795
os.kill(pid, signal.SIGTERM)
790796
except ProcessLookupError:
@@ -2043,6 +2049,13 @@ def systemd_stop(system: bool = False):
20432049
if system:
20442050
_require_root_for_system_service("stop")
20452051
_require_service_installed("stop", system=system)
2052+
try:
2053+
from gateway.status import get_running_pid, write_planned_stop_marker
2054+
pid = get_running_pid(cleanup_stale=False)
2055+
if pid is not None:
2056+
write_planned_stop_marker(pid)
2057+
except Exception:
2058+
pass
20462059
_run_systemctl(["stop", get_service_name()], system=system, check=True, timeout=90)
20472060
print(f"✓ {_service_scope_label(system).capitalize()} service stopped")
20482061

@@ -2404,6 +2417,13 @@ def launchd_start():
24042417
def launchd_stop():
24052418
label = get_launchd_label()
24062419
target = f"{_launchd_domain()}/{label}"
2420+
try:
2421+
from gateway.status import get_running_pid, write_planned_stop_marker
2422+
pid = get_running_pid(cleanup_stale=False)
2423+
if pid is not None:
2424+
write_planned_stop_marker(pid)
2425+
except Exception:
2426+
pass
24072427
# bootout unloads the service definition so KeepAlive doesn't respawn
24082428
# the process. A plain `kill SIGTERM` only signals the process — launchd
24092429
# immediately restarts it because KeepAlive.SuccessfulExit = false.

tests/gateway/test_status.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -702,3 +702,88 @@ def test_consume_ignores_marker_for_different_process_and_prevents_stale_grief(
702702

703703
# We are not the target — must NOT consume as planned
704704
assert result is False
705+
706+
707+
class TestPlannedStopMarker:
708+
"""Tests for intentional service/manual gateway stop markers."""
709+
710+
def test_write_marker_records_target_identity(self, tmp_path, monkeypatch):
711+
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
712+
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 42)
713+
714+
ok = status.write_planned_stop_marker(target_pid=12345)
715+
716+
assert ok is True
717+
marker = tmp_path / ".gateway-planned-stop.json"
718+
assert marker.exists()
719+
payload = json.loads(marker.read_text())
720+
assert payload["target_pid"] == 12345
721+
assert payload["target_start_time"] == 42
722+
assert payload["stopper_pid"] == os.getpid()
723+
assert "written_at" in payload
724+
725+
def test_consume_returns_true_when_marker_names_self(self, tmp_path, monkeypatch):
726+
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
727+
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 100)
728+
ok = status.write_planned_stop_marker(target_pid=os.getpid())
729+
assert ok is True
730+
731+
result = status.consume_planned_stop_marker_for_self()
732+
733+
assert result is True
734+
assert not (tmp_path / ".gateway-planned-stop.json").exists()
735+
736+
def test_consume_returns_false_for_different_pid(self, tmp_path, monkeypatch):
737+
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
738+
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 100)
739+
ok = status.write_planned_stop_marker(target_pid=os.getpid() + 9999)
740+
assert ok is True
741+
742+
result = status.consume_planned_stop_marker_for_self()
743+
744+
assert result is False
745+
assert not (tmp_path / ".gateway-planned-stop.json").exists()
746+
747+
def test_consume_returns_false_for_stale_marker(self, tmp_path, monkeypatch):
748+
from datetime import datetime, timezone, timedelta
749+
750+
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
751+
marker_path = tmp_path / ".gateway-planned-stop.json"
752+
stale_time = (datetime.now(timezone.utc) - timedelta(minutes=2)).isoformat()
753+
marker_path.write_text(json.dumps({
754+
"target_pid": os.getpid(),
755+
"target_start_time": 123,
756+
"stopper_pid": 99999,
757+
"written_at": stale_time,
758+
}))
759+
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 123)
760+
761+
result = status.consume_planned_stop_marker_for_self()
762+
763+
assert result is False
764+
assert not marker_path.exists()
765+
766+
def test_clear_planned_stop_marker_is_idempotent(self, tmp_path, monkeypatch):
767+
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
768+
monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 100)
769+
770+
status.clear_planned_stop_marker()
771+
status.write_planned_stop_marker(target_pid=12345)
772+
assert (tmp_path / ".gateway-planned-stop.json").exists()
773+
774+
status.clear_planned_stop_marker()
775+
776+
assert not (tmp_path / ".gateway-planned-stop.json").exists()
777+
status.clear_planned_stop_marker()
778+
779+
def test_write_marker_returns_false_on_write_failure(self, tmp_path, monkeypatch):
780+
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
781+
782+
def raise_oserror(*args, **kwargs):
783+
raise OSError("simulated write failure")
784+
785+
monkeypatch.setattr(status, "_write_json_file", raise_oserror)
786+
787+
ok = status.write_planned_stop_marker(target_pid=12345)
788+
789+
assert ok is False

0 commit comments

Comments
 (0)