Skip to content

Commit d675ef6

Browse files
teknium1Chris Han
authored andcommitted
fix(gateway): use git HEAD SHA, not file mtimes, for stale-code check (NousResearch#19740)
The stale-code self-check (Issue NousResearch#17648) used sentinel-file mtimes to decide whether the gateway survived a `hermes update` with stale `sys.modules`. That signal false-positives on any write to the sentinel files — including agent-driven edits during Hermes-on-Hermes dev sessions. Telling the agent to patch `run_agent.py` would flip the check to True on the next user message and force a gateway restart even though no update happened. Switch the signal to `git rev-parse HEAD`. Agent file edits don't move HEAD; `hermes update` (git pull) always does. Reading .git/HEAD directly (no subprocess) with a 5s cache keeps the overhead negligible on bursty chats. Non-git installs short-circuit to False — the stale-modules class can't occur without a git-backed update path, so there's nothing to detect. The legacy `_compute_repo_mtime` helper is kept but unused by detection, reserved as a fallback hook for future pip-install update paths. - _read_git_head_sha(): resolves HEAD across main checkout, worktree (follows `gitdir:` + `commondir` pointers), and packed-refs layouts. - _current_git_sha_cached(): per-runner 5s SHA cache. - _detect_stale_code(): boot SHA vs current SHA, returns False when either is unavailable. - Tests cover all four layouts, the agent-edits-don't-trigger regression, and cache behavior. Refs NousResearch#17648.
1 parent 63d55f9 commit d675ef6

2 files changed

Lines changed: 476 additions & 133 deletions

File tree

gateway/run.py

Lines changed: 179 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -101,10 +101,21 @@ def _replace(match: re.Match[str]) -> str:
101101
# already-loaded stale module object and raises ``ImportError`` — see
102102
# Issue #17648. Rather than papering over the import failure site-by-site
103103
# in every tool file, detect the stale state centrally and auto-restart
104-
# so the gateway reloads with fresh code. The sentinel files below are
105-
# the canonical repo-level markers that every update touches; if any is
106-
# newer than the gateway's boot time, we know the running process is out
107-
# of date.
104+
# so the gateway reloads with fresh code.
105+
#
106+
# The signal we use is ``git rev-parse HEAD`` — the only thing ``hermes
107+
# update`` moves that is NOT moved by agent-driven file edits. Earlier
108+
# revisions of this check compared file mtimes across a sentinel set
109+
# (run_agent.py, gateway/run.py, ...), but that produced false positives
110+
# whenever the agent edited its own source files during a session:
111+
# mtime jumps, stale-check fires, gateway restarts, user must retype.
112+
# See the conversation at PR #<this> for the motivating incident.
113+
#
114+
# The legacy mtime sentinels are kept ONLY as a last-resort fallback for
115+
# non-git installs (pip install from wheel, sparse clones with no .git
116+
# dir). In those environments ``hermes update`` is not a supported path,
117+
# so the check effectively no-ops — which is the safe behavior: better
118+
# to ship one broken import than to restart on every agent-edit.
108119
_STALE_CODE_SENTINELS: tuple[str, ...] = (
109120
"hermes_cli/config.py",
110121
"hermes_cli/__init__.py",
@@ -113,10 +124,106 @@ def _replace(match: re.Match[str]) -> str:
113124
"pyproject.toml",
114125
)
115126

127+
# Cache git HEAD reads across consecutive messages so a chat burst doesn't
128+
# spawn one subprocess per message. 5s is long enough to collapse a burst
129+
# and short enough that the real post-update detection still fires within
130+
# the user's perceived "next message" window.
131+
_GIT_SHA_CACHE_TTL_SECS = 5.0
132+
133+
134+
def _read_git_head_sha(repo_root: Path) -> Optional[str]:
135+
"""Return the git HEAD SHA for ``repo_root``, or None if unavailable.
136+
137+
Reads ``.git/HEAD`` directly (and follows one level of ref) instead
138+
of shelling out to ``git`` — cheaper, no subprocess tax, works on
139+
gateway hosts that don't have a ``git`` binary on PATH. Returns
140+
None for non-git installs (no ``.git`` dir) or any I/O error; callers
141+
treat None as "can't tell" and skip the check.
142+
143+
Supports the three layouts we care about:
144+
1. Main checkout: ``<repo>/.git/`` is a directory.
145+
2. Git worktree: ``<repo>/.git`` is a file ``gitdir: <path>`` that
146+
points at ``<main>/.git/worktrees/<name>/``. The worktree's
147+
gitdir has HEAD + index but NOT refs/heads/ — those live in
148+
the main checkout, and ``<worktree-gitdir>/commondir`` points
149+
at the main ``.git``. We search both locations for refs.
150+
3. Packed refs: ``refs/heads/<branch>`` is absent on disk but
151+
listed in ``<main-git-dir>/packed-refs``.
152+
"""
153+
try:
154+
git_dir = repo_root / ".git"
155+
# Worktrees store ``.git`` as a file pointing at gitdir: <path>
156+
if git_dir.is_file():
157+
try:
158+
content = git_dir.read_text().strip()
159+
if content.startswith("gitdir:"):
160+
git_dir = Path(content.split(":", 1)[1].strip())
161+
if not git_dir.is_absolute():
162+
git_dir = (repo_root / git_dir).resolve()
163+
except OSError:
164+
return None
165+
if not git_dir.is_dir():
166+
return None
167+
168+
# Figure out the "common" git dir — the one that owns shared refs.
169+
# For a worktree, commondir points at it (relative path, resolve
170+
# against git_dir). For a main checkout, common_dir == git_dir.
171+
common_dir = git_dir
172+
commondir_file = git_dir / "commondir"
173+
if commondir_file.is_file():
174+
try:
175+
rel = commondir_file.read_text().strip()
176+
candidate = (git_dir / rel).resolve() if rel else git_dir
177+
if candidate.is_dir():
178+
common_dir = candidate
179+
except OSError:
180+
pass
181+
182+
head_path = git_dir / "HEAD"
183+
if not head_path.is_file():
184+
return None
185+
head_content = head_path.read_text().strip()
186+
187+
if head_content.startswith("ref:"):
188+
# Symbolic ref — follow one level (e.g. ref: refs/heads/main).
189+
# Worktree-local refs (bisect, rebase-merge state) live under
190+
# git_dir; shared refs (refs/heads/*, refs/tags/*) live under
191+
# common_dir. Try git_dir first, then common_dir.
192+
ref_rel = head_content.split(":", 1)[1].strip()
193+
for base in (git_dir, common_dir) if git_dir != common_dir else (git_dir,):
194+
ref_path = base / ref_rel
195+
if ref_path.is_file():
196+
try:
197+
sha = ref_path.read_text().strip()
198+
except OSError:
199+
continue
200+
if sha:
201+
return sha
202+
# Packed refs fallback — always stored in the common dir.
203+
packed = common_dir / "packed-refs"
204+
if packed.is_file():
205+
try:
206+
for line in packed.read_text().splitlines():
207+
line = line.strip()
208+
if not line or line.startswith("#") or line.startswith("^"):
209+
continue
210+
parts = line.split(None, 1)
211+
if len(parts) == 2 and parts[1] == ref_rel:
212+
return parts[0] or None
213+
except OSError:
214+
return None
215+
return None
216+
217+
# Detached HEAD — content is the SHA directly.
218+
return head_content or None
219+
except Exception:
220+
return None
221+
116222

117223
def _compute_repo_mtime(repo_root: Path) -> float:
118224
"""Return the newest mtime across the stale-code sentinel files.
119225

226+
Legacy fallback used only for non-git installs (``.git`` missing).
120227
Missing files are ignored (they may not exist on older checkouts).
121228
Returns 0.0 if no sentinel file is readable — treat that as "can't
122229
tell", which downstream callers interpret as "not stale" to avoid
@@ -1005,6 +1112,7 @@ class GatewayRunner:
10051112
# running __init__ don't crash when _handle_message reads these.
10061113
_boot_wall_time: float = 0.0
10071114
_boot_repo_mtime: float = 0.0
1115+
_boot_git_sha: Optional[str] = None
10081116
_stale_code_restart_triggered: bool = False
10091117

10101118
def __init__(self, config: Optional[GatewayConfig] = None):
@@ -1020,15 +1128,23 @@ def __init__(self, config: Optional[GatewayConfig] = None):
10201128
try:
10211129
self._boot_wall_time: float = time.time()
10221130
self._repo_root_for_staleness: Path = Path(__file__).resolve().parent.parent
1131+
self._boot_git_sha: Optional[str] = _read_git_head_sha(
1132+
self._repo_root_for_staleness,
1133+
)
10231134
self._boot_repo_mtime: float = _compute_repo_mtime(
10241135
self._repo_root_for_staleness,
10251136
)
10261137
except Exception:
10271138
self._boot_wall_time = 0.0
10281139
self._repo_root_for_staleness = Path(".")
1140+
self._boot_git_sha = None
10291141
self._boot_repo_mtime = 0.0
10301142
self._stale_code_notified: set[str] = set()
10311143
self._stale_code_restart_triggered: bool = False
1144+
# Cached current-SHA read, refreshed at most every
1145+
# _GIT_SHA_CACHE_TTL_SECS so bursty chats don't hammer the filesystem.
1146+
self._cached_current_sha: Optional[str] = self._boot_git_sha
1147+
self._cached_current_sha_at: float = self._boot_wall_time
10321148

10331149
# Load ephemeral config from config.yaml / env vars.
10341150
# Both are injected at API-call time only and never persisted.
@@ -2737,36 +2853,69 @@ async def _run_restart() -> None:
27372853
task.add_done_callback(self._background_tasks.discard)
27382854
return True
27392855

2856+
def _current_git_sha_cached(self) -> Optional[str]:
2857+
"""Return the current HEAD SHA, cached for _GIT_SHA_CACHE_TTL_SECS.
2858+
2859+
A bursty chat (user mashes "hello?" three times) would otherwise
2860+
re-read ``.git/HEAD`` on every message. Caching collapses that
2861+
into a single read and still re-checks within the user's
2862+
perceived "next message" window.
2863+
"""
2864+
now = time.time()
2865+
if (
2866+
self._cached_current_sha is not None
2867+
and (now - self._cached_current_sha_at) < _GIT_SHA_CACHE_TTL_SECS
2868+
):
2869+
return self._cached_current_sha
2870+
try:
2871+
sha = _read_git_head_sha(self._repo_root_for_staleness)
2872+
except Exception:
2873+
sha = None
2874+
self._cached_current_sha = sha
2875+
self._cached_current_sha_at = now
2876+
return sha
2877+
27402878
def _detect_stale_code(self) -> bool:
2741-
"""Return True if source files on disk are newer than the running process.
2879+
"""Return True if the git HEAD moved since this process booted.
27422880

27432881
A gateway that survives ``hermes update`` (manual SIGTERM never
27442882
escalated, systemd restart race, detached-process respawn failed,
27452883
etc.) keeps pre-update modules cached in ``sys.modules``. Later
27462884
imports of names added post-update — e.g. ``cfg_get`` from PR
27472885
#17304 — raise ImportError against the stale module object (see
2748-
Issue #17648). Detecting this at the source — "the code on disk
2749-
is newer than me" — lets us auto-restart instead of serving
2750-
broken responses until the user notices and runs
2751-
``hermes gateway restart`` manually.
2752-
2753-
Returns False when the boot-time snapshot is unavailable or no
2754-
sentinel file is readable, to avoid false-positive restart loops
2755-
in unusual checkouts (sparse clones, read-only filesystems).
2886+
Issue #17648).
2887+
2888+
We compare the git HEAD SHA at boot to the current SHA on disk.
2889+
``hermes update`` always moves HEAD forward via ``git pull``;
2890+
agent file edits (the agent patching ``run_agent.py`` or
2891+
``gateway/run.py`` during a self-dev session) never move HEAD.
2892+
That makes SHA comparison free of the false-positive class that
2893+
the old mtime check suffered from — the agent can edit any file
2894+
without triggering a phantom restart.
2895+
2896+
Returns False when:
2897+
- the boot SHA is unavailable (non-git install, first call
2898+
during partial init, etc.); we can't tell and refuse to loop
2899+
- the current SHA matches the boot SHA
2900+
- reading the current SHA fails for any reason
27562901
"""
2757-
if not self._boot_wall_time or not self._boot_repo_mtime:
2902+
if not self._boot_wall_time:
2903+
return False
2904+
if not self._boot_git_sha:
2905+
# Non-git install. ``hermes update`` is git-based, so a
2906+
# non-git install can't experience the stale-modules class
2907+
# this check exists to catch. Return False — no check, no
2908+
# false positives. (If we ever ship a pip-install update
2909+
# path, we'd add a persistent update marker here and compare
2910+
# its timestamp to self._boot_wall_time.)
27582911
return False
27592912
try:
2760-
current = _compute_repo_mtime(self._repo_root_for_staleness)
2913+
current = self._current_git_sha_cached()
27612914
except Exception:
27622915
return False
2763-
if current <= 0.0:
2916+
if not current:
27642917
return False
2765-
# 2-second slack guards against filesystems with coarse mtime
2766-
# resolution (FAT32, some NFS mounts). Real updates always move
2767-
# the newest-file mtime forward by minutes, so this doesn't hide
2768-
# genuine staleness.
2769-
return current > self._boot_repo_mtime + 2.0
2918+
return current != self._boot_git_sha
27702919

27712920
def _trigger_stale_code_restart(self) -> None:
27722921
"""Idempotently kick off a graceful restart after stale-code detection.
@@ -2782,12 +2931,17 @@ def _trigger_stale_code_restart(self) -> None:
27822931
if self._stale_code_restart_triggered:
27832932
return
27842933
self._stale_code_restart_triggered = True
2934+
current_sha = None
2935+
try:
2936+
current_sha = self._current_git_sha_cached()
2937+
except Exception:
2938+
pass
27852939
logger.warning(
2786-
"Stale-code self-check: source files newer than gateway boot "
2787-
"time (boot=%.0f, newest=%.0f) — requesting graceful restart. "
2940+
"Stale-code self-check: git HEAD moved since gateway boot "
2941+
"(boot=%s, current=%s) — requesting graceful restart. "
27882942
"See Issue #17648.",
2789-
self._boot_repo_mtime,
2790-
_compute_repo_mtime(self._repo_root_for_staleness),
2943+
(self._boot_git_sha or "?")[:12],
2944+
(current_sha or "?")[:12],
27912945
)
27922946
try:
27932947
self.request_restart(detached=False, via_service=True)

0 commit comments

Comments
 (0)