Skip to content

Commit 936320d

Browse files
teknium1nickdlkk
authored andcommitted
fix(gateway): use git HEAD SHA, not file mtimes, for stale-code check (NousResearch#19740)
The stale-code self-check (Issue NousResearch#17648) used sentinel-file mtimes to decide whether the gateway survived a `hermes update` with stale `sys.modules`. That signal false-positives on any write to the sentinel files — including agent-driven edits during Hermes-on-Hermes dev sessions. Telling the agent to patch `run_agent.py` would flip the check to True on the next user message and force a gateway restart even though no update happened. Switch the signal to `git rev-parse HEAD`. Agent file edits don't move HEAD; `hermes update` (git pull) always does. Reading .git/HEAD directly (no subprocess) with a 5s cache keeps the overhead negligible on bursty chats. Non-git installs short-circuit to False — the stale-modules class can't occur without a git-backed update path, so there's nothing to detect. The legacy `_compute_repo_mtime` helper is kept but unused by detection, reserved as a fallback hook for future pip-install update paths. - _read_git_head_sha(): resolves HEAD across main checkout, worktree (follows `gitdir:` + `commondir` pointers), and packed-refs layouts. - _current_git_sha_cached(): per-runner 5s SHA cache. - _detect_stale_code(): boot SHA vs current SHA, returns False when either is unavailable. - Tests cover all four layouts, the agent-edits-don't-trigger regression, and cache behavior. Refs NousResearch#17648.
1 parent 67dad76 commit 936320d

2 files changed

Lines changed: 476 additions & 133 deletions

File tree

gateway/run.py

Lines changed: 179 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -101,10 +101,21 @@ def _replace(match: re.Match[str]) -> str:
101101
# already-loaded stale module object and raises ``ImportError`` — see
102102
# Issue #17648. Rather than papering over the import failure site-by-site
103103
# in every tool file, detect the stale state centrally and auto-restart
104-
# so the gateway reloads with fresh code. The sentinel files below are
105-
# the canonical repo-level markers that every update touches; if any is
106-
# newer than the gateway's boot time, we know the running process is out
107-
# of date.
104+
# so the gateway reloads with fresh code.
105+
#
106+
# The signal we use is ``git rev-parse HEAD`` — the only thing ``hermes
107+
# update`` moves that is NOT moved by agent-driven file edits. Earlier
108+
# revisions of this check compared file mtimes across a sentinel set
109+
# (run_agent.py, gateway/run.py, ...), but that produced false positives
110+
# whenever the agent edited its own source files during a session:
111+
# mtime jumps, stale-check fires, gateway restarts, user must retype.
112+
# See the conversation at PR #<this> for the motivating incident.
113+
#
114+
# The legacy mtime sentinels are kept ONLY as a last-resort fallback for
115+
# non-git installs (pip install from wheel, sparse clones with no .git
116+
# dir). In those environments ``hermes update`` is not a supported path,
117+
# so the check effectively no-ops — which is the safe behavior: better
118+
# to ship one broken import than to restart on every agent-edit.
108119
_STALE_CODE_SENTINELS: tuple[str, ...] = (
109120
"hermes_cli/config.py",
110121
"hermes_cli/__init__.py",
@@ -113,10 +124,106 @@ def _replace(match: re.Match[str]) -> str:
113124
"pyproject.toml",
114125
)
115126

127+
# Cache git HEAD reads across consecutive messages so a chat burst doesn't
128+
# spawn one subprocess per message. 5s is long enough to collapse a burst
129+
# and short enough that the real post-update detection still fires within
130+
# the user's perceived "next message" window.
131+
_GIT_SHA_CACHE_TTL_SECS = 5.0
132+
133+
134+
def _read_git_head_sha(repo_root: Path) -> Optional[str]:
135+
"""Return the git HEAD SHA for ``repo_root``, or None if unavailable.
136+
137+
Reads ``.git/HEAD`` directly (and follows one level of ref) instead
138+
of shelling out to ``git`` — cheaper, no subprocess tax, works on
139+
gateway hosts that don't have a ``git`` binary on PATH. Returns
140+
None for non-git installs (no ``.git`` dir) or any I/O error; callers
141+
treat None as "can't tell" and skip the check.
142+
143+
Supports the three layouts we care about:
144+
1. Main checkout: ``<repo>/.git/`` is a directory.
145+
2. Git worktree: ``<repo>/.git`` is a file ``gitdir: <path>`` that
146+
points at ``<main>/.git/worktrees/<name>/``. The worktree's
147+
gitdir has HEAD + index but NOT refs/heads/ — those live in
148+
the main checkout, and ``<worktree-gitdir>/commondir`` points
149+
at the main ``.git``. We search both locations for refs.
150+
3. Packed refs: ``refs/heads/<branch>`` is absent on disk but
151+
listed in ``<main-git-dir>/packed-refs``.
152+
"""
153+
try:
154+
git_dir = repo_root / ".git"
155+
# Worktrees store ``.git`` as a file pointing at gitdir: <path>
156+
if git_dir.is_file():
157+
try:
158+
content = git_dir.read_text().strip()
159+
if content.startswith("gitdir:"):
160+
git_dir = Path(content.split(":", 1)[1].strip())
161+
if not git_dir.is_absolute():
162+
git_dir = (repo_root / git_dir).resolve()
163+
except OSError:
164+
return None
165+
if not git_dir.is_dir():
166+
return None
167+
168+
# Figure out the "common" git dir — the one that owns shared refs.
169+
# For a worktree, commondir points at it (relative path, resolve
170+
# against git_dir). For a main checkout, common_dir == git_dir.
171+
common_dir = git_dir
172+
commondir_file = git_dir / "commondir"
173+
if commondir_file.is_file():
174+
try:
175+
rel = commondir_file.read_text().strip()
176+
candidate = (git_dir / rel).resolve() if rel else git_dir
177+
if candidate.is_dir():
178+
common_dir = candidate
179+
except OSError:
180+
pass
181+
182+
head_path = git_dir / "HEAD"
183+
if not head_path.is_file():
184+
return None
185+
head_content = head_path.read_text().strip()
186+
187+
if head_content.startswith("ref:"):
188+
# Symbolic ref — follow one level (e.g. ref: refs/heads/main).
189+
# Worktree-local refs (bisect, rebase-merge state) live under
190+
# git_dir; shared refs (refs/heads/*, refs/tags/*) live under
191+
# common_dir. Try git_dir first, then common_dir.
192+
ref_rel = head_content.split(":", 1)[1].strip()
193+
for base in (git_dir, common_dir) if git_dir != common_dir else (git_dir,):
194+
ref_path = base / ref_rel
195+
if ref_path.is_file():
196+
try:
197+
sha = ref_path.read_text().strip()
198+
except OSError:
199+
continue
200+
if sha:
201+
return sha
202+
# Packed refs fallback — always stored in the common dir.
203+
packed = common_dir / "packed-refs"
204+
if packed.is_file():
205+
try:
206+
for line in packed.read_text().splitlines():
207+
line = line.strip()
208+
if not line or line.startswith("#") or line.startswith("^"):
209+
continue
210+
parts = line.split(None, 1)
211+
if len(parts) == 2 and parts[1] == ref_rel:
212+
return parts[0] or None
213+
except OSError:
214+
return None
215+
return None
216+
217+
# Detached HEAD — content is the SHA directly.
218+
return head_content or None
219+
except Exception:
220+
return None
221+
116222

117223
def _compute_repo_mtime(repo_root: Path) -> float:
118224
"""Return the newest mtime across the stale-code sentinel files.
119225

226+
Legacy fallback used only for non-git installs (``.git`` missing).
120227
Missing files are ignored (they may not exist on older checkouts).
121228
Returns 0.0 if no sentinel file is readable — treat that as "can't
122229
tell", which downstream callers interpret as "not stale" to avoid
@@ -1001,6 +1108,7 @@ class GatewayRunner:
10011108
# running __init__ don't crash when _handle_message reads these.
10021109
_boot_wall_time: float = 0.0
10031110
_boot_repo_mtime: float = 0.0
1111+
_boot_git_sha: Optional[str] = None
10041112
_stale_code_restart_triggered: bool = False
10051113

10061114
def __init__(self, config: Optional[GatewayConfig] = None):
@@ -1016,15 +1124,23 @@ def __init__(self, config: Optional[GatewayConfig] = None):
10161124
try:
10171125
self._boot_wall_time: float = time.time()
10181126
self._repo_root_for_staleness: Path = Path(__file__).resolve().parent.parent
1127+
self._boot_git_sha: Optional[str] = _read_git_head_sha(
1128+
self._repo_root_for_staleness,
1129+
)
10191130
self._boot_repo_mtime: float = _compute_repo_mtime(
10201131
self._repo_root_for_staleness,
10211132
)
10221133
except Exception:
10231134
self._boot_wall_time = 0.0
10241135
self._repo_root_for_staleness = Path(".")
1136+
self._boot_git_sha = None
10251137
self._boot_repo_mtime = 0.0
10261138
self._stale_code_notified: set[str] = set()
10271139
self._stale_code_restart_triggered: bool = False
1140+
# Cached current-SHA read, refreshed at most every
1141+
# _GIT_SHA_CACHE_TTL_SECS so bursty chats don't hammer the filesystem.
1142+
self._cached_current_sha: Optional[str] = self._boot_git_sha
1143+
self._cached_current_sha_at: float = self._boot_wall_time
10281144

10291145
# Load ephemeral config from config.yaml / env vars.
10301146
# Both are injected at API-call time only and never persisted.
@@ -2733,36 +2849,69 @@ async def _run_restart() -> None:
27332849
task.add_done_callback(self._background_tasks.discard)
27342850
return True
27352851

2852+
def _current_git_sha_cached(self) -> Optional[str]:
2853+
"""Return the current HEAD SHA, cached for _GIT_SHA_CACHE_TTL_SECS.
2854+
2855+
A bursty chat (user mashes "hello?" three times) would otherwise
2856+
re-read ``.git/HEAD`` on every message. Caching collapses that
2857+
into a single read and still re-checks within the user's
2858+
perceived "next message" window.
2859+
"""
2860+
now = time.time()
2861+
if (
2862+
self._cached_current_sha is not None
2863+
and (now - self._cached_current_sha_at) < _GIT_SHA_CACHE_TTL_SECS
2864+
):
2865+
return self._cached_current_sha
2866+
try:
2867+
sha = _read_git_head_sha(self._repo_root_for_staleness)
2868+
except Exception:
2869+
sha = None
2870+
self._cached_current_sha = sha
2871+
self._cached_current_sha_at = now
2872+
return sha
2873+
27362874
def _detect_stale_code(self) -> bool:
2737-
"""Return True if source files on disk are newer than the running process.
2875+
"""Return True if the git HEAD moved since this process booted.
27382876

27392877
A gateway that survives ``hermes update`` (manual SIGTERM never
27402878
escalated, systemd restart race, detached-process respawn failed,
27412879
etc.) keeps pre-update modules cached in ``sys.modules``. Later
27422880
imports of names added post-update — e.g. ``cfg_get`` from PR
27432881
#17304 — raise ImportError against the stale module object (see
2744-
Issue #17648). Detecting this at the source — "the code on disk
2745-
is newer than me" — lets us auto-restart instead of serving
2746-
broken responses until the user notices and runs
2747-
``hermes gateway restart`` manually.
2748-
2749-
Returns False when the boot-time snapshot is unavailable or no
2750-
sentinel file is readable, to avoid false-positive restart loops
2751-
in unusual checkouts (sparse clones, read-only filesystems).
2882+
Issue #17648).
2883+
2884+
We compare the git HEAD SHA at boot to the current SHA on disk.
2885+
``hermes update`` always moves HEAD forward via ``git pull``;
2886+
agent file edits (the agent patching ``run_agent.py`` or
2887+
``gateway/run.py`` during a self-dev session) never move HEAD.
2888+
That makes SHA comparison free of the false-positive class that
2889+
the old mtime check suffered from — the agent can edit any file
2890+
without triggering a phantom restart.
2891+
2892+
Returns False when:
2893+
- the boot SHA is unavailable (non-git install, first call
2894+
during partial init, etc.); we can't tell and refuse to loop
2895+
- the current SHA matches the boot SHA
2896+
- reading the current SHA fails for any reason
27522897
"""
2753-
if not self._boot_wall_time or not self._boot_repo_mtime:
2898+
if not self._boot_wall_time:
2899+
return False
2900+
if not self._boot_git_sha:
2901+
# Non-git install. ``hermes update`` is git-based, so a
2902+
# non-git install can't experience the stale-modules class
2903+
# this check exists to catch. Return False — no check, no
2904+
# false positives. (If we ever ship a pip-install update
2905+
# path, we'd add a persistent update marker here and compare
2906+
# its timestamp to self._boot_wall_time.)
27542907
return False
27552908
try:
2756-
current = _compute_repo_mtime(self._repo_root_for_staleness)
2909+
current = self._current_git_sha_cached()
27572910
except Exception:
27582911
return False
2759-
if current <= 0.0:
2912+
if not current:
27602913
return False
2761-
# 2-second slack guards against filesystems with coarse mtime
2762-
# resolution (FAT32, some NFS mounts). Real updates always move
2763-
# the newest-file mtime forward by minutes, so this doesn't hide
2764-
# genuine staleness.
2765-
return current > self._boot_repo_mtime + 2.0
2914+
return current != self._boot_git_sha
27662915

27672916
def _trigger_stale_code_restart(self) -> None:
27682917
"""Idempotently kick off a graceful restart after stale-code detection.
@@ -2778,12 +2927,17 @@ def _trigger_stale_code_restart(self) -> None:
27782927
if self._stale_code_restart_triggered:
27792928
return
27802929
self._stale_code_restart_triggered = True
2930+
current_sha = None
2931+
try:
2932+
current_sha = self._current_git_sha_cached()
2933+
except Exception:
2934+
pass
27812935
logger.warning(
2782-
"Stale-code self-check: source files newer than gateway boot "
2783-
"time (boot=%.0f, newest=%.0f) — requesting graceful restart. "
2936+
"Stale-code self-check: git HEAD moved since gateway boot "
2937+
"(boot=%s, current=%s) — requesting graceful restart. "
27842938
"See Issue #17648.",
2785-
self._boot_repo_mtime,
2786-
_compute_repo_mtime(self._repo_root_for_staleness),
2939+
(self._boot_git_sha or "?")[:12],
2940+
(current_sha or "?")[:12],
27872941
)
27882942
try:
27892943
self.request_restart(detached=False, via_service=True)

0 commit comments

Comments
 (0)