@@ -101,10 +101,21 @@ def _replace(match: re.Match[str]) -> str:
101101# already-loaded stale module object and raises ``ImportError`` — see
102102# Issue #17648. Rather than papering over the import failure site-by-site
103103# in every tool file, detect the stale state centrally and auto-restart
104- # so the gateway reloads with fresh code. The sentinel files below are
105- # the canonical repo-level markers that every update touches; if any is
106- # newer than the gateway's boot time, we know the running process is out
107- # of date.
104+ # so the gateway reloads with fresh code.
105+ #
106+ # The signal we use is ``git rev-parse HEAD`` — the only thing ``hermes
107+ # update`` moves that is NOT moved by agent-driven file edits. Earlier
108+ # revisions of this check compared file mtimes across a sentinel set
109+ # (run_agent.py, gateway/run.py, ...), but that produced false positives
110+ # whenever the agent edited its own source files during a session:
111+ # mtime jumps, stale-check fires, gateway restarts, user must retype.
112+ # See the conversation at PR #<this> for the motivating incident.
113+ #
114+ # The legacy mtime sentinels are kept ONLY as a last-resort fallback for
115+ # non-git installs (pip install from wheel, sparse clones with no .git
116+ # dir). In those environments ``hermes update`` is not a supported path,
117+ # so the check effectively no-ops — which is the safe behavior: better
118+ # to ship one broken import than to restart on every agent-edit.
108119_STALE_CODE_SENTINELS: tuple[str, ...] = (
109120 "hermes_cli/config.py",
110121 "hermes_cli/__init__.py",
@@ -113,10 +124,106 @@ def _replace(match: re.Match[str]) -> str:
113124 "pyproject.toml",
114125)
115126
127+ # Cache git HEAD reads across consecutive messages so a chat burst doesn't
128+ # spawn one subprocess per message. 5s is long enough to collapse a burst
129+ # and short enough that the real post-update detection still fires within
130+ # the user's perceived "next message" window.
131+ _GIT_SHA_CACHE_TTL_SECS = 5.0
132+
133+
134+ def _read_git_head_sha(repo_root: Path) -> Optional[str]:
135+ """Return the git HEAD SHA for ``repo_root``, or None if unavailable.
136+
137+ Reads ``.git/HEAD`` directly (and follows one level of ref) instead
138+ of shelling out to ``git`` — cheaper, no subprocess tax, works on
139+ gateway hosts that don't have a ``git`` binary on PATH. Returns
140+ None for non-git installs (no ``.git`` dir) or any I/O error; callers
141+ treat None as "can't tell" and skip the check.
142+
143+ Supports the three layouts we care about:
144+ 1. Main checkout: ``<repo>/.git/`` is a directory.
145+ 2. Git worktree: ``<repo>/.git`` is a file ``gitdir: <path>`` that
146+ points at ``<main>/.git/worktrees/<name>/``. The worktree's
147+ gitdir has HEAD + index but NOT refs/heads/ — those live in
148+ the main checkout, and ``<worktree-gitdir>/commondir`` points
149+ at the main ``.git``. We search both locations for refs.
150+ 3. Packed refs: ``refs/heads/<branch>`` is absent on disk but
151+ listed in ``<main-git-dir>/packed-refs``.
152+ """
153+ try:
154+ git_dir = repo_root / ".git"
155+ # Worktrees store ``.git`` as a file pointing at gitdir: <path>
156+ if git_dir.is_file():
157+ try:
158+ content = git_dir.read_text().strip()
159+ if content.startswith("gitdir:"):
160+ git_dir = Path(content.split(":", 1)[1].strip())
161+ if not git_dir.is_absolute():
162+ git_dir = (repo_root / git_dir).resolve()
163+ except OSError:
164+ return None
165+ if not git_dir.is_dir():
166+ return None
167+
168+ # Figure out the "common" git dir — the one that owns shared refs.
169+ # For a worktree, commondir points at it (relative path, resolve
170+ # against git_dir). For a main checkout, common_dir == git_dir.
171+ common_dir = git_dir
172+ commondir_file = git_dir / "commondir"
173+ if commondir_file.is_file():
174+ try:
175+ rel = commondir_file.read_text().strip()
176+ candidate = (git_dir / rel).resolve() if rel else git_dir
177+ if candidate.is_dir():
178+ common_dir = candidate
179+ except OSError:
180+ pass
181+
182+ head_path = git_dir / "HEAD"
183+ if not head_path.is_file():
184+ return None
185+ head_content = head_path.read_text().strip()
186+
187+ if head_content.startswith("ref:"):
188+ # Symbolic ref — follow one level (e.g. ref: refs/heads/main).
189+ # Worktree-local refs (bisect, rebase-merge state) live under
190+ # git_dir; shared refs (refs/heads/*, refs/tags/*) live under
191+ # common_dir. Try git_dir first, then common_dir.
192+ ref_rel = head_content.split(":", 1)[1].strip()
193+ for base in (git_dir, common_dir) if git_dir != common_dir else (git_dir,):
194+ ref_path = base / ref_rel
195+ if ref_path.is_file():
196+ try:
197+ sha = ref_path.read_text().strip()
198+ except OSError:
199+ continue
200+ if sha:
201+ return sha
202+ # Packed refs fallback — always stored in the common dir.
203+ packed = common_dir / "packed-refs"
204+ if packed.is_file():
205+ try:
206+ for line in packed.read_text().splitlines():
207+ line = line.strip()
208+ if not line or line.startswith("#") or line.startswith("^"):
209+ continue
210+ parts = line.split(None, 1)
211+ if len(parts) == 2 and parts[1] == ref_rel:
212+ return parts[0] or None
213+ except OSError:
214+ return None
215+ return None
216+
217+ # Detached HEAD — content is the SHA directly.
218+ return head_content or None
219+ except Exception:
220+ return None
221+
116222
117223def _compute_repo_mtime(repo_root: Path) -> float:
118224 """Return the newest mtime across the stale-code sentinel files.
119225
226+ Legacy fallback used only for non-git installs (``.git`` missing).
120227 Missing files are ignored (they may not exist on older checkouts).
121228 Returns 0.0 if no sentinel file is readable — treat that as "can't
122229 tell", which downstream callers interpret as "not stale" to avoid
@@ -1005,6 +1112,7 @@ class GatewayRunner:
10051112 # running __init__ don't crash when _handle_message reads these.
10061113 _boot_wall_time: float = 0.0
10071114 _boot_repo_mtime: float = 0.0
1115+ _boot_git_sha: Optional[str] = None
10081116 _stale_code_restart_triggered: bool = False
10091117
10101118 def __init__(self, config: Optional[GatewayConfig] = None):
@@ -1020,15 +1128,23 @@ def __init__(self, config: Optional[GatewayConfig] = None):
10201128 try:
10211129 self._boot_wall_time: float = time.time()
10221130 self._repo_root_for_staleness: Path = Path(__file__).resolve().parent.parent
1131+ self._boot_git_sha: Optional[str] = _read_git_head_sha(
1132+ self._repo_root_for_staleness,
1133+ )
10231134 self._boot_repo_mtime: float = _compute_repo_mtime(
10241135 self._repo_root_for_staleness,
10251136 )
10261137 except Exception:
10271138 self._boot_wall_time = 0.0
10281139 self._repo_root_for_staleness = Path(".")
1140+ self._boot_git_sha = None
10291141 self._boot_repo_mtime = 0.0
10301142 self._stale_code_notified: set[str] = set()
10311143 self._stale_code_restart_triggered: bool = False
1144+ # Cached current-SHA read, refreshed at most every
1145+ # _GIT_SHA_CACHE_TTL_SECS so bursty chats don't hammer the filesystem.
1146+ self._cached_current_sha: Optional[str] = self._boot_git_sha
1147+ self._cached_current_sha_at: float = self._boot_wall_time
10321148
10331149 # Load ephemeral config from config.yaml / env vars.
10341150 # Both are injected at API-call time only and never persisted.
@@ -2737,36 +2853,69 @@ async def _run_restart() -> None:
27372853 task.add_done_callback(self._background_tasks.discard)
27382854 return True
27392855
2856+ def _current_git_sha_cached(self) -> Optional[str]:
2857+ """Return the current HEAD SHA, cached for _GIT_SHA_CACHE_TTL_SECS.
2858+
2859+ A bursty chat (user mashes "hello?" three times) would otherwise
2860+ re-read ``.git/HEAD`` on every message. Caching collapses that
2861+ into a single read and still re-checks within the user's
2862+ perceived "next message" window.
2863+ """
2864+ now = time.time()
2865+ if (
2866+ self._cached_current_sha is not None
2867+ and (now - self._cached_current_sha_at) < _GIT_SHA_CACHE_TTL_SECS
2868+ ):
2869+ return self._cached_current_sha
2870+ try:
2871+ sha = _read_git_head_sha(self._repo_root_for_staleness)
2872+ except Exception:
2873+ sha = None
2874+ self._cached_current_sha = sha
2875+ self._cached_current_sha_at = now
2876+ return sha
2877+
27402878 def _detect_stale_code(self) -> bool:
2741- """Return True if source files on disk are newer than the running process.
2879+ """Return True if the git HEAD moved since this process booted .
27422880
27432881 A gateway that survives ``hermes update`` (manual SIGTERM never
27442882 escalated, systemd restart race, detached-process respawn failed,
27452883 etc.) keeps pre-update modules cached in ``sys.modules``. Later
27462884 imports of names added post-update — e.g. ``cfg_get`` from PR
27472885 #17304 — raise ImportError against the stale module object (see
2748- Issue #17648). Detecting this at the source — "the code on disk
2749- is newer than me" — lets us auto-restart instead of serving
2750- broken responses until the user notices and runs
2751- ``hermes gateway restart`` manually.
2752-
2753- Returns False when the boot-time snapshot is unavailable or no
2754- sentinel file is readable, to avoid false-positive restart loops
2755- in unusual checkouts (sparse clones, read-only filesystems).
2886+ Issue #17648).
2887+
2888+ We compare the git HEAD SHA at boot to the current SHA on disk.
2889+ ``hermes update`` always moves HEAD forward via ``git pull``;
2890+ agent file edits (the agent patching ``run_agent.py`` or
2891+ ``gateway/run.py`` during a self-dev session) never move HEAD.
2892+ That makes SHA comparison free of the false-positive class that
2893+ the old mtime check suffered from — the agent can edit any file
2894+ without triggering a phantom restart.
2895+
2896+ Returns False when:
2897+ - the boot SHA is unavailable (non-git install, first call
2898+ during partial init, etc.); we can't tell and refuse to loop
2899+ - the current SHA matches the boot SHA
2900+ - reading the current SHA fails for any reason
27562901 """
2757- if not self._boot_wall_time or not self._boot_repo_mtime:
2902+ if not self._boot_wall_time:
2903+ return False
2904+ if not self._boot_git_sha:
2905+ # Non-git install. ``hermes update`` is git-based, so a
2906+ # non-git install can't experience the stale-modules class
2907+ # this check exists to catch. Return False — no check, no
2908+ # false positives. (If we ever ship a pip-install update
2909+ # path, we'd add a persistent update marker here and compare
2910+ # its timestamp to self._boot_wall_time.)
27582911 return False
27592912 try:
2760- current = _compute_repo_mtime( self._repo_root_for_staleness )
2913+ current = self._current_git_sha_cached( )
27612914 except Exception:
27622915 return False
2763- if current <= 0.0 :
2916+ if not current :
27642917 return False
2765- # 2-second slack guards against filesystems with coarse mtime
2766- # resolution (FAT32, some NFS mounts). Real updates always move
2767- # the newest-file mtime forward by minutes, so this doesn't hide
2768- # genuine staleness.
2769- return current > self._boot_repo_mtime + 2.0
2918+ return current != self._boot_git_sha
27702919
27712920 def _trigger_stale_code_restart(self) -> None:
27722921 """Idempotently kick off a graceful restart after stale-code detection.
@@ -2782,12 +2931,17 @@ def _trigger_stale_code_restart(self) -> None:
27822931 if self._stale_code_restart_triggered:
27832932 return
27842933 self._stale_code_restart_triggered = True
2934+ current_sha = None
2935+ try:
2936+ current_sha = self._current_git_sha_cached()
2937+ except Exception:
2938+ pass
27852939 logger.warning(
2786- "Stale-code self-check: source files newer than gateway boot "
2787- "time (boot=%.0f, newest=%.0f ) — requesting graceful restart. "
2940+ "Stale-code self-check: git HEAD moved since gateway boot "
2941+ "(boot=%s, current=%s ) — requesting graceful restart. "
27882942 "See Issue #17648.",
2789- self._boot_repo_mtime ,
2790- _compute_repo_mtime(self._repo_root_for_staleness) ,
2943+ ( self._boot_git_sha or "?")[:12] ,
2944+ (current_sha or "?")[:12] ,
27912945 )
27922946 try:
27932947 self.request_restart(detached=False, via_service=True)
0 commit comments