@@ -101,10 +101,21 @@ def _replace(match: re.Match[str]) -> str:
101101# already-loaded stale module object and raises ``ImportError`` — see
102102# Issue #17648. Rather than papering over the import failure site-by-site
103103# in every tool file, detect the stale state centrally and auto-restart
104- # so the gateway reloads with fresh code. The sentinel files below are
105- # the canonical repo-level markers that every update touches; if any is
106- # newer than the gateway's boot time, we know the running process is out
107- # of date.
104+ # so the gateway reloads with fresh code.
105+ #
106+ # The signal we use is ``git rev-parse HEAD`` — the only thing ``hermes
107+ # update`` moves that is NOT moved by agent-driven file edits. Earlier
108+ # revisions of this check compared file mtimes across a sentinel set
109+ # (run_agent.py, gateway/run.py, ...), but that produced false positives
110+ # whenever the agent edited its own source files during a session:
111+ # mtime jumps, stale-check fires, gateway restarts, user must retype.
112+ # See the conversation at PR #<this> for the motivating incident.
113+ #
114+ # The legacy mtime sentinels are kept ONLY as a last-resort fallback for
115+ # non-git installs (pip install from wheel, sparse clones with no .git
116+ # dir). In those environments ``hermes update`` is not a supported path,
117+ # so the check effectively no-ops — which is the safe behavior: better
118+ # to ship one broken import than to restart on every agent-edit.
108119_STALE_CODE_SENTINELS: tuple[str, ...] = (
109120 "hermes_cli/config.py",
110121 "hermes_cli/__init__.py",
@@ -113,10 +124,106 @@ def _replace(match: re.Match[str]) -> str:
113124 "pyproject.toml",
114125)
115126
127+ # Cache git HEAD reads across consecutive messages so a chat burst doesn't
128+ # spawn one subprocess per message. 5s is long enough to collapse a burst
129+ # and short enough that the real post-update detection still fires within
130+ # the user's perceived "next message" window.
131+ _GIT_SHA_CACHE_TTL_SECS = 5.0
132+
133+
134+ def _read_git_head_sha(repo_root: Path) -> Optional[str]:
135+ """Return the git HEAD SHA for ``repo_root``, or None if unavailable.
136+
137+ Reads ``.git/HEAD`` directly (and follows one level of ref) instead
138+ of shelling out to ``git`` — cheaper, no subprocess tax, works on
139+ gateway hosts that don't have a ``git`` binary on PATH. Returns
140+ None for non-git installs (no ``.git`` dir) or any I/O error; callers
141+ treat None as "can't tell" and skip the check.
142+
143+ Supports the three layouts we care about:
144+ 1. Main checkout: ``<repo>/.git/`` is a directory.
145+ 2. Git worktree: ``<repo>/.git`` is a file ``gitdir: <path>`` that
146+ points at ``<main>/.git/worktrees/<name>/``. The worktree's
147+ gitdir has HEAD + index but NOT refs/heads/ — those live in
148+ the main checkout, and ``<worktree-gitdir>/commondir`` points
149+ at the main ``.git``. We search both locations for refs.
150+ 3. Packed refs: ``refs/heads/<branch>`` is absent on disk but
151+ listed in ``<main-git-dir>/packed-refs``.
152+ """
153+ try:
154+ git_dir = repo_root / ".git"
155+ # Worktrees store ``.git`` as a file pointing at gitdir: <path>
156+ if git_dir.is_file():
157+ try:
158+ content = git_dir.read_text().strip()
159+ if content.startswith("gitdir:"):
160+ git_dir = Path(content.split(":", 1)[1].strip())
161+ if not git_dir.is_absolute():
162+ git_dir = (repo_root / git_dir).resolve()
163+ except OSError:
164+ return None
165+ if not git_dir.is_dir():
166+ return None
167+
168+ # Figure out the "common" git dir — the one that owns shared refs.
169+ # For a worktree, commondir points at it (relative path, resolve
170+ # against git_dir). For a main checkout, common_dir == git_dir.
171+ common_dir = git_dir
172+ commondir_file = git_dir / "commondir"
173+ if commondir_file.is_file():
174+ try:
175+ rel = commondir_file.read_text().strip()
176+ candidate = (git_dir / rel).resolve() if rel else git_dir
177+ if candidate.is_dir():
178+ common_dir = candidate
179+ except OSError:
180+ pass
181+
182+ head_path = git_dir / "HEAD"
183+ if not head_path.is_file():
184+ return None
185+ head_content = head_path.read_text().strip()
186+
187+ if head_content.startswith("ref:"):
188+ # Symbolic ref — follow one level (e.g. ref: refs/heads/main).
189+ # Worktree-local refs (bisect, rebase-merge state) live under
190+ # git_dir; shared refs (refs/heads/*, refs/tags/*) live under
191+ # common_dir. Try git_dir first, then common_dir.
192+ ref_rel = head_content.split(":", 1)[1].strip()
193+ for base in (git_dir, common_dir) if git_dir != common_dir else (git_dir,):
194+ ref_path = base / ref_rel
195+ if ref_path.is_file():
196+ try:
197+ sha = ref_path.read_text().strip()
198+ except OSError:
199+ continue
200+ if sha:
201+ return sha
202+ # Packed refs fallback — always stored in the common dir.
203+ packed = common_dir / "packed-refs"
204+ if packed.is_file():
205+ try:
206+ for line in packed.read_text().splitlines():
207+ line = line.strip()
208+ if not line or line.startswith("#") or line.startswith("^"):
209+ continue
210+ parts = line.split(None, 1)
211+ if len(parts) == 2 and parts[1] == ref_rel:
212+ return parts[0] or None
213+ except OSError:
214+ return None
215+ return None
216+
217+ # Detached HEAD — content is the SHA directly.
218+ return head_content or None
219+ except Exception:
220+ return None
221+
116222
117223def _compute_repo_mtime(repo_root: Path) -> float:
118224 """Return the newest mtime across the stale-code sentinel files.
119225
226+ Legacy fallback used only for non-git installs (``.git`` missing).
120227 Missing files are ignored (they may not exist on older checkouts).
121228 Returns 0.0 if no sentinel file is readable — treat that as "can't
122229 tell", which downstream callers interpret as "not stale" to avoid
@@ -1001,6 +1108,7 @@ class GatewayRunner:
10011108 # running __init__ don't crash when _handle_message reads these.
10021109 _boot_wall_time: float = 0.0
10031110 _boot_repo_mtime: float = 0.0
1111+ _boot_git_sha: Optional[str] = None
10041112 _stale_code_restart_triggered: bool = False
10051113
10061114 def __init__(self, config: Optional[GatewayConfig] = None):
@@ -1016,15 +1124,23 @@ def __init__(self, config: Optional[GatewayConfig] = None):
10161124 try:
10171125 self._boot_wall_time: float = time.time()
10181126 self._repo_root_for_staleness: Path = Path(__file__).resolve().parent.parent
1127+ self._boot_git_sha: Optional[str] = _read_git_head_sha(
1128+ self._repo_root_for_staleness,
1129+ )
10191130 self._boot_repo_mtime: float = _compute_repo_mtime(
10201131 self._repo_root_for_staleness,
10211132 )
10221133 except Exception:
10231134 self._boot_wall_time = 0.0
10241135 self._repo_root_for_staleness = Path(".")
1136+ self._boot_git_sha = None
10251137 self._boot_repo_mtime = 0.0
10261138 self._stale_code_notified: set[str] = set()
10271139 self._stale_code_restart_triggered: bool = False
1140+ # Cached current-SHA read, refreshed at most every
1141+ # _GIT_SHA_CACHE_TTL_SECS so bursty chats don't hammer the filesystem.
1142+ self._cached_current_sha: Optional[str] = self._boot_git_sha
1143+ self._cached_current_sha_at: float = self._boot_wall_time
10281144
10291145 # Load ephemeral config from config.yaml / env vars.
10301146 # Both are injected at API-call time only and never persisted.
@@ -2733,36 +2849,69 @@ async def _run_restart() -> None:
27332849 task.add_done_callback(self._background_tasks.discard)
27342850 return True
27352851
2852+ def _current_git_sha_cached(self) -> Optional[str]:
2853+ """Return the current HEAD SHA, cached for _GIT_SHA_CACHE_TTL_SECS.
2854+
2855+ A bursty chat (user mashes "hello?" three times) would otherwise
2856+ re-read ``.git/HEAD`` on every message. Caching collapses that
2857+ into a single read and still re-checks within the user's
2858+ perceived "next message" window.
2859+ """
2860+ now = time.time()
2861+ if (
2862+ self._cached_current_sha is not None
2863+ and (now - self._cached_current_sha_at) < _GIT_SHA_CACHE_TTL_SECS
2864+ ):
2865+ return self._cached_current_sha
2866+ try:
2867+ sha = _read_git_head_sha(self._repo_root_for_staleness)
2868+ except Exception:
2869+ sha = None
2870+ self._cached_current_sha = sha
2871+ self._cached_current_sha_at = now
2872+ return sha
2873+
27362874 def _detect_stale_code(self) -> bool:
2737- """Return True if source files on disk are newer than the running process.
2875+ """Return True if the git HEAD moved since this process booted .
27382876
27392877 A gateway that survives ``hermes update`` (manual SIGTERM never
27402878 escalated, systemd restart race, detached-process respawn failed,
27412879 etc.) keeps pre-update modules cached in ``sys.modules``. Later
27422880 imports of names added post-update — e.g. ``cfg_get`` from PR
27432881 #17304 — raise ImportError against the stale module object (see
2744- Issue #17648). Detecting this at the source — "the code on disk
2745- is newer than me" — lets us auto-restart instead of serving
2746- broken responses until the user notices and runs
2747- ``hermes gateway restart`` manually.
2748-
2749- Returns False when the boot-time snapshot is unavailable or no
2750- sentinel file is readable, to avoid false-positive restart loops
2751- in unusual checkouts (sparse clones, read-only filesystems).
2882+ Issue #17648).
2883+
2884+ We compare the git HEAD SHA at boot to the current SHA on disk.
2885+ ``hermes update`` always moves HEAD forward via ``git pull``;
2886+ agent file edits (the agent patching ``run_agent.py`` or
2887+ ``gateway/run.py`` during a self-dev session) never move HEAD.
2888+ That makes SHA comparison free of the false-positive class that
2889+ the old mtime check suffered from — the agent can edit any file
2890+ without triggering a phantom restart.
2891+
2892+ Returns False when:
2893+ - the boot SHA is unavailable (non-git install, first call
2894+ during partial init, etc.); we can't tell and refuse to loop
2895+ - the current SHA matches the boot SHA
2896+ - reading the current SHA fails for any reason
27522897 """
2753- if not self._boot_wall_time or not self._boot_repo_mtime:
2898+ if not self._boot_wall_time:
2899+ return False
2900+ if not self._boot_git_sha:
2901+ # Non-git install. ``hermes update`` is git-based, so a
2902+ # non-git install can't experience the stale-modules class
2903+ # this check exists to catch. Return False — no check, no
2904+ # false positives. (If we ever ship a pip-install update
2905+ # path, we'd add a persistent update marker here and compare
2906+ # its timestamp to self._boot_wall_time.)
27542907 return False
27552908 try:
2756- current = _compute_repo_mtime( self._repo_root_for_staleness )
2909+ current = self._current_git_sha_cached( )
27572910 except Exception:
27582911 return False
2759- if current <= 0.0 :
2912+ if not current :
27602913 return False
2761- # 2-second slack guards against filesystems with coarse mtime
2762- # resolution (FAT32, some NFS mounts). Real updates always move
2763- # the newest-file mtime forward by minutes, so this doesn't hide
2764- # genuine staleness.
2765- return current > self._boot_repo_mtime + 2.0
2914+ return current != self._boot_git_sha
27662915
27672916 def _trigger_stale_code_restart(self) -> None:
27682917 """Idempotently kick off a graceful restart after stale-code detection.
@@ -2778,12 +2927,17 @@ def _trigger_stale_code_restart(self) -> None:
27782927 if self._stale_code_restart_triggered:
27792928 return
27802929 self._stale_code_restart_triggered = True
2930+ current_sha = None
2931+ try:
2932+ current_sha = self._current_git_sha_cached()
2933+ except Exception:
2934+ pass
27812935 logger.warning(
2782- "Stale-code self-check: source files newer than gateway boot "
2783- "time (boot=%.0f, newest=%.0f ) — requesting graceful restart. "
2936+ "Stale-code self-check: git HEAD moved since gateway boot "
2937+ "(boot=%s, current=%s ) — requesting graceful restart. "
27842938 "See Issue #17648.",
2785- self._boot_repo_mtime ,
2786- _compute_repo_mtime(self._repo_root_for_staleness) ,
2939+ ( self._boot_git_sha or "?")[:12] ,
2940+ (current_sha or "?")[:12] ,
27872941 )
27882942 try:
27892943 self.request_restart(detached=False, via_service=True)
0 commit comments