@@ -637,6 +637,8 @@ def release_all_scoped_locks(
637637
638638_TAKEOVER_MARKER_FILENAME = ".gateway-takeover.json"
639639_TAKEOVER_MARKER_TTL_S = 60 # Marker older than this is treated as stale
640+ _PLANNED_STOP_MARKER_FILENAME = ".gateway-planned-stop.json"
641+ _PLANNED_STOP_MARKER_TTL_S = 60
640642
641643
642644def _get_takeover_marker_path () -> Path :
@@ -645,6 +647,67 @@ def _get_takeover_marker_path() -> Path:
645647 return home / _TAKEOVER_MARKER_FILENAME
646648
647649
def _get_planned_stop_marker_path() -> Path:
    """Locate the intentional-stop marker file inside the Hermes home directory."""
    return get_hermes_home() / _PLANNED_STOP_MARKER_FILENAME
654+
655+
656+ def _marker_is_stale (written_at : str , ttl_s : int ) -> bool :
657+ try :
658+ written_dt = datetime .fromisoformat (written_at )
659+ age = (datetime .now (timezone .utc ) - written_dt ).total_seconds ()
660+ return age > ttl_s
661+ except (TypeError , ValueError ):
662+ return True
663+
664+
def _discard_marker(path: Path) -> None:
    """Remove a marker file on a best-effort basis, ignoring filesystem errors."""
    try:
        path.unlink(missing_ok=True)
    except OSError:
        pass


def _consume_pid_marker_for_self(
    path: Path,
    *,
    pid_field: str,
    start_time_field: str,
    ttl_s: int,
) -> bool:
    """Check whether the marker at ``path`` names the current process, then drop it.

    Args:
        path: Location of the JSON marker file.
        pid_field: Key in the marker record holding the targeted PID.
        start_time_field: Key holding the targeted process's start time.
        ttl_s: Maximum accepted marker age in seconds.

    Returns:
        True only when the marker is well-formed, not stale, and both its
        PID and its start time equal those of the current process. A start
        time that is missing on either side never matches.

    The marker file is unlinked whenever a record was read: on malformed
    content, on staleness, and after the identity comparison regardless of
    its outcome. When no record could be read the file is left untouched.
    """
    record = _read_json_file(path)
    if not record:
        return False

    try:
        target_pid = int(record[pid_field])
        target_start_time = record.get(start_time_field)
        written_at = record.get("written_at") or ""
    except (KeyError, TypeError, ValueError, OverflowError):
        # OverflowError: int() rejects a non-finite float with OverflowError
        # rather than ValueError, which a corrupt marker could contain
        # (e.g. a JSON ``Infinity``, if the reader accepts such values).
        # Treat it like any other malformed record instead of letting it
        # propagate to the caller.
        _discard_marker(path)
        return False

    if _marker_is_stale(written_at, ttl_s):
        _discard_marker(path)
        return False

    # Compare PID *and* start time so a marker written for an earlier
    # process that happened to have the same PID does not match.
    our_pid = os.getpid()
    our_start_time = _get_process_start_time(our_pid)
    matches = (
        target_pid == our_pid
        and target_start_time is not None
        and our_start_time is not None
        and target_start_time == our_start_time
    )

    # Consume the marker whether or not it matched.
    _discard_marker(path)
    return matches
709+
710+
648711def write_takeover_marker (target_pid : int ) -> bool :
649712 """Record that ``target_pid`` is being replaced by the current process.
650713
@@ -681,64 +744,57 @@ def consume_takeover_marker_for_self() -> bool:
681744 Always unlinks the marker on match (and on detected staleness) so
682745 subsequent unrelated signals don't re-trigger.
683746 """
684- path = _get_takeover_marker_path ()
685- record = _read_json_file (path )
686- if not record :
687- return False
747+ return _consume_pid_marker_for_self (
748+ _get_takeover_marker_path (),
749+ pid_field = "target_pid" ,
750+ start_time_field = "target_start_time" ,
751+ ttl_s = _TAKEOVER_MARKER_TTL_S ,
752+ )
688753
689- # Any malformed or stale marker → drop it and return False
690- try :
691- target_pid = int (record ["target_pid" ])
692- target_start_time = record .get ("target_start_time" )
693- written_at = record .get ("written_at" ) or ""
694- except (KeyError , TypeError , ValueError ):
695- try :
696- path .unlink (missing_ok = True )
697- except OSError :
698- pass
699- return False
700754
701- # TTL guard: a stale marker older than _TAKEOVER_MARKER_TTL_S is ignored.
702- stale = False
def clear_takeover_marker() -> None:
    """Delete the takeover marker if present; calling it again is harmless."""
    try:
        marker_path = _get_takeover_marker_path()
        marker_path.unlink(missing_ok=True)
    except OSError:
        # Best-effort cleanup: filesystem errors are deliberately ignored.
        return
710761
711- if stale :
712- try :
713- path .unlink (missing_ok = True )
714- except OSError :
715- pass
716- return False
717762
718- # Does the marker name THIS process?
719- our_pid = os .getpid ()
720- our_start_time = _get_process_start_time (our_pid )
721- matches = (
722- target_pid == our_pid
723- and target_start_time is not None
724- and our_start_time is not None
725- and target_start_time == our_start_time
726- )
def write_planned_stop_marker(target_pid: int) -> bool:
    """Leave a marker saying ``target_pid`` is about to be stopped on purpose.

    An unexpected SIGTERM makes the gateway exit non-zero so that service
    managers revive it. A service stop command delivers that same SIGTERM,
    so the CLI writes this short-lived marker beforehand, allowing the
    targeted process to exit cleanly instead.

    Args:
        target_pid: PID of the process that is going to be stopped.

    Returns:
        True when the marker was written, False when an ``OSError``
        occurred while gathering the data or writing the file.
    """
    try:
        marker = {
            "target_pid": target_pid,
            "target_start_time": _get_process_start_time(target_pid),
            "stopper_pid": os.getpid(),
            "written_at": _utc_now_iso(),
        }
        _write_json_file(_get_planned_stop_marker_path(), marker)
    except OSError:  # PermissionError is a subclass of OSError
        return False
    return True
734782
735- return matches
736783
def consume_planned_stop_marker_for_self() -> bool:
    """Report whether a planned-stop marker targets the current process.

    Delegates to ``_consume_pid_marker_for_self`` with the planned-stop
    marker path, the ``target_pid`` / ``target_start_time`` record keys and
    the planned-stop TTL.
    """
    marker_path = _get_planned_stop_marker_path()
    return _consume_pid_marker_for_self(
        marker_path,
        pid_field="target_pid",
        start_time_field="target_start_time",
        ttl_s=_PLANNED_STOP_MARKER_TTL_S,
    )
737792
738- def clear_takeover_marker () -> None :
739- """Remove the takeover marker unconditionally. Safe to call repeatedly."""
793+
def clear_planned_stop_marker() -> None:
    """Delete the planned-stop marker if present, whatever it contains."""
    try:
        marker_path = _get_planned_stop_marker_path()
        marker_path.unlink(missing_ok=True)
    except OSError:
        # Best-effort cleanup: filesystem errors are deliberately ignored.
        return
744800
0 commit comments