1414
1515import msgspec
1616
17+ from traceml .runtime .launch_context import LaunchContext
1718from traceml .runtime .session import get_session_id
1819from traceml .utils .ast_analysis import analyze_script , build_code_manifest
1920
@@ -200,6 +201,7 @@ def write_run_manifest(
200201 nproc_per_node : int ,
201202 history_enabled : bool ,
202203 status : str ,
204+ launch_cwd : str ,
203205 aggregator_dir : Optional [Path ] = None ,
204206 db_path : Optional [Path ] = None ,
205207 extra : Optional [Dict [str , Any ]] = None ,
@@ -233,6 +235,7 @@ def write_run_manifest(
233235 "tcp_port" : int (tcp_port ),
234236 "nproc_per_node" : int (nproc_per_node ),
235237 "history_enabled" : bool (history_enabled ),
238+ "launch_cwd" : str (Path (launch_cwd ).resolve ()),
236239 },
237240 "paths" : {
238241 "session_root" : str (session_root ),
@@ -389,10 +392,14 @@ def _handler(signum: int, _frame: Any) -> None:
389392 signal .signal (signal .SIGTERM , _handler )
390393
391394
392- def start_aggregator_process (env : Dict [str , str ]) -> subprocess .Popen :
395+ def start_aggregator_process (
396+ env : Dict [str , str ], cwd : str
397+ ) -> subprocess .Popen :
393398 """Start the TraceML aggregator as a separate process.
394399
395- Stdout/stderr are inherited so Rich output and tracebacks remain visible.
400+ The subprocess cwd is set explicitly so all child processes inherit a
401+ deterministic working directory rather than depending on ambient shell
402+ state.
396403 """
397404 aggregator_path = (
398405 Path (__file__ ).parent / "aggregator" / "aggregator_main.py"
@@ -404,18 +411,29 @@ def start_aggregator_process(env: Dict[str, str]) -> subprocess.Popen:
404411
405412 cmd = [sys .executable , str (aggregator_path )]
406413 print ("[TraceML] Launching TraceML aggregator:" , " " .join (cmd ))
407- return subprocess .Popen (cmd , env = env , start_new_session = True )
414+ return subprocess .Popen (
415+ cmd ,
416+ env = env ,
417+ cwd = cwd ,
418+ start_new_session = True ,
419+ )
408420
409421
410422def start_training_process (
411- train_cmd : list [str ], env : Dict [str , str ]
423+ train_cmd : list [str ], env : Dict [str , str ], cwd : str
412424) -> subprocess .Popen :
413425 """Start the training process in a new process group.
414426
415- Stdout/stderr are inherited so user logs and tracebacks remain visible.
427+ The subprocess cwd is set explicitly so worker processes see the same
428+ working directory the user launched TraceML from.
416429 """
417430 print ("[TraceML] Launching TraceML executor:" , " " .join (train_cmd ))
418- return subprocess .Popen (train_cmd , env = env , start_new_session = True )
431+ return subprocess .Popen (
432+ train_cmd ,
433+ env = env ,
434+ cwd = cwd ,
435+ start_new_session = True ,
436+ )
419437
420438
421439def launch_process (script_path : str , args : argparse .Namespace ) -> None :
@@ -430,6 +448,7 @@ def launch_process(script_path: str, args: argparse.Namespace) -> None:
430448 5. Keep training as the primary process; aggregator may fail open
431449 6. On shutdown, terminate child process groups and update the manifest
432450 """
451+
433452 env = os .environ .copy ()
434453 env ["PYTHONUNBUFFERED" ] = "1"
435454
@@ -452,6 +471,10 @@ def launch_process(script_path: str, args: argparse.Namespace) -> None:
452471 env ["TRACEML_NPROC_PER_NODE" ] = str (args .nproc_per_node )
453472 env ["TRACEML_HISTORY_ENABLED" ] = "0" if args .no_history else "1"
454473
474+ launch_context = LaunchContext .capture ()
475+ env .update (launch_context .to_env ())
476+ execution_cwd = launch_context .launch_cwd
477+
455478 session_id = env ["TRACEML_SESSION_ID" ]
456479 session_root = Path (args .logs_dir ).resolve () / session_id
457480 aggregator_dir = session_root / "aggregator"
@@ -474,6 +497,7 @@ def launch_process(script_path: str, args: argparse.Namespace) -> None:
474497 nproc_per_node = args .nproc_per_node ,
475498 history_enabled = not args .no_history ,
476499 status = "starting" ,
500+ launch_cwd = execution_cwd ,
477501 aggregator_dir = aggregator_dir ,
478502 db_path = db_path ,
479503 extra = (
@@ -495,7 +519,11 @@ def launch_process(script_path: str, args: argparse.Namespace) -> None:
495519 str (script_path ),
496520 * script_args ,
497521 ]
498- train_proc = start_training_process (train_cmd = train_cmd , env = env )
522+ train_proc = start_training_process (
523+ train_cmd = train_cmd ,
524+ env = env ,
525+ cwd = execution_cwd ,
526+ )
499527 install_shutdown_handlers (
500528 lambda : (train_proc , None ), manifest_path = manifest_path
501529 )
@@ -530,7 +558,7 @@ def launch_process(script_path: str, args: argparse.Namespace) -> None:
530558 f"(ui={ args .mode } , profile={ env ['TRACEML_PROFILE' ]} )"
531559 )
532560 try :
533- agg_proc = start_aggregator_process (env = env )
561+ agg_proc = start_aggregator_process (env = env , cwd = execution_cwd )
534562 except FileNotFoundError as exc :
535563 print (f"[TraceML] ERROR: { exc } " , file = sys .stderr )
536564 update_run_manifest (manifest_path , status = "failed" )
@@ -558,7 +586,11 @@ def launch_process(script_path: str, args: argparse.Namespace) -> None:
558586 print ("[TraceML] Aggregator ready." )
559587 update_run_manifest (manifest_path , status = "running" )
560588
561- train_proc = start_training_process (train_cmd = train_cmd , env = env )
589+ train_proc = start_training_process (
590+ train_cmd = train_cmd ,
591+ env = env ,
592+ cwd = execution_cwd ,
593+ )
562594
563595 while True :
564596 train_rc = train_proc .poll ()
0 commit comments