Skip to content

Commit f46486d

Browse files
committed
always terminate agent processes
1 parent 23aaae6 commit f46486d

File tree

1 file changed

+15
-14
lines changed

1 file changed

+15
-14
lines changed

src/torchrunx/launcher.py

Lines changed: 15 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -173,7 +173,7 @@ class Launcher:
173173
env_file: str | os.PathLike | None = None
174174
timeout: int = 600
175175

176-
def run(
176+
def run( # noqa: C901, PLR0912
177177
self,
178178
func: Callable,
179179
func_args: tuple[Any] | None = None,
@@ -206,6 +206,7 @@ def run(
206206
log_receiver = None
207207
log_process = None
208208
launcher_agent_group = None
209+
agent_payloads = None
209210

210211
try:
211212
# start logging server
@@ -287,25 +288,25 @@ def run(
287288

288289
if all(s.state == "done" for s in agent_statuses):
289290
break
290-
291-
except:
292-
# cleanup: SIGTERM all agents
293-
for agent_payload, agent_hostname in zip(agent_payloads, hostnames):
294-
execute_command(
295-
command=f"kill {agent_payload.process_id}",
296-
hostname=agent_hostname,
297-
ssh_config_file=self.ssh_config_file,
298-
)
299-
raise
300291
finally:
301292
if log_receiver is not None:
302293
log_receiver.shutdown()
303-
log_receiver.server_close()
304-
if log_process is not None:
305-
log_process.kill()
294+
if log_process is not None:
295+
log_receiver.server_close()
296+
log_process.kill()
297+
306298
if launcher_agent_group is not None:
307299
launcher_agent_group.shutdown()
308300

301+
# cleanup: SIGTERM all agents
302+
if agent_payloads is not None:
303+
for agent_payload, agent_hostname in zip(agent_payloads, hostnames):
304+
execute_command(
305+
command=f"kill {agent_payload.process_id}",
306+
hostname=agent_hostname,
307+
ssh_config_file=self.ssh_config_file,
308+
)
309+
309310
return {
310311
hostname: agent_status.return_values
311312
for hostname, agent_status in zip(hostnames, agent_statuses)

0 commit comments

Comments
 (0)