Skip to content

Commit 05cf094

Browse files
Copilotpeteryang1peteryangms
authored
fix: docker container cleanup to prevent accumulation and system slowdown (#975)
* Initial plan for issue * Fix Docker container cleanup issue by using try-finally block Co-authored-by: peteryang1 <25981102+peteryang1@users.noreply.github.com> * Fix additional Docker container leaks in health_check and GPU test functions Co-authored-by: peteryang1 <25981102+peteryang1@users.noreply.github.com> * Remove temporary test files and finalize Docker container cleanup fix Co-authored-by: peteryang1 <25981102+peteryang1@users.noreply.github.com> * Refactor container cleanup code to reduce duplication as requested in review feedback Co-authored-by: peteryang1 <25981102+peteryang1@users.noreply.github.com> * Refactor container cleanup to use shared function and always stop before remove Co-authored-by: peteryang1 <25981102+peteryang1@users.noreply.github.com> * fix CI * Fix mypy type checking errors for Docker container cleanup Co-authored-by: peteryang1 <25981102+peteryang1@users.noreply.github.com> * fix CI * Remove unnecessary _cleanup_container wrapper method in DockerEnv class Co-authored-by: peteryang1 <25981102+peteryang1@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: peteryang1 <25981102+peteryang1@users.noreply.github.com> Co-authored-by: Xu Yang <xuyang1@microsoft.com>
1 parent 1ba7548 commit 05cf094

5 files changed

Lines changed: 64 additions & 21 deletions

File tree

rdagent/app/utils/health_check.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,22 +3,25 @@
33
import docker
44

55
from rdagent.log import rdagent_logger as logger
6+
from rdagent.utils.env import cleanup_container
67

78

89
def check_docker() -> None:
10+
container = None
911
try:
1012
client = docker.from_env()
1113
client.images.pull("hello-world")
1214
container = client.containers.run("hello-world", detach=True)
1315
logs = container.logs().decode("utf-8")
1416
print(logs)
15-
container.remove()
1617
logger.info(f"The docker status is normal")
1718
except docker.errors.DockerException as e:
1819
logger.error(f"An error occurred: {e}")
1920
logger.warning(
2021
f"Docker status is exception, please check the docker configuration or reinstall it. Refs: https://docs.docker.com/engine/install/ubuntu/."
2122
)
23+
finally:
24+
cleanup_container(container, "health check")
2225

2326

2427
def is_port_in_use(port):

rdagent/core/conf.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,8 +87,7 @@ def get_max_parallel(self) -> int:
8787
"""Based on the setting of semaphore, return the maximum number of parallel loops"""
8888
if isinstance(self.step_semaphore, int):
8989
return self.step_semaphore
90-
else:
91-
return max(self.step_semaphore.values())
90+
return max(self.step_semaphore.values())
9291

9392
# NOTE: for debug
9493
# the following function only serves as debugging and is necessary in main logic.

rdagent/core/proposal.py

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44

55
import asyncio
66
from abc import ABC, abstractmethod
7-
from typing import TYPE_CHECKING, Generic, List, Tuple, TypeVar
7+
from typing import TYPE_CHECKING, Generic, TypeVar
88

99
from rdagent.core.conf import RD_AGENT_SETTINGS
1010
from rdagent.core.evaluation import Feedback
@@ -108,7 +108,7 @@ def __str__(self) -> str:
108108

109109
class Trace(Generic[ASpecificScen, ASpecificKB]):
110110
NodeType = tuple[Experiment, ExperimentFeedback] # Define NodeType as a new type representing the tuple
111-
NEW_ROOT: Tuple = ()
111+
NEW_ROOT: tuple = ()
112112

113113
def __init__(self, scen: ASpecificScen, knowledge_base: ASpecificKB | None = None) -> None:
114114
self.scen: ASpecificScen = scen
@@ -163,35 +163,33 @@ def get_parent_exps(
163163

164164
return [self.hist[i] for i in self.get_parents(selection[0])]
165165

166-
def exp2idx(self, exp: Experiment | List[Experiment]) -> int | List[int] | None:
166+
def exp2idx(self, exp: Experiment | list[Experiment]) -> int | list[int] | None:
167167
if isinstance(exp, list):
168-
exps: List[Experiment] = exp
168+
exps: list[Experiment] = exp
169169

170170
# keep the order
171171
exp_to_index: dict[Experiment, int] = {_exp: i for i, (_exp, _) in enumerate(self.hist)}
172172
return [exp_to_index[_exp] for _exp in exps]
173-
else:
174-
for i, (_exp, _) in enumerate(self.hist):
175-
if _exp == exp:
176-
return i
173+
for i, (_exp, _) in enumerate(self.hist):
174+
if _exp == exp:
175+
return i
177176
return None
178177

179-
def idx2exp(self, idx: int | List[int]) -> Experiment | List[Experiment]:
178+
def idx2exp(self, idx: int | list[int]) -> Experiment | list[Experiment]:
180179
if isinstance(idx, list):
181-
idxs: List[int] = idx
180+
idxs: list[int] = idx
182181
return [self.hist[_idx][0] for _idx in idxs]
183-
else:
184-
return self.hist[idx][0]
182+
return self.hist[idx][0]
185183

186184
def is_parent(self, parent_idx: int, child_idx: int) -> bool:
187185
ancestors = self.get_parents(child_idx)
188186
return parent_idx in ancestors
189187

190-
def get_parents(self, child_idx: int) -> List[int]:
188+
def get_parents(self, child_idx: int) -> list[int]:
191189
if self.is_selection_new_tree((child_idx,)):
192190
return []
193191

194-
ancestors: List[int] = []
192+
ancestors: list[int] = []
195193
curr = child_idx
196194
while True:
197195
ancestors.insert(0, curr)

rdagent/utils/env.py

Lines changed: 34 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,29 @@
4444
from rdagent.utils.workflow import wait_retry
4545

4646

47+
def cleanup_container(container: docker.models.containers.Container | None, context: str = "") -> None: # type: ignore[no-any-unimported]
48+
"""
49+
Shared helper function to clean up a Docker container.
50+
Always stops the container before removing it.
51+
52+
Parameters
53+
----------
54+
container : docker container object or None
55+
The container to clean up, or None if no container to clean up
56+
context : str
57+
Additional context for logging (e.g., "health check", "GPU test")
58+
"""
59+
if container is not None:
60+
try:
61+
# Always stop first - stop() doesn't raise error if already stopped
62+
container.stop()
63+
container.remove()
64+
except Exception as cleanup_error:
65+
# Log cleanup error but don't mask the original exception
66+
context_str = f" {context}" if context else ""
67+
logger.warning(f"Failed to cleanup{context_str} container {container.id}: {cleanup_error}")
68+
69+
4770
# Normalize all bind paths in volumes to absolute paths using the workspace (working_dir).
4871
def normalize_volumes(vols: dict[str, str | dict[str, str]], working_dir: str) -> dict:
4972
abs_vols: dict[str, str | dict[str, str]] = {}
@@ -785,12 +808,17 @@ def get_image(image_name: str) -> None:
785808

786809
@wait_retry(5, 10)
787810
def _f() -> dict:
811+
container = None
788812
try:
789813
get_image(self.conf.image)
790-
client.containers.run(self.conf.image, "nvidia-smi", **gpu_kwargs)
814+
container = client.containers.run(self.conf.image, "nvidia-smi", detach=True, **gpu_kwargs)
815+
# Wait for container to complete
816+
container.wait()
791817
logger.info("GPU Devices are available.")
792818
except docker.errors.APIError:
793819
return {}
820+
finally:
821+
cleanup_container(container, context="GPU test")
794822
return gpu_kwargs
795823

796824
return _f()
@@ -835,9 +863,10 @@ def _run_ret_code(
835863
volumes = normalize_volumes(cast(dict[str, str | dict[str, str]], volumes), self.conf.mount_path)
836864

837865
log_output = ""
866+
container: docker.models.containers.Container | None = None # type: ignore[no-any-unimported]
838867

839868
try:
840-
container: docker.models.containers.Container = client.containers.run( # type: ignore[no-any-unimported]
869+
container = client.containers.run(
841870
image=self.conf.image,
842871
command=entry,
843872
volumes=volumes,
@@ -851,6 +880,7 @@ def _run_ret_code(
851880
cpu_count=self.conf.cpu_count, # Set CPU limit
852881
**self._gpu_kwargs(client),
853882
)
883+
assert container is not None # Ensure container was created successfully
854884
logs = container.logs(stream=True)
855885
print(Rule("[bold green]Docker Logs Begin[/bold green]", style="dark_orange"))
856886
table = Table(title="Run Info", show_header=False)
@@ -869,8 +899,6 @@ def _run_ret_code(
869899
Console().print(decoded_log, markup=False)
870900
log_output += decoded_log + "\n"
871901
exit_status = container.wait()["StatusCode"]
872-
container.stop()
873-
container.remove()
874902
print(Rule("[bold green]Docker Logs End[/bold green]", style="dark_orange"))
875903
return log_output, exit_status
876904
except docker.errors.ContainerError as e:
@@ -879,6 +907,8 @@ def _run_ret_code(
879907
raise RuntimeError("Docker image not found.")
880908
except docker.errors.APIError as e:
881909
raise RuntimeError(f"Error while running the container: {e}")
910+
finally:
911+
cleanup_container(container)
882912

883913

884914
class QTDockerEnv(DockerEnv):

test/utils/test_env.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
LocalEnv,
1414
QlibDockerConf,
1515
QTDockerEnv,
16+
cleanup_container,
1617
)
1718

1819
DIRNAME = Path(__file__).absolute().resolve().parent
@@ -142,6 +143,18 @@ def test_docker_mem(self):
142143
# docker run --memory=10m -it --rm local_qlib:latest python -c 'import numpy as np; print(123); size_mb = 1; size = size_mb * 1024 * 1024 // 8; array = np.random.randn(size).astype(np.float64); array[0], array[-1] = 1.0, 1.0; print(321)'
143144
# docker run --memory=10g -it --rm local_qlib:latest python -c 'import numpy as np; print(123); size_mb = 1; size = size_mb * 1024 * 1024 // 8; array = np.random.randn(size).astype(np.float64); array[0], array[-1] = 1.0, 1.0; print(321)'
144145

146+
def test_cleanup_container_import(self):
147+
"""Test that cleanup_container function can be imported and has correct interface."""
148+
# Test that the function exists and can be called
149+
self.assertTrue(callable(cleanup_container))
150+
151+
# Test with None (should not raise an exception)
152+
cleanup_container(None, "test context")
153+
154+
# The function should accept positional and keyword arguments
155+
cleanup_container(None)
156+
cleanup_container(None, context="test")
157+
145158

146159
if __name__ == "__main__":
147160
unittest.main()

0 commit comments

Comments
 (0)