4444from rdagent .utils .workflow import wait_retry
4545
4646
def cleanup_container(container: docker.models.containers.Container | None, context: str = "") -> None:  # type: ignore[no-any-unimported]
    """
    Stop and remove a Docker container on a best-effort basis.

    Stopping and removing are attempted independently, so a failure to stop
    the container does not prevent the removal attempt. Errors raised during
    cleanup are logged as warnings and never propagated, which keeps this
    helper safe to call from a ``finally`` clause without masking the
    exception that is already in flight.

    Parameters
    ----------
    container : docker container object or None
        The container to clean up, or None if there is nothing to clean up
        (for example when container creation itself failed).
    context : str
        Additional context for logging (e.g., "health check", "GPU test")
    """
    if container is None:
        return

    context_str = f" {context}" if context else ""

    try:
        # Stopping an already-exited container is not an error, so this is
        # safe to call unconditionally.
        container.stop()
    except Exception as stop_error:
        # Keep going: the forced removal below can still reclaim the container.
        logger.warning(f"Failed to stop{context_str} container {container.id}: {stop_error}")

    try:
        # force=True removes the container even if the stop above failed and
        # it is still running; otherwise the container would be leaked.
        container.remove(force=True)
    except Exception as cleanup_error:
        # Log cleanup error but don't mask the original exception
        logger.warning(f"Failed to cleanup{context_str} container {container.id}: {cleanup_error}")
68+
69+
4770# Normalize all bind paths in volumes to absolute paths using the workspace (working_dir).
4871def normalize_volumes (vols : dict [str , str | dict [str , str ]], working_dir : str ) -> dict :
4972 abs_vols : dict [str , str | dict [str , str ]] = {}
@@ -785,12 +808,17 @@ def get_image(image_name: str) -> None:
785808
@wait_retry(5, 10)
def _f() -> dict:
    """
    Probe GPU availability by running ``nvidia-smi`` in the configured image.

    Returns
    -------
    dict
        ``gpu_kwargs`` when the probe container ran and exited with status 0,
        or an empty dict when the Docker API rejected the request.

    Raises
    ------
    docker.errors.ContainerError
        If ``nvidia-smi`` exits with a non-zero status. This mirrors what a
        non-detached ``containers.run`` raises in the same situation.
    """
    container = None
    try:
        get_image(self.conf.image)
        # Run detached so we keep a handle to the container and can always
        # clean it up in the ``finally`` clause below.
        container = client.containers.run(self.conf.image, "nvidia-smi", detach=True, **gpu_kwargs)
        # A detached run does not raise on a non-zero exit code, so the exit
        # status has to be checked explicitly once the container completes.
        exit_status = container.wait()["StatusCode"]
        if exit_status != 0:
            raise docker.errors.ContainerError(
                container,
                exit_status,
                "nvidia-smi",
                self.conf.image,
                container.logs(stdout=False, stderr=True),
            )
        logger.info("GPU Devices are available.")
    except docker.errors.APIError:
        return {}
    finally:
        cleanup_container(container, context="GPU test")
    return gpu_kwargs
795823
796824 return _f ()
@@ -835,9 +863,10 @@ def _run_ret_code(
835863 volumes = normalize_volumes (cast (dict [str , str | dict [str , str ]], volumes ), self .conf .mount_path )
836864
837865 log_output = ""
866+ container : docker .models .containers .Container | None = None # type: ignore[no-any-unimported]
838867
839868 try :
840- container : docker . models . containers . Container = client .containers .run ( # type: ignore[no-any-unimported]
869+ container = client .containers .run (
841870 image = self .conf .image ,
842871 command = entry ,
843872 volumes = volumes ,
@@ -851,6 +880,7 @@ def _run_ret_code(
851880 cpu_count = self .conf .cpu_count , # Set CPU limit
852881 ** self ._gpu_kwargs (client ),
853882 )
883+ assert container is not None # Ensure container was created successfully
854884 logs = container .logs (stream = True )
855885 print (Rule ("[bold green]Docker Logs Begin[/bold green]" , style = "dark_orange" ))
856886 table = Table (title = "Run Info" , show_header = False )
@@ -869,8 +899,6 @@ def _run_ret_code(
869899 Console ().print (decoded_log , markup = False )
870900 log_output += decoded_log + "\n "
871901 exit_status = container .wait ()["StatusCode" ]
872- container .stop ()
873- container .remove ()
874902 print (Rule ("[bold green]Docker Logs End[/bold green]" , style = "dark_orange" ))
875903 return log_output , exit_status
876904 except docker .errors .ContainerError as e :
@@ -879,6 +907,8 @@ def _run_ret_code(
879907 raise RuntimeError ("Docker image not found." )
880908 except docker .errors .APIError as e :
881909 raise RuntimeError (f"Error while running the container: { e } " )
910+ finally :
911+ cleanup_container (container )
882912
883913
884914class QTDockerEnv (DockerEnv ):
0 commit comments