Skip to content

Commit fae3def

Browse files
authored
feat: add ws CLI and support optional timeout/cache (#1066)
* feat: add ws CLI and support optional timeout/cache * lint * fix bugs * convert extra_volumes to dict for multiprocess * lint
1 parent a90e598 commit fae3def

4 files changed

Lines changed: 81 additions & 13 deletions

File tree

rdagent/app/utils/ws.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
from typing import Optional
2+
3+
import typer
4+
5+
from rdagent.app.data_science.conf import DS_RD_SETTING
6+
from rdagent.components.coder.data_science.conf import get_ds_env
7+
from rdagent.utils.agent.tpl import T
8+
9+
app = typer.Typer(help="Run data-science environment commands.")
10+
11+
12+
@app.command()
def run(competition: str, cmd: str, local_path: str = "./") -> None:
    """
    Launch the data-science environment for a specific competition and run the
    provided command.

    Example:
    1) start the container:
    dotenv run -- python -m rdagent.app.utils.ws nomad2018-predict-transparent-conductors "sleep 3600" --local-path your_workspace

    2) then run the following command to enter the latest container:
    - docker exec -it `docker ps --filter 'status=running' -l --format '{{.Names}}'` bash
    Or you can attach to the container by specifying the container name (find it in the run info)
    - docker exec -it sweet_robinson bash

    Arguments:
        competition: The competition slug/folder name.
        cmd: The shell command or script entry point to execute inside
             the environment.
        local_path: Forwarded unchanged as `local_path` to the environment's
             run call (the example above passes the workspace folder).
             Defaults to the current directory.
    """
    data_root = DS_RD_SETTING.local_data_path

    # With `sample_data_by_LLM` enabled the competition folder directly under
    # the data root is mounted; otherwise the one under the `sample/` subfolder.
    if DS_RD_SETTING.sample_data_by_LLM:
        data_path = f"{data_root}/{competition}"
    else:
        data_path = f"{data_root}/sample/{competition}"

    # Map the host data folder onto the input path rendered from the shared
    # data-science scenario template.
    target_path = T("scenarios.data_science.share:scen.input_path").r()
    extra_volumes = {data_path: target_path}

    # Interactive use: no running-time limit and caching always disabled.
    env = get_ds_env(
        extra_volumes=extra_volumes,
        running_timeout_period=None,
        enable_cache=False,
    )

    env.run(entry=cmd, local_path=local_path)
48+
49+
50+
if __name__ == "__main__": # pragma: no cover
51+
app()

rdagent/components/coder/data_science/conf.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,8 @@ class Config:
2727
def get_ds_env(
2828
conf_type: Literal["kaggle", "mlebench"] = "kaggle",
2929
extra_volumes: dict = {},
30-
running_timeout_period: int = DS_RD_SETTING.debug_timeout,
30+
running_timeout_period: int | None = DS_RD_SETTING.debug_timeout,
31+
enable_cache: bool | None = None,
3132
) -> Env:
3233
"""
3334
Retrieve the appropriate environment configuration based on the env_type setting.
@@ -52,8 +53,10 @@ def get_ds_env(
5253
)
5354
else:
5455
raise ValueError(f"Unknown env type: {conf.env_type}")
55-
env.conf.extra_volumes = extra_volumes
56+
env.conf.extra_volumes = extra_volumes.copy()
5657
env.conf.running_timeout_period = running_timeout_period
58+
if enable_cache is not None:
59+
env.conf.enable_cache = enable_cache
5760
env.prepare()
5861
return env
5962

rdagent/utils/env.py

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -117,15 +117,19 @@ def pull_image_with_progress(image: str) -> None:
117117

118118

119119
class EnvConf(ExtendedBaseSettings):
120-
# TODO: add prefix ....
121120
default_entry: str
122121
extra_volumes: dict = {}
123-
running_timeout_period: int = 3600 # 10 minutes
122+
running_timeout_period: int | None = 3600 # 1 hour
124123
# helper settings to support transparent;
125124
enable_cache: bool = True
126125
retry_count: int = 5 # retry count for the docker run
127126
retry_wait_seconds: int = 10 # retry wait seconds for the docker run
128127

128+
model_config = SettingsConfigDict(
129+
# TODO: add prefix ....
130+
env_parse_none_str="None", # this is the key to accept `RUNNING_TIMEOUT_PERIOD=None`
131+
)
132+
129133

130134
ASpecificEnvConf = TypeVar("ASpecificEnvConf", bound=EnvConf)
131135

@@ -225,7 +229,7 @@ def __run_with_retry(
225229
)
226230
end = time.time()
227231
logger.info(f"Running time: {end - start} seconds")
228-
if end - start + 1 >= self.conf.running_timeout_period:
232+
if self.conf.running_timeout_period is not None and end - start + 1 >= self.conf.running_timeout_period:
229233
logger.warning(
230234
f"The running time exceeds {self.conf.running_timeout_period} seconds, so the process is killed."
231235
)
@@ -299,9 +303,13 @@ def _get_path_stem(path: str) -> str | None:
299303
chmod_cmd += ")"
300304
return chmod_cmd
301305

306+
if self.conf.running_timeout_period is None:
307+
timeout_cmd = entry
308+
else:
309+
timeout_cmd = f"timeout --kill-after=10 {self.conf.running_timeout_period} {entry}"
302310
entry_add_timeout = (
303-
f"/bin/sh -c 'timeout --kill-after=10 {self.conf.running_timeout_period} {entry}; "
304-
+ "entry_exit_code=$?; "
311+
f"/bin/sh -c '" # start of the sh command
312+
+ f"{timeout_cmd}; entry_exit_code=$?; "
305313
+ (
306314
f"{_get_chmod_cmd(self.conf.mount_path)}; "
307315
# We don't have to change the permission of the cache and input folder to remove it
@@ -310,7 +318,8 @@ def _get_path_stem(path: str) -> str | None:
310318
if isinstance(self.conf, DockerConf)
311319
else ""
312320
)
313-
+ "exit $entry_exit_code'"
321+
+ "exit $entry_exit_code"
322+
+ "'" # end of the sh command
314323
)
315324

316325
if self.conf.enable_cache:
@@ -635,7 +644,7 @@ class DockerConf(EnvConf):
635644
mem_limit: str | None = "48g" # Add memory limit attribute
636645
cpu_count: int | None = None # Add CPU limit attribute
637646

638-
running_timeout_period: int = 3600 # 1 hour
647+
running_timeout_period: int | None = 3600 # 1 hour
639648

640649
enable_cache: bool = True # enable the cache mechanism
641650

@@ -678,7 +687,10 @@ def prepare(self) -> None:
678687

679688

680689
class QlibDockerConf(DockerConf):
681-
model_config = SettingsConfigDict(env_prefix="QLIB_DOCKER_")
690+
model_config = SettingsConfigDict(
691+
env_prefix="QLIB_DOCKER_",
692+
env_parse_none_str="None", # this is the key to accept `RUNNING_TIMEOUT_PERIOD=None`
693+
)
682694

683695
build_from_dockerfile: bool = True
684696
dockerfile_folder_path: Path = Path(__file__).parent.parent / "scenarios" / "qlib" / "docker"
@@ -707,7 +719,7 @@ class KGDockerConf(DockerConf):
707719
# Path("git_ignore_folder/data").resolve(): "/root/.data/"
708720
# }
709721

710-
running_timeout_period: int = 600
722+
running_timeout_period: int | None = 600
711723
mem_limit: str | None = (
712724
"48g" # Add memory limit attribute # new-york-city-taxi-fare-prediction may need more memory
713725
)
@@ -722,7 +734,7 @@ class DSDockerConf(DockerConf):
722734
mount_path: str = "/kaggle/workspace"
723735
default_entry: str = "python main.py"
724736

725-
running_timeout_period: int = 600
737+
running_timeout_period: int | None = 600
726738
mem_limit: str | None = (
727739
"48g" # Add memory limit attribute # new-york-city-taxi-fare-prediction may need more memory
728740
)

test/utils/test_conf.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,13 @@ def test_conf(self):
99
from rdagent.utils.env import EnvConf, QlibDockerConf
1010

1111
os.environ["MEM_LIMIT"] = "200g"
12+
os.environ["RUNNING_TIMEOUT_PERIOD"] = "None"
1213
assert QlibDockerConf().mem_limit == "200g" # base class will affect subclasses
1314
os.environ["QLIB_DOCKER_MEM_LIMIT"] = "300g"
1415
assert QlibDockerConf().mem_limit == "300g" # more accurate subclass will override the base class
16+
assert QlibDockerConf().running_timeout_period is None
1517

16-
os.environ["default_entry"] = "which python"
18+
os.environ["DEFAULT_ENTRY"] = "which python"
1719
os.environ["ENABLE_CACHE"] = "False"
1820

1921
assert EnvConf().enable_cache is False

0 commit comments

Comments
 (0)