feat: query & cache package_info (#1083)

peteryang1 · Jensen246 · web-flow · commit 19869ea4752b · 2025-07-17T18:35:36.000+08:00
* feat: add package query in draft.py (not yet enabled)

* feat: integrate package query into task_gen and cache runtime environment

- Remove pkg_query modifications from draft components
- Add package declaration requirement in task_gen prompts
- Add optional packages field to CodingSketch model
- Cache runtime_environment in scenario object for loop-wide reuse
- Parse packages from LLM response and generate runtime environment dynamically

* some refinement

* feat: merge default packages with CLI args in package_info.py

* fix: code style

---------

Co-authored-by: Qizheng Li <jenssenlee@163.com>
diff --git a/rdagent/components/coder/data_science/pipeline/__init__.py b/rdagent/components/coder/data_science/pipeline/__init__.py
@@ -38,7 +38,7 @@
 )
 from rdagent.components.coder.data_science.conf import DSCoderCoSTEERSettings
 from rdagent.components.coder.data_science.pipeline.eval import PipelineCoSTEEREvaluator
-from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask
+from rdagent.components.coder.data_science.pipeline.exp import PipelineTask
 from rdagent.components.coder.data_science.share.eval import ModelDumpEvaluator
 from rdagent.core.exception import CoderError
 from rdagent.core.experiment import FBWorkspace
@@ -53,7 +53,7 @@
 class PipelineMultiProcessEvolvingStrategy(MultiProcessEvolvingStrategy):
     def implement_one_task(
         self,
-        target_task: DataLoaderTask,
+        target_task: PipelineTask,
         queried_knowledge: CoSTEERQueriedKnowledge | None = None,
         workspace: FBWorkspace | None = None,
         prev_task_feedback: CoSTEERSingleFeedback | None = None,
@@ -86,6 +86,7 @@ def implement_one_task(
             queried_former_failed_knowledge=queried_former_failed_knowledge[0],
             out_spec=PythonAgentOut.get_spec(),
             runtime_environment=runtime_environment,
+            package_info=target_task.package_info,
             enable_model_dump=DS_RD_SETTING.enable_model_dump,
             enable_debug_mode=DS_RD_SETTING.sample_data_by_LLM,
         )
diff --git a/rdagent/components/coder/data_science/pipeline/exp.py b/rdagent/components/coder/data_science/pipeline/exp.py
@@ -3,5 +3,6 @@
 
 # Because we use isinstance to distinguish between different types of tasks, we need to use sub classes to represent different types of tasks
 class PipelineTask(CoSTEERTask):
-    def __init__(self, name: str = "Pipeline", *args, **kwargs) -> None:
+    def __init__(self, name: str = "Pipeline", package_info: str | None = None, *args, **kwargs) -> None:
         super().__init__(name=name, *args, **kwargs)
+        self.package_info = package_info
diff --git a/rdagent/components/coder/data_science/pipeline/prompts.yaml b/rdagent/components/coder/data_science/pipeline/prompts.yaml
@@ -11,6 +11,14 @@ pipeline_coder:
     
     ## The runtime environment your code will running on
     {{ runtime_environment }}
+
+    {% if package_info is not none %}
+    To help you write runnable code, the user has provided package information listing package names and versions.
+    Pay close attention to the package versions: the code will be executed in an environment with exactly these versions, and their APIs may differ from the latest releases.
+    The package information may also list packages that the environment does not have; avoid using any of those.
+    ## Package Information
+    {{ package_info }}
+    {% endif %}
     
     ## Hyperparameters Specification
     Follow the hyperparameter choices if they are specified in the task description, unless they are unreasonable or incorrect.
diff --git a/rdagent/scenarios/data_science/dev/runner/eval.py b/rdagent/scenarios/data_science/dev/runner/eval.py
@@ -133,7 +133,7 @@ def evaluate(
             scenario=self.scen.get_scenario_all_desc(eda_output=implementation.file_dict.get("EDA.md", None)),
             is_sub_enabled=test_eval.is_sub_enabled(self.scen.competition),
             task_desc=target_task.get_task_information(),
-            runtime_environment=self.scen.get_runtime_environment(),
+            runtime_environment=self.scen.runtime_environment,
         )
         user_prompt = T(".prompts:DSCoSTEER_eval.user").r(
             code=implementation.all_codes,
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/package_info.py b/rdagent/scenarios/data_science/proposal/exp_gen/package_info.py
@@ -0,0 +1,64 @@
+import sys
+from importlib.metadata import distributions
+
+
+def get_installed_packages():
+    return {dist.metadata["Name"].lower(): dist.version for dist in distributions()}
+
+
+def print_filtered_packages(installed_packages, filtered_packages):
+    to_print = []
+    for package_name in filtered_packages:
+        version = installed_packages.get(package_name.lower())
+        if version:
+            to_print.append((package_name, version))
+    if not to_print:
+        print("=== No matching packages found ===")
+    else:
+        print("=== Installed Packages ===")
+        for package_name, version in to_print:
+            # Print package name and version in the format "package_name==version"
+            print(f"{package_name}=={version}")
+
+
+def get_python_packages():
+    # Allow the caller to pass a custom package list via command-line arguments.
+    # Example: `python package_info.py pandas torch scikit-learn`
+    # If no extra arguments are provided we fall back to the original default list
+    # to keep full backward-compatibility.
+    packages_list = [  # default packages
+        "transformers",
+        "accelerate",
+        "torch",
+        "tensorflow",
+        "pandas",
+        "numpy",
+        "scikit-learn",
+        "scipy",
+        "xgboost",
+        "sklearn",
+        "lightgbm",
+        "vtk",
+        "opencv-python",
+        "keras",
+        "matplotlib",
+        "pydicom",
+    ]
+    if len(sys.argv) > 1:
+        packages_list = list(set(packages_list) | set(sys.argv[1:]))
+
+    installed_packages = get_installed_packages()
+
+    print_filtered_packages(installed_packages, packages_list)
+
+    # TODO: Handle missing packages.
+    # Report packages that are requested by the LLM but are not installed.
+    missing_pkgs = [pkg for pkg in packages_list if pkg.lower() not in installed_packages]
+    if missing_pkgs:
+        print("\n=== Missing Packages (Avoid using these packages) ===")
+        for pkg in missing_pkgs:
+            print(pkg)
+
+
+if __name__ == "__main__":
+    get_python_packages()
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/prompts.yaml b/rdagent/scenarios/data_science/proposal/exp_gen/prompts.yaml
@@ -345,4 +345,5 @@ output_format:
     The output should follow JSON format. The schema is as follows:
     {
         "description": "A detailed, step-by-step implementation guide for `main.py` that synthesizes planned modifications and code structure into a comprehensive coding plan. Must be formatted in Markdown with level-3 headings (###) organizing logical sections, key decision points, and implementation steps. Should provide sufficient detail covering implementation flow, algorithms, data handling, and key logic points for unambiguous developer execution.",
+        "packages": ["package1", "package2", ...] # Optional, list of packages needed for the task. If no packages are needed, leave it empty.
     }
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/prompts_v2.yaml b/rdagent/scenarios/data_science/proposal/exp_gen/prompts_v2.yaml
@@ -270,6 +270,11 @@ task_gen:
       - For neural networks, prefer PyTorch or PyTorch based library (over TensorFlow) unless the SOTA or hypothesis dictates otherwise.
       - For neural networks, prefer fine-tuning pre-trained models over training from scratch.
 
+    ## Package Declaration
+    At the end of your design, **you MUST** provide a key `packages` in the final JSON output.  
+    It should be an **array of PyPI package names** (strings) that you expect to `import` in the forthcoming implementation.  
+    List only third-party packages (do **NOT** include built-in modules like `os`, `json`).  
+
     # Guidelines for Sketching the `main.py` Workflow
 
     YOUR TASK IS TO create a conceptual sketch for drafting or updating the `main.py` workflow. This is a plan, not code.
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
@@ -23,6 +23,7 @@
     DSDraftExpGen,  # TODO: DSDraftExpGen should be moved to router in the further
 )
 from rdagent.scenarios.data_science.proposal.exp_gen.idea_pool import DSIdea
+from rdagent.scenarios.data_science.proposal.exp_gen.utils import get_packages
 from rdagent.utils.agent.tpl import T
 from rdagent.utils.repo.diff import generate_diff_from_dict
 from rdagent.utils.workflow import wait_retry
@@ -274,6 +275,11 @@ class CodingSketch(BaseModel):
         "The content **must** be formatted using Markdown, with logical sections, key decision points, or implementation steps clearly organized by level-3 headings (i.e., `###`). "
         "This field should provide sufficient detail for a developer to understand the implementation flow, algorithms, data handling, and key logic points without ambiguity."
     )
+    packages: List[str] = Field(
+        default=None,
+        description="A list of third-party package names (PyPI) that the planned implementation will import. "
+        "Used to query the runtime environment dynamically. Leave `null` or omit if not applicable.",
+    )
 
 
 def draft_exp_in_decomposition(scen: Scenario, trace: DSTrace) -> None | DSDraftExpGen:
@@ -775,6 +781,15 @@ def task_gen(
             name=task_name,
             description=task_desc,
         )
+
+        assert isinstance(task, PipelineTask), f"Task {task_name} is not a PipelineTask, got {type(task)}"
+        # only for llm with response schema.(TODO: support for non-schema llm?)
+        # If the LLM provides a "packages" field (list[str]), compute runtime environment now and cache it for subsequent prompts in later loops.
+        if isinstance(task_dict, dict) and "packages" in task_dict and isinstance(task_dict["packages"], list):
+            pkgs: list[str] = [str(p) for p in task_dict["packages"]]
+            # Persist for later stages
+            task.package_info = get_packages(pkgs)
+
         exp = DSExperiment(pending_tasks_list=[[task]], hypothesis=hypotheses[0])
         if sota_exp is not None:
             exp.experiment_workspace.inject_code_from_file_dict(sota_exp.experiment_workspace)
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/utils.py b/rdagent/scenarios/data_science/proposal/exp_gen/utils.py
@@ -1,13 +1,16 @@
+from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple
 
 from pydantic import BaseModel, Field
 
+from rdagent.components.coder.data_science.conf import get_ds_env
 from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask
 from rdagent.components.coder.data_science.feature.exp import FeatureTask
 from rdagent.components.coder.data_science.model.exp import ModelTask
 from rdagent.components.coder.data_science.pipeline.exp import PipelineTask
 from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask
 from rdagent.components.coder.data_science.workflow.exp import WorkflowTask
+from rdagent.core.experiment import FBWorkspace
 from rdagent.utils.agent.tpl import T
 
 _COMPONENT_META: Dict[str, Dict[str, Any]] = {
@@ -86,3 +89,20 @@ class CodingSketch(BaseModel):
         "The content **must** be formatted using Markdown, with logical sections, key decision points, or implementation steps clearly organized by level-3 headings (i.e., `###`). "
         "This field should provide sufficient detail for a developer to understand the implementation flow, algorithms, data handling, and key logic points without ambiguity."
     )
+
+
+def get_packages(self, pkgs: list[str] | None = None) -> str:
+    # TODO:  add it into base class.  Environment should(i.e. `DSDockerConf`) should be part of the scenario class.
+    """Return runtime environment information."""
+    # Reuse package list cached during Draft stage when available.
+    if pkgs is None and hasattr(self, "required_packages"):
+        pkgs = getattr(self, "required_packages")  # type: ignore[arg-type]
+
+    env = get_ds_env()
+    implementation = FBWorkspace()
+    fname = "package_info.py"
+    implementation.inject_files(**{fname: (Path(__file__).absolute().resolve().parent / "package_info.py").read_text()})
+
+    pkg_args = " ".join(pkgs) if pkgs else ""
+    stdout = implementation.execute(env=env, entry=f"python {fname} {pkg_args}")
+    return stdout
diff --git a/rdagent/scenarios/data_science/scen/__init__.py b/rdagent/scenarios/data_science/scen/__init__.py
@@ -166,6 +166,7 @@ def get_scenario_all_desc(self, eda_output=None) -> str:
 
     def get_runtime_environment(self) -> str:
         # TODO:  add it into base class.  Environment should(i.e. `DSDockerConf`) should be part of the scenario class.
+        """Return runtime environment information."""
         env = get_ds_env()
         implementation = FBWorkspace()
         fname = "runtime_info.py"
diff --git a/rdagent/scenarios/data_science/scen/runtime_info.py b/rdagent/scenarios/data_science/scen/runtime_info.py
@@ -8,17 +8,6 @@ def print_runtime_info():
     print(f"Python {sys.version} on {platform.system()} {platform.release()}")
 
 
-def get_installed_packages():
-    return {dist.metadata["Name"].lower(): dist.version for dist in distributions()}
-
-
-def print_filtered_packages(installed_packages, filtered_packages):
-    for package_name in filtered_packages:
-        version = installed_packages.get(package_name.lower())
-        if version:
-            print(f"{package_name}=={version}")
-
-
 def get_gpu_info():
     try:
         # Option 1: Use PyTorch
@@ -53,24 +42,4 @@ def get_gpu_info():
 
 if __name__ == "__main__":
     print_runtime_info()
-    filtered_packages = [
-        "transformers",
-        "accelerate",
-        "torch",
-        "tensorflow",
-        "pandas",
-        "numpy",
-        "scikit-learn",
-        "scipy",
-        "xgboost",
-        "sklearn",
-        "lightgbm",
-        "vtk",
-        "opencv-python",
-        "keras",
-        "matplotlib",
-        "pydicom",
-    ]
-    installed_packages = get_installed_packages()
-    print_filtered_packages(installed_packages, filtered_packages)
     get_gpu_info()

Original file line number	Diff line number	Diff line change
`@@ -133,7 +133,7 @@ def evaluate(`
`133`	`133`	`scenario=self.scen.get_scenario_all_desc(eda_output=implementation.file_dict.get("EDA.md", None)),`
`134`	`134`	`is_sub_enabled=test_eval.is_sub_enabled(self.scen.competition),`
`135`	`135`	`task_desc=target_task.get_task_information(),`
`136`		`- runtime_environment=self.scen.get_runtime_environment(),`
	`136`	`+ runtime_environment=self.scen.runtime_environment,`
`137`	`137`	`)`
`138`	`138`	`user_prompt = T(".prompts:DSCoSTEER_eval.user").r(`
`139`	`139`	`code=implementation.all_codes,`
Original file line number	Diff line number	Diff line change
`@@ -345,4 +345,5 @@ output_format:`
`345`	`345`	`The output should follow JSON format. The schema is as follows:`
`346`	`346`	`{`
`347`	`347`	"description": "A detailed, step-by-step implementation guide for `main.py` that synthesizes planned modifications and code structure into a comprehensive coding plan. Must be formatted in Markdown with level-3 headings (###) organizing logical sections, key decision points, and implementation steps. Should provide sufficient detail covering implementation flow, algorithms, data handling, and key logic points for unambiguous developer execution.",
	`348`	`+ "packages": ["package1", "package2", ...] # Optional, list of packages needed for the task. If no packages are needed, leave it empty.`
`348`	`349`	`}`