Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -82,11 +82,11 @@ constraints: deepclean

# Check lint with black.
black:
$(PIPRUN) python -m black --check --diff . --extend-exclude test/scripts --extend-exclude git_ignore_folder -l 120
$(PIPRUN) python -m black --check --diff . --extend-exclude "(test/scripts|test/notebook/testfiles|git_ignore_folder)" -l 120

# Check lint with isort.
isort:
$(PIPRUN) python -m isort --check . -s git_ignore_folder -s test/scripts
$(PIPRUN) python -m isort --check . -s git_ignore_folder -s test/scripts -s test/notebook/testfiles

# Check lint with mypy.
# First deal with the core folder, and then gradually increase the scope of detection,
Expand Down Expand Up @@ -119,11 +119,11 @@ pre-commit:

# Auto lint with black.
auto-black:
$(PIPRUN) python -m black . --extend-exclude test/scripts --extend-exclude git_ignore_folder --extend-exclude .venv -l 120
$(PIPRUN) python -m black . --extend-exclude "(test/scripts|test/notebook/testfiles|git_ignore_folder|.venv)" -l 120

# Auto lint with isort.
auto-isort:
$(PIPRUN) python -m isort . -s git_ignore_folder -s test/scripts -s .venv
$(PIPRUN) python -m isort . -s git_ignore_folder -s test/scripts -s test/notebook/testfiles -s .venv

# Auto lint with toml-sort.
auto-toml-sort:
Expand Down
3 changes: 3 additions & 0 deletions rdagent/app/data_science/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):

### specific feature

### notebook integration
enable_notebook_conversion: bool = False

#### enable specification
spec_enabled: bool = True

Expand Down
5 changes: 4 additions & 1 deletion rdagent/components/coder/data_science/pipeline/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,10 @@ def implement_one_task(
package_info=target_task.package_info,
enable_model_dump=DS_RD_SETTING.enable_model_dump,
enable_debug_mode=DS_RD_SETTING.sample_data_by_LLM,
spec=T("scenarios.data_science.share:component_spec.Pipeline").r(metric_name=self.scen.metric_name),
spec=T("scenarios.data_science.share:component_spec.Pipeline").r(
metric_name=self.scen.metric_name,
enable_notebook_conversion=DS_RD_SETTING.enable_notebook_conversion,
),
)
user_prompt = T(".prompts:pipeline_coder.user").r(
competition_info=competition_info,
Expand Down
27 changes: 26 additions & 1 deletion rdagent/components/coder/data_science/pipeline/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
CoSTEERQueriedKnowledgeV2,
)
from rdagent.components.coder.data_science.conf import get_clear_ws_cmd, get_ds_env
from rdagent.components.coder.data_science.share.notebook import NotebookConverter
from rdagent.components.coder.data_science.utils import remove_eda_part
from rdagent.core.experiment import FBWorkspace, Task
from rdagent.scenarios.data_science.test_eval import get_test_eval
Expand Down Expand Up @@ -70,6 +71,24 @@ def evaluate(
env=env, entry=f"strace -e trace=file -f -o trace.log python -m coverage run main.py"
)

nb_conversion_ret_code = 0
nb_conversion_check_text = ""
if DS_RD_SETTING.enable_notebook_conversion:
notebook_converter = NotebookConverter()
code = implementation.file_dict["main.py"]
error_msg = notebook_converter.validate_code_format(code)
if error_msg is not None:
nb_conversion_check_text = error_msg
nb_conversion_ret_code = 1
else:
notebook_converter.convert(
task=target_task,
code=code,
stdout=result.stdout,
outfile=implementation.workspace_path / "main.ipynb",
use_debug_flag=DS_RD_SETTING.sample_data_by_LLM,
)

sample_submission_check = True
test_eval = get_test_eval()
if (sample_submission_file_name := test_eval.get_sample_submission_name(self.scen.competition)) is not None:
Expand Down Expand Up @@ -173,7 +192,10 @@ def evaluate(
scenario=self.scen.get_scenario_all_desc(eda_output=eda_output),
task_desc=target_task.get_task_information(),
stdout=stdout.strip(),
spec=T("scenarios.data_science.share:component_spec.Pipeline").r(metric_name=self.scen.metric_name),
spec=T("scenarios.data_science.share:component_spec.Pipeline").r(
metric_name=self.scen.metric_name,
enable_notebook_conversion=DS_RD_SETTING.enable_notebook_conversion,
),
code=implementation.file_dict["main.py"],
)
wfb = build_cls_from_json_with_retry(
Expand All @@ -193,4 +215,7 @@ def evaluate(
wfb.return_checking += (
"\nSample submission file check failed. Code should not open the sample submission file."
)
if nb_conversion_ret_code != 0 and wfb.final_decision is True:
wfb.final_decision = False
wfb.return_checking += "\n" + nb_conversion_check_text
return wfb
135 changes: 135 additions & 0 deletions rdagent/components/coder/data_science/share/notebook.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
"""
Handles conversion from a Python file to a Jupyter notebook.
"""

import argparse
from pathlib import Path
from typing import Optional

import nbformat

from rdagent.components.coder.data_science.share.util import (
    extract_first_section_name_from_code,
    extract_function_body,
    split_code_and_output_into_sections,
)
from rdagent.core.experiment import Task
from rdagent.log import rdagent_logger as logger
from rdagent.oai.llm_utils import APIBackend
from rdagent.utils.agent.ret import MarkdownAgentOut
from rdagent.utils.agent.tpl import T


class NotebookConverter:
    """
    Builder responsible for writing a Jupyter notebook for a workspace.

    The input script is expected to define a ``main`` function whose body is
    divided by ``print("Section: <name>")`` statements; each such section is
    turned into a markdown cell (name + comments) followed by a code cell
    carrying the matching slice of captured stdout as stream output.
    """

    def validate_code_format(self, code: str) -> str | None:
        """
        Check that ``code`` has the structure required for conversion.

        :param code: Full Python source of the script to be converted.
        :return: None if the code format is valid, otherwise an error message
            describing what is missing.
        """
        # The conversion relies on a `main` function body split into sections.
        main_function_body = extract_function_body(code, "main")
        if not main_function_body:
            return "[Error] No main function found in the code. Please ensure that the main function is defined and contains the necessary print statements to divide sections."

        found_section_name = extract_first_section_name_from_code(main_function_body)
        if not found_section_name:
            return "[Error] No sections found in the code. Expected to see 'print(\"Section: <section name>\")' as section dividers. Also make sure that they are actually run and not just comments."

        return None

    def convert(
        self,
        task: Optional[Task],
        code: str,
        stdout: str,
        outfile: Optional[str | Path] = None,
        use_debug_flag: bool = False,
    ) -> str:
        """
        Convert ``code`` and its captured ``stdout`` into a Jupyter notebook.

        :param task: When given, an LLM-generated markdown intro cell
            summarizing the task/solution is prepended to the notebook.
        :param code: Python source of the script to convert.
        :param stdout: Captured standard output of the script run, interleaved
            into the matching code cells.
        :param outfile: Optional path (str or Path — the visible caller in
            eval.py passes a Path) to also write the notebook to.
        :param use_debug_flag: When True, ``sys.argv`` is patched to include
            ``--debug`` so argparse-based scripts run in debug mode.
        :return: The notebook serialized as a string.
        """
        # Handle argparse in the code to ensure it works in a notebook environment
        should_handle_argparse = "argparse" in code
        sections = split_code_and_output_into_sections(code=code, stdout=stdout)
        notebook = nbformat.v4.new_notebook()

        # Use LLM to generate an intro cell for the notebook
        if task:
            system_prompt = T(".prompts:notebookconverter.system").r()
            user_prompt = T(".prompts:notebookconverter.user").r(
                plan=task.get_task_information(),
                code=code,
            )
            resp = APIBackend().build_messages_and_create_chat_completion(
                user_prompt=user_prompt, system_prompt=system_prompt
            )
            intro_content = MarkdownAgentOut.extract_output(resp)
            notebook.cells.append(nbformat.v4.new_markdown_cell(intro_content))

        # Guard on `sections`: the CLI entry point does not run
        # validate_code_format first, so the list may be empty and indexing
        # sections[0] would raise IndexError.
        if should_handle_argparse and sections:
            # Remove extra `import sys` since it will be added for argparse handling
            if "import sys\n" in sections[0]["code"]:
                sections[0]["code"] = sections[0]["code"].replace("import sys\n", "")

            # Add sys.argv modification so argparse does not see Jupyter's own argv
            sections[0]["code"] = (
                "\n".join(
                    [
                        "import sys",
                        "# hack to allow argparse to work in notebook",
                        ('sys.argv = ["main.py", "--debug"]' if use_debug_flag else 'sys.argv = ["main.py"]'),
                    ]
                )
                + "\n\n"
                + sections[0]["code"].lstrip()
            )

        for section in sections:
            # Create a markdown cell for the section name and comments
            markdown_content = ""
            if section["name"]:
                markdown_content += f"## {section['name']}\n"
            if section["comments"]:
                markdown_content += f"{section['comments']}\n"
            if markdown_content:
                notebook.cells.append(nbformat.v4.new_markdown_cell(markdown_content))

            # Create a code cell for the section code and output
            if section["code"]:
                cell = nbformat.v4.new_code_cell(section["code"])
                if section["output"]:
                    # For simplicity, treat all output as coming from stdout
                    # TODO: support Jupyter kernel execution and handle outputs appropriately here
                    cell.outputs = [nbformat.v4.new_output("stream", name="stdout", text=section["output"])]
                notebook.cells.append(cell)

        # Save the notebook when requested, and always return it as a string
        if outfile:
            with open(outfile, "w", encoding="utf-8") as f:
                nbformat.write(notebook, f)
            logger.info(f"Notebook written to {outfile}")

        return nbformat.writes(notebook)


if __name__ == "__main__":
    # CLI entry point: convert a Python script (plus optional captured stdout)
    # into a Jupyter notebook.
    converter = NotebookConverter()
    parser = argparse.ArgumentParser(description="Convert Python code to Jupyter notebook.")
    parser.add_argument("inputfile", type=str, help="Path to the input Python file.")
    parser.add_argument("outfile", type=str, help="Path to the output Notebook file.")
    parser.add_argument(
        "--stdout",
        type=str,
        default="",
        help="Standard output from the code execution.",
    )
    parser.add_argument("--debug", action="store_true", help="Use debug flag to modify sys.argv.")
    args = parser.parse_args()

    # Use a context manager so the input file handle is closed promptly;
    # utf-8 matches the encoding used when the notebook is written.
    with open(args.inputfile, "r", encoding="utf-8") as f:
        source_code = f.read()

    converter.convert(
        task=None,
        code=source_code,
        stdout=args.stdout,
        outfile=args.outfile,
        # BUG FIX: was hard-coded to False, silently ignoring the parsed --debug flag.
        use_debug_flag=args.debug,
    )
22 changes: 22 additions & 0 deletions rdagent/components/coder/data_science/share/prompts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,25 @@ docdev:
```
{% endfor %}

notebookconverter:
system: |-
{% include "scenarios.data_science.share:scen.role" %} Your task is to provide a summary for a data science solution.

You will be given:
- The original implementation plan for the script.
- A Python script that contains code and output.

Your task is to generate markdown content that includes a title and a short paragraph summarizing the techniques used in model training, the type of model produced, and any other noteworthy details of the solution.

The return content should follow the format below (please note that "````" is used to avoid conflicts with "```" in the markdown file):
````markdown
# <The title of the notebook>
<the content of markdown file>
````

user: |-
--------------- The implementation plan ---------------
{{plan}}

--------------- The Python script content ---------------
{{code}}
Loading