feat: get kaggle notebooks & discussion text for RAG (microsoft#371)

XianBW · web-flow · commit 5787a546b5c3 · 2024-09-28T05:42:47.000+08:00
* crawl notebooks & change to DS-Agent format text

* give one function in kaggle_crawler to collect kaggle knowledge texts

* fix CI

* add tool for merge .py files to one py file

* fix CI

* delete files

* changes for select function

* add nbformat

* skip crawler import test

* del test code

* CI

* change

* change

* change
diff --git a/rdagent/app/kaggle/loop.py b/rdagent/app/kaggle/loop.py
@@ -19,6 +19,7 @@
 from rdagent.core.utils import import_class
 from rdagent.log import rdagent_logger as logger
 from rdagent.log.time import measure_time
+from rdagent.scenarios.kaggle.experiment.utils import python_files_to_notebook
 from rdagent.scenarios.kaggle.kaggle_crawler import download_data
 from rdagent.scenarios.kaggle.proposal.proposal import (
     KG_ACTION_FEATURE_ENGINEERING,
@@ -88,6 +89,14 @@ def running(self, prev_out: dict[str, Any]):
                 exp = self.model_runner.develop(prev_out["coding"])
             logger.log_object(exp, tag="runner result")
 
+            if KAGGLE_IMPLEMENT_SETTING.competition in ["optiver-realized-volatility-prediction"]:
+                try:
+                    python_files_to_notebook(
+                        KAGGLE_IMPLEMENT_SETTING.competition, exp.experiment_workspace.workspace_path
+                    )
+                except Exception as e:
+                    logger.error(f"Merge python files to one file failed: {e}")
+
             if KAGGLE_IMPLEMENT_SETTING.auto_submit:
                 csv_path = exp.experiment_workspace.workspace_path / "submission.csv"
                 try:
diff --git a/rdagent/scenarios/kaggle/experiment/utils.py b/rdagent/scenarios/kaggle/experiment/utils.py
@@ -0,0 +1,95 @@
+from pathlib import Path
+
+import nbformat as nbf
+
+
+def python_files_to_notebook(competition: str, py_dir: str):  # Merge the preprocess/feature/model/select/train scripts found in py_dir into merged.ipynb and merged.py (both written into py_dir).
+    py_dir: Path = Path(py_dir)  # rebinds the argument, so a str or a Path is accepted
+    save_path: Path = py_dir / "merged.ipynb"
+
+    pre_file = py_dir / "fea_share_preprocess.py"
+    pre_py = pre_file.read_text()  # no existence check is done before reading
+
+    pre_py = pre_py.replace("/kaggle/input", f"/kaggle/input/{competition}")  # NOTE(review): plain substring replace; a path that already contains the competition name would get it twice
+
+    fea_files = list(py_dir.glob("feature/*.py"))  # NOTE(review): train.py's loop (rewritten below) globs "feature/feat*.py"; this pattern is broader -- confirm intended
+    fea_pys = {  # "<stem>_cls" -> feature source with feature_engineering_cls renamed to that key
+        f"{fea_file.stem}_cls": fea_file.read_text().replace("feature_engineering_cls", f"{fea_file.stem}_cls").strip()
+        + "()\n"  # NOTE(review): presumably each file ends by naming its class, so "()" turns that into an instantiation -- confirm
+        for fea_file in fea_files
+    }
+
+    model_files = list(py_dir.glob("model/model*.py"))
+    model_pys = {f"{model_file.stem}": model_file.read_text().strip() for model_file in model_files}  # file stem -> source; the stem becomes the class name below
+    for k, v in model_pys.items():  # only values of existing keys are reassigned, no keys are added or removed
+        model_pys[k] = v.replace("def fit(", "def fit(self, ").replace("def predict(", "def predict(self, ")  # give fit/predict a self parameter so they can live inside a class
+
+        lines = model_pys[k].split("\n")
+        indent = False  # set to True at the first line containing "def " and never reset
+        first_line = -1  # index of the first line containing "def "; stays -1 when there is none
+        for i, line in enumerate(lines):
+            if "def " in line:  # substring test, so any line mentioning "def " matches
+                indent = True
+                if first_line == -1:
+                    first_line = i
+            if indent:
+                lines[i] = "    " + line  # everything from the first def onward is shifted one level to sit inside the class body
+        lines.insert(first_line, f"class {k}:\n")  # class header goes directly above the first def; NOTE(review): with no def, insert(-1, ...) puts it before the last line
+        model_pys[k] = "\n".join(lines)
+
+    select_files = list(py_dir.glob("model/select*.py"))
+    select_pys = {  # file stem -> source with its select() function renamed to the stem
+        f"{select_file.stem}": select_file.read_text().replace("def select(", f"def {select_file.stem}(")
+        for select_file in select_files
+    }
+
+    train_file = py_dir / "train.py"
+    train_py = train_file.read_text()
+
+    train_py = train_py.replace("from fea_share_preprocess import preprocess_script", "")  # the preprocess source is emitted ahead of train.py in the merged output
+    train_py = train_py.replace("DIRNAME = Path(__file__).absolute().resolve().parent", "")  # the DIRNAME-based loops are replaced just below
+
+    fea_cls_list_str = "[" + ", ".join(list(fea_pys.keys())) + "]"  # source text of a list literal naming every renamed feature object
+    train_py = train_py.replace(
+        'for f in DIRNAME.glob("feature/feat*.py"):', f"for cls in {fea_cls_list_str}:"
+    ).replace("cls = import_module_from_path(f.stem, f).feature_engineering_cls()", "")  # cls now comes from the loop variable; only the statement text is removed, its line remains
+
+    model_cls_list_str = "[" + ", ".join(list(model_pys.keys())) + "]"  # source text of a list literal naming every generated model class
+    train_py = (
+        train_py.replace('for f in DIRNAME.glob("model/model*.py"):', f"for mc in {model_cls_list_str}:")
+        .replace("m = import_module_from_path(f.stem, f)", "m = mc()")  # an instance of the generated class stands in for the imported module
+        .replace('select_python_path = f.with_name(f.stem.replace("model", "select") + f.suffix)', "")
+        .replace(
+            "select_m = import_module_from_path(select_python_path.stem, select_python_path)",
+            'select_m = eval(mc.__name__.replace("model", "select"))',  # the merged script looks the renamed select function up by name
+        )
+        .replace("select_m.select", "select_m")  # select_m is now the function itself rather than a module
+        .replace("[2].select", "[2]")  # NOTE(review): presumably an indexed container whose third item held the select module -- confirm against train.py
+    )
+
+    nb = nbf.v4.new_notebook()
+    all_py = ""  # the same sources, concatenated as one plain script
+
+    nb.cells.append(nbf.v4.new_code_cell(pre_py))  # cell order: preprocess, features, models, selects, train
+    all_py += pre_py + "\n\n"
+
+    for v in fea_pys.values():
+        nb.cells.append(nbf.v4.new_code_cell(v))
+        all_py += v + "\n\n"
+
+    for v in model_pys.values():
+        nb.cells.append(nbf.v4.new_code_cell(v))
+        all_py += v + "\n\n"
+
+    for v in select_pys.values():
+        nb.cells.append(nbf.v4.new_code_cell(v))
+        all_py += v + "\n\n"
+
+    nb.cells.append(nbf.v4.new_code_cell(train_py))
+    all_py += train_py + "\n"
+
+    with save_path.open("w", encoding="utf-8") as f:
+        nbf.write(nb, f)
+
+    with save_path.with_suffix(".py").open("w", encoding="utf-8") as f:  # merged.py is written next to merged.ipynb
+        f.write(all_py)
diff --git a/rdagent/scenarios/kaggle/kaggle_crawler.py b/rdagent/scenarios/kaggle/kaggle_crawler.py
@@ -1,16 +1,24 @@
+# %%
 import json
 import subprocess
 import time
 import zipfile
+from itertools import chain
 from pathlib import Path
 
+import nbformat
+from jinja2 import Environment, StrictUndefined
+from rich import print
 from selenium import webdriver
 from selenium.webdriver.chrome.service import Service
 from selenium.webdriver.common.by import By
 
 from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING
+from rdagent.core.prompts import Prompts
 from rdagent.log import rdagent_logger as logger
+from rdagent.oai.llm_utils import APIBackend
 
+# %%
 options = webdriver.ChromeOptions()
 options.add_argument("--no-sandbox")
 options.add_argument("--disable-dev-shm-usage")
@@ -79,6 +87,121 @@ def download_data(competition: str, local_path: str = "/data/userdata/share/kagg
             zip_ref.extractall(data_path)
 
 
+def download_notebooks(  # Pull up to `num` kernels of a competition into <local_path>/<competition>/<author>/, ordered by score.
+    competition: str, local_path: str = "/data/userdata/share/kaggle/notebooks", num: int = 15
+) -> None:
+    data_path = Path(f"{local_path}/{competition}")
+    from kaggle.api.kaggle_api_extended import KaggleApi  # NOTE(review): imported inside the function, presumably so importing this module does not need the kaggle package/credentials -- confirm
+
+    api = KaggleApi()
+    api.authenticate()
+
+    # judge the sort_by
+    ll = api.competition_leaderboard_view(competition)
+    score_diff = float(ll[0].score) - float(ll[-1].score)  # first vs. last leaderboard entry; NOTE(review): an empty leaderboard would fail on ll[0]
+    if score_diff > 0:  # first entry has the larger score, so larger is treated as better
+        sort_by = "scoreDescending"
+    else:  # also taken when the two scores are equal
+        sort_by = "scoreAscending"
+
+    # download notebooks
+    nl = api.kernels_list(competition=competition, sort_by=sort_by, page=1, page_size=num)  # first page only, at most `num` kernels
+    for nb in nl:
+        author = nb.ref.split("/")[0]  # the part of the ref before the first "/" is used as the folder name
+        api.kernels_pull(nb.ref, path=data_path / author)
+    print(f"Downloaded {len(nl)} notebooks for {competition}. ([red]{sort_by}[/red])")  # print is rich.print in this module, so [red]...[/red] is markup
+
+
+def notebook_to_knowledge(notebook_text: str) -> str:  # Send notebook_text to the LLM with the gen_knowledge_from_code_DSAgent prompts and return its answer.
+    prompt_dict = Prompts(file_path=Path(__file__).parent / "prompts.yaml")  # prompts.yaml located beside this module; re-read on every call
+
+    sys_prompt = (
+        Environment(undefined=StrictUndefined)  # StrictUndefined: an undefined template variable is an error instead of rendering as empty
+        .from_string(prompt_dict["gen_knowledge_from_code_DSAgent"]["system"])
+        .render()  # the system template is rendered without any variables
+    )
+
+    user_prompt = (
+        Environment(undefined=StrictUndefined)
+        .from_string(prompt_dict["gen_knowledge_from_code_DSAgent"]["user"])
+        .render(notebook=notebook_text)  # the template's only input is `notebook`
+    )
+
+    response = APIBackend().build_messages_and_create_chat_completion(
+        user_prompt=user_prompt,
+        system_prompt=sys_prompt,
+        json_mode=False,  # JSON mode is switched off for this request
+    )
+    return response
+
+
+def convert_notebooks_to_text(competition: str, local_path: str = "/data/userdata/share/kaggle/notebooks") -> None:  # Summarise every notebook/script under <local_path>/<competition> via notebook_to_knowledge and save each result as a sibling .txt file.
+    data_path = Path(f"{local_path}/{competition}")
+    converted_num = 0  # counts notebooks and .py files together
+
+    # convert ipynb and irnb files
+    for nb_path in chain(data_path.glob("**/*.ipynb"), data_path.glob("**/*.irnb")):  # recursive search below data_path
+        with nb_path.open("r", encoding="utf-8") as f:
+            nb = nbformat.read(f, as_version=4)
+        text = []  # one fenced block per markdown/code cell; other cell types are left out
+        for cell in nb.cells:
+            if cell.cell_type == "markdown":
+                text.append(f"```markdown\n{cell.source}```")  # NOTE(review): the closing fence follows the source directly, with no newline in between
+            elif cell.cell_type == "code":
+                text.append(f"```code\n{cell.source}```")
+        text = "\n\n".join(text)  # `text` changes from list to str here
+
+        text = notebook_to_knowledge(text)  # one LLM request per notebook; an existing .txt does not skip it
+
+        text_path = nb_path.with_suffix(".txt")
+        text_path.write_text(text, encoding="utf-8")
+        converted_num += 1
+
+    # convert py files
+    for py_path in data_path.glob("**/*.py"):
+        with py_path.open("r", encoding="utf-8") as f:
+            text = f"```code\n{f.read()}```"  # the whole script is wrapped as a single code block
+
+        text = notebook_to_knowledge(text)
+
+        text_path = py_path.with_suffix(".txt")  # NOTE(review): a .py sharing its stem with a notebook in the same folder maps to the same .txt path
+        text_path.write_text(text, encoding="utf-8")
+        converted_num += 1
+
+    print(f"Converted {converted_num} notebooks to text files.")
+
+
+def collect_knowledge_texts(local_path: str = "/data/userdata/share/kaggle") -> dict[str, list[str]]:
+    """Return the contents of every *.txt file below <local_path>/notebooks, grouped by top-level entry name:
+    {
+        "competition1": [
+            "knowledge_text1",
+            "knowledge_text2",
+            ...
+        ],
+        "competition2": [
+            "knowledge_text1",
+            "knowledge_text2",
+            ...
+        ],
+        ...
+    }
+    """
+    notebooks_dir = Path(local_path) / "notebooks"
+
+    competition_knowledge_texts_dict = {}
+    for competition_dir in notebooks_dir.iterdir():  # NOTE(review): entries are not filtered with is_dir(), so plain files in notebooks/ are visited too
+        knowledge_texts = []
+        for text_path in competition_dir.glob("**/*.txt"):  # recursive search, so texts in per-author subfolders are included
+            text = text_path.read_text(encoding="utf-8")
+            knowledge_texts.append(text)
+
+        competition_knowledge_texts_dict[competition_dir.name] = knowledge_texts  # an entry with no .txt files still gets a key, with an empty list
+
+    return competition_knowledge_texts_dict
+
+
+# %%
 if __name__ == "__main__":
     dsagent_cs = [
         "feedback-prize-english-language-learning",
@@ -124,14 +247,16 @@ def download_data(competition: str, local_path: str = "/data/userdata/share/kagg
         "store-sales-time-series-forecasting",
         "titanic",
         "tpu-getting-started",
+        # scenario competition
         "covid19-global-forecasting-week-1",
-        "birdsong-recognition",
-        "optiver-trading-at-the-close",
+        "statoil-iceberg-classifier-challenge",
+        "optiver-realized-volatility-prediction",
         "facebook-v-predicting-check-ins",
     ]
 
-    for i in dsagent_cs + other_cs:
-        crawl_descriptions(i)
+    all_cs = dsagent_cs + other_cs
+    for c in all_cs:
+        convert_notebooks_to_text(c)
     exit()
     from kaggle.api.kaggle_api_extended import KaggleApi
 
diff --git a/rdagent/scenarios/kaggle/prompts.yaml b/rdagent/scenarios/kaggle/prompts.yaml
@@ -308,10 +308,38 @@ model_feature_selection:
     {
       "Selected Group Index": [1, 3, 5], # List of selected group indices, notice: the index starts from 1
     }
-  
+
   user: |-
     Current feature groups:
     {% for feature in feature_groups %}
       Group {{ loop.index }}: 
       {{ feature }}
-    {% endfor %}
+    {% endfor %}
+
+gen_knowledge_from_code_DSAgent:
+  system: |-
+    You were a proficient data scientist.
+  user: |-
+    The following notebook (contain markdown part and code part) is a high-performing solution for a kaggle competition.
+    Please answer the following questions one by one and **as detailedly as possible**.
+    Make sure that another data scientist can exactly reproduce this copy of code based on your answer.
+    Focus on the training process.
+
+    (1) Please give a summary of the overall design.
+    (2) What is the overall model architecture? Please use a long article to answer this question as accurately and in detail as possible.
+    (3) How are the important hyper-parameters setting in this code?
+    (4) What is the optimization objective?
+    (5) What advanced machine learning technique does this copy of code use?
+    (6) What other important tricks do you think play an important role for high performance?
+    
+    Note that make sure the answers are directly included from the code or markdown text, rather than based on your assumption.
+    
+    --------------------
+    {{ notebook }}
+    --------------------
+
+gen_knowledge_from_code_RDAgent:
+  system: |-
+    You were a proficient data scientist.
+  user: |-
+    TODO...
diff --git a/requirements.txt b/requirements.txt
@@ -62,6 +62,7 @@ st-theme
 # kaggle crawler
 selenium
 kaggle
+nbformat
 
 # tool
 seaborn