Skip to content

Commit 5787a54

Browse files
authored
feat: get kaggle notebooks & discussion text for RAG (microsoft#371)
* crawl notebooks & change to DS-Agent format text * give one function in kaggle_crawler to collect kaggle knowledge texts * fix CI * add tool for merge .py files to one py file * fix CI * delete files * changes for select function * add nbformat * jump crawler import test * del test code * CI * change * change * change
1 parent e1c3101 commit 5787a54

5 files changed

Lines changed: 264 additions & 6 deletions

File tree

rdagent/app/kaggle/loop.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from rdagent.core.utils import import_class
2020
from rdagent.log import rdagent_logger as logger
2121
from rdagent.log.time import measure_time
22+
from rdagent.scenarios.kaggle.experiment.utils import python_files_to_notebook
2223
from rdagent.scenarios.kaggle.kaggle_crawler import download_data
2324
from rdagent.scenarios.kaggle.proposal.proposal import (
2425
KG_ACTION_FEATURE_ENGINEERING,
@@ -88,6 +89,14 @@ def running(self, prev_out: dict[str, Any]):
8889
exp = self.model_runner.develop(prev_out["coding"])
8990
logger.log_object(exp, tag="runner result")
9091

92+
if KAGGLE_IMPLEMENT_SETTING.competition in ["optiver-realized-volatility-prediction"]:
93+
try:
94+
python_files_to_notebook(
95+
KAGGLE_IMPLEMENT_SETTING.competition, exp.experiment_workspace.workspace_path
96+
)
97+
except Exception as e:
98+
logger.error(f"Merge python files to one file failed: {e}")
99+
91100
if KAGGLE_IMPLEMENT_SETTING.auto_submit:
92101
csv_path = exp.experiment_workspace.workspace_path / "submission.csv"
93102
try:
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
from pathlib import Path
2+
3+
import nbformat as nbf
4+
5+
6+
def _wrap_model_source_in_class(class_name: str, source: str) -> str:
    """Rewrite a model script so that its functions become methods of ``class_name``."""
    # ``fit`` / ``predict`` turn into methods, so they need a ``self`` parameter.
    source = source.replace("def fit(", "def fit(self, ").replace("def predict(", "def predict(self, ")
    lines = source.split("\n")

    first_def = next((i for i, line in enumerate(lines) if "def " in line), None)
    if first_def is None:
        # Nothing to wrap. Emit an empty class after the script so the generated
        # code stays valid Python (inserting the header at index -1 would put it
        # in front of the last line and leave the class without a body).
        return "\n".join([*lines, f"class {class_name}:\n", " pass"])

    # Lines before the first ``def`` stay at module level; everything from the
    # first ``def`` onwards is indented so that it becomes the class body.
    head = lines[:first_def]
    body = [" " + line for line in lines[first_def:]]
    return "\n".join([*head, f"class {class_name}:\n", *body])


def python_files_to_notebook(competition: str, py_dir: str):
    """
    Merge the python files of an experiment workspace into one notebook and one script.

    The function reads ``fea_share_preprocess.py``, ``feature/*.py``,
    ``model/model*.py``, ``model/select*.py`` and ``train.py`` below ``py_dir``,
    rewrites them so that they no longer import each other from disk, and writes
    the result to ``merged.ipynb`` (one code cell per source file) and
    ``merged.py`` inside ``py_dir``.

    Parameters
    ----------
    competition : str
        Competition name; every ``/kaggle/input`` occurrence in the preprocess
        script is replaced by ``/kaggle/input/<competition>``.
    py_dir : str
        Workspace directory holding the python files (anything accepted by
        ``pathlib.Path`` works).

    Raises
    ------
    FileNotFoundError
        If ``fea_share_preprocess.py`` or ``train.py`` is missing in ``py_dir``.
    """
    workspace = Path(py_dir)
    save_path = workspace / "merged.ipynb"

    # Python sources are UTF-8 by default, so read them explicitly as UTF-8
    # instead of relying on the locale (the outputs below are UTF-8 as well).
    pre_py = (workspace / "fea_share_preprocess.py").read_text(encoding="utf-8")
    pre_py = pre_py.replace("/kaggle/input", f"/kaggle/input/{competition}")

    # Glob results are sorted so that the generated files are reproducible.
    fea_pys = {}
    for fea_file in sorted(workspace.glob("feature/*.py")):
        cls_name = f"{fea_file.stem}_cls"
        fea_source = fea_file.read_text(encoding="utf-8").replace("feature_engineering_cls", cls_name)
        # The trailing "()" is appended to the last statement of the script.
        fea_pys[cls_name] = fea_source.strip() + "()\n"

    model_pys = {
        model_file.stem: _wrap_model_source_in_class(
            model_file.stem, model_file.read_text(encoding="utf-8").strip()
        )
        for model_file in sorted(workspace.glob("model/model*.py"))
    }

    # Each ``select`` function is renamed after its file so several can coexist.
    select_pys = {
        select_file.stem: select_file.read_text(encoding="utf-8").replace(
            "def select(", f"def {select_file.stem}("
        )
        for select_file in sorted(workspace.glob("model/select*.py"))
    }

    train_py = (workspace / "train.py").read_text(encoding="utf-8")

    # The merged file has no sibling modules and no ``__file__`` based directory.
    train_py = train_py.replace("from fea_share_preprocess import preprocess_script", "")
    train_py = train_py.replace("DIRNAME = Path(__file__).absolute().resolve().parent", "")

    # Iterate over the names defined by the feature cells instead of globbing files.
    fea_cls_list_str = "[" + ", ".join(fea_pys) + "]"
    train_py = train_py.replace(
        'for f in DIRNAME.glob("feature/feat*.py"):', f"for cls in {fea_cls_list_str}:"
    ).replace("cls = import_module_from_path(f.stem, f).feature_engineering_cls()", "")

    # Same for the models: loop over the generated classes and look the matching
    # select function up by name.
    model_cls_list_str = "[" + ", ".join(model_pys) + "]"
    train_py = (
        train_py.replace('for f in DIRNAME.glob("model/model*.py"):', f"for mc in {model_cls_list_str}:")
        .replace("m = import_module_from_path(f.stem, f)", "m = mc()")
        .replace('select_python_path = f.with_name(f.stem.replace("model", "select") + f.suffix)', "")
        .replace(
            "select_m = import_module_from_path(select_python_path.stem, select_python_path)",
            'select_m = eval(mc.__name__.replace("model", "select"))',
        )
        .replace("select_m.select", "select_m")
        .replace("[2].select", "[2]")
    )

    # Cell order: preprocess, features, models, selects, train.
    cell_sources = [pre_py, *fea_pys.values(), *model_pys.values(), *select_pys.values(), train_py]

    nb = nbf.v4.new_notebook()
    for cell_source in cell_sources:
        nb.cells.append(nbf.v4.new_code_cell(cell_source))

    with save_path.open("w", encoding="utf-8") as f:
        nbf.write(nb, f)

    # The plain script is the same cells separated by a blank line.
    with save_path.with_suffix(".py").open("w", encoding="utf-8") as f:
        f.write("\n\n".join(cell_sources) + "\n")

rdagent/scenarios/kaggle/kaggle_crawler.py

Lines changed: 129 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,24 @@
1+
# %%
12
import json
23
import subprocess
34
import time
45
import zipfile
6+
from itertools import chain
57
from pathlib import Path
68

9+
import nbformat
10+
from jinja2 import Environment, StrictUndefined
11+
from rich import print
712
from selenium import webdriver
813
from selenium.webdriver.chrome.service import Service
914
from selenium.webdriver.common.by import By
1015

1116
from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING
17+
from rdagent.core.prompts import Prompts
1218
from rdagent.log import rdagent_logger as logger
19+
from rdagent.oai.llm_utils import APIBackend
1320

21+
# %%
1422
options = webdriver.ChromeOptions()
1523
options.add_argument("--no-sandbox")
1624
options.add_argument("--disable-dev-shm-usage")
@@ -79,6 +87,121 @@ def download_data(competition: str, local_path: str = "/data/userdata/share/kagg
7987
zip_ref.extractall(data_path)
8088

8189

90+
def download_notebooks(
    competition: str, local_path: str = "/data/userdata/share/kaggle/notebooks", num: int = 15
) -> None:
    """
    Pull notebooks of a Kaggle competition into ``<local_path>/<competition>/<author>``.

    The sort order passed to the kernel listing is derived from the leaderboard:
    when the first entry's score is greater than the last entry's score the
    kernels are requested with "scoreDescending", otherwise with "scoreAscending".

    Parameters
    ----------
    competition : str
        Competition name as used by the Kaggle API.
    local_path : str
        Root directory below which the notebooks are stored.
    num : int
        Page size of the kernel listing (only page 1 is requested).
    """
    target_dir = Path(f"{local_path}/{competition}")
    from kaggle.api.kaggle_api_extended import KaggleApi

    client = KaggleApi()
    client.authenticate()

    # Compare both ends of the leaderboard to decide the sort direction.
    leaderboard = client.competition_leaderboard_view(competition)
    score_gap = float(leaderboard[0].score) - float(leaderboard[-1].score)
    sort_by = "scoreDescending" if score_gap > 0 else "scoreAscending"

    # Each kernel is stored in a folder named after the first part of its ref.
    kernels = client.kernels_list(competition=competition, sort_by=sort_by, page=1, page_size=num)
    for kernel in kernels:
        owner = kernel.ref.split("/")[0]
        client.kernels_pull(kernel.ref, path=target_dir / owner)
    print(f"Downloaded {len(kernels)} notebooks for {competition}. ([red]{sort_by}[/red])")
113+
114+
115+
def notebook_to_knowledge(notebook_text: str) -> str:
    """
    Send a notebook's text to the chat backend and return its answer.

    The system and user prompts come from the ``gen_knowledge_from_code_DSAgent``
    entry of the ``prompts.yaml`` file located next to this module; the user
    prompt is rendered with ``notebook_text`` as the ``notebook`` variable.
    """
    templates = Prompts(file_path=Path(__file__).parent / "prompts.yaml")
    section = "gen_knowledge_from_code_DSAgent"

    # StrictUndefined makes rendering fail on a missing template variable.
    system_template = Environment(undefined=StrictUndefined).from_string(templates[section]["system"])
    sys_prompt = system_template.render()

    user_template = Environment(undefined=StrictUndefined).from_string(templates[section]["user"])
    user_prompt = user_template.render(notebook=notebook_text)

    backend = APIBackend()
    return backend.build_messages_and_create_chat_completion(
        user_prompt=user_prompt,
        system_prompt=sys_prompt,
        json_mode=False,
    )
136+
137+
138+
def convert_notebooks_to_text(competition: str, local_path: str = "/data/userdata/share/kaggle/notebooks") -> None:
    """
    Convert every downloaded notebook and script of a competition into a ``.txt`` file.

    All ``*.ipynb`` / ``*.irnb`` files and then all ``*.py`` files below
    ``<local_path>/<competition>`` are turned into fenced text, passed through
    ``notebook_to_knowledge`` and the result is written next to the source file
    with the suffix replaced by ``.txt``.
    """
    competition_dir = Path(f"{local_path}/{competition}")
    n_converted = 0

    # Notebooks: keep markdown and code cells, each wrapped in its own fence.
    notebook_paths = chain(competition_dir.glob("**/*.ipynb"), competition_dir.glob("**/*.irnb"))
    for notebook_path in notebook_paths:
        with notebook_path.open("r", encoding="utf-8") as fh:
            notebook = nbformat.read(fh, as_version=4)

        fenced_cells = [
            f"```markdown\n{cell.source}```" if cell.cell_type == "markdown" else f"```code\n{cell.source}```"
            for cell in notebook.cells
            if cell.cell_type in ("markdown", "code")
        ]
        knowledge = notebook_to_knowledge("\n\n".join(fenced_cells))

        notebook_path.with_suffix(".txt").write_text(knowledge, encoding="utf-8")
        n_converted += 1

    # Plain python scripts: the whole file is treated as a single code block.
    for script_path in competition_dir.glob("**/*.py"):
        script_source = script_path.read_text(encoding="utf-8")
        knowledge = notebook_to_knowledge(f"```code\n{script_source}```")

        script_path.with_suffix(".txt").write_text(knowledge, encoding="utf-8")
        n_converted += 1

    print(f"Converted {n_converted} notebooks to text files.")
172+
173+
174+
def collect_knowledge_texts(local_path: str = "/data/userdata/share/kaggle") -> dict[str, list[str]]:
    """
    Collect the knowledge texts stored below ``<local_path>/notebooks``.

    Every sub-directory of the notebooks folder is treated as one competition;
    all ``*.txt`` files found anywhere beneath it are read as UTF-8. Entries of
    the notebooks folder that are not directories are ignored. Competitions and
    text files are visited in sorted path order, so the result is reproducible.

    Returns a mapping of the form:
    {
        "competition1": [
            "knowledge_text1",
            "knowledge_text2",
            ...
        ],
        "competition2": [
            "knowledge_text1",
            "knowledge_text2",
            ...
        ],
        ...
    }

    Raises ``FileNotFoundError`` if ``<local_path>/notebooks`` does not exist.
    """
    notebooks_dir = Path(local_path) / "notebooks"

    competition_knowledge_texts_dict = {}
    for competition_dir in sorted(notebooks_dir.iterdir()):
        # A stray file next to the competition folders is not a competition.
        if not competition_dir.is_dir():
            continue
        competition_knowledge_texts_dict[competition_dir.name] = [
            text_path.read_text(encoding="utf-8") for text_path in sorted(competition_dir.glob("**/*.txt"))
        ]

    return competition_knowledge_texts_dict
202+
203+
204+
# %%
82205
if __name__ == "__main__":
83206
dsagent_cs = [
84207
"feedback-prize-english-language-learning",
@@ -124,14 +247,16 @@ def download_data(competition: str, local_path: str = "/data/userdata/share/kagg
124247
"store-sales-time-series-forecasting",
125248
"titanic",
126249
"tpu-getting-started",
250+
# scenario competition
127251
"covid19-global-forecasting-week-1",
128-
"birdsong-recognition",
129-
"optiver-trading-at-the-close",
252+
"statoil-iceberg-classifier-challenge",
253+
"optiver-realized-volatility-prediction",
130254
"facebook-v-predicting-check-ins",
131255
]
132256

133-
for i in dsagent_cs + other_cs:
134-
crawl_descriptions(i)
257+
all_cs = dsagent_cs + other_cs
258+
for c in all_cs:
259+
convert_notebooks_to_text(c)
135260
exit()
136261
from kaggle.api.kaggle_api_extended import KaggleApi
137262

rdagent/scenarios/kaggle/prompts.yaml

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -308,10 +308,38 @@ model_feature_selection:
308308
{
309309
"Selected Group Index": [1, 3, 5], # List of selected group indices, notice: the index starts from 1
310310
}
311-
311+
312312
user: |-
313313
Current feature groups:
314314
{% for feature in feature_groups %}
315315
Group {{ loop.index }}:
316316
{{ feature }}
317-
{% endfor %}
317+
{% endfor %}
318+
319+
gen_knowledge_from_code_DSAgent:
320+
system: |-
321+
You were a proficient data scientist.
322+
user: |-
323+
The following notebook (contain markdown part and code part) is a high-performing solution for a kaggle competition.
324+
Please answer the following questions one by one and **as detailedly as possible**.
325+
Make sure that another data scientist can exactly reproduce this copy of code based on your answer.
326+
Focus on the training process.
327+
328+
(1) Please give a summary of the overall design.
329+
(2) What is the overall model architecture? Please use a long article to answer this question as accurately and in detail as possible.
330+
(3) How are the important hyper-parameters setting in this code?
331+
(4) What is the optimization objective?
332+
(5) What advanced machine learning technique does this copy of code use?
333+
(6) What other important tricks do you think play an important role for high performance?
334+
335+
Note that make sure the answers are directly included from the code or markdown text, rather than based on your assumption.
336+
337+
--------------------
338+
{{ notebook }}
339+
--------------------
340+
341+
gen_knowledge_from_code_RDAgent:
342+
system: |-
343+
You were a proficient data scientist.
344+
user: |-
345+
TODO...

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ st-theme
6262
# kaggle crawler
6363
selenium
6464
kaggle
65+
nbformat
6566

6667
# tool
6768
seaborn

0 commit comments

Comments
 (0)