Skip to content

Commit ce8eeed

Browse files
authored
feat: kaggle model and feature (#238)
* update first version code * make hypothesis_gen and experiment_builder fit for both feature and model
1 parent f218b93 commit ce8eeed

17 files changed

Lines changed: 672 additions & 439 deletions

File tree

rdagent/app/kaggle/conf.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,13 @@ class Config:
1313
"""Add 'model_' to the protected namespaces"""
1414

1515
# 1) overriding the default
16-
scen: str = "rdagent.scenarios.kaggle.experiment.model_experiment.KGModelScenario"
16+
scen: str = "rdagent.scenarios.kaggle.experiment.scenario.KGModelScenario"
1717
"""Scenario class for data mining model"""
1818

19-
hypothesis_gen: str = "rdagent.scenarios.kaggle.proposal.model_proposal.KGModelHypothesisGen"
19+
hypothesis_gen: str = "rdagent.scenarios.kaggle.proposal.proposal.KGHypothesisGen"
2020
"""Hypothesis generation class"""
2121

22-
hypothesis2experiment: str = "rdagent.scenarios.kaggle.proposal.model_proposal.KGModelHypothesis2Experiment"
22+
hypothesis2experiment: str = "rdagent.scenarios.kaggle.proposal.proposal.KGHypothesis2Experiment"
2323
"""Hypothesis to experiment class"""
2424

2525
coder: str = "rdagent.scenarios.kaggle.developer.model_coder.KGModelCoSTEER"
@@ -28,7 +28,7 @@ class Config:
2828
runner: str = "rdagent.scenarios.kaggle.developer.model_runner.KGModelRunner"
2929
"""Runner class"""
3030

31-
summarizer: str = "rdagent.scenarios.kaggle.developer.feedback.KGModelHypothesisExperiment2Feedback"
31+
summarizer: str = "rdagent.scenarios.kaggle.developer.feedback.KGHypothesisExperiment2Feedback"
3232
"""Summarizer class"""
3333

3434
evolving_n: int = 10
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,15 @@
55
from rdagent.app.kaggle.conf import PROP_SETTING
66
from rdagent.components.workflow.conf import BasePropSetting
77
from rdagent.components.workflow.rd_loop import RDLoop
8+
from rdagent.core.developer import Developer
89
from rdagent.core.exception import ModelEmptyError
910
from rdagent.core.proposal import (
1011
Hypothesis2Experiment,
1112
HypothesisExperiment2Feedback,
1213
HypothesisGen,
1314
Trace,
1415
)
16+
from rdagent.core.scenario import Scenario
1517
from rdagent.core.utils import import_class
1618
from rdagent.log import rdagent_logger as logger
1719

@@ -62,4 +64,7 @@ def main(path=None, step_n=None, competition=None):
6264

6365

6466
if __name__ == "__main__":
67+
from dotenv import load_dotenv
68+
69+
load_dotenv(override=True)
6570
fire.Fire(main)

rdagent/components/coder/model_coder/model.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,10 @@ def __init__(
2222
self,
2323
name: str,
2424
description: str,
25-
formulation: str,
2625
architecture: str,
27-
variables: Dict[str, str],
2826
hyperparameters: Dict[str, str],
27+
formulation: str=None,
28+
variables: Dict[str, str] = None,
2929
model_type: Optional[str] = None,
3030
) -> None:
3131
self.name: str = name
@@ -34,17 +34,20 @@ def __init__(
3434
self.architecture: str = architecture
3535
self.variables: str = variables
3636
self.hyperparameters: str = hyperparameters
37-
self.model_type: str = model_type # Tabular for tabular model, TimesSeries for time series model, Graph for graph model, XGBoost for XGBoost model
37+
self.model_type: str = (
38+
model_type # Tabular for tabular model, TimeSeries for time series model, Graph for graph model, XGBoost for XGBoost model
39+
)
3840

3941
def get_task_information(self):
40-
return f"""name: {self.name}
42+
task_desc = f"""name: {self.name}
4143
description: {self.description}
42-
formulation: {self.formulation}
43-
architecture: {self.architecture}
44-
variables: {self.variables}
45-
hyperparameters: {self.hyperparameters}
46-
model_type: {self.model_type}
4744
"""
45+
task_desc += f"formulation: {self.formulation}\n" if self.formulation else ""
46+
task_desc += f"architecture: {self.architecture}\n"
47+
task_desc += f"variables: {self.variables}\n" if self.variables else ""
48+
task_desc += f"hyperparameters: {self.hyperparameters}\n"
49+
task_desc += f"model_type: {self.model_type}\n"
50+
return task_desc
4851

4952
@staticmethod
5053
def from_dict(dict):
@@ -161,4 +164,5 @@ def execute(
161164
return execution_feedback_str, execution_model_output
162165

163166

167+
FeatureExperiment = Experiment
164168
ModelExperiment = Experiment

rdagent/components/proposal/model_proposal.py

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -25,12 +25,10 @@ class ModelHypothesisGen(HypothesisGen):
2525

2626
# The following methods are scenario related so they should be implemented in the subclass
2727
@abstractmethod
28-
def prepare_context(self, trace: Trace) -> Tuple[dict, bool]:
29-
...
28+
def prepare_context(self, trace: Trace) -> Tuple[dict, bool]: ...
3029

3130
@abstractmethod
32-
def convert_response(self, response: str) -> ModelHypothesis:
33-
...
31+
def convert_response(self, response: str) -> ModelHypothesis: ...
3432

3533
def gen(self, trace: Trace) -> ModelHypothesis:
3634
context_dict, json_flag = self.prepare_context(trace)
@@ -39,7 +37,7 @@ def gen(self, trace: Trace) -> ModelHypothesis:
3937
Environment(undefined=StrictUndefined)
4038
.from_string(ModelHypothesisGen.prompts["hypothesis_gen"]["system_prompt"])
4139
.render(
42-
targets="model",
40+
targets="feature engineering and model building",
4341
scenario=self.scen.get_scenario_all_desc(),
4442
hypothesis_output_format=context_dict["hypothesis_output_format"],
4543
hypothesis_specification=context_dict["hypothesis_specification"],
@@ -49,7 +47,7 @@ def gen(self, trace: Trace) -> ModelHypothesis:
4947
Environment(undefined=StrictUndefined)
5048
.from_string(ModelHypothesisGen.prompts["hypothesis_gen"]["user_prompt"])
5149
.render(
52-
targets="model",
50+
targets="feature engineering and model building",
5351
hypothesis_and_feedback=context_dict["hypothesis_and_feedback"],
5452
RAG=context_dict["RAG"],
5553
)
@@ -69,20 +67,18 @@ def __init__(self) -> None:
6967
super().__init__()
7068

7169
@abstractmethod
72-
def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, bool]:
73-
...
70+
def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, bool]: ...
7471

7572
@abstractmethod
76-
def convert_response(self, response: str, trace: Trace) -> ModelExperiment:
77-
...
73+
def convert_response(self, response: str, trace: Trace) -> ModelExperiment: ...
7874

7975
def convert(self, hypothesis: Hypothesis, trace: Trace) -> ModelExperiment:
8076
context, json_flag = self.prepare_context(hypothesis, trace)
8177
system_prompt = (
8278
Environment(undefined=StrictUndefined)
8379
.from_string(ModelHypothesis2Experiment.prompts["hypothesis2experiment"]["system_prompt"])
8480
.render(
85-
targets="model",
81+
targets="feature engineering and model building",
8682
scenario=trace.scen.get_scenario_all_desc(),
8783
experiment_output_format=context["experiment_output_format"],
8884
)
@@ -91,7 +87,7 @@ def convert(self, hypothesis: Hypothesis, trace: Trace) -> ModelExperiment:
9187
Environment(undefined=StrictUndefined)
9288
.from_string(ModelHypothesis2Experiment.prompts["hypothesis2experiment"]["user_prompt"])
9389
.render(
94-
targets="model",
90+
targets="feature engineering and model building",
9591
target_hypothesis=context["target_hypothesis"],
9692
hypothesis_and_feedback=context["hypothesis_and_feedback"],
9793
target_list=context["target_list"],

rdagent/components/proposal/prompts.yaml

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,18 +4,22 @@ hypothesis_gen:
44
The {{targets}} are used in a certain scenario, the scenario is as follows:
55
{{ scenario }}
66
The user has made several hypotheses on this scenario and did several evaluations on them. The user will provide this information to you. Check if a new hypothesis has already been proposed. If it is already generated and you agree with it, just use it. If you don't agree, generate a better one.
7+
{% if hypothesis_specification %}
78
To help you generate new hypothesis, the user has prepared some additional information for you. You should use this information to help generate new {{targets}}.
9+
Here are the specifications: {{ hypothesis_specification }}
10+
{% endif %}
811
Please generate the output following the format and specifications below:
912
{{ hypothesis_output_format }}
10-
Here are the specifications: {{ hypothesis_specification }}
1113
1214
user_prompt: |-
13-
If it is not the first round, then the user has made several hypothesis on this scenario and did several evaluation on them.
15+
{% if hypothesis_and_feedback|length == 0 %} It is the first round of hypothesis generation. The user has no hypothesis on this scenario yet.
16+
{% else %}It is not the first round; the user has made several hypotheses on this scenario and did several evaluations on them.
1417
The former hypothesis and the corresponding feedbacks are as follows (focus on the last one & the new hypothesis that it provides and reasoning to see if you agree):
1518
{{ hypothesis_and_feedback }}
16-
To help you generate new {{targets}}, we have prepared the following information for you:
17-
{{ RAG }}
18-
Please generate the new hypothesis based on the information above. Also generate the relevant keys for the reasoning and the distilled knowledge that follows. For those keys, in particular for knowledge, explain in the context of the specific scenario to build up domain knowledge in the specific field rather than genearl knowledge.
19+
{% endif %}
20+
{% if RAG %}To help you generate new {{targets}}, we have prepared the following information for you:
21+
{{ RAG }}{% endif %}
22+
Please generate the new hypothesis based on the information above. Also generate the relevant keys for the reasoning and the distilled knowledge that follows. For those keys, in particular for knowledge, explain in the context of the specific scenario to build up domain knowledge in the specific field rather than general knowledge.
1923
2024
hypothesis2experiment:
2125
system_prompt: |-

rdagent/scenarios/data_mining/proposal/model_proposal.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,15 @@ def convert_response(self, response: str, trace: Trace) -> ModelExperiment:
9898
hyperparameters = response_dict[model_name]["hyperparameters"]
9999
model_type = response_dict[model_name]["model_type"]
100100
tasks.append(
101-
ModelTask(model_name, description, formulation, architecture, variables, hyperparameters, model_type)
101+
ModelTask(
102+
name=model_name,
103+
description=description,
104+
formulation=formulation,
105+
architecture=architecture,
106+
variables=variables,
107+
hyperparameters=hyperparameters,
108+
model_type=model_type,
109+
)
102110
)
103111
exp = DMModelExperiment(tasks)
104112
exp.based_experiments = [t[1] for t in trace.hist if t[2]]

rdagent/scenarios/kaggle/developer/feedback.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
DIRNAME = Path(__file__).absolute().resolve().parent
2020

2121

22-
class KGModelHypothesisExperiment2Feedback(HypothesisExperiment2Feedback):
22+
class KGHypothesisExperiment2Feedback(HypothesisExperiment2Feedback):
2323
"""Generate feedback on the hypothesis from **executed** implementations of different tasks and their comparison with previous performance"""
2424

2525
def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trace) -> HypothesisFeedback:

rdagent/scenarios/kaggle/experiment/meta_tpl/model_rf.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,19 +9,21 @@
99
from sklearn.ensemble import RandomForestClassifier
1010
from sklearn.metrics import accuracy_score
1111

12-
def select(X):
12+
13+
def select(X: pd.DataFrame) -> pd.DataFrame:
1314
"""
1415
Select relevant features. To be used in fit & predict function.
1516
"""
1617
# For now, we assume all features are relevant. This can be expanded to feature selection logic.
1718
return X
1819

20+
1921
def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
2022
"""
2123
Define and train the Random Forest model. Merge feature selection into the pipeline.
2224
"""
2325
# Initialize the Random Forest model
24-
model = RandomForestClassifier(n_estimators=100, random_state=32)
26+
model = RandomForestClassifier(n_estimators=100, random_state=32)
2527

2628
# Select features (if any feature selection is needed)
2729
X_train_selected = select(X_train)
@@ -37,15 +39,16 @@ def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_vali
3739

3840
return model
3941

42+
4043
def predict(model, X):
4144
"""
4245
Keep feature selection's consistency and make predictions.
4346
"""
4447
# Select features (if any feature selection is needed)
4548
X_selected = select(X)
46-
49+
4750
# Predict using the trained model
4851
y_pred_prob = model.predict_proba(X_selected)[:, 1]
49-
52+
5053
# Apply threshold to get boolean predictions
51-
return y_pred_prob > 0.5
54+
return y_pred_prob > 0.5

0 commit comments

Comments
 (0)