Skip to content

Commit 866c2e6

Browse files
authored
fix: refine some codes (#353)
* refine some codes * fix ci errors * update * update advanced rag
1 parent b8b2cd6 commit 866c2e6

8 files changed

Lines changed: 80 additions & 39 deletions

File tree

rdagent/app/kaggle/conf.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,12 +51,18 @@ class Config:
5151

5252
local_data_path: str = "/data/userdata/share/kaggle"
5353

54+
domain_knowledge_path: str = "/data/userdata/share/kaggle/domain_knowledge"
55+
5456
rag_path: str = "git_ignore_folder/rag"
5557

5658
if_action_choosing_based_on_UCB: bool = False
5759

5860
if_using_feature_selection: bool = False
5961

62+
if_using_graph_rag: bool = False
63+
64+
if_using_vector_rag: bool = False
65+
6066
auto_submit: bool = True
6167

6268

rdagent/scenarios/kaggle/developer/feedback.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,6 @@
1414
)
1515
from rdagent.log import rdagent_logger as logger
1616
from rdagent.oai.llm_utils import APIBackend
17-
from rdagent.scenarios.kaggle.knowledge_management.extract_knowledge import (
18-
extract_knowledge_from_feedback,
19-
)
2017
from rdagent.utils import convert2bool
2118

2219
prompt_dict = Prompts(file_path=Path(__file__).parent.parent / "prompts.yaml")
@@ -42,11 +39,10 @@ def process_results(self, current_result, sota_result):
4239

4340
# Add a note about metric direction
4441
evaluation_direction = "higher" if self.scen.evaluation_metric_direction else "lower"
45-
combined_df[
46-
"Note"
47-
] = f"Direction of improvement (higher/lower is better) should be judged per metric. Here '{evaluation_direction}' is better for the metrics."
42+
evaluation_description = f"Direction of improvement (higher/lower is better) should be judged per metric. Here '{evaluation_direction}' is better for the metrics."
43+
combined_df["Note"] = evaluation_description
4844

49-
return combined_df
45+
return combined_df, evaluation_description
5046

5147
def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trace) -> HypothesisFeedback:
5248
"""
@@ -75,14 +71,17 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac
7571
except AttributeError:
7672
print(f"Warning: Task {task} does not have get_task_information_and_implementation_result method")
7773

74+
evaluation_description = None
7875
# Check if there are any based experiments
7976
if exp.based_experiments:
8077
sota_result = exp.based_experiments[-1].result
8178
# Process the results to filter important metrics
82-
combined_result = self.process_results(current_result, sota_result)
79+
combined_result, evaluation_description = self.process_results(current_result, sota_result)
8380
else:
8481
# If there are no based experiments, we'll only use the current result
85-
combined_result = self.process_results(current_result, current_result) # Compare with itself
82+
combined_result, evaluation_description = self.process_results(
83+
current_result, current_result
84+
) # Compare with itself
8685
print("Warning: No previous experiments to compare against. Using current result as baseline.")
8786

8887
available_features = {
@@ -129,6 +128,7 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac
129128
"combined_result": combined_result, # This turn and sota
130129
"hypothesis_text": hypothesis_text, # This turn
131130
"task_details": tasks_factors, # This turn
131+
"evaluation_description": evaluation_description,
132132
}
133133

134134
usr_prompt = (
@@ -152,13 +152,17 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac
152152
experiment_feedback = {
153153
"hypothesis_text": hypothesis_text,
154154
"current_result": current_result,
155-
"tasks_factors": tasks_factors,
155+
"model_code": model_code,
156+
"available_features": available_features,
156157
"observations": observations,
157158
"hypothesis_evaluation": hypothesis_evaluation,
158159
"reason": reason,
159160
}
160161

161-
# self.scen.vector_base.add_experience_to_vector_base(experiment_feedback)
162+
if self.scen.if_using_vector_rag:
163+
self.scen.vector_base.add_experience_to_vector_base(experiment_feedback)
164+
elif self.scen.if_using_graph_rag:
165+
self.scen.trace.knowledge_base.load_from_documents([experiment_feedback], self.scen)
162166

163167
return HypothesisFeedback(
164168
observations=observations,

rdagent/scenarios/kaggle/experiment/scenario.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,16 @@ def __init__(self, competition: str) -> None:
3535
self.model_output_channel = None
3636
self.evaluation_desc = None
3737
self.evaluation_metric_direction = None
38+
self.vector_base = None
3839
self._analysis_competition_description()
3940
self.if_action_choosing_based_on_UCB = KAGGLE_IMPLEMENT_SETTING.if_action_choosing_based_on_UCB
4041
self.if_using_feature_selection = KAGGLE_IMPLEMENT_SETTING.if_using_feature_selection
42+
self.if_using_graph_rag = KAGGLE_IMPLEMENT_SETTING.if_using_graph_rag
43+
self.if_using_vector_rag = KAGGLE_IMPLEMENT_SETTING.if_using_vector_rag
44+
45+
if self.if_using_vector_rag and KAGGLE_IMPLEMENT_SETTING.rag_path:
46+
self.vector_base = KaggleExperienceBase()
47+
self.vector_base.load(KAGGLE_IMPLEMENT_SETTING.rag_path)
4148

4249
self._output_format = self.output_format
4350
self._interface = self.interface
@@ -124,6 +131,9 @@ def source_data(self) -> str:
124131

125132
if (data_folder / "X_valid.pkl").exists():
126133
X_valid = pd.read_pickle(data_folder / "X_valid.pkl")
134+
# TODO: Hardcoded for now; needs to be fixed
135+
if self.competition == "feedback-prize-english-language-learning":
136+
return "This is a sparse matrix of descriptive text."
127137
buffer = io.StringIO()
128138
X_valid.info(verbose=True, buf=buffer, show_counts=True)
129139
data_info = buffer.getvalue()
@@ -187,7 +197,7 @@ def simulator(self) -> str:
187197
@property
188198
def rich_style_description(self) -> str:
189199
return f"""
190-
This is the Kaggle scenario for the competition: {KAGGLE_IMPLEMENT_SETTING.competition}
200+
This is the Kaggle scenario for the competition: {self.competition}
191201
"""
192202

193203
def get_scenario_all_desc(self) -> str:

rdagent/scenarios/kaggle/knowledge_management/graph.py

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import json
2+
from datetime import datetime, timezone
23
from pathlib import Path
34
from typing import List
45

@@ -20,21 +21,31 @@
2021

2122

2223
class KGKnowledgeGraph(UndirectedGraph):
23-
def __init__(self, path: str | Path | None, scenario: KGScenario) -> None:
24+
def __init__(self, path: str | Path | None, scenario: KGScenario | None) -> None:
2425
super().__init__(path)
25-
if path is not None and not Path(path).exists():
26+
if path is not None and Path(path).exists():
27+
self.load()
28+
self.path = Path(path).parent / (
29+
datetime.now(timezone.utc).strftime("%Y-%m-%d-%H-%M-%S") + "_kaggle_kb.pkl"
30+
)
31+
else:
2632
documents = []
27-
for file_path in (Path(KAGGLE_IMPLEMENT_SETTING.local_data_path) / "domain_knowledge").glob("*.case"):
33+
print(Path(KAGGLE_IMPLEMENT_SETTING.domain_knowledge_path))
34+
for file_path in (Path(KAGGLE_IMPLEMENT_SETTING.domain_knowledge_path)).rglob("*.case"):
2835
with open(file_path, "r") as f:
2936
documents.append(f.read())
3037
self.load_from_documents(documents=documents, scenario=scenario)
3138
self.dump()
3239

33-
def analyze_one_document(self, document_content: str, scenario: KGScenario) -> list:
40+
def add_document(self, document_content: str, scenario: KGScenario | None) -> None:
41+
self.load_from_documents([document_content], scenario)
42+
self.dump() # Each valid experiment will overwrite this file once again.
43+
44+
def analyze_one_document(self, document_content: str, scenario: KGScenario | None) -> list:
3445
session_system_prompt = (
3546
Environment(undefined=StrictUndefined)
3647
.from_string(PROMPT_DICT["extract_knowledge_graph_from_document"]["system"])
37-
.render(scenario=scenario.get_scenario_all_desc())
48+
.render(scenario=scenario.get_scenario_all_desc() if scenario is not None else "")
3849
)
3950

4051
session = APIBackend().build_chat_session(
@@ -53,7 +64,7 @@ def analyze_one_document(self, document_content: str, scenario: KGScenario) -> l
5364
user_prompt = "Continue from the last step please. Don't extract the same knowledge again."
5465
return knowledge_list
5566

56-
def load_from_documents(self, documents: List[str], scenario: KGScenario):
67+
def load_from_documents(self, documents: List[str], scenario: KGScenario | None) -> None:
5768
knowledge_list_list = multiprocessing_wrapper(
5869
[
5970
(
@@ -105,3 +116,7 @@ def load_from_documents(self, documents: List[str], scenario: KGScenario):
105116
node_list = self.batch_embedding(node_list)
106117
for node_pair in node_pairs:
107118
self.add_node(node_pair[0], node_pair[1])
119+
120+
121+
if __name__ == "__main__":
122+
graph = KGKnowledgeGraph(path="git_ignore_folder/kg_graph.pkl", scenario=None)

rdagent/scenarios/kaggle/knowledge_management/prompts.yaml

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -41,11 +41,14 @@ extract_kaggle_knowledge_from_feedback_prompts:
4141
4242
extract_knowledge_graph_from_document:
4343
system: |-
44-
You are helping user to extract knowledge from a document.
45-
The user is working on data science competitions in Kaggle in the following scenario:
46-
{{ scenario }}
44+
You are helping the user extract knowledge from a document.
45+
{% if scenario %}
46+
The user is working on data science competitions in Kaggle, with the following scenario: {{ scenario }}
47+
{% else %}
48+
The user is working on general data science competitions on Kaggle.
49+
{% endif %}
4750
48-
The user has found some possible high value documents from other experts, and they need your help to extract some knowledge from these documents.
51+
The user has identified valuable documents from other experts and requires your help to extract meaningful insights from them.
4952
5053
Considering each document might contain several valuable insights, you need to extract them one by one and organize them in a structured format.
5154
@@ -58,13 +61,13 @@ extract_knowledge_graph_from_document:
5861
5962
Please provide the analysis in the following JSON format:
6063
{
61-
"competition": "(Plain text) extracted competition information, including the competition name, type, description, target, and features",
64+
"competition": "(Plain text) extracted competition information, including the competition name, type, description, target, and features (If no specific competition name or other fields are found, leave them blank).",
6265
"hypothesis":
6366
{
6467
"type": "one of the hypothesis types from ['Feature engineering', 'Feature processing', 'Model feature selection', 'Model tuning']",
6568
"explanation": "(Plain text) extracted detailed explanation to the hypothesis"
6669
},
67-
"experiments": "(Plain text) extracted experiments details. You can list them in bullet points.",
70+
"experiments": "(Plain text) Detailed descriptions of the experiments conducted in the document, which can be listed in bullet points.",
6871
"code": "extracted code snippets if available",
6972
"conclusion":
7073
{

rdagent/scenarios/kaggle/knowledge_management/vector_base.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from datetime import datetime, timezone
12
from pathlib import Path
23
from typing import List, Union
34

@@ -107,7 +108,7 @@ class KaggleExperienceBase(PDVectorBase):
107108
Class for handling Kaggle competition experience posts and organizing them for reference
108109
"""
109110

110-
def __init__(self, path: Union[str, Path] = None, kaggle_experience_path: Union[str, Path] = None):
111+
def __init__(self, vector_df_path: Union[str, Path] = None, kaggle_experience_path: Union[str, Path] = None):
111112
"""
112113
Initialize the KaggleExperienceBase class
113114
@@ -118,12 +119,14 @@ def __init__(self, path: Union[str, Path] = None, kaggle_experience_path: Union[
118119
kaggle_experience_path: str or Path, optional
119120
Path to the Kaggle experience post data.
120121
"""
121-
super().__init__(path)
122+
super().__init__(vector_df_path)
122123
self.kaggle_experience_path = kaggle_experience_path
123124
self.kaggle_experience_data = []
124-
125-
if kaggle_experience_path:
126-
self.load_kaggle_experience(kaggle_experience_path)
125+
# if path is not None and Path(path).exists():
126+
# self.load_kaggle_experience(kaggle_experience_path)
127+
# self.path = Path(path).parent / (datetime.now(timezone.utc).strftime("%Y-%m-%d-%H-%M-%S") + "_kaggle_kb.pkl")
128+
# else:
129+
# pass
127130

128131
def add(self, document: Union[KGDocument, List[KGDocument]]):
129132
document.split_into_trunk()
@@ -258,7 +261,7 @@ def search_experience(self, query: str, topk_k: int = 5, similarity_threshold: f
258261

259262
kaggle_base.add_experience_to_vector_base()
260263

261-
kaggle_base.save()
264+
kaggle_base.save("git_ignore_folder/experience/tabular_cases/kaggle_vector_base.pkl")
262265

263266
print(f"There are {kaggle_base.shape()[0]} records in the vector base.")
264267

rdagent/scenarios/kaggle/prompts.yaml

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -173,8 +173,8 @@ model_tuning_feedback_generation:
173173
{{ combined_result }}
174174
175175
Analyze the combined result in the context of its ability to:
176-
1. Hypothesis Evaluation: Does the result support or refute the hypothesis?
177-
2. Result Comparison: How does the result compare to the best? (Refer to "higher is better" or "lower is better" in the combined result).
176+
1. Result Comparison: How does the result compare to the best? {{ evaluation_description }}
177+
2. To a large extent, the experiment with better metrics is the better one.
178178
179179
Consider Changing Direction for Significant Gaps with the Best Result and the last round:
180180
- If the new results significantly differ from SOTA, consider a new direction.
@@ -234,8 +234,8 @@ factor_feedback_generation:
234234
{{ combined_result }}
235235
236236
Analyze the combined result in the context of its ability to:
237-
1. Hypothesis Evaluation: Does the result support or refute the hypothesis?
238-
2. Result Comparison: How does the result compare to the best? (Refer to "higher is better" or "lower is better" in the combined result).
237+
1. Result Comparison: How does the result compare to the best? {{ evaluation_description }}
238+
2. To a large extent, the experiment with better metrics is the better one.
239239
240240
Consider Changing Direction for Significant Gaps with the Best Result:
241241
- If the new results significantly differ from the best, consider exploring a new direction.
@@ -282,8 +282,8 @@ feature_selection_feedback_generation:
282282
{{ combined_result }}
283283
284284
Analyze the combined result in the context of its ability to:
285-
1. Hypothesis Evaluation: Does the result support or refute the hypothesis?
286-
2. Result Comparison: How does the result compare to the best? (Refer to "higher is better" or "lower is better" in the combined result).
285+
1. Result Comparison: How does the result compare to the best? {{ evaluation_description }}
286+
2. To a large extent, the experiment with better metrics is the better one.
287287
288288
In your feedback, consider:
289289
1. How effective is the current feature selection strategy?

rdagent/scenarios/kaggle/proposal/proposal.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,13 +29,13 @@
2929
prompt_dict = Prompts(file_path=Path(__file__).parent.parent / "prompts.yaml")
3030

3131

32-
KG_ACTION_FEATURE_ENGINEERING = "Feature engineering"
3332
KG_ACTION_FEATURE_PROCESSING = "Feature processing"
33+
KG_ACTION_FEATURE_ENGINEERING = "Feature engineering"
3434
KG_ACTION_MODEL_FEATURE_SELECTION = "Model feature selection"
3535
KG_ACTION_MODEL_TUNING = "Model tuning"
3636
KG_ACTION_LIST = [
37-
KG_ACTION_FEATURE_ENGINEERING,
3837
KG_ACTION_FEATURE_PROCESSING,
38+
KG_ACTION_FEATURE_ENGINEERING,
3939
*([KG_ACTION_MODEL_FEATURE_SELECTION] if KAGGLE_IMPLEMENT_SETTING.if_using_feature_selection else []),
4040
KG_ACTION_MODEL_TUNING,
4141
]
@@ -94,7 +94,7 @@ def __init__(self, scen: Scenario) -> Tuple[dict, bool]:
9494
self.initial_performance = 0.0
9595

9696
def generate_RAG_content(self, trace: Trace) -> str:
97-
if trace.knowledge_base is None:
97+
if self.scen.if_using_graph_rag is False or trace.knowledge_base is None:
9898
return None
9999
same_competition_node = trace.knowledge_base.get_node_by_content(trace.scen.get_competition_full_desc())
100100
if same_competition_node is not None:

0 commit comments

Comments
 (0)