From f936b8f9722a04ac53d993cae3b21ddddfd92dd4 Mon Sep 17 00:00:00 2001 From: Amrutha Varshini R Date: Thu, 1 May 2025 22:01:37 +0530 Subject: [PATCH 1/5] added pattern design code --- .DS_Store | Bin 0 -> 6148 bytes main.py | 8 +- results/.DS_Store | Bin 0 -> 6148 bytes scripts/pattern_design.py | 156 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 161 insertions(+), 3 deletions(-) create mode 100644 .DS_Store create mode 100644 results/.DS_Store create mode 100644 scripts/pattern_design.py diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..a5b59e1574f6f0e349039463802f8e5782266d56 GIT binary patch literal 6148 zcmeHK%}T>S5T0$TO({YTDjpZS7Hq|zf|n5M3mDOZN=-=7V9b^#wTDv3QD4YM@p+ut z-GJ4CHxWAnyWi~m>}Ed5{s4gRCQ%ol1pp2;WQeJL2G+9>$K%zZ{C)( z?!o?1TlTug^LfMB+TA-jA3P>csd_O)3gleNhQ$(IQTbBSvp-G~mENPL%qpW9nE_^i z8DIuhjRAWSIL%cnoa5qfpBOC1ivHOM_Pzzi%hP&LCAo&RU} z%Pf85FQ)K_8DIwf83Ur$_xm0eWzW`c<s1QO$c|)z%MZH0m}$Xh5!Hn literal 0 HcmV?d00001 diff --git a/main.py b/main.py index e9eef4c..9919d77 100644 --- a/main.py +++ b/main.py @@ -1,10 +1,12 @@ import os, argparse import pandas as pd - from insightbench.utils import agent_utils as au from insightbench import agents, benchmarks from insightbench.utils import exp_utils as eu from insightbench.utils.exp_utils import hash_dict, save_json +from dotenv import load_dotenv + +load_dotenv() def main(exp_dict, savedir, args): @@ -82,7 +84,7 @@ def main(exp_dict, savedir, args): parser.add_argument("-sb", "--savedir_base", type=str, default="results") parser.add_argument("-r", "--reset", type=int, default=0) # add openai api key - parser.add_argument("-o", "--openai_api_key", type=str, default=None) + # parser.add_argument("-o", "--openai_api_key", type=str, default=None) # dataset path parser.add_argument("-d", "--datadir", type=str, default="data/notebooks") @@ -102,7 +104,7 @@ def main(exp_dict, savedir, args): ) # set open ai env - os.environ["OPENAI_API_KEY"] = args.openai_api_key + # os.environ["OPENAI_API_KEY"] = # Loop through experiments for exp_dict in exp_list: diff --git a/results/.DS_Store b/results/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..5008ddfcf53c02e82d7eee2e57c38e5672ef89f6 GIT binary patch literal 6148 zcmeH~Jr2S!425mzP>H1@V-^m;4Wg<&0T*E43hX&L&p$$qDprKhvt+--jT7}7np#A3 zem<@ulZcFPQ@L2!n>{z**++&mCkOWA81W14cNZlEfg7;MkzE(HCqgga^y>{tEnwC%0;vJ&^%eQ zLs35+`xjp>T0 str: + """Analyze the data and return a detailed summary of its structure.""" + summary = { + "num_rows": len(data), + "num_cols": len(data.columns), + "column_summaries": {}, + } + + for col in data.columns: + col_data = data[col] + col_summary = { + "dtype": str(col_data.dtype), + "num_missing": int(col_data.isnull().sum()), + "num_unique": int(col_data.nunique()), + "sample_values": col_data.dropna().unique()[:3].tolist(), + } + + if pd.api.types.is_numeric_dtype(col_data): + col_summary.update( + { + "mean": float(col_data.mean()), + "std": float(col_data.std()), + "min": float(col_data.min()), + "max": float(col_data.max()), + } + ) + elif pd.api.types.is_datetime64_any_dtype(col_data): + col_summary.update( + { + "min_date": str(col_data.min()), + "max_date": str(col_data.max()), + } + ) + elif pd.api.types.is_string_dtype(col_data): + col_summary.update( + {"top_frequent_values": col_data.value_counts().head(3).to_dict()} + ) + + summary["column_summaries"][col] = col_summary + + return json.dumps(summary, indent=2) + + def design_patterns(self, data: pd.DataFrame, task: str) -> Dict[str, 
List[Dict]]: + """Design patterns for each column based on the given analytics task.""" + data_summary = self.analyze_data(data) + + prompt = f""" +You are a data scientist creating a synthetic benchmark for a data analytics task. +You are given a summary of a dataset and an analytics task. +Your goal is to reason step-by-step and identify realistic patterns that can be injected into the data +to make it more suitable for evaluating the performance of models on this task. + +Please follow these steps for each column in the dataset: +1. Think about what kind of information this column conveys. +2. Consider how this column might affect or be related to the analytics task. +3. Suggest 1–2 *practical and realistic* patterns that could be injected into the column values. +4. Explain why injecting this pattern would help in evaluating the task. +5. Describe how the pattern would influence model learning or performance on the analytics task. + +Use a JSON output format with the following structure: + +{{ + "column_name_1": [ + {{ + "pattern": "description of the pattern", + "reasoning": "explanation of why this pattern is useful", + "relevance_to_task": "how this pattern helps with the task" + }}, + ... + ], + ... +}} + +Data Summary: +{data_summary} + +Analytics Task: +{task} +""" + + response = self.client.chat.completions.create( + model="gpt-4o", # Using GPT-4o (OpenAI Omni) + messages=[ + { + "role": "system", + "content": "You are a data pattern design expert. Your task is to suggest meaningful patterns that can be injected into data columns to help accomplish specific analytics tasks. Always respond with valid JSON.", + }, + {"role": "user", "content": prompt}, + ], + ) + raw_response = response.choices[0].message.content + # Strip triple backticks and optional 'json' tag + cleaned_json_str = re.sub(r"^```(?:json)?\n|\n```$", "", raw_response.strip()) + try: + return json.loads(cleaned_json_str) + except json.JSONDecodeError: + raise ValueError("Failed to parse LLM response as JSON") + + +def main(): + # Get API key from environment variable + designer = ( + PatternDesigner() + ) # Will automatically use OPENAI_API_KEY from environment + + # Sample DataFrame + data = pd.DataFrame( + { + "date": ["2023-01-01", "2023-01-02", "2023-01-03"], + "sales": [100, 150, 200], + "category": ["A", "B", "A"], + } + ) + + task = "Anomaly detection" + + try: + patterns = designer.design_patterns(data, task) + + print("\nSuggested Patterns for Each Column:") + for column, suggestions in patterns.items(): + print(f"\n{column}:") + for suggestion in suggestions: + print(f"\nPattern: {suggestion['pattern']}") + print(f"Reasoning: {suggestion['reasoning']}") + print(f"Relevance to task: {suggestion['relevance_to_task']}") + print("-" * 80) + + except Exception as e: + print(f"Error: {e}") + + +if __name__ == "__main__": + main() From 10ff39e7a05c3e2f9ced76e9cc8fca1e6de8983a Mon Sep 17 00:00:00 2001 From: Amrutha Varshini R Date: Tue, 6 May 2025 08:40:42 +0530 Subject: [PATCH 2/5] pattern generation with KPI --- .DS_Store | Bin 6148 -> 6148 bytes insightbench/agents.py | 80 +++++++++++++++++- insightbench/utils/domains_tasks.json | 28 +++++++ results/.DS_Store | Bin 6148 -> 6148 bytes scripts/pattern_design.py | 113 +++++++++++++++++--------- 5 files changed, 180 insertions(+), 41 deletions(-) create mode 100644 insightbench/utils/domains_tasks.json diff --git a/.DS_Store b/.DS_Store index a5b59e1574f6f0e349039463802f8e5782266d56..d47b2f01edca37469440f84dc1189411a2eb0398 100644 GIT binary patch delta 20 
bcmZoMXffDe&B(|;*@iKikz?}=#vCyKJK+Ui delta 20 bcmZoMXffDe&B(|x*@iKik#qA5#vCyKJM#r$ diff --git a/insightbench/agents.py b/insightbench/agents.py index 494afb7..fd5cd06 100644 --- a/insightbench/agents.py +++ b/insightbench/agents.py @@ -10,6 +10,9 @@ from insightbench.utils.metrics_utils import score_insight from insightbench import metrics from PIL import Image +import pandas as pd +from typing import Dict, List, Optional +from scripts.pattern_design import PatternDesigner class Agent: @@ -167,7 +170,7 @@ def __init__( goal="I want to find interesting trends in this dataset", verbose=False, temperature=0, - n_retries=2 + n_retries=2, ): self.goal = goal if savedir is None: @@ -482,3 +485,78 @@ def save_state_dict(self, fname): def load_state_dict(self, fname): with open(fname, "r") as f: self.insights_history = json.load(f) + + +class AgentDataGen: + def __init__( + self, + api_key: Optional[str] = None, + tasks_path: str = "insightbench/utils/domains_tasks.json", + ): + """Initialize the AgentDataGen with OpenAI API key and tasks path. + + Args: + api_key: OpenAI API key. If not provided, will try to get from OPENAI_API_KEY environment variable. + tasks_path: Path to the domains_tasks.json file + """ + self.pattern_designer = PatternDesigner(api_key) + self.tasks_path = tasks_path + self.tasks = self._load_tasks() + + def _load_tasks(self) -> dict: + """Load tasks from domains_tasks.json.""" + try: + with open(self.tasks_path, "r") as f: + return json.load(f) + except Exception as e: + raise ValueError(f"Failed to load tasks from {self.tasks_path}: {str(e)}") + + def generate_patterns(self, data: pd.DataFrame, task: str) -> Dict[str, List[Dict]]: + """Generate patterns for the given data and task. + + Args: + data: Input DataFrame containing the data to analyze + task: Description of the analytics task + + Returns: + Dictionary mapping column names to lists of pattern suggestions + """ + return self.pattern_designer.design_patterns(data, task) + + def generate_all_patterns( + self, data: pd.DataFrame, output_dir: str = "results/Patterns" + ) -> None: + """Generate patterns for all tasks and save them to the output directory. 
+ + Args: + data: Input DataFrame containing the data to analyze + output_dir: Directory to save the generated patterns + """ + os.makedirs(output_dir, exist_ok=True) + + for domain, domain_tasks in self.tasks.items(): + print(f"\nProcessing domain: {domain}") + + # Create domain directory + domain_dir = os.path.join(output_dir, domain) + os.makedirs(domain_dir, exist_ok=True) + + for task in domain_tasks: + print(f"\nGenerating patterns for task: {task}") + + try: + # Generate patterns + patterns = self.generate_patterns(data, task) + + # Save patterns to file + task_filename = task.lower().replace(" ", "_") + "_patterns.json" + output_path = os.path.join(domain_dir, task_filename) + + with open(output_path, "w") as f: + json.dump(patterns, f, indent=2) + + print(f"Saved patterns to: {output_path}") + + except Exception as e: + print(f"Error generating patterns for task '{task}': {str(e)}") + continue diff --git a/insightbench/utils/domains_tasks.json b/insightbench/utils/domains_tasks.json new file mode 100644 index 0000000..f0f3eb9 --- /dev/null +++ b/insightbench/utils/domains_tasks.json @@ -0,0 +1,28 @@ +{ + "tasks": [ + "Fraud Detection", + "Recommendation Systems", + "Churn Analysis", + "Customer Segmentation", + "Network Analysis", + "Association Rule Mining", + "Dashboard Summary", + "Predictive Maintenance", + "Cohort Analysis", + "Attribution Modeling", + "Anomaly Detection", + "Feature Importance Ranking", + "Geospatial Analysis", + "Causality", + "Logs Clustering", + "Time Series Decomposition", + "Principal Component Analysis", + "Correlation Analysis", + "Knowledge Base", + "Multi-table Search", + "Huge Table Analysis", + "Topic Modeling", + "Market Analysis", + "Data Imputation" + ] +} \ No newline at end of file diff --git a/results/.DS_Store b/results/.DS_Store index 5008ddfcf53c02e82d7eee2e57c38e5672ef89f6..80d25ef1bdfa2e702163384519b7818faac0e565 100644 GIT binary patch delta 226 zcmZoMXfc=|#>B`mF;Q%yo}wrV0|Nsi1A_nqLn=cNLmoqMPP$?6#Kh(GAPEkJ0ER?{ z5+Fp9OwP|O0LsAthML@b7nh`*{3M_PjumTVjQ&16<_Kb=o0XfNf=8VY!qS2a!(fo5 r8xynHCL4&bZ06?R<^cL&W8!z_$^0UUtV}@T6($>s2ygZf*~1I~X52L? delta 70 zcmZoMXfc=|#>AjHu~2NHo+1YW5HK<@2y9-+n8vnw1EUw?W_AvK4xj>{$am(+{342+ UKzW7)kiy9(Jj$D6L{=~Z04H+~3jhEB diff --git a/scripts/pattern_design.py b/scripts/pattern_design.py index 2603cc0..a0ee30b 100644 --- a/scripts/pattern_design.py +++ b/scripts/pattern_design.py @@ -67,38 +67,67 @@ def design_patterns(self, data: pd.DataFrame, task: str) -> Dict[str, List[Dict] data_summary = self.analyze_data(data) prompt = f""" -You are a data scientist creating a synthetic benchmark for a data analytics task. -You are given a summary of a dataset and an analytics task. -Your goal is to reason step-by-step and identify realistic patterns that can be injected into the data -to make it more suitable for evaluating the performance of models on this task. - -Please follow these steps for each column in the dataset: -1. Think about what kind of information this column conveys. -2. Consider how this column might affect or be related to the analytics task. -3. Suggest 1–2 *practical and realistic* patterns that could be injected into the column values. -4. Explain why injecting this pattern would help in evaluating the task. -5. Describe how the pattern would influence model learning or performance on the analytics task. 
- -Use a JSON output format with the following structure: - -{{ - "column_name_1": [ - {{ - "pattern": "description of the pattern", - "reasoning": "explanation of why this pattern is useful", - "relevance_to_task": "how this pattern helps with the task" - }}, - ... - ], - ... -}} - -Data Summary: -{data_summary} - -Analytics Task: -{task} -""" + You are a data-centric AI expert designing synthetic data benchmarks to evaluate analytics models. + + Given a dataset summary and an analytics task, your job is to inject **2–3 realistic patterns across one or more columns** that: + - Mimic real-world behaviors or anomalies + - Interact with the dataset's structure and semantics + - Meaningfully impact model performance or insight extraction + - Allow for robust benchmarking of analytical reasoning + + --- + + Please follow these explicit steps in your reasoning (Chain-of-Thought): + + ### Step 1: Infer Key Performance Indicators (KPIs) + - Based on the dataset and task, identify 2–4 relevant KPIs that would be tracked by an analyst or model. + + ### Step 2: Identify Influential Columns and Relationships + - Which columns most influence these KPIs? + - Are there any natural correlations, temporal dynamics, or category-based splits that could affect KPI computation? + + ### Step 3: Design 2–3 Global Patterns + - Each pattern may involve **1 or more columns**, and should simulate a **plausible real-world event, behavior, or trend**. + - Avoid trivial noise (e.g., "random fluctuation"). Prefer **interpretable and benchmark-worthy** signals like: + - delayed effects + - conditionally induced trends + - cross-feature dependencies + - regime shifts + - temporal or category-driven anomalies + + ### Step 4: Explain for Each Pattern: + - What exactly is the injected pattern? + - Why is it useful from a benchmarking or insight perspective? + - Which KPIs does it affect, and how? + - What kind of analytical or modeling challenges does it test? + + --- + + ### Output format (JSON): + + {{ + "kpis": ["list of important KPIs"], + "patterns": [ + {{ + "pattern": "Description of the injected pattern", + "columns_involved": ["list of columns affected"], + "reasoning": "Why this pattern is meaningful and realistic", + "relevance_to_kpi": "Which KPIs it affects and how", + "benchmark_value": "What kind of insight or model evaluation this pattern enables" + }}, + ... 
+ ] + }} + + --- + + ### Data Summary: + {data_summary} + + ### Analytics Task: + {task} + + """ response = self.client.chat.completions.create( model="gpt-4o", # Using GPT-4o (OpenAI Omni) @@ -139,14 +168,18 @@ def main(): try: patterns = designer.design_patterns(data, task) - print("\nSuggested Patterns for Each Column:") - for column, suggestions in patterns.items(): - print(f"\n{column}:") - for suggestion in suggestions: - print(f"\nPattern: {suggestion['pattern']}") - print(f"Reasoning: {suggestion['reasoning']}") - print(f"Relevance to task: {suggestion['relevance_to_task']}") - print("-" * 80) + print("\nKey Performance Indicators (KPIs):") + for kpi in patterns.get("kpis", []): + print(f"- {kpi}") + + print("\nSuggested Patterns:") + for pattern in patterns.get("patterns", []): + print(f"\nPattern: {pattern['pattern']}") + print(f"Columns Involved: {', '.join(pattern['columns_involved'])}") + print(f"Reasoning: {pattern['reasoning']}") + print(f"Relevance to KPI: {pattern['relevance_to_kpi']}") + print(f"Benchmark Value: {pattern['benchmark_value']}") + print("-" * 80) except Exception as e: print(f"Error: {e}") From a88b0cb612b8d40bfc1fce5762ee31fe033771c9 Mon Sep 17 00:00:00 2001 From: Amirhossein Abaskohi Date: Tue, 6 May 2025 08:16:41 -0700 Subject: [PATCH 3/5] Added pattern inject. --- scripts/pattern_inject.py | 187 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 187 insertions(+) create mode 100644 scripts/pattern_inject.py diff --git a/scripts/pattern_inject.py b/scripts/pattern_inject.py new file mode 100644 index 0000000..0e88bbe --- /dev/null +++ b/scripts/pattern_inject.py @@ -0,0 +1,187 @@ +from openai import OpenAI +import json +import os +import uuid +import shutil +import re +import subprocess + +class PatternInjector: + def __init__(self, api_key: str = None): + """Initialize the PatternInjector with OpenAI API key. + + Args: + api_key: OpenAI API key. If not provided, will try to get from OPENAI_API_KEY environment variable. + """ + if api_key is None: + api_key = os.getenv("OPENAI_API_KEY") + if api_key is None: + raise ValueError( + "OpenAI API key not provided and OPENAI_API_KEY environment variable not set" + ) + self.client = OpenAI(api_key=api_key) + + def get_inject_codes(self, patterns: str) -> dict: + """Get the code to inject the pattern into the data. + + Args: + patterns: The patterns to inject. It is a json file with the following format: + { + "column_name_1": [ + { + "pattern": "description of the pattern", + "reasoning": "explanation of why this pattern is useful", + "relevance_to_task": "how this pattern helps with the task" + }, + ... + ], + ... + } + + Returns: + The code to inject the pattern into the data. + """ + + print("Started getting inject codes ...") + + patterns = json.loads(patterns) + + output = {} + + for column, pattern in patterns.items(): + + prompt = f""" + You are given a pandas DataFrame named `df` that contains a column called `{column}`. + + Your task is to write a Python function that analyzes the column based on specific patterns and data quality or relevance concerns. + + The function should: + - Be named `modify_{column}` + - Be a standalone function + - Take `df` as its only argument + - Implement logic that addresses all of the following patterns (described below) + - Use only standard libraries or common ones such as `pandas`, `numpy`, `re`, etc. 
+ - Not use any complex or uncommon third-party libraries + + This is the signature of the function: + ```python + def modify_{column}(df: pd.DataFrame) -> pd.DataFrame: + ``` + + Here are the patterns you must handle: + {pattern} + + Output requirements: + - Only include the necessary `import` statements and the function definition. + - Do not include any explanation or comments. + - Only generate one function with all logic embedded. + - Do not include usage examples or extra text. + - Function should have a return statement that returns the modified DataFrame. + + This is an example of the expected output for a column called `age`: + ```python + import numpy as np + # And importing any other necessary libraries + + def modify_age(df: pd.DataFrame) -> pd.DataFrame: + # Example logic to handle the patterns + df['age'] = df['age'].apply(lambda x: np.nan if x < 0 else x) + df['age'] = df['age'].fillna(df['age'].mean()) + return df + ``` + + IMPORTANT NOTES: + - The function should be valid Python code and should not include any comments or explanations. + - The function should be self-contained and not rely on any external context or variables. + - The function should be able to handle the patterns described above and return a modified DataFrame. + - There should be no additional code except for the function definition and necessary imports. You should not write and include other functions or call this functions (not even writing a main function). + - The function should not include any print statements or logging. + + Please return only the code (imports + one function) in python environment. Nothing else. + """ + + response = self.client.chat.completions.create( + model="gpt-4o", # Using GPT-4o (OpenAI Omni) + messages=[ + { + "role": "system", + "content": "You are a data pattern injection coding expert. Your task is to write correct and simple codes to inject the given patterns to help accomplish specific analytics tasks. Always respond with valid Python Code.", + }, + {"role": "user", "content": prompt}, + ], + ) + + output[column] = response.choices[0].text.strip() + + print(f"Finished getting inject codes for column: {column}") + + print("Finished getting inject codes for all columns.") + + return output + + def inject_patterns(self, data_file_addr:str, pattern_codes: dict): + """Inject the patterns into the data. + + Args: + pattern_codes: The pattern codes to inject. It is a dictionary with the following format: + { + "column_name_1": "code to inject the pattern", + ... + } + data_file_addr: Address to the original CSV data file. + + Returns: + Nothing. The function creates a temp folder with modified CSV and scripts. 
+ """ + + print("Started injecting patterns ...") + + # Step 1: Create temp directory + temp_dir = f"temp_{uuid.uuid4().hex}" + os.makedirs(temp_dir, exist_ok=True) + + # Step 2: Copy the original CSV + filename = os.path.basename(data_file_addr) + temp_csv_path = os.path.join(temp_dir, filename) + shutil.copy2(data_file_addr, temp_csv_path) + + # Step 3: Create Python scripts for each column + for column, raw_code in pattern_codes.items(): + print(f"Injecting pattern for column: {column}") + + match = re.search(r"```python(.*?)```", raw_code, re.DOTALL) + code = match.group(1).strip() if match else raw_code.strip() + + code = re.sub(r"^\s*import\s+pandas\s+as\s+pd\s*\n?", "", code, flags=re.MULTILINE) + + func_name = f"modify_{column}" + + final_code = "import pandas as pd\n" + code.strip() + "\n" + final_code += """if __name__ == "__main__":\n""" + final_code += f""" df = pd.read_csv("{filename}")\n""" + final_code += f""" df = {func_name}(df)")\n""" + final_code += f""" df.to_csv("{filename}", index=False)""" + + script_path = os.path.join(temp_dir, f"{func_name}.py") + with open(script_path, "w") as f: + f.write(final_code) + + print(f"Created script for column: {column} at {script_path}") + + # Step 4: Run the script + subprocess.run(["python", script_path], check=True, cwd=temp_dir) + + print(f"Injected pattern for column: {column}") + + print("Finished injecting patterns for all columns.") + + # Step 5: Copy modified CSV to original directory + original_dir = os.path.dirname(data_file_addr) + injected_filename = os.path.splitext(filename)[0] + "_injected.csv" + injected_path = os.path.join(original_dir, injected_filename) + shutil.copy2(temp_csv_path, injected_path) + + # Step 6: Clean up + shutil.rmtree(temp_dir) + + print(f"Injected CSV saved to: {injected_path}") \ No newline at end of file From 721598949d1a73751d372cf2f08c2de6218690b2 Mon Sep 17 00:00:00 2001 From: Amirhossein Abaskohi Date: Tue, 6 May 2025 19:07:21 -0700 Subject: [PATCH 4/5] Updated pattern inject. --- scripts/pattern_inject.py | 150 ++++++++++++++++++++++---------------- 1 file changed, 89 insertions(+), 61 deletions(-) diff --git a/scripts/pattern_inject.py b/scripts/pattern_inject.py index 0e88bbe..65e2700 100644 --- a/scripts/pattern_inject.py +++ b/scripts/pattern_inject.py @@ -1,10 +1,11 @@ from openai import OpenAI import json import os -import uuid import shutil import re import subprocess +import pandas as pd + class PatternInjector: def __init__(self, api_key: str = None): @@ -27,15 +28,17 @@ def get_inject_codes(self, patterns: str) -> dict: Args: patterns: The patterns to inject. It is a json file with the following format: { - "column_name_1": [ + "kpis": [...], + "patterns": [ { "pattern": "description of the pattern", + "columns_involved": ["col1", "col2", ...], "reasoning": "explanation of why this pattern is useful", - "relevance_to_task": "how this pattern helps with the task" + "relevance_to_kpi": "how this pattern helps with the task", + "benchmark_value": "value to test against" }, ... - ], - ... 
+ ] } Returns: @@ -44,33 +47,44 @@ def get_inject_codes(self, patterns: str) -> dict: print("Started getting inject codes ...") - patterns = json.loads(patterns) - + patterns_dict = json.loads(patterns) + patterns_list = patterns_dict.get("patterns", []) output = {} - for column, pattern in patterns.items(): + for pattern_index, pattern_info in enumerate(patterns_list): + columns = pattern_info.get("columns_involved", []) + pattern_description = pattern_info.get("pattern", "") + reasoning = pattern_info.get("reasoning", "") + relevance = pattern_info.get("relevance_to_kpi", "") + + function_name = "modify_" + "_".join(columns) + columns_str = "" + for i, column in enumerate(columns): + columns_str += f"'{i+1}. {column}\n" prompt = f""" - You are given a pandas DataFrame named `df` that contains a column called `{column}`. + You are given a pandas DataFrame named `df` that contains the following columns: + + `{columns_str}` - Your task is to write a Python function that analyzes the column based on specific patterns and data quality or relevance concerns. + Your task is to write a Python function that modifies the columns based on specific patterns. The function should: - - Be named `modify_{column}` + - Be named `{function_name}` - Be a standalone function - Take `df` as its only argument - - Implement logic that addresses all of the following patterns (described below) - - Use only standard libraries or common ones such as `pandas`, `numpy`, `re`, etc. + - Implement logic that addresses the following pattern: + Pattern: {pattern_description} + Reasoning: {reasoning} + Relevance: {relevance} + - Use only standard libraries or common ones such as `numpy`, `re`, etc. - Not use any complex or uncommon third-party libraries This is the signature of the function: ```python - def modify_{column}(df: pd.DataFrame) -> pd.DataFrame: + def {function_name}(df: pd.DataFrame) -> pd.DataFrame: ``` - Here are the patterns you must handle: - {pattern} - Output requirements: - Only include the necessary `import` statements and the function definition. - Do not include any explanation or comments. @@ -78,15 +92,16 @@ def modify_{column}(df: pd.DataFrame) -> pd.DataFrame: - Do not include usage examples or extra text. - Function should have a return statement that returns the modified DataFrame. 
- This is an example of the expected output for a column called `age`: + This is an example of the expected output for columns called `age` and `height`: + ```python import numpy as np # And importing any other necessary libraries - def modify_age(df: pd.DataFrame) -> pd.DataFrame: + def modify_age_height(df: pd.DataFrame) -> pd.DataFrame: # Example logic to handle the patterns df['age'] = df['age'].apply(lambda x: np.nan if x < 0 else x) - df['age'] = df['age'].fillna(df['age'].mean()) + df['height'] = df['height'].fillna(df['height'].mean()) return df ``` @@ -101,7 +116,7 @@ def modify_age(df: pd.DataFrame) -> pd.DataFrame: """ response = self.client.chat.completions.create( - model="gpt-4o", # Using GPT-4o (OpenAI Omni) + model="gpt-4o", messages=[ { "role": "system", @@ -111,77 +126,90 @@ def modify_age(df: pd.DataFrame) -> pd.DataFrame: ], ) - output[column] = response.choices[0].text.strip() - - print(f"Finished getting inject codes for column: {column}") - - print("Finished getting inject codes for all columns.") + output["Pattern" + str(pattern_index+1) + "_".join(columns)] = response.choices[0].message.content.strip() + print(f"Finished getting inject codes for pattern on columns: {"_".join(columns)}") + print("Finished getting inject codes for all patterns.") return output - - def inject_patterns(self, data_file_addr:str, pattern_codes: dict): + + def inject_patterns( + self, + base_df: pd.DataFrame, + pattern_codes: dict, + hash_id: str = None, + ) -> pd.DataFrame: """Inject the patterns into the data. Args: + base_df: The base DataFrame to inject the patterns into. pattern_codes: The pattern codes to inject. It is a dictionary with the following format: { - "column_name_1": "code to inject the pattern", + "name1": "code to inject for pattern1", + "name1": "code to inject for pattern2", ... } - data_file_addr: Address to the original CSV data file. - - Returns: - Nothing. The function creates a temp folder with modified CSV and scripts. + hash_id: The hash ID to use for the temp directory. If not provided, will use "default". """ print("Started injecting patterns ...") - # Step 1: Create temp directory - temp_dir = f"temp_{uuid.uuid4().hex}" + # Step 1: Create temp directory inside results/{hash_id}/codefiles/ + if hash_id is None: + hash_id = "default" + temp_dir = os.path.join("results", hash_id, "codefiles") os.makedirs(temp_dir, exist_ok=True) - # Step 2: Copy the original CSV - filename = os.path.basename(data_file_addr) - temp_csv_path = os.path.join(temp_dir, filename) - shutil.copy2(data_file_addr, temp_csv_path) + # Step 2: Handle input data + if isinstance(base_df, pd.DataFrame): + df = base_df.copy() + filename = "temp_data.csv" + temp_csv_path = os.path.join(temp_dir, filename) + df.to_csv(temp_csv_path, index=False) + else: + raise ValueError( + "base_df should be a pandas DataFrame. Please provide a valid DataFrame." 
+ ) # Step 3: Create Python scripts for each column - for column, raw_code in pattern_codes.items(): - print(f"Injecting pattern for column: {column}") + for pattern_name, raw_code in pattern_codes.items(): + print(f"Injecting pattern: {pattern_name}") match = re.search(r"```python(.*?)```", raw_code, re.DOTALL) code = match.group(1).strip() if match else raw_code.strip() - code = re.sub(r"^\s*import\s+pandas\s+as\s+pd\s*\n?", "", code, flags=re.MULTILINE) + code = re.sub( + r"^\s*import\s+pandas\s+as\s+pd\s*\n?", "", code, flags=re.MULTILINE + ) - func_name = f"modify_{column}" + func_name = f"modify_" + "_".join(pattern_name.split("_")[1:]) final_code = "import pandas as pd\n" + code.strip() + "\n" final_code += """if __name__ == "__main__":\n""" final_code += f""" df = pd.read_csv("{filename}")\n""" - final_code += f""" df = {func_name}(df)")\n""" + final_code += f""" df = {func_name}(df)\n""" final_code += f""" df.to_csv("{filename}", index=False)""" - script_path = os.path.join(temp_dir, f"{func_name}.py") + # Create script in codefiles directory + script_name = f"{func_name}.py" + script_path = os.path.join(temp_dir, script_name) + with open(script_path, "w") as f: f.write(final_code) - print(f"Created script for column: {column} at {script_path}") + print(f"Created script for pattern: {pattern_name}") # Step 4: Run the script - subprocess.run(["python", script_path], check=True, cwd=temp_dir) + subprocess.run(["python3", script_name], check=True, cwd=temp_dir) - print(f"Injected pattern for column: {column}") - - print("Finished injecting patterns for all columns.") - - # Step 5: Copy modified CSV to original directory - original_dir = os.path.dirname(data_file_addr) - injected_filename = os.path.splitext(filename)[0] + "_injected.csv" - injected_path = os.path.join(original_dir, injected_filename) - shutil.copy2(temp_csv_path, injected_path) - - # Step 6: Clean up - shutil.rmtree(temp_dir) - - print(f"Injected CSV saved to: {injected_path}") \ No newline at end of file + # Update the DataFrame with the modified data + df = pd.read_csv(temp_csv_path) + + print(f"Injected pattern for pattern: {pattern_name}") + + print("Finished injecting patterns for all patterns.") + + # Step 5: Clean up + # Don't remove the directory since we want to keep the code files + # shutil.rmtree(temp_dir) + + return df From 706979744fef7d6623000bb7ebc434716b3c0279 Mon Sep 17 00:00:00 2001 From: Amirhossein Abaskohi Date: Wed, 7 May 2025 21:34:06 -0700 Subject: [PATCH 5/5] Updated pattern inject. --- scripts/pattern_inject.py | 73 ++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 40 deletions(-) diff --git a/scripts/pattern_inject.py b/scripts/pattern_inject.py index 65e2700..98b73f6 100644 --- a/scripts/pattern_inject.py +++ b/scripts/pattern_inject.py @@ -57,54 +57,44 @@ def get_inject_codes(self, patterns: str) -> dict: reasoning = pattern_info.get("reasoning", "") relevance = pattern_info.get("relevance_to_kpi", "") - function_name = "modify_" + "_".join(columns) columns_str = "" for i, column in enumerate(columns): - columns_str += f"'{i+1}. {column}\n" + columns_str += f"'{i+1}. {column} \t" prompt = f""" - You are given a pandas DataFrame named `df` that contains the following columns: - - `{columns_str}` - - Your task is to write a Python function that modifies the columns based on specific patterns. 
- - The function should: - - Be named `{function_name}` - - Be a standalone function - - Take `df` as its only argument - - Implement logic that addresses the following pattern: - Pattern: {pattern_description} - Reasoning: {reasoning} - Relevance: {relevance} - - Use only standard libraries or common ones such as `numpy`, `re`, etc. - - Not use any complex or uncommon third-party libraries - - This is the signature of the function: - ```python - def {function_name}(df: pd.DataFrame) -> pd.DataFrame: - ``` + Imagine you are given a pandas DataFrame named `df`. + + You are tasked with writing a Python function that injects a pattern into the given pandas DataFrame named `df`. + Injecting a pattern means **modifying or adding values** in specific columns of the DataFrame to follow a certain logical structure, transformation rule, or simulated behavior. This may involve altering existing column values, adding new derived columns, or enforcing specific relationships between columns. + Here is what you need to do: + - The function **must be named** `pattern_{pattern_index+1}` (e.g., `pattern_3`) + - The signature of the function should be: + ```python + def pattern_{pattern_index+1}(df: pd.DataFrame) -> pd.DataFrame: + ``` + - It should be a **standalone function** (no external dependencies other than standard libraries and `pandas`, `numpy`, `re`) + - The function should: + 1. Take only one input: the DataFrame `df` + 2. **Modify `df` in-place** (i.e., apply transformations directly on `df`) + 3. Implement the following pattern injection: + - **Pattern Description**: {pattern_description} + - **Reasoning**: {reasoning} + - **Relevance**: {relevance} + - **Columns Involved**: {columns_str} + 4. Assume that the DataFrame already contains all columns listed in `Columns Involved`. + 5. Make sure your function is not using columns that are not listed in `Columns Involved`. + - If new columns need to be added as part of the pattern, ensure they are clearly named and added to `df` + - Handle errors gracefully (e.g., missing values, type mismatches) + - Return the modified `df` (even though it is modified in place, this improves flexibility) Output requirements: - Only include the necessary `import` statements and the function definition. - Do not include any explanation or comments. - Only generate one function with all logic embedded. + - The code should be valid Python code and in python environment. - Do not include usage examples or extra text. - Function should have a return statement that returns the modified DataFrame. - This is an example of the expected output for columns called `age` and `height`: - - ```python - import numpy as np - # And importing any other necessary libraries - - def modify_age_height(df: pd.DataFrame) -> pd.DataFrame: - # Example logic to handle the patterns - df['age'] = df['age'].apply(lambda x: np.nan if x < 0 else x) - df['height'] = df['height'].fillna(df['height'].mean()) - return df - ``` - IMPORTANT NOTES: - The function should be valid Python code and should not include any comments or explanations. - The function should be self-contained and not rely on any external context or variables. @@ -145,7 +135,7 @@ def inject_patterns( pattern_codes: The pattern codes to inject. It is a dictionary with the following format: { "name1": "code to inject for pattern1", - "name1": "code to inject for pattern2", + "name2": "code to inject for pattern2", ... } hash_id: The hash ID to use for the temp directory. If not provided, will use "default". 
@@ -171,6 +161,7 @@ def inject_patterns(
             )
 
         # Step 3: Create Python scripts for each column
+        pattern_index = 1
         for pattern_name, raw_code in pattern_codes.items():
             print(f"Injecting pattern: {pattern_name}")
 
@@ -181,13 +172,13 @@
                 r"^\s*import\s+pandas\s+as\s+pd\s*\n?", "", code, flags=re.MULTILINE
             )
 
-            func_name = f"modify_" + "_".join(pattern_name.split("_")[1:])
+            func_name = "pattern_" + str(pattern_index)
 
             final_code = "import pandas as pd\n" + code.strip() + "\n"
             final_code += """if __name__ == "__main__":\n"""
-            final_code += f"""    df = pd.read_csv("{filename}")\n"""
+            final_code += f"""    df = pd.read_csv("./{filename}")\n"""
             final_code += f"""    df = {func_name}(df)\n"""
-            final_code += f"""    df.to_csv("{filename}", index=False)"""
+            final_code += f"""    df.to_csv("./{filename}", index=False)"""
 
             # Create script in codefiles directory
             script_name = f"{func_name}.py"
@@ -206,6 +197,8 @@
 
             print(f"Injected pattern for pattern: {pattern_name}")
 
+            pattern_index += 1
+
         print("Finished injecting patterns for all patterns.")
 
         # Step 5: Clean up
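
For reference, a minimal end-to-end sketch (not part of the patches above) of how the final versions of `PatternDesigner` and `PatternInjector` are intended to compose after PATCH 5. It assumes `OPENAI_API_KEY` is set in the environment, reuses the toy DataFrame from `pattern_design.py`'s `main()`, and uses an arbitrary illustrative `hash_id` of "demo":

```python
# Illustrative usage sketch only; both classes fall back to OPENAI_API_KEY
# from the environment when no api_key argument is passed.
import json
import pandas as pd

from scripts.pattern_design import PatternDesigner
from scripts.pattern_inject import PatternInjector

df = pd.DataFrame(
    {
        "date": ["2023-01-01", "2023-01-02", "2023-01-03"],
        "sales": [100, 150, 200],
        "category": ["A", "B", "A"],
    }
)

designer = PatternDesigner()
injector = PatternInjector()

# 1) Ask the LLM for KPIs plus 2-3 injectable patterns for the task.
patterns = designer.design_patterns(df, task="Anomaly detection")

# 2) Turn each suggested pattern into a standalone pattern_<i> function body.
#    get_inject_codes expects the pattern spec as a JSON string.
pattern_codes = injector.get_inject_codes(json.dumps(patterns))

# 3) Write the scripts under results/<hash_id>/codefiles/, run them via
#    subprocess, and get back the modified DataFrame.
injected_df = injector.inject_patterns(df, pattern_codes, hash_id="demo")
print(injected_df.head())
```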