Inject pattern #10


Open
wants to merge 6 commits into base: main
Binary file added .DS_Store
Binary file not shown.
80 changes: 79 additions & 1 deletion insightbench/agents.py
@@ -10,6 +10,9 @@
from insightbench.utils.metrics_utils import score_insight
from insightbench import metrics
from PIL import Image
import pandas as pd
from typing import Dict, List, Optional
from scripts.pattern_design import PatternDesigner


class Agent:
@@ -167,7 +170,7 @@ def __init__(
goal="I want to find interesting trends in this dataset",
verbose=False,
temperature=0,
n_retries=2
n_retries=2,
):
self.goal = goal
if savedir is None:
@@ -482,3 +485,78 @@ def save_state_dict(self, fname):
def load_state_dict(self, fname):
with open(fname, "r") as f:
self.insights_history = json.load(f)


class AgentDataGen:
def __init__(
self,
api_key: Optional[str] = None,
tasks_path: str = "insightbench/utils/domains_tasks.json",
):
"""Initialize the AgentDataGen with OpenAI API key and tasks path.

Args:
api_key: OpenAI API key. If not provided, will try to get from OPENAI_API_KEY environment variable.
tasks_path: Path to the domains_tasks.json file
"""
self.pattern_designer = PatternDesigner(api_key)
self.tasks_path = tasks_path
self.tasks = self._load_tasks()

def _load_tasks(self) -> dict:
"""Load tasks from domains_tasks.json."""
try:
with open(self.tasks_path, "r") as f:
return json.load(f)
except Exception as e:
raise ValueError(f"Failed to load tasks from {self.tasks_path}: {str(e)}")

def generate_patterns(self, data: pd.DataFrame, task: str) -> Dict[str, List[Dict]]:
"""Generate patterns for the given data and task.

Args:
data: Input DataFrame containing the data to analyze
task: Description of the analytics task

Returns:
Dictionary mapping column names to lists of pattern suggestions
"""
return self.pattern_designer.design_patterns(data, task)

def generate_all_patterns(
self, data: pd.DataFrame, output_dir: str = "results/Patterns"
) -> None:
"""Generate patterns for all tasks and save them to the output directory.

Args:
data: Input DataFrame containing the data to analyze
output_dir: Directory to save the generated patterns
"""
os.makedirs(output_dir, exist_ok=True)

for domain, domain_tasks in self.tasks.items():
print(f"\nProcessing domain: {domain}")

# Create domain directory
domain_dir = os.path.join(output_dir, domain)
os.makedirs(domain_dir, exist_ok=True)

for task in domain_tasks:
print(f"\nGenerating patterns for task: {task}")

try:
# Generate patterns
patterns = self.generate_patterns(data, task)

# Save patterns to file
task_filename = task.lower().replace(" ", "_") + "_patterns.json"
output_path = os.path.join(domain_dir, task_filename)

with open(output_path, "w") as f:
json.dump(patterns, f, indent=2)

print(f"Saved patterns to: {output_path}")

except Exception as e:
print(f"Error generating patterns for task '{task}': {str(e)}")
continue
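For orientation, here is a minimal usage sketch of the new AgentDataGen class (not part of the diff; the CSV path is hypothetical):

import pandas as pd
from insightbench.agents import AgentDataGen

df = pd.read_csv("data/sample.csv")  # hypothetical input DataFrame

gen = AgentDataGen()  # PatternDesigner picks up OPENAI_API_KEY from the environment
patterns = gen.generate_patterns(df, task="Churn Analysis")  # patterns for a single task
gen.generate_all_patterns(df, output_dir="results/Patterns")  # every task in domains_tasks.json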
28 changes: 28 additions & 0 deletions insightbench/utils/domains_tasks.json
@@ -0,0 +1,28 @@
{
"tasks": [
"Fraud Detection",
"Recommendation Systems",
"Churn Analysis",
"Customer Segmentation",
"Network Analysis",
"Association Rule Mining",
"Dashboard Summary",
"Predictive Maintenance",
"Cohort Analysis",
"Attribution Modeling",
"Anomaly Detection",
"Feature Importance Ranking",
"Geospatial Analysis",
"Causality",
"Logs Clustering",
"Time Series Decomposition",
"Principal Component Analysis",
"Correlation Analysis",
"Knowledge Base",
"Multi-table Search",
"Huge Table Analysis",
"Topic Modeling",
"Market Analysis",
"Data Imputation"
]
}
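For reference, a short sketch of how AgentDataGen consumes this file (see _load_tasks and generate_all_patterns above). Because the JSON is flat, the single top-level key "tasks" is treated as the only domain, so generated files land under results/Patterns/tasks/:

import json

with open("insightbench/utils/domains_tasks.json") as f:
    tasks = json.load(f)

for domain, domain_tasks in tasks.items():
    # domain == "tasks"; domain_tasks is the full task list
    print(domain, len(domain_tasks))  # prints: tasks 24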
8 changes: 5 additions & 3 deletions main.py
@@ -1,10 +1,12 @@
import os, argparse
import pandas as pd

from insightbench.utils import agent_utils as au
from insightbench import agents, benchmarks
from insightbench.utils import exp_utils as eu
from insightbench.utils.exp_utils import hash_dict, save_json
from dotenv import load_dotenv

load_dotenv()


def main(exp_dict, savedir, args):
@@ -82,7 +84,7 @@ def main(exp_dict, savedir, args):
parser.add_argument("-sb", "--savedir_base", type=str, default="results")
parser.add_argument("-r", "--reset", type=int, default=0)
# add openai api key
parser.add_argument("-o", "--openai_api_key", type=str, default=None)
# parser.add_argument("-o", "--openai_api_key", type=str, default=None)
# dataset path
parser.add_argument("-d", "--datadir", type=str, default="data/notebooks")

@@ -102,7 +104,7 @@ def main(exp_dict, savedir, args):
)

# set open ai env
os.environ["OPENAI_API_KEY"] = args.openai_api_key
# os.environ["OPENAI_API_KEY"] =

# Loop through experiments
for exp_dict in exp_list:
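Since the -o/--openai_api_key flag and the explicit os.environ assignment are commented out, main.py now relies on python-dotenv to supply the key. A minimal sketch of the expected setup (the .env contents shown in the comment are illustrative):

import os
from dotenv import load_dotenv

# A .env file next to main.py would contain a line such as:
#   OPENAI_API_KEY=<your key>
load_dotenv()  # loads .env into the process environment
assert os.getenv("OPENAI_API_KEY"), "OPENAI_API_KEY must be set in .env or the shell"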
Binary file added results/.DS_Store
Binary file not shown.
189 changes: 189 additions & 0 deletions scripts/pattern_design.py
@@ -0,0 +1,189 @@
import pandas as pd
import json
from openai import OpenAI
from typing import Dict, List, Optional
import os, re


class PatternDesigner:
def __init__(self, api_key: Optional[str] = None):
"""Initialize the PatternDesigner with OpenAI API key.

Args:
api_key: OpenAI API key. If not provided, will try to get from OPENAI_API_KEY environment variable.
"""
if api_key is None:
api_key = os.getenv("OPENAI_API_KEY")
if api_key is None:
raise ValueError(
"OpenAI API key not provided and OPENAI_API_KEY environment variable not set"
)
self.client = OpenAI(api_key=api_key)

def analyze_data(self, data: pd.DataFrame) -> str:
"""Analyze the data and return a detailed summary of its structure."""
summary = {
"num_rows": len(data),
"num_cols": len(data.columns),
"column_summaries": {},
}

for col in data.columns:
col_data = data[col]
col_summary = {
"dtype": str(col_data.dtype),
"num_missing": int(col_data.isnull().sum()),
"num_unique": int(col_data.nunique()),
"sample_values": col_data.dropna().unique()[:3].tolist(),
}

if pd.api.types.is_numeric_dtype(col_data):
col_summary.update(
{
"mean": float(col_data.mean()),
"std": float(col_data.std()),
"min": float(col_data.min()),
"max": float(col_data.max()),
}
)
elif pd.api.types.is_datetime64_any_dtype(col_data):
col_summary.update(
{
"min_date": str(col_data.min()),
"max_date": str(col_data.max()),
}
)
elif pd.api.types.is_string_dtype(col_data):
col_summary.update(
{"top_frequent_values": col_data.value_counts().head(3).to_dict()}
)

summary["column_summaries"][col] = col_summary

# default=str keeps non-JSON-native sample values (e.g. Timestamps) serializable
return json.dumps(summary, indent=2, default=str)

def design_patterns(self, data: pd.DataFrame, task: str) -> Dict[str, List[Dict]]:
"""Design patterns for each column based on the given analytics task."""
data_summary = self.analyze_data(data)

prompt = f"""
You are a data-centric AI expert designing synthetic data benchmarks to evaluate analytics models.

Given a dataset summary and an analytics task, your job is to inject **2–3 realistic patterns across one or more columns** that:
- Mimic real-world behaviors or anomalies
- Interact with the dataset's structure and semantics
- Meaningfully impact model performance or insight extraction
- Allow for robust benchmarking of analytical reasoning

---

Please follow these explicit steps in your reasoning (Chain-of-Thought):

### Step 1: Infer Key Performance Indicators (KPIs)
- Based on the dataset and task, identify 2–4 relevant KPIs that would be tracked by an analyst or model.

### Step 2: Identify Influential Columns and Relationships
- Which columns most influence these KPIs?
- Are there any natural correlations, temporal dynamics, or category-based splits that could affect KPI computation?

### Step 3: Design 2–3 Global Patterns
- Each pattern may involve **1 or more columns**, and should simulate a **plausible real-world event, behavior, or trend**.
- Avoid trivial noise (e.g., "random fluctuation"). Prefer **interpretable and benchmark-worthy** signals like:
- delayed effects
- conditionally induced trends
- cross-feature dependencies
- regime shifts
- temporal or category-driven anomalies

### Step 4: Explain for Each Pattern:
- What exactly is the injected pattern?
- Why is it useful from a benchmarking or insight perspective?
- Which KPIs does it affect, and how?
- What kind of analytical or modeling challenges does it test?

---

### Output format (JSON):

{{
"kpis": ["list of important KPIs"],
"patterns": [
{{
"pattern": "Description of the injected pattern",
"columns_involved": ["list of columns affected"],
"reasoning": "Why this pattern is meaningful and realistic",
"relevance_to_kpi": "Which KPIs it affects and how",
"benchmark_value": "What kind of insight or model evaluation this pattern enables"
}},
...
]
}}

---

### Data Summary:
{data_summary}

### Analytics Task:
{task}

"""

response = self.client.chat.completions.create(
model="gpt-4o", # Using GPT-4o (OpenAI Omni)
messages=[
{
"role": "system",
"content": "You are a data pattern design expert. Your task is to suggest meaningful patterns that can be injected into data columns to help accomplish specific analytics tasks. Always respond with valid JSON.",
},
{"role": "user", "content": prompt},
],
)
raw_response = response.choices[0].message.content
# Strip triple backticks and optional 'json' tag
cleaned_json_str = re.sub(r"^```(?:json)?\n|\n```$", "", raw_response.strip())
try:
return json.loads(cleaned_json_str)
except json.JSONDecodeError:
raise ValueError("Failed to parse LLM response as JSON")


def main():
# No explicit key passed; PatternDesigner falls back to the OPENAI_API_KEY environment variable
designer = PatternDesigner()

# Sample DataFrame
data = pd.DataFrame(
{
"date": ["2023-01-01", "2023-01-02", "2023-01-03"],
"sales": [100, 150, 200],
"category": ["A", "B", "A"],
}
)

task = "Anomaly detection"

try:
patterns = designer.design_patterns(data, task)

print("\nKey Performance Indicators (KPIs):")
for kpi in patterns.get("kpis", []):
print(f"- {kpi}")

print("\nSuggested Patterns:")
for pattern in patterns.get("patterns", []):
print(f"\nPattern: {pattern['pattern']}")
print(f"Columns Involved: {', '.join(pattern['columns_involved'])}")
print(f"Reasoning: {pattern['reasoning']}")
print(f"Relevance to KPI: {pattern['relevance_to_kpi']}")
print(f"Benchmark Value: {pattern['benchmark_value']}")
print("-" * 80)

except Exception as e:
print(f"Error: {e}")


if __name__ == "__main__":
main()
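One detail worth noting: design_patterns strips an optional Markdown code fence from the model reply before parsing it as JSON. A small sketch of that cleaning step on an illustrative raw reply:

import json, re

raw_response = '```json\n{"kpis": ["daily sales"], "patterns": []}\n```'  # illustrative reply
cleaned = re.sub(r"^```(?:json)?\n|\n```$", "", raw_response.strip())
print(json.loads(cleaned)["kpis"])  # ['daily sales']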