From 33de05264010dd578e13039a64d979c23bebecd6 Mon Sep 17 00:00:00 2001 From: Jonathan Bragg Date: Thu, 17 Jul 2025 12:53:06 -0700 Subject: [PATCH 1/8] Pipe through evalspecs and git revision for leaderboard --- pyproject.toml | 2 +- .../leaderboard/dataset_features.yml | 18 +++++++++++ src/agenteval/leaderboard/schema_generator.py | 32 ++++++++++++++++++- src/agenteval/leaderboard/view.py | 28 ++++++++++++++++ src/agenteval/models.py | 2 +- src/agenteval/score.py | 27 +++++++++++++++- 6 files changed, 105 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f81f191..cdc5c11 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "agent-eval" -version = "0.1.15" +version = "0.1.16" description = "Agent evaluation toolkit" readme = "README.md" requires-python = ">=3.10" diff --git a/src/agenteval/leaderboard/dataset_features.yml b/src/agenteval/leaderboard/dataset_features.yml index a0773ac..2ea6690 100644 --- a/src/agenteval/leaderboard/dataset_features.yml +++ b/src/agenteval/leaderboard/dataset_features.yml @@ -28,6 +28,24 @@ dtype: float64 - name: split dtype: string +- name: eval_specs + list: + - name: solver + dtype: string + - name: solver_args + dtype: string + - name: model + dtype: string + - name: model_args + dtype: string + - name: revision + struct: + - name: type + dtype: string + - name: origin + dtype: string + - name: commit + dtype: string - name: results list: - name: task_name diff --git a/src/agenteval/leaderboard/schema_generator.py b/src/agenteval/leaderboard/schema_generator.py index c1f892b..7222c43 100644 --- a/src/agenteval/leaderboard/schema_generator.py +++ b/src/agenteval/leaderboard/schema_generator.py @@ -5,7 +5,7 @@ import datetime import types from importlib import resources -from typing import Union, get_args, get_origin +from typing import Any, Literal, Union, get_args, get_origin import pyarrow as pa import yaml @@ -40,6 +40,36 @@ def _pa_type_for_annotation(anno) -> pa.DataType: if origin is list: inner = get_args(anno)[0] return pa.list_(_pa_type_for_annotation(inner)) + # Handle dict[str, Any] specifically - these are serialized as JSON strings + if origin is dict: + args = get_args(anno) + if len(args) == 2 and args[0] is str and args[1] is Any: + return pa.string() # dict[str, Any] becomes JSON string + # Other dict types could be handled as proper Arrow maps/structs + # For now, fall through to unsupported + # Handle Literal types - infer type from literal values + if origin is Literal: + literal_values = get_args(anno) + if not literal_values: + return pa.string() # fallback + + # Check that all literal values are the same type + first_type = type(literal_values[0]) + for value in literal_values: + if type(value) != first_type: + raise ValueError(f"Literal {anno} contains mixed types: {[type(v) for v in literal_values]}") + + # Map Python type to Arrow type + if first_type is str: + return pa.string() + elif first_type is int: + return pa.int64() + elif first_type is bool: + return pa.bool_() + elif first_type is float: + return pa.float64() + else: + raise ValueError(f"Unsupported literal type {first_type} in {anno}") # Nested BaseModel if isinstance(anno, type) and issubclass(anno, BaseModel): inner_schema = _schema_from_pydantic(anno) diff --git a/src/agenteval/leaderboard/view.py b/src/agenteval/leaderboard/view.py index a334a2b..f425506 100644 --- a/src/agenteval/leaderboard/view.py +++ b/src/agenteval/leaderboard/view.py @@ -69,6 +69,7 @@ def view( 
"User/organization", "Submission date", "Logs", + "Source", "Openness", "Agent tooling", "LLM base", @@ -190,6 +191,31 @@ def _get_dataframe( } ) + # extract git revision source code URL with SHA + # only show source URL if all eval specs have the same revision + source_url = None + if ev.eval_specs and all(spec.revision == ev.eval_specs[0].revision for spec in ev.eval_specs): + revision = ev.eval_specs[0].revision + + # Only handle git revisions with complete info + if (revision and revision.type == 'git' and + revision.origin and revision.commit): + origin = revision.origin + commit = revision.commit + + # Convert SSH URLs to HTTPS URLs + if origin.startswith('git@'): + # Convert git@github.com:user/repo.git to https://github.com/user/repo + origin = origin.replace('git@', 'https://').replace(':', '/', 1) + + # Remove .git suffix if present + if origin.endswith('.git'): + origin = origin[:-4] + + # Only create URL if it looks like a valid HTTP(S) URL + if origin.startswith(('http://', 'https://')): + source_url = f"{origin}/tree/{commit}" + rows.append( { "id": sub.submit_time, @@ -202,6 +228,7 @@ def _get_dataframe( "base_models": model_names, **flat, "logs_url": sub.logs_url if is_internal else sub.logs_url_public, + "source_url": source_url, } ) @@ -228,6 +255,7 @@ def _pretty_column_name(col: str) -> str: "tool_usage": "Agent tooling", "base_models": "LLM base", "logs_url": "Logs", + "source_url": "Source", "overall/score": "Overall", "overall/cost": "Overall cost", } diff --git a/src/agenteval/models.py b/src/agenteval/models.py index f8bb39f..a759fab 100644 --- a/src/agenteval/models.py +++ b/src/agenteval/models.py @@ -33,7 +33,7 @@ class SubmissionMetadata(BaseModel): class EvalResult(EvalConfig): - eval_specs: list[EvalSpec] | None = Field(default=None, exclude=True) + eval_specs: list[EvalSpec] | None = None results: list[TaskResult] | None = None submission: SubmissionMetadata = Field(default_factory=SubmissionMetadata) diff --git a/src/agenteval/score.py b/src/agenteval/score.py index 65ed5b1..c22aca1 100644 --- a/src/agenteval/score.py +++ b/src/agenteval/score.py @@ -10,12 +10,16 @@ read_eval_log, read_eval_log_samples, ) -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, field_validator, field_serializer from .log import ModelUsageWithName, collect_model_usage, compute_model_cost logger = logging.getLogger(__name__) +# Fields with dict[str, Any] type that need JSON serialization for Arrow compatibility +# Arrow/Parquet cannot handle dict[str, Any] so we serialize to JSON strings +_EVALSPEC_JSON_FIELDS = ['solver_args', 'model_args'] + class Metric(BaseModel): """A metric for a task.""" @@ -42,6 +46,27 @@ def from_eval_log(cls, log: EvalLog) -> "EvalSpec": model_args=log.eval.model_args, revision=log.eval.revision, ) + + @field_validator(*_EVALSPEC_JSON_FIELDS, mode='before') + @classmethod + def deserialize_json_fields(cls, v): + """Deserialize JSON strings back to Python objects. Raises on JSON errors.""" + import json + if not isinstance(v, str): + return v # Already deserialized or None + return json.loads(v) + + @field_serializer(*_EVALSPEC_JSON_FIELDS) + def serialize_json_fields(self, v): + """Serialize Python objects to JSON strings. 
Logs errors and returns fallback.""" + import json + if v is None: + return None + try: + return json.dumps(v, default=str) + except (TypeError, ValueError) as e: + logger.warning(f"Failed to serialize field to JSON: {e}, returning error indicator") + return json.dumps({"__serialization_error__": str(e)}) class TaskResult(BaseModel): From 0fc2d8c230e3c0b9e85e26754879d850decab7eb Mon Sep 17 00:00:00 2001 From: Jonathan Bragg Date: Thu, 17 Jul 2025 16:04:58 -0700 Subject: [PATCH 2/8] Nest evalspec under taskresult to simplify --- src/agenteval/cli.py | 19 +++++---- .../leaderboard/dataset_features.yml | 32 +++++++------- src/agenteval/leaderboard/view.py | 42 ++++++++++--------- src/agenteval/models.py | 3 +- src/agenteval/score.py | 33 +++++++-------- 5 files changed, 66 insertions(+), 63 deletions(-) diff --git a/src/agenteval/cli.py b/src/agenteval/cli.py index 5ca06eb..a8b6c14 100644 --- a/src/agenteval/cli.py +++ b/src/agenteval/cli.py @@ -172,16 +172,21 @@ def score_command( suite_cfg = load_suite_config(config_path) eval_result = EvalResult(suite_config=suite_cfg, split=split) - task_results, eval_specs, had_errors = process_eval_logs(log_dir) - eval_result.eval_specs = eval_specs + task_results, had_errors = process_eval_logs(log_dir) eval_result.results = task_results # Warn if multiple evaluation specs present - if eval_result.eval_specs and len(eval_result.eval_specs) > 1: - click.echo( - f"Warning: Found {len(eval_result.eval_specs)} different eval specs. " - "Logs may come from mixed runs." - ) + if eval_result.results: + unique_specs = set() + for task_result in eval_result.results: + spec_hash = hash(task_result.eval_spec.model_dump_json()) + unique_specs.add(spec_hash) + + if len(unique_specs) > 1: + click.echo( + f"Warning: Found {len(unique_specs)} different eval specs. " + "Logs may come from mixed runs." 
+ ) # Warn about any missing tasks missing_tasks = eval_result.find_missing_tasks() diff --git a/src/agenteval/leaderboard/dataset_features.yml b/src/agenteval/leaderboard/dataset_features.yml index 2ea6690..d894e08 100644 --- a/src/agenteval/leaderboard/dataset_features.yml +++ b/src/agenteval/leaderboard/dataset_features.yml @@ -28,28 +28,28 @@ dtype: float64 - name: split dtype: string -- name: eval_specs +- name: results list: - - name: solver - dtype: string - - name: solver_args - dtype: string - - name: model - dtype: string - - name: model_args + - name: task_name dtype: string - - name: revision + - name: eval_spec struct: - - name: type + - name: solver dtype: string - - name: origin + - name: solver_args dtype: string - - name: commit + - name: model dtype: string -- name: results - list: - - name: task_name - dtype: string + - name: model_args + dtype: string + - name: revision + struct: + - name: type + dtype: string + - name: origin + dtype: string + - name: commit + dtype: string - name: metrics list: - name: name diff --git a/src/agenteval/leaderboard/view.py b/src/agenteval/leaderboard/view.py index f425506..db6e0d3 100644 --- a/src/agenteval/leaderboard/view.py +++ b/src/agenteval/leaderboard/view.py @@ -194,27 +194,29 @@ def _get_dataframe( # extract git revision source code URL with SHA # only show source URL if all eval specs have the same revision source_url = None - if ev.eval_specs and all(spec.revision == ev.eval_specs[0].revision for spec in ev.eval_specs): - revision = ev.eval_specs[0].revision - - # Only handle git revisions with complete info - if (revision and revision.type == 'git' and - revision.origin and revision.commit): - origin = revision.origin - commit = revision.commit - - # Convert SSH URLs to HTTPS URLs - if origin.startswith('git@'): - # Convert git@github.com:user/repo.git to https://github.com/user/repo - origin = origin.replace('git@', 'https://').replace(':', '/', 1) - - # Remove .git suffix if present - if origin.endswith('.git'): - origin = origin[:-4] + if ev.results: + task_revisions = [tr.eval_spec.revision for tr in ev.results if tr.eval_spec.revision] + if task_revisions and all(rev == task_revisions[0] for rev in task_revisions): + revision = task_revisions[0] - # Only create URL if it looks like a valid HTTP(S) URL - if origin.startswith(('http://', 'https://')): - source_url = f"{origin}/tree/{commit}" + # Only handle git revisions with complete info + if (revision and revision.type == 'git' and + revision.origin and revision.commit): + origin = revision.origin + commit = revision.commit + + # Convert SSH URLs to HTTPS URLs + if origin.startswith('git@'): + # Convert git@github.com:user/repo.git to https://github.com/user/repo + origin = origin.replace('git@', 'https://').replace(':', '/', 1) + + # Remove .git suffix if present + if origin.endswith('.git'): + origin = origin[:-4] + + # Only create URL if it looks like a valid HTTP(S) URL + if origin.startswith(('http://', 'https://')): + source_url = f"{origin}/tree/{commit}" rows.append( { diff --git a/src/agenteval/models.py b/src/agenteval/models.py index a759fab..f017ead 100644 --- a/src/agenteval/models.py +++ b/src/agenteval/models.py @@ -6,7 +6,7 @@ from .config import SuiteConfig from .io import atomic_write_file -from .score import EvalSpec, TaskResult +from .score import TaskResult class EvalConfig(BaseModel): @@ -33,7 +33,6 @@ class SubmissionMetadata(BaseModel): class EvalResult(EvalConfig): - 
eval_specs: list[EvalSpec] | None = None results: list[TaskResult] | None = None submission: SubmissionMetadata = Field(default_factory=SubmissionMetadata) diff --git a/src/agenteval/score.py b/src/agenteval/score.py index c22aca1..2d02169 100644 --- a/src/agenteval/score.py +++ b/src/agenteval/score.py @@ -75,6 +75,9 @@ class TaskResult(BaseModel): task_name: str """Name of the task.""" + eval_spec: EvalSpec + """Evaluation specification used for this task.""" + metrics: list[Metric] """List of metrics.""" @@ -123,15 +126,15 @@ def get_normalized_task_name(log: EvalLog) -> str: return log.eval.task.split("/")[-1] -def process_eval_logs(log_dir: str) -> tuple[list[TaskResult], list[EvalSpec], bool]: +def process_eval_logs(log_dir: str) -> tuple[list[TaskResult], bool]: """ - Process evaluation logs from a directory and return task results and eval specs. + Process evaluation logs from a directory and return task results. Args: log_dir: Directory containing evaluation logs Returns: - A tuple containing a list of task results and a list of eval specs + A tuple containing a list of task results and whether there were errors """ # Read evaluation logs logs = {} @@ -146,21 +149,8 @@ def process_eval_logs(log_dir: str) -> tuple[list[TaskResult], list[EvalSpec], b if not logs: raise ValueError("No valid evaluation logs found.") - # Collect eval specs - eval_specs = [] - seen_specs = set() - for log in logs.values(): - next_eval_spec = EvalSpec.from_eval_log(log) - # Use the hash of the serialized spec to check for duplicates - spec_hash = hash(next_eval_spec.model_dump_json()) - if spec_hash not in seen_specs: - seen_specs.add(spec_hash) - eval_specs.append(next_eval_spec) - - if not eval_specs: - raise ValueError("Eval specification is required.") - results = [] + has_eval_specs = False for task_name, log in logs.items(): try: metrics = get_metrics(log) @@ -169,9 +159,13 @@ def process_eval_logs(log_dir: str) -> tuple[list[TaskResult], list[EvalSpec], b model_usages = get_model_usages(log) model_costs = [compute_model_cost(usages) for usages in model_usages] has_model_usages = any(len(usages) > 0 for usages in model_usages) + eval_spec = EvalSpec.from_eval_log(log) + has_eval_specs = True + results.append( TaskResult( task_name=task_name, + eval_spec=eval_spec, metrics=metrics, # Set to None to avoid incorrect pyarrow model usage type inference model_usages=model_usages if has_model_usages else None, @@ -182,4 +176,7 @@ def process_eval_logs(log_dir: str) -> tuple[list[TaskResult], list[EvalSpec], b had_errors = True logger.exception(f"No metrics for {task_name}:") - return results, eval_specs, had_errors + if not has_eval_specs: + raise ValueError("Eval specification is required.") + + return results, had_errors From b74f0790e4ac4739971bbafd5f269289ea138ee3 Mon Sep 17 00:00:00 2001 From: Jonathan Bragg Date: Thu, 17 Jul 2025 16:16:36 -0700 Subject: [PATCH 3/8] Improve error handling in process_eval_logs --- src/agenteval/score.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/src/agenteval/score.py b/src/agenteval/score.py index 2d02169..4830ac0 100644 --- a/src/agenteval/score.py +++ b/src/agenteval/score.py @@ -10,7 +10,7 @@ read_eval_log, read_eval_log_samples, ) -from pydantic import BaseModel, Field, field_validator, field_serializer +from pydantic import BaseModel, Field, field_serializer, field_validator from .log import ModelUsageWithName, collect_model_usage, compute_model_cost @@ -18,7 +18,7 @@ # Fields with dict[str, Any] type that need 
JSON serialization for Arrow compatibility # Arrow/Parquet cannot handle dict[str, Any] so we serialize to JSON strings -_EVALSPEC_JSON_FIELDS = ['solver_args', 'model_args'] +_EVALSPEC_JSON_FIELDS = ["solver_args", "model_args"] class Metric(BaseModel): @@ -46,26 +46,30 @@ def from_eval_log(cls, log: EvalLog) -> "EvalSpec": model_args=log.eval.model_args, revision=log.eval.revision, ) - - @field_validator(*_EVALSPEC_JSON_FIELDS, mode='before') + + @field_validator(*_EVALSPEC_JSON_FIELDS, mode="before") @classmethod def deserialize_json_fields(cls, v): """Deserialize JSON strings back to Python objects. Raises on JSON errors.""" import json + if not isinstance(v, str): return v # Already deserialized or None return json.loads(v) - + @field_serializer(*_EVALSPEC_JSON_FIELDS) def serialize_json_fields(self, v): """Serialize Python objects to JSON strings. Logs errors and returns fallback.""" import json + if v is None: return None try: return json.dumps(v, default=str) except (TypeError, ValueError) as e: - logger.warning(f"Failed to serialize field to JSON: {e}, returning error indicator") + logger.warning( + f"Failed to serialize field to JSON: {e}, returning error indicator" + ) return json.dumps({"__serialization_error__": str(e)}) @@ -150,8 +154,8 @@ def process_eval_logs(log_dir: str) -> tuple[list[TaskResult], bool]: raise ValueError("No valid evaluation logs found.") results = [] - has_eval_specs = False for task_name, log in logs.items(): + eval_spec = EvalSpec.from_eval_log(log) try: metrics = get_metrics(log) if len(metrics) == 0: @@ -159,9 +163,6 @@ def process_eval_logs(log_dir: str) -> tuple[list[TaskResult], bool]: model_usages = get_model_usages(log) model_costs = [compute_model_cost(usages) for usages in model_usages] has_model_usages = any(len(usages) > 0 for usages in model_usages) - eval_spec = EvalSpec.from_eval_log(log) - has_eval_specs = True - results.append( TaskResult( task_name=task_name, @@ -176,7 +177,4 @@ def process_eval_logs(log_dir: str) -> tuple[list[TaskResult], bool]: had_errors = True logger.exception(f"No metrics for {task_name}:") - if not has_eval_specs: - raise ValueError("Eval specification is required.") - return results, had_errors From 094feb6595ca76b8a3263a360c7ca3985f2659d5 Mon Sep 17 00:00:00 2001 From: Jonathan Bragg Date: Thu, 17 Jul 2025 16:34:12 -0700 Subject: [PATCH 4/8] maintain backwards compatibility with old results missing evalspec --- src/agenteval/leaderboard/view.py | 2 +- src/agenteval/score.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/agenteval/leaderboard/view.py b/src/agenteval/leaderboard/view.py index db6e0d3..4bdabd8 100644 --- a/src/agenteval/leaderboard/view.py +++ b/src/agenteval/leaderboard/view.py @@ -195,7 +195,7 @@ def _get_dataframe( # only show source URL if all eval specs have the same revision source_url = None if ev.results: - task_revisions = [tr.eval_spec.revision for tr in ev.results if tr.eval_spec.revision] + task_revisions = [tr.eval_spec.revision for tr in ev.results if tr.eval_spec and tr.eval_spec.revision] if task_revisions and all(rev == task_revisions[0] for rev in task_revisions): revision = task_revisions[0] diff --git a/src/agenteval/score.py b/src/agenteval/score.py index 4830ac0..0a15de3 100644 --- a/src/agenteval/score.py +++ b/src/agenteval/score.py @@ -79,7 +79,7 @@ class TaskResult(BaseModel): task_name: str """Name of the task.""" - eval_spec: EvalSpec + eval_spec: EvalSpec | None = None """Evaluation specification used for this task.""" metrics: list[Metric] 
From 99f9ce2ad474f8eb10c0ccf59e8095055538079c Mon Sep 17 00:00:00 2001 From: Jonathan Bragg Date: Thu, 17 Jul 2025 16:37:15 -0700 Subject: [PATCH 5/8] fix docstring --- src/agenteval/models.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/agenteval/models.py b/src/agenteval/models.py index f017ead..9fb4e89 100644 --- a/src/agenteval/models.py +++ b/src/agenteval/models.py @@ -79,8 +79,7 @@ def dump_json_bytes( **model_dump_kwargs, ) -> bytes: """ - Return the JSON representation of this EvalResult as bytes, - always excluding `eval_specs` and null/default values. + Return the JSON representation of this EvalResult as bytes. """ return self.model_dump_json( indent=indent, From 678b298d46ea90974207cabb7c26722a87f6d015 Mon Sep 17 00:00:00 2001 From: Jonathan Bragg Date: Sun, 20 Jul 2025 14:00:01 -0700 Subject: [PATCH 6/8] sort keys for serialization consistency --- src/agenteval/score.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agenteval/score.py b/src/agenteval/score.py index 0a15de3..674d6b0 100644 --- a/src/agenteval/score.py +++ b/src/agenteval/score.py @@ -65,7 +65,7 @@ def serialize_json_fields(self, v): if v is None: return None try: - return json.dumps(v, default=str) + return json.dumps(v, default=str, sort_keys=True) except (TypeError, ValueError) as e: logger.warning( f"Failed to serialize field to JSON: {e}, returning error indicator" From f88cb034417365147440b450a71b52ea673d4137 Mon Sep 17 00:00:00 2001 From: Jonathan Bragg Date: Sun, 20 Jul 2025 18:01:31 -0700 Subject: [PATCH 7/8] log task_args and packages for understanding if submissions were run in a compliant way --- pyproject.toml | 2 +- src/agenteval/cli.py | 60 +++++++++++++++---- .../leaderboard/dataset_features.yml | 4 ++ src/agenteval/leaderboard/schema_generator.py | 6 +- src/agenteval/leaderboard/view.py | 38 +++++++----- src/agenteval/score.py | 24 +++++--- 6 files changed, 96 insertions(+), 38 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index cdc5c11..9c2a121 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ readme = "README.md" requires-python = ">=3.10" dependencies = [ "click", - "inspect-ai", + "inspect-ai>=0.3.104", "litellm", "pydantic>=2.0.0", # For leaderboard diff --git a/src/agenteval/cli.py b/src/agenteval/cli.py index a8b6c14..c6b132f 100644 --- a/src/agenteval/cli.py +++ b/src/agenteval/cli.py @@ -26,12 +26,12 @@ "c": "Closed", "api": "API Available", "os": "Open Source", - "ow": "Open Source + Open Weights" + "ow": "Open Source + Open Weights", } TOOL_MAPPING = { "s": "Standard", "css": "Custom with Standard Search", - "c": "Fully Custom" + "c": "Fully Custom", } @@ -177,15 +177,51 @@ def score_command( # Warn if multiple evaluation specs present if eval_result.results: - unique_specs = set() - for task_result in eval_result.results: - spec_hash = hash(task_result.eval_spec.model_dump_json()) - unique_specs.add(spec_hash) + # Check for different solver/model configurations (different agents) + unique_agent_specs = set() + # Check for different code versions (revision/packages) + unique_code_specs = set() - if len(unique_specs) > 1: + for task_result in eval_result.results: + if task_result.eval_spec: + agent_hash = hash( + task_result.eval_spec.model_dump_json( + include={"solver", "solver_args", "model", "model_args"} + ) + ) + unique_agent_specs.add(agent_hash) + + code_hash = hash( + task_result.eval_spec.model_dump_json( + include={"revision", "packages"} + ) + ) + unique_code_specs.add(code_hash) + + if 
len(unique_agent_specs) > 1: + click.echo( + f"Warning: Found {len(unique_agent_specs)} different agent configurations. " + "Use a single solver + model config per log directory to measure a single " + "agent's performance across tasks." + ) + + if len(unique_code_specs) > 1: + click.echo( + f"Warning: Found {len(unique_code_specs)} different code versions " + "(revision/packages). This may indicate mixed evaluation runs from " + "different code states." + ) + + # Warn if user-specified task arguments are present + tasks_with_args = [] + for task_result in eval_result.results: + if task_result.eval_spec and task_result.eval_spec.task_args_passed: + tasks_with_args.append(task_result.task_name) + + if tasks_with_args: click.echo( - f"Warning: Found {len(unique_specs)} different eval specs. " - "Logs may come from mixed runs." + f"Warning: User-specified task arguments found for tasks: {', '.join(tasks_with_args)}. " + "For fair comparison, do not override the task arg defaults." ) # Warn about any missing tasks @@ -239,13 +275,15 @@ def score_command( help="HF repo id for result stats. Defaults to RESULTS_REPO_ID env var.", ) @click.option( - "-o", "--openness", + "-o", + "--openness", type=AliasedChoice(OPENNESS_MAPPING), required=True, help=generate_choice_help(OPENNESS_MAPPING, "Level of openness for the agent."), ) @click.option( - "-t", "--tool-usage", + "-t", + "--tool-usage", type=AliasedChoice(TOOL_MAPPING), required=True, help=generate_choice_help(TOOL_MAPPING, "Tool choices available to the agent."), diff --git a/src/agenteval/leaderboard/dataset_features.yml b/src/agenteval/leaderboard/dataset_features.yml index d894e08..7865d2f 100644 --- a/src/agenteval/leaderboard/dataset_features.yml +++ b/src/agenteval/leaderboard/dataset_features.yml @@ -42,6 +42,8 @@ dtype: string - name: model_args dtype: string + - name: task_args + dtype: string - name: revision struct: - name: type @@ -50,6 +52,8 @@ dtype: string - name: commit dtype: string + - name: packages + dtype: string - name: metrics list: - name: name diff --git a/src/agenteval/leaderboard/schema_generator.py b/src/agenteval/leaderboard/schema_generator.py index 7222c43..299bd38 100644 --- a/src/agenteval/leaderboard/schema_generator.py +++ b/src/agenteval/leaderboard/schema_generator.py @@ -40,11 +40,11 @@ def _pa_type_for_annotation(anno) -> pa.DataType: if origin is list: inner = get_args(anno)[0] return pa.list_(_pa_type_for_annotation(inner)) - # Handle dict[str, Any] specifically - these are serialized as JSON strings + # Handle dict[str, Any] and dict[str, str] specifically - these are serialized as JSON strings if origin is dict: args = get_args(anno) - if len(args) == 2 and args[0] is str and args[1] is Any: - return pa.string() # dict[str, Any] becomes JSON string + if len(args) == 2 and args[0] is str and (args[1] is Any or args[1] is str): + return pa.string() # dict[str, Any] and dict[str, str] become JSON strings # Other dict types could be handled as proper Arrow maps/structs # For now, fall through to unsupported # Handle Literal types - infer type from literal values diff --git a/src/agenteval/leaderboard/view.py b/src/agenteval/leaderboard/view.py index 4bdabd8..de431b3 100644 --- a/src/agenteval/leaderboard/view.py +++ b/src/agenteval/leaderboard/view.py @@ -141,9 +141,9 @@ def _get_dataframe( for usage_list in task_result.model_usages: for model_usage in usage_list: base_models.add(model_usage.model) - + model_names = sorted(list(base_models)) - + sub = ev.submission # only format if submit_time present, 
else leave as None ts = sub.submit_time @@ -195,27 +195,37 @@ def _get_dataframe( # only show source URL if all eval specs have the same revision source_url = None if ev.results: - task_revisions = [tr.eval_spec.revision for tr in ev.results if tr.eval_spec and tr.eval_spec.revision] - if task_revisions and all(rev == task_revisions[0] for rev in task_revisions): + task_revisions = [ + tr.eval_spec.revision + for tr in ev.results + if tr.eval_spec and tr.eval_spec.revision + ] + if task_revisions and all( + rev == task_revisions[0] for rev in task_revisions + ): revision = task_revisions[0] - + # Only handle git revisions with complete info - if (revision and revision.type == 'git' and - revision.origin and revision.commit): + if ( + revision + and revision.type == "git" + and revision.origin + and revision.commit + ): origin = revision.origin commit = revision.commit - + # Convert SSH URLs to HTTPS URLs - if origin.startswith('git@'): + if origin.startswith("git@"): # Convert git@github.com:user/repo.git to https://github.com/user/repo - origin = origin.replace('git@', 'https://').replace(':', '/', 1) - + origin = origin.replace("git@", "https://").replace(":", "/", 1) + # Remove .git suffix if present - if origin.endswith('.git'): + if origin.endswith(".git"): origin = origin[:-4] - + # Only create URL if it looks like a valid HTTP(S) URL - if origin.startswith(('http://', 'https://')): + if origin.startswith(("http://", "https://")): source_url = f"{origin}/tree/{commit}" rows.append( diff --git a/src/agenteval/score.py b/src/agenteval/score.py index 674d6b0..5f65391 100644 --- a/src/agenteval/score.py +++ b/src/agenteval/score.py @@ -16,9 +16,9 @@ logger = logging.getLogger(__name__) -# Fields with dict[str, Any] type that need JSON serialization for Arrow compatibility -# Arrow/Parquet cannot handle dict[str, Any] so we serialize to JSON strings -_EVALSPEC_JSON_FIELDS = ["solver_args", "model_args"] +# Fields with dict type that need JSON serialization for Arrow/Parquet and HuggingFace datasets compatibility +# These systems cannot handle dict types so we serialize to JSON strings +_EVALSPEC_JSON_FIELDS = ["solver_args", "model_args", "task_args", "packages"] class Metric(BaseModel): @@ -34,8 +34,11 @@ class EvalSpec(BaseModel): solver: str | None = None solver_args: dict[str, Any] | None = None model: str - model_args: dict[str, Any] = Field(default_factory=dict) + model_args: dict[str, Any] | None = None + task_args: dict[str, Any] | None = None + task_args_passed: dict[str, Any] | None = Field(default=None, exclude=True) revision: EvalRevision | None = None + packages: dict[str, str] | None = None @classmethod def from_eval_log(cls, log: EvalLog) -> "EvalSpec": @@ -44,7 +47,10 @@ def from_eval_log(cls, log: EvalLog) -> "EvalSpec": solver_args=log.eval.solver_args, model=log.eval.model, model_args=log.eval.model_args, + task_args=log.eval.task_args, + task_args_passed=log.eval.task_args_passed, revision=log.eval.revision, + packages=log.eval.packages, ) @field_validator(*_EVALSPEC_JSON_FIELDS, mode="before") @@ -77,19 +83,19 @@ class TaskResult(BaseModel): """Results for a single task.""" task_name: str - """Name of the task.""" + """Name of the task. Derived from Inspect `EvalLog.eval.task`.""" eval_spec: EvalSpec | None = None - """Evaluation specification used for this task.""" + """Evaluation specification used for this task. Derived from Inspect `EvalLog.eval`.""" metrics: list[Metric] - """List of metrics.""" + """List of metrics. 
Derived from Inspect `EvalLog.results.scores`.""" model_usages: list[list[ModelUsageWithName]] | None = None - """List of model usage lists per sample.""" + """List of model usage lists per sample. Derived from Inspect `EvalLog.samples`.""" model_costs: list[float | None] | None = None - """List of model costs per sample.""" + """List of model costs per sample. Computed from `model_usages`.""" def get_metrics(log: EvalLog) -> list[Metric]: From 0dfbf41271f1d570044d5f8c21bcd0e93d5a7e2f Mon Sep 17 00:00:00 2001 From: Regan Huff <35933912+regan-huff@users.noreply.github.com> Date: Thu, 24 Jul 2025 12:07:24 -0700 Subject: [PATCH 8/8] Update pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 9c2a121..c6feecc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "agent-eval" -version = "0.1.16" +version = "0.1.18" description = "Agent evaluation toolkit" readme = "README.md" requires-python = ">=3.10"
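
A self-contained sketch, not part of the patch series, of the JSON-string round-trip used for _EVALSPEC_JSON_FIELDS: a hypothetical SpecLike model stands in for EvalSpec, and only pydantic v2 (already required via pydantic>=2.0.0) is assumed.

import json
from typing import Any

from pydantic import BaseModel, field_serializer, field_validator

# Hypothetical stand-in for EvalSpec (the real model lives in src/agenteval/score.py),
# showing only the two original JSON-string fields.
_JSON_FIELDS = ["solver_args", "model_args"]


class SpecLike(BaseModel):
    solver_args: dict[str, Any] | None = None
    model_args: dict[str, Any] | None = None

    @field_validator(*_JSON_FIELDS, mode="before")
    @classmethod
    def _loads(cls, v):
        # Accept either a dict or a JSON string read back from Parquet/HF datasets.
        return json.loads(v) if isinstance(v, str) else v

    @field_serializer(*_JSON_FIELDS)
    def _dumps(self, v):
        # Dump dicts as deterministic JSON strings (sort_keys as in PATCH 6/8).
        return None if v is None else json.dumps(v, default=str, sort_keys=True)


spec = SpecLike(solver_args={"b": 2, "a": 1})
dumped = spec.model_dump()      # {'solver_args': '{"a": 1, "b": 2}', 'model_args': None}
restored = SpecLike(**dumped)   # validator turns the JSON string back into a dict
assert restored.solver_args == {"a": 1, "b": 2}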