Pipe through evalspecs w/ git revision, packages, and solver/model/task args for leaderboard #24

Merged
merged 10 commits on Jul 24, 2025
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -4,13 +4,13 @@ build-backend = "setuptools.build_meta"

[project]
name = "agent-eval"
version = "0.1.17"
version = "0.1.18"
description = "Agent evaluation toolkit"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
"click",
"inspect-ai",
"inspect-ai>=0.3.104",
"litellm",
"pydantic>=2.0.0",
# For leaderboard
65 changes: 54 additions & 11 deletions src/agenteval/cli.py
@@ -26,12 +26,12 @@
"c": "Closed",
"api": "API Available",
"os": "Open Source",
"ow": "Open Source + Open Weights"
"ow": "Open Source + Open Weights",
}
TOOL_MAPPING = {
"s": "Standard",
"css": "Custom with Standard Search",
"c": "Fully Custom"
"c": "Fully Custom",
}


@@ -172,16 +172,57 @@ def score_command(
suite_cfg = load_suite_config(config_path)
eval_result = EvalResult(suite_config=suite_cfg, split=split)

task_results, eval_specs, had_errors = process_eval_logs(log_dir)
eval_result.eval_specs = eval_specs
task_results, had_errors = process_eval_logs(log_dir)
eval_result.results = task_results

# Warn if multiple evaluation specs present
if eval_result.eval_specs and len(eval_result.eval_specs) > 1:
click.echo(
f"Warning: Found {len(eval_result.eval_specs)} different eval specs. "
"Logs may come from mixed runs."
)
if eval_result.results:
# Check for different solver/model configurations (different agents)
unique_agent_specs = set()
# Check for different code versions (revision/packages)
unique_code_specs = set()

for task_result in eval_result.results:
if task_result.eval_spec:
agent_hash = hash(
task_result.eval_spec.model_dump_json(
include={"solver", "solver_args", "model", "model_args"}
)
)
unique_agent_specs.add(agent_hash)

code_hash = hash(
task_result.eval_spec.model_dump_json(
include={"revision", "packages"}
)
)
unique_code_specs.add(code_hash)

if len(unique_agent_specs) > 1:
click.echo(
f"Warning: Found {len(unique_agent_specs)} different agent configurations. "
"Use a single solver + model config per log directory to measure a single "
"agent's performance across tasks."
)

if len(unique_code_specs) > 1:
click.echo(
f"Warning: Found {len(unique_code_specs)} different code versions "
"(revision/packages). This may indicate mixed evaluation runs from "
"different code states."
)

# Warn if user-specified task arguments are present
tasks_with_args = []
for task_result in eval_result.results:
if task_result.eval_spec and task_result.eval_spec.task_args_passed:
tasks_with_args.append(task_result.task_name)

if tasks_with_args:
click.echo(
f"Warning: User-specified task arguments found for tasks: {', '.join(tasks_with_args)}. "
"For fair comparison, do not override the task arg defaults."
)

# Warn about any missing tasks
missing_tasks = eval_result.find_missing_tasks()
@@ -234,13 +275,15 @@ def score_command(
help="HF repo id for result stats. Defaults to RESULTS_REPO_ID env var.",
)
@click.option(
"-o", "--openness",
"-o",
"--openness",
type=AliasedChoice(OPENNESS_MAPPING),
required=True,
help=generate_choice_help(OPENNESS_MAPPING, "Level of openness for the agent."),
)
@click.option(
"-t", "--tool-usage",
"-t",
"--tool-usage",
type=AliasedChoice(TOOL_MAPPING),
required=True,
help=generate_choice_help(TOOL_MAPPING, "Tool choices available to the agent."),
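To make the intent of the new warnings concrete, here is a minimal standalone sketch of the same de-duplication idea: each log's eval spec is reduced to a JSON dump of only the agent-defining fields (solver/model) or only the code-defining fields (revision/packages), and the number of distinct dumps reveals mixed configurations. The EvalSpec stub and its field types below are hypothetical simplifications, not the project's actual model.

from typing import Any

from pydantic import BaseModel


class EvalSpec(BaseModel):
    # Hypothetical stub; the real model carries more fields and stricter types.
    solver: str | None = None
    solver_args: dict[str, Any] | None = None
    model: str | None = None
    model_args: dict[str, Any] | None = None
    revision: dict[str, Any] | None = None
    packages: dict[str, str] | None = None


def count_distinct(specs: list[EvalSpec], fields: set[str]) -> int:
    """Count distinct configurations, comparing only the given fields."""
    return len({spec.model_dump_json(include=fields) for spec in specs})


specs = [
    EvalSpec(solver="react", model="gpt-4o", packages={"inspect-ai": "0.3.104"}),
    EvalSpec(solver="react", model="gpt-4o", packages={"inspect-ai": "0.3.105"}),
]
# Same agent configuration, but two different code states -> only the second warning fires.
assert count_distinct(specs, {"solver", "solver_args", "model", "model_args"}) == 1
assert count_distinct(specs, {"revision", "packages"}) == 2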
22 changes: 22 additions & 0 deletions src/agenteval/leaderboard/dataset_features.yml
@@ -32,6 +32,28 @@
list:
- name: task_name
dtype: string
- name: eval_spec
struct:
- name: solver
dtype: string
- name: solver_args
dtype: string
- name: model
dtype: string
- name: model_args
dtype: string
- name: task_args
dtype: string
- name: revision
struct:
- name: type
dtype: string
- name: origin
dtype: string
- name: commit
dtype: string
- name: packages
dtype: string
- name: metrics
list:
- name: name
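For illustration, a hypothetical record matching the eval_spec features declared above, following this PR's convention that free-form mappings (solver_args, model_args, task_args, packages) are stored as JSON strings while revision remains a nested struct; all values are invented.

# Hypothetical leaderboard row fragment for the eval_spec features above.
example_eval_spec = {
    "solver": "example_solver",                  # invented value
    "solver_args": '{"max_steps": 30}',          # dict serialized as a JSON string
    "model": "openai/gpt-4o",                    # invented value
    "model_args": '{"temperature": 0.0}',        # dict serialized as a JSON string
    "task_args": "{}",                           # dict serialized as a JSON string
    "revision": {                                # nested struct, not a JSON string
        "type": "git",
        "origin": "https://github.com/user/repo",
        "commit": "0123456789abcdef0123456789abcdef01234567",
    },
    "packages": '{"inspect-ai": "0.3.104"}',     # dict serialized as a JSON string
}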
32 changes: 31 additions & 1 deletion src/agenteval/leaderboard/schema_generator.py
@@ -5,7 +5,7 @@
import datetime
import types
from importlib import resources
from typing import Union, get_args, get_origin
from typing import Any, Literal, Union, get_args, get_origin

import pyarrow as pa
import yaml
@@ -40,6 +40,36 @@ def _pa_type_for_annotation(anno) -> pa.DataType:
if origin is list:
inner = get_args(anno)[0]
return pa.list_(_pa_type_for_annotation(inner))
# Handle dict[str, Any] and dict[str, str] specifically - these are serialized as JSON strings
if origin is dict:
args = get_args(anno)
if len(args) == 2 and args[0] is str and (args[1] is Any or args[1] is str):
return pa.string() # dict[str, Any] and dict[str, str] become JSON strings
# Other dict types could be handled as proper Arrow maps/structs
# For now, fall through to unsupported
# Handle Literal types - infer type from literal values
if origin is Literal:
literal_values = get_args(anno)
if not literal_values:
return pa.string() # fallback

# Check that all literal values are the same type
first_type = type(literal_values[0])
for value in literal_values:
if type(value) != first_type:
raise ValueError(f"Literal {anno} contains mixed types: {[type(v) for v in literal_values]}")

# Map Python type to Arrow type
if first_type is str:
return pa.string()
elif first_type is int:
return pa.int64()
elif first_type is bool:
return pa.bool_()
elif first_type is float:
return pa.float64()
else:
raise ValueError(f"Unsupported literal type {first_type} in {anno}")
# Nested BaseModel
if isinstance(anno, type) and issubclass(anno, BaseModel):
inner_schema = _schema_from_pydantic(anno)
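To check the behaviour of the new branches in isolation, here is a self-contained sketch assuming pyarrow is installed; annotation_to_arrow is a hypothetical stand-in for _pa_type_for_annotation that reproduces only the dict and Literal handling added above.

from typing import Any, Literal, get_args, get_origin

import pyarrow as pa


def annotation_to_arrow(anno) -> pa.DataType:
    """Map dict[str, Any]/dict[str, str] and homogeneous Literal[...] to Arrow types."""
    origin = get_origin(anno)
    if origin is dict:
        key_type, value_type = get_args(anno)
        if key_type is str and value_type in (Any, str):
            return pa.string()  # serialized as a JSON string
        raise ValueError(f"Unsupported dict annotation: {anno}")
    if origin is Literal:
        value_types = {type(v) for v in get_args(anno)}
        if len(value_types) != 1:
            raise ValueError(f"Literal {anno} contains mixed types")
        arrow_types = {str: pa.string(), int: pa.int64(), bool: pa.bool_(), float: pa.float64()}
        return arrow_types[value_types.pop()]
    raise ValueError(f"Unsupported annotation: {anno}")


assert annotation_to_arrow(dict[str, Any]) == pa.string()
assert annotation_to_arrow(Literal["git", "hg"]) == pa.string()
assert annotation_to_arrow(Literal[1, 2, 3]) == pa.int64()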
44 changes: 42 additions & 2 deletions src/agenteval/leaderboard/view.py
@@ -69,6 +69,7 @@ def view(
"User/organization",
"Submission date",
"Logs",
"Source",
"Openness",
"Agent tooling",
"LLM base",
@@ -151,9 +152,9 @@ def _get_dataframe(
for usage_list in task_result.model_usages:
for model_usage in usage_list:
base_models.add(model_usage.model)

model_names = sorted(list(base_models))

sub = ev.submission
# only format if submit_time present, else leave as None
ts = sub.submit_time
@@ -201,6 +202,43 @@
}
)

# extract git revision source code URL with SHA
# only show source URL if all eval specs have the same revision
source_url = None
if ev.results:
task_revisions = [
tr.eval_spec.revision
for tr in ev.results
if tr.eval_spec and tr.eval_spec.revision
]
if task_revisions and all(
rev == task_revisions[0] for rev in task_revisions
):
revision = task_revisions[0]

# Only handle git revisions with complete info
if (
revision
and revision.type == "git"
and revision.origin
and revision.commit
):
origin = revision.origin
commit = revision.commit

# Convert SSH URLs to HTTPS URLs
if origin.startswith("git@"):
# Convert [email protected]:user/repo.git to https://github.com/user/repo
origin = origin.replace(":", "/", 1).replace("git@", "https://")

# Remove .git suffix if present
if origin.endswith(".git"):
origin = origin[:-4]

# Only create URL if it looks like a valid HTTP(S) URL
if origin.startswith(("http://", "https://")):
source_url = f"{origin}/tree/{commit}"

rows.append(
{
"id": sub.submit_time,
@@ -213,6 +251,7 @@
"base_models": model_names,
**flat,
"logs_url": sub.logs_url if is_internal else sub.logs_url_public,
"source_url": source_url,
}
)

@@ -239,6 +278,7 @@ def _pretty_column_name(col: str) -> str:
"tool_usage": "Agent tooling",
"base_models": "LLM base",
"logs_url": "Logs",
"source_url": "Source",
"overall/score": "Overall",
"overall/cost": "Overall cost",
}
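The origin-normalization logic above can also be read as a small pure function; the helper below is a sketch (revision_source_url is not a name from this PR) covering the common [email protected]:user/repo.git case and returning None for anything that does not end up as an HTTP(S) URL.

def revision_source_url(origin: str, commit: str) -> str | None:
    """Build a browsable source URL pinned to a commit, or return None."""
    # Convert [email protected]:user/repo.git to https://github.com/user/repo
    if origin.startswith("git@"):
        origin = origin.replace(":", "/", 1).replace("git@", "https://")
    if origin.endswith(".git"):
        origin = origin[:-len(".git")]
    if not origin.startswith(("http://", "https://")):
        return None
    return f"{origin}/tree/{commit}"


assert (
    revision_source_url("[email protected]:user/repo.git", "abc123")
    == "https://github.com/user/repo/tree/abc123"
)
assert revision_source_url("ssh://host/repo.git", "abc123") is None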
6 changes: 2 additions & 4 deletions src/agenteval/models.py
@@ -6,7 +6,7 @@

from .config import SuiteConfig
from .io import atomic_write_file
from .score import EvalSpec, TaskResult
from .score import TaskResult


class EvalConfig(BaseModel):
@@ -33,7 +33,6 @@ class SubmissionMetadata(BaseModel):


class EvalResult(EvalConfig):
eval_specs: list[EvalSpec] | None = Field(default=None, exclude=True)
results: list[TaskResult] | None = None
submission: SubmissionMetadata = Field(default_factory=SubmissionMetadata)

@@ -80,8 +79,7 @@ def dump_json_bytes(
**model_dump_kwargs,
) -> bytes:
"""
Return the JSON representation of this EvalResult as bytes,
always excluding `eval_specs` and null/default values.
Return the JSON representation of this EvalResult as bytes.
"""
return self.model_dump_json(
indent=indent,