Pipe through evalspecs w/ git revision, packages, and solver/model/task args for leaderboard #24

Merged
merged 10 commits on Jul 24, 2025
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -4,13 +4,13 @@ build-backend = "setuptools.build_meta"

[project]
name = "agent-eval"
version = "0.1.17"
version = "0.1.18"
description = "Agent evaluation toolkit"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
"click",
"inspect-ai",
"inspect-ai>=0.3.104",
"litellm",
"pydantic>=2.0.0",
# For leaderboard
65 changes: 54 additions & 11 deletions src/agenteval/cli.py
@@ -26,12 +26,12 @@
"c": "Closed",
"api": "API Available",
"os": "Open Source",
"ow": "Open Source + Open Weights"
"ow": "Open Source + Open Weights",
}
TOOL_MAPPING = {
"s": "Standard",
"css": "Custom with Standard Search",
"c": "Fully Custom"
"c": "Fully Custom",
}


@@ -172,16 +172,57 @@ def score_command(
suite_cfg = load_suite_config(config_path)
eval_result = EvalResult(suite_config=suite_cfg, split=split)

task_results, eval_specs, had_errors = process_eval_logs(log_dir)
eval_result.eval_specs = eval_specs
task_results, had_errors = process_eval_logs(log_dir)
eval_result.results = task_results

# Warn if multiple evaluation specs present
if eval_result.eval_specs and len(eval_result.eval_specs) > 1:
click.echo(
f"Warning: Found {len(eval_result.eval_specs)} different eval specs. "
"Logs may come from mixed runs."
)
if eval_result.results:
# Check for different solver/model configurations (different agents)
unique_agent_specs = set()
# Check for different code versions (revision/packages)
unique_code_specs = set()

for task_result in eval_result.results:
if task_result.eval_spec:
agent_hash = hash(
task_result.eval_spec.model_dump_json(
include={"solver", "solver_args", "model", "model_args"}
)
)
unique_agent_specs.add(agent_hash)

code_hash = hash(
task_result.eval_spec.model_dump_json(
include={"revision", "packages"}
)
)
unique_code_specs.add(code_hash)

if len(unique_agent_specs) > 1:
click.echo(
f"Warning: Found {len(unique_agent_specs)} different agent configurations. "
"Use a single solver + model config per log directory to measure a single "
"agent's performance across tasks."
)

if len(unique_code_specs) > 1:
click.echo(
f"Warning: Found {len(unique_code_specs)} different code versions "
"(revision/packages). This may indicate mixed evaluation runs from "
"different code states."
)

# Warn if user-specified task arguments are present
tasks_with_args = []
for task_result in eval_result.results:
if task_result.eval_spec and task_result.eval_spec.task_args_passed:
tasks_with_args.append(task_result.task_name)

if tasks_with_args:
click.echo(
f"Warning: User-specified task arguments found for tasks: {', '.join(tasks_with_args)}. "
"For fair comparison, do not override the task arg defaults."
)

# Warn about any missing tasks
missing_tasks = eval_result.find_missing_tasks()
@@ -234,13 +275,15 @@ def score_command(
help="HF repo id for result stats. Defaults to RESULTS_REPO_ID env var.",
)
@click.option(
"-o", "--openness",
"-o",
"--openness",
type=AliasedChoice(OPENNESS_MAPPING),
required=True,
help=generate_choice_help(OPENNESS_MAPPING, "Level of openness for the agent."),
)
@click.option(
"-t", "--tool-usage",
"-t",
"--tool-usage",
type=AliasedChoice(TOOL_MAPPING),
required=True,
help=generate_choice_help(TOOL_MAPPING, "Tool choices available to the agent."),
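To make the intent of the new warnings concrete, here is a minimal standalone sketch of the same de-duplication idea: each log's eval spec is reduced to a JSON dump of only the agent-defining fields (solver/model) or only the code-defining fields (revision/packages), and the number of distinct dumps reveals mixed configurations. The EvalSpec stub and its field types below are hypothetical simplifications, not the project's actual model.

from typing import Any

from pydantic import BaseModel


class EvalSpec(BaseModel):
    # Hypothetical stub; the real model carries more fields and stricter types.
    solver: str | None = None
    solver_args: dict[str, Any] | None = None
    model: str | None = None
    model_args: dict[str, Any] | None = None
    revision: dict[str, Any] | None = None
    packages: dict[str, str] | None = None


def count_distinct(specs: list[EvalSpec], fields: set[str]) -> int:
    """Count distinct configurations, comparing only the given fields."""
    return len({spec.model_dump_json(include=fields) for spec in specs})


specs = [
    EvalSpec(solver="react", model="gpt-4o", packages={"inspect-ai": "0.3.104"}),
    EvalSpec(solver="react", model="gpt-4o", packages={"inspect-ai": "0.3.105"}),
]
# Same agent configuration, but two different code states -> only the second warning fires.
assert count_distinct(specs, {"solver", "solver_args", "model", "model_args"}) == 1
assert count_distinct(specs, {"revision", "packages"}) == 2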
22 changes: 22 additions & 0 deletions src/agenteval/leaderboard/dataset_features.yml
@@ -32,6 +32,28 @@
list:
- name: task_name
dtype: string
- name: eval_spec
struct:
- name: solver
dtype: string
- name: solver_args
dtype: string
- name: model
dtype: string
- name: model_args
dtype: string
- name: task_args
dtype: string
- name: revision
struct:
- name: type
dtype: string
- name: origin
dtype: string
- name: commit
dtype: string
- name: packages
dtype: string
- name: metrics
list:
- name: name
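For illustration, a hypothetical record matching the eval_spec features declared above, following this PR's convention that free-form mappings (solver_args, model_args, task_args, packages) are stored as JSON strings while revision remains a nested struct; all values are invented.

# Hypothetical leaderboard row fragment for the eval_spec features above.
example_eval_spec = {
    "solver": "example_solver",                  # invented value
    "solver_args": '{"max_steps": 30}',          # dict serialized as a JSON string
    "model": "openai/gpt-4o",                    # invented value
    "model_args": '{"temperature": 0.0}',        # dict serialized as a JSON string
    "task_args": "{}",                           # dict serialized as a JSON string
    "revision": {                                # nested struct, not a JSON string
        "type": "git",
        "origin": "https://github.com/user/repo",
        "commit": "0123456789abcdef0123456789abcdef01234567",
    },
    "packages": '{"inspect-ai": "0.3.104"}',     # dict serialized as a JSON string
}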
32 changes: 31 additions & 1 deletion src/agenteval/leaderboard/schema_generator.py
@@ -5,7 +5,7 @@
import datetime
import types
from importlib import resources
from typing import Union, get_args, get_origin
from typing import Any, Literal, Union, get_args, get_origin

import pyarrow as pa
import yaml
@@ -40,6 +40,36 @@ def _pa_type_for_annotation(anno) -> pa.DataType:
if origin is list:
inner = get_args(anno)[0]
return pa.list_(_pa_type_for_annotation(inner))
# Handle dict[str, Any] and dict[str, str] specifically - these are serialized as JSON strings
if origin is dict:
args = get_args(anno)
if len(args) == 2 and args[0] is str and (args[1] is Any or args[1] is str):
return pa.string() # dict[str, Any] and dict[str, str] become JSON strings
# Other dict types could be handled as proper Arrow maps/structs
# For now, fall through to unsupported
# Handle Literal types - infer type from literal values
if origin is Literal:
literal_values = get_args(anno)
if not literal_values:
return pa.string() # fallback

# Check that all literal values are the same type
first_type = type(literal_values[0])
for value in literal_values:
if type(value) != first_type:
raise ValueError(f"Literal {anno} contains mixed types: {[type(v) for v in literal_values]}")

# Map Python type to Arrow type
if first_type is str:
return pa.string()
elif first_type is int:
return pa.int64()
elif first_type is bool:
return pa.bool_()
elif first_type is float:
return pa.float64()
else:
raise ValueError(f"Unsupported literal type {first_type} in {anno}")
# Nested BaseModel
if isinstance(anno, type) and issubclass(anno, BaseModel):
inner_schema = _schema_from_pydantic(anno)
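To check the behaviour of the new branches in isolation, here is a self-contained sketch assuming pyarrow is installed; annotation_to_arrow is a hypothetical stand-in for _pa_type_for_annotation that reproduces only the dict and Literal handling added above.

from typing import Any, Literal, get_args, get_origin

import pyarrow as pa


def annotation_to_arrow(anno) -> pa.DataType:
    """Map dict[str, Any]/dict[str, str] and homogeneous Literal[...] to Arrow types."""
    origin = get_origin(anno)
    if origin is dict:
        key_type, value_type = get_args(anno)
        if key_type is str and value_type in (Any, str):
            return pa.string()  # serialized as a JSON string
        raise ValueError(f"Unsupported dict annotation: {anno}")
    if origin is Literal:
        value_types = {type(v) for v in get_args(anno)}
        if len(value_types) != 1:
            raise ValueError(f"Literal {anno} contains mixed types")
        arrow_types = {str: pa.string(), int: pa.int64(), bool: pa.bool_(), float: pa.float64()}
        return arrow_types[value_types.pop()]
    raise ValueError(f"Unsupported annotation: {anno}")


assert annotation_to_arrow(dict[str, Any]) == pa.string()
assert annotation_to_arrow(Literal["git", "hg"]) == pa.string()
assert annotation_to_arrow(Literal[1, 2, 3]) == pa.int64()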
44 changes: 42 additions & 2 deletions src/agenteval/leaderboard/view.py
@@ -69,6 +69,7 @@ def view(
"User/organization",
"Submission date",
"Logs",
"Source",
"Openness",
"Agent tooling",
"LLM base",
@@ -151,9 +152,9 @@ def _get_dataframe(
for usage_list in task_result.model_usages:
for model_usage in usage_list:
base_models.add(model_usage.model)

model_names = sorted(list(base_models))

sub = ev.submission
# only format if submit_time present, else leave as None
ts = sub.submit_time
@@ -201,6 +202,43 @@
}
)

# extract git revision source code URL with SHA
# only show source URL if all eval specs have the same revision
source_url = None
if ev.results:
task_revisions = [
tr.eval_spec.revision
for tr in ev.results
if tr.eval_spec and tr.eval_spec.revision
]
if task_revisions and all(
rev == task_revisions[0] for rev in task_revisions
):
revision = task_revisions[0]

# Only handle git revisions with complete info
if (
revision
and revision.type == "git"
and revision.origin
and revision.commit
):
origin = revision.origin
commit = revision.commit

# Convert SSH URLs to HTTPS URLs
if origin.startswith("git@"):
# Convert [email protected]:user/repo.git to https://github.com/user/repo
origin = origin.replace(":", "/", 1).replace("git@", "https://")

# Remove .git suffix if present
if origin.endswith(".git"):
origin = origin[:-4]

# Only create URL if it looks like a valid HTTP(S) URL
if origin.startswith(("http://", "https://")):
source_url = f"{origin}/tree/{commit}"

rows.append(
{
"id": sub.submit_time,
@@ -213,6 +251,7 @@
"base_models": model_names,
**flat,
"logs_url": sub.logs_url if is_internal else sub.logs_url_public,
"source_url": source_url,
}
)

@@ -239,6 +278,7 @@ def _pretty_column_name(col: str) -> str:
"tool_usage": "Agent tooling",
"base_models": "LLM base",
"logs_url": "Logs",
"source_url": "Source",
"overall/score": "Overall",
"overall/cost": "Overall cost",
}
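The origin-normalization logic above can also be read as a small pure function; the helper below is a sketch (revision_source_url is not a name from this PR) covering the common [email protected]:user/repo.git case and returning None for anything that does not end up as an HTTP(S) URL.

def revision_source_url(origin: str, commit: str) -> str | None:
    """Build a browsable source URL pinned to a commit, or return None."""
    # Convert [email protected]:user/repo.git to https://github.com/user/repo
    if origin.startswith("git@"):
        origin = origin.replace(":", "/", 1).replace("git@", "https://")
    if origin.endswith(".git"):
        origin = origin[:-len(".git")]
    if not origin.startswith(("http://", "https://")):
        return None
    return f"{origin}/tree/{commit}"


assert (
    revision_source_url("[email protected]:user/repo.git", "abc123")
    == "https://github.com/user/repo/tree/abc123"
)
assert revision_source_url("ssh://host/repo.git", "abc123") is None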
6 changes: 2 additions & 4 deletions src/agenteval/models.py
@@ -6,7 +6,7 @@

from .config import SuiteConfig
from .io import atomic_write_file
from .score import EvalSpec, TaskResult
from .score import TaskResult


class EvalConfig(BaseModel):
@@ -33,7 +33,6 @@ class SubmissionMetadata(BaseModel):


class EvalResult(EvalConfig):
eval_specs: list[EvalSpec] | None = Field(default=None, exclude=True)
results: list[TaskResult] | None = None
submission: SubmissionMetadata = Field(default_factory=SubmissionMetadata)

@@ -80,8 +79,7 @@ def dump_json_bytes(
**model_dump_kwargs,
) -> bytes:
"""
Return the JSON representation of this EvalResult as bytes,
always excluding `eval_specs` and null/default values.
Return the JSON representation of this EvalResult as bytes.
"""
return self.model_dump_json(
indent=indent,