From 33de05264010dd578e13039a64d979c23bebecd6 Mon Sep 17 00:00:00 2001 From: Jonathan Bragg Date: Thu, 17 Jul 2025 12:53:06 -0700 Subject: [PATCH 1/8] Pipe through evalspecs and git revision for leaderboard --- pyproject.toml | 2 +- .../leaderboard/dataset_features.yml | 18 +++++++++++ src/agenteval/leaderboard/schema_generator.py | 32 ++++++++++++++++++- src/agenteval/leaderboard/view.py | 28 ++++++++++++++++ src/agenteval/models.py | 2 +- src/agenteval/score.py | 27 +++++++++++++++- 6 files changed, 105 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f81f191..cdc5c11 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "agent-eval" -version = "0.1.15" +version = "0.1.16" description = "Agent evaluation toolkit" readme = "README.md" requires-python = ">=3.10" diff --git a/src/agenteval/leaderboard/dataset_features.yml b/src/agenteval/leaderboard/dataset_features.yml index a0773ac..2ea6690 100644 --- a/src/agenteval/leaderboard/dataset_features.yml +++ b/src/agenteval/leaderboard/dataset_features.yml @@ -28,6 +28,24 @@ dtype: float64 - name: split dtype: string +- name: eval_specs + list: + - name: solver + dtype: string + - name: solver_args + dtype: string + - name: model + dtype: string + - name: model_args + dtype: string + - name: revision + struct: + - name: type + dtype: string + - name: origin + dtype: string + - name: commit + dtype: string - name: results list: - name: task_name diff --git a/src/agenteval/leaderboard/schema_generator.py b/src/agenteval/leaderboard/schema_generator.py index c1f892b..7222c43 100644 --- a/src/agenteval/leaderboard/schema_generator.py +++ b/src/agenteval/leaderboard/schema_generator.py @@ -5,7 +5,7 @@ import datetime import types from importlib import resources -from typing import Union, get_args, get_origin +from typing import Any, Literal, Union, get_args, get_origin import pyarrow as pa import yaml @@ -40,6 +40,36 @@ def _pa_type_for_annotation(anno) -> pa.DataType: if origin is list: inner = get_args(anno)[0] return pa.list_(_pa_type_for_annotation(inner)) + # Handle dict[str, Any] specifically - these are serialized as JSON strings + if origin is dict: + args = get_args(anno) + if len(args) == 2 and args[0] is str and args[1] is Any: + return pa.string() # dict[str, Any] becomes JSON string + # Other dict types could be handled as proper Arrow maps/structs + # For now, fall through to unsupported + # Handle Literal types - infer type from literal values + if origin is Literal: + literal_values = get_args(anno) + if not literal_values: + return pa.string() # fallback + + # Check that all literal values are the same type + first_type = type(literal_values[0]) + for value in literal_values: + if type(value) != first_type: + raise ValueError(f"Literal {anno} contains mixed types: {[type(v) for v in literal_values]}") + + # Map Python type to Arrow type + if first_type is str: + return pa.string() + elif first_type is int: + return pa.int64() + elif first_type is bool: + return pa.bool_() + elif first_type is float: + return pa.float64() + else: + raise ValueError(f"Unsupported literal type {first_type} in {anno}") # Nested BaseModel if isinstance(anno, type) and issubclass(anno, BaseModel): inner_schema = _schema_from_pydantic(anno) diff --git a/src/agenteval/leaderboard/view.py b/src/agenteval/leaderboard/view.py index a334a2b..f425506 100644 --- a/src/agenteval/leaderboard/view.py +++ b/src/agenteval/leaderboard/view.py @@ -69,6 +69,7 @@ def view( 
"User/organization", "Submission date", "Logs", + "Source", "Openness", "Agent tooling", "LLM base", @@ -190,6 +191,31 @@ def _get_dataframe( } ) + # extract git revision source code URL with SHA + # only show source URL if all eval specs have the same revision + source_url = None + if ev.eval_specs and all(spec.revision == ev.eval_specs[0].revision for spec in ev.eval_specs): + revision = ev.eval_specs[0].revision + + # Only handle git revisions with complete info + if (revision and revision.type == 'git' and + revision.origin and revision.commit): + origin = revision.origin + commit = revision.commit + + # Convert SSH URLs to HTTPS URLs + if origin.startswith('git@'): + # Convert git@github.com:user/repo.git to https://github.com/user/repo + origin = origin.replace('git@', 'https://').replace(':', '/', 1) + + # Remove .git suffix if present + if origin.endswith('.git'): + origin = origin[:-4] + + # Only create URL if it looks like a valid HTTP(S) URL + if origin.startswith(('http://', 'https://')): + source_url = f"{origin}/tree/{commit}" + rows.append( { "id": sub.submit_time, @@ -202,6 +228,7 @@ def _get_dataframe( "base_models": model_names, **flat, "logs_url": sub.logs_url if is_internal else sub.logs_url_public, + "source_url": source_url, } ) @@ -228,6 +255,7 @@ def _pretty_column_name(col: str) -> str: "tool_usage": "Agent tooling", "base_models": "LLM base", "logs_url": "Logs", + "source_url": "Source", "overall/score": "Overall", "overall/cost": "Overall cost", } diff --git a/src/agenteval/models.py b/src/agenteval/models.py index f8bb39f..a759fab 100644 --- a/src/agenteval/models.py +++ b/src/agenteval/models.py @@ -33,7 +33,7 @@ class SubmissionMetadata(BaseModel): class EvalResult(EvalConfig): - eval_specs: list[EvalSpec] | None = Field(default=None, exclude=True) + eval_specs: list[EvalSpec] | None = None results: list[TaskResult] | None = None submission: SubmissionMetadata = Field(default_factory=SubmissionMetadata) diff --git a/src/agenteval/score.py b/src/agenteval/score.py index 65ed5b1..c22aca1 100644 --- a/src/agenteval/score.py +++ b/src/agenteval/score.py @@ -10,12 +10,16 @@ read_eval_log, read_eval_log_samples, ) -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, field_validator, field_serializer from .log import ModelUsageWithName, collect_model_usage, compute_model_cost logger = logging.getLogger(__name__) +# Fields with dict[str, Any] type that need JSON serialization for Arrow compatibility +# Arrow/Parquet cannot handle dict[str, Any] so we serialize to JSON strings +_EVALSPEC_JSON_FIELDS = ['solver_args', 'model_args'] + class Metric(BaseModel): """A metric for a task.""" @@ -42,6 +46,27 @@ def from_eval_log(cls, log: EvalLog) -> "EvalSpec": model_args=log.eval.model_args, revision=log.eval.revision, ) + + @field_validator(*_EVALSPEC_JSON_FIELDS, mode='before') + @classmethod + def deserialize_json_fields(cls, v): + """Deserialize JSON strings back to Python objects. Raises on JSON errors.""" + import json + if not isinstance(v, str): + return v # Already deserialized or None + return json.loads(v) + + @field_serializer(*_EVALSPEC_JSON_FIELDS) + def serialize_json_fields(self, v): + """Serialize Python objects to JSON strings. 
Logs errors and returns fallback.""" + import json + if v is None: + return None + try: + return json.dumps(v, default=str) + except (TypeError, ValueError) as e: + logger.warning(f"Failed to serialize field to JSON: {e}, returning error indicator") + return json.dumps({"__serialization_error__": str(e)}) class TaskResult(BaseModel): From 0fc2d8c230e3c0b9e85e26754879d850decab7eb Mon Sep 17 00:00:00 2001 From: Jonathan Bragg Date: Thu, 17 Jul 2025 16:04:58 -0700 Subject: [PATCH 2/8] Nest evalspec under taskresult to simplify --- src/agenteval/cli.py | 19 +++++---- .../leaderboard/dataset_features.yml | 32 +++++++------- src/agenteval/leaderboard/view.py | 42 ++++++++++--------- src/agenteval/models.py | 3 +- src/agenteval/score.py | 33 +++++++-------- 5 files changed, 66 insertions(+), 63 deletions(-) diff --git a/src/agenteval/cli.py b/src/agenteval/cli.py index 5ca06eb..a8b6c14 100644 --- a/src/agenteval/cli.py +++ b/src/agenteval/cli.py @@ -172,16 +172,21 @@ def score_command( suite_cfg = load_suite_config(config_path) eval_result = EvalResult(suite_config=suite_cfg, split=split) - task_results, eval_specs, had_errors = process_eval_logs(log_dir) - eval_result.eval_specs = eval_specs + task_results, had_errors = process_eval_logs(log_dir) eval_result.results = task_results # Warn if multiple evaluation specs present - if eval_result.eval_specs and len(eval_result.eval_specs) > 1: - click.echo( - f"Warning: Found {len(eval_result.eval_specs)} different eval specs. " - "Logs may come from mixed runs." - ) + if eval_result.results: + unique_specs = set() + for task_result in eval_result.results: + spec_hash = hash(task_result.eval_spec.model_dump_json()) + unique_specs.add(spec_hash) + + if len(unique_specs) > 1: + click.echo( + f"Warning: Found {len(unique_specs)} different eval specs. " + "Logs may come from mixed runs." 
+ ) # Warn about any missing tasks missing_tasks = eval_result.find_missing_tasks() diff --git a/src/agenteval/leaderboard/dataset_features.yml b/src/agenteval/leaderboard/dataset_features.yml index 2ea6690..d894e08 100644 --- a/src/agenteval/leaderboard/dataset_features.yml +++ b/src/agenteval/leaderboard/dataset_features.yml @@ -28,28 +28,28 @@ dtype: float64 - name: split dtype: string -- name: eval_specs +- name: results list: - - name: solver - dtype: string - - name: solver_args - dtype: string - - name: model - dtype: string - - name: model_args + - name: task_name dtype: string - - name: revision + - name: eval_spec struct: - - name: type + - name: solver dtype: string - - name: origin + - name: solver_args dtype: string - - name: commit + - name: model dtype: string -- name: results - list: - - name: task_name - dtype: string + - name: model_args + dtype: string + - name: revision + struct: + - name: type + dtype: string + - name: origin + dtype: string + - name: commit + dtype: string - name: metrics list: - name: name diff --git a/src/agenteval/leaderboard/view.py b/src/agenteval/leaderboard/view.py index f425506..db6e0d3 100644 --- a/src/agenteval/leaderboard/view.py +++ b/src/agenteval/leaderboard/view.py @@ -194,27 +194,29 @@ def _get_dataframe( # extract git revision source code URL with SHA # only show source URL if all eval specs have the same revision source_url = None - if ev.eval_specs and all(spec.revision == ev.eval_specs[0].revision for spec in ev.eval_specs): - revision = ev.eval_specs[0].revision - - # Only handle git revisions with complete info - if (revision and revision.type == 'git' and - revision.origin and revision.commit): - origin = revision.origin - commit = revision.commit - - # Convert SSH URLs to HTTPS URLs - if origin.startswith('git@'): - # Convert git@github.com:user/repo.git to https://github.com/user/repo - origin = origin.replace('git@', 'https://').replace(':', '/', 1) - - # Remove .git suffix if present - if origin.endswith('.git'): - origin = origin[:-4] + if ev.results: + task_revisions = [tr.eval_spec.revision for tr in ev.results if tr.eval_spec.revision] + if task_revisions and all(rev == task_revisions[0] for rev in task_revisions): + revision = task_revisions[0] - # Only create URL if it looks like a valid HTTP(S) URL - if origin.startswith(('http://', 'https://')): - source_url = f"{origin}/tree/{commit}" + # Only handle git revisions with complete info + if (revision and revision.type == 'git' and + revision.origin and revision.commit): + origin = revision.origin + commit = revision.commit + + # Convert SSH URLs to HTTPS URLs + if origin.startswith('git@'): + # Convert git@github.com:user/repo.git to https://github.com/user/repo + origin = origin.replace('git@', 'https://').replace(':', '/', 1) + + # Remove .git suffix if present + if origin.endswith('.git'): + origin = origin[:-4] + + # Only create URL if it looks like a valid HTTP(S) URL + if origin.startswith(('http://', 'https://')): + source_url = f"{origin}/tree/{commit}" rows.append( { diff --git a/src/agenteval/models.py b/src/agenteval/models.py index a759fab..f017ead 100644 --- a/src/agenteval/models.py +++ b/src/agenteval/models.py @@ -6,7 +6,7 @@ from .config import SuiteConfig from .io import atomic_write_file -from .score import EvalSpec, TaskResult +from .score import TaskResult class EvalConfig(BaseModel): @@ -33,7 +33,6 @@ class SubmissionMetadata(BaseModel): class EvalResult(EvalConfig): - 
eval_specs: list[EvalSpec] | None = None results: list[TaskResult] | None = None submission: SubmissionMetadata = Field(default_factory=SubmissionMetadata) diff --git a/src/agenteval/score.py b/src/agenteval/score.py index c22aca1..2d02169 100644 --- a/src/agenteval/score.py +++ b/src/agenteval/score.py @@ -75,6 +75,9 @@ class TaskResult(BaseModel): task_name: str """Name of the task.""" + eval_spec: EvalSpec + """Evaluation specification used for this task.""" + metrics: list[Metric] """List of metrics.""" @@ -123,15 +126,15 @@ def get_normalized_task_name(log: EvalLog) -> str: return log.eval.task.split("/")[-1] -def process_eval_logs(log_dir: str) -> tuple[list[TaskResult], list[EvalSpec], bool]: +def process_eval_logs(log_dir: str) -> tuple[list[TaskResult], bool]: """ - Process evaluation logs from a directory and return task results and eval specs. + Process evaluation logs from a directory and return task results. Args: log_dir: Directory containing evaluation logs Returns: - A tuple containing a list of task results and a list of eval specs + A tuple containing a list of task results and whether there were errors """ # Read evaluation logs logs = {} @@ -146,21 +149,8 @@ def process_eval_logs(log_dir: str) -> tuple[list[TaskResult], list[EvalSpec], b if not logs: raise ValueError("No valid evaluation logs found.") - # Collect eval specs - eval_specs = [] - seen_specs = set() - for log in logs.values(): - next_eval_spec = EvalSpec.from_eval_log(log) - # Use the hash of the serialized spec to check for duplicates - spec_hash = hash(next_eval_spec.model_dump_json()) - if spec_hash not in seen_specs: - seen_specs.add(spec_hash) - eval_specs.append(next_eval_spec) - - if not eval_specs: - raise ValueError("Eval specification is required.") - results = [] + has_eval_specs = False for task_name, log in logs.items(): try: metrics = get_metrics(log) @@ -169,9 +159,13 @@ def process_eval_logs(log_dir: str) -> tuple[list[TaskResult], list[EvalSpec], b model_usages = get_model_usages(log) model_costs = [compute_model_cost(usages) for usages in model_usages] has_model_usages = any(len(usages) > 0 for usages in model_usages) + eval_spec = EvalSpec.from_eval_log(log) + has_eval_specs = True + results.append( TaskResult( task_name=task_name, + eval_spec=eval_spec, metrics=metrics, # Set to None to avoid incorrect pyarrow model usage type inference model_usages=model_usages if has_model_usages else None, @@ -182,4 +176,7 @@ def process_eval_logs(log_dir: str) -> tuple[list[TaskResult], list[EvalSpec], b had_errors = True logger.exception(f"No metrics for {task_name}:") - return results, eval_specs, had_errors + if not has_eval_specs: + raise ValueError("Eval specification is required.") + + return results, had_errors From b74f0790e4ac4739971bbafd5f269289ea138ee3 Mon Sep 17 00:00:00 2001 From: Jonathan Bragg Date: Thu, 17 Jul 2025 16:16:36 -0700 Subject: [PATCH 3/8] Improve error handling in process_eval_logs --- src/agenteval/score.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/src/agenteval/score.py b/src/agenteval/score.py index 2d02169..4830ac0 100644 --- a/src/agenteval/score.py +++ b/src/agenteval/score.py @@ -10,7 +10,7 @@ read_eval_log, read_eval_log_samples, ) -from pydantic import BaseModel, Field, field_validator, field_serializer +from pydantic import BaseModel, Field, field_serializer, field_validator from .log import ModelUsageWithName, collect_model_usage, compute_model_cost @@ -18,7 +18,7 @@ # Fields with dict[str, Any] type that need 
JSON serialization for Arrow compatibility # Arrow/Parquet cannot handle dict[str, Any] so we serialize to JSON strings -_EVALSPEC_JSON_FIELDS = ['solver_args', 'model_args'] +_EVALSPEC_JSON_FIELDS = ["solver_args", "model_args"] class Metric(BaseModel): @@ -46,26 +46,30 @@ def from_eval_log(cls, log: EvalLog) -> "EvalSpec": model_args=log.eval.model_args, revision=log.eval.revision, ) - - @field_validator(*_EVALSPEC_JSON_FIELDS, mode='before') + + @field_validator(*_EVALSPEC_JSON_FIELDS, mode="before") @classmethod def deserialize_json_fields(cls, v): """Deserialize JSON strings back to Python objects. Raises on JSON errors.""" import json + if not isinstance(v, str): return v # Already deserialized or None return json.loads(v) - + @field_serializer(*_EVALSPEC_JSON_FIELDS) def serialize_json_fields(self, v): """Serialize Python objects to JSON strings. Logs errors and returns fallback.""" import json + if v is None: return None try: return json.dumps(v, default=str) except (TypeError, ValueError) as e: - logger.warning(f"Failed to serialize field to JSON: {e}, returning error indicator") + logger.warning( + f"Failed to serialize field to JSON: {e}, returning error indicator" + ) return json.dumps({"__serialization_error__": str(e)}) @@ -150,8 +154,8 @@ def process_eval_logs(log_dir: str) -> tuple[list[TaskResult], bool]: raise ValueError("No valid evaluation logs found.") results = [] - has_eval_specs = False for task_name, log in logs.items(): + eval_spec = EvalSpec.from_eval_log(log) try: metrics = get_metrics(log) if len(metrics) == 0: @@ -159,9 +163,6 @@ def process_eval_logs(log_dir: str) -> tuple[list[TaskResult], bool]: model_usages = get_model_usages(log) model_costs = [compute_model_cost(usages) for usages in model_usages] has_model_usages = any(len(usages) > 0 for usages in model_usages) - eval_spec = EvalSpec.from_eval_log(log) - has_eval_specs = True - results.append( TaskResult( task_name=task_name, @@ -176,7 +177,4 @@ def process_eval_logs(log_dir: str) -> tuple[list[TaskResult], bool]: had_errors = True logger.exception(f"No metrics for {task_name}:") - if not has_eval_specs: - raise ValueError("Eval specification is required.") - return results, had_errors From 094feb6595ca76b8a3263a360c7ca3985f2659d5 Mon Sep 17 00:00:00 2001 From: Jonathan Bragg Date: Thu, 17 Jul 2025 16:34:12 -0700 Subject: [PATCH 4/8] maintain backwards compatibility with old results missing evalspec --- src/agenteval/leaderboard/view.py | 2 +- src/agenteval/score.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/agenteval/leaderboard/view.py b/src/agenteval/leaderboard/view.py index db6e0d3..4bdabd8 100644 --- a/src/agenteval/leaderboard/view.py +++ b/src/agenteval/leaderboard/view.py @@ -195,7 +195,7 @@ def _get_dataframe( # only show source URL if all eval specs have the same revision source_url = None if ev.results: - task_revisions = [tr.eval_spec.revision for tr in ev.results if tr.eval_spec.revision] + task_revisions = [tr.eval_spec.revision for tr in ev.results if tr.eval_spec and tr.eval_spec.revision] if task_revisions and all(rev == task_revisions[0] for rev in task_revisions): revision = task_revisions[0] diff --git a/src/agenteval/score.py b/src/agenteval/score.py index 4830ac0..0a15de3 100644 --- a/src/agenteval/score.py +++ b/src/agenteval/score.py @@ -79,7 +79,7 @@ class TaskResult(BaseModel): task_name: str """Name of the task.""" - eval_spec: EvalSpec + eval_spec: EvalSpec | None = None """Evaluation specification used for this task.""" metrics: list[Metric] 
From 99f9ce2ad474f8eb10c0ccf59e8095055538079c Mon Sep 17 00:00:00 2001 From: Jonathan Bragg Date: Thu, 17 Jul 2025 16:37:15 -0700 Subject: [PATCH 5/8] fix docstring --- src/agenteval/models.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/agenteval/models.py b/src/agenteval/models.py index f017ead..9fb4e89 100644 --- a/src/agenteval/models.py +++ b/src/agenteval/models.py @@ -79,8 +79,7 @@ def dump_json_bytes( **model_dump_kwargs, ) -> bytes: """ - Return the JSON representation of this EvalResult as bytes, - always excluding `eval_specs` and null/default values. + Return the JSON representation of this EvalResult as bytes. """ return self.model_dump_json( indent=indent, From 678b298d46ea90974207cabb7c26722a87f6d015 Mon Sep 17 00:00:00 2001 From: Jonathan Bragg Date: Sun, 20 Jul 2025 14:00:01 -0700 Subject: [PATCH 6/8] sort keys for serialization consistency --- src/agenteval/score.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/agenteval/score.py b/src/agenteval/score.py index 0a15de3..674d6b0 100644 --- a/src/agenteval/score.py +++ b/src/agenteval/score.py @@ -65,7 +65,7 @@ def serialize_json_fields(self, v): if v is None: return None try: - return json.dumps(v, default=str) + return json.dumps(v, default=str, sort_keys=True) except (TypeError, ValueError) as e: logger.warning( f"Failed to serialize field to JSON: {e}, returning error indicator" From f88cb034417365147440b450a71b52ea673d4137 Mon Sep 17 00:00:00 2001 From: Jonathan Bragg Date: Sun, 20 Jul 2025 18:01:31 -0700 Subject: [PATCH 7/8] log task_args and packages for understanding if submissions were run in a compliant way --- pyproject.toml | 2 +- src/agenteval/cli.py | 60 +++++++++++++++---- .../leaderboard/dataset_features.yml | 4 ++ src/agenteval/leaderboard/schema_generator.py | 6 +- src/agenteval/leaderboard/view.py | 38 +++++++----- src/agenteval/score.py | 24 +++++--- 6 files changed, 96 insertions(+), 38 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index cdc5c11..9c2a121 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ readme = "README.md" requires-python = ">=3.10" dependencies = [ "click", - "inspect-ai", + "inspect-ai>=0.3.104", "litellm", "pydantic>=2.0.0", # For leaderboard diff --git a/src/agenteval/cli.py b/src/agenteval/cli.py index a8b6c14..c6b132f 100644 --- a/src/agenteval/cli.py +++ b/src/agenteval/cli.py @@ -26,12 +26,12 @@ "c": "Closed", "api": "API Available", "os": "Open Source", - "ow": "Open Source + Open Weights" + "ow": "Open Source + Open Weights", } TOOL_MAPPING = { "s": "Standard", "css": "Custom with Standard Search", - "c": "Fully Custom" + "c": "Fully Custom", } @@ -177,15 +177,51 @@ def score_command( # Warn if multiple evaluation specs present if eval_result.results: - unique_specs = set() - for task_result in eval_result.results: - spec_hash = hash(task_result.eval_spec.model_dump_json()) - unique_specs.add(spec_hash) + # Check for different solver/model configurations (different agents) + unique_agent_specs = set() + # Check for different code versions (revision/packages) + unique_code_specs = set() - if len(unique_specs) > 1: + for task_result in eval_result.results: + if task_result.eval_spec: + agent_hash = hash( + task_result.eval_spec.model_dump_json( + include={"solver", "solver_args", "model", "model_args"} + ) + ) + unique_agent_specs.add(agent_hash) + + code_hash = hash( + task_result.eval_spec.model_dump_json( + include={"revision", "packages"} + ) + ) + unique_code_specs.add(code_hash) + + if 
len(unique_agent_specs) > 1: + click.echo( + f"Warning: Found {len(unique_agent_specs)} different agent configurations. " + "Use a single solver + model config per log directory to measure a single " + "agent's performance across tasks." + ) + + if len(unique_code_specs) > 1: + click.echo( + f"Warning: Found {len(unique_code_specs)} different code versions " + "(revision/packages). This may indicate mixed evaluation runs from " + "different code states." + ) + + # Warn if user-specified task arguments are present + tasks_with_args = [] + for task_result in eval_result.results: + if task_result.eval_spec and task_result.eval_spec.task_args_passed: + tasks_with_args.append(task_result.task_name) + + if tasks_with_args: click.echo( - f"Warning: Found {len(unique_specs)} different eval specs. " - "Logs may come from mixed runs." + f"Warning: User-specified task arguments found for tasks: {', '.join(tasks_with_args)}. " + "For fair comparison, do not override the task arg defaults." ) # Warn about any missing tasks @@ -239,13 +275,15 @@ def score_command( help="HF repo id for result stats. Defaults to RESULTS_REPO_ID env var.", ) @click.option( - "-o", "--openness", + "-o", + "--openness", type=AliasedChoice(OPENNESS_MAPPING), required=True, help=generate_choice_help(OPENNESS_MAPPING, "Level of openness for the agent."), ) @click.option( - "-t", "--tool-usage", + "-t", + "--tool-usage", type=AliasedChoice(TOOL_MAPPING), required=True, help=generate_choice_help(TOOL_MAPPING, "Tool choices available to the agent."), diff --git a/src/agenteval/leaderboard/dataset_features.yml b/src/agenteval/leaderboard/dataset_features.yml index d894e08..7865d2f 100644 --- a/src/agenteval/leaderboard/dataset_features.yml +++ b/src/agenteval/leaderboard/dataset_features.yml @@ -42,6 +42,8 @@ dtype: string - name: model_args dtype: string + - name: task_args + dtype: string - name: revision struct: - name: type @@ -50,6 +52,8 @@ dtype: string - name: commit dtype: string + - name: packages + dtype: string - name: metrics list: - name: name diff --git a/src/agenteval/leaderboard/schema_generator.py b/src/agenteval/leaderboard/schema_generator.py index 7222c43..299bd38 100644 --- a/src/agenteval/leaderboard/schema_generator.py +++ b/src/agenteval/leaderboard/schema_generator.py @@ -40,11 +40,11 @@ def _pa_type_for_annotation(anno) -> pa.DataType: if origin is list: inner = get_args(anno)[0] return pa.list_(_pa_type_for_annotation(inner)) - # Handle dict[str, Any] specifically - these are serialized as JSON strings + # Handle dict[str, Any] and dict[str, str] specifically - these are serialized as JSON strings if origin is dict: args = get_args(anno) - if len(args) == 2 and args[0] is str and args[1] is Any: - return pa.string() # dict[str, Any] becomes JSON string + if len(args) == 2 and args[0] is str and (args[1] is Any or args[1] is str): + return pa.string() # dict[str, Any] and dict[str, str] become JSON strings # Other dict types could be handled as proper Arrow maps/structs # For now, fall through to unsupported # Handle Literal types - infer type from literal values diff --git a/src/agenteval/leaderboard/view.py b/src/agenteval/leaderboard/view.py index 4bdabd8..de431b3 100644 --- a/src/agenteval/leaderboard/view.py +++ b/src/agenteval/leaderboard/view.py @@ -141,9 +141,9 @@ def _get_dataframe( for usage_list in task_result.model_usages: for model_usage in usage_list: base_models.add(model_usage.model) - + model_names = sorted(list(base_models)) - + sub = ev.submission # only format if submit_time present, 
else leave as None ts = sub.submit_time @@ -195,27 +195,37 @@ def _get_dataframe( # only show source URL if all eval specs have the same revision source_url = None if ev.results: - task_revisions = [tr.eval_spec.revision for tr in ev.results if tr.eval_spec and tr.eval_spec.revision] - if task_revisions and all(rev == task_revisions[0] for rev in task_revisions): + task_revisions = [ + tr.eval_spec.revision + for tr in ev.results + if tr.eval_spec and tr.eval_spec.revision + ] + if task_revisions and all( + rev == task_revisions[0] for rev in task_revisions + ): revision = task_revisions[0] - + # Only handle git revisions with complete info - if (revision and revision.type == 'git' and - revision.origin and revision.commit): + if ( + revision + and revision.type == "git" + and revision.origin + and revision.commit + ): origin = revision.origin commit = revision.commit - + # Convert SSH URLs to HTTPS URLs - if origin.startswith('git@'): + if origin.startswith("git@"): # Convert git@github.com:user/repo.git to https://github.com/user/repo - origin = origin.replace('git@', 'https://').replace(':', '/', 1) - + origin = origin.replace("git@", "https://").replace(":", "/", 1) + # Remove .git suffix if present - if origin.endswith('.git'): + if origin.endswith(".git"): origin = origin[:-4] - + # Only create URL if it looks like a valid HTTP(S) URL - if origin.startswith(('http://', 'https://')): + if origin.startswith(("http://", "https://")): source_url = f"{origin}/tree/{commit}" rows.append( diff --git a/src/agenteval/score.py b/src/agenteval/score.py index 674d6b0..5f65391 100644 --- a/src/agenteval/score.py +++ b/src/agenteval/score.py @@ -16,9 +16,9 @@ logger = logging.getLogger(__name__) -# Fields with dict[str, Any] type that need JSON serialization for Arrow compatibility -# Arrow/Parquet cannot handle dict[str, Any] so we serialize to JSON strings -_EVALSPEC_JSON_FIELDS = ["solver_args", "model_args"] +# Fields with dict type that need JSON serialization for Arrow/Parquet and HuggingFace datasets compatibility +# These systems cannot handle dict types so we serialize to JSON strings +_EVALSPEC_JSON_FIELDS = ["solver_args", "model_args", "task_args", "packages"] class Metric(BaseModel): @@ -34,8 +34,11 @@ class EvalSpec(BaseModel): solver: str | None = None solver_args: dict[str, Any] | None = None model: str - model_args: dict[str, Any] = Field(default_factory=dict) + model_args: dict[str, Any] | None = None + task_args: dict[str, Any] | None = None + task_args_passed: dict[str, Any] | None = Field(default=None, exclude=True) revision: EvalRevision | None = None + packages: dict[str, str] | None = None @classmethod def from_eval_log(cls, log: EvalLog) -> "EvalSpec": @@ -44,7 +47,10 @@ def from_eval_log(cls, log: EvalLog) -> "EvalSpec": solver_args=log.eval.solver_args, model=log.eval.model, model_args=log.eval.model_args, + task_args=log.eval.task_args, + task_args_passed=log.eval.task_args_passed, revision=log.eval.revision, + packages=log.eval.packages, ) @field_validator(*_EVALSPEC_JSON_FIELDS, mode="before") @@ -77,19 +83,19 @@ class TaskResult(BaseModel): """Results for a single task.""" task_name: str - """Name of the task.""" + """Name of the task. Derived from Inspect `EvalLog.eval.task`.""" eval_spec: EvalSpec | None = None - """Evaluation specification used for this task.""" + """Evaluation specification used for this task. Derived from Inspect `EvalLog.eval`.""" metrics: list[Metric] - """List of metrics.""" + """List of metrics. 
Derived from Inspect `EvalLog.results.scores`.""" model_usages: list[list[ModelUsageWithName]] | None = None - """List of model usage lists per sample.""" + """List of model usage lists per sample. Derived from Inspect `EvalLog.samples`.""" model_costs: list[float | None] | None = None - """List of model costs per sample.""" + """List of model costs per sample. Computed from `model_usages`.""" def get_metrics(log: EvalLog) -> list[Metric]: From 0dfbf41271f1d570044d5f8c21bcd0e93d5a7e2f Mon Sep 17 00:00:00 2001 From: Regan Huff <35933912+regan-huff@users.noreply.github.com> Date: Thu, 24 Jul 2025 12:07:24 -0700 Subject: [PATCH 8/8] Update pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 9c2a121..c6feecc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "agent-eval" -version = "0.1.16" +version = "0.1.18" description = "Agent evaluation toolkit" readme = "README.md" requires-python = ">=3.10"
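
A self-contained sketch, not part of the patch series, of the JSON-string round-trip used for _EVALSPEC_JSON_FIELDS: a hypothetical SpecLike model stands in for EvalSpec, and only pydantic v2 (already required via pydantic>=2.0.0) is assumed.

import json
from typing import Any

from pydantic import BaseModel, field_serializer, field_validator

# Hypothetical stand-in for EvalSpec (the real model lives in src/agenteval/score.py),
# showing only the two original JSON-string fields.
_JSON_FIELDS = ["solver_args", "model_args"]


class SpecLike(BaseModel):
    solver_args: dict[str, Any] | None = None
    model_args: dict[str, Any] | None = None

    @field_validator(*_JSON_FIELDS, mode="before")
    @classmethod
    def _loads(cls, v):
        # Accept either a dict or a JSON string read back from Parquet/HF datasets.
        return json.loads(v) if isinstance(v, str) else v

    @field_serializer(*_JSON_FIELDS)
    def _dumps(self, v):
        # Dump dicts as deterministic JSON strings (sort_keys as in PATCH 6/8).
        return None if v is None else json.dumps(v, default=str, sort_keys=True)


spec = SpecLike(solver_args={"b": 2, "a": 1})
dumped = spec.model_dump()      # {'solver_args': '{"a": 1, "b": 2}', 'model_args': None}
restored = SpecLike(**dumped)   # validator turns the JSON string back into a dict
assert restored.solver_args == {"a": 1, "b": 2}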