Extracts generic changes identified when onboarding code review category (#655)

haoranpb · Copilot · web-flow · commit e00d063f839f · 2026-05-29T16:07:13.000+02:00
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
diff --git a/.github/workflows/claude-evaluation.yml b/.github/workflows/claude-evaluation.yml
@@ -60,7 +60,7 @@ jobs:
       category: ${{ inputs.category }}
 
   evaluate-with-claude-code:
-    runs-on: [ GitHub-BCBench ]
+    runs-on: ${{ needs.get-entries.outputs.runner }}
     needs: get-entries
     outputs:
       results-dir: ${{ env.EVALUATION_RESULTS_DIR }}
@@ -91,6 +91,7 @@ jobs:
           azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
           azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
           github-token: ${{ secrets.GITHUB_TOKEN }}
+          skip-container: ${{ needs.get-entries.outputs.requires-container != 'true' }}
 
       - name: Setup Python with UV
         uses: ./.github/actions/setup-python-uv
@@ -110,6 +111,7 @@ jobs:
 
       - name: Run Claude Code for entry ${{ matrix.entry }}
         timeout-minutes: 120
+        shell: pwsh
         env:
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
         run: |
diff --git a/.github/workflows/copilot-evaluation.yml b/.github/workflows/copilot-evaluation.yml
@@ -67,7 +67,7 @@ jobs:
       category: ${{ inputs.category }}
 
   evaluate-with-copilot-cli:
-    runs-on: [ GitHub-BCBench ]
+    runs-on: ${{ needs.get-entries.outputs.runner }}
     needs: get-entries
     outputs:
       results-dir: ${{ env.EVALUATION_RESULTS_DIR }}
@@ -98,6 +98,7 @@ jobs:
           azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
           azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
           github-token: ${{ secrets.GITHUB_TOKEN }}
+          skip-container: ${{ needs.get-entries.outputs.requires-container != 'true' }}
 
       - name: Setup Python with UV
         uses: ./.github/actions/setup-python-uv
@@ -123,6 +124,7 @@ jobs:
 
       - name: Run GitHub Copilot CLI for entry ${{ matrix.entry }}
         timeout-minutes: 120
+        shell: pwsh
         env:
           COPILOT_GITHUB_TOKEN: ${{ steps.select-pat.outputs.pat_index == '0' &&
             secrets.COPILOT_PAT || (steps.select-pat.outputs.pat_index == '1' &&
diff --git a/.github/workflows/get-entries.yml b/.github/workflows/get-entries.yml
@@ -24,12 +24,20 @@ on:
       entries:
         description: JSON array of dataset entries
         value: ${{ jobs.get-entries.outputs.entries }}
+      runner:
+        description: GitHub Actions runner label to evaluate this category on
+        value: ${{ jobs.get-entries.outputs.runner }}
+      requires-container:
+        description: Whether this category needs a BC container ("true"/"false")
+        value: ${{ jobs.get-entries.outputs.requires-container }}
 
 jobs:
   get-entries:
     runs-on: ubuntu-latest
     outputs:
       entries: ${{ steps.get-entries.outputs.entries }}
+      runner: ${{ steps.runtime-config.outputs.runner }}
+      requires-container: ${{ steps.runtime-config.outputs.requires-container }}
     steps:
       - name: Checkout repository
         uses: actions/checkout@v5
@@ -39,6 +47,10 @@ jobs:
       - name: Setup Python with UV
         uses: ./.github/actions/setup-python-uv
 
+      - name: Get runtime config for matrix
+        id: runtime-config
+        run: uv run bcbench category runtime-config --category ${{ inputs.category }}
+
       - name: Get entries for matrix
         id: get-entries
         run: |
diff --git a/src/bcbench/agent/copilot/agent.py b/src/bcbench/agent/copilot/agent.py
@@ -49,7 +49,9 @@ def run_copilot_agent(
     logger.info(f"Executing Copilot CLI in directory: {repo_path}")
     logger.debug(f"Using prompt:\n{prompt}")
 
-    copilot_cmd = shutil.which("copilot.cmd") or shutil.which("copilot")
+    # Prefer copilot.exe over copilot.bat/copilot.cmd shims on Windows: the .bat shim invokes PowerShell,
+    # which re-parses arguments and corrupts prompts containing double quotes (e.g. JSON examples).
+    copilot_cmd = shutil.which("copilot.exe") or shutil.which("copilot.cmd") or shutil.which("copilot")
     if not copilot_cmd:
         raise AgentError("Copilot CLI not found in PATH. Please ensure it is installed and available.")
 
diff --git a/src/bcbench/commands/category.py b/src/bcbench/commands/category.py
@@ -1,11 +1,9 @@
-import os
 import sys
-from pathlib import Path
 
 import typer
-from typing_extensions import Annotated
 
 from bcbench.cli_options import EvaluationCategoryOption
+from bcbench.github_actions import write_step_outputs
 from bcbench.types import EvaluationCategory
 
 category_app = typer.Typer(help="Category-specific configuration helpers")
@@ -19,32 +17,26 @@ def list_categories() -> None:
 
 
 @category_app.command("bceval-config")
-def bceval_config(
-    category: EvaluationCategoryOption,
-    github_output: Annotated[
-        Path | None,
-        typer.Option(envvar="GITHUB_OUTPUT", help="Append outputs to this file (typically $GITHUB_OUTPUT)"),
-    ] = None,
-) -> None:
+def bceval_config(category: EvaluationCategoryOption) -> None:
     """
-    Print the bc-eval evaluator list and core score for a category as key=value lines.
+    Emit the bc-eval evaluator list and core score for a category as step outputs.
 
-    When run inside a GitHub Actions step with $GITHUB_OUTPUT set, the lines are
-    appended to that file so they become step outputs. Otherwise they're written
-    to stdout.
+    The lines are appended to $GITHUB_OUTPUT so they become GitHub Actions step outputs. Outside of Actions ($GITHUB_OUTPUT unset) nothing is written and a warning is logged.
     """
-    lines: list[str] = [
-        f"evaluators={','.join(category.evaluators)}",
-        f"core_score={category.core_score}",
-    ]
-    payload: str = "\n".join(lines) + "\n"
-
-    if github_output:
-        with open(github_output, "a", encoding="utf-8") as file:
-            file.write(payload)
-    else:
-        sys.stdout.write(payload)
-
-    # Always echo to stderr so workflow logs show what was emitted.
-    if os.getenv("GITHUB_ACTIONS"):
-        sys.stderr.write(payload)
+    write_step_outputs(
+        {
+            "evaluators": ",".join(category.evaluators),
+            "core_score": category.core_score,
+        }
+    )
+
+
+@category_app.command("runtime-config")
+def runtime_config(category: EvaluationCategoryOption) -> None:
+    """Emit the GitHub Actions runner label and container requirement for a category."""
+    write_step_outputs(
+        {
+            "runner": category.runner,
+            "requires-container": str(category.requires_container).lower(),
+        }
+    )
diff --git a/src/bcbench/commands/dataset.py b/src/bcbench/commands/dataset.py
@@ -6,10 +6,9 @@
 from typing_extensions import Annotated
 
 from bcbench.cli_options import EvaluationCategoryOption
-from bcbench.config import get_config
 from bcbench.dataset import BaseDatasetEntry
 from bcbench.dataset.dataset_entry import _BugFixTestGenBase
-from bcbench.exceptions import ConfigurationError
+from bcbench.github_actions import write_step_outputs
 from bcbench.logger import get_logger
 from bcbench.types import EvaluationCategory
 
@@ -60,7 +59,7 @@ def list_entries(
         print(f"  - {entry_id}")
 
     if github_output:
-        _write_github_output(github_output, json.dumps(entry_ids))
+        write_step_outputs({github_output: json.dumps(entry_ids)})
 
 
 @dataset_app.command("view")
@@ -150,12 +149,3 @@ def _modified_instance_ids_from_diff(diff_output: str) -> list[str]:
             instance_ids.append(entry_data["instance_id"])
 
     return instance_ids
-
-
-def _write_github_output(key: str, value: str) -> None:
-    """Write a value to GitHub Actions output."""
-    config = get_config()
-    if not config.env.github_output:
-        raise ConfigurationError("GITHUB_OUTPUT environment variable not set. This feature is only available when running in GitHub Actions.")
-    with open(config.env.github_output, "a", encoding="utf-8") as f:
-        f.write(f"{key}={value}\n")
diff --git a/src/bcbench/evaluate/bugfix.py b/src/bcbench/evaluate/bugfix.py
@@ -4,7 +4,8 @@
 from bcbench.dataset import BugFixEntry
 from bcbench.evaluate.base import EvaluationPipeline
 from bcbench.exceptions import BuildError, TestExecutionError
-from bcbench.logger import get_logger, github_log_group
+from bcbench.github_actions import github_log_group
+from bcbench.logger import get_logger
 from bcbench.operations import (
     apply_patch,
     build_and_publish_projects,
diff --git a/src/bcbench/evaluate/testgeneration.py b/src/bcbench/evaluate/testgeneration.py
@@ -8,7 +8,8 @@
 from bcbench.dataset import TestEntry, TestGenEntry
 from bcbench.evaluate.base import EvaluationPipeline
 from bcbench.exceptions import BuildError, NoTestsExtractedError, TestExecutionError
-from bcbench.logger import get_logger, github_log_group
+from bcbench.github_actions import github_log_group
+from bcbench.logger import get_logger
 from bcbench.operations import (
     apply_patch,
     build_and_publish_projects,
diff --git a/src/bcbench/github_actions.py b/src/bcbench/github_actions.py
@@ -0,0 +1,51 @@
+"""Helpers for interacting with GitHub Actions.
+
+These wrap GitHub Actions workflow features (step outputs, log groups) and emit nothing Actions-specific when not running inside Actions (step outputs are skipped with a logged warning).
+"""
+
+from __future__ import annotations
+
+import os
+from collections.abc import Iterator
+from contextlib import contextmanager
+
+from bcbench.config import get_config
+from bcbench.logger import get_logger
+
+__all__ = ["github_log_group", "write_step_outputs"]
+
+logger = get_logger(__name__)
+
+
+def write_step_outputs(outputs: dict[str, str]) -> None:
+    """Append ``key=value`` step outputs to the GitHub Actions output file.
+
+    The values become outputs of the current workflow step, available to downstream steps via ``steps.<id>.outputs.<key>``.
+
+    Args:
+        outputs: Mapping of output names to their string values.
+
+    Note:
+        When not running inside GitHub Actions (``$GITHUB_OUTPUT`` is unset), nothing is written and a warning is logged.
+    """
+    github_output: str | None = os.getenv("GITHUB_OUTPUT")
+    if not github_output:
+        logger.warning("Not running in GitHub Actions; skipping step outputs: %s", ", ".join(outputs))
+        return
+
+    with open(github_output, "a", encoding="utf-8") as file:
+        file.writelines(f"{key}={value}\n" for key, value in outputs.items())
+
+
+@contextmanager
+def github_log_group(title: str) -> Iterator[None]:
+    in_actions: bool = get_config().env.github_actions
+
+    if in_actions:
+        print(f"::group::{title}", flush=True)  # noqa: T201
+
+    try:
+        yield
+    finally:
+        if in_actions:
+            print("::endgroup::", flush=True)  # noqa: T201
diff --git a/src/bcbench/logger.py b/src/bcbench/logger.py
@@ -3,13 +3,11 @@
 import logging
 import re
 import sys
-from collections.abc import Iterator
-from contextlib import contextmanager
 from typing import ClassVar
 
 from bcbench.config import get_config
 
-__all__ = ["get_logger", "github_log_group", "setup_logger"]
+__all__ = ["get_logger", "setup_logger"]
 
 
 class SensitiveDataFilter(logging.Filter):
@@ -217,17 +215,3 @@ def get_logger(name: str) -> logging.Logger:
         name = f"bcbench.{name}"
 
     return logging.getLogger(name)
-
-
-@contextmanager
-def github_log_group(title: str) -> Iterator[None]:
-    config = get_config()
-
-    if config.env.github_actions:
-        print(f"::group::{title}", flush=True)  # noqa: T201
-
-    try:
-        yield
-    finally:
-        if config.env.github_actions:
-            print("::endgroup::", flush=True)  # noqa: T201
diff --git a/src/bcbench/types.py b/src/bcbench/types.py
@@ -214,6 +214,27 @@ def core_score(self) -> str:
 
         raise ValueError(f"Unknown evaluation category: {self}")
 
+    @property
+    def requires_container(self) -> bool:
+        """Whether evaluating this category builds/runs AL code and therefore needs a BC container."""
+        match self:
+            case EvaluationCategory.BUG_FIX | EvaluationCategory.TEST_GENERATION:
+                return True
+
+        raise ValueError(f"Unknown evaluation category: {self}")
+
+    @property
+    def runner(self) -> str:
+        """GitHub Actions runner label for evaluating this category.
+
+        Only categories that require building BaseApp need a self-hosted runner.
+        """
+        match self:
+            case EvaluationCategory.BUG_FIX | EvaluationCategory.TEST_GENERATION:
+                return "GitHub-BCBench"
+
+        raise ValueError(f"Unknown evaluation category: {self}")
+
 
 @dataclass(frozen=True)
 class ContainerConfig:
diff --git a/tests/test_category_command.py b/tests/test_category_command.py
@@ -6,15 +6,15 @@
 runner = CliRunner()
 
 
-def test_bceval_config_prints_evaluators_and_core_score_to_stdout_when_no_github_output(monkeypatch):
+def test_bceval_config_writes_nothing_to_stdout_when_no_github_output(monkeypatch):
     monkeypatch.delenv("GITHUB_OUTPUT", raising=False)
     monkeypatch.delenv("GITHUB_ACTIONS", raising=False)
 
     result = runner.invoke(app, ["category", "bceval-config", "--category", "bug-fix"])
 
     assert result.exit_code == 0
-    assert "evaluators=resolution_rate,build_rate" in result.stdout
-    assert "core_score=ResolutionRate" in result.stdout
+    assert "evaluators=" not in result.stdout
+    assert "core_score=" not in result.stdout
 
 
 def test_bceval_config_appends_to_github_output_file_when_set(tmp_path, monkeypatch):
@@ -32,15 +32,19 @@ def test_bceval_config_appends_to_github_output_file_when_set(tmp_path, monkeypa
     assert "core_score=ResolutionRate" in contents
 
 
-def test_bceval_config_supports_every_category(monkeypatch):
-    monkeypatch.delenv("GITHUB_OUTPUT", raising=False)
+def test_bceval_config_supports_every_category(tmp_path, monkeypatch):
     monkeypatch.delenv("GITHUB_ACTIONS", raising=False)
 
     for category in EvaluationCategory:
+        output_file = tmp_path / f"gh_output_{category.value}"
+        monkeypatch.setenv("GITHUB_OUTPUT", str(output_file))
+
         result = runner.invoke(app, ["category", "bceval-config", "--category", category.value])
         assert result.exit_code == 0, f"{category}: {result.stdout}"
-        assert f"evaluators={','.join(category.evaluators)}" in result.stdout
-        assert f"core_score={category.core_score}" in result.stdout
+
+        contents = output_file.read_text(encoding="utf-8")
+        assert f"evaluators={','.join(category.evaluators)}" in contents
+        assert f"core_score={category.core_score}" in contents
 
 
 def test_list_prints_every_category_one_per_line():
@@ -49,3 +53,18 @@ def test_list_prints_every_category_one_per_line():
     assert result.exit_code == 0
     lines = [line for line in result.stdout.splitlines() if line]
     assert lines == [c.value for c in EvaluationCategory]
+
+
+def test_runtime_config_supports_every_category(tmp_path, monkeypatch):
+    monkeypatch.delenv("GITHUB_ACTIONS", raising=False)
+
+    for category in EvaluationCategory:
+        output_file = tmp_path / f"gh_output_{category.value}"
+        monkeypatch.setenv("GITHUB_OUTPUT", str(output_file))
+
+        result = runner.invoke(app, ["category", "runtime-config", "--category", category.value])
+        assert result.exit_code == 0, f"{category}: {result.stdout}"
+
+        contents = output_file.read_text(encoding="utf-8")
+        assert f"runner={category.runner}" in contents
+        assert f"requires-container={str(category.requires_container).lower()}" in contents