Competition evaluation #103

Merged: 18 commits, May 27, 2024
116 changes: 43 additions & 73 deletions polaris/benchmark/_base.py
@@ -5,7 +5,6 @@

import fsspec
import numpy as np
import pandas as pd
from datamol.utils import fs
from pydantic import (
Field,
@@ -19,9 +18,9 @@

from polaris._artifact import BaseArtifactModel
from polaris.dataset import Dataset, Subset
from polaris.evaluate import BenchmarkResults, Metric, ResultsType
from polaris.evaluate import BenchmarkResults, Metric
from polaris.evaluate.utils import evaluate_benchmark
from polaris.hub.settings import PolarisHubSettings
from polaris.utils.context import tmp_attribute_change
from polaris.utils.dict2html import dict2html
from polaris.utils.errors import InvalidBenchmarkError, PolarisChecksumError
from polaris.utils.misc import listit
@@ -351,6 +350,37 @@ def task_type(self) -> TaskType:
v = TaskType.MULTI_TASK if len(self.target_cols) > 1 else TaskType.SINGLE_TASK
return v.value

def _get_subset(self, indices, hide_targets=True, featurization_fn=None):
"""Returns a [`Subset`][polaris.dataset.Subset] using the given indices. Used
internally to construct the train and test sets."""
return Subset(
dataset=self.dataset,
indices=indices,
input_cols=self.input_cols,
target_cols=self.target_cols,
hide_targets=hide_targets,
featurization_fn=featurization_fn,
)

def _get_test_set(
self, hide_targets=True, featurization_fn: Optional[Callable] = None
) -> Union["Subset", dict[str, Subset]]:
"""Construct the test set(s), given the split in the benchmark specification. Used
internally to construct the test set for client use and evaluation.
"""
def make_test_subset(vals):
return self._get_subset(vals,
hide_targets=hide_targets,
featurization_fn=featurization_fn)

test_split = self.split[1]
if isinstance(test_split, dict):
test = {k: make_test_subset(v) for k, v in test_split.items()}
else:
test = make_test_subset(test_split)

return test

def get_train_test_split(
Contributor:
I think the cleanup you've done here is quite nice. In combination with pulling out the evaluation logic into a separate utility, it makes the base logic here more easily understandable.

self, featurization_fn: Optional[Callable] = None
) -> tuple[Subset, Union["Subset", dict[str, Subset]]]:
@@ -366,25 +396,12 @@ def get_train_test_split(

Returns:
A tuple with the train `Subset` and test `Subset` objects.
If there are multiple test sets, these are returned in a dictionary and each test set has
an associated name. The targets of the test set can not be accessed.
If there are multiple test sets, these are returned in a dictionary and each test set has
an associated name. The targets of the test set can not be accessed.
"""

def _get_subset(indices, hide_targets):
return Subset(
dataset=self.dataset,
indices=indices,
input_cols=self.input_cols,
target_cols=self.target_cols,
hide_targets=hide_targets,
featurization_fn=featurization_fn,
)

train = _get_subset(self.split[0], hide_targets=False)
if isinstance(self.split[1], dict):
test = {k: _get_subset(v, hide_targets=True) for k, v in self.split[1].items()}
else:
test = _get_subset(self.split[1], hide_targets=True)
train = self._get_subset(self.split[0], hide_targets=False, featurization_fn=featurization_fn)
test = self._get_test_set(hide_targets=True, featurization_fn=featurization_fn)

return train, test

@@ -416,60 +433,13 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults:
# Instead of having the user pass the ground truth, we extract it from the benchmark spec ourselves.
# This simplifies the API, but also was added to make accidental access to the test set targets less likely.
# See also the `hide_targets` parameter in the `Subset` class.
test = self.get_train_test_split()[1]

if not isinstance(test, dict):
test = {"test": test}

y_true = {}
for k, test_subset in test.items():
with tmp_attribute_change(test_subset, "_hide_targets", False):
y_true[k] = test_subset.targets

if not isinstance(y_pred, dict) or all(k in self.target_cols for k in y_pred):
y_pred = {"test": y_pred}

if any(k not in y_pred for k in test.keys()):
raise KeyError(
f"Missing keys for at least one of the test sets. Expecting: {sorted(test.keys())}"
)
test = self._get_test_set(hide_targets=False)
y_true = test.targets
scores = evaluate_benchmark(y_pred, y_true, self.target_cols, self.metrics)

# Results are saved in a tabular format. For more info, see the BenchmarkResults docs.
scores: ResultsType = pd.DataFrame(columns=BenchmarkResults.RESULTS_COLUMNS)

# For every test set...
for test_label, y_true_subset in y_true.items():
# For every metric...
for metric in self.metrics:
if metric.is_multitask:
# Multi-task but with a metric across targets
score = metric(y_true=y_true_subset, y_pred=y_pred[test_label])
scores.loc[len(scores)] = (test_label, "aggregated", metric, score)
continue

if not isinstance(y_true_subset, dict):
# Single task
score = metric(y_true=y_true_subset, y_pred=y_pred[test_label])
scores.loc[len(scores)] = (
test_label,
self.target_cols[0],
metric,
score,
)
continue

# Otherwise, for every target...
for target_label, y_true_target in y_true_subset.items():
# Single-task metrics for a multi-task benchmark
# In such a setting, there can be NaN values, which we thus have to filter out.
mask = ~np.isnan(y_true_target)
score = metric(
y_true=y_true_target[mask],
y_pred=y_pred[test_label][target_label][mask],
)
scores.loc[len(scores)] = (test_label, target_label, metric, score)

return BenchmarkResults(results=scores, benchmark_name=self.name, benchmark_owner=self.owner)
return BenchmarkResults(results=scores,
benchmark_name=self.name,
benchmark_owner=self.owner)

def upload_to_hub(
self,
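To make the refactor above concrete, here is a minimal, hedged sketch of the client-side flow after this change: `get_train_test_split()` still returns a train `Subset` and one or more test `Subset`s, while `evaluate()` now pulls the ground truth via `_get_test_set(hide_targets=False)` and delegates scoring to `polaris.evaluate.utils.evaluate_benchmark`. The `run_benchmark` helper and the model's `fit`/`predict` interface are illustrative assumptions, not part of this PR.

```python
import numpy as np

from polaris.benchmark import BenchmarkSpecification
from polaris.evaluate import BenchmarkResults


def run_benchmark(benchmark: BenchmarkSpecification, model) -> BenchmarkResults:
    """Illustrative helper: `model` is any object exposing fit/predict over Subsets."""
    train, test = benchmark.get_train_test_split()

    # Train on the train Subset (targets are accessible here, hide_targets=False).
    model.fit(train)

    # Predict on the test Subset(s); targets stay hidden on the client side.
    y_pred = np.asarray(model.predict(test))

    # evaluate() extracts y_true internally and calls evaluate_benchmark() for scoring.
    return benchmark.evaluate(y_pred)
```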
55 changes: 48 additions & 7 deletions polaris/competition/_competition.py
@@ -1,12 +1,14 @@
from datetime import datetime
import os
import numpy as np
from typing import Optional, Union

from pydantic import field_serializer
from polaris.benchmark import BenchmarkSpecification
from polaris.evaluate import BenchmarkResults
from polaris.evaluate.utils import evaluate_benchmark
from polaris.hub.settings import PolarisHubSettings
from polaris.utils.types import AccessType, HubOwner, TimeoutTypes, ZarrConflictResolution

from polaris.utils.types import AccessType, HubOwner, PredictionsType, TimeoutTypes, ZarrConflictResolution

class CompetitionSpecification(BenchmarkSpecification):
"""This class extends the [`BenchmarkSpecification`][polaris.benchmark.BenchmarkSpecification] to
@@ -15,6 +17,11 @@ class CompetitionSpecification(BenchmarkSpecification):
Much of the underlying data model and logic is shared across Benchmarks and Competitions, and
anything within this class serves as a point of differentiation between the two.

facilitate interactions with Polaris Competitions.

Much of the underlying data model and logic is shared across Benchmarks and Competitions, and
anything within this class serves as a point of differentiation between the two.

Currently, these entities will primarily differ at how user predictions are evaluated.
"""

@@ -23,13 +30,47 @@
scheduled_end_time: datetime | None = None
actual_end_time: datetime | None = None

def evaluate(self, predictions):
"""Wrapper method which ultimately triggers an evaluation service to assess and score user predictions
for a given competition
def evaluate(
self,
y_pred: PredictionsType,
env_file: Optional[Union[str, os.PathLike]] = None,
settings: Optional[PolarisHubSettings] = None,
cache_auth_token: bool = True,
**kwargs: dict
):
"""Light convenience wrapper around
[`PolarisHubClient.evaluate_competition`][polaris.hub.client.PolarisHubClient.evaluate_competition].
"""
from polaris.hub.client import PolarisHubClient

# TODO validate that the number of predictions supplied matches the number of test set rows
pass
with PolarisHubClient(
env_file=env_file,
settings=settings,
cache_auth_token=cache_auth_token,
**kwargs,
) as client:
client.evaluate_competition(self, y_pred=y_pred)

def _hub_evaluate(self, y_pred: PredictionsType, y_true: PredictionsType):
"""Executes the evaluation logic for a competition, given a set of predictions.
Called only by the Polaris Hub to evaluate competitions after the labels are
downloaded from R2 on the hub. Evaluation logic is the same as for regular benchmarks.

Args:
y_pred: The predictions for the test set, as NumPy arrays.
If there are multiple targets, the predictions should be wrapped in a
dictionary with the target labels as keys.

y_true: The true target values for the test set, as NumPy arrays. If there are multiple
targets, the target columns should be wrapped in a dictionary with the target labels as keys.

Returns:
A `BenchmarkResults` object containing the evaluation results.
"""
scores = evaluate_benchmark(y_pred, y_true, self.target_cols, self.metrics)
return BenchmarkResults(results=scores,
benchmark_name=self.name,
benchmark_owner=self.owner)

def upload_to_hub(
self,
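As a hedged illustration of the split introduced here (the helper name, the import path, and the prediction values are assumptions, not from this PR): the public `evaluate()` defers to the hub, while `_hub_evaluate()` is what the hub itself runs once it has the labels.

```python
import numpy as np

from polaris.competition import CompetitionSpecification  # import path assumed


def submit_to_competition(competition: CompetitionSpecification) -> None:
    """Client-side path: evaluate() opens a PolarisHubClient and forwards the
    predictions to client.evaluate_competition(); y_true never leaves the hub."""
    y_pred = {"TARGET_A": np.zeros(10), "TARGET_B": np.zeros(10)}  # placeholder predictions
    competition.evaluate(y_pred)

    # Hub-side path (internal): after downloading the labels from R2, the hub
    # scores the predictions with the same logic as regular benchmarks, roughly:
    #   results = competition._hub_evaluate(y_pred, y_true)
```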
62 changes: 62 additions & 0 deletions polaris/evaluate/utils.py
@@ -0,0 +1,62 @@
import numpy as np
import pandas as pd
from typing import Union

from polaris.evaluate import BenchmarkResults, ResultsType
from polaris.utils.types import PredictionsType
from polaris.evaluate import Metric

def is_multi_task_single_test_set(vals: PredictionsType, target_cols: list[str]):
"""Check if the given values are for a multiple-task benchmark with a single
test set. This is inferred by comparing the target names with the keys of the
given data. If all keys in the given data match the target column names, we
assume they are target names (as opposed to test set names for a single-task,
multiple test set benchmark)."""
Comment on lines +12 to +14
Contributor:
This makes sense, but I can see this becoming a little messy in the future. Maybe this is something we can think about making more robust after we finish competitions.

Contributor Author:
Yeah I totally agree.. this is at least isolated and explained here now, but we may want to re-think the data format we accept for predictions. There's a brief discussion about it here. I think it would be reasonable to always expect something like:

{"test_set_name": {"target_col_name": [1, 2, 3, 4, 5]}"

I get the desire to be as succinct as possible too though.. you could imagine it would be annoying to have to submit that ☝️ every time you just have a list of numbers to submit.

return not isinstance(vals, dict) or set(vals.keys()) == set(target_cols)

def evaluate_benchmark(y_pred: PredictionsType,
Contributor Author:
This depends on fewer objects now, with the evaluation logic not requiring any unnecessary or wrapper-only data types. Thanks for being up for a bit of re-jiggering things around for the sake of simplicity.

y_true: PredictionsType,
target_cols: list[str],
metrics: Union[str, Metric, list[Union[str, Metric]]]):
if is_multi_task_single_test_set(y_true, target_cols):
y_true = {"test": y_true}

if is_multi_task_single_test_set(y_pred, target_cols):
y_pred = {"test": y_pred}

if set(y_true.keys()) != set(y_pred.keys()):
raise KeyError(
f"Missing keys for at least one of the test sets. Expecting: {sorted(y_true.keys())}"
)

# Results are saved in a tabular format. For more info, see the BenchmarkResults docs.
scores: ResultsType = pd.DataFrame(columns=BenchmarkResults.RESULTS_COLUMNS)

# For every test set...
for test_label, y_true_subset in y_true.items():
# For every metric...
for metric in metrics:
if metric.is_multitask:
# Multi-task but with a metric across targets
score = metric(y_true=y_true_subset, y_pred=y_pred[test_label])
scores.loc[len(scores)] = (test_label, "aggregated", metric, score)
continue

if not isinstance(y_true_subset, dict):
# Single task
score = metric(y_true=y_true_subset, y_pred=y_pred[test_label])
scores.loc[len(scores)] = (test_label, target_cols[0], metric, score)
continue

# Otherwise, for every target...
for target_label, y_true_target in y_true_subset.items():
# Single-task metrics for a multi-task benchmark
# In such a setting, there can be NaN values, which we thus have to filter out.
mask = ~np.isnan(y_true_target)
score = metric(
y_true=y_true_target[mask],
y_pred=y_pred[test_label][target_label][mask],
)
scores.loc[len(scores)] = (test_label, target_label, metric, score)

return scores
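Below is a hedged usage sketch of the new `evaluate_benchmark` utility on synthetic data; the same function now backs both `BenchmarkSpecification.evaluate()` and `CompetitionSpecification._hub_evaluate()`. The target column names are made up, and `Metric.mean_absolute_error` is assumed to be an available member of the `Metric` enum; the point is to show how a multi-task, single-test-set input gets wrapped under the `"test"` key before the scoring loop runs.

```python
import numpy as np

from polaris.evaluate import Metric
from polaris.evaluate.utils import evaluate_benchmark

rng = np.random.default_rng(0)

# Multi-task predictions for a single test set: the keys match the target
# columns, so is_multi_task_single_test_set() wraps both dicts as {"test": ...}.
target_cols = ["TARGET_A", "TARGET_B"]  # placeholder names
y_true = {col: rng.normal(size=50) for col in target_cols}
y_pred = {col: rng.normal(size=50) for col in target_cols}

scores = evaluate_benchmark(
    y_pred,
    y_true,
    target_cols,
    metrics=[Metric.mean_absolute_error],  # assumed member of the Metric enum
)
print(scores)  # one row per (test set, target, metric) in the ResultsType DataFrame
```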
26 changes: 24 additions & 2 deletions polaris/hub/client.py
@@ -40,6 +40,7 @@
ArtifactType,
HubOwner,
IOMode,
PredictionsType,
SupportedLicenseType,
TimeoutTypes,
ZarrConflictResolution,
@@ -776,7 +777,6 @@ def upload_competition(
"""Upload a competition to the Polaris Hub.

Args:
dataset: The dataset to upload.
competition: The competition to upload.
timeout: Request timeout values. User can modify the value when uploading large dataset as needed.
This can be a single value with the timeout in seconds for all IO operations, or a more granular
@@ -796,7 +796,6 @@
dataset_response = self._upload_dataset(
competition.dataset, ArtifactType.COMPETITION.value, ACCESS, timeout, owner, if_exists
)

# Upload competition benchmark
competition_response = self._upload_benchmark(
competition, ArtifactType.COMPETITION.value, ACCESS, owner
@@ -851,3 +850,26 @@ def list_competitions(self, limit: int = 100, offset: int = 0) -> list[str]:
)
benchmarks_list = [f"{HubOwner(**bm['owner'])}/{bm['name']}" for bm in response["data"]]
return benchmarks_list

def evaluate_competition(
self,
competition: CompetitionSpecification,
y_pred: PredictionsType
) -> BenchmarkResults:
"""Evaluate the predictions for a competition on the Polaris Hub.

Args:
competition: The competition to evaluate the predictions for.
y_pred: The predictions for the test set, as NumPy arrays.
If there are multiple targets, the predictions should be wrapped in a dictionary with the target labels as keys.

Returns:
A `BenchmarkResults` object.
"""
return self._base_request_to_hub(
Contributor Author:
This will call the corresponding hub endpoint (we should decide on what the URL should be), download the labels, then run the evaluation, returning a benchmark results object.

url=f"/v2/competition/evaluate",
method="PUT",
json={
"competition": competition.artifact_id,
"predictions": y_pred
})
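The same endpoint can also be reached through the client directly, which is all `CompetitionSpecification.evaluate()` does under the hood. A minimal sketch, assuming default client settings; the `evaluate_on_hub` name is illustrative and the import path for `CompetitionSpecification` is assumed.

```python
from polaris.competition import CompetitionSpecification  # import path assumed
from polaris.evaluate import BenchmarkResults
from polaris.hub.client import PolarisHubClient
from polaris.utils.types import PredictionsType


def evaluate_on_hub(competition: CompetitionSpecification, y_pred: PredictionsType) -> BenchmarkResults:
    """PUT the predictions to /v2/competition/evaluate and return the scored results."""
    with PolarisHubClient() as client:  # default settings; auth handled by the client
        return client.evaluate_competition(competition, y_pred=y_pred)
```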
1 change: 0 additions & 1 deletion polaris/utils/context.py
@@ -1,6 +1,5 @@
from contextlib import contextmanager


@contextmanager
def tmp_attribute_change(obj, attribute, value):
"""Temporarily set and reset an attribute of an object."""
6 changes: 3 additions & 3 deletions polaris/utils/types.py
@@ -28,7 +28,7 @@
A prediction is one of three things:

- A single array (single-task, single test set)
- A dictionary of arrays (single-task, multiple test sets)
- A dictionary of arrays (single-task, multiple test sets)
- A dictionary of dictionaries of arrays (multi-task, multiple test sets)
"""

@@ -54,14 +54,14 @@
"""
A URL-compatible string that can be turned into a slug by the hub.

Can only use alpha-numeric characters, underscores and dashes.
Can only use alpha-numeric characters, underscores and dashes.
The string must be at least 4 and at most 64 characters long.
"""


HubUser: TypeAlias = SlugCompatibleStringType
"""
A user on the Polaris Hub is identified by a username,
A user on the Polaris Hub is identified by a username,
which is a [`SlugCompatibleStringType`][polaris.utils.types.SlugCompatibleStringType].
"""
