diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py
index 0d4f12ed..48a87158 100644
--- a/polaris/benchmark/_base.py
+++ b/polaris/benchmark/_base.py
@@ -5,7 +5,6 @@
 import fsspec
 import numpy as np
-import pandas as pd
 from datamol.utils import fs
 from pydantic import (
     Field,
@@ -19,9 +18,9 @@
 from polaris._artifact import BaseArtifactModel
 from polaris.dataset import Dataset, Subset
-from polaris.evaluate import BenchmarkResults, Metric, ResultsType
+from polaris.evaluate import BenchmarkResults, Metric
+from polaris.evaluate.utils import evaluate_benchmark
 from polaris.hub.settings import PolarisHubSettings
-from polaris.utils.context import tmp_attribute_change
 from polaris.utils.dict2html import dict2html
 from polaris.utils.errors import InvalidBenchmarkError, PolarisChecksumError
 from polaris.utils.misc import listit
@@ -351,6 +350,37 @@ def task_type(self) -> TaskType:
         v = TaskType.MULTI_TASK if len(self.target_cols) > 1 else TaskType.SINGLE_TASK
         return v.value
 
+    def _get_subset(self, indices, hide_targets=True, featurization_fn=None):
+        """Returns a [`Subset`][polaris.dataset.Subset] using the given indices. Used
+        internally to construct the train and test sets."""
+        return Subset(
+            dataset=self.dataset,
+            indices=indices,
+            input_cols=self.input_cols,
+            target_cols=self.target_cols,
+            hide_targets=hide_targets,
+            featurization_fn=featurization_fn,
+        )
+
+    def _get_test_set(
+        self, hide_targets=True, featurization_fn: Optional[Callable] = None
+    ) -> Union["Subset", dict[str, Subset]]:
+        """Construct the test set(s), given the split in the benchmark specification. Used
+        internally to construct the test set for client use and evaluation.
+        """
+        def make_test_subset(vals):
+            return self._get_subset(vals,
+                                    hide_targets=hide_targets,
+                                    featurization_fn=featurization_fn)
+
+        test_split = self.split[1]
+        if isinstance(test_split, dict):
+            test = {k: make_test_subset(v) for k, v in test_split.items()}
+        else:
+            test = make_test_subset(test_split)
+
+        return test
+
     def get_train_test_split(
         self, featurization_fn: Optional[Callable] = None
     ) -> tuple[Subset, Union["Subset", dict[str, Subset]]]:
@@ -366,25 +396,12 @@ def get_train_test_split(
         Returns:
             A tuple with the train `Subset` and test `Subset` objects.
-            If there are multiple test sets, these are returned in a dictionary and each test set has
-            an associated name. The targets of the test set can not be accessed.
+            If there are multiple test sets, these are returned in a dictionary and each test set has
+            an associated name. The targets of the test set can not be accessed.
         """
-        def _get_subset(indices, hide_targets):
-            return Subset(
-                dataset=self.dataset,
-                indices=indices,
-                input_cols=self.input_cols,
-                target_cols=self.target_cols,
-                hide_targets=hide_targets,
-                featurization_fn=featurization_fn,
-            )
-
-        train = _get_subset(self.split[0], hide_targets=False)
-        if isinstance(self.split[1], dict):
-            test = {k: _get_subset(v, hide_targets=True) for k, v in self.split[1].items()}
-        else:
-            test = _get_subset(self.split[1], hide_targets=True)
+        train = self._get_subset(self.split[0], hide_targets=False, featurization_fn=featurization_fn)
+        test = self._get_test_set(hide_targets=True, featurization_fn=featurization_fn)
 
         return train, test
 
@@ -416,60 +433,13 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults:
         # Instead of having the user pass the ground truth, we extract it from the benchmark spec ourselves.
         # This simplifies the API, but also was added to make accidental access to the test set targets less likely.
         # See also the `hide_targets` parameter in the `Subset` class.
-        test = self.get_train_test_split()[1]
-
-        if not isinstance(test, dict):
-            test = {"test": test}
-
-        y_true = {}
-        for k, test_subset in test.items():
-            with tmp_attribute_change(test_subset, "_hide_targets", False):
-                y_true[k] = test_subset.targets
-
-        if not isinstance(y_pred, dict) or all(k in self.target_cols for k in y_pred):
-            y_pred = {"test": y_pred}
-
-        if any(k not in y_pred for k in test.keys()):
-            raise KeyError(
-                f"Missing keys for at least one of the test sets. Expecting: {sorted(test.keys())}"
-            )
+        test = self._get_test_set(hide_targets=False)
+        y_true = test.targets
+        scores = evaluate_benchmark(y_pred, y_true, self.target_cols, self.metrics)
 
-        # Results are saved in a tabular format. For more info, see the BenchmarkResults docs.
-        scores: ResultsType = pd.DataFrame(columns=BenchmarkResults.RESULTS_COLUMNS)
-
-        # For every test set...
-        for test_label, y_true_subset in y_true.items():
-            # For every metric...
-            for metric in self.metrics:
-                if metric.is_multitask:
-                    # Multi-task but with a metric across targets
-                    score = metric(y_true=y_true_subset, y_pred=y_pred[test_label])
-                    scores.loc[len(scores)] = (test_label, "aggregated", metric, score)
-                    continue
-
-                if not isinstance(y_true_subset, dict):
-                    # Single task
-                    score = metric(y_true=y_true_subset, y_pred=y_pred[test_label])
-                    scores.loc[len(scores)] = (
-                        test_label,
-                        self.target_cols[0],
-                        metric,
-                        score,
-                    )
-                    continue
-
-                # Otherwise, for every target...
-                for target_label, y_true_target in y_true_subset.items():
-                    # Single-task metrics for a multi-task benchmark
-                    # In such a setting, there can be NaN values, which we thus have to filter out.
-                    mask = ~np.isnan(y_true_target)
-                    score = metric(
-                        y_true=y_true_target[mask],
-                        y_pred=y_pred[test_label][target_label][mask],
-                    )
-                    scores.loc[len(scores)] = (test_label, target_label, metric, score)
-
-        return BenchmarkResults(results=scores, benchmark_name=self.name, benchmark_owner=self.owner)
+        return BenchmarkResults(results=scores,
+                                benchmark_name=self.name,
+                                benchmark_owner=self.owner)
 
     def upload_to_hub(
         self,
diff --git a/polaris/competition/_competition.py b/polaris/competition/_competition.py
index 860ce802..d21a4016 100644
--- a/polaris/competition/_competition.py
+++ b/polaris/competition/_competition.py
@@ -1,12 +1,14 @@
 from datetime import datetime
 import os
+import numpy as np
 from typing import Optional, Union
 
 from pydantic import field_serializer
 
 from polaris.benchmark import BenchmarkSpecification
+from polaris.evaluate import BenchmarkResults
+from polaris.evaluate.utils import evaluate_benchmark
 from polaris.hub.settings import PolarisHubSettings
-from polaris.utils.types import AccessType, HubOwner, TimeoutTypes, ZarrConflictResolution
-
+from polaris.utils.types import AccessType, HubOwner, PredictionsType, TimeoutTypes, ZarrConflictResolution
 class CompetitionSpecification(BenchmarkSpecification):
     """This class extends the [`BenchmarkSpecification`][polaris.benchmark.BenchmarkSpecification] to
@@ -15,6 +17,11 @@ class CompetitionSpecification(BenchmarkSpecification):
     facilitate interactions with Polaris Competitions.
 
     Much of the underlying data model and logic is shared across Benchmarks and Competitions, and
     anything within this class serves as a point of differentiation between the two.
+
+    Currently, these entities will primarily differ in how user predictions are evaluated.
     """
@@ -23,13 +30,47 @@ class CompetitionSpecification(BenchmarkSpecification):
     scheduled_end_time: datetime | None = None
     actual_end_time: datetime | None = None
 
-    def evaluate(self, predictions):
-        """Wrapper method which ultimately triggers an evaluation service to assess and score user predictions
-        for a given competition
+    def evaluate(
+        self,
+        y_pred: PredictionsType,
+        env_file: Optional[Union[str, os.PathLike]] = None,
+        settings: Optional[PolarisHubSettings] = None,
+        cache_auth_token: bool = True,
+        **kwargs: dict
+    ):
+        """Light convenience wrapper around
+        [`PolarisHubClient.evaluate_competition`][polaris.hub.client.PolarisHubClient.evaluate_competition].
         """
+        from polaris.hub.client import PolarisHubClient
 
-        # TODO validate that the number of predictions supplied matches the number of test set rows
-        pass
+        with PolarisHubClient(
+            env_file=env_file,
+            settings=settings,
+            cache_auth_token=cache_auth_token,
+            **kwargs,
+        ) as client:
+            client.evaluate_competition(self, y_pred=y_pred)
+
+    def _hub_evaluate(self, y_pred: PredictionsType, y_true: PredictionsType):
+        """Executes the evaluation logic for a competition, given a set of predictions.
+        Called only by the Polaris Hub to evaluate competitions after the labels are
+        downloaded from R2 on the hub. Evaluation logic is the same as for regular benchmarks.
+
+        Args:
+            y_pred: The predictions for the test set, as NumPy arrays.
+                If there are multiple targets, the predictions should be wrapped in a
+                dictionary with the target labels as keys.
+
+            y_true: The true target values for the test set. If there are multiple targets,
+                the target columns should be wrapped in a dictionary with the target labels as keys.
+
+        Returns:
+            A `BenchmarkResults` object containing the evaluation results.
+        """
+        scores = evaluate_benchmark(y_pred, y_true, self.target_cols, self.metrics)
+        return BenchmarkResults(results=scores,
+                                benchmark_name=self.name,
+                                benchmark_owner=self.owner)
 
     def upload_to_hub(
         self,
diff --git a/polaris/evaluate/utils.py b/polaris/evaluate/utils.py
new file mode 100644
index 00000000..1f601538
--- /dev/null
+++ b/polaris/evaluate/utils.py
@@ -0,0 +1,62 @@
+import numpy as np
+import pandas as pd
+from typing import Union
+
+from polaris.evaluate import BenchmarkResults, ResultsType
+from polaris.utils.types import PredictionsType
+from polaris.evaluate import Metric
+
+def is_multi_task_single_test_set(vals: PredictionsType, target_cols: list[str]):
+    """Check if the given values are for a multi-task benchmark with a single
+    test set. This is inferred by comparing the target names with the keys of the
+    given data. If all keys in the given data match the target column names, we
+    assume they are target names (as opposed to test set names for a single-task,
+    multiple test set benchmark)."""
+    return not isinstance(vals, dict) or set(vals.keys()) == set(target_cols)
+
+def evaluate_benchmark(y_pred: PredictionsType,
+                       y_true: PredictionsType,
+                       target_cols: list[str],
+                       metrics: Union[str, Metric, list[Union[str, Metric]]]):
+    if is_multi_task_single_test_set(y_true, target_cols):
+        y_true = {"test": y_true}
+
+    if is_multi_task_single_test_set(y_pred, target_cols):
+        y_pred = {"test": y_pred}
+
+    if set(y_true.keys()) != set(y_pred.keys()):
+        raise KeyError(
+            f"Missing keys for at least one of the test sets. Expecting: {sorted(y_true.keys())}"
+        )
+
+    # Results are saved in a tabular format. For more info, see the BenchmarkResults docs.
+    scores: ResultsType = pd.DataFrame(columns=BenchmarkResults.RESULTS_COLUMNS)
+
+    # For every test set...
+    for test_label, y_true_subset in y_true.items():
+        # For every metric...
+        for metric in metrics:
+            if metric.is_multitask:
+                # Multi-task but with a metric across targets
+                score = metric(y_true=y_true_subset, y_pred=y_pred[test_label])
+                scores.loc[len(scores)] = (test_label, "aggregated", metric, score)
+                continue
+
+            if not isinstance(y_true_subset, dict):
+                # Single task
+                score = metric(y_true=y_true_subset, y_pred=y_pred[test_label])
+                scores.loc[len(scores)] = (test_label, target_cols[0], metric, score)
+                continue
+
+            # Otherwise, for every target...
+            for target_label, y_true_target in y_true_subset.items():
+                # Single-task metrics for a multi-task benchmark
+                # In such a setting, there can be NaN values, which we thus have to filter out.
+                mask = ~np.isnan(y_true_target)
+                score = metric(
+                    y_true=y_true_target[mask],
+                    y_pred=y_pred[test_label][target_label][mask],
+                )
+                scores.loc[len(scores)] = (test_label, target_label, metric, score)
+
+    return scores
diff --git a/polaris/hub/client.py b/polaris/hub/client.py
index e693f02c..7970648b 100644
--- a/polaris/hub/client.py
+++ b/polaris/hub/client.py
@@ -40,6 +40,7 @@
     ArtifactType,
     HubOwner,
     IOMode,
+    PredictionsType,
     SupportedLicenseType,
     TimeoutTypes,
     ZarrConflictResolution,
@@ -776,7 +777,6 @@
         """Upload a competition to the Polaris Hub.
 
         Args:
-            dataset: The dataset to upload.
            competition: The competition to upload.
            timeout: Request timeout values. User can modify the value when uploading large dataset as needed.
                This can be a single value with the timeout in seconds for all IO operations, or a more granular
@@ -796,7 +796,6 @@
         dataset_response = self._upload_dataset(
             competition.dataset, ArtifactType.COMPETITION.value, ACCESS, timeout, owner, if_exists
         )
-        # Upload competition benchmark
         competition_response = self._upload_benchmark(
             competition, ArtifactType.COMPETITION.value, ACCESS, owner
@@ -851,3 +850,26 @@ def list_competitions(self, limit: int = 100, offset: int = 0) -> list[str]:
         )
         benchmarks_list = [f"{HubOwner(**bm['owner'])}/{bm['name']}" for bm in response["data"]]
         return benchmarks_list
+
+    def evaluate_competition(
+        self,
+        competition: CompetitionSpecification,
+        y_pred: PredictionsType
+    ) -> BenchmarkResults:
+        """Evaluate the predictions for a competition on the Polaris Hub.
+
+        Args:
+            competition: The competition to evaluate the predictions for.
+            y_pred: The predictions for the test set, as NumPy arrays.
+                If there are multiple targets, the predictions should be wrapped in a dictionary with the target labels as keys.
+
+        Returns:
+            A `BenchmarkResults` object.
+        """
+        return self._base_request_to_hub(
+            url=f"/v2/competition/evaluate",
+            method="PUT",
+            json={
+                "competition": competition.artifact_id,
+                "predictions": y_pred
+            })
diff --git a/polaris/utils/context.py b/polaris/utils/context.py
index c5c1520c..ed85c92c 100644
--- a/polaris/utils/context.py
+++ b/polaris/utils/context.py
@@ -1,6 +1,5 @@
 from contextlib import contextmanager
 
-
 @contextmanager
 def tmp_attribute_change(obj, attribute, value):
     """Temporarily set and reset an attribute of an object."""
diff --git a/polaris/utils/types.py b/polaris/utils/types.py
index b8ad3fd8..62fc59f0 100644
--- a/polaris/utils/types.py
+++ b/polaris/utils/types.py
@@ -28,7 +28,7 @@
 A prediction is one of three things:
 
 - A single array (single-task, single test set)
-- A dictionary of arrays (single-task, multiple test sets)
+- A dictionary of arrays (single-task, multiple test sets)
 - A dictionary of dictionaries of arrays (multi-task, multiple test sets)
 """
@@ -54,14 +54,14 @@
 """
 A URL-compatible string that can be turned into a slug by the hub.
 
-Can only use alpha-numeric characters, underscores and dashes.
+Can only use alpha-numeric characters, underscores and dashes.
 The string must be at least 4 and at most 64 characters long.
 """
 
 HubUser: TypeAlias = SlugCompatibleStringType
 """
-A user on the Polaris Hub is identified by a username,
+A user on the Polaris Hub is identified by a username,
 which is a [`SlugCompatibleStringType`][polaris.utils.types.SlugCompatibleStringType].
 """
diff --git a/tests/conftest.py b/tests/conftest.py
index 1ebc0a02..5a9c2e40 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -9,6 +9,7 @@
     MultiTaskBenchmarkSpecification,
     SingleTaskBenchmarkSpecification,
 )
+from polaris.competition import CompetitionSpecification
 from polaris.dataset import ColumnAnnotation, Dataset
 from polaris.utils.types import HubOwner
 
@@ -192,3 +193,26 @@ def test_multi_task_benchmark_clf(test_dataset):
     )
     check_version(benchmark)
     return benchmark
+
+@pytest.fixture(scope="function")
+def test_competition(test_dataset):
+    train_indices = list(range(90))
+    test_indices = list(range(90, 100))
+    competition = CompetitionSpecification(
+        name="test-competition",
+        dataset=test_dataset,
+        metrics=[
+            "mean_absolute_error",
+            "mean_squared_error",
+            "r2",
+            "spearmanr",
+            "pearsonr",
+            "explained_var",
+        ],
+        main_metric="mean_absolute_error",
+        split=(train_indices, test_indices),
+        target_cols="expt",
+        input_cols="smiles",
+    )
+    check_version(competition)
+    return competition
\ No newline at end of file
diff --git a/tests/test_competition.py b/tests/test_competition.py
new file mode 100644
index 00000000..3ee25647
--- /dev/null
+++ b/tests/test_competition.py
@@ -0,0 +1,27 @@
+import pandas as pd
+import numpy as np
+
+from polaris.competition import CompetitionSpecification
+
+test = [-3.84, -9.73, -2.49, -4.13, -4.02, -2.1, -4.59, 2.3, -6.09, -7.07]
+predictions = test + np.random.uniform(0, 3, size=len(test))
+
+def test_competition_from_json(test_competition, tmpdir):
+    """Test whether we can successfully save and load a competition from JSON."""
+    path = test_competition.to_json(str(tmpdir))
+    new_competition = CompetitionSpecification.from_json(path)
+    assert new_competition == test_competition
+
+def test_competition_evaluation(test_competition):
+    """Test whether we can successfully evaluate a competition."""
+    competition = test_competition
+    result = competition._hub_evaluate(predictions, test)
+    assert isinstance(result.results, pd.DataFrame)
+    assert set(result.results.columns) == {
+        "Test set",
+        "Target label",
+        "Metric",
+        "Score",
+    }
+    for metric in competition.metrics:
+        assert metric in result.results.Metric.tolist()
\ No newline at end of file
diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py
index 5a4332c9..a241cf8e 100644
--- a/tests/test_evaluate.py
+++ b/tests/test_evaluate.py
@@ -101,4 +101,4 @@ def test_metrics_multitask_clf(tmpdir: str, test_multi_task_benchmark_clf: Multi
 
 def test_metric_direction():
     for metric in Metric:
-        assert metric.value.direction in ["min", "max"]
+        assert metric.value.direction in ["min", "max"]
\ No newline at end of file
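
Note (not part of the patch): a minimal usage sketch of the new `evaluate_benchmark` helper introduced in `polaris/evaluate/utils.py`, assuming a single-task benchmark with a single test set. The array values and the `"expt"` target label are illustrative, and `Metric.mean_absolute_error` stands in for any metric configured on the benchmark.

import numpy as np

from polaris.evaluate import Metric
from polaris.evaluate.utils import evaluate_benchmark

# Hypothetical ground truth and predictions for a single-task, single-test-set benchmark.
y_true = np.array([-3.84, -9.73, -2.49, -4.13])
y_pred = y_true + 0.1

# evaluate_benchmark wraps both inputs into {"test": ...} when is_multi_task_single_test_set
# is true, and returns a DataFrame with the BenchmarkResults.RESULTS_COLUMNS columns
# ("Test set", "Target label", "Metric", "Score").
scores = evaluate_benchmark(
    y_pred,
    y_true,
    target_cols=["expt"],
    metrics=[Metric.mean_absolute_error],
)
print(scores)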