From 8978856f9d8bab4a0df35e4a48e5979c4aeff3d1 Mon Sep 17 00:00:00 2001 From: Kira McLean Date: Tue, 7 May 2024 23:16:38 -0300 Subject: [PATCH 01/18] call hub evaluate endpoint from client evaluate_competitions method --- polaris/competition/_competition.py | 32 ++++++++++++++++++++++------- polaris/hub/client.py | 29 ++++++++++++++++++++++++-- tests/conftest.py | 1 + 3 files changed, 53 insertions(+), 9 deletions(-) diff --git a/polaris/competition/_competition.py b/polaris/competition/_competition.py index 860ce802..8f00992f 100644 --- a/polaris/competition/_competition.py +++ b/polaris/competition/_competition.py @@ -3,10 +3,10 @@ from typing import Optional, Union from pydantic import field_serializer +import numpy as np from polaris.benchmark import BenchmarkSpecification from polaris.hub.settings import PolarisHubSettings -from polaris.utils.types import AccessType, HubOwner, TimeoutTypes, ZarrConflictResolution - +from polaris.utils.types import AccessType, HubOwner, PredictionsType, TimeoutTypes, ZarrConflictResolution class CompetitionSpecification(BenchmarkSpecification): """This class extends the [`BenchmarkSpecification`][polaris.benchmark.BenchmarkSpecification] to @@ -23,13 +23,31 @@ class CompetitionSpecification(BenchmarkSpecification): scheduled_end_time: datetime | None = None actual_end_time: datetime | None = None - def evaluate(self, predictions): - """Wrapper method which ultimately triggers an evaluation service to assess and score user predictions - for a given competition + def evaluate( + self, + y_pred: PredictionsType, + env_file: Optional[Union[str, os.PathLike]] = None, + settings: Optional[PolarisHubSettings] = None, + cache_auth_token: bool = True, + **kwargs: dict + ): + """Light convenience wrapper around + [`PolarisHubClient.evaluate_competition`][polaris.hub.client.PolarisHubClient.evaluate_competition]. """ + from polaris.hub.client import PolarisHubClient - # TODO validate that the number of predictions supplied matches the number of test set rows - pass + with PolarisHubClient( + env_file=env_file, + settings=settings, + cache_auth_token=cache_auth_token, + **kwargs, + ) as client: + return client.evaluate_competition(self, access, owner) + + def _hub_evaluate(self, y_pred: np.ndarray, test: np.ndarray): + """Method called only by Polaris Hub to evaluate competitions. Labels are provided after being downloaded from R2 on the hub. + """ + return "Internal hub evaluation.." def upload_to_hub( self, diff --git a/polaris/hub/client.py b/polaris/hub/client.py index e693f02c..23c7573b 100644 --- a/polaris/hub/client.py +++ b/polaris/hub/client.py @@ -40,6 +40,7 @@ ArtifactType, HubOwner, IOMode, + PredictionsType, SupportedLicenseType, TimeoutTypes, ZarrConflictResolution, @@ -776,7 +777,6 @@ def upload_competition( """Upload a competition to the Polaris Hub. Args: - dataset: The dataset to upload. competition: The competition to upload. timeout: Request timeout values. User can modify the value when uploading large dataset as needed. 
This can be a single value with the timeout in seconds for all IO operations, or a more granular @@ -796,7 +796,6 @@ def upload_competition( dataset_response = self._upload_dataset( competition.dataset, ArtifactType.COMPETITION.value, ACCESS, timeout, owner, if_exists ) - # Upload competition benchmark competition_response = self._upload_benchmark( competition, ArtifactType.COMPETITION.value, ACCESS, owner @@ -851,3 +850,29 @@ def list_competitions(self, limit: int = 100, offset: int = 0) -> list[str]: ) benchmarks_list = [f"{HubOwner(**bm['owner'])}/{bm['name']}" for bm in response["data"]] return benchmarks_list + + def evaluate_competition( + self, + competition: CompetitionSpecification, + y_pred: PredictionsType + ) -> BenchmarkResults: + """Evaluate the predictions for a competition on the Polaris Hub. + + Args: + competition: The competition to evaluate the predictions for. + y_pred: The predictions for the test set, as NumPy arrays. + If there are multiple targets, the predictions should be wrapped in a dictionary with the target labels as keys. + + Returns: + A `BenchmarkResults` object. + """ + return self._base_request_to_hub( + url=f"/v2/competition/evaluate", + method="PUT", + json={ + "competition": competition.model_dump(exclude_none=True, + exclude=["dataset", + "split"], + by_alias=True), + "predictions": y_pred + }) \ No newline at end of file diff --git a/tests/conftest.py b/tests/conftest.py index 1ebc0a02..3cae11c6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -9,6 +9,7 @@ MultiTaskBenchmarkSpecification, SingleTaskBenchmarkSpecification, ) +from polaris.competition import CompetitionSpecification from polaris.dataset import ColumnAnnotation, Dataset from polaris.utils.types import HubOwner From 6d4a9586e380a7044dc8d3fb51a28697c38d4ec8 Mon Sep 17 00:00:00 2001 From: Kira McLean Date: Wed, 15 May 2024 11:33:20 -0300 Subject: [PATCH 02/18] add super basic test for evaluating competitions --- polaris/benchmark/_base.py | 58 ++-------------------------- polaris/competition/_competition.py | 36 ++++++++++++++++-- polaris/evaluate/utils.py | 59 +++++++++++++++++++++++++++++ polaris/utils/context.py | 1 - tests/conftest.py | 23 +++++++++++ tests/test_competition.py | 26 +++++++++++++ tests/test_evaluate.py | 2 +- 7 files changed, 144 insertions(+), 61 deletions(-) create mode 100644 polaris/evaluate/utils.py create mode 100644 tests/test_competition.py diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py index 0d4f12ed..ba93924b 100644 --- a/polaris/benchmark/_base.py +++ b/polaris/benchmark/_base.py @@ -5,7 +5,6 @@ import fsspec import numpy as np -import pandas as pd from datamol.utils import fs from pydantic import ( Field, @@ -19,9 +18,9 @@ from polaris._artifact import BaseArtifactModel from polaris.dataset import Dataset, Subset -from polaris.evaluate import BenchmarkResults, Metric, ResultsType +from polaris.evaluate import BenchmarkResults, Metric +from polaris.evaluate.utils import evaluate_benchmark from polaris.hub.settings import PolarisHubSettings -from polaris.utils.context import tmp_attribute_change from polaris.utils.dict2html import dict2html from polaris.utils.errors import InvalidBenchmarkError, PolarisChecksumError from polaris.utils.misc import listit @@ -418,58 +417,7 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults: # See also the `hide_targets` parameter in the `Subset` class. 
test = self.get_train_test_split()[1] - if not isinstance(test, dict): - test = {"test": test} - - y_true = {} - for k, test_subset in test.items(): - with tmp_attribute_change(test_subset, "_hide_targets", False): - y_true[k] = test_subset.targets - - if not isinstance(y_pred, dict) or all(k in self.target_cols for k in y_pred): - y_pred = {"test": y_pred} - - if any(k not in y_pred for k in test.keys()): - raise KeyError( - f"Missing keys for at least one of the test sets. Expecting: {sorted(test.keys())}" - ) - - # Results are saved in a tabular format. For more info, see the BenchmarkResults docs. - scores: ResultsType = pd.DataFrame(columns=BenchmarkResults.RESULTS_COLUMNS) - - # For every test set... - for test_label, y_true_subset in y_true.items(): - # For every metric... - for metric in self.metrics: - if metric.is_multitask: - # Multi-task but with a metric across targets - score = metric(y_true=y_true_subset, y_pred=y_pred[test_label]) - scores.loc[len(scores)] = (test_label, "aggregated", metric, score) - continue - - if not isinstance(y_true_subset, dict): - # Single task - score = metric(y_true=y_true_subset, y_pred=y_pred[test_label]) - scores.loc[len(scores)] = ( - test_label, - self.target_cols[0], - metric, - score, - ) - continue - - # Otherwise, for every target... - for target_label, y_true_target in y_true_subset.items(): - # Single-task metrics for a multi-task benchmark - # In such a setting, there can be NaN values, which we thus have to filter out. - mask = ~np.isnan(y_true_target) - score = metric( - y_true=y_true_target[mask], - y_pred=y_pred[test_label][target_label][mask], - ) - scores.loc[len(scores)] = (test_label, target_label, metric, score) - - return BenchmarkResults(results=scores, benchmark_name=self.name, benchmark_owner=self.owner) + return evaluate_benchmark(self, y_pred, test) def upload_to_hub( self, diff --git a/polaris/competition/_competition.py b/polaris/competition/_competition.py index 8f00992f..bf8a2460 100644 --- a/polaris/competition/_competition.py +++ b/polaris/competition/_competition.py @@ -4,7 +4,10 @@ from pydantic import field_serializer import numpy as np +import pandas as pd from polaris.benchmark import BenchmarkSpecification +from polaris.dataset import Dataset, Subset +from polaris.evaluate.utils import evaluate_benchmark from polaris.hub.settings import PolarisHubSettings from polaris.utils.types import AccessType, HubOwner, PredictionsType, TimeoutTypes, ZarrConflictResolution @@ -42,12 +45,37 @@ def evaluate( cache_auth_token=cache_auth_token, **kwargs, ) as client: - return client.evaluate_competition(self, access, owner) + client.evaluate_competition(self, y_pred=y_pred) - def _hub_evaluate(self, y_pred: np.ndarray, test: np.ndarray): - """Method called only by Polaris Hub to evaluate competitions. Labels are provided after being downloaded from R2 on the hub. + def _hub_evaluate(self, y_pred: PredictionsType, test: PredictionsType): + """Executes the evaluation logic for a competition, given a set of predictions. + Called only by Polaris Hub to evaluate competitions after labels are + downloaded from R2 on the hub. Evalutaion logic is the same as for regular benchmarks. + + Args: + y_pred: The predictions for the test set, as NumPy arrays. + If there are multiple targets, the predictions should be wrapped in a + dictionary with the target labels as keys. + + test: The test set. If there are multiple targets, the target columns should + be wrapped in a dictionary with the target labels as keys. 
+ + Returns: + A `BenchmarkResults` object containing the evaluation results. """ - return "Internal hub evaluation.." + dataset = Dataset( + table=pd.DataFrame(test, columns=self.target_cols), + name=f'{self.name}_test_set', + description=f"Target labels for competition {self.name}. Used internally to evaluate competition predictions." + ) + test_subset = Subset( + dataset=dataset, + indices=list(range(len(self.split[1]))), + input_cols=self.input_cols, + target_cols=self.target_cols, + hide_targets=False + ) + return evaluate_benchmark(self, y_pred, test_subset) def upload_to_hub( self, diff --git a/polaris/evaluate/utils.py b/polaris/evaluate/utils.py new file mode 100644 index 00000000..da1692e8 --- /dev/null +++ b/polaris/evaluate/utils.py @@ -0,0 +1,59 @@ +import pandas as pd +import numpy as np + +from polaris.utils.context import tmp_attribute_change +from polaris.evaluate import BenchmarkResults, ResultsType + +def evaluate_benchmark(model, y_pred, test): + if not isinstance(test, dict): + test = {"test": test} + + y_true = {} + for k, test_subset in test.items(): + with tmp_attribute_change(test_subset, "_hide_targets", False): + y_true[k] = test_subset.targets + + if not isinstance(y_pred, dict) or all(k in model.target_cols for k in y_pred): + y_pred = {"test": y_pred} + + if any(k not in y_pred for k in test.keys()): + raise KeyError( + f"Missing keys for at least one of the test sets. Expecting: {sorted(test.keys())}" + ) + + # Results are saved in a tabular format. For more info, see the BenchmarkResults docs. + scores: ResultsType = pd.DataFrame(columns=BenchmarkResults.RESULTS_COLUMNS) + + # For every test set... + for test_label, y_true_subset in y_true.items(): + # For every metric... + for metric in model.metrics: + if metric.is_multitask: + # Multi-task but with a metric across targets + score = metric(y_true=y_true_subset, y_pred=y_pred[test_label]) + scores.loc[len(scores)] = (test_label, "aggregated", metric, score) + continue + + if not isinstance(y_true_subset, dict): + # Single task + score = metric(y_true=y_true_subset, y_pred=y_pred[test_label]) + scores.loc[len(scores)] = ( + test_label, + model.target_cols[0], + metric, + score, + ) + continue + + # Otherwise, for every target... + for target_label, y_true_target in y_true_subset.items(): + # Single-task metrics for a multi-task benchmark + # In such a setting, there can be NaN values, which we thus have to filter out. 
+ mask = ~np.isnan(y_true_target) + score = metric( + y_true=y_true_target[mask], + y_pred=y_pred[test_label][target_label][mask], + ) + scores.loc[len(scores)] = (test_label, target_label, metric, score) + + return BenchmarkResults(results=scores, benchmark_name=model.name, benchmark_owner=model.owner) diff --git a/polaris/utils/context.py b/polaris/utils/context.py index c5c1520c..ed85c92c 100644 --- a/polaris/utils/context.py +++ b/polaris/utils/context.py @@ -1,6 +1,5 @@ from contextlib import contextmanager - @contextmanager def tmp_attribute_change(obj, attribute, value): """Temporarily set and reset an attribute of an object.""" diff --git a/tests/conftest.py b/tests/conftest.py index 3cae11c6..5a9c2e40 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -193,3 +193,26 @@ def test_multi_task_benchmark_clf(test_dataset): ) check_version(benchmark) return benchmark + +@pytest.fixture(scope="function") +def test_competition(test_dataset): + train_indices = list(range(90)) + test_indices = list(range(90, 100)) + competition = CompetitionSpecification( + name="test-competition", + dataset=test_dataset, + metrics=[ + "mean_absolute_error", + "mean_squared_error", + "r2", + "spearmanr", + "pearsonr", + "explained_var", + ], + main_metric="mean_absolute_error", + split=(train_indices, test_indices), + target_cols="expt", + input_cols="smiles", + ) + check_version(competition) + return competition \ No newline at end of file diff --git a/tests/test_competition.py b/tests/test_competition.py new file mode 100644 index 00000000..b7639879 --- /dev/null +++ b/tests/test_competition.py @@ -0,0 +1,26 @@ +import pandas as pd +import numpy as np + +from polaris.competition import CompetitionSpecification + +def test_competition_from_json(test_competition, tmpdir): + """Test whether we can successfully save and load a competition from JSON.""" + path = test_competition.to_json(str(tmpdir)) + new_competition = CompetitionSpecification.from_json(path) + assert new_competition == test_competition + +def test_competition_evaluation(test_competition): + """Test whether we can successfully evaluate a competition.""" + competition = test_competition + test = [-3.84, -9.73, -2.49, -4.13, -4.02, -2.1 , -4.59, 2.3 , -6.09, -7.07] + predictions = test + np.random.uniform(0, 3, size=len(test)) + result = competition._hub_evaluate(predictions, test) + assert isinstance(result.results, pd.DataFrame) + assert set(result.results.columns) == { + "Test set", + "Target label", + "Metric", + "Score", + } + for metric in competition.metrics: + assert metric in result.results.Metric.tolist() \ No newline at end of file diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py index 5a4332c9..a241cf8e 100644 --- a/tests/test_evaluate.py +++ b/tests/test_evaluate.py @@ -101,4 +101,4 @@ def test_metrics_multitask_clf(tmpdir: str, test_multi_task_benchmark_clf: Multi def test_metric_direction(): for metric in Metric: - assert metric.value.direction in ["min", "max"] + assert metric.value.direction in ["min", "max"] \ No newline at end of file From d990c50bd55cb9bd514d779da42b8d23e9cdae6b Mon Sep 17 00:00:00 2001 From: Kira McLean Date: Thu, 16 May 2024 18:00:17 -0300 Subject: [PATCH 03/18] be more specific in evaluate_benchmark signature --- polaris/evaluate/utils.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/polaris/evaluate/utils.py b/polaris/evaluate/utils.py index da1692e8..69c027f6 100644 --- a/polaris/evaluate/utils.py +++ b/polaris/evaluate/utils.py @@ -1,12 +1,16 @@ import 
pandas as pd import numpy as np +from polaris.dataset import Subset from polaris.utils.context import tmp_attribute_change from polaris.evaluate import BenchmarkResults, ResultsType +from polaris.utils.types import PredictionsType -def evaluate_benchmark(model, y_pred, test): - if not isinstance(test, dict): - test = {"test": test} +def evaluate_benchmark(model, + y_pred: PredictionsType, + test_set: Subset): + if not isinstance(test_set, dict): + test = {"test": test_set} y_true = {} for k, test_subset in test.items(): From 7ef3b797ac8dd45c72f9ea1c21791b0af0d14235 Mon Sep 17 00:00:00 2001 From: Kira McLean Date: Thu, 16 May 2024 18:06:29 -0300 Subject: [PATCH 04/18] Update polaris/hub/client.py Co-authored-by: Andrew Quirke <75542075+Andrewq11@users.noreply.github.com> --- polaris/hub/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polaris/hub/client.py b/polaris/hub/client.py index 23c7573b..f78ab07d 100644 --- a/polaris/hub/client.py +++ b/polaris/hub/client.py @@ -867,7 +867,7 @@ def evaluate_competition( A `BenchmarkResults` object. """ return self._base_request_to_hub( - url=f"/v2/competition/evaluate", + url="/v2/competition/evaluate", method="PUT", json={ "competition": competition.model_dump(exclude_none=True, From 02db480b232d0ea398701ef02f9c44ffb4c7261d Mon Sep 17 00:00:00 2001 From: Kira McLean Date: Fri, 17 May 2024 22:47:53 -0300 Subject: [PATCH 05/18] start refactoring object dependencies out of evaluation logic --- polaris/benchmark/_base.py | 57 +++++++++++++++++++---------- polaris/competition/_competition.py | 3 +- polaris/evaluate/utils.py | 35 ++++++++++-------- 3 files changed, 60 insertions(+), 35 deletions(-) diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py index ba93924b..d11f9464 100644 --- a/polaris/benchmark/_base.py +++ b/polaris/benchmark/_base.py @@ -350,6 +350,37 @@ def task_type(self) -> TaskType: v = TaskType.MULTI_TASK if len(self.target_cols) > 1 else TaskType.SINGLE_TASK return v.value + def _get_subset(self, indices, hide_targets=True, featurization_fn=None): + """Returns a [`Subset`][polaris.dataset.Subset] using the given indices. Used + internally to construct the train and test sets.""" + return Subset( + dataset=self.dataset, + indices=indices, + input_cols=self.input_cols, + target_cols=self.target_cols, + hide_targets=hide_targets, + featurization_fn=featurization_fn, + ) + + def _get_test_set( + self, hide_targets=True, featurization_fn: Optional[Callable] = None + ) -> Union["Subset", dict[str, Subset]]: + """Construct the test set(s), given the split in the benchmark specification. Used + internally to construct the test set for client use and evaluation. + """ + def make_test_subset(vals): + return self._get_subset(vals, + hide_targets=hide_targets, + featurization_fn=featurization_fn) + + test_split = self.split[1] + if isinstance(test_split, dict): + test = {k: make_test_subset(v) for k, v in test_split.items()} + else: + test = make_test_subset(test_split) + + return test + def get_train_test_split( self, featurization_fn: Optional[Callable] = None ) -> tuple[Subset, Union["Subset", dict[str, Subset]]]: @@ -365,25 +396,12 @@ def get_train_test_split( Returns: A tuple with the train `Subset` and test `Subset` objects. - If there are multiple test sets, these are returned in a dictionary and each test set has - an associated name. The targets of the test set can not be accessed. + If there are multiple test sets, these are returned in a dictionary and each test set has + an associated name. 
The targets of the test set can not be accessed. """ - def _get_subset(indices, hide_targets): - return Subset( - dataset=self.dataset, - indices=indices, - input_cols=self.input_cols, - target_cols=self.target_cols, - hide_targets=hide_targets, - featurization_fn=featurization_fn, - ) - - train = _get_subset(self.split[0], hide_targets=False) - if isinstance(self.split[1], dict): - test = {k: _get_subset(v, hide_targets=True) for k, v in self.split[1].items()} - else: - test = _get_subset(self.split[1], hide_targets=True) + train = self._get_subset(self.split[0], hide_targets=False, featurization_fn=featurization_fn) + test = self._get_test_set(featurization_fn) return train, test @@ -415,9 +433,10 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults: # Instead of having the user pass the ground truth, we extract it from the benchmark spec ourselves. # This simplifies the API, but also was added to make accidental access to the test set targets less likely. # See also the `hide_targets` parameter in the `Subset` class. - test = self.get_train_test_split()[1] + test = self._get_test_set(hide_targets=False) - return evaluate_benchmark(self, y_pred, test) + return evaluate_benchmark(y_pred, test, self.target_cols, + self.name, self.owner, self.metrics) def upload_to_hub( self, diff --git a/polaris/competition/_competition.py b/polaris/competition/_competition.py index bf8a2460..1c38f39a 100644 --- a/polaris/competition/_competition.py +++ b/polaris/competition/_competition.py @@ -75,7 +75,8 @@ def _hub_evaluate(self, y_pred: PredictionsType, test: PredictionsType): target_cols=self.target_cols, hide_targets=False ) - return evaluate_benchmark(self, y_pred, test_subset) + return evaluate_benchmark(y_pred, test_subset, self.target_cols, + self.name, self.owner, self.metrics) def upload_to_hub( self, diff --git a/polaris/evaluate/utils.py b/polaris/evaluate/utils.py index 69c027f6..9b7c6730 100644 --- a/polaris/evaluate/utils.py +++ b/polaris/evaluate/utils.py @@ -1,23 +1,26 @@ -import pandas as pd import numpy as np +import pandas as pd +from typing import Union from polaris.dataset import Subset -from polaris.utils.context import tmp_attribute_change from polaris.evaluate import BenchmarkResults, ResultsType from polaris.utils.types import PredictionsType +from polaris.evaluate import Metric -def evaluate_benchmark(model, - y_pred: PredictionsType, - test_set: Subset): - if not isinstance(test_set, dict): - test = {"test": test_set} +def evaluate_benchmark(y_pred: PredictionsType, + test_vals: Subset, + target_cols: list[str], + benchmark_name: str, + benchmark_owner: str, + metrics: Union[str, Metric, list[Union[str, Metric]]]): + if not isinstance(test_vals, dict): + test = {"test": test_vals} + else: + test = test_vals - y_true = {} - for k, test_subset in test.items(): - with tmp_attribute_change(test_subset, "_hide_targets", False): - y_true[k] = test_subset.targets + y_true = {k: test_subset.targets for k, test_subset in test.items()} - if not isinstance(y_pred, dict) or all(k in model.target_cols for k in y_pred): + if not isinstance(y_pred, dict) or all(k in target_cols for k in y_pred): y_pred = {"test": y_pred} if any(k not in y_pred for k in test.keys()): @@ -31,7 +34,7 @@ def evaluate_benchmark(model, # For every test set... for test_label, y_true_subset in y_true.items(): # For every metric... 
- for metric in model.metrics: + for metric in metrics: if metric.is_multitask: # Multi-task but with a metric across targets score = metric(y_true=y_true_subset, y_pred=y_pred[test_label]) @@ -43,7 +46,7 @@ def evaluate_benchmark(model, score = metric(y_true=y_true_subset, y_pred=y_pred[test_label]) scores.loc[len(scores)] = ( test_label, - model.target_cols[0], + target_cols[0], metric, score, ) @@ -60,4 +63,6 @@ def evaluate_benchmark(model, ) scores.loc[len(scores)] = (test_label, target_label, metric, score) - return BenchmarkResults(results=scores, benchmark_name=model.name, benchmark_owner=model.owner) + return BenchmarkResults(results=scores, + benchmark_name=benchmark_name, + benchmark_owner=benchmark_owner) From 34edabead7bc5ece39e87a5f366cdb910704080f Mon Sep 17 00:00:00 2001 From: Kira McLean Date: Fri, 17 May 2024 23:40:34 -0300 Subject: [PATCH 06/18] refactor test subset object out of evaluation logic --- polaris/benchmark/_base.py | 4 ++-- polaris/competition/_competition.py | 2 +- polaris/evaluate/utils.py | 24 ++++++++++++++---------- polaris/utils/types.py | 6 +++--- tests/test_competition.py | 5 +++-- 5 files changed, 23 insertions(+), 18 deletions(-) diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py index d11f9464..4af61688 100644 --- a/polaris/benchmark/_base.py +++ b/polaris/benchmark/_base.py @@ -401,7 +401,7 @@ def get_train_test_split( """ train = self._get_subset(self.split[0], hide_targets=False, featurization_fn=featurization_fn) - test = self._get_test_set(featurization_fn) + test = self._get_test_set(hide_targets=True, featurization_fn=featurization_fn) return train, test @@ -435,7 +435,7 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults: # See also the `hide_targets` parameter in the `Subset` class. test = self._get_test_set(hide_targets=False) - return evaluate_benchmark(y_pred, test, self.target_cols, + return evaluate_benchmark(y_pred, test.targets, self.target_cols, self.name, self.owner, self.metrics) def upload_to_hub( diff --git a/polaris/competition/_competition.py b/polaris/competition/_competition.py index 1c38f39a..c3403e41 100644 --- a/polaris/competition/_competition.py +++ b/polaris/competition/_competition.py @@ -75,7 +75,7 @@ def _hub_evaluate(self, y_pred: PredictionsType, test: PredictionsType): target_cols=self.target_cols, hide_targets=False ) - return evaluate_benchmark(y_pred, test_subset, self.target_cols, + return evaluate_benchmark(y_pred, test_subset.targets, self.target_cols, self.name, self.owner, self.metrics) def upload_to_hub( diff --git a/polaris/evaluate/utils.py b/polaris/evaluate/utils.py index 9b7c6730..b9b1def8 100644 --- a/polaris/evaluate/utils.py +++ b/polaris/evaluate/utils.py @@ -7,25 +7,29 @@ from polaris.utils.types import PredictionsType from polaris.evaluate import Metric +def is_multi_task_single_test_set(vals: PredictionsType, target_cols: list[str]): + """Check if the given values are for a multiple-task benchmark with a single + test set. This is inferred by comparing the target names with the keys of the + given data. 
If all keys in the given data match the target column names, we + assume they are target names (as opposed to test set names for a single-task, + multiple test set benchmark).""" + return not isinstance(vals, dict) or set(vals.keys()) == set(target_cols) + def evaluate_benchmark(y_pred: PredictionsType, - test_vals: Subset, + y_true: PredictionsType, target_cols: list[str], benchmark_name: str, benchmark_owner: str, metrics: Union[str, Metric, list[Union[str, Metric]]]): - if not isinstance(test_vals, dict): - test = {"test": test_vals} - else: - test = test_vals - - y_true = {k: test_subset.targets for k, test_subset in test.items()} + if is_multi_task_single_test_set(y_true, target_cols): + y_true = {"test": y_true} - if not isinstance(y_pred, dict) or all(k in target_cols for k in y_pred): + if is_multi_task_single_test_set(y_pred, target_cols): y_pred = {"test": y_pred} - if any(k not in y_pred for k in test.keys()): + if set(y_true.keys()) != set(y_pred.keys()): raise KeyError( - f"Missing keys for at least one of the test sets. Expecting: {sorted(test.keys())}" + f"Missing keys for at least one of the test sets. Expecting: {sorted(y_true.keys())}" ) # Results are saved in a tabular format. For more info, see the BenchmarkResults docs. diff --git a/polaris/utils/types.py b/polaris/utils/types.py index b8ad3fd8..62fc59f0 100644 --- a/polaris/utils/types.py +++ b/polaris/utils/types.py @@ -28,7 +28,7 @@ A prediction is one of three things: - A single array (single-task, single test set) -- A dictionary of arrays (single-task, multiple test sets) +- A dictionary of arrays (single-task, multiple test sets) - A dictionary of dictionaries of arrays (multi-task, multiple test sets) """ @@ -54,14 +54,14 @@ """ A URL-compatible string that can be turned into a slug by the hub. -Can only use alpha-numeric characters, underscores and dashes. +Can only use alpha-numeric characters, underscores and dashes. The string must be at least 4 and at most 64 characters long. """ HubUser: TypeAlias = SlugCompatibleStringType """ -A user on the Polaris Hub is identified by a username, +A user on the Polaris Hub is identified by a username, which is a [`SlugCompatibleStringType`][polaris.utils.types.SlugCompatibleStringType]. 
""" diff --git a/tests/test_competition.py b/tests/test_competition.py index b7639879..3ee25647 100644 --- a/tests/test_competition.py +++ b/tests/test_competition.py @@ -3,6 +3,9 @@ from polaris.competition import CompetitionSpecification +test = [-3.84, -9.73, -2.49, -4.13, -4.02, -2.1 , -4.59, 2.3 , -6.09, -7.07] +predictions = test + np.random.uniform(0, 3, size=len(test)) + def test_competition_from_json(test_competition, tmpdir): """Test whether we can successfully save and load a competition from JSON.""" path = test_competition.to_json(str(tmpdir)) @@ -12,8 +15,6 @@ def test_competition_from_json(test_competition, tmpdir): def test_competition_evaluation(test_competition): """Test whether we can successfully evaluate a competition.""" competition = test_competition - test = [-3.84, -9.73, -2.49, -4.13, -4.02, -2.1 , -4.59, 2.3 , -6.09, -7.07] - predictions = test + np.random.uniform(0, 3, size=len(test)) result = competition._hub_evaluate(predictions, test) assert isinstance(result.results, pd.DataFrame) assert set(result.results.columns) == { From 8a1e613e3db90778898832ccc630b3bdf46a7d80 Mon Sep 17 00:00:00 2001 From: Kira McLean Date: Fri, 17 May 2024 23:57:38 -0300 Subject: [PATCH 07/18] clean up as much as possible for now --- polaris/benchmark/_base.py | 7 +++++-- polaris/competition/_competition.py | 22 ++++++---------------- polaris/evaluate/utils.py | 14 ++------------ 3 files changed, 13 insertions(+), 30 deletions(-) diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py index 4af61688..48a87158 100644 --- a/polaris/benchmark/_base.py +++ b/polaris/benchmark/_base.py @@ -434,9 +434,12 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults: # This simplifies the API, but also was added to make accidental access to the test set targets less likely. # See also the `hide_targets` parameter in the `Subset` class. test = self._get_test_set(hide_targets=False) + y_true = test.targets + scores = evaluate_benchmark(y_pred, y_true, self.target_cols, self.metrics) - return evaluate_benchmark(y_pred, test.targets, self.target_cols, - self.name, self.owner, self.metrics) + return BenchmarkResults(results=scores, + benchmark_name=self.name, + benchmark_owner=self.owner) def upload_to_hub( self, diff --git a/polaris/competition/_competition.py b/polaris/competition/_competition.py index c3403e41..2a8f3817 100644 --- a/polaris/competition/_competition.py +++ b/polaris/competition/_competition.py @@ -6,7 +6,7 @@ import numpy as np import pandas as pd from polaris.benchmark import BenchmarkSpecification -from polaris.dataset import Dataset, Subset +from polaris.evaluate import BenchmarkResults from polaris.evaluate.utils import evaluate_benchmark from polaris.hub.settings import PolarisHubSettings from polaris.utils.types import AccessType, HubOwner, PredictionsType, TimeoutTypes, ZarrConflictResolution @@ -47,7 +47,7 @@ def evaluate( ) as client: client.evaluate_competition(self, y_pred=y_pred) - def _hub_evaluate(self, y_pred: PredictionsType, test: PredictionsType): + def _hub_evaluate(self, y_pred: PredictionsType, y_true: PredictionsType): """Executes the evaluation logic for a competition, given a set of predictions. Called only by Polaris Hub to evaluate competitions after labels are downloaded from R2 on the hub. Evalutaion logic is the same as for regular benchmarks. @@ -63,20 +63,10 @@ def _hub_evaluate(self, y_pred: PredictionsType, test: PredictionsType): Returns: A `BenchmarkResults` object containing the evaluation results. 
""" - dataset = Dataset( - table=pd.DataFrame(test, columns=self.target_cols), - name=f'{self.name}_test_set', - description=f"Target labels for competition {self.name}. Used internally to evaluate competition predictions." - ) - test_subset = Subset( - dataset=dataset, - indices=list(range(len(self.split[1]))), - input_cols=self.input_cols, - target_cols=self.target_cols, - hide_targets=False - ) - return evaluate_benchmark(y_pred, test_subset.targets, self.target_cols, - self.name, self.owner, self.metrics) + scores = evaluate_benchmark(y_pred, y_true, self.target_cols, self.metrics) + return BenchmarkResults(results=scores, + benchmark_name=self.name, + benchmark_owner=self.owner) def upload_to_hub( self, diff --git a/polaris/evaluate/utils.py b/polaris/evaluate/utils.py index b9b1def8..1f601538 100644 --- a/polaris/evaluate/utils.py +++ b/polaris/evaluate/utils.py @@ -2,7 +2,6 @@ import pandas as pd from typing import Union -from polaris.dataset import Subset from polaris.evaluate import BenchmarkResults, ResultsType from polaris.utils.types import PredictionsType from polaris.evaluate import Metric @@ -18,8 +17,6 @@ def is_multi_task_single_test_set(vals: PredictionsType, target_cols: list[str]) def evaluate_benchmark(y_pred: PredictionsType, y_true: PredictionsType, target_cols: list[str], - benchmark_name: str, - benchmark_owner: str, metrics: Union[str, Metric, list[Union[str, Metric]]]): if is_multi_task_single_test_set(y_true, target_cols): y_true = {"test": y_true} @@ -48,12 +45,7 @@ def evaluate_benchmark(y_pred: PredictionsType, if not isinstance(y_true_subset, dict): # Single task score = metric(y_true=y_true_subset, y_pred=y_pred[test_label]) - scores.loc[len(scores)] = ( - test_label, - target_cols[0], - metric, - score, - ) + scores.loc[len(scores)] = (test_label, target_cols[0], metric, score) continue # Otherwise, for every target... @@ -67,6 +59,4 @@ def evaluate_benchmark(y_pred: PredictionsType, ) scores.loc[len(scores)] = (test_label, target_label, metric, score) - return BenchmarkResults(results=scores, - benchmark_name=benchmark_name, - benchmark_owner=benchmark_owner) + return scores From 66614d0c8469cc274b131bac847b17e37aab7724 Mon Sep 17 00:00:00 2001 From: Andrew Quirke Date: Fri, 24 May 2024 11:10:06 -0400 Subject: [PATCH 08/18] updating date serializer --- polaris/competition/_competition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polaris/competition/_competition.py b/polaris/competition/_competition.py index 2a8f3817..1fee1de8 100644 --- a/polaris/competition/_competition.py +++ b/polaris/competition/_competition.py @@ -20,7 +20,7 @@ class CompetitionSpecification(BenchmarkSpecification): Currently, these entities will primarily differ at how user predictions are evaluated. 
""" - + # Additional properties specific to Competitions start_time: datetime | None = None scheduled_end_time: datetime | None = None From 4decc80b1d43eca32a74547a79bada156b00aef7 Mon Sep 17 00:00:00 2001 From: Kira McLean Date: Tue, 7 May 2024 23:16:38 -0300 Subject: [PATCH 09/18] call hub evaluate endpoint from client evaluate_competitions method --- polaris/competition/_competition.py | 3 ++- polaris/hub/client.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/polaris/competition/_competition.py b/polaris/competition/_competition.py index 1fee1de8..fed9dfac 100644 --- a/polaris/competition/_competition.py +++ b/polaris/competition/_competition.py @@ -1,5 +1,6 @@ from datetime import datetime import os +import numpy as np from typing import Optional, Union from pydantic import field_serializer @@ -20,7 +21,7 @@ class CompetitionSpecification(BenchmarkSpecification): Currently, these entities will primarily differ at how user predictions are evaluated. """ - + # Additional properties specific to Competitions start_time: datetime | None = None scheduled_end_time: datetime | None = None diff --git a/polaris/hub/client.py b/polaris/hub/client.py index f78ab07d..23c7573b 100644 --- a/polaris/hub/client.py +++ b/polaris/hub/client.py @@ -867,7 +867,7 @@ def evaluate_competition( A `BenchmarkResults` object. """ return self._base_request_to_hub( - url="/v2/competition/evaluate", + url=f"/v2/competition/evaluate", method="PUT", json={ "competition": competition.model_dump(exclude_none=True, From 7e9746f9e65cf8e2ede2c7ad11455390e9bb9680 Mon Sep 17 00:00:00 2001 From: Kira McLean Date: Thu, 16 May 2024 18:05:38 -0300 Subject: [PATCH 10/18] Update polaris/competition/_competition.py Co-authored-by: Andrew Quirke <75542075+Andrewq11@users.noreply.github.com> --- polaris/competition/_competition.py | 1 + 1 file changed, 1 insertion(+) diff --git a/polaris/competition/_competition.py b/polaris/competition/_competition.py index fed9dfac..c7be073b 100644 --- a/polaris/competition/_competition.py +++ b/polaris/competition/_competition.py @@ -1,6 +1,7 @@ from datetime import datetime import os import numpy as np +import pandas as pd from typing import Optional, Union from pydantic import field_serializer From 2be92b2b985623810b46c1cff686476206a0e253 Mon Sep 17 00:00:00 2001 From: Andrew Quirke Date: Fri, 24 May 2024 11:10:06 -0400 Subject: [PATCH 11/18] updating date serializer --- polaris/competition/_competition.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/polaris/competition/_competition.py b/polaris/competition/_competition.py index c7be073b..049f1801 100644 --- a/polaris/competition/_competition.py +++ b/polaris/competition/_competition.py @@ -11,7 +11,8 @@ from polaris.evaluate import BenchmarkResults from polaris.evaluate.utils import evaluate_benchmark from polaris.hub.settings import PolarisHubSettings -from polaris.utils.types import AccessType, HubOwner, PredictionsType, TimeoutTypes, ZarrConflictResolution +from polaris.utils.types import AccessType, HubOwner, TimeoutTypes, ZarrConflictResolution + class CompetitionSpecification(BenchmarkSpecification): """This class extends the [`BenchmarkSpecification`][polaris.benchmark.BenchmarkSpecification] to @@ -20,6 +21,11 @@ class CompetitionSpecification(BenchmarkSpecification): Much of the underlying data model and logic is shared across Benchmarks and Competitions, and anything within this class serves as a point of differentiation between the two. 
+ facilitate interactions with Polaris Competitions. + + Much of the underlying data model and logic is shared across Benchmarks and Competitions, and + anything within this class serves as a point of differentiation between the two. + Currently, these entities will primarily differ at how user predictions are evaluated. """ From b1b7d24f257e7e02be9eb1958bf653b09909cdf5 Mon Sep 17 00:00:00 2001 From: Kira McLean Date: Tue, 7 May 2024 23:16:38 -0300 Subject: [PATCH 12/18] call hub evaluate endpoint from client evaluate_competitions method --- polaris/competition/_competition.py | 3 +-- polaris/hub/client.py | 4 +++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/polaris/competition/_competition.py b/polaris/competition/_competition.py index 049f1801..196ea619 100644 --- a/polaris/competition/_competition.py +++ b/polaris/competition/_competition.py @@ -11,8 +11,7 @@ from polaris.evaluate import BenchmarkResults from polaris.evaluate.utils import evaluate_benchmark from polaris.hub.settings import PolarisHubSettings -from polaris.utils.types import AccessType, HubOwner, TimeoutTypes, ZarrConflictResolution - +from polaris.utils.types import AccessType, HubOwner, PredictionsType, TimeoutTypes, ZarrConflictResolution class CompetitionSpecification(BenchmarkSpecification): """This class extends the [`BenchmarkSpecification`][polaris.benchmark.BenchmarkSpecification] to diff --git a/polaris/hub/client.py b/polaris/hub/client.py index 23c7573b..dc4012ed 100644 --- a/polaris/hub/client.py +++ b/polaris/hub/client.py @@ -875,4 +875,6 @@ def evaluate_competition( "split"], by_alias=True), "predictions": y_pred - }) \ No newline at end of file + }) + + From d56e86043597e54f94258597cccd7e09151205a1 Mon Sep 17 00:00:00 2001 From: Kira McLean Date: Wed, 15 May 2024 11:33:20 -0300 Subject: [PATCH 13/18] add super basic test for evaluating competitions --- polaris/competition/_competition.py | 1 + polaris/hub/client.py | 4 +--- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/polaris/competition/_competition.py b/polaris/competition/_competition.py index 196ea619..e304662a 100644 --- a/polaris/competition/_competition.py +++ b/polaris/competition/_competition.py @@ -102,3 +102,4 @@ def _serialize_start_date(self, v): """Convert from datetime to string to make sure it's serializable""" if v: return v.isoformat() + diff --git a/polaris/hub/client.py b/polaris/hub/client.py index dc4012ed..20e92205 100644 --- a/polaris/hub/client.py +++ b/polaris/hub/client.py @@ -358,7 +358,7 @@ def _get_dataset( verify_checksum: bool = True, ) -> Dataset: """Loads either a standard or competition dataset from Polaris Hub - + Args: owner: The owner of the dataset. Can be either a user or organization from the Polaris Hub. name: The name of the dataset. @@ -876,5 +876,3 @@ def evaluate_competition( by_alias=True), "predictions": y_pred }) - - From 9a2677366fe7ef7bd73a8f0c0464d6db85e067c2 Mon Sep 17 00:00:00 2001 From: Andrew Quirke Date: Thu, 16 May 2024 14:42:01 -0400 Subject: [PATCH 14/18] comp wip --- polaris/hub/client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/polaris/hub/client.py b/polaris/hub/client.py index 20e92205..09e25fec 100644 --- a/polaris/hub/client.py +++ b/polaris/hub/client.py @@ -358,7 +358,7 @@ def _get_dataset( verify_checksum: bool = True, ) -> Dataset: """Loads either a standard or competition dataset from Polaris Hub - + Args: owner: The owner of the dataset. Can be either a user or organization from the Polaris Hub. 
name: The name of the dataset. @@ -867,7 +867,7 @@ def evaluate_competition( A `BenchmarkResults` object. """ return self._base_request_to_hub( - url=f"/v2/competition/evaluate", + url="/v2/competition/evaluate", method="PUT", json={ "competition": competition.model_dump(exclude_none=True, From a9a20068b51d83feba0ce6f87ac9a81176b293ed Mon Sep 17 00:00:00 2001 From: Andrew Quirke Date: Fri, 24 May 2024 11:10:06 -0400 Subject: [PATCH 15/18] updating date serializer --- polaris/competition/_competition.py | 1 - 1 file changed, 1 deletion(-) diff --git a/polaris/competition/_competition.py b/polaris/competition/_competition.py index e304662a..196ea619 100644 --- a/polaris/competition/_competition.py +++ b/polaris/competition/_competition.py @@ -102,4 +102,3 @@ def _serialize_start_date(self, v): """Convert from datetime to string to make sure it's serializable""" if v: return v.isoformat() - From ae5affbf0d7f92b6ae77b33a3a3fb7341bd8c067 Mon Sep 17 00:00:00 2001 From: Kira McLean Date: Tue, 7 May 2024 23:16:38 -0300 Subject: [PATCH 16/18] call hub evaluate endpoint from client evaluate_competitions method --- polaris/competition/_competition.py | 1 - polaris/hub/client.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/polaris/competition/_competition.py b/polaris/competition/_competition.py index 196ea619..007502da 100644 --- a/polaris/competition/_competition.py +++ b/polaris/competition/_competition.py @@ -1,7 +1,6 @@ from datetime import datetime import os import numpy as np -import pandas as pd from typing import Optional, Union from pydantic import field_serializer diff --git a/polaris/hub/client.py b/polaris/hub/client.py index 09e25fec..95f3565b 100644 --- a/polaris/hub/client.py +++ b/polaris/hub/client.py @@ -867,7 +867,7 @@ def evaluate_competition( A `BenchmarkResults` object. """ return self._base_request_to_hub( - url="/v2/competition/evaluate", + url=f"/v2/competition/evaluate", method="PUT", json={ "competition": competition.model_dump(exclude_none=True, From 4d7124290573bf612801eb9308b76ff8a68d3599 Mon Sep 17 00:00:00 2001 From: Kira McLean Date: Mon, 27 May 2024 13:03:15 -0300 Subject: [PATCH 17/18] fix bad merge resolution --- polaris/competition/_competition.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/polaris/competition/_competition.py b/polaris/competition/_competition.py index 007502da..d21a4016 100644 --- a/polaris/competition/_competition.py +++ b/polaris/competition/_competition.py @@ -4,8 +4,6 @@ from typing import Optional, Union from pydantic import field_serializer -import numpy as np -import pandas as pd from polaris.benchmark import BenchmarkSpecification from polaris.evaluate import BenchmarkResults from polaris.evaluate.utils import evaluate_benchmark From aee2b556cbfa7f7d558c17b61f70e1f8dcfce63b Mon Sep 17 00:00:00 2001 From: Kira McLean Date: Mon, 27 May 2024 13:13:18 -0300 Subject: [PATCH 18/18] only send competition artifact ID to hub --- polaris/hub/client.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/polaris/hub/client.py b/polaris/hub/client.py index 95f3565b..7970648b 100644 --- a/polaris/hub/client.py +++ b/polaris/hub/client.py @@ -870,9 +870,6 @@ def evaluate_competition( url=f"/v2/competition/evaluate", method="PUT", json={ - "competition": competition.model_dump(exclude_none=True, - exclude=["dataset", - "split"], - by_alias=True), + "competition": competition.artifact_id, "predictions": y_pred })
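
Taken together, these patches split competition evaluation into two paths: participants call CompetitionSpecification.evaluate, which opens a PolarisHubClient and PUTs the predictions to /v2/competition/evaluate, while the Hub itself calls _hub_evaluate with the labels it downloaded from R2 and scores them through the shared evaluate_benchmark helper. The sketch below illustrates both paths; it is a hypothetical example rather than part of any patch above, it reuses the test_competition fixture from tests/conftest.py, and the client-side call is assumed to require Hub credentials, so it is left commented out.

import numpy as np


def test_competition_evaluation_flow(test_competition):
    # Ground-truth labels for the 10-row test split of the fixture, plus noisy
    # "predictions" derived from them (same shape as tests/test_competition.py).
    y_true = np.array([-3.84, -9.73, -2.49, -4.13, -4.02, -2.1, -4.59, 2.3, -6.09, -7.07])
    y_pred = y_true + np.random.uniform(0, 3, size=len(y_true))

    # Participant path: only predictions leave the client. evaluate() opens a
    # PolarisHubClient and PUTs {"competition": <artifact id>, "predictions": y_pred}
    # to /v2/competition/evaluate. Commented out here: it needs Hub credentials.
    # test_competition.evaluate(y_pred=y_pred)

    # Hub path: the Hub supplies the R2-hosted labels itself and scores the
    # predictions via the shared evaluate_benchmark() helper, which is wrapped
    # into a BenchmarkResults object.
    result = test_competition._hub_evaluate(y_pred, y_true)
    assert set(result.results.columns) == {"Test set", "Target label", "Metric", "Score"}

The refactor in patches 05 through 07 is what makes the Hub path possible: evaluate_benchmark now takes plain y_pred/y_true values plus target_cols and metrics, so the Hub can score predictions without reconstructing a Dataset or Subset around the downloaded labels.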