Competition evaluation #103

Merged: 18 commits, May 27, 2024
116 changes: 43 additions & 73 deletions polaris/benchmark/_base.py
@@ -5,7 +5,6 @@

import fsspec
import numpy as np
import pandas as pd
from datamol.utils import fs
from pydantic import (
Field,
@@ -19,9 +18,9 @@

from polaris._artifact import BaseArtifactModel
from polaris.dataset import Dataset, Subset
from polaris.evaluate import BenchmarkResults, Metric, ResultsType
from polaris.evaluate import BenchmarkResults, Metric
from polaris.evaluate.utils import evaluate_benchmark
from polaris.hub.settings import PolarisHubSettings
from polaris.utils.context import tmp_attribute_change
from polaris.utils.dict2html import dict2html
from polaris.utils.errors import InvalidBenchmarkError, PolarisChecksumError
from polaris.utils.misc import listit
@@ -351,6 +350,37 @@ def task_type(self) -> TaskType:
v = TaskType.MULTI_TASK if len(self.target_cols) > 1 else TaskType.SINGLE_TASK
return v.value

def _get_subset(self, indices, hide_targets=True, featurization_fn=None):
"""Returns a [`Subset`][polaris.dataset.Subset] using the given indices. Used
internally to construct the train and test sets."""
return Subset(
dataset=self.dataset,
indices=indices,
input_cols=self.input_cols,
target_cols=self.target_cols,
hide_targets=hide_targets,
featurization_fn=featurization_fn,
)

def _get_test_set(
self, hide_targets=True, featurization_fn: Optional[Callable] = None
) -> Union["Subset", dict[str, Subset]]:
"""Construct the test set(s), given the split in the benchmark specification. Used
internally to construct the test set for client use and evaluation.
"""
def make_test_subset(vals):
return self._get_subset(vals,
hide_targets=hide_targets,
featurization_fn=featurization_fn)

test_split = self.split[1]
if isinstance(test_split, dict):
test = {k: make_test_subset(v) for k, v in test_split.items()}
else:
test = make_test_subset(test_split)

return test

def get_train_test_split(
Contributor:
I think the cleanup you've done here is quite nice. In combination with pulling out the evaluation logic into a separate utility, it makes the base logic here more easily understandable.

self, featurization_fn: Optional[Callable] = None
) -> tuple[Subset, Union["Subset", dict[str, Subset]]]:
@@ -366,25 +396,12 @@ def get_train_test_split(

Returns:
A tuple with the train `Subset` and test `Subset` objects.
If there are multiple test sets, these are returned in a dictionary and each test set has
an associated name. The targets of the test set can not be accessed.
If there are multiple test sets, these are returned in a dictionary and each test set has
an associated name. The targets of the test set can not be accessed.
"""

def _get_subset(indices, hide_targets):
return Subset(
dataset=self.dataset,
indices=indices,
input_cols=self.input_cols,
target_cols=self.target_cols,
hide_targets=hide_targets,
featurization_fn=featurization_fn,
)

train = _get_subset(self.split[0], hide_targets=False)
if isinstance(self.split[1], dict):
test = {k: _get_subset(v, hide_targets=True) for k, v in self.split[1].items()}
else:
test = _get_subset(self.split[1], hide_targets=True)
train = self._get_subset(self.split[0], hide_targets=False, featurization_fn=featurization_fn)
test = self._get_test_set(hide_targets=True, featurization_fn=featurization_fn)

return train, test

@@ -416,60 +433,13 @@ def evaluate(self, y_pred: PredictionsType) -> BenchmarkResults:
# Instead of having the user pass the ground truth, we extract it from the benchmark spec ourselves.
# This simplifies the API, but also was added to make accidental access to the test set targets less likely.
# See also the `hide_targets` parameter in the `Subset` class.
test = self.get_train_test_split()[1]

if not isinstance(test, dict):
test = {"test": test}

y_true = {}
for k, test_subset in test.items():
with tmp_attribute_change(test_subset, "_hide_targets", False):
y_true[k] = test_subset.targets

if not isinstance(y_pred, dict) or all(k in self.target_cols for k in y_pred):
y_pred = {"test": y_pred}

if any(k not in y_pred for k in test.keys()):
raise KeyError(
f"Missing keys for at least one of the test sets. Expecting: {sorted(test.keys())}"
)
test = self._get_test_set(hide_targets=False)
y_true = test.targets
scores = evaluate_benchmark(y_pred, y_true, self.target_cols, self.metrics)

# Results are saved in a tabular format. For more info, see the BenchmarkResults docs.
scores: ResultsType = pd.DataFrame(columns=BenchmarkResults.RESULTS_COLUMNS)

# For every test set...
for test_label, y_true_subset in y_true.items():
# For every metric...
for metric in self.metrics:
if metric.is_multitask:
# Multi-task but with a metric across targets
score = metric(y_true=y_true_subset, y_pred=y_pred[test_label])
scores.loc[len(scores)] = (test_label, "aggregated", metric, score)
continue

if not isinstance(y_true_subset, dict):
# Single task
score = metric(y_true=y_true_subset, y_pred=y_pred[test_label])
scores.loc[len(scores)] = (
test_label,
self.target_cols[0],
metric,
score,
)
continue

# Otherwise, for every target...
for target_label, y_true_target in y_true_subset.items():
# Single-task metrics for a multi-task benchmark
# In such a setting, there can be NaN values, which we thus have to filter out.
mask = ~np.isnan(y_true_target)
score = metric(
y_true=y_true_target[mask],
y_pred=y_pred[test_label][target_label][mask],
)
scores.loc[len(scores)] = (test_label, target_label, metric, score)

return BenchmarkResults(results=scores, benchmark_name=self.name, benchmark_owner=self.owner)
return BenchmarkResults(results=scores,
benchmark_name=self.name,
benchmark_owner=self.owner)

def upload_to_hub(
self,
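To make the refactor above concrete, here is a minimal, hedged sketch of the client-side flow after this change: `get_train_test_split()` still returns a train `Subset` and one or more test `Subset`s, while `evaluate()` now pulls the ground truth via `_get_test_set(hide_targets=False)` and delegates scoring to `polaris.evaluate.utils.evaluate_benchmark`. The `run_benchmark` helper and the model's `fit`/`predict` interface are illustrative assumptions, not part of this PR.

```python
import numpy as np

from polaris.benchmark import BenchmarkSpecification
from polaris.evaluate import BenchmarkResults


def run_benchmark(benchmark: BenchmarkSpecification, model) -> BenchmarkResults:
    """Illustrative helper: `model` is any object exposing fit/predict over Subsets."""
    train, test = benchmark.get_train_test_split()

    # Train on the train Subset (targets are accessible here, hide_targets=False).
    model.fit(train)

    # Predict on the test Subset(s); targets stay hidden on the client side.
    y_pred = np.asarray(model.predict(test))

    # evaluate() extracts y_true internally and calls evaluate_benchmark() for scoring.
    return benchmark.evaluate(y_pred)
```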
55 changes: 48 additions & 7 deletions polaris/competition/_competition.py
@@ -1,12 +1,14 @@
from datetime import datetime
import os
import numpy as np
from typing import Optional, Union

from pydantic import field_serializer
from polaris.benchmark import BenchmarkSpecification
from polaris.evaluate import BenchmarkResults
from polaris.evaluate.utils import evaluate_benchmark
from polaris.hub.settings import PolarisHubSettings
from polaris.utils.types import AccessType, HubOwner, TimeoutTypes, ZarrConflictResolution

from polaris.utils.types import AccessType, HubOwner, PredictionsType, TimeoutTypes, ZarrConflictResolution

class CompetitionSpecification(BenchmarkSpecification):
"""This class extends the [`BenchmarkSpecification`][polaris.benchmark.BenchmarkSpecification] to
@@ -15,6 +17,11 @@ class CompetitionSpecification(BenchmarkSpecification):
Much of the underlying data model and logic is shared across Benchmarks and Competitions, and
anything within this class serves as a point of differentiation between the two.

facilitate interactions with Polaris Competitions.

Much of the underlying data model and logic is shared across Benchmarks and Competitions, and
anything within this class serves as a point of differentiation between the two.

Currently, these entities will primarily differ at how user predictions are evaluated.
"""

@@ -23,13 +30,47 @@
scheduled_end_time: datetime | None = None
actual_end_time: datetime | None = None

def evaluate(self, predictions):
"""Wrapper method which ultimately triggers an evaluation service to assess and score user predictions
for a given competition
def evaluate(
self,
y_pred: PredictionsType,
env_file: Optional[Union[str, os.PathLike]] = None,
settings: Optional[PolarisHubSettings] = None,
cache_auth_token: bool = True,
**kwargs: dict
):
"""Light convenience wrapper around
[`PolarisHubClient.evaluate_competition`][polaris.hub.client.PolarisHubClient.evaluate_competition].
"""
from polaris.hub.client import PolarisHubClient

# TODO validate that the number of predictions supplied matches the number of test set rows
pass
with PolarisHubClient(
env_file=env_file,
settings=settings,
cache_auth_token=cache_auth_token,
**kwargs,
) as client:
client.evaluate_competition(self, y_pred=y_pred)

def _hub_evaluate(self, y_pred: PredictionsType, y_true: PredictionsType):
"""Executes the evaluation logic for a competition, given a set of predictions.
Called only by the Polaris Hub to evaluate competitions after the labels are
downloaded from R2 on the hub. Evaluation logic is the same as for regular benchmarks.

Args:
y_pred: The predictions for the test set, as NumPy arrays.
If there are multiple targets, the predictions should be wrapped in a
dictionary with the target labels as keys.

y_true: The true target values for the test set, as NumPy arrays. If there are multiple
targets, the target columns should be wrapped in a dictionary with the target labels as keys.

Returns:
A `BenchmarkResults` object containing the evaluation results.
"""
scores = evaluate_benchmark(y_pred, y_true, self.target_cols, self.metrics)
return BenchmarkResults(results=scores,
benchmark_name=self.name,
benchmark_owner=self.owner)

def upload_to_hub(
self,
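As a hedged illustration of the split introduced here (the helper name, the import path, and the prediction values are assumptions, not from this PR): the public `evaluate()` defers to the hub, while `_hub_evaluate()` is what the hub itself runs once it has the labels.

```python
import numpy as np

from polaris.competition import CompetitionSpecification  # import path assumed


def submit_to_competition(competition: CompetitionSpecification) -> None:
    """Client-side path: evaluate() opens a PolarisHubClient and forwards the
    predictions to client.evaluate_competition(); y_true never leaves the hub."""
    y_pred = {"TARGET_A": np.zeros(10), "TARGET_B": np.zeros(10)}  # placeholder predictions
    competition.evaluate(y_pred)

    # Hub-side path (internal): after downloading the labels from R2, the hub
    # scores the predictions with the same logic as regular benchmarks, roughly:
    #   results = competition._hub_evaluate(y_pred, y_true)
```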
62 changes: 62 additions & 0 deletions polaris/evaluate/utils.py
@@ -0,0 +1,62 @@
import numpy as np
import pandas as pd
from typing import Union

from polaris.evaluate import BenchmarkResults, ResultsType
from polaris.utils.types import PredictionsType
from polaris.evaluate import Metric

def is_multi_task_single_test_set(vals: PredictionsType, target_cols: list[str]):
"""Check if the given values are for a multiple-task benchmark with a single
test set. This is inferred by comparing the target names with the keys of the
given data. If all keys in the given data match the target column names, we
assume they are target names (as opposed to test set names for a single-task,
multiple test set benchmark)."""
Comment on lines +12 to +14
Contributor:
This makes sense, but I can see this becoming a little messy in the future. Maybe this is something we can think about making more robust after we finish competitions.

Contributor Author:
Yeah I totally agree.. this is at least isolated and explained here now, but we may want to re-think the data format we accept for predictions. There's a brief discussion about it here. I think it would be reasonable to always expect something like:

{"test_set_name": {"target_col_name": [1, 2, 3, 4, 5]}"

I get the desire to be as succinct as possible too though.. you could imagine it would be annoying to have to submit that ☝️ every time you just have a list of numbers to submit.

return not isinstance(vals, dict) or set(vals.keys()) == set(target_cols)

def evaluate_benchmark(y_pred: PredictionsType,
Contributor Author:
This depends on fewer objects now, with the evaluation logic not requiring any unnecessary or wrapper-only data types. Thanks for being up for a bit of re-jiggering things around for the sake of simplicity.

y_true: PredictionsType,
target_cols: list[str],
metrics: Union[str, Metric, list[Union[str, Metric]]]):
if is_multi_task_single_test_set(y_true, target_cols):
y_true = {"test": y_true}

if is_multi_task_single_test_set(y_pred, target_cols):
y_pred = {"test": y_pred}

if set(y_true.keys()) != set(y_pred.keys()):
raise KeyError(
f"Missing keys for at least one of the test sets. Expecting: {sorted(y_true.keys())}"
)

# Results are saved in a tabular format. For more info, see the BenchmarkResults docs.
scores: ResultsType = pd.DataFrame(columns=BenchmarkResults.RESULTS_COLUMNS)

# For every test set...
for test_label, y_true_subset in y_true.items():
# For every metric...
for metric in metrics:
if metric.is_multitask:
# Multi-task but with a metric across targets
score = metric(y_true=y_true_subset, y_pred=y_pred[test_label])
scores.loc[len(scores)] = (test_label, "aggregated", metric, score)
continue

if not isinstance(y_true_subset, dict):
# Single task
score = metric(y_true=y_true_subset, y_pred=y_pred[test_label])
scores.loc[len(scores)] = (test_label, target_cols[0], metric, score)
continue

# Otherwise, for every target...
for target_label, y_true_target in y_true_subset.items():
# Single-task metrics for a multi-task benchmark
# In such a setting, there can be NaN values, which we thus have to filter out.
mask = ~np.isnan(y_true_target)
score = metric(
y_true=y_true_target[mask],
y_pred=y_pred[test_label][target_label][mask],
)
scores.loc[len(scores)] = (test_label, target_label, metric, score)

return scores
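Below is a hedged usage sketch of the new `evaluate_benchmark` utility on synthetic data; the same function now backs both `BenchmarkSpecification.evaluate()` and `CompetitionSpecification._hub_evaluate()`. The target column names are made up, and `Metric.mean_absolute_error` is assumed to be an available member of the `Metric` enum; the point is to show how a multi-task, single-test-set input gets wrapped under the `"test"` key before the scoring loop runs.

```python
import numpy as np

from polaris.evaluate import Metric
from polaris.evaluate.utils import evaluate_benchmark

rng = np.random.default_rng(0)

# Multi-task predictions for a single test set: the keys match the target
# columns, so is_multi_task_single_test_set() wraps both dicts as {"test": ...}.
target_cols = ["TARGET_A", "TARGET_B"]  # placeholder names
y_true = {col: rng.normal(size=50) for col in target_cols}
y_pred = {col: rng.normal(size=50) for col in target_cols}

scores = evaluate_benchmark(
    y_pred,
    y_true,
    target_cols,
    metrics=[Metric.mean_absolute_error],  # assumed member of the Metric enum
)
print(scores)  # one row per (test set, target, metric) in the ResultsType DataFrame
```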
26 changes: 24 additions & 2 deletions polaris/hub/client.py
@@ -40,6 +40,7 @@
ArtifactType,
HubOwner,
IOMode,
PredictionsType,
SupportedLicenseType,
TimeoutTypes,
ZarrConflictResolution,
@@ -776,7 +777,6 @@ def upload_competition(
"""Upload a competition to the Polaris Hub.

Args:
dataset: The dataset to upload.
competition: The competition to upload.
timeout: Request timeout values. User can modify the value when uploading large dataset as needed.
This can be a single value with the timeout in seconds for all IO operations, or a more granular
@@ -796,7 +796,6 @@
dataset_response = self._upload_dataset(
competition.dataset, ArtifactType.COMPETITION.value, ACCESS, timeout, owner, if_exists
)

# Upload competition benchmark
competition_response = self._upload_benchmark(
competition, ArtifactType.COMPETITION.value, ACCESS, owner
@@ -851,3 +850,26 @@ def list_competitions(self, limit: int = 100, offset: int = 0) -> list[str]:
)
benchmarks_list = [f"{HubOwner(**bm['owner'])}/{bm['name']}" for bm in response["data"]]
return benchmarks_list

def evaluate_competition(
self,
competition: CompetitionSpecification,
y_pred: PredictionsType
) -> BenchmarkResults:
"""Evaluate the predictions for a competition on the Polaris Hub.

Args:
competition: The competition to evaluate the predictions for.
y_pred: The predictions for the test set, as NumPy arrays.
If there are multiple targets, the predictions should be wrapped in a dictionary with the target labels as keys.

Returns:
A `BenchmarkResults` object.
"""
return self._base_request_to_hub(
Contributor Author:
This will call the corresponding hub endpoint (we should decide on what the URL should be), download the labels, then run the evaluation, returning a benchmark results object.

url=f"/v2/competition/evaluate",
method="PUT",
json={
"competition": competition.artifact_id,
"predictions": y_pred
})
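The same endpoint can also be reached through the client directly, which is all `CompetitionSpecification.evaluate()` does under the hood. A minimal sketch, assuming default client settings; the `evaluate_on_hub` name is illustrative and the import path for `CompetitionSpecification` is assumed.

```python
from polaris.competition import CompetitionSpecification  # import path assumed
from polaris.evaluate import BenchmarkResults
from polaris.hub.client import PolarisHubClient
from polaris.utils.types import PredictionsType


def evaluate_on_hub(competition: CompetitionSpecification, y_pred: PredictionsType) -> BenchmarkResults:
    """PUT the predictions to /v2/competition/evaluate and return the scored results."""
    with PolarisHubClient() as client:  # default settings; auth handled by the client
        return client.evaluate_competition(competition, y_pred=y_pred)
```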
1 change: 0 additions & 1 deletion polaris/utils/context.py
@@ -1,6 +1,5 @@
from contextlib import contextmanager


@contextmanager
def tmp_attribute_change(obj, attribute, value):
"""Temporarily set and reset an attribute of an object."""
6 changes: 3 additions & 3 deletions polaris/utils/types.py
@@ -28,7 +28,7 @@
A prediction is one of three things:

- A single array (single-task, single test set)
- A dictionary of arrays (single-task, multiple test sets)
- A dictionary of arrays (single-task, multiple test sets)
- A dictionary of dictionaries of arrays (multi-task, multiple test sets)
"""

@@ -54,14 +54,14 @@
"""
A URL-compatible string that can be turned into a slug by the hub.

Can only use alpha-numeric characters, underscores and dashes.
Can only use alpha-numeric characters, underscores and dashes.
The string must be at least 4 and at most 64 characters long.
"""


HubUser: TypeAlias = SlugCompatibleStringType
"""
A user on the Polaris Hub is identified by a username,
A user on the Polaris Hub is identified by a username,
which is a [`SlugCompatibleStringType`][polaris.utils.types.SlugCompatibleStringType].
"""
