Competition evaluation #103
@@ -0,0 +1,62 @@

```python
import numpy as np
import pandas as pd
from typing import Union

from polaris.evaluate import BenchmarkResults, ResultsType
from polaris.utils.types import PredictionsType
from polaris.evaluate import Metric

def is_multi_task_single_test_set(vals: PredictionsType, target_cols: list[str]):
    """Check if the given values are for a multiple-task benchmark with a single
    test set. This is inferred by comparing the target names with the keys of the
    given data. If all keys in the given data match the target column names, we
    assume they are target names (as opposed to test set names for a single-task,
    multiple test set benchmark)."""
    return not isinstance(vals, dict) or set(vals.keys()) == set(target_cols)
```

Comment on lines +12 to +14:

> This makes sense, but I can see this becoming a little messy in the future. Maybe this is something we can think about making more robust after we finish competitions.

> Yeah, I totally agree. This is at least isolated and explained here now, but we may want to re-think the data format we accept for predictions. There's a brief discussion about it here. I think it would be reasonable to always expect something like the fully explicit format sketched below.
>
> I get the desire to be as succinct as possible too, though; you could imagine it would be annoying to have to submit that ☝️ every time you just have a list of numbers to submit.
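The sketch below is a guess at that fully explicit prediction format, inferred from how `evaluate_benchmark` normalizes its inputs; the test set and target names are made up for illustration.

```python
import numpy as np

# Hypothetical fully explicit predictions payload: test set name -> target name -> values.
# Even a single-test-set, single-target submission would spell out both levels.
y_pred = {
    "test": {
        "LogP": np.array([1.0, 3.0, 2.0]),
        "LogD": np.array([0.0, 0.4, 0.2]),
    }
}
```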
Comment on `def evaluate_benchmark(...)`:

> This depends on fewer objects now, with the evaluation logic not requiring any unnecessary or wrapper-only data types. Thanks for being up for a bit of re-jiggering things around for the sake of simplicity.

```python
def evaluate_benchmark(y_pred: PredictionsType,
                       y_true: PredictionsType,
                       target_cols: list[str],
                       metrics: Union[str, Metric, list[Union[str, Metric]]]):
    if is_multi_task_single_test_set(y_true, target_cols):
        y_true = {"test": y_true}

    if is_multi_task_single_test_set(y_pred, target_cols):
        y_pred = {"test": y_pred}

    if set(y_true.keys()) != set(y_pred.keys()):
        raise KeyError(
            f"Missing keys for at least one of the test sets. Expecting: {sorted(y_true.keys())}"
        )

    # Results are saved in a tabular format. For more info, see the BenchmarkResults docs.
    scores: ResultsType = pd.DataFrame(columns=BenchmarkResults.RESULTS_COLUMNS)

    # For every test set...
    for test_label, y_true_subset in y_true.items():
        # For every metric...
        for metric in metrics:
            if metric.is_multitask:
                # Multi-task but with a metric across targets
                score = metric(y_true=y_true_subset, y_pred=y_pred[test_label])
                scores.loc[len(scores)] = (test_label, "aggregated", metric, score)
                continue

            if not isinstance(y_true_subset, dict):
                # Single task
                score = metric(y_true=y_true_subset, y_pred=y_pred[test_label])
                scores.loc[len(scores)] = (test_label, target_cols[0], metric, score)
                continue

            # Otherwise, for every target...
            for target_label, y_true_target in y_true_subset.items():
                # Single-task metrics for a multi-task benchmark
                # In such a setting, there can be NaN values, which we thus have to filter out.
                mask = ~np.isnan(y_true_target)
                score = metric(
                    y_true=y_true_target[mask],
                    y_pred=y_pred[test_label][target_label][mask],
                )
                scores.loc[len(scores)] = (test_label, target_label, metric, score)

    return scores
```
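As a usage sketch, assuming the helpers live in a module such as `polaris.evaluate.utils` (the file path isn't shown in this diff) and that the `Metric` enum exposes a `mean_absolute_error` member: because the keys of `y_true` and `y_pred` match the target columns, both are wrapped as `{"test": ...}` before scoring, and the NaN in the first target is masked out per target.

```python
import numpy as np

from polaris.evaluate import Metric
from polaris.evaluate.utils import evaluate_benchmark  # assumed module path

# Two targets, one implicit test set; the keys match the target columns.
y_true = {"LogP": np.array([1.2, 3.4, np.nan]), "LogD": np.array([0.1, 0.2, 0.3])}
y_pred = {"LogP": np.array([1.0, 3.0, 2.0]), "LogD": np.array([0.0, 0.4, 0.2])}

scores = evaluate_benchmark(
    y_pred=y_pred,
    y_true=y_true,
    target_cols=["LogP", "LogD"],
    metrics=[Metric.mean_absolute_error],  # assumed enum member
)
# One row per (test set, target, metric) combination.
print(scores)
```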
@@ -40,6 +40,7 @@

```python
    ArtifactType,
    HubOwner,
    IOMode,
    PredictionsType,
    SupportedLicenseType,
    TimeoutTypes,
    ZarrConflictResolution,
```
@@ -776,7 +777,6 @@ def upload_competition(

```python
        """Upload a competition to the Polaris Hub.

        Args:
            dataset: The dataset to upload.
            competition: The competition to upload.
            timeout: Request timeout values. User can modify the value when uploading large dataset as needed.
                This can be a single value with the timeout in seconds for all IO operations, or a more granular
```
@@ -796,7 +796,6 @@ def upload_competition(

```python
        dataset_response = self._upload_dataset(
            competition.dataset, ArtifactType.COMPETITION.value, ACCESS, timeout, owner, if_exists
        )

        # Upload competition benchmark
        competition_response = self._upload_benchmark(
            competition, ArtifactType.COMPETITION.value, ACCESS, owner
```
@@ -851,3 +850,26 @@ def list_competitions(self, limit: int = 100, offset: int = 0) -> list[str]:

```python
        )
        benchmarks_list = [f"{HubOwner(**bm['owner'])}/{bm['name']}" for bm in response["data"]]
        return benchmarks_list

    def evaluate_competition(
        self,
        competition: CompetitionSpecification,
        y_pred: PredictionsType,
    ) -> BenchmarkResults:
        """Evaluate the predictions for a competition on the Polaris Hub.

        Args:
            competition: The competition to evaluate the predictions for.
            y_pred: The predictions for the test set, as NumPy arrays.
                If there are multiple targets, the predictions should be wrapped in a dictionary
                with the target labels as keys.

        Returns:
            A `BenchmarkResults` object.
        """
        return self._base_request_to_hub(
            url="/v2/competition/evaluate",
            method="PUT",
            json={
                "competition": competition.artifact_id,
                "predictions": y_pred,
            },
        )
```

Comment on `return self._base_request_to_hub(...)`:

> This will call the corresponding hub endpoint (we should decide on what the URL should be), download the labels, then run the evaluation, returning a benchmark results object.
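A rough client-side sketch of how this might be called; only `evaluate_competition` itself comes from this diff, while the way the `CompetitionSpecification` is obtained and the target name are placeholders.

```python
import numpy as np

from polaris.hub.client import PolarisHubClient

with PolarisHubClient() as client:
    # Placeholder: a CompetitionSpecification obtained elsewhere (e.g. from the Hub).
    competition = ...

    # One entry per target column; a bare array would also be accepted for a single target.
    y_pred = {"LogP": np.array([1.0, 3.0, 2.0])}

    results = client.evaluate_competition(competition, y_pred=y_pred)
    print(results)
```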
Review comment:

> I think the cleanup you've done here is quite nice. In combination with pulling out the evaluation logic into a separate utility, it makes the base logic here more easily understandable.