Skip to content

Include a checksum for Zarr archives in the Dataset checksum #102

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 35 commits into from
Jul 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
4e37683
First implementation of the zarr checksum
cwognum May 8, 2024
22156f8
Removed left-over print statements
cwognum May 8, 2024
cefde26
Minor changes to docs
cwognum May 8, 2024
269886a
Removed unused method
cwognum May 8, 2024
9bc8086
Update polaris/dataset/_dataset.py
cwognum May 9, 2024
69aea30
Update polaris/dataset/zarr/_checksum.py
cwognum May 9, 2024
2529e47
Merge branch 'main' into feat/zarr-checksum
cwognum Jun 26, 2024
ad7aac6
Lazily compute the checksum
cwognum Jun 26, 2024
35246ce
Save the checksum per file
cwognum Jun 26, 2024
e924f01
Merge branch 'main' into feat/zarr-checksum
cwognum Jun 27, 2024
b2aae93
Improved docs because I kept forgetting how it works
cwognum Jun 27, 2024
a7d6aef
Only support running the checksum algorithm locally
cwognum Jun 28, 2024
10a9b53
Add a verify_checksum method and use it by default when caching a dat…
cwognum Jun 28, 2024
eaaa961
Added serialization to the checksum manifest on the client
cwognum Jun 28, 2024
18fd500
WIP: Integration with Hub
cwognum Jul 3, 2024
bcdb10d
WIP: Trying to get Zarr up- and downloads to work again...
cwognum Jul 3, 2024
12874c5
WIP: Further debugging of Zarr datasets
cwognum Jul 3, 2024
3fb065c
Removed caching from the PolarisFS ls() endpoint and changed verify_c…
cwognum Jul 4, 2024
6909de4
Minor changes in line with Hub changes
cwognum Jul 4, 2024
012d8c2
Set md5sum from the Hub
cwognum Jul 4, 2024
61dcb46
Merge branch 'main' into feat/zarr-checksum
cwognum Jul 4, 2024
0292a2c
Merge branch 'main' into feat/zarr-checksum
cwognum Jul 4, 2024
367b43d
Fixed bug in saving the md5Sum
cwognum Jul 4, 2024
71bf63f
Use request instead of fsspec.open to support custom headers in signe…
cwognum Jul 4, 2024
c67b6b5
Verify checksum on downloading a single chunk
cwognum Jul 5, 2024
1e37a46
Self review
cwognum Jul 5, 2024
1c10763
Trigger CICD
cwognum Jul 5, 2024
7fb571a
Address PR feedback
cwognum Jul 10, 2024
fdb2d6a
Merge branch 'main' into feat/zarr-checksum
cwognum Jul 11, 2024
dfe03bc
Fixed import error
cwognum Jul 11, 2024
cb1c6bc
Merge branch 'main' into feat/zarr-checksum
cwognum Jul 11, 2024
5330316
Remove Content-MD5 header from client
cwognum Jul 11, 2024
83162d4
Addressed feedback from PR
cwognum Jul 12, 2024
ce3fce4
Use RE to match checksum
cwognum Jul 12, 2024
fba8ce3
Clarify docs
cwognum Jul 12, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright 2021 Valence
Copyright [yyyy] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand Down
13 changes: 13 additions & 0 deletions NOTICE
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Copyright 2023 Valence Labs

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
6 changes: 6 additions & 0 deletions docs/api/dataset.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,10 @@
options:
filters: ["!^_"]

---

::: polaris.dataset.zarr
options:
filters: ["!^_"]

---
10 changes: 10 additions & 0 deletions polaris/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,14 @@
import os
import sys

from loguru import logger

from ._version import __version__
from .loader import load_benchmark, load_dataset

__all__ = ["load_dataset", "load_benchmark", "__version__"]

# Configure the default logging level.
# `setdefault` respects a user-provided LOGURU_LEVEL while guaranteeing the
# variable is set (e.g. for child processes that inherit the environment),
# and returns the resolved value so we only read the environment once.
_log_level = os.environ.setdefault("LOGURU_LEVEL", "INFO")

# Replace loguru's default stderr sink with one honoring the resolved level.
logger.remove()
logger.add(sys.stderr, level=_log_level)
71 changes: 71 additions & 0 deletions polaris/_mixins.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import abc
import re

from loguru import logger
from pydantic import BaseModel, PrivateAttr, computed_field

from polaris.utils.errors import PolarisChecksumError


class ChecksumMixin(BaseModel, abc.ABC):
    """
    Mixin class to add checksum functionality to a class.

    Subclasses implement `_compute_checksum()`; the mixin then exposes a
    lazily computed, validated `md5sum` property (serialized by pydantic via
    `computed_field`) and a `verify_checksum()` helper to detect mismatches.
    """

    # Backing store for the checksum. None until the checksum is first
    # computed (lazily, on `md5sum` access) or explicitly assigned.
    _md5sum: str | None = PrivateAttr(None)

    @abc.abstractmethod
    def _compute_checksum(self) -> str:
        """Compute the checksum of the dataset.

        Returns:
            The 32-character lowercase hexdigest of an MD5 hash.
        """
        raise NotImplementedError

    @computed_field
    @property
    def md5sum(self) -> str:
        """Lazily compute the checksum once needed."""
        if not self.has_md5sum:
            logger.info("Computing the checksum. This can be slow for large datasets.")
            # Assign through the setter so the computed value is validated
            # before being cached in `_md5sum`.
            self.md5sum = self._compute_checksum()
        return self._md5sum

    @md5sum.setter
    def md5sum(self, value: str) -> None:
        """Set the checksum.

        Raises:
            ValueError: If `value` is not a 32-character lowercase hex string
                (the format produced by `hashlib.md5(...).hexdigest()`).
        """
        if not re.fullmatch(r"^[a-f0-9]{32}$", value):
            raise ValueError("The checksum should be the 32-character hexdigest of a 128 bit MD5 hash.")
        self._md5sum = value

    @property
    def has_md5sum(self) -> bool:
        """Whether the md5sum for this class has been computed and stored."""
        return self._md5sum is not None

    def verify_checksum(self, md5sum: str | None = None) -> None:
        """
        Recomputes the checksum and verifies whether it matches the stored checksum.

        Warning: Slow operation
            This operation can be slow for large datasets.

        Info: Only works for locally stored datasets
            The checksum verification only works for datasets that are stored locally in its entirety.
            We don't have to verify the checksum for datasets stored on the Hub, as the Hub will do this on upload.
            And if you're streaming the data from the Hub, we will check the checksum of each chunk on download.

        Args:
            md5sum: The expected checksum. Defaults to the stored `_md5sum`
                when omitted.

        Raises:
            PolarisChecksumError: If the recomputed checksum does not match
                the expected one.
        """
        if md5sum is None:
            md5sum = self._md5sum
        # Nothing to compare against: warn rather than fail, since this is a
        # best-effort verification helper.
        if md5sum is None:
            logger.warning(
                "No checksum to verify against. Specify either the md5sum parameter or "
                "store the checksum in the dataset.md5sum attribute."
            )
            return

        # Recompute the checksum (assignment goes through the validating setter).
        logger.info("To verify the checksum, we need to recompute it. This can be slow for large datasets.")
        self.md5sum = self._compute_checksum()

        if self.md5sum != md5sum:
            raise PolarisChecksumError(
                f"The specified checksum {md5sum} does not match the computed checksum {self.md5sum}"
            )
54 changes: 17 additions & 37 deletions polaris/benchmark/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,13 @@
from sklearn.utils.multiclass import type_of_target

from polaris._artifact import BaseArtifactModel
from polaris._mixins import ChecksumMixin
from polaris.dataset import Dataset, Subset
from polaris.evaluate import BenchmarkResults, Metric, ResultsType
from polaris.hub.settings import PolarisHubSettings
from polaris.utils.context import tmp_attribute_change
from polaris.utils.dict2html import dict2html
from polaris.utils.errors import InvalidBenchmarkError, PolarisChecksumError
from polaris.utils.errors import InvalidBenchmarkError
from polaris.utils.misc import listit
from polaris.utils.types import (
AccessType,
Expand All @@ -36,7 +37,7 @@
ColumnsType = Union[str, list[str]]


class BenchmarkSpecification(BaseArtifactModel):
class BenchmarkSpecification(BaseArtifactModel, ChecksumMixin):
"""This class wraps a [`Dataset`][polaris.dataset.Dataset] with additional data
to specify the evaluation logic.

Expand Down Expand Up @@ -85,8 +86,6 @@ class BenchmarkSpecification(BaseArtifactModel):
split: The predefined train-test split to use for evaluation.
metrics: The metrics to use for evaluating performance
main_metric: The main metric used to rank methods. If `None`, the first of the `metrics` field.
md5sum: The checksum is used to verify the version of the dataset specification. If specified, it will
raise an error if the specified checksum doesn't match the computed checksum.
readme: Markdown text that can be used to provide a formatted description of the benchmark.
If using the Polaris Hub, it is worth noting that this field is more easily edited through the Hub UI
as it provides a rich text editor for writing markdown.
Expand All @@ -102,7 +101,6 @@ class BenchmarkSpecification(BaseArtifactModel):
split: SplitType
metrics: Union[str, Metric, list[Union[str, Metric]]]
main_metric: Optional[Union[str, Metric]] = None
md5sum: Optional[str] = None

# Additional meta-data
readme: str = ""
Expand Down Expand Up @@ -214,6 +212,12 @@ def _validate_target_types(cls, v, info: ValidationInfo):
for target in target_cols:
if target not in v:
val = dataset[:, target]

# Non numeric columns can be targets (e.g. prediction molecular reactions),
# but in that case we currently don't infer the target type.
if not np.issubdtype(val.dtype, np.number):
continue

            # remove the NaNs for multiple-task datasets when the table is sparse
target_type = type_of_target(val[~np.isnan(val)])
if target_type == "continuous":
Expand All @@ -230,34 +234,11 @@ def _validate_target_types(cls, v, info: ValidationInfo):
@classmethod
def _validate_model(cls, m: "BenchmarkSpecification"):
"""
If a checksum is provided, verify it matches what the checksum should be.
If no checksum is provided, make sure it is set.
Also sets a default metric if missing.
Sets a default metric if missing.
"""

# Validate checksum
checksum = m.md5sum

expected = cls._compute_checksum(
dataset=m.dataset,
target_cols=m.target_cols,
input_cols=m.input_cols,
split=m.split,
metrics=m.metrics,
)

if checksum is None:
m.md5sum = expected
elif checksum != expected:
raise PolarisChecksumError(
"The dataset checksum does not match what was specified in the meta-data. "
f"{checksum} != {expected}"
)

# Set a default main metric if not set yet
if m.main_metric is None:
m.main_metric = m.metrics[0]

return m

@field_serializer("metrics", "main_metric")
Expand All @@ -277,25 +258,24 @@ def _serialize_target_types(self, v):
"""Convert from enum to string to make sure it's serializable"""
return {k: v.value for k, v in self.target_types.items()}

@staticmethod
def _compute_checksum(dataset, target_cols, input_cols, split, metrics):
def _compute_checksum(self):
"""
Computes a hash of the benchmark.

This is meant to uniquely identify the benchmark and can be used to verify the version.
"""

hash_fn = md5()
hash_fn.update(dataset.md5sum.encode("utf-8"))
for c in sorted(target_cols):
hash_fn.update(self.dataset.md5sum.encode("utf-8"))
for c in sorted(self.target_cols):
hash_fn.update(c.encode("utf-8"))
for c in sorted(input_cols):
for c in sorted(self.input_cols):
hash_fn.update(c.encode("utf-8"))
for m in sorted(metrics, key=lambda k: k.name):
for m in sorted(self.metrics, key=lambda k: k.name):
hash_fn.update(m.name.encode("utf-8"))

if not isinstance(split[1], dict):
split = split[0], {"test": split[1]}
if not isinstance(self.split[1], dict):
split = self.split[0], {"test": self.split[1]}

# Train set
s = json.dumps(sorted(split[0]))
Expand Down
1 change: 1 addition & 0 deletions polaris/dataset/_adapters.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from enum import Enum, auto, unique

import datamol as dm

# Map of conversion operations which can be applied to dataset columns
Expand Down
Loading
Loading