Skip to content

Include a checksum for Zarr archives in the Dataset checksum #102

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 35 commits into from
Jul 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
4e37683
First implementation of the zarr checksum
cwognum May 8, 2024
22156f8
Removed left-over print statements
cwognum May 8, 2024
cefde26
Minor changes to docs
cwognum May 8, 2024
269886a
Removed unused method
cwognum May 8, 2024
9bc8086
Update polaris/dataset/_dataset.py
cwognum May 9, 2024
69aea30
Update polaris/dataset/zarr/_checksum.py
cwognum May 9, 2024
2529e47
Merge branch 'main' into feat/zarr-checksum
cwognum Jun 26, 2024
ad7aac6
Lazily compute the checksum
cwognum Jun 26, 2024
35246ce
Save the checksum per file
cwognum Jun 26, 2024
e924f01
Merge branch 'main' into feat/zarr-checksum
cwognum Jun 27, 2024
b2aae93
Improved docs because I kept forgetting how it works
cwognum Jun 27, 2024
a7d6aef
Only support running the checksum algorithm locally
cwognum Jun 28, 2024
10a9b53
Add a verify_checksum method and use it by default when caching a dat…
cwognum Jun 28, 2024
eaaa961
Added serialization to the checksum manifest on the client
cwognum Jun 28, 2024
18fd500
WIP: Integration with Hub
cwognum Jul 3, 2024
bcdb10d
WIP: Trying to get Zarr up- and downloads to work again...
cwognum Jul 3, 2024
12874c5
WIP: Further debugging of Zarr datasets
cwognum Jul 3, 2024
3fb065c
Removed caching from the PolarisFS ls() endpoint and changed verify_c…
cwognum Jul 4, 2024
6909de4
Minor changes in line with Hub changes
cwognum Jul 4, 2024
012d8c2
Set md5sum from the Hub
cwognum Jul 4, 2024
61dcb46
Merge branch 'main' into feat/zarr-checksum
cwognum Jul 4, 2024
0292a2c
Merge branch 'main' into feat/zarr-checksum
cwognum Jul 4, 2024
367b43d
Fixed bug in saving the md5Sum
cwognum Jul 4, 2024
71bf63f
Use request instead of fsspec.open to support custom headers in signe…
cwognum Jul 4, 2024
c67b6b5
Verify checksum on downloading a single chunk
cwognum Jul 5, 2024
1e37a46
Self review
cwognum Jul 5, 2024
1c10763
Trigger CICD
cwognum Jul 5, 2024
7fb571a
Address PR feedback
cwognum Jul 10, 2024
fdb2d6a
Merge branch 'main' into feat/zarr-checksum
cwognum Jul 11, 2024
dfe03bc
Fixed import error
cwognum Jul 11, 2024
cb1c6bc
Merge branch 'main' into feat/zarr-checksum
cwognum Jul 11, 2024
5330316
Remove Content-MD5 header from client
cwognum Jul 11, 2024
83162d4
Addressed feedback from PR
cwognum Jul 12, 2024
ce3fce4
Use RE to match checksum
cwognum Jul 12, 2024
fba8ce3
Clarify docs
cwognum Jul 12, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright 2021 Valence
Copyright [yyyy] [name of copyright owner]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand Down
13 changes: 13 additions & 0 deletions NOTICE
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Copyright 2023 Valence Labs

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
6 changes: 6 additions & 0 deletions docs/api/dataset.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,10 @@
options:
filters: ["!^_"]

---

::: polaris.dataset.zarr
options:
filters: ["!^_"]

---
10 changes: 10 additions & 0 deletions polaris/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,14 @@
import os
import sys

from loguru import logger

from ._version import __version__
from .loader import load_benchmark, load_dataset

__all__ = ["load_dataset", "load_benchmark", "__version__"]

# Configure the default logging level.
# `setdefault` respects a user-provided LOGURU_LEVEL while guaranteeing the
# variable is set (e.g. for child processes that inherit the environment),
# and returns the resolved value so we only read the environment once.
_log_level = os.environ.setdefault("LOGURU_LEVEL", "INFO")

# Replace loguru's default stderr sink with one honoring the resolved level.
logger.remove()
logger.add(sys.stderr, level=_log_level)
71 changes: 71 additions & 0 deletions polaris/_mixins.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import abc
import re

from loguru import logger
from pydantic import BaseModel, PrivateAttr, computed_field

from polaris.utils.errors import PolarisChecksumError


class ChecksumMixin(BaseModel, abc.ABC):
    """
    Mixin class to add checksum functionality to a class.

    Subclasses implement `_compute_checksum()`; the mixin then exposes a
    lazily computed, validated `md5sum` property (serialized by pydantic via
    `computed_field`) and a `verify_checksum()` helper to detect mismatches.
    """

    # Backing store for the checksum. None until the checksum is first
    # computed (lazily, on `md5sum` access) or explicitly assigned.
    _md5sum: str | None = PrivateAttr(None)

    @abc.abstractmethod
    def _compute_checksum(self) -> str:
        """Compute the checksum of the dataset.

        Returns:
            The 32-character lowercase hexdigest of an MD5 hash.
        """
        raise NotImplementedError

    @computed_field
    @property
    def md5sum(self) -> str:
        """Lazily compute the checksum once needed."""
        if not self.has_md5sum:
            logger.info("Computing the checksum. This can be slow for large datasets.")
            # Assign through the setter so the computed value is validated
            # before being cached in `_md5sum`.
            self.md5sum = self._compute_checksum()
        return self._md5sum

    @md5sum.setter
    def md5sum(self, value: str) -> None:
        """Set the checksum.

        Raises:
            ValueError: If `value` is not a 32-character lowercase hex string
                (the format produced by `hashlib.md5(...).hexdigest()`).
        """
        if not re.fullmatch(r"^[a-f0-9]{32}$", value):
            raise ValueError("The checksum should be the 32-character hexdigest of a 128 bit MD5 hash.")
        self._md5sum = value

    @property
    def has_md5sum(self) -> bool:
        """Whether the md5sum for this class has been computed and stored."""
        return self._md5sum is not None

    def verify_checksum(self, md5sum: str | None = None) -> None:
        """
        Recomputes the checksum and verifies whether it matches the stored checksum.

        Warning: Slow operation
            This operation can be slow for large datasets.

        Info: Only works for locally stored datasets
            The checksum verification only works for datasets that are stored locally in its entirety.
            We don't have to verify the checksum for datasets stored on the Hub, as the Hub will do this on upload.
            And if you're streaming the data from the Hub, we will check the checksum of each chunk on download.

        Args:
            md5sum: The expected checksum. Defaults to the stored `_md5sum`
                when omitted.

        Raises:
            PolarisChecksumError: If the recomputed checksum does not match
                the expected one.
        """
        if md5sum is None:
            md5sum = self._md5sum
        # Nothing to compare against: warn rather than fail, since this is a
        # best-effort verification helper.
        if md5sum is None:
            logger.warning(
                "No checksum to verify against. Specify either the md5sum parameter or "
                "store the checksum in the dataset.md5sum attribute."
            )
            return

        # Recompute the checksum (assignment goes through the validating setter).
        logger.info("To verify the checksum, we need to recompute it. This can be slow for large datasets.")
        self.md5sum = self._compute_checksum()

        if self.md5sum != md5sum:
            raise PolarisChecksumError(
                f"The specified checksum {md5sum} does not match the computed checksum {self.md5sum}"
            )
54 changes: 17 additions & 37 deletions polaris/benchmark/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,13 @@
from sklearn.utils.multiclass import type_of_target

from polaris._artifact import BaseArtifactModel
from polaris._mixins import ChecksumMixin
from polaris.dataset import Dataset, Subset
from polaris.evaluate import BenchmarkResults, Metric, ResultsType
from polaris.hub.settings import PolarisHubSettings
from polaris.utils.context import tmp_attribute_change
from polaris.utils.dict2html import dict2html
from polaris.utils.errors import InvalidBenchmarkError, PolarisChecksumError
from polaris.utils.errors import InvalidBenchmarkError
from polaris.utils.misc import listit
from polaris.utils.types import (
AccessType,
Expand All @@ -36,7 +37,7 @@
ColumnsType = Union[str, list[str]]


class BenchmarkSpecification(BaseArtifactModel):
class BenchmarkSpecification(BaseArtifactModel, ChecksumMixin):
"""This class wraps a [`Dataset`][polaris.dataset.Dataset] with additional data
to specify the evaluation logic.

Expand Down Expand Up @@ -85,8 +86,6 @@ class BenchmarkSpecification(BaseArtifactModel):
split: The predefined train-test split to use for evaluation.
metrics: The metrics to use for evaluating performance
main_metric: The main metric used to rank methods. If `None`, the first of the `metrics` field.
md5sum: The checksum is used to verify the version of the dataset specification. If specified, it will
raise an error if the specified checksum doesn't match the computed checksum.
readme: Markdown text that can be used to provide a formatted description of the benchmark.
If using the Polaris Hub, it is worth noting that this field is more easily edited through the Hub UI
as it provides a rich text editor for writing markdown.
Expand All @@ -102,7 +101,6 @@ class BenchmarkSpecification(BaseArtifactModel):
split: SplitType
metrics: Union[str, Metric, list[Union[str, Metric]]]
main_metric: Optional[Union[str, Metric]] = None
md5sum: Optional[str] = None

# Additional meta-data
readme: str = ""
Expand Down Expand Up @@ -214,6 +212,12 @@ def _validate_target_types(cls, v, info: ValidationInfo):
for target in target_cols:
if target not in v:
val = dataset[:, target]

# Non numeric columns can be targets (e.g. prediction molecular reactions),
# but in that case we currently don't infer the target type.
if not np.issubdtype(val.dtype, np.number):
continue

            # remove the NaNs for multiple-task datasets when the table is sparse
target_type = type_of_target(val[~np.isnan(val)])
if target_type == "continuous":
Expand All @@ -230,34 +234,11 @@ def _validate_target_types(cls, v, info: ValidationInfo):
@classmethod
def _validate_model(cls, m: "BenchmarkSpecification"):
"""
If a checksum is provided, verify it matches what the checksum should be.
If no checksum is provided, make sure it is set.
Also sets a default metric if missing.
Sets a default metric if missing.
"""

# Validate checksum
checksum = m.md5sum

expected = cls._compute_checksum(
dataset=m.dataset,
target_cols=m.target_cols,
input_cols=m.input_cols,
split=m.split,
metrics=m.metrics,
)

if checksum is None:
m.md5sum = expected
elif checksum != expected:
raise PolarisChecksumError(
"The dataset checksum does not match what was specified in the meta-data. "
f"{checksum} != {expected}"
)

# Set a default main metric if not set yet
if m.main_metric is None:
m.main_metric = m.metrics[0]

return m

@field_serializer("metrics", "main_metric")
Expand All @@ -277,25 +258,24 @@ def _serialize_target_types(self, v):
"""Convert from enum to string to make sure it's serializable"""
return {k: v.value for k, v in self.target_types.items()}

@staticmethod
def _compute_checksum(dataset, target_cols, input_cols, split, metrics):
def _compute_checksum(self):
"""
Computes a hash of the benchmark.

This is meant to uniquely identify the benchmark and can be used to verify the version.
"""

hash_fn = md5()
hash_fn.update(dataset.md5sum.encode("utf-8"))
for c in sorted(target_cols):
hash_fn.update(self.dataset.md5sum.encode("utf-8"))
for c in sorted(self.target_cols):
hash_fn.update(c.encode("utf-8"))
for c in sorted(input_cols):
for c in sorted(self.input_cols):
hash_fn.update(c.encode("utf-8"))
for m in sorted(metrics, key=lambda k: k.name):
for m in sorted(self.metrics, key=lambda k: k.name):
hash_fn.update(m.name.encode("utf-8"))

if not isinstance(split[1], dict):
split = split[0], {"test": split[1]}
if not isinstance(self.split[1], dict):
split = self.split[0], {"test": self.split[1]}

# Train set
s = json.dumps(sorted(split[0]))
Expand Down
1 change: 1 addition & 0 deletions polaris/dataset/_adapters.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from enum import Enum, auto, unique

import datamol as dm

# Map of conversion operations which can be applied to dataset columns
Expand Down
Loading
Loading