XL Datasets: Minimal Zarr-only dataset implementation #186

Merged 23 commits into main from feat/dataset-v2 on Sep 11, 2024.

Changes shown from 14 of 23 commits.

Commits
2f4fd18
Extracted common interface between V1 and V2
cwognum Aug 26, 2024
81cee18
Skeleton structure for tests and Dataset V2. Small changes to shared API
cwognum Aug 27, 2024
613dcb2
Implemented the test cases
cwognum Aug 27, 2024
27c73ab
Basic test cases passed
cwognum Aug 27, 2024
6484216
Added additional validation
cwognum Aug 27, 2024
df33bfc
Improved docs
cwognum Aug 27, 2024
7d4b718
Fixed some reference errors in the docs
cwognum Aug 27, 2024
295265c
Merge branch 'main' into feat/dataset-v2
cwognum Aug 27, 2024
0484d68
Disable use of iloc to loc mapping for Dataset V2
cwognum Aug 27, 2024
ca76f9d
Updated import to prevent circular import
cwognum Aug 27, 2024
f0b7c4b
Ruff check and format
cwognum Aug 27, 2024
a968e0b
Adding new Zarr manifest generation to DatasetV2 class (#185)
Andrewq11 Sep 1, 2024
18bde88
fixing code check test
Andrewq11 Sep 1, 2024
75fe310
Move code to dataset base class
cwognum Sep 3, 2024
1194836
Merge branch 'main' into feat/dataset-v2
cwognum Sep 5, 2024
024e71d
Addressed most feedback on the PR, still need to revisit the __getite…
cwognum Sep 5, 2024
13fa9f1
Worked on the __getitem__ method
cwognum Sep 5, 2024
0e04c1f
Address special case of pointer columns
cwognum Sep 6, 2024
d3a18d5
Renamed md5sum to zarr_manifest_md5sum for clarity, remove equality t…
cwognum Sep 6, 2024
6efee7d
Merge branch 'main' into feat/dataset-v2
cwognum Sep 11, 2024
7bf7ac8
Fix missing import
cwognum Sep 11, 2024
6d35122
Added PR feedback
cwognum Sep 11, 2024
8ae8e5e
Update decorators
cwognum Sep 11, 2024
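
Taken together, these commits extract a shared interface into a BaseDataset base class, keep the existing tabular implementation as DatasetV1, and add a Zarr-only DatasetV2 whose contents are tracked through a Zarr manifest. A minimal sketch of how the pieces could fit together is below; the class and module names come from this PR, but the attributes and method signatures are illustrative assumptions, not the actual implementation.

```python
# Sketch only: BaseDataset, DatasetV1 and DatasetV2 are the names introduced in
# this PR; get_data(), `table` and `zarr_root` are illustrative assumptions.
from abc import ABC, abstractmethod
from typing import Any


class BaseDataset(ABC):
    """Interface shared by the tabular V1 dataset and the Zarr-only V2 dataset."""

    @abstractmethod
    def get_data(self, row: int, col: str) -> Any:
        """Return a single value from the dataset."""


class DatasetV1(BaseDataset):
    """Existing tabular dataset, still exported as `Dataset` for backwards compatibility."""

    def get_data(self, row: int, col: str) -> Any:
        # Backed by a pandas DataFrame; pointer columns are handled as a
        # special case (see commit 0e04c1f).
        return self.table.loc[row, col]


class DatasetV2(BaseDataset):
    """Zarr-only dataset for XL data, with content tracked via a Zarr manifest."""

    def get_data(self, row: int, col: str) -> Any:
        # Read directly from the Zarr archive instead of a pandas table.
        return self.zarr_root[col][row]
```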
docs/api/dataset.md (6 additions, 0 deletions)

@@ -2,6 +2,12 @@
     options:
       filters: ["!^_"]
 
+---
+
+::: polaris.dataset._base.BaseDataset
+    options:
+      filters: ["!^_"]
+
 ---
 
 ::: polaris.dataset.ColumnAnnotation
polaris/benchmark/_base.py (7 additions, 7 deletions)

@@ -1,6 +1,6 @@
-from itertools import chain
 import json
 from hashlib import md5
+from itertools import chain
 from typing import Any, Callable, Optional, Union
 
 import fsspec

@@ -18,11 +18,11 @@
 from sklearn.utils.multiclass import type_of_target
 
 from polaris._artifact import BaseArtifactModel
-from polaris.mixins import ChecksumMixin
-from polaris.dataset import Dataset, Subset, CompetitionDataset
+from polaris.dataset import CompetitionDataset, DatasetV1, Subset
 from polaris.evaluate import BenchmarkResults, Metric
 from polaris.evaluate.utils import evaluate_benchmark
 from polaris.hub.settings import PolarisHubSettings
+from polaris.mixins import ChecksumMixin
 from polaris.utils.dict2html import dict2html
 from polaris.utils.errors import InvalidBenchmarkError
 from polaris.utils.misc import listit

@@ -96,7 +96,7 @@ class BenchmarkSpecification(BaseArtifactModel, ChecksumMixin):
 
     # Public attributes
     # Data
-    dataset: Union[Dataset, CompetitionDataset, str, dict[str, Any]]
+    dataset: Union[DatasetV1, CompetitionDataset, str, dict[str, Any]]
     target_cols: ColumnsType
     input_cols: ColumnsType
     split: SplitType

@@ -111,12 +111,11 @@ class BenchmarkSpecification(BaseArtifactModel, ChecksumMixin):
     def _validate_dataset(cls, v):
         """
         Allows either passing a Dataset object or the kwargs to create one
-        TODO (cwognum): Allow multiple datasets to be used as part of a benchmark
         """
         if isinstance(v, dict):
-            v = Dataset(**v)
+            v = DatasetV1(**v)
         elif isinstance(v, str):
-            v = Dataset.from_json(v)
+            v = DatasetV1.from_json(v)
         return v
 
     @field_validator("target_cols", "input_cols")

@@ -345,6 +344,7 @@ def n_classes(self) -> dict[str, int]:
             target_type = self.target_types[target]
             if target_type is None or target_type == TargetType.REGRESSION:
                 continue
+            # TODO: Don't use table attribute
            n_classes[target] = self.dataset.table.loc[:, target].nunique()
         return n_classes
 
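
The functional change in this file is that the benchmark now coerces its dataset field to DatasetV1 instead of the old Dataset name. A standalone sketch of that coercion pattern, mirroring the validator in the diff above (the real code is a pydantic field_validator on BenchmarkSpecification; this free function is only for illustration):

```python
from typing import Any, Union

from polaris.dataset import DatasetV1


def coerce_dataset(v: Union[DatasetV1, str, dict[str, Any]]) -> DatasetV1:
    """Mirror of the _validate_dataset logic: accept a dataset, kwargs, or a JSON path."""
    if isinstance(v, dict):
        v = DatasetV1(**v)          # kwargs for the DatasetV1 constructor
    elif isinstance(v, str):
        v = DatasetV1.from_json(v)  # path to a serialized dataset
    return v
```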
polaris/dataset/__init__.py (5 additions, 3 deletions)

@@ -1,8 +1,9 @@
-from polaris.dataset._column import ColumnAnnotation, Modality, KnownContentType
-from polaris.dataset._dataset import Dataset
+from polaris.dataset._column import ColumnAnnotation, KnownContentType, Modality
+from polaris.dataset._competition_dataset import CompetitionDataset
+from polaris.dataset._dataset import DatasetV1
+from polaris.dataset._dataset import DatasetV1 as Dataset
 from polaris.dataset._factory import DatasetFactory, create_dataset_from_file, create_dataset_from_files
 from polaris.dataset._subset import Subset
-from polaris.dataset._competition_dataset import CompetitionDataset
 
 __all__ = [
     "ColumnAnnotation",

@@ -14,4 +15,5 @@
     "DatasetFactory",
     "create_dataset_from_file",
     "create_dataset_from_files",
+    "DatasetV1",
 ]
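
Because of the DatasetV1 as Dataset re-export above, the old Dataset name remains importable as an alias, so downstream imports keep working. A quick illustrative check:

```python
from polaris.dataset import Dataset, DatasetV1

# `Dataset` is re-exported as an alias of `DatasetV1`, so both names
# refer to the same class and existing code does not need to change.
assert Dataset is DatasetV1
```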