Skip to content

Added info for Group and Array #2400

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 24 commits into from
Nov 11, 2024
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/zarr/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
zeros,
zeros_like,
)
from zarr.core._info import GroupInfo
from zarr.core.array import Array, AsyncArray
from zarr.core.config import config
from zarr.core.group import AsyncGroup, Group
Expand All @@ -38,6 +39,7 @@
"AsyncArray",
"AsyncGroup",
"Group",
"GroupInfo",
"__version__",
"array",
"config",
Expand Down
143 changes: 143 additions & 0 deletions src/zarr/core/_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
import dataclasses
import textwrap
from typing import Literal


@dataclasses.dataclass(kw_only=True)
class GroupInfo:
"""
Information about a group.

Attributes
----------
name : str
The path of the group within the Store
type : "Group"
zarr_format : {2, 3}
The zarr format of the Group.
read_only : bool
Whether the Group's access mode is read only.
store_type : str
The name of the Store class containing this group.
count_members : int, optional
The number of child members below this group. This
will be set when the Group has consolidated metadata
or when using :class:`Group.info_complete`.
count_arrays : int, optional
The number of child arrays below this group. This
will be set when the Group has consolidated metadata
or when using :class:`Group.info_complete`.
count_groups : int, optional
The number of child groups below this group. This
will be set when the Group has consolidated metadata
or when using :class:`Group.info_complete`.
"""

name: str
type: Literal["Group"] = "Group"
zarr_format: Literal[2, 3]
read_only: bool
store_type: str
count_members: int | None = None
count_arrays: int | None = None
count_groups: int | None = None

def __repr__(self) -> str:
template = textwrap.dedent("""\
Name : {name}
Type : {type}
Zarr format : {zarr_format}
Read-only : {read_only}
Store type : {store_type}""")

if self.count_members is not None:
template += "\nNo. members : {count_members}"
if self.count_arrays is not None:
template += "\nNo. arrays : {count_arrays}"
if self.count_groups is not None:
template += "\nNo. groups : {count_groups}"
return template.format(**dataclasses.asdict(self))


def human_readable_size(size: int) -> str:
    """
    Format a byte count using binary (power-of-1024) unit suffixes.

    Values below 1024 are returned as the plain number. Larger values are
    divided by the largest fitting unit and shown with one decimal place
    and a ``K``, ``M``, ``G``, ``T`` or ``P`` suffix; ``P`` is the largest
    unit, so anything from 2**50 upwards is expressed in it.

    Parameters
    ----------
    size : int
        Number of bytes.

    Returns
    -------
    str
    """
    if size < 2**10:
        return f"{size}"

    # Each entry is (power of two of the unit, suffix); a unit applies to
    # sizes below the next unit's threshold.
    for exponent, suffix in ((10, "K"), (20, "M"), (30, "G"), (40, "T")):
        if size < 2 ** (exponent + 10):
            return f"{size / float(2**exponent):.1f}{suffix}"

    return f"{size / float(2**50):.1f}P"


def byte_info(size: int) -> str:
    """
    Describe a byte count for display.

    Counts below 1024 are shown as the bare number; larger counts are
    shown as the exact number followed by the abbreviated size from
    ``human_readable_size`` in parentheses.

    Parameters
    ----------
    size : int
        Number of bytes.

    Returns
    -------
    str
    """
    return str(size) if size < 2**10 else f"{size} ({human_readable_size(size)})"


@dataclasses.dataclass(kw_only=True)
class ArrayInfo:
type: Literal["Array"] = "Array"
zarr_format: Literal[2, 3]
data_type: str
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For this field, as well as compressor, filters, codecs, and maybe store_type, we need to pick whether we want the string repr as an argument or the concrete value. I think initially I wanted to use concrete values for everything, which gives us a bit more flexibility at formatting time (we can always call str then if we just want the string repr).

I'll look into why I switched over to strs for these.

shape: tuple[int, ...]
chunk_shape: tuple[int, ...] | None = None
order: Literal["C", "F"]
read_only: bool
store_type: str
compressor: str | None = None
filters: list[str] | None = None
codecs: str | None = None
count_bytes: int | None = None
count_bytes_stored: int | None = None
count_chunks_initialized: int | None = None

def __repr__(self) -> str:
    """
    Render the array information as a multi-line ``label : value`` report.

    Rows for type, zarr format, data type, shape, chunk shape, order,
    read-only flag and store type are always emitted. Rows for the
    compressor, filters, codecs, byte counts, storage ratio and
    initialized-chunk count are appended only when the underlying
    attribute is not ``None``.

    Returns
    -------
    str
        The formatted report, one field per line.
    """
    template = textwrap.dedent("""\
        Type : {type}
        Zarr format : {zarr_format}
        Data type : {data_type}
        Shape : {shape}
        Chunk shape : {chunk_shape}
        Order : {order}
        Read-only : {read_only}
        Store type : {store_type}""")

    kwargs = dataclasses.asdict(self)
    if self.chunk_shape is None:
        # for non-regular chunk grids
        kwargs["chunk_shape"] = "<variable>"
    if self.compressor is not None:
        template += "\nCompressor : {compressor}"

    if self.filters is not None:
        template += "\nFilters : {filters}"

    if self.codecs is not None:
        template += "\nCodecs : {codecs}"

    if self.count_bytes is not None:
        template += "\nNo. bytes : {count_bytes}"
        kwargs["count_bytes"] = byte_info(self.count_bytes)

    if self.count_bytes_stored is not None:
        template += "\nNo. bytes stored : {count_bytes_stored}"
        # The key has to match the placeholder above; otherwise the raw
        # integer from ``asdict`` is printed without the readable size.
        kwargs["count_bytes_stored"] = byte_info(self.count_bytes_stored)

    if (
        self.count_bytes is not None
        and self.count_bytes_stored is not None
        and self.count_bytes_stored > 0
    ):
        # Only computed for a positive stored size, which also avoids a
        # division by zero.
        template += "\nStorage ratio : {storage_ratio}"
        kwargs["storage_ratio"] = f"{self.count_bytes / self.count_bytes_stored:.1f}"

    if self.count_chunks_initialized is not None:
        template += "\nChunks Initialized : {count_chunks_initialized}"
    return template.format(**kwargs)
82 changes: 77 additions & 5 deletions src/zarr/core/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from zarr.abc.store import Store, set_or_delete
from zarr.codecs import _get_default_array_bytes_codec
from zarr.codecs._v2 import V2Compressor, V2Filters
from zarr.core._info import ArrayInfo
from zarr.core.attributes import Attributes
from zarr.core.buffer import (
BufferPrototype,
Expand Down Expand Up @@ -1199,9 +1200,65 @@ async def update_attributes(self, new_attributes: dict[str, JSON]) -> Self:
def __repr__(self) -> str:
"""Return a one-line summary showing the store path, shape and dtype."""
return f"<AsyncArray {self.store_path} shape={self.shape} dtype={self.dtype}>"

async def info(self) -> None:
@property
def info(self) -> ArrayInfo:
"""
Return the statically known information for an array.

Returns
-------
ArrayInfo

See Also
--------
AsyncArray.info_complete
All information about an array, including dynamic information
like the number of bytes and chunks written.
"""
return self._info()

async def info_complete(self) -> ArrayInfo:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we just don't have this in the API till we implement it (since it's new).

# TODO: get the size of the object from the store.
extra = {
"count_chunks_initialized": self.nchunks_initialized, # this should be async?
# count_bytes_stored isn't yet implemented.
}
return self._info(extra=extra)

raise NotImplementedError

def _info(self, extra: dict[str, int] | None = None) -> ArrayInfo:
    """
    Build the ``ArrayInfo`` record for this array.

    Parameters
    ----------
    extra : dict[str, int], optional
        Additional ``ArrayInfo`` fields (for example
        ``count_chunks_initialized``) supplied by the caller. They are
        merged over the values derived from the array metadata.

    Returns
    -------
    ArrayInfo

    Raises
    ------
    NotImplementedError
        If the array is not zarr format 2 and its chunk grid is not a
        ``RegularChunkGrid``.
    """
    kwargs: dict[str, Any] = {}
    if self.metadata.zarr_format == 2:
        assert isinstance(self.metadata, ArrayV2Metadata)
        if self.metadata.compressor is not None:
            kwargs["compressor"] = str(self.metadata.compressor)
        if self.metadata.filters is not None:
            kwargs["filters"] = str(self.metadata.filters)
        kwargs["data_type"] = str(self.metadata.dtype)
        kwargs["chunk_shape"] = self.metadata.chunks
    else:
        kwargs["codecs"] = str(self.metadata.codecs)
        kwargs["data_type"] = str(self.metadata.data_type)
        # just regular?
        chunk_grid = self.metadata.chunk_grid
        if isinstance(chunk_grid, RegularChunkGrid):
            kwargs["chunk_shape"] = chunk_grid.chunk_shape
        else:
            # f-string so the offending grid type is actually reported.
            raise NotImplementedError(
                f"'info' is not yet implemented for chunk grids of type {type(chunk_grid)}"
            )

    kwargs["count_bytes"] = self.dtype.itemsize * self.size

    # Apply caller-supplied values last so they are not silently dropped
    # and so an overlapping key cannot raise a duplicate-keyword error.
    if extra is not None:
        kwargs.update(extra)

    return ArrayInfo(
        zarr_format=self.metadata.zarr_format,
        shape=self.shape,
        order=self.order,
        read_only=self.store_path.store.mode.readonly,
        store_type=type(self.store_path.store).__name__,
        **kwargs,
    )


# TODO: Array can be a frozen data class again once property setters (e.g. shape) are removed
@dataclass(frozen=False)
Expand Down Expand Up @@ -2900,10 +2957,25 @@ def update_attributes(self, new_attributes: dict[str, JSON]) -> Array:
def __repr__(self) -> str:
"""Return a one-line summary showing the store path, shape and dtype."""
return f"<Array {self.store_path} shape={self.shape} dtype={self.dtype}>"

def info(self) -> None:
return sync(
self._async_array.info(),
)
@property
def info(self) -> ArrayInfo:
"""
Return the statically known information for an array.

This reads the ``info`` property of the wrapped async array.

Returns
-------
ArrayInfo

See Also
--------
Array.info_complete
All information about an array, including dynamic information
like the number of bytes and chunks written.
"""
return self._async_array.info
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please add docstrings here and in info_complete


def info_complete(self) -> ArrayInfo:
"""
Return all the information for an array, including dynamic information.

This runs ``info_complete`` of the wrapped async array through ``sync``
and returns its result.

Returns
-------
ArrayInfo

See Also
--------
Array.info
"""
return sync(self._async_array.info_complete())


def nchunks_initialized(
Expand Down
100 changes: 96 additions & 4 deletions src/zarr/core/group.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from zarr._compat import _deprecate_positional_args
from zarr.abc.metadata import Metadata
from zarr.abc.store import Store, set_or_delete
from zarr.core._info import GroupInfo
from zarr.core.array import Array, AsyncArray, _build_parents
from zarr.core.attributes import Attributes
from zarr.core.buffer import default_buffer_prototype
Expand Down Expand Up @@ -793,8 +794,69 @@ def attrs(self) -> dict[str, Any]:
return self.metadata.attributes

@property
def info(self) -> None:
raise NotImplementedError
def info(self) -> GroupInfo:
    """
    Return the statically known information for a group.

    Member counts are included only when the group metadata carries
    consolidated metadata; otherwise no members are passed on and the
    counts stay unset.

    Returns
    -------
    GroupInfo

    See Also
    --------
    AsyncGroup.info_complete
        All information about a group, including dynamic information
        like the children members.
    """
    consolidated = self.metadata.consolidated_metadata
    if not consolidated:
        return self._info(members=None)
    return self._info(members=list(consolidated.flattened_metadata.values()))

async def info_complete(self) -> GroupInfo:
    """
    Return information for a group.

    Collects the metadata of every member yielded by
    ``self.members(max_depth=None)`` and builds the record from it.
    If this group doesn't contain consolidated metadata then
    this will need to read from the backing Store.

    Returns
    -------
    GroupInfo

    See Also
    --------
    AsyncGroup.info
    """
    member_metadata = []
    async for entry in self.members(max_depth=None):
        # Each entry's second item is the member object.
        member_metadata.append(entry[1].metadata)
    return self._info(members=member_metadata)

def _info(
    self, members: list[ArrayV2Metadata | ArrayV3Metadata | GroupMetadata] | None = None
) -> GroupInfo:
    """
    Build the ``GroupInfo`` record for this group.

    Parameters
    ----------
    members : list of metadata objects, optional
        Metadata of the members below this group. When given, the total
        number of members is reported together with how many of them are
        groups (``GroupMetadata`` instances) and how many are arrays
        (everything else). When ``None`` the counts are left unset.

    Returns
    -------
    GroupInfo
    """
    counts = {}
    if members is not None:
        n_members = len(members)
        n_groups = sum(1 for member in members if isinstance(member, GroupMetadata))
        counts = {
            "count_members": n_members,
            "count_arrays": n_members - n_groups,
            "count_groups": n_groups,
        }

    store = self.store_path.store
    return GroupInfo(
        name=self.store_path.path,
        read_only=store.mode.readonly,
        store_type=type(store).__name__,
        zarr_format=self.metadata.zarr_format,
        # maybe do a typeddict
        **counts,  # type: ignore[arg-type]
    )

@property
def store(self) -> Store:
Expand Down Expand Up @@ -1439,8 +1501,38 @@ def attrs(self) -> Attributes:
return Attributes(self)

@property
def info(self) -> None:
raise NotImplementedError
def info(self) -> GroupInfo:
"""
Return the statically known information for a group.

This reads the ``info`` property of the wrapped async group.

Returns
-------
GroupInfo

See Also
--------
Group.info_complete
All information about a group, including dynamic information
like the children members.
"""
return self._async_group.info

def info_complete(self) -> GroupInfo:
"""
Return information for a group.

If this group doesn't contain consolidated metadata then
this will need to read from the backing Store.

This runs ``info_complete`` of the wrapped async group through
``self._sync`` and returns its result.

Returns
-------
GroupInfo

See Also
--------
Group.info
"""
return self._sync(self._async_group.info_complete())

@property
def store(self) -> Store:
Expand Down
Loading