Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions .github/workflows/python-package-pip.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,20 @@ on:

jobs:
test:
name: Test pip install python ${{ matrix.python-version }} on ${{ matrix.os }}
name: Test pip install python ${{ matrix.python-version }} on ${{ matrix.os }} with zarr ${{ matrix.zarr-version }}

runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest]
python-version: ["3.8", "3.9", "3.10"]
zarr-version: [">=2,<3", ">2,<=3"]
steps:
- uses: actions/checkout@v2
- name: Install package with pip
run: |
python -m pip install .
python -m pip install . "zarr${{ matrix.zarr-version }}"
python -m pip install pytest

- name: Run tests
Expand Down
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Added

- add GitHub PR template to guide the development process on GitHub [\#44](https://github.com/mllam/mllam-data-prep/pull/44), @leifdenby
- add support for zarr 3.0.0 and above [\#51](https://github.com/mllam/mllam-data-prep/pull/51), @kashif

## [v0.5.0](https://github.com/mllam/mllam-data-prep/releases/tag/v0.5.0)

Expand Down
2 changes: 1 addition & 1 deletion mllam_data_prep/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@
)

logger.info(
f"Setting up dask.distributed.LocalCluster with {n_local_cores} cores and {memory_per_worker/1024/1024:0.0f} MB of memory per worker"
f"Setting up dask.distributed.LocalCluster with {n_local_cores} cores and {memory_per_worker / 1024 / 1024:0.0f} MB of memory per worker"
)

cluster = LocalCluster(
Expand Down
16 changes: 13 additions & 3 deletions mllam_data_prep/create_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@

import numpy as np
import xarray as xr
import zarr
from loguru import logger
from numcodecs import Blosc
from packaging.version import Version

from . import __version__
from .config import Config, InvalidConfigException
Expand All @@ -15,6 +16,11 @@
from .ops.selection import select_by_kwargs
from .ops.statistics import calc_stats

if Version(zarr.__version__) >= Version("3"):
from zarr.codecs import BloscCodec, BloscShuffle
else:
from numcodecs import Blosc

# the `extra` field in the config that was added between v0.2.0 and v0.5.0 is
# optional, so we can support both v0.2.0 and v0.5.0
SUPPORTED_CONFIG_VERSIONS = ["v0.2.0", "v0.5.0"]
Expand Down Expand Up @@ -271,8 +277,12 @@ def create_dataset_zarr(fp_config, fp_zarr: str = None):

# use zstd compression since it has a good balance of speed and compression ratio
# https://engineering.fb.com/2016/08/31/core-infra/smaller-and-faster-data-compression-with-zstandard/
compressor = Blosc(cname="zstd", clevel=1, shuffle=Blosc.BITSHUFFLE)
encoding = {v: {"compressor": compressor} for v in ds.data_vars}
if Version(zarr.__version__) >= Version("3"):
compressor = BloscCodec(cname="zstd", clevel=3, shuffle=BloscShuffle.bitshuffle)
encoding = {v: {"compressors": compressor} for v in ds.data_vars}
else:
compressor = Blosc(cname="zstd", clevel=1, shuffle=Blosc.BITSHUFFLE)
encoding = {v: {"compressor": compressor} for v in ds.data_vars}

ds.to_zarr(fp_zarr, consolidated=True, mode="w", encoding=encoding)
logger.info(f"Wrote training-ready dataset to {fp_zarr}")
Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ authors = [
{name = "Emy Alerskans", email = "ea@dmi.dk"},
{name = "Eleni Briola", email = "elb@dmi.dk"},
{name = "Joel Oskarsson", email = "joel.oskarsson@liu.se"},
{name = "Kashif Rasul", email = "kashif.rasul@gmail.com"},
]
dependencies = [
"xarray>=2024.2.0",
Expand All @@ -22,6 +23,7 @@ dependencies = [
"rich>=13.7.1",
"dask>=2024.2.1",
"psutil>=5.7.2",
"packaging>=23.1",
]
requires-python = ">=3.9"
readme = "README.md"
Expand Down