
Commit dd806b8

shoyer and keewis authored
Support parallel writes to regions of zarr stores (#4035)
* WIP: support writing to a region with zarr
* Consolidate before closing
* write -> save
* Integration test for writing to regions
* Skip compute=False if dask is not installed
* raise an error for non-matching vars in to_zarr with region
* wip docstring
* Update to_zarr docstring
* Error handling and tests for writing to a zarr region
* Add narrative docs on to_zarr() with region
* Add whats-new note on region
* Add dask.array import
* Fixup docs
* Add in PR link to whats-new
* more description in docs
* don't override attrs when writing to regions
* Another check for edge cases
* Blacken
* edits per Ryan's review
* move whats-new
* Mark tests as requiring dask
* doc clarifications
* Update doc/whats-new.rst

Co-authored-by: keewis <[email protected]>
1 parent 83884a1 commit dd806b8

File tree

6 files changed: +399 -106 lines changed

doc/io.rst

Lines changed: 111 additions & 34 deletions
@@ -833,7 +833,9 @@ N-dimensional arrays.
 Zarr has the ability to store arrays in a range of ways, including in memory,
 in files, and in cloud-based object storage such as `Amazon S3`_ and
 `Google Cloud Storage`_.
-Xarray's Zarr backend allows xarray to leverage these capabilities.
+Xarray's Zarr backend allows xarray to leverage these capabilities, including
+the ability to store and analyze datasets far too large to fit onto disk
+(particularly :ref:`in combination with dask <dask>`).

 .. warning::

@@ -845,7 +847,8 @@ metadata (attributes) describing the dataset dimensions and coordinates.
 At this time, xarray can only open zarr datasets that have been written by
 xarray. For implementation details, see :ref:`zarr_encoding`.

-To write a dataset with zarr, we use the :py:attr:`Dataset.to_zarr` method.
+To write a dataset with zarr, we use the :py:meth:`Dataset.to_zarr` method.
+
 To write to a local directory, we pass a path to a directory:

 .. ipython:: python
@@ -869,39 +872,10 @@ To write to a local directory, we pass a path to a directory:
 there.) If the directory does not exist, it will be created. If a zarr
 store is already present at that path, an error will be raised, preventing it
 from being overwritten. To override this behavior and overwrite an existing
-store, add ``mode='w'`` when invoking ``to_zarr``.
-
-It is also possible to append to an existing store. For that, set
-``append_dim`` to the name of the dimension along which to append. ``mode``
-can be omitted as it will internally be set to ``'a'``.
-
-.. ipython:: python
-    :suppress:
-
-    ! rm -rf path/to/directory.zarr
-
-.. ipython:: python
-
-    ds1 = xr.Dataset(
-        {"foo": (("x", "y", "t"), np.random.rand(4, 5, 2))},
-        coords={
-            "x": [10, 20, 30, 40],
-            "y": [1, 2, 3, 4, 5],
-            "t": pd.date_range("2001-01-01", periods=2),
-        },
-    )
-    ds1.to_zarr("path/to/directory.zarr")
-    ds2 = xr.Dataset(
-        {"foo": (("x", "y", "t"), np.random.rand(4, 5, 2))},
-        coords={
-            "x": [10, 20, 30, 40],
-            "y": [1, 2, 3, 4, 5],
-            "t": pd.date_range("2001-01-03", periods=2),
-        },
-    )
-    ds2.to_zarr("path/to/directory.zarr", append_dim="t")
+store, add ``mode='w'`` when invoking :py:meth:`~Dataset.to_zarr`.

-To store variable length strings use ``dtype=object``.
+To store variable length strings, convert them to object arrays first with
+``dtype=object``.

 To read back a zarr dataset that has been created this way, we use the
 :py:func:`open_zarr` method:
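
Editor's note: the hunk above expands the old one-line tip about variable-length strings. As a concrete illustration of the ``dtype=object`` advice, here is a minimal sketch (not part of this commit; the store path is made up):

    import numpy as np
    import xarray as xr

    # Variable-length strings must be stored as object arrays; a fixed-width
    # numpy string dtype would otherwise be truncated to a fixed size on disk.
    words = np.array(["zarr", "xarray", "variable-length"], dtype=object)
    ds = xr.Dataset({"word": ("x", words)})
    ds.to_zarr("path/to/strings.zarr", mode="w")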
@@ -987,6 +961,109 @@ Xarray can't perform consolidation on pre-existing zarr datasets. This should
 be done directly from zarr, as described in the
 `zarr docs <https://zarr.readthedocs.io/en/latest/tutorial.html#consolidating-metadata>`_.

+.. _io.zarr.appending:
+
+Appending to existing Zarr stores
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Xarray supports several ways of incrementally writing variables to a Zarr
+store. These options are useful for scenarios when it is infeasible or
+undesirable to write your entire dataset at once.
+
+.. tip::
+
+    If you can load all of your data into a single ``Dataset`` using dask, a
+    single call to ``to_zarr()`` will write all of your data in parallel.
+
+.. warning::
+
+    Alignment of coordinates is currently not checked when modifying an
+    existing Zarr store. It is up to the user to ensure that coordinates are
+    consistent.
+
+To add or overwrite entire variables, simply call :py:meth:`~Dataset.to_zarr`
+with ``mode='a'`` on a Dataset containing the new variables, passing in an
+existing Zarr store or path to a Zarr store.
+
+To resize and then append values along an existing dimension in a store, set
+``append_dim``. This is a good option if data always arrives in a particular
+order, e.g., for time-stepping a simulation:
+
+.. ipython:: python
+    :suppress:
+
+    ! rm -rf path/to/directory.zarr
+
+.. ipython:: python
+
+    ds1 = xr.Dataset(
+        {"foo": (("x", "y", "t"), np.random.rand(4, 5, 2))},
+        coords={
+            "x": [10, 20, 30, 40],
+            "y": [1, 2, 3, 4, 5],
+            "t": pd.date_range("2001-01-01", periods=2),
+        },
+    )
+    ds1.to_zarr("path/to/directory.zarr")
+    ds2 = xr.Dataset(
+        {"foo": (("x", "y", "t"), np.random.rand(4, 5, 2))},
+        coords={
+            "x": [10, 20, 30, 40],
+            "y": [1, 2, 3, 4, 5],
+            "t": pd.date_range("2001-01-03", periods=2),
+        },
+    )
+    ds2.to_zarr("path/to/directory.zarr", append_dim="t")
+
+Finally, you can use ``region`` to write to limited regions of existing arrays
+in an existing Zarr store. This is a good option for writing data in parallel
+from independent processes.
+
+To scale this up to writing large datasets, the first step is creating an
+initial Zarr store without writing all of its array data. This can be done by
+first creating a ``Dataset`` with dummy values stored in :ref:`dask <dask>`,
+and then calling ``to_zarr`` with ``compute=False`` to write only metadata
+(including ``attrs``) to Zarr:
+
+.. ipython:: python
+    :suppress:
+
+    ! rm -rf path/to/directory.zarr
+
+.. ipython:: python
+
+    import dask.array
+    # The values of this dask array are entirely irrelevant; only the dtype,
+    # shape and chunks are used
+    dummies = dask.array.zeros(30, chunks=10)
+    ds = xr.Dataset({"foo": ("x", dummies)})
+    path = "path/to/directory.zarr"
+    # Now we write the metadata without computing any array values
+    ds.to_zarr(path, compute=False, consolidated=True)
+
+Now, a Zarr store with the correct variable shapes and attributes exists that
+can be filled out by subsequent calls to ``to_zarr``. The ``region`` keyword
+provides a mapping from dimension names to Python ``slice`` objects indicating
+where the data should be written (in index space, not coordinate space), e.g.,
+
+.. ipython:: python
+
+    # For convenience, we'll slice a single dataset, but in the real use-case
+    # we would create them separately, possibly even from separate processes.
+    ds = xr.Dataset({"foo": ("x", np.arange(30))})
+    ds.isel(x=slice(0, 10)).to_zarr(path, region={"x": slice(0, 10)})
+    ds.isel(x=slice(10, 20)).to_zarr(path, region={"x": slice(10, 20)})
+    ds.isel(x=slice(20, 30)).to_zarr(path, region={"x": slice(20, 30)})
+
+Concurrent writes with ``region`` are safe as long as they modify distinct
+chunks in the underlying Zarr arrays (or use an appropriate ``lock``).
+
+As a safety check to make it harder to inadvertently override existing values,
+if you set ``region`` then *all* variables included in a Dataset must have
+dimensions included in ``region``. Other variables (typically coordinates)
+need to be explicitly dropped and/or written in separate calls to ``to_zarr``
+with ``mode='a'``.
+
 .. _io.cfgrib:

 .. ipython:: python
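
Editor's note: the narrative docs added above say ``region`` suits parallel writes from independent processes, but stop short of showing one. A hedged sketch of that workflow, assuming a local store path, a 30-element array chunked by 10, and chunk-aligned regions so no lock is needed:

    from concurrent.futures import ProcessPoolExecutor

    import dask.array
    import numpy as np
    import xarray as xr

    PATH = "path/to/directory.zarr"

    def write_region(start, stop):
        # Each worker writes a distinct, chunk-aligned region of "foo".
        ds = xr.Dataset({"foo": ("x", np.arange(start, stop))})
        ds.to_zarr(PATH, region={"x": slice(start, stop)})

    if __name__ == "__main__":
        # Write only metadata first; the dask array values are never computed.
        dummies = dask.array.zeros(30, chunks=10)
        xr.Dataset({"foo": ("x", dummies)}).to_zarr(
            PATH, compute=False, consolidated=True
        )
        with ProcessPoolExecutor() as executor:
            futures = [executor.submit(write_region, s, s + 10) for s in (0, 10, 20)]
            for future in futures:
                future.result()  # surface any write errors

Because each region covers exactly one 10-element chunk, no two workers ever touch the same chunk, which is the safety condition the new docs state for concurrent ``region`` writes.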

doc/whats-new.rst

Lines changed: 4 additions & 0 deletions
@@ -32,6 +32,10 @@ New Features
   By `Miguel Jimenez <https://github.com/Mikejmnez>`_ and `Wei Ji Leong <https://github.com/weiji14>`_.
 - Unary & binary operations follow the ``keep_attrs`` flag (:issue:`3490`, :issue:`4065`, :issue:`3433`, :issue:`3595`, :pull:`4195`).
   By `Deepak Cherian <https://github.com/dcherian>`_.
+- :py:meth:`Dataset.to_zarr` now supports a ``region`` keyword for writing to
+  limited regions of existing Zarr stores (:pull:`4035`).
+  See :ref:`io.zarr.appending` for full details.
+  By `Stephan Hoyer <https://github.com/shoyer>`_.

 Bug fixes
 ~~~~~~~~~

xarray/backends/api.py

Lines changed: 106 additions & 22 deletions
@@ -11,6 +11,7 @@
     Hashable,
     Iterable,
     Mapping,
+    MutableMapping,
     Tuple,
     Union,
 )
@@ -1304,38 +1305,89 @@ def check_dtype(var):


 def _validate_append_dim_and_encoding(
-    ds_to_append, store, append_dim, encoding, **open_kwargs
+    ds_to_append, store, append_dim, region, encoding, **open_kwargs
 ):
     try:
         ds = backends.zarr.open_zarr(store, **open_kwargs)
     except ValueError:  # store empty
         return
+
     if append_dim:
         if append_dim not in ds.dims:
             raise ValueError(
                 f"append_dim={append_dim!r} does not match any existing "
                 f"dataset dimensions {ds.dims}"
             )
-    for var_name in ds_to_append:
-        if var_name in ds:
-            if ds_to_append[var_name].dims != ds[var_name].dims:
+        if region is not None and append_dim in region:
+            raise ValueError(
+                f"cannot list the same dimension in both ``append_dim`` and "
+                f"``region`` with to_zarr(), got {append_dim} in both"
+            )
+
+    if region is not None:
+        if not isinstance(region, dict):
+            raise TypeError(f"``region`` must be a dict, got {type(region)}")
+        for k, v in region.items():
+            if k not in ds_to_append.dims:
+                raise ValueError(
+                    f"all keys in ``region`` are not in Dataset dimensions, got "
+                    f"{list(region)} and {list(ds_to_append.dims)}"
+                )
+            if not isinstance(v, slice):
+                raise TypeError(
+                    "all values in ``region`` must be slice objects, got "
+                    f"region={region}"
+                )
+            if v.step not in {1, None}:
+                raise ValueError(
+                    "step on all slices in ``region`` must be 1 or None, got "
+                    f"region={region}"
+                )
+
+        non_matching_vars = [
+            k
+            for k, v in ds_to_append.variables.items()
+            if not set(region).intersection(v.dims)
+        ]
+        if non_matching_vars:
+            raise ValueError(
+                f"when setting `region` explicitly in to_zarr(), all "
+                f"variables in the dataset to write must have at least "
+                f"one dimension in common with the region's dimensions "
+                f"{list(region.keys())}, but that is not "
+                f"the case for some variables here. To drop these variables "
+                f"from this dataset before exporting to zarr, write: "
+                f".drop({non_matching_vars!r})"
+            )
+
+    for var_name, new_var in ds_to_append.variables.items():
+        if var_name in ds.variables:
+            existing_var = ds.variables[var_name]
+            if new_var.dims != existing_var.dims:
                 raise ValueError(
                     f"variable {var_name!r} already exists with different "
-                    f"dimension names {ds[var_name].dims} != "
-                    f"{ds_to_append[var_name].dims}, but changing variable "
-                    "dimensions is not supported by to_zarr()."
+                    f"dimension names {existing_var.dims} != "
+                    f"{new_var.dims}, but changing variable "
+                    f"dimensions is not supported by to_zarr()."
                 )
-            existing_sizes = {
-                k: v for k, v in ds[var_name].sizes.items() if k != append_dim
-            }
+
+            existing_sizes = {}
+            for dim, size in existing_var.sizes.items():
+                if region is not None and dim in region:
+                    start, stop, stride = region[dim].indices(size)
+                    assert stride == 1  # region was already validated above
+                    size = stop - start
+                if dim != append_dim:
+                    existing_sizes[dim] = size
+
             new_sizes = {
-                k: v for k, v in ds_to_append[var_name].sizes.items() if k != append_dim
+                dim: size for dim, size in new_var.sizes.items() if dim != append_dim
             }
             if existing_sizes != new_sizes:
                 raise ValueError(
                     f"variable {var_name!r} already exists with different "
-                    "dimension sizes: {existing_sizes} != {new_sizes}. "
-                    "to_zarr() only supports changing dimension sizes when "
+                    f"dimension sizes: {existing_sizes} != {new_sizes}. "
+                    f"to_zarr() only supports changing dimension sizes when "
                     f"explicitly appending, but append_dim={append_dim!r}."
                 )
             if var_name in encoding.keys():
@@ -1345,16 +1397,17 @@ def _validate_append_dim_and_encoding(


 def to_zarr(
-    dataset,
-    store=None,
+    dataset: Dataset,
+    store: Union[MutableMapping, str, Path] = None,
     chunk_store=None,
-    mode=None,
+    mode: str = None,
     synchronizer=None,
-    group=None,
-    encoding=None,
-    compute=True,
-    consolidated=False,
-    append_dim=None,
+    group: str = None,
+    encoding: Mapping = None,
+    compute: bool = True,
+    consolidated: bool = False,
+    append_dim: Hashable = None,
+    region: Mapping[str, slice] = None,
 ):
     """This function creates an appropriate datastore for writing a dataset to
     a zarr store
@@ -1368,6 +1421,35 @@ def to_zarr(
     if encoding is None:
         encoding = {}

+    if mode is None:
+        if append_dim is not None or region is not None:
+            mode = "a"
+        else:
+            mode = "w-"
+
+    if mode != "a" and append_dim is not None:
+        raise ValueError("cannot set append_dim unless mode='a' or mode=None")
+
+    if mode != "a" and region is not None:
+        raise ValueError("cannot set region unless mode='a' or mode=None")
+
+    if mode not in ["w", "w-", "a"]:
+        # TODO: figure out how to handle 'r+'
+        raise ValueError(
+            "The only supported options for mode are 'w', "
+            f"'w-' and 'a', but mode={mode!r}"
+        )
+
+    if consolidated and region is not None:
+        raise ValueError(
+            "cannot use consolidated=True when the region argument is set. "
+            "Instead, set consolidated=True when writing to zarr with "
+            "compute=False before writing data."
+        )
+
+    if isinstance(store, Path):
+        store = str(store)
+
     # validate Dataset keys, DataArray names, and attr keys/values
     _validate_dataset_names(dataset)
     _validate_attrs(dataset)
@@ -1380,6 +1462,7 @@ def to_zarr(
             append_dim,
             group=group,
             consolidated=consolidated,
+            region=region,
             encoding=encoding,
         )

@@ -1390,8 +1473,9 @@ def to_zarr(
         group=group,
         consolidate_on_close=consolidated,
         chunk_store=chunk_store,
+        append_dim=append_dim,
+        write_region=region,
     )
-    zstore.append_dim = append_dim
     writer = ArrayWriter()
     # TODO: figure out how to properly handle unlimited_dims
     dump_to_store(dataset, zstore, writer, encoding=encoding)
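
Editor's note: the control flow added above derives ``mode`` from ``region``/``append_dim`` and rejects inconsistent combinations. A minimal sketch of how these checks surface to a caller (illustrative path; error messages paraphrased from the code in this diff):

    import dask.array
    import numpy as np
    import xarray as xr

    path = "path/to/directory.zarr"

    # Initialize store metadata; consolidation happens here, not during
    # the later region writes.
    template = xr.Dataset({"foo": ("x", dask.array.zeros(10, chunks=10))})
    template.to_zarr(path, compute=False, consolidated=True)

    # region (like append_dim) implies mode="a", so mode may be omitted:
    ds = xr.Dataset({"foo": ("x", np.arange(10))})
    ds.to_zarr(path, region={"x": slice(0, 10)})

    # Rejected combinations:
    # ds.to_zarr(path, mode="w", region={"x": slice(0, 10)})
    #   -> ValueError: cannot set region unless mode='a' or mode=None
    # ds.to_zarr(path, region={"x": slice(0, 10)}, consolidated=True)
    #   -> ValueError: cannot use consolidated=True when the region argument is set.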
