
Add asynchronous load method #10327


Draft · wants to merge 50 commits into base: main

Commits (50)
01e7518
new blank whatsnew
TomNicholas Oct 24, 2024
83e553b
Merge branch 'main' of https://github.com/pydata/xarray
TomNicholas Oct 24, 2024
e44326d
Merge branch 'main' of https://github.com/pydata/xarray
TomNicholas Nov 8, 2024
4e4eeb0
Merge branch 'main' of https://github.com/pydata/xarray
TomNicholas Nov 20, 2024
d858059
Merge branch 'main' of https://github.com/pydata/xarray
TomNicholas Nov 21, 2024
d377780
Merge branch 'main' of https://github.com/pydata/xarray
TomNicholas Nov 21, 2024
3132f6a
Merge branch 'main' of https://github.com/pydata/xarray
TomNicholas Nov 23, 2024
900eef5
Merge branch 'main' of https://github.com/pydata/xarray
TomNicholas Nov 29, 2024
4c4462f
Merge branch 'main' of https://github.com/pydata/xarray
TomNicholas Dec 4, 2024
5b9b749
Merge branch 'main' of https://github.com/pydata/xarray
TomNicholas Jan 6, 2025
fadb953
Merge branch 'main' of https://github.com/pydata/xarray
TomNicholas Jan 8, 2025
57d9d23
Merge branch 'main' of https://github.com/TomNicholas/xarray
TomNicholas Mar 13, 2025
11170fc
Merge branch 'main' of https://github.com/pydata/xarray
TomNicholas Mar 19, 2025
0b8fa41
Merge branch 'main' of https://github.com/pydata/xarray
TomNicholas Mar 20, 2025
f769f85
Merge branch 'main' of https://github.com/pydata/xarray
TomNicholas Mar 20, 2025
4eef318
Merge branch 'main' of https://github.com/pydata/xarray
TomNicholas Apr 7, 2025
29242a4
Merge branch 'main' of https://github.com/pydata/xarray
TomNicholas May 15, 2025
e6b3b3b
test async load using special zarr LatencyStore
TomNicholas May 15, 2025
3ceab60
don't use dask
TomNicholas May 16, 2025
071c35a
async all the way down
TomNicholas May 16, 2025
29374f9
remove assert False
TomNicholas May 16, 2025
ab12bb8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 16, 2025
62aa39d
add pytest-asyncio to CI envs
TomNicholas May 16, 2025
dfe8bf7
Merge branch 'async.load' of https://github.com/TomNicholas/xarray in…
TomNicholas May 16, 2025
a906dec
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 16, 2025
629ab31
assert results are identical
TomNicholas May 16, 2025
7e9ae0f
implement async load for dataarray and dataset
TomNicholas May 17, 2025
d288351
factor out common logic
TomNicholas May 17, 2025
e0731a0
consolidate tests via a parametrized fixture
TomNicholas May 17, 2025
9b41e78
async_load -> load_async
TomNicholas May 17, 2025
67ba26a
make BackendArray an ABC
TomNicholas May 18, 2025
9344e2e
explain how to add async support for any backend in the docs
TomNicholas May 18, 2025
f8f8563
add new methods to api docs
TomNicholas May 19, 2025
30ce9be
whatsnew
TomNicholas May 19, 2025
5d15bbd
Merge branch 'main' of https://github.com/pydata/xarray
TomNicholas May 19, 2025
1f02de1
Merge branch 'main' into async.load
TomNicholas May 19, 2025
2342b50
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 19, 2025
b6d4a82
Fix ci/minimum_versions.py
TomNicholas May 19, 2025
2079d7e
fix formatting
TomNicholas May 21, 2025
48e4534
concurrently load different variables in ds.load_async using asyncio.…
TomNicholas May 21, 2025
cca7589
test concurrent loading of multiple variables in one dataset
TomNicholas May 21, 2025
dfe9b87
fix non-awaited load_async
TomNicholas May 21, 2025
84099f3
rearrange test order
TomNicholas May 21, 2025
ab000c8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 21, 2025
a8b7b46
add test for orthogonal indexing
TomNicholas May 23, 2025
82c7654
explicitly forbid orthogonal indexing
TomNicholas May 23, 2025
5eacdb0
support async orthogonal indexing via https://github.com/zarr-develop…
TomNicholas May 23, 2025
9f33c09
Merge branch 'async.load' of https://github.com/TomNicholas/xarray in…
TomNicholas May 23, 2025
093bf50
add test for vectorized indexing (even if it doesn't work)
TomNicholas May 23, 2025
4073a24
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 23, 2025
1 change: 1 addition & 0 deletions ci/minimum_versions.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
"coveralls",
"pip",
"pytest",
"pytest-asyncio",
"pytest-cov",
"pytest-env",
"pytest-mypy-plugins",
Expand Down
1 change: 1 addition & 0 deletions ci/requirements/all-but-dask.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ dependencies:
- pip
- pydap
- pytest
- pytest-asyncio
- pytest-cov
- pytest-env
- pytest-mypy-plugins
Expand Down
1 change: 1 addition & 0 deletions ci/requirements/all-but-numba.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ dependencies:
- pyarrow # pandas raises a deprecation warning without this, breaking doctests
- pydap
- pytest
- pytest-asyncio
- pytest-cov
- pytest-env
- pytest-mypy-plugins
Expand Down
1 change: 1 addition & 0 deletions ci/requirements/bare-minimum.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ dependencies:
- coveralls
- pip
- pytest
- pytest-asyncio
- pytest-cov
- pytest-env
- pytest-mypy-plugins
Expand Down
1 change: 1 addition & 0 deletions ci/requirements/environment-3.14.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ dependencies:
- pyarrow # pandas raises a deprecation warning without this, breaking doctests
- pydap
- pytest
- pytest-asyncio
- pytest-cov
- pytest-env
- pytest-mypy-plugins
Expand Down
1 change: 1 addition & 0 deletions ci/requirements/environment-windows-3.14.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ dependencies:
- pyarrow # importing dask.dataframe raises an ImportError without this
- pydap
- pytest
- pytest-asyncio
- pytest-cov
- pytest-env
- pytest-mypy-plugins
Expand Down
1 change: 1 addition & 0 deletions ci/requirements/environment-windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ dependencies:
- pyarrow # importing dask.dataframe raises an ImportError without this
- pydap
- pytest
- pytest-asyncio
- pytest-cov
- pytest-env
- pytest-mypy-plugins
Expand Down
1 change: 1 addition & 0 deletions ci/requirements/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ dependencies:
- pydap
- pydap-server
- pytest
- pytest-asyncio
- pytest-cov
- pytest-env
- pytest-mypy-plugins
Expand Down
1 change: 1 addition & 0 deletions ci/requirements/min-all-deps.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ dependencies:
- pip
- pydap=3.5
- pytest
- pytest-asyncio
- pytest-cov
- pytest-env
- pytest-mypy-plugins
Expand Down
1 change: 1 addition & 0 deletions doc/api-hidden.rst
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,7 @@
Variable.isnull
Variable.item
Variable.load
Variable.load_async
Variable.max
Variable.mean
Variable.median
Expand Down
2 changes: 2 additions & 0 deletions doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1122,6 +1122,7 @@ Dataset methods
Dataset.filter_by_attrs
Dataset.info
Dataset.load
Dataset.load_async
Dataset.persist
Dataset.unify_chunks

Expand Down Expand Up @@ -1154,6 +1155,7 @@ DataArray methods
DataArray.compute
DataArray.persist
DataArray.load
DataArray.load_async
DataArray.unify_chunks

DataTree methods
Expand Down
48 changes: 33 additions & 15 deletions doc/internals/how-to-add-new-backend.rst
Original file line number Diff line number Diff line change
Expand Up @@ -325,10 +325,12 @@ information on plugins.
How to support lazy loading
+++++++++++++++++++++++++++

If you want to make your backend effective with big datasets, then you should
support lazy loading.
Basically, you shall replace the :py:class:`numpy.ndarray` inside the
variables with a custom class that supports lazy loading indexing.
If you want to make your backend effective with big datasets, then you should take advantage of xarray's
support for lazy loading and indexing.

Basically, when your backend constructs the ``Variable`` objects,
you need to replace the :py:class:`numpy.ndarray` inside the
variables with a custom :py:class:`~xarray.backends.BackendArray` subclass that supports lazy loading and indexing.
See the example below:

.. code-block:: python
Expand All @@ -339,25 +341,27 @@ See the example below:

Where:

- :py:class:`~xarray.core.indexing.LazilyIndexedArray` is a class
provided by Xarray that manages the lazy loading.
- ``MyBackendArray`` shall be implemented by the backend and shall inherit
- :py:class:`~xarray.core.indexing.LazilyIndexedArray` is a wrapper class
provided by Xarray that manages the lazy loading and indexing.
- ``MyBackendArray`` should be implemented by the backend and must inherit
from :py:class:`~xarray.backends.BackendArray`.

BackendArray subclassing
^^^^^^^^^^^^^^^^^^^^^^^^

The BackendArray subclass shall implement the following method and attributes:
The BackendArray subclass must implement the following method and attributes:

- the ``__getitem__`` method that takes in input an index and returns a
`NumPy <https://numpy.org/>`__ array
- the ``shape`` attribute
- the ``__getitem__`` method that takes an index as an input and returns a
`NumPy <https://numpy.org/>`__ array,
- the ``shape`` attribute,
- the ``dtype`` attribute.

Xarray supports different type of :doc:`/user-guide/indexing`, that can be
grouped in three types of indexes
It may also optionally implement an additional ``async_getitem`` method.

Xarray supports different types of :doc:`/user-guide/indexing`, that can be
grouped in three types of indexes:
:py:class:`~xarray.core.indexing.BasicIndexer`,
:py:class:`~xarray.core.indexing.OuterIndexer` and
:py:class:`~xarray.core.indexing.OuterIndexer`, and
:py:class:`~xarray.core.indexing.VectorizedIndexer`.
This implies that the implementation of the method ``__getitem__`` can be tricky.
In order to simplify this task, Xarray provides a helper function,
Expand Down Expand Up @@ -413,8 +417,22 @@ input the ``key``, the array ``shape`` and the following parameters:
For more details see
:py:class:`~xarray.core.indexing.IndexingSupport` and :ref:`RST indexing`.

Async support
^^^^^^^^^^^^^

Backends can also optionally support loading data asynchronously via xarray's asynchronous loading methods
(e.g. :py:meth:`~xarray.Dataset.load_async`).
To support async loading the ``BackendArray`` subclass must additionally implement the ``BackendArray.async_getitem`` method.

Note that implementing this method is only necessary if you want to be able to load data from different xarray objects concurrently.
Even without this method your ``BackendArray`` implementation is still free to concurrently load chunks of data for a single ``Variable`` itself,
so long as it does so behind the synchronous ``__getitem__`` interface.

Dask support
^^^^^^^^^^^^

In order to support `Dask Distributed <https://distributed.dask.org/>`__ and
:py:mod:`multiprocessing`, ``BackendArray`` subclass should be serializable
:py:mod:`multiprocessing`, the ``BackendArray`` subclass should be serializable
either with :ref:`io.pickle` or
`cloudpickle <https://github.com/cloudpipe/cloudpickle>`__.
That implies that all the reference to open files should be dropped. For
Expand Down
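As a rough illustration of the interface the docs changes above describe — a required synchronous ``__getitem__`` plus an opt-in ``async_getitem`` that raises by default — here is a hedged, xarray-free sketch (the class and method names mirror the docs, but nothing below is xarray's actual code):

```python
import asyncio


class BackendArraySketch:
    """Base sketch: sync indexing is required, async loading is opt-in."""

    def __getitem__(self, key):
        raise NotImplementedError

    async def async_getitem(self, key):
        # Backends that don't override this don't support async loading.
        raise NotImplementedError("Backend does not support asynchronous loading")


class InMemoryArray(BackendArraySketch):
    """Toy 'backend' over a plain list, implementing both code paths."""

    def __init__(self, data):
        self._data = data

    def __getitem__(self, key):
        return self._data[key]

    async def async_getitem(self, key):
        # A real backend would await a storage/network fetch here.
        await asyncio.sleep(0)
        return self._data[key]


arr = InMemoryArray([10, 20, 30])
print(arr[1])                             # 20 (synchronous path)
print(asyncio.run(arr.async_getitem(2)))  # 30 (asynchronous path)
```

A backend that never implements ``async_getitem`` keeps working through the synchronous path; only the async entry point raises.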
4 changes: 3 additions & 1 deletion doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ v2025.05.0 (unreleased)

New Features
~~~~~~~~~~~~

- Added new asynchronous loading methods :py:meth:`~xarray.Dataset.load_async`, :py:meth:`~xarray.DataArray.load_async`, :py:meth:`~xarray.Variable.load_async`.
(:issue:`10326`, :pull:`10327`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
- Allow an Xarray index that uses multiple dimensions checking equality with another
index for only a subset of those dimensions (i.e., ignoring the dimensions
that are excluded from alignment).
Expand All @@ -42,7 +45,6 @@ Bug fixes
~~~~~~~~~
- Fix :py:class:`~xarray.groupers.BinGrouper` when ``labels`` is not specified (:issue:`10284`).
By `Deepak Cherian <https://github.com/dcherian>`_.

- Allow accessing arbitrary attributes on Pandas ExtensionArrays.
By `Deepak Cherian <https://github.com/dcherian>`_.

Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ dev = [
"pytest-mypy-plugins",
"pytest-timeout",
"pytest-xdist",
"pytest-asyncio",
"ruff>=0.8.0",
"sphinx",
"sphinx_autosummary_accessors",
Expand Down
13 changes: 12 additions & 1 deletion xarray/backends/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import os
import time
import traceback
from abc import ABC, abstractmethod
from collections.abc import Hashable, Iterable, Mapping, Sequence
from glob import glob
from typing import TYPE_CHECKING, Any, ClassVar, TypeVar, Union, overload
Expand Down Expand Up @@ -267,13 +268,23 @@ def robust_getitem(array, key, catch=Exception, max_retries=6, initial_delay=500
time.sleep(1e-3 * next_delay)


class BackendArray(NdimSizeLenMixin, indexing.ExplicitlyIndexed):
class BackendArray(ABC, NdimSizeLenMixin, indexing.ExplicitlyIndexed):
Review comment (Member Author): As __getitem__ is required, I feel like BackendArray should always have been an ABC.
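The point in the comment above — that ``__getitem__`` is required — is exactly what ``abc.abstractmethod`` enforces: a subclass that omits the method cannot even be instantiated. A minimal, self-contained sketch (class names here are illustrative, not xarray's):

```python
from abc import ABC, abstractmethod


class Base(ABC):
    @abstractmethod
    def __getitem__(self, key): ...


class Incomplete(Base):
    pass  # no __getitem__ -> cannot be instantiated


class Complete(Base):
    def __getitem__(self, key):
        return key


try:
    Incomplete()
    raised = False
except TypeError:
    # The ABC machinery rejects subclasses missing abstract methods.
    raised = True

print(raised)         # True
print(Complete()[3])  # 3
```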

__slots__ = ()

@abstractmethod
def __getitem__(self, key: indexing.ExplicitIndexer) -> np.typing.ArrayLike: ...

async def async_getitem(self, key: indexing.ExplicitIndexer) -> np.typing.ArrayLike:
raise NotImplementedError("Backend does not support asynchronous loading")
Comment on lines +277 to +278
Review comment (Member Author): I've implemented this for the ZarrArray class but in theory it could be supported by other backends too.

Review comment (Member Author): This might not be the desired behaviour though - this currently means if you opened a dataset from netCDF and called ds.load_async you would get a NotImplementedError. Would it be better to quietly just block instead?


def get_duck_array(self, dtype: np.typing.DTypeLike = None):
key = indexing.BasicIndexer((slice(None),) * self.ndim)
return self[key] # type: ignore[index]

async def async_get_duck_array(self, dtype: np.typing.DTypeLike = None):
key = indexing.BasicIndexer((slice(None),) * self.ndim)
return await self.async_getitem(key) # type: ignore[index]


class AbstractDataStore:
__slots__ = ()
Expand Down
25 changes: 25 additions & 0 deletions xarray/backends/zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,8 @@ class ZarrArrayWrapper(BackendArray):
def __init__(self, zarr_array):
# some callers attempt to evaluate an array if an `array` property exists on the object.
# we prefix with _ to avoid this inference.

# TODO type hint this?
self._array = zarr_array
self.shape = self._array.shape

Expand Down Expand Up @@ -212,6 +214,14 @@ def _vindex(self, key):
def _getitem(self, key):
return self._array[key]

async def _async_getitem(self, key):
async_array = self._array._async_array
return await async_array.getitem(key)

async def _async_oindex(self, key):
async_array = self._array._async_array
return await async_array.oindex.getitem(key)

def __getitem__(self, key):
array = self._array
if isinstance(key, indexing.BasicIndexer):
Expand All @@ -227,6 +237,21 @@ def __getitem__(self, key):
# if self.ndim == 0:
# could possibly have a work-around for 0d data here

async def async_getitem(self, key):
array = self._array
if isinstance(key, indexing.BasicIndexer):
method = self._async_getitem
elif isinstance(key, indexing.VectorizedIndexer):
# method = self._vindex
raise NotImplementedError("async lazy vectorized indexing is not supported")
elif isinstance(key, indexing.OuterIndexer):
method = self._async_oindex

return await indexing.async_explicit_indexing_adapter(
key, array.shape, indexing.IndexingSupport.OUTER, method
)


def _determine_zarr_chunks(
enc_chunks, var_chunks, ndim, name, safe_chunks, region, mode, shape
Expand Down
8 changes: 8 additions & 0 deletions xarray/core/dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -1160,6 +1160,14 @@ def load(self, **kwargs) -> Self:
self._coords = new._coords
return self

async def load_async(self, **kwargs) -> Self:
temp_ds = self._to_temp_dataset()
ds = await temp_ds.load_async(**kwargs)
new = self._from_temp_dataset(ds)
self._variable = new._variable
self._coords = new._coords
return self

def compute(self, **kwargs) -> Self:
"""Manually trigger loading of this array's data from disk or a
remote source into memory and return a new array.
Expand Down
43 changes: 35 additions & 8 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import asyncio
import copy
import datetime
import math
Expand Down Expand Up @@ -531,24 +532,50 @@ def load(self, **kwargs) -> Self:
dask.compute
"""
# access .data to coerce everything to numpy or dask arrays
lazy_data = {
chunked_data = {
k: v._data for k, v in self.variables.items() if is_chunked_array(v._data)
}
if lazy_data:
chunkmanager = get_chunked_array_type(*lazy_data.values())
if chunked_data:
chunkmanager = get_chunked_array_type(*chunked_data.values())

# evaluate all the chunked arrays simultaneously
evaluated_data: tuple[np.ndarray[Any, Any], ...] = chunkmanager.compute(
*lazy_data.values(), **kwargs
*chunked_data.values(), **kwargs
)

for k, data in zip(lazy_data, evaluated_data, strict=False):
for k, data in zip(chunked_data, evaluated_data, strict=False):
self.variables[k].data = data

# load everything else sequentially
for k, v in self.variables.items():
if k not in lazy_data:
v.load()
[v.load() for k, v in self.variables.items() if k not in chunked_data]

return self

async def load_async(self, **kwargs) -> Self:
# TODO refactor this to pull out the common chunked_data codepath

# this blocks on chunked arrays but not on lazily indexed arrays

# access .data to coerce everything to numpy or dask arrays
chunked_data = {
k: v._data for k, v in self.variables.items() if is_chunked_array(v._data)
}
if chunked_data:
chunkmanager = get_chunked_array_type(*chunked_data.values())

# evaluate all the chunked arrays simultaneously
evaluated_data: tuple[np.ndarray[Any, Any], ...] = chunkmanager.compute(
*chunked_data.values(), **kwargs
)

for k, data in zip(chunked_data, evaluated_data, strict=False):
self.variables[k].data = data

# load everything else concurrently
coros = [
v.load_async() for k, v in self.variables.items() if k not in chunked_data
]
await asyncio.gather(*coros)
Comment on lines +574 to +578
Review comment (Member Author): We could actually do this same thing inside of the synchronous ds.load() too, but it would require:

  1. Xarray to decide how to call the async code, e.g. with a ThreadPool or similar (see Support concurrent loading of variables #8965)
  2. The backend to support async_getitem (it could fall back to synchronous loading if it's not supported)

return self

Expand Down
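The concurrent path added in ``load_async`` above boils down to fanning out one coroutine per variable and awaiting them all with ``asyncio.gather``. A stdlib-only sketch of that pattern (the variable names and delays below are made up for illustration):

```python
import asyncio
import time


async def load_variable(name: str, delay: float) -> tuple[str, str]:
    # Stand-in for Variable.load_async(): pretend each variable needs
    # `delay` seconds of I/O before its data is in memory.
    await asyncio.sleep(delay)
    return name, "loaded"


async def load_all() -> dict[str, str]:
    # Mirrors the gather in load_async: one coroutine per (non-chunked)
    # variable, all awaited concurrently.
    coros = [load_variable(n, 0.05) for n in ("temp", "salinity", "pressure")]
    return dict(await asyncio.gather(*coros))


start = time.perf_counter()
result = asyncio.run(load_all())
elapsed = time.perf_counter() - start

print(result)
# Three 0.05 s "loads" overlap, so wall time is close to 0.05 s, not 0.15 s.
print(elapsed < 0.15)
```

Because ``asyncio.gather`` returns results in the order the coroutines were passed, the loaded data can be zipped back onto the variable names deterministically, which is what lets ``load_async`` reassign ``self.variables[k].data`` safely.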