From eb4ae4a8e118382accf7172aad0da255bd68bc4e Mon Sep 17 00:00:00 2001 From: Alessandro Amici Date: Wed, 3 Mar 2021 17:54:55 +0100 Subject: [PATCH 01/12] Switch backend API to v2 and drop v1 --- .github/workflows/ci-additional.yaml | 8 +- xarray/backends/api.py | 420 ++++++++++++++------------- xarray/backends/apiv2.py | 286 ------------------ 3 files changed, 213 insertions(+), 501 deletions(-) delete mode 100644 xarray/backends/apiv2.py diff --git a/.github/workflows/ci-additional.yaml b/.github/workflows/ci-additional.yaml index 4bf85458211..5d9c94d639c 100644 --- a/.github/workflows/ci-additional.yaml +++ b/.github/workflows/ci-additional.yaml @@ -42,7 +42,6 @@ jobs: "py37-min-all-deps", "py37-min-nep18", "py38-all-but-dask", - "py38-backend-api-v2", "py38-flaky", ] steps: @@ -56,12 +55,7 @@ jobs: - name: Set environment variables run: | - if [[ ${{ matrix.env }} == "py38-backend-api-v2" ]] ; - then - echo "CONDA_ENV_FILE=ci/requirements/environment.yml" >> $GITHUB_ENV - echo "XARRAY_BACKEND_API=v2" >> $GITHUB_ENV - - elif [[ ${{ matrix.env }} == "py38-flaky" ]] ; + if [[ ${{ matrix.env }} == "py38-flaky" ]] ; then echo "CONDA_ENV_FILE=ci/requirements/environment.yml" >> $GITHUB_ENV echo "PYTEST_EXTRA_FLAGS=--run-flaky --run-network-tests" >> $GITHUB_ENV diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 4fa34b39925..b13a45a0d72 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -26,9 +26,10 @@ ) from ..core.dataarray import DataArray from ..core.dataset import Dataset, _get_chunk, _maybe_chunk -from ..core.utils import close_on_error, is_grib_path, is_remote_uri, read_magic_number +from ..core.utils import is_grib_path, is_remote_uri, read_magic_number from .common import AbstractDataStore, ArrayWriter from .locks import _get_scheduler +from . 
import plugins if TYPE_CHECKING: try: @@ -236,6 +237,31 @@ def check_attr(name, value): check_attr(k, v) +def _resolve_decoders_kwargs(decode_cf, open_backend_dataset_parameters, **decoders): + for d in list(decoders): + if decode_cf is False and d in open_backend_dataset_parameters: + decoders[d] = False + if decoders[d] is None: + decoders.pop(d) + return decoders + + +def _get_mtime(filename_or_obj): + # if passed an actual file path, augment the token with + # the file modification time + mtime = None + + try: + path = os.fspath(filename_or_obj) + except TypeError: + path = None + + if path and not is_remote_uri(path): + mtime = os.path.getmtime(filename_or_obj) + + return mtime + + def _protect_dataset_variables_inplace(dataset, cache): for name, variable in dataset.variables.items(): if name not in variable.dims: @@ -304,22 +330,90 @@ def load_dataarray(filename_or_obj, **kwargs): return da.load() +def _chunk_ds( + backend_ds, + filename_or_obj, + engine, + chunks, + overwrite_encoded_chunks, + **extra_tokens, +): + from dask.base import tokenize + + mtime = _get_mtime(filename_or_obj) + token = tokenize(filename_or_obj, mtime, engine, chunks, **extra_tokens) + name_prefix = "open_dataset-%s" % token + + variables = {} + for name, var in backend_ds.variables.items(): + var_chunks = _get_chunk(var, chunks) + variables[name] = _maybe_chunk( + name, + var, + var_chunks, + overwrite_encoded_chunks=overwrite_encoded_chunks, + name_prefix=name_prefix, + token=token, + ) + ds = backend_ds._replace(variables) + return ds + + +def _dataset_from_backend_dataset( + backend_ds, + filename_or_obj, + engine, + chunks, + cache, + overwrite_encoded_chunks, + **extra_tokens, +): + if not (isinstance(chunks, (int, dict)) or chunks is None): + if chunks != "auto": + raise ValueError( + "chunks must be an int, dict, 'auto', or None. " + "Instead found %s. " % chunks + ) + + _protect_dataset_variables_inplace(backend_ds, cache) + if chunks is None: + ds = backend_ds + else: + ds = _chunk_ds( + backend_ds, + filename_or_obj, + engine, + chunks, + overwrite_encoded_chunks, + **extra_tokens, + ) + + ds.set_close(backend_ds._close) + + # Ensure source filename always stored in dataset object (GH issue #2550) + if "source" not in ds.encoding: + if isinstance(filename_or_obj, str): + ds.encoding["source"] = filename_or_obj + + return ds + + def open_dataset( filename_or_obj, - group=None, - decode_cf=True, - mask_and_scale=None, - decode_times=True, - concat_characters=True, - decode_coords=True, + *, engine=None, chunks=None, - lock=None, cache=None, + decode_cf=None, + mask_and_scale=None, + decode_times=None, + decode_timedelta=None, + use_cftime=None, + concat_characters=None, + decode_coords=None, drop_variables=None, backend_kwargs=None, - use_cftime=None, - decode_timedelta=None, + **kwargs, ): """Open and decode a dataset from a file or file-like object. @@ -328,45 +422,14 @@ def open_dataset( filename_or_obj : str, Path, file-like or DataStore Strings and Path objects are interpreted as a path to a netCDF file or an OpenDAP URL and opened with python-netCDF4, unless the filename - ends with .gz, in which case the file is gunzipped and opened with + ends with .gz, in which case the file is unzipped and opened with scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF). - group : str, optional - Path to the netCDF4 group in the given file to open (only works for - netCDF4 files). 
- decode_cf : bool, optional - Whether to decode these variables, assuming they were saved according - to CF conventions. - mask_and_scale : bool, optional - If True, replace array values equal to `_FillValue` with NA and scale - values according to the formula `original_values * scale_factor + - add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are - taken from variable attributes (if they exist). If the `_FillValue` or - `missing_value` attribute contains multiple values a warning will be - issued and all array values matching one of the multiple values will - be replaced by NA. mask_and_scale defaults to True except for the - pseudonetcdf backend. - decode_times : bool, optional - If True, decode times encoded in the standard NetCDF datetime format - into datetime objects. Otherwise, leave them encoded as numbers. - concat_characters : bool, optional - If True, concatenate along the last dimension of character arrays to - form string arrays. Dimensions will only be concatenated over (and - removed) if they have no corresponding variable and if they are only - used as the last dimension of character arrays. - decode_coords : bool or {"coordinates", "all"}, optional - Controls which variables are set as coordinate variables: - - - "coordinates" or True: Set variables referred to in the - ``'coordinates'`` attribute of the datasets or individual variables - as coordinate variables. - - "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and - other attributes as coordinate variables. - engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "pynio", "cfgrib", \ - "pseudonetcdf", "zarr"}, optional + engine : str, optional Engine to use when reading files. If not provided, the default engine is chosen based on available dependencies, with a preference for - "netcdf4". + "netcdf4". Options are: {"netcdf4", "scipy", "pydap", "h5netcdf",\ + "pynio", "cfgrib", "pseudonetcdf", "zarr"}. chunks : int or dict, optional If chunks is provided, it is used to load the new dataset into dask arrays. ``chunks=-1`` loads the dataset with dask using a single @@ -375,26 +438,36 @@ def open_dataset( a single chunk for all arrays. ``chunks='auto'`` will use dask ``auto`` chunking taking into account the engine preferred chunks. See dask chunking for more details. - lock : False or lock-like, optional - Resource lock to use when reading data from disk. Only relevant when - using dask or another form of parallelism. By default, appropriate - locks are chosen to safely read and write files with the currently - active dask scheduler. cache : bool, optional - If True, cache data loaded from the underlying datastore in memory as + If True, cache data is loaded from the underlying datastore in memory as NumPy arrays when accessed to avoid reading from the underlying data- store multiple times. Defaults to True unless you specify the `chunks` argument to use dask, in which case it defaults to False. Does not change the behavior of coordinates corresponding to dimensions, which always load their data from disk into a ``pandas.Index``. - drop_variables: str or iterable, optional - A variable or list of variables to exclude from being parsed from the - dataset. This may be useful to drop variables with problems or - inconsistent values. - backend_kwargs: dict, optional - A dictionary of keyword arguments to pass on to the backend. This - may be useful when backend options would improve performance or - allow user control of dataset processing. 
+ decode_cf : bool, optional + Setting ``decode_cf=False`` will disable ``mask_and_scale``, + ``decode_times``, ``decode_timedelta``, ``concat_characters``, + ``decode_coords``. + mask_and_scale : bool, optional + If True, array values equal to `_FillValue` are replaced with NA and other + values are scaled according to the formula `original_values * scale_factor + + add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are + taken from variable attributes (if they exist). If the `_FillValue` or + `missing_value` attribute contains multiple values, a warning will be + issued and all array values matching one of the multiple values will + be replaced by NA. mask_and_scale defaults to True except for the + pseudonetcdf backend. This keyword may not be supported by all the backends. + decode_times : bool, optional + If True, decode times encoded in the standard NetCDF datetime format + into datetime objects. Otherwise, leave them encoded as numbers. + This keyword may not be supported by all the backends. + decode_timedelta : bool, optional + If True, decode variables and coordinates with time units in + {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"} + into timedelta objects. If False, they remain encoded as numbers. + If None (default), assume the same value of decode_time. + This keyword may not be supported by all the backends. use_cftime: bool, optional Only relevant if encoded dates come from a standard calendar (e.g. "gregorian", "proleptic_gregorian", "standard", or not @@ -404,12 +477,42 @@ def open_dataset( ``cftime.datetime`` objects, regardless of whether or not they can be represented using ``np.datetime64[ns]`` objects. If False, always decode times to ``np.datetime64[ns]`` objects; if this is not possible - raise an error. - decode_timedelta : bool, optional - If True, decode variables and coordinates with time units in - {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"} - into timedelta objects. If False, leave them encoded as numbers. - If None (default), assume the same value of decode_time. + raise an error. This keyword may not be supported by all the backends. + concat_characters : bool, optional + If True, concatenate along the last dimension of character arrays to + form string arrays. Dimensions will only be concatenated over (and + removed) if they have no corresponding variable and if they are only + used as the last dimension of character arrays. + This keyword may not be supported by all the backends. + decode_coords : bool or {"coordinates", "all"}, optional + Controls which variables are set as coordinate variables: + + - "coordinates" or True: Set variables referred to in the + ``'coordinates'`` attribute of the datasets or individual variables + as coordinate variables. + - "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and + other attributes as coordinate variables. + drop_variables: str or iterable, optional + A variable or list of variables to exclude from the dataset parsing. + This may be useful to drop variables with problems or + inconsistent values. + backend_kwargs: + Additional keyword arguments passed on to the engine open function. + **kwargs: dict + Additional keyword arguments passed on to the engine open function. + For example: + + - 'group': path to the netCDF4 group in the given file to open given as + a str,supported by "netcdf4", "h5netcdf", "zarr". + + - 'lock': resource lock to use when reading data from disk. 
Only + relevant when using dask or another form of parallelism. By default, + appropriate locks are chosen to safely read and write files with the + currently active dask scheduler. Supported by "netcdf4", "h5netcdf", + "pynio", "pseudonetcdf", "cfgrib". + + See engine open function for kwargs accepted by each specific engine. + Returns ------- @@ -427,159 +530,67 @@ def open_dataset( -------- open_mfdataset """ - if os.environ.get("XARRAY_BACKEND_API", "v1") == "v2": - kwargs = {k: v for k, v in locals().items() if v is not None} - from . import apiv2 - - return apiv2.open_dataset(**kwargs) - - if mask_and_scale is None: - mask_and_scale = not engine == "pseudonetcdf" - - if not decode_cf: - mask_and_scale = False - decode_times = False - concat_characters = False - decode_coords = False - decode_timedelta = False if cache is None: cache = chunks is None - if backend_kwargs is None: - backend_kwargs = {} - - def maybe_decode_store(store, chunks): - ds = conventions.decode_cf( - store, - mask_and_scale=mask_and_scale, - decode_times=decode_times, - concat_characters=concat_characters, - decode_coords=decode_coords, - drop_variables=drop_variables, - use_cftime=use_cftime, - decode_timedelta=decode_timedelta, - ) - - _protect_dataset_variables_inplace(ds, cache) - - if chunks is not None and engine != "zarr": - from dask.base import tokenize - - # if passed an actual file path, augment the token with - # the file modification time - if isinstance(filename_or_obj, str) and not is_remote_uri(filename_or_obj): - mtime = os.path.getmtime(filename_or_obj) - else: - mtime = None - token = tokenize( - filename_or_obj, - mtime, - group, - decode_cf, - mask_and_scale, - decode_times, - concat_characters, - decode_coords, - engine, - chunks, - drop_variables, - use_cftime, - decode_timedelta, - ) - name_prefix = "open_dataset-%s" % token - ds2 = ds.chunk(chunks, name_prefix=name_prefix, token=token) - - elif engine == "zarr": - # adapted from Dataset.Chunk() and taken from open_zarr - if not (isinstance(chunks, (int, dict)) or chunks is None): - if chunks != "auto": - raise ValueError( - "chunks must be an int, dict, 'auto', or None. " - "Instead found %s. 
" % chunks - ) - - if chunks == "auto": - try: - import dask.array # noqa - except ImportError: - chunks = None - - # auto chunking needs to be here and not in ZarrStore because - # the variable chunks does not survive decode_cf - # return trivial case - if chunks is None: - return ds - - if isinstance(chunks, int): - chunks = dict.fromkeys(ds.dims, chunks) - - variables = { - k: _maybe_chunk( - k, - v, - _get_chunk(v, chunks), - overwrite_encoded_chunks=overwrite_encoded_chunks, - ) - for k, v in ds.variables.items() - } - ds2 = ds._replace(variables) + if backend_kwargs is not None: + kwargs.update(backend_kwargs) - else: - ds2 = ds - ds2.set_close(ds._close) - return ds2 + if engine is None: + engine = plugins.guess_engine(filename_or_obj) - filename_or_obj = _normalize_path(filename_or_obj) - - if isinstance(filename_or_obj, AbstractDataStore): - store = filename_or_obj - else: - if engine is None: - engine = _autodetect_engine(filename_or_obj) - - extra_kwargs = {} - if group is not None: - extra_kwargs["group"] = group - if lock is not None: - extra_kwargs["lock"] = lock - - if engine == "zarr": - backend_kwargs = backend_kwargs.copy() - overwrite_encoded_chunks = backend_kwargs.pop( - "overwrite_encoded_chunks", None - ) + backend = plugins.get_backend(engine) - opener = _get_backend_cls(engine) - store = opener(filename_or_obj, **extra_kwargs, **backend_kwargs) - - with close_on_error(store): - ds = maybe_decode_store(store, chunks) + decoders = _resolve_decoders_kwargs( + decode_cf, + open_backend_dataset_parameters=backend.open_dataset_parameters, + mask_and_scale=mask_and_scale, + decode_times=decode_times, + decode_timedelta=decode_timedelta, + concat_characters=concat_characters, + use_cftime=use_cftime, + decode_coords=decode_coords, + ) - # Ensure source filename always stored in dataset object (GH issue #2550) - if "source" not in ds.encoding: - if isinstance(filename_or_obj, str): - ds.encoding["source"] = filename_or_obj + overwrite_encoded_chunks = kwargs.pop("overwrite_encoded_chunks", None) + backend_ds = backend.open_dataset( + filename_or_obj, + drop_variables=drop_variables, + **decoders, + **kwargs, + ) + ds = _dataset_from_backend_dataset( + backend_ds, + filename_or_obj, + engine, + chunks, + cache, + overwrite_encoded_chunks, + drop_variables=drop_variables, + **decoders, + **kwargs, + ) return ds def open_dataarray( filename_or_obj, - group=None, - decode_cf=True, - mask_and_scale=None, - decode_times=True, - concat_characters=True, - decode_coords=True, + *, engine=None, chunks=None, - lock=None, cache=None, + decode_cf=None, + mask_and_scale=None, + decode_times=None, + decode_timedelta=None, + use_cftime=None, + concat_characters=None, + decode_coords=None, drop_variables=None, backend_kwargs=None, - use_cftime=None, - decode_timedelta=None, + **kwargs, ): """Open an DataArray from a file or file-like object containing a single data variable. 
@@ -688,7 +699,6 @@ def open_dataarray( dataset = open_dataset( filename_or_obj, - group=group, decode_cf=decode_cf, mask_and_scale=mask_and_scale, decode_times=decode_times, @@ -696,12 +706,12 @@ def open_dataarray( decode_coords=decode_coords, engine=engine, chunks=chunks, - lock=lock, cache=cache, drop_variables=drop_variables, backend_kwargs=backend_kwargs, use_cftime=use_cftime, decode_timedelta=decode_timedelta, + **kwargs ) if len(dataset.data_vars) != 1: @@ -734,7 +744,6 @@ def open_mfdataset( compat="no_conflicts", preprocess=None, engine=None, - lock=None, data_vars="all", coords="different", combine="by_coords", @@ -804,11 +813,6 @@ def open_mfdataset( Engine to use when reading files. If not provided, the default engine is chosen based on available dependencies, with a preference for "netcdf4". - lock : False or lock-like, optional - Resource lock to use when reading data from disk. Only relevant when - using dask or another form of parallelism. By default, appropriate - locks are chosen to safely read and write files with the currently - active dask scheduler. data_vars : {"minimal", "different", "all"} or list of str, optional These data variables will be concatenated together: * "minimal": Only data variables in which the dimension already @@ -923,7 +927,7 @@ def open_mfdataset( combined_ids_paths = _infer_concat_order_from_positions(paths) ids, paths = (list(combined_ids_paths.keys()), list(combined_ids_paths.values())) - open_kwargs = dict(engine=engine, chunks=chunks or {}, lock=lock, **kwargs) + open_kwargs = dict(engine=engine, chunks=chunks or {}, **kwargs) if parallel: import dask diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py deleted file mode 100644 index de1b3e1bb29..00000000000 --- a/xarray/backends/apiv2.py +++ /dev/null @@ -1,286 +0,0 @@ -import os - -from ..core import indexing -from ..core.dataset import _get_chunk, _maybe_chunk -from ..core.utils import is_remote_uri -from . import plugins - - -def _protect_dataset_variables_inplace(dataset, cache): - for name, variable in dataset.variables.items(): - if name not in variable.dims: - # no need to protect IndexVariable objects - data = indexing.CopyOnWriteArray(variable._data) - if cache: - data = indexing.MemoryCachedArray(data) - variable.data = data - - -def _get_mtime(filename_or_obj): - # if passed an actual file path, augment the token with - # the file modification time - mtime = None - - try: - path = os.fspath(filename_or_obj) - except TypeError: - path = None - - if path and not is_remote_uri(path): - mtime = os.path.getmtime(filename_or_obj) - - return mtime - - -def _chunk_ds( - backend_ds, - filename_or_obj, - engine, - chunks, - overwrite_encoded_chunks, - **extra_tokens, -): - from dask.base import tokenize - - mtime = _get_mtime(filename_or_obj) - token = tokenize(filename_or_obj, mtime, engine, chunks, **extra_tokens) - name_prefix = "open_dataset-%s" % token - - variables = {} - for name, var in backend_ds.variables.items(): - var_chunks = _get_chunk(var, chunks) - variables[name] = _maybe_chunk( - name, - var, - var_chunks, - overwrite_encoded_chunks=overwrite_encoded_chunks, - name_prefix=name_prefix, - token=token, - ) - ds = backend_ds._replace(variables) - return ds - - -def _dataset_from_backend_dataset( - backend_ds, - filename_or_obj, - engine, - chunks, - cache, - overwrite_encoded_chunks, - **extra_tokens, -): - if not (isinstance(chunks, (int, dict)) or chunks is None): - if chunks != "auto": - raise ValueError( - "chunks must be an int, dict, 'auto', or None. 
" - "Instead found %s. " % chunks - ) - - _protect_dataset_variables_inplace(backend_ds, cache) - if chunks is None: - ds = backend_ds - else: - ds = _chunk_ds( - backend_ds, - filename_or_obj, - engine, - chunks, - overwrite_encoded_chunks, - **extra_tokens, - ) - - ds.set_close(backend_ds._close) - - # Ensure source filename always stored in dataset object (GH issue #2550) - if "source" not in ds.encoding: - if isinstance(filename_or_obj, str): - ds.encoding["source"] = filename_or_obj - - return ds - - -def _resolve_decoders_kwargs(decode_cf, open_backend_dataset_parameters, **decoders): - for d in list(decoders): - if decode_cf is False and d in open_backend_dataset_parameters: - decoders[d] = False - if decoders[d] is None: - decoders.pop(d) - return decoders - - -def open_dataset( - filename_or_obj, - *, - engine=None, - chunks=None, - cache=None, - decode_cf=None, - mask_and_scale=None, - decode_times=None, - decode_timedelta=None, - use_cftime=None, - concat_characters=None, - decode_coords=None, - drop_variables=None, - backend_kwargs=None, - **kwargs, -): - """Open and decode a dataset from a file or file-like object. - - Parameters - ---------- - filename_or_obj : str, Path, file-like or DataStore - Strings and Path objects are interpreted as a path to a netCDF file - or an OpenDAP URL and opened with python-netCDF4, unless the filename - ends with .gz, in which case the file is unzipped and opened with - scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like - objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF). - engine : str, optional - Engine to use when reading files. If not provided, the default engine - is chosen based on available dependencies, with a preference for - "netcdf4". Options are: {"netcdf4", "scipy", "pydap", "h5netcdf",\ - "pynio", "cfgrib", "pseudonetcdf", "zarr"}. - chunks : int or dict, optional - If chunks is provided, it is used to load the new dataset into dask - arrays. ``chunks=-1`` loads the dataset with dask using a single - chunk for all arrays. `chunks={}`` loads the dataset with dask using - engine preferred chunks if exposed by the backend, otherwise with - a single chunk for all arrays. - ``chunks='auto'`` will use dask ``auto`` chunking taking into account the - engine preferred chunks. See dask chunking for more details. - cache : bool, optional - If True, cache data is loaded from the underlying datastore in memory as - NumPy arrays when accessed to avoid reading from the underlying data- - store multiple times. Defaults to True unless you specify the `chunks` - argument to use dask, in which case it defaults to False. Does not - change the behavior of coordinates corresponding to dimensions, which - always load their data from disk into a ``pandas.Index``. - decode_cf : bool, optional - Setting ``decode_cf=False`` will disable ``mask_and_scale``, - ``decode_times``, ``decode_timedelta``, ``concat_characters``, - ``decode_coords``. - mask_and_scale : bool, optional - If True, array values equal to `_FillValue` are replaced with NA and other - values are scaled according to the formula `original_values * scale_factor + - add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are - taken from variable attributes (if they exist). If the `_FillValue` or - `missing_value` attribute contains multiple values, a warning will be - issued and all array values matching one of the multiple values will - be replaced by NA. mask_and_scale defaults to True except for the - pseudonetcdf backend. 
This keyword may not be supported by all the backends. - decode_times : bool, optional - If True, decode times encoded in the standard NetCDF datetime format - into datetime objects. Otherwise, leave them encoded as numbers. - This keyword may not be supported by all the backends. - decode_timedelta : bool, optional - If True, decode variables and coordinates with time units in - {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"} - into timedelta objects. If False, they remain encoded as numbers. - If None (default), assume the same value of decode_time. - This keyword may not be supported by all the backends. - use_cftime: bool, optional - Only relevant if encoded dates come from a standard calendar - (e.g. "gregorian", "proleptic_gregorian", "standard", or not - specified). If None (default), attempt to decode times to - ``np.datetime64[ns]`` objects; if this is not possible, decode times to - ``cftime.datetime`` objects. If True, always decode times to - ``cftime.datetime`` objects, regardless of whether or not they can be - represented using ``np.datetime64[ns]`` objects. If False, always - decode times to ``np.datetime64[ns]`` objects; if this is not possible - raise an error. This keyword may not be supported by all the backends. - concat_characters : bool, optional - If True, concatenate along the last dimension of character arrays to - form string arrays. Dimensions will only be concatenated over (and - removed) if they have no corresponding variable and if they are only - used as the last dimension of character arrays. - This keyword may not be supported by all the backends. - decode_coords : bool or {"coordinates", "all"}, optional - Controls which variables are set as coordinate variables: - - - "coordinates" or True: Set variables referred to in the - ``'coordinates'`` attribute of the datasets or individual variables - as coordinate variables. - - "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and - other attributes as coordinate variables. - drop_variables: str or iterable, optional - A variable or list of variables to exclude from the dataset parsing. - This may be useful to drop variables with problems or - inconsistent values. - backend_kwargs: - Additional keyword arguments passed on to the engine open function. - **kwargs: dict - Additional keyword arguments passed on to the engine open function. - For example: - - - 'group': path to the netCDF4 group in the given file to open given as - a str,supported by "netcdf4", "h5netcdf", "zarr". - - - 'lock': resource lock to use when reading data from disk. Only - relevant when using dask or another form of parallelism. By default, - appropriate locks are chosen to safely read and write files with the - currently active dask scheduler. Supported by "netcdf4", "h5netcdf", - "pynio", "pseudonetcdf", "cfgrib". - - See engine open function for kwargs accepted by each specific engine. - - - Returns - ------- - dataset : Dataset - The newly created dataset. - - Notes - ----- - ``open_dataset`` opens the file with read-only access. When you modify - values of a Dataset, even one linked to files on disk, only the in-memory - copy you are manipulating in xarray is modified: the original file on disk - is never touched. 
- - See Also - -------- - open_mfdataset - """ - - if cache is None: - cache = chunks is None - - if backend_kwargs is not None: - kwargs.update(backend_kwargs) - - if engine is None: - engine = plugins.guess_engine(filename_or_obj) - - backend = plugins.get_backend(engine) - - decoders = _resolve_decoders_kwargs( - decode_cf, - open_backend_dataset_parameters=backend.open_dataset_parameters, - mask_and_scale=mask_and_scale, - decode_times=decode_times, - decode_timedelta=decode_timedelta, - concat_characters=concat_characters, - use_cftime=use_cftime, - decode_coords=decode_coords, - ) - - overwrite_encoded_chunks = kwargs.pop("overwrite_encoded_chunks", None) - backend_ds = backend.open_dataset( - filename_or_obj, - drop_variables=drop_variables, - **decoders, - **kwargs, - ) - ds = _dataset_from_backend_dataset( - backend_ds, - filename_or_obj, - engine, - chunks, - cache, - overwrite_encoded_chunks, - drop_variables=drop_variables, - **decoders, - **kwargs, - ) - - return ds From 93a14ad9c15a398d16c6adbccbd14c076c774b2c Mon Sep 17 00:00:00 2001 From: Alessandro Amici Date: Wed, 3 Mar 2021 18:02:30 +0100 Subject: [PATCH 02/12] Remove APIv1 functions now unused --- xarray/backends/api.py | 61 +----------------------------------------- xarray/core/utils.py | 6 ----- 2 files changed, 1 insertion(+), 66 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index b13a45a0d72..f3ce1b0aaf1 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -26,7 +26,7 @@ ) from ..core.dataarray import DataArray from ..core.dataset import Dataset, _get_chunk, _maybe_chunk -from ..core.utils import is_grib_path, is_remote_uri, read_magic_number +from ..core.utils import is_remote_uri, read_magic_number from .common import AbstractDataStore, ArrayWriter from .locks import _get_scheduler from . import plugins @@ -71,26 +71,6 @@ def _get_default_engine_remote_uri(): return engine -def _get_default_engine_grib(): - msgs = [] - try: - import Nio # noqa: F401 - - msgs += ["set engine='pynio' to access GRIB files with PyNIO"] - except ImportError: # pragma: no cover - pass - try: - import cfgrib # noqa: F401 - - msgs += ["set engine='cfgrib' to access GRIB files with cfgrib"] - except ImportError: # pragma: no cover - pass - if msgs: - raise ValueError(" or\n".join(msgs)) - else: - raise ValueError("PyNIO or cfgrib is required for accessing GRIB files") - - def _get_default_engine_gz(): try: import scipy # noqa: F401 @@ -119,27 +99,9 @@ def _get_default_engine_netcdf(): return engine -def _get_engine_from_magic_number(filename_or_obj): - magic_number = read_magic_number(filename_or_obj) - - if magic_number.startswith(b"CDF"): - engine = "scipy" - elif magic_number.startswith(b"\211HDF\r\n\032\n"): - engine = "h5netcdf" - else: - raise ValueError( - "cannot guess the engine, " - f"{magic_number} is not the signature of any supported file format " - "did you mean to pass a string for a path instead?" 
- ) - return engine - - def _get_default_engine(path: str, allow_remote: bool = False): if allow_remote and is_remote_uri(path): engine = _get_default_engine_remote_uri() - elif is_grib_path(path): - engine = _get_default_engine_grib() elif path.endswith(".gz"): engine = _get_default_engine_gz() else: @@ -147,27 +109,6 @@ def _get_default_engine(path: str, allow_remote: bool = False): return engine -def _autodetect_engine(filename_or_obj): - if isinstance(filename_or_obj, AbstractDataStore): - engine = "store" - elif isinstance(filename_or_obj, (str, Path)): - engine = _get_default_engine(str(filename_or_obj), allow_remote=True) - else: - engine = _get_engine_from_magic_number(filename_or_obj) - return engine - - -def _get_backend_cls(engine, engines=ENGINES): - """Select open_dataset method based on current engine""" - try: - return engines[engine] - except KeyError: - raise ValueError( - "unrecognized engine for open_dataset: {}\n" - "must be one of: {}".format(engine, list(ENGINES)) - ) - - def _normalize_path(path): if isinstance(path, Path): path = str(path) diff --git a/xarray/core/utils.py b/xarray/core/utils.py index 9648458ec6d..f9c8523306c 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -4,7 +4,6 @@ import functools import io import itertools -import os.path import re import warnings from enum import Enum @@ -671,11 +670,6 @@ def read_magic_number(filename_or_obj, count=8): return magic_number -def is_grib_path(path: str) -> bool: - _, ext = os.path.splitext(path) - return ext in [".grib", ".grb", ".grib2", ".grb2"] - - def is_uniform_spaced(arr, **kwargs) -> bool: """Return True if values of an array are uniformly spaced and sorted. From 76bc93e919f4419c94ff633e2ddd1678b62b9e2f Mon Sep 17 00:00:00 2001 From: Alessandro Amici Date: Wed, 3 Mar 2021 18:06:51 +0100 Subject: [PATCH 03/12] Clean up imports --- xarray/backends/api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index f3ce1b0aaf1..776b083ba28 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -26,7 +26,7 @@ ) from ..core.dataarray import DataArray from ..core.dataset import Dataset, _get_chunk, _maybe_chunk -from ..core.utils import is_remote_uri, read_magic_number +from ..core.utils import is_remote_uri from .common import AbstractDataStore, ArrayWriter from .locks import _get_scheduler from . import plugins From 382ae5ecde05b5584a1520cb45d6a18c7d891898 Mon Sep 17 00:00:00 2001 From: Alessandro Amici Date: Wed, 3 Mar 2021 18:18:26 +0100 Subject: [PATCH 04/12] Fix code style --- xarray/backends/api.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 776b083ba28..92f1590ab2c 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -27,9 +27,9 @@ from ..core.dataarray import DataArray from ..core.dataset import Dataset, _get_chunk, _maybe_chunk from ..core.utils import is_remote_uri +from . import plugins from .common import AbstractDataStore, ArrayWriter from .locks import _get_scheduler -from . 
import plugins if TYPE_CHECKING: try: @@ -652,7 +652,7 @@ def open_dataarray( backend_kwargs=backend_kwargs, use_cftime=use_cftime, decode_timedelta=decode_timedelta, - **kwargs + **kwargs, ) if len(dataset.data_vars) != 1: From 6469008aa57f37d7103e0baca855e654f75ef4b1 Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Thu, 4 Mar 2021 07:34:09 +0100 Subject: [PATCH 05/12] fix open_dataset documentation --- xarray/backends/api.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 92f1590ab2c..92e987a0059 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -363,14 +363,14 @@ def open_dataset( filename_or_obj : str, Path, file-like or DataStore Strings and Path objects are interpreted as a path to a netCDF file or an OpenDAP URL and opened with python-netCDF4, unless the filename - ends with .gz, in which case the file is unzipped and opened with + ends with .gz, in which case the file is gunzipped and opened with scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF). - engine : str, optional + engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "pynio", "cfgrib", \ + "pseudonetcdf", "zarr"}, optional Engine to use when reading files. If not provided, the default engine is chosen based on available dependencies, with a preference for - "netcdf4". Options are: {"netcdf4", "scipy", "pydap", "h5netcdf",\ - "pynio", "cfgrib", "pseudonetcdf", "zarr"}. + "netcdf4". chunks : int or dict, optional If chunks is provided, it is used to load the new dataset into dask arrays. ``chunks=-1`` loads the dataset with dask using a single @@ -380,22 +380,21 @@ def open_dataset( ``chunks='auto'`` will use dask ``auto`` chunking taking into account the engine preferred chunks. See dask chunking for more details. cache : bool, optional - If True, cache data is loaded from the underlying datastore in memory as + If True, cache data loaded from the underlying datastore in memory as NumPy arrays when accessed to avoid reading from the underlying data- store multiple times. Defaults to True unless you specify the `chunks` argument to use dask, in which case it defaults to False. Does not change the behavior of coordinates corresponding to dimensions, which always load their data from disk into a ``pandas.Index``. decode_cf : bool, optional - Setting ``decode_cf=False`` will disable ``mask_and_scale``, - ``decode_times``, ``decode_timedelta``, ``concat_characters``, - ``decode_coords``. + Whether to decode these variables, assuming they were saved according + to CF conventions. mask_and_scale : bool, optional - If True, array values equal to `_FillValue` are replaced with NA and other - values are scaled according to the formula `original_values * scale_factor + + If True, replace array values equal to `_FillValue` with NA and scale + values according to the formula `original_values * scale_factor + add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are taken from variable attributes (if they exist). If the `_FillValue` or - `missing_value` attribute contains multiple values, a warning will be + `missing_value` attribute contains multiple values a warning will be issued and all array values matching one of the multiple values will be replaced by NA. mask_and_scale defaults to True except for the pseudonetcdf backend. This keyword may not be supported by all the backends. 
@@ -406,7 +405,7 @@ def open_dataset( decode_timedelta : bool, optional If True, decode variables and coordinates with time units in {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"} - into timedelta objects. If False, they remain encoded as numbers. + into timedelta objects. If False, leave them encoded as numbers. If None (default), assume the same value of decode_time. This keyword may not be supported by all the backends. use_cftime: bool, optional @@ -434,8 +433,8 @@ def open_dataset( - "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and other attributes as coordinate variables. drop_variables: str or iterable, optional - A variable or list of variables to exclude from the dataset parsing. - This may be useful to drop variables with problems or + A variable or list of variables to exclude from being parsed from the + dataset. This may be useful to drop variables with problems or inconsistent values. backend_kwargs: Additional keyword arguments passed on to the engine open function. From 41baee57a0db6ebd32d1bb10702dfbab3e6c0f39 Mon Sep 17 00:00:00 2001 From: Aureliana Barghini Date: Thu, 4 Mar 2021 15:22:06 +0100 Subject: [PATCH 06/12] fix open_dataarray doc --- xarray/backends/api.py | 104 ++++++++++++++++++++++------------------- 1 file changed, 56 insertions(+), 48 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 92e987a0059..9405e474900 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -541,14 +541,31 @@ def open_dataarray( Parameters ---------- filename_or_obj : str, Path, file-like or DataStore - Strings and Paths are interpreted as a path to a netCDF file or an - OpenDAP URL and opened with python-netCDF4, unless the filename ends - with .gz, in which case the file is gunzipped and opened with + Strings and Path objects are interpreted as a path to a netCDF file + or an OpenDAP URL and opened with python-netCDF4, unless the filename + ends with .gz, in which case the file is gunzipped and opened with scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF). - group : str, optional - Path to the netCDF4 group in the given file to open (only works for - netCDF4 files). + engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "pynio", "cfgrib", \ + "pseudonetcdf", "zarr"}, optional + Engine to use when reading files. If not provided, the default engine + is chosen based on available dependencies, with a preference for + "netcdf4". + chunks : int or dict, optional + If chunks is provided, it is used to load the new dataset into dask + arrays. ``chunks=-1`` loads the dataset with dask using a single + chunk for all arrays. `chunks={}`` loads the dataset with dask using + engine preferred chunks if exposed by the backend, otherwise with + a single chunk for all arrays. + ``chunks='auto'`` will use dask ``auto`` chunking taking into account the + engine preferred chunks. See dask chunking for more details. + cache : bool, optional + If True, cache data loaded from the underlying datastore in memory as + NumPy arrays when accessed to avoid reading from the underlying data- + store multiple times. Defaults to True unless you specify the `chunks` + argument to use dask, in which case it defaults to False. Does not + change the behavior of coordinates corresponding to dimensions, which + always load their data from disk into a ``pandas.Index``. 
decode_cf : bool, optional Whether to decode these variables, assuming they were saved according to CF conventions. @@ -560,15 +577,33 @@ def open_dataarray( `missing_value` attribute contains multiple values a warning will be issued and all array values matching one of the multiple values will be replaced by NA. mask_and_scale defaults to True except for the - pseudonetcdf backend. + pseudonetcdf backend. This keyword may not be supported by all the backends. decode_times : bool, optional If True, decode times encoded in the standard NetCDF datetime format into datetime objects. Otherwise, leave them encoded as numbers. + This keyword may not be supported by all the backends. + decode_timedelta : bool, optional + If True, decode variables and coordinates with time units in + {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"} + into timedelta objects. If False, leave them encoded as numbers. + If None (default), assume the same value of decode_time. + This keyword may not be supported by all the backends. + use_cftime: bool, optional + Only relevant if encoded dates come from a standard calendar + (e.g. "gregorian", "proleptic_gregorian", "standard", or not + specified). If None (default), attempt to decode times to + ``np.datetime64[ns]`` objects; if this is not possible, decode times to + ``cftime.datetime`` objects. If True, always decode times to + ``cftime.datetime`` objects, regardless of whether or not they can be + represented using ``np.datetime64[ns]`` objects. If False, always + decode times to ``np.datetime64[ns]`` objects; if this is not possible + raise an error. This keyword may not be supported by all the backends. concat_characters : bool, optional If True, concatenate along the last dimension of character arrays to form string arrays. Dimensions will only be concatenated over (and removed) if they have no corresponding variable and if they are only used as the last dimension of character arrays. + This keyword may not be supported by all the backends. decode_coords : bool or {"coordinates", "all"}, optional Controls which variables are set as coordinate variables: @@ -577,51 +612,24 @@ def open_dataarray( as coordinate variables. - "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and other attributes as coordinate variables. - engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "pynio", "cfgrib"}, \ - optional - Engine to use when reading files. If not provided, the default engine - is chosen based on available dependencies, with a preference for - "netcdf4". - chunks : int or dict, optional - If chunks is provided, it used to load the new dataset into dask - arrays. - lock : False or lock-like, optional - Resource lock to use when reading data from disk. Only relevant when - using dask or another form of parallelism. By default, appropriate - locks are chosen to safely read and write files with the currently - active dask scheduler. - cache : bool, optional - If True, cache data loaded from the underlying datastore in memory as - NumPy arrays when accessed to avoid reading from the underlying data- - store multiple times. Defaults to True unless you specify the `chunks` - argument to use dask, in which case it defaults to False. Does not - change the behavior of coordinates corresponding to dimensions, which - always load their data from disk into a ``pandas.Index``. drop_variables: str or iterable, optional A variable or list of variables to exclude from being parsed from the dataset. 
This may be useful to drop variables with problems or inconsistent values. - backend_kwargs: dict, optional - A dictionary of keyword arguments to pass on to the backend. This - may be useful when backend options would improve performance or - allow user control of dataset processing. If using fsspec URLs, - include the key "storage_options" to pass arguments to the - storage layer. - use_cftime: bool, optional - Only relevant if encoded dates come from a standard calendar - (e.g. "gregorian", "proleptic_gregorian", "standard", or not - specified). If None (default), attempt to decode times to - ``np.datetime64[ns]`` objects; if this is not possible, decode times to - ``cftime.datetime`` objects. If True, always decode times to - ``cftime.datetime`` objects, regardless of whether or not they can be - represented using ``np.datetime64[ns]`` objects. If False, always - decode times to ``np.datetime64[ns]`` objects; if this is not possible - raise an error. - decode_timedelta : bool, optional - If True, decode variables and coordinates with time units in - {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"} - into timedelta objects. If False, leave them encoded as numbers. - If None (default), assume the same value of decode_time. + backend_kwargs: + Additional keyword arguments passed on to the engine open function. + **kwargs: dict + Additional keyword arguments passed on to the engine open function. + For example: + + - 'group': path to the netCDF4 group in the given file to open given as + a str,supported by "netcdf4", "h5netcdf", "zarr". + + - 'lock': resource lock to use when reading data from disk. Only + relevant when using dask or another form of parallelism. By default, + appropriate locks are chosen to safely read and write files with the + currently active dask scheduler. Supported by "netcdf4", "h5netcdf", + "pynio", "pseudonetcdf", "cfgrib". Notes ----- From 0f9df59f4672d96c8a96aa711b2b925242066192 Mon Sep 17 00:00:00 2001 From: Alessandro Amici Date: Fri, 5 Mar 2021 12:13:09 +0100 Subject: [PATCH 07/12] Remove stale test on removed functions --- xarray/tests/test_backends_api.py | 5 ----- xarray/tests/test_utils.py | 9 --------- 2 files changed, 14 deletions(-) diff --git a/xarray/tests/test_backends_api.py b/xarray/tests/test_backends_api.py index d19f5aab585..340495d4564 100644 --- a/xarray/tests/test_backends_api.py +++ b/xarray/tests/test_backends_api.py @@ -1,5 +1,3 @@ -import pytest - from xarray.backends.api import _get_default_engine from . 
import requires_netCDF4, requires_scipy @@ -14,8 +12,5 @@ def test__get_default_engine(): engine_gz = _get_default_engine("/example.gz") assert engine_gz == "scipy" - with pytest.raises(ValueError): - _get_default_engine("/example.grib") - engine_default = _get_default_engine("/example") assert engine_default == "netcdf4" diff --git a/xarray/tests/test_utils.py b/xarray/tests/test_utils.py index 193c45f01cd..9d278a6cfb6 100644 --- a/xarray/tests/test_utils.py +++ b/xarray/tests/test_utils.py @@ -233,15 +233,6 @@ def test_is_remote_uri(): assert not utils.is_remote_uri("example.nc") -def test_is_grib_path(): - assert not utils.is_grib_path("example.nc") - assert not utils.is_grib_path("example.grib ") - assert utils.is_grib_path("example.grib") - assert utils.is_grib_path("example.grib2") - assert utils.is_grib_path("example.grb") - assert utils.is_grib_path("example.grb2") - - class Test_is_uniform_and_sorted: def test_sorted_uniform(self): assert utils.is_uniform_spaced(np.arange(5)) From a188b070d06848578c67c92bd05db76356e169de Mon Sep 17 00:00:00 2001 From: Alessandro Amici Date: Fri, 5 Mar 2021 12:55:26 +0100 Subject: [PATCH 08/12] Fix entry point definition --- xarray/backends/pynio_.py | 1 + 1 file changed, 1 insertion(+) diff --git a/xarray/backends/pynio_.py b/xarray/backends/pynio_.py index 8ace5697d09..5bcf285ee38 100644 --- a/xarray/backends/pynio_.py +++ b/xarray/backends/pynio_.py @@ -99,6 +99,7 @@ def close(self): class PynioBackendEntrypoint(BackendEntrypoint): def open_dataset( + self, filename_or_obj, mask_and_scale=True, decode_times=True, From 1316588b31329d2e92902ee0b75684bdfb7f2b6b Mon Sep 17 00:00:00 2001 From: Alessandro Amici Date: Fri, 5 Mar 2021 16:35:37 +0100 Subject: [PATCH 09/12] Fix API documentation errors. --- xarray/backends/api.py | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 9405e474900..3fd85a3ed31 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -366,7 +366,7 @@ def open_dataset( ends with .gz, in which case the file is gunzipped and opened with scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF). - engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "pynio", "cfgrib", \ + engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "pynio", "cfgrib", \ "pseudonetcdf", "zarr"}, optional Engine to use when reading files. If not provided, the default engine is chosen based on available dependencies, with a preference for @@ -436,24 +436,23 @@ def open_dataset( A variable or list of variables to exclude from being parsed from the dataset. This may be useful to drop variables with problems or inconsistent values. - backend_kwargs: - Additional keyword arguments passed on to the engine open function. + backend_kwargs: dict + Additional keyword arguments passed on to the engine open function, + equivalent to `**kwargs`. **kwargs: dict Additional keyword arguments passed on to the engine open function. For example: - 'group': path to the netCDF4 group in the given file to open given as - a str,supported by "netcdf4", "h5netcdf", "zarr". - + a str,supported by "netcdf4", "h5netcdf", "zarr". - 'lock': resource lock to use when reading data from disk. Only - relevant when using dask or another form of parallelism. By default, - appropriate locks are chosen to safely read and write files with the - currently active dask scheduler. 
Supported by "netcdf4", "h5netcdf", - "pynio", "pseudonetcdf", "cfgrib". + relevant when using dask or another form of parallelism. By default, + appropriate locks are chosen to safely read and write files with the + currently active dask scheduler. Supported by "netcdf4", "h5netcdf", + "pynio", "pseudonetcdf", "cfgrib". See engine open function for kwargs accepted by each specific engine. - Returns ------- dataset : Dataset @@ -546,7 +545,7 @@ def open_dataarray( ends with .gz, in which case the file is gunzipped and opened with scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF). - engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "pynio", "cfgrib", \ + engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "pynio", "cfgrib", \ "pseudonetcdf", "zarr"}, optional Engine to use when reading files. If not provided, the default engine is chosen based on available dependencies, with a preference for @@ -616,20 +615,22 @@ def open_dataarray( A variable or list of variables to exclude from being parsed from the dataset. This may be useful to drop variables with problems or inconsistent values. - backend_kwargs: - Additional keyword arguments passed on to the engine open function. + backend_kwargs: dict + Additional keyword arguments passed on to the engine open function, + equivalent to `**kwargs`. **kwargs: dict Additional keyword arguments passed on to the engine open function. For example: - 'group': path to the netCDF4 group in the given file to open given as - a str,supported by "netcdf4", "h5netcdf", "zarr". - + a str,supported by "netcdf4", "h5netcdf", "zarr". - 'lock': resource lock to use when reading data from disk. Only - relevant when using dask or another form of parallelism. By default, - appropriate locks are chosen to safely read and write files with the - currently active dask scheduler. Supported by "netcdf4", "h5netcdf", - "pynio", "pseudonetcdf", "cfgrib". + relevant when using dask or another form of parallelism. By default, + appropriate locks are chosen to safely read and write files with the + currently active dask scheduler. Supported by "netcdf4", "h5netcdf", + "pynio", "pseudonetcdf", "cfgrib". + + See engine open function for kwargs accepted by each specific engine. 
Notes ----- From 09d39a3adc3ca998688e4b653efbe084097a8907 Mon Sep 17 00:00:00 2001 From: Alessandro Amici Date: Fri, 5 Mar 2021 23:13:46 +0100 Subject: [PATCH 10/12] Add informative error message when using positional arguments --- xarray/backends/api.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 3fd85a3ed31..ca09d910416 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -341,7 +341,7 @@ def _dataset_from_backend_dataset( def open_dataset( filename_or_obj, - *, + *args, engine=None, chunks=None, cache=None, @@ -469,6 +469,11 @@ def open_dataset( -------- open_mfdataset """ + if len(args) > 0: + raise TypeError( + "open_dataset() takes only 1 positional argument starting from version 0.18.0, " + "all other options must be passed as keyword arguments" + ) if cache is None: cache = chunks is None From 1e936a152d83cc091b4d66a5505b16a278c4ade2 Mon Sep 17 00:00:00 2001 From: Alessandro Amici Date: Mon, 8 Mar 2021 22:54:28 +0100 Subject: [PATCH 11/12] Add `*args` check to `open_dataarray` as well --- xarray/backends/api.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index ca09d910416..382fad60acb 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -521,7 +521,7 @@ def open_dataset( def open_dataarray( filename_or_obj, - *, + *args, engine=None, chunks=None, cache=None, @@ -650,6 +650,11 @@ def open_dataarray( -------- open_dataset """ + if len(args) > 0: + raise TypeError( + "open_dataarray() takes only 1 positional argument starting from version 0.18.0, " + "all other options must be passed as keyword arguments" + ) dataset = open_dataset( filename_or_obj, From 205737a9dc89c8716c1b0be6cad1e3cc382e9f55 Mon Sep 17 00:00:00 2001 From: Alessandro Amici Date: Mon, 8 Mar 2021 22:55:02 +0100 Subject: [PATCH 12/12] Add whats-new entries --- doc/internals.rst | 1 + doc/whats-new.rst | 16 ++++++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/doc/internals.rst b/doc/internals.rst index 9998043b35b..f94371731de 100644 --- a/doc/internals.rst +++ b/doc/internals.rst @@ -249,6 +249,7 @@ re-open it directly with Zarr: print(zgroup.tree()) dict(zgroup["Tair"].attrs) +.. _add_a_backend: How to add a new backend ------------------------ diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 9c16fb74a7b..cbafe4d5999 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -27,15 +27,25 @@ New Features - Support for `dask.graph_manipulation `_ (requires dask >=2021.3) By `Guido Imperiale `_ +- Thanks to the new pluggable backend infrastructure external packages may now + use the ``xarray.backends`` entry point to register additional engines to be used in + :py:func:`open_dataset`, see the documentation in :ref:`add_a_backend` + (:issue:`4309`, :issue:`4803`, :pull:`4989`, :pull:`4810` and many others). + The backend refactor has been sponsored with the "Essential Open Source Software for Science" + grant from the `Chan Zuckerberg Initiative `_ and + developed by `B-Open `_. + By `Aureliana Barghini `_ and `Alessandro Amici `_. Breaking changes ~~~~~~~~~~~~~~~~ - +- :py:func:`open_dataset` and :py:func:`open_dataarray` now accept only the first argument + as positional, all others need to be passed are keyword arguments. This is part of the + refactor to support external backends (:issue:`4309`, :pull:`4989`). + By `Alessandro Amici `_. 
Deprecations
~~~~~~~~~~~~

-
Bug fixes
~~~~~~~~~
- Don't allow passing ``axis`` to :py:meth:`Dataset.reduce` methods (:issue:`3510`, :pull:`4940`).
@@ -43,6 +53,8 @@ Bug fixes

Documentation
~~~~~~~~~~~~~
+- New section on :ref:`add_a_backend` in the "Internals" chapter aimed at backend developers
+  (:issue:`4803`, :pull:`4810`). By `Aureliana Barghini `_.

Internal Changes
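
To make the breaking change above concrete, here is a minimal usage sketch of the keyword-only signature introduced by this series. The file name, group name and chunk settings are illustrative assumptions rather than part of the patch; the error text is quoted from the new check in ``open_dataset``::

    import xarray as xr

    # Everything after the path is keyword-only in the new API.
    ds = xr.open_dataset(
        "example.nc",          # hypothetical file
        engine="netcdf4",      # optional; otherwise guessed via plugins.guess_engine
        chunks={},             # use the backend's preferred chunks with dask
        decode_cf=True,
        group="model_output",  # backend-specific option, forwarded to the engine's open function
    )

    # Passing the old options positionally now fails loudly:
    #   xr.open_dataset("example.nc", None)
    # TypeError: open_dataset() takes only 1 positional argument starting from
    # version 0.18.0, all other options must be passed as keyword arguments

External engines hook into the same code path by registering a ``BackendEntrypoint`` subclass under the ``xarray.backends`` entry point (see the new :ref:`add_a_backend` section); the ``open_dataset_parameters`` attribute of that class is what ``_resolve_decoders_kwargs`` consults when ``decode_cf=False`` is passed.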