
Commit cc271e6

aurghs, TheRed86, and alexamici authored
WIP: Proposed refactor of read API for backends (#4477)
* add dispatching to the stub apiv2 in api.open_dataset
* remove the check for AbstractDataStore input in apiv2
* fix a typo
* add the engines kwarg to _get_backend_cls, needed by apiv2
* add alpha support for h5netcdf
* style: remove unused code, rename some variables/functions
* Add ENGINES entry for cfgrib.
* Define function open_backend_dataset_cfgrib() to be used in apiv2.py. Add necessary imports for this function.
* Apply black to check formatting.
* Apply black to check formatting.
* add dummy zarr apiv2 backend
* align apiv2.open_dataset with api.open_dataset
* remove unused extra_coords in open_backend_dataset_*
* remove extra_coords in open_backend_dataset_cfgrib
* turn zarr maybe_chunk and get_chunks into classmethods, to be used in apiv2 without instantiating the object
* get the alpha zarr apiv2 backend working
* refactor apiv2.open_dataset: modify the signature and move default handling into the backends
* move dataset_from_backend_dataset out of apiv2.open_dataset
* remove blank lines
* remove blank lines
* style
* Re-write error messages
* Fix code style
* Fix code style
* remove unused import
* replace warning with ValueError for unsupported kwargs in backends
* change zarr.ZarrStore.get_chunks into a static method
* group `backend_kwargs` and `kwargs` into the `extra_tokens` argument of `apiv2.dataset_from_backend_dataset`
* remove kwargs from the `open_backend_dataset_${engine}` signatures and the related error message
* black
* Try adding a CI strategy with an environment variable
* Try adding a CI strategy with an environment variable
* black

Co-authored-by: TheRed86 <[email protected]>
Co-authored-by: Alessandro Amici <[email protected]>
1 parent 2ce1cfc commit cc271e6
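
The changes below gate the new read path behind an environment variable. A minimal usage sketch of how a caller would opt in (not part of this commit; the file path is a placeholder and the cfgrib engine is assumed to be installed):

import os

os.environ["XARRAY_BACKEND_API"] = "v2"  # opt in to the experimental v2 read path

import xarray as xr

# "example.grib" is a placeholder; any engine listed in apiv2.ENGINES
# ("h5netcdf", "zarr", "cfgrib") is routed through apiv2.open_dataset.
ds = xr.open_dataset("example.grib", engine="cfgrib")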


7 files changed: +447 −8 lines changed


azure-pipelines.yml

Lines changed: 3 additions & 0 deletions
@@ -20,6 +20,9 @@ jobs:
         conda_env: py37
       py38:
         conda_env: py38
+      py38-backend-api-v2:
+        conda_env: py38
+        environment_variables: XARRAY_BACKEND_API=v2
       py38-all-but-dask:
         conda_env: py38-all-but-dask
       py38-upstream-dev:

ci/azure/unit-tests.yml

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ steps:
 # https://github.com/microsoft/azure-pipelines-tasks/issues/9302
 - bash: |
     source activate xarray-tests
-    pytest \
+    $(environment_variables) pytest \
       --junitxml=junit/test-results.xml \
       --cov=xarray \
       --cov-report=xml \
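
The new matrix entry sets environment_variables to XARRAY_BACKEND_API=v2, and prepending it to the pytest command makes the variable active only for that job. To reproduce the v2 job locally, the same variable can be set around the test run; a hedged sketch using pytest's Python entry point (the test path is a placeholder):

import os
import pytest

# Switch the read API to the experimental v2 path for this process only.
os.environ["XARRAY_BACKEND_API"] = "v2"
pytest.main(["xarray/tests/test_backends.py", "-x"])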

xarray/backends/api.py

Lines changed: 10 additions & 3 deletions
@@ -1,4 +1,4 @@
-import os.path
+import os
 import warnings
 from glob import glob
 from io import BytesIO
@@ -163,10 +163,10 @@ def _autodetect_engine(filename_or_obj):
     return engine


-def _get_backend_cls(engine):
+def _get_backend_cls(engine, engines=ENGINES):
     """Select open_dataset method based on current engine"""
     try:
-        return ENGINES[engine]
+        return engines[engine]
     except KeyError:
         raise ValueError(
             "unrecognized engine for open_dataset: {}\n"
@@ -432,6 +432,13 @@ def open_dataset(
     --------
     open_mfdataset
     """
+    if os.environ.get("XARRAY_BACKEND_API", "v1") == "v2":
+        kwargs = locals().copy()
+        from . import apiv2
+
+        if engine in apiv2.ENGINES:
+            return apiv2.open_dataset(**kwargs)
+
     if autoclose is not None:
         warnings.warn(
             "The autoclose argument is no longer used by "

xarray/backends/apiv2.py

Lines changed: 224 additions & 0 deletions
@@ -0,0 +1,224 @@
+import os
+
+from ..core.utils import is_remote_uri
+from . import cfgrib_, h5netcdf_, zarr
+from .api import (
+    _autodetect_engine,
+    _get_backend_cls,
+    _normalize_path,
+    _protect_dataset_variables_inplace,
+)
+
+ENGINES = {
+    "h5netcdf": h5netcdf_.open_backend_dataset_h5necdf,
+    "zarr": zarr.open_backend_dataset_zarr,
+    "cfgrib": cfgrib_.open_backend_dataset_cfgrib,
+}
+
+
+def dataset_from_backend_dataset(
+    ds,
+    filename_or_obj,
+    engine,
+    chunks,
+    cache,
+    overwrite_encoded_chunks,
+    extra_tokens,
+):
+    if not (isinstance(chunks, (int, dict)) or chunks is None):
+        if chunks != "auto":
+            raise ValueError(
+                "chunks must be an int, dict, 'auto', or None. "
+                "Instead found %s. " % chunks
+            )
+
+    _protect_dataset_variables_inplace(ds, cache)
+    if chunks is not None and engine != "zarr":
+        from dask.base import tokenize
+
+        # if passed an actual file path, augment the token with
+        # the file modification time
+        if isinstance(filename_or_obj, str) and not is_remote_uri(filename_or_obj):
+            mtime = os.path.getmtime(filename_or_obj)
+        else:
+            mtime = None
+        token = tokenize(filename_or_obj, mtime, engine, chunks, **extra_tokens)
+        name_prefix = "open_dataset-%s" % token
+        ds2 = ds.chunk(chunks, name_prefix=name_prefix, token=token)
+
+    elif engine == "zarr":
+
+        if chunks == "auto":
+            try:
+                import dask.array  # noqa
+            except ImportError:
+                chunks = None
+
+        if chunks is None:
+            return ds
+
+        if isinstance(chunks, int):
+            chunks = dict.fromkeys(ds.dims, chunks)
+
+        variables = {
+            k: zarr.ZarrStore.maybe_chunk(k, v, chunks, overwrite_encoded_chunks)
+            for k, v in ds.variables.items()
+        }
+        ds2 = ds._replace(variables)
+
+    else:
+        ds2 = ds
+    ds2._file_obj = ds._file_obj
+
+    # Ensure source filename always stored in dataset object (GH issue #2550)
+    if "source" not in ds.encoding:
+        if isinstance(filename_or_obj, str):
+            ds.encoding["source"] = filename_or_obj
+
+    return ds2
+
+
+def open_dataset(
+    filename_or_obj,
+    *,
+    engine=None,
+    chunks=None,
+    cache=None,
+    backend_kwargs=None,
+    **kwargs,
+):
+    """Open and decode a dataset from a file or file-like object.
+
+    Parameters
+    ----------
+    filename_or_obj : str, Path, file-like or DataStore
+        Strings and Path objects are interpreted as a path to a netCDF file
+        or an OpenDAP URL and opened with python-netCDF4, unless the filename
+        ends with .gz, in which case the file is gunzipped and opened with
+        scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like
+        objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF).
+    group : str, optional
+        Path to the netCDF4 group in the given file to open (only works for
+        netCDF4 files).
+    decode_cf : bool, optional
+        Whether to decode these variables, assuming they were saved according
+        to CF conventions.
+    mask_and_scale : bool, optional
+        If True, replace array values equal to `_FillValue` with NA and scale
+        values according to the formula `original_values * scale_factor +
+        add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are
+        taken from variable attributes (if they exist). If the `_FillValue` or
+        `missing_value` attribute contains multiple values a warning will be
+        issued and all array values matching one of the multiple values will
+        be replaced by NA. mask_and_scale defaults to True except for the
+        pseudonetcdf backend.
+    decode_times : bool, optional
+        If True, decode times encoded in the standard NetCDF datetime format
+        into datetime objects. Otherwise, leave them encoded as numbers.
+    autoclose : bool, optional
+        If True, automatically close files to avoid OS Error of too many files
+        being open. However, this option doesn't work with streams, e.g.,
+        BytesIO.
+    concat_characters : bool, optional
+        If True, concatenate along the last dimension of character arrays to
+        form string arrays. Dimensions will only be concatenated over (and
+        removed) if they have no corresponding variable and if they are only
+        used as the last dimension of character arrays.
+    decode_coords : bool, optional
+        If True, decode the 'coordinates' attribute to identify coordinates in
+        the resulting dataset.
+    engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "pynio", "cfgrib", \
+        "pseudonetcdf", "zarr"}, optional
+        Engine to use when reading files. If not provided, the default engine
+        is chosen based on available dependencies, with a preference for
+        "netcdf4".
+    chunks : int or dict, optional
+        If chunks is provided, it is used to load the new dataset into dask
+        arrays. ``chunks={}`` loads the dataset with dask using a single
+        chunk for all arrays. When using ``engine="zarr"``, setting
+        ``chunks='auto'`` will create dask chunks based on the variable's zarr
+        chunks.
+    lock : False or lock-like, optional
+        Resource lock to use when reading data from disk. Only relevant when
+        using dask or another form of parallelism. By default, appropriate
+        locks are chosen to safely read and write files with the currently
+        active dask scheduler.
+    cache : bool, optional
+        If True, cache data loaded from the underlying datastore in memory as
+        NumPy arrays when accessed to avoid reading from the underlying data-
+        store multiple times. Defaults to True unless you specify the `chunks`
+        argument to use dask, in which case it defaults to False. Does not
+        change the behavior of coordinates corresponding to dimensions, which
+        always load their data from disk into a ``pandas.Index``.
+    drop_variables: str or iterable, optional
+        A variable or list of variables to exclude from being parsed from the
+        dataset. This may be useful to drop variables with problems or
+        inconsistent values.
+    backend_kwargs: dict, optional
+        A dictionary of keyword arguments to pass on to the backend. This
+        may be useful when backend options would improve performance or
+        allow user control of dataset processing.
+    use_cftime: bool, optional
+        Only relevant if encoded dates come from a standard calendar
+        (e.g. "gregorian", "proleptic_gregorian", "standard", or not
+        specified). If None (default), attempt to decode times to
+        ``np.datetime64[ns]`` objects; if this is not possible, decode times to
+        ``cftime.datetime`` objects. If True, always decode times to
+        ``cftime.datetime`` objects, regardless of whether or not they can be
+        represented using ``np.datetime64[ns]`` objects. If False, always
+        decode times to ``np.datetime64[ns]`` objects; if this is not possible
+        raise an error.
+    decode_timedelta : bool, optional
+        If True, decode variables and coordinates with time units in
+        {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"}
+        into timedelta objects. If False, leave them encoded as numbers.
+        If None (default), assume the same value of decode_times.
+
+    Returns
+    -------
+    dataset : Dataset
+        The newly created dataset.
+
+    Notes
+    -----
+    ``open_dataset`` opens the file with read-only access. When you modify
+    values of a Dataset, even one linked to files on disk, only the in-memory
+    copy you are manipulating in xarray is modified: the original file on disk
+    is never touched.
+
+    See Also
+    --------
+    open_mfdataset
+    """
+
+    if cache is None:
+        cache = chunks is None
+
+    if backend_kwargs is None:
+        backend_kwargs = {}
+
+    filename_or_obj = _normalize_path(filename_or_obj)
+
+    if engine is None:
+        engine = _autodetect_engine(filename_or_obj)
+
+    backend_kwargs = backend_kwargs.copy()
+    overwrite_encoded_chunks = backend_kwargs.pop("overwrite_encoded_chunks", None)
+
+    open_backend_dataset = _get_backend_cls(engine, engines=ENGINES)
+    backend_ds = open_backend_dataset(
+        filename_or_obj,
+        **backend_kwargs,
+        **{k: v for k, v in kwargs.items() if v is not None},
+    )
+    ds = dataset_from_backend_dataset(
+        backend_ds,
+        filename_or_obj,
+        engine,
+        chunks,
+        cache,
+        overwrite_encoded_chunks,
+        {**backend_kwargs, **kwargs},
+    )
+
+    return ds
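
For reference, the new entry point can also be invoked directly, although it is internal and the intended route is xarray.open_dataset with XARRAY_BACKEND_API=v2. A sketch with a placeholder path, assuming a local zarr store exists and dask is installed so that chunks="auto" takes effect:

from xarray.backends import apiv2

# Decoding keywords passed as None are filtered out before reaching the
# backend, so the backend's own defaults apply.
ds = apiv2.open_dataset("store.zarr", engine="zarr", chunks="auto")
print(ds)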

xarray/backends/cfgrib_.py

Lines changed: 66 additions & 1 deletion
@@ -1,7 +1,9 @@
 import numpy as np

+from .. import conventions
 from ..core import indexing
-from ..core.utils import Frozen, FrozenDict
+from ..core.dataset import Dataset
+from ..core.utils import Frozen, FrozenDict, close_on_error
 from ..core.variable import Variable
 from .common import AbstractDataStore, BackendArray
 from .locks import SerializableLock, ensure_lock
@@ -69,3 +71,66 @@ def get_encoding(self):
         dims = self.get_dimensions()
         encoding = {"unlimited_dims": {k for k, v in dims.items() if v is None}}
         return encoding
+
+
+def open_backend_dataset_cfgrib(
+    filename_or_obj,
+    *,
+    decode_cf=True,
+    mask_and_scale=True,
+    decode_times=None,
+    concat_characters=None,
+    decode_coords=None,
+    drop_variables=None,
+    use_cftime=None,
+    decode_timedelta=None,
+    lock=None,
+    indexpath="{path}.{short_hash}.idx",
+    filter_by_keys={},
+    read_keys=[],
+    encode_cf=("parameter", "time", "geography", "vertical"),
+    squeeze=True,
+    time_dims=("time", "step"),
+):
+
+    if not decode_cf:
+        mask_and_scale = False
+        decode_times = False
+        concat_characters = False
+        decode_coords = False
+        decode_timedelta = False
+
+    store = CfGribDataStore(
+        filename_or_obj,
+        indexpath=indexpath,
+        filter_by_keys=filter_by_keys,
+        read_keys=read_keys,
+        encode_cf=encode_cf,
+        squeeze=squeeze,
+        time_dims=time_dims,
+        lock=lock,
+    )
+
+    with close_on_error(store):
+        vars, attrs = store.load()
+        file_obj = store
+        encoding = store.get_encoding()
+
+        vars, attrs, coord_names = conventions.decode_cf_variables(
+            vars,
+            attrs,
+            mask_and_scale=mask_and_scale,
+            decode_times=decode_times,
+            concat_characters=concat_characters,
+            decode_coords=decode_coords,
+            drop_variables=drop_variables,
+            use_cftime=use_cftime,
+            decode_timedelta=decode_timedelta,
+        )
+
+        ds = Dataset(vars, attrs=attrs)
+        ds = ds.set_coords(coord_names.intersection(vars))
+        ds._file_obj = file_obj
+        ds.encoding = encoding
+
+    return ds
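
The backend entry point returns a decoded Dataset and leaves the chunking and caching logic to apiv2.open_dataset. A sketch of calling it on its own, assuming cfgrib is installed and using "data.grib" as a placeholder path:

from xarray.backends.cfgrib_ import open_backend_dataset_cfgrib

ds = open_backend_dataset_cfgrib("data.grib", squeeze=True, time_dims=("time", "step"))
print(ds.encoding)  # includes "unlimited_dims" from CfGribDataStore.get_encoding()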
