Open dataset cf options conflict solving #3544

Open · wants to merge 2 commits into main
xarray/backends/api.py: 62 additions & 13 deletions
@@ -287,12 +287,12 @@ def load_dataarray(filename_or_obj, **kwargs):
 def open_dataset(
     filename_or_obj,
     group=None,
-    decode_cf=True,
+    decode_cf=None,
     mask_and_scale=None,
-    decode_times=True,
+    decode_times=None,
     autoclose=None,
-    concat_characters=True,
-    decode_coords=True,
+    concat_characters=None,
+    decode_coords=None,
     engine=None,
     chunks=None,
     lock=None,
@@ -316,19 +316,27 @@ def open_dataset(
         netCDF4 files).
     decode_cf : bool, optional
         Whether to decode these variables, assuming they were saved according
-        to CF conventions.
+        to CF conventions. Defaults to None, which means variables are
+        decoded while any of the arguments mask_and_scale, decode_times,
+        concat_characters, or decode_coords may still be deactivated
+        individually. If decode_cf is explicitly set to True or False, the
+        four options mask_and_scale, decode_times, concat_characters, and
+        decode_coords are set likewise. A ValueError is raised if decode_cf
+        conflicts with any of these four options set explicitly.
     mask_and_scale : bool, optional
         If True, replace array values equal to `_FillValue` with NA and scale
         values according to the formula `original_values * scale_factor +
         add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are
         taken from variable attributes (if they exist). If the `_FillValue` or
         `missing_value` attribute contains multiple values a warning will be
         issued and all array values matching one of the multiple values will
-        be replaced by NA. mask_and_scale defaults to True except for the
-        pseudonetcdf backend.
+        be replaced by NA. mask_and_scale defaults to None, which means True
+        for every backend except pseudonetcdf.
     decode_times : bool, optional
         If True, decode times encoded in the standard NetCDF datetime format
         into datetime objects. Otherwise, leave them encoded as numbers.
+        Defaults to None, which means times are decoded unless decode_cf is
+        False, in which case they are left encoded as numbers.
     autoclose : bool, optional
         If True, automatically close files to avoid OS Error of too many files
         being open. However, this option doesn't work with streams, e.g.,
@@ -337,10 +345,13 @@ def open_dataset(
         If True, concatenate along the last dimension of character arrays to
         form string arrays. Dimensions will only be concatenated over (and
         removed) if they have no corresponding variable and if they are only
-        used as the last dimension of character arrays.
+        used as the last dimension of character arrays. Defaults to None,
+        which means characters are concatenated unless decode_cf is False,
+        in which case they are left unconcatenated.
     decode_coords : bool, optional
         If True, decode the 'coordinates' attribute to identify coordinates in
-        the resulting dataset.
+        the resulting dataset. Defaults to None, which means coordinates
+        are decoded unless decode_cf is False.
     engine : {'netcdf4', 'scipy', 'pydap', 'h5netcdf', 'pynio', 'cfgrib', \
         'pseudonetcdf'}, optional
         Engine to use when reading files. If not provided, the default engine
@@ -424,15 +435,53 @@ def open_dataset(
             stacklevel=2,
         )
 
-    if mask_and_scale is None:
-        mask_and_scale = not engine == "pseudonetcdf"
-
-    if not decode_cf:
+    if decode_cf:
+        if mask_and_scale is not None and not mask_and_scale:
+            raise ValueError(
+                "cannot deactivate mask_and_scale if decode_cf=True is explicitly set"
+            )
+        if decode_times is not None and not decode_times:
+            raise ValueError(
+                "cannot deactivate decode_times if decode_cf=True is explicitly set"
+            )
+        if concat_characters is not None and not concat_characters:
+            raise ValueError(
+                "cannot deactivate concat_characters if "
+                "decode_cf=True is explicitly set"
+            )
+        if decode_coords is not None and not decode_coords:
+            raise ValueError(
+                "cannot deactivate decode_coords if decode_cf=True is explicitly set"
+            )
+    elif decode_cf is None:
+        # default: decode in general, but let individual options be turned off
+        decode_cf = True
+    elif not decode_cf:
+        if mask_and_scale:
+            raise ValueError("cannot use mask_and_scale if decode_cf=False")
         mask_and_scale = False
+        if decode_times:
+            raise ValueError("cannot use decode_times if decode_cf=False")
         decode_times = False
+        if concat_characters:
+            raise ValueError("cannot use concat_characters if decode_cf=False")
         concat_characters = False
+        if decode_coords:
+            raise ValueError("cannot use decode_coords if decode_cf=False")
         decode_coords = False
 
+    if mask_and_scale is None:
+        mask_and_scale = engine != "pseudonetcdf"
+
+    if decode_times is None:
+        decode_times = True
+
+    if concat_characters is None:
+        concat_characters = True
+
+    if decode_coords is None:
+        decode_coords = True
+
     if cache is None:
         cache = chunks is None
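Taken together, the resolution logic above gives decode_cf three states: None (the new default), True, and False. The sketch below illustrates the intended call-site behavior; it is an illustration only, assuming a local file example.nc, and is not part of the diff:

import xarray as xr

# decode_cf left at its new default of None: decoding is on, but any
# individual option can still be switched off without a conflict.
ds = xr.open_dataset("example.nc", decode_times=False)

# decode_cf=True set explicitly: deactivating an individual option now
# raises, e.g.
# ValueError: cannot deactivate decode_times if decode_cf=True is explicitly set
xr.open_dataset("example.nc", decode_cf=True, decode_times=False)

# decode_cf=False set explicitly: activating an individual option raises, e.g.
# ValueError: cannot use decode_times if decode_cf=False
xr.open_dataset("example.nc", decode_cf=False, decode_times=True)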
xarray/tests/test_backends.py: 136 additions & 0 deletions
@@ -979,6 +979,142 @@ def test_multiindex_not_implemented(self):
         with self.roundtrip(ds):
             pass
 
+    @requires_netCDF4
+    def test_open_dataset_decode_cf_mask_and_scale_conflicts(self):
+        with create_tmp_file() as tmp_file:
+            with nc4.Dataset(tmp_file, mode="w") as nc:
+                nc.createDimension("t", 5)
+                nc.createVariable("x", "int16", ("t",), fill_value=-1)
+                v = nc.variables["x"]
+                v.set_auto_maskandscale(False)
+                v.add_offset = 10
+                v.scale_factor = 0.1
+                v[:] = np.array([-1, -1, 0, 1, 2])
+
+            expected_decoded = create_masked_and_scaled_data()
+            expected_encoded = np.array([-1, -1, 0, 1, 2])
+
+            with open_dataset(tmp_file, decode_cf=True) as ds:
+                assert_identical(expected_decoded, ds)
+
+            with open_dataset(tmp_file, decode_cf=False) as ds:
+                assert_array_equal(expected_encoded, ds.x.values)
+
+            with open_dataset(tmp_file, decode_cf=None, mask_and_scale=True) as ds:
+                assert_identical(expected_decoded, ds)
+
+            with open_dataset(tmp_file, decode_cf=True, mask_and_scale=True) as ds:
+                assert_identical(expected_decoded, ds)
+
+            with open_dataset(tmp_file, decode_cf=False, mask_and_scale=False) as ds:
+                assert_array_equal(expected_encoded, ds.x.values)
+
+            with raises_regex(
+                ValueError, "cannot use mask_and_scale if decode_cf=False"
+            ):
+                open_dataset(tmp_file, decode_cf=False, mask_and_scale=True)
+
+            with raises_regex(
+                ValueError, "cannot deactivate mask_and_scale if decode_cf=True"
+            ):
+                open_dataset(tmp_file, decode_cf=True, mask_and_scale=False)
+
+    @requires_netCDF4
+    def test_open_dataset_decode_cf_decode_time_conflicts(self):
+        expected_decoded = np.dtype("<M8[ns]")
+        expected_encoded = np.dtype("int16")
+
+        with open_example_dataset("example_1.nc", decode_cf=True) as ds:
+            assert ds.time.dtype == expected_decoded
+
+        with open_example_dataset("example_1.nc", decode_cf=False) as ds:
+            assert ds.time.dtype == expected_encoded
+
+        with open_example_dataset(
+            "example_1.nc", decode_cf=None, decode_times=True
+        ) as ds:
+            assert ds.time.dtype == expected_decoded
+
+        with open_example_dataset(
+            "example_1.nc", decode_cf=True, decode_times=True
+        ) as ds:
+            assert ds.time.dtype == expected_decoded
+
+        with open_example_dataset(
+            "example_1.nc", decode_cf=False, decode_times=False
+        ) as ds:
+            assert ds.time.dtype == expected_encoded
+
+        with raises_regex(ValueError, "cannot use decode_times if decode_cf=False"):
+            open_example_dataset("example_1.nc", decode_cf=False, decode_times=True)
+
+        with raises_regex(
+            ValueError, "cannot deactivate decode_times if decode_cf=True"
+        ):
+            open_example_dataset("example_1.nc", decode_cf=True, decode_times=False)
+
+    @requires_netCDF4
+    def test_open_dataset_decode_cf_concat_characters_conflicts(self):
+        expected_decoded = np.dtype("|S4")
+        expected_encoded = np.dtype("|S1")
+
+        with open_example_dataset("bears.nc", decode_cf=True) as ds:
+            assert ds.bears.dtype == expected_decoded
+
+        with open_example_dataset("bears.nc", decode_cf=False) as ds:
+            assert ds.bears.dtype == expected_encoded
+
+        with open_example_dataset(
+            "bears.nc", decode_cf=None, concat_characters=True
+        ) as ds:
+            assert ds.bears.dtype == expected_decoded
+
+        with open_example_dataset(
+            "bears.nc", decode_cf=True, concat_characters=True
+        ) as ds:
+            assert ds.bears.dtype == expected_decoded
+
+        with open_example_dataset(
+            "bears.nc", decode_cf=False, concat_characters=False
+        ) as ds:
+            assert ds.bears.dtype == expected_encoded
+
+        with raises_regex(
+            ValueError, "cannot use concat_characters if decode_cf=False"
+        ):
+            open_example_dataset("bears.nc", decode_cf=False, concat_characters=True)
+
+        with raises_regex(
+            ValueError, "cannot deactivate concat_characters if decode_cf=True"
+        ):
+            open_example_dataset("bears.nc", decode_cf=True, concat_characters=False)
+
+    @requires_netCDF4
+    def test_open_dataset_decoding_conflicts(self):
+        expected_decoded = np.dtype("|S4")
+        expected_encoded = np.dtype("|S1")
+
+        # individual decoding options can be (de)activated without a ValueError
+        with open_example_dataset(
+            "bears.nc", concat_characters=True, decode_times=False
+        ) as ds:
+            assert ds.bears.dtype == expected_decoded
+
+        with open_example_dataset(
+            "bears.nc", concat_characters=True, decode_times=True
+        ) as ds:
+            assert ds.bears.dtype == expected_decoded
+
+        with open_example_dataset(
+            "bears.nc", concat_characters=False, decode_times=True
+        ) as ds:
+            assert ds.bears.dtype == expected_encoded
+
+        with open_example_dataset(
+            "bears.nc", concat_characters=False, decode_times=False
+        ) as ds:
+            assert ds.bears.dtype == expected_encoded
+
 
 _counter = itertools.count()
 
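All four new tests carry "conflicts" in their names, so they can be run in isolation with pytest's -k keyword filter; the exact selection string below is a suggestion, not part of the PR:

pytest xarray/tests/test_backends.py -k "conflicts"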