From 4bfeee7a2266c3c72d6aa0ea225c8c666a3261de Mon Sep 17 00:00:00 2001 From: Eric Bruning Date: Tue, 13 Jun 2017 21:47:54 -0500 Subject: [PATCH 01/16] Add support for _Unsigned attribute --- xarray/conventions.py | 22 ++++++++++++--- xarray/tests/test_backends.py | 53 ++++++++++++++++++++++++----------- 2 files changed, 54 insertions(+), 21 deletions(-) diff --git a/xarray/conventions.py b/xarray/conventions.py index d39ae20925a..04bc371e615 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -26,7 +26,7 @@ def mask_and_scale(array, fill_value=None, scale_factor=None, add_offset=None, - dtype=float): + dtype=float, is_unsigned=False): """Scale and mask array values according to CF conventions for packed and missing values @@ -59,8 +59,13 @@ def mask_and_scale(array, fill_value=None, scale_factor=None, add_offset=None, ---------- http://www.unidata.ucar.edu/software/netcdf/docs/BestPractices.html """ + # check for unsigned integers before casting to dtype (by default, float) + if is_unsigned and array.dtype.kind == 'i': + array = array.view('u%s' % array.dtype.itemsize) + # by default, cast to float to ensure NaN is meaningful values = np.array(array, dtype=dtype, copy=True) + if fill_value is not None and not np.all(pd.isnull(fill_value)): if getattr(fill_value, 'size', 1) > 1: fill_values = fill_value # multiple fill values @@ -337,7 +342,7 @@ class MaskedAndScaledArray(utils.NDArrayMixin): http://www.unidata.ucar.edu/software/netcdf/docs/BestPractices.html """ def __init__(self, array, fill_value=None, scale_factor=None, - add_offset=None, dtype=float): + add_offset=None, dtype=float, is_unsigned=False): """ Parameters ---------- @@ -357,6 +362,7 @@ def __init__(self, array, fill_value=None, scale_factor=None, self.scale_factor = scale_factor self.add_offset = add_offset self._dtype = dtype + self.is_unsigned = is_unsigned @property def dtype(self): @@ -364,7 +370,8 @@ def dtype(self): def __getitem__(self, key): return mask_and_scale(self.array[key], self.fill_value, - self.scale_factor, self.add_offset, self._dtype) + self.scale_factor, self.add_offset, self._dtype, + self.is_unsigned) def __repr__(self): return ("%s(%r, fill_value=%r, scale_factor=%r, add_offset=%r, " @@ -637,6 +644,8 @@ def maybe_encode_dtype(var, name=None): 'any _FillValue to use for NaNs' % name, RuntimeWarning, stacklevel=3) data = duck_array_ops.around(data)[...] 
+ if '_Unsigned' in var.encoding: + data = data.astype('u%s' % dtype.itemsize) if dtype == 'S1' and data.dtype != 'S1': data = string_to_char(np.asarray(data, 'S')) dims = dims + ('string%s' % data.shape[-1],) @@ -801,6 +810,10 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True, "xarray.conventions.decode_cf(ds)") attributes['_FillValue'] = attributes.pop('missing_value') + is_unsigned = False + if '_Unsigned' in attributes: + is_unsigned = attributes['_Unsigned'] + fill_value = np.array(pop_to(attributes, encoding, '_FillValue')) if fill_value.size > 1: warnings.warn("variable has multiple fill values {0}, decoding " @@ -815,7 +828,8 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True, else: dtype = float data = MaskedAndScaledArray(data, fill_value, scale_factor, - add_offset, dtype) + add_offset, dtype, + is_unsigned=is_unsigned) if decode_times and 'units' in attributes: if 'since' in attributes['units']: diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 3fa6fff9f4b..a1d38061417 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -55,12 +55,23 @@ def create_masked_and_scaled_data(): 'scale_factor': np.float32(0.1), 'dtype': 'i2'} return Dataset({'x': ('t', x, {}, encoding)}) - def create_encoded_masked_and_scaled_data(): attributes = {'_FillValue': -1, 'add_offset': 10, 'scale_factor': np.float32(0.1)} return Dataset({'x': ('t', [-1, -1, 0, 1, 2], attributes)}) +def create_unsigned_masked_scaled_data(): + encoding = {'_FillValue':255, '_Unsigned':'true', 'dtype':'i1', + 'add_offset':10, 'scale_factor':np.float32(0.1)} + x = np.array([10.0, 10.1, 22.7, 22.8, np.nan]) + return Dataset({'x': ('t', x, {}, encoding)}) + +def create_encoded_unsigned_masked_scaled_data(): + attributes = {'_FillValue':255, '_Unsigned':'true', + 'add_offset':10, 'scale_factor':np.float32(0.1)} + # Create signed data corresponding to [0, 1, 127, 128, 255] unsigned + sb = np.asarray([0, 1, 127, -128, -1], dtype='i1') + return Dataset({'x': ('t', sb, attributes)}) def create_boolean_data(): attributes = {'units': '-'} @@ -363,22 +374,30 @@ def test_roundtrip_strings_with_fill_value(self): def test_roundtrip_mask_and_scale(self): decoded = create_masked_and_scaled_data() encoded = create_encoded_masked_and_scaled_data() - with self.roundtrip(decoded) as actual: - self.assertDatasetAllClose(decoded, actual) - with self.roundtrip(decoded, open_kwargs=dict(decode_cf=False)) as actual: - # TODO: this assumes that all roundtrips will first - # encode. Is that something we want to test for? - self.assertDatasetAllClose(encoded, actual) - with self.roundtrip(encoded, open_kwargs=dict(decode_cf=False)) as actual: - self.assertDatasetAllClose(encoded, actual) - # make sure roundtrip encoding didn't change the - # original dataset. 
- self.assertDatasetIdentical(encoded, - create_encoded_masked_and_scaled_data()) - with self.roundtrip(encoded) as actual: - self.assertDatasetAllClose(decoded, actual) - with self.roundtrip(encoded, open_kwargs=dict(decode_cf=False)) as actual: - self.assertDatasetAllClose(encoded, actual) + ordinary = (decoded, encoded) + decoded = create_unsigned_masked_scaled_data() + encoded = create_encoded_unsigned_masked_scaled_data() + unsigned_attr = (decoded, encoded) + for decoded, encoded in (ordinary, unsigned_attr): + with self.roundtrip(decoded) as actual: + self.assertDatasetAllClose(decoded, actual) + with self.roundtrip(decoded, + open_kwargs=dict(decode_cf=False)) as actual: + # TODO: this assumes that all roundtrips will first + # encode. Is that something we want to test for? + self.assertDatasetAllClose(encoded, actual) + with self.roundtrip(encoded, + open_kwargs=dict(decode_cf=False)) as actual: + self.assertDatasetAllClose(encoded, actual) + # make sure roundtrip encoding didn't change the + # original dataset. + self.assertDatasetIdentical(encoded, + create_encoded_masked_and_scaled_data()) + with self.roundtrip(encoded) as actual: + self.assertDatasetAllClose(decoded, actual) + with self.roundtrip(encoded, + open_kwargs=dict(decode_cf=False)) as actual: + self.assertDatasetAllClose(encoded, actual) def test_coordinates_encoding(self): def equals_latlon(obj): From 32a67f38163bf532840d302e3c1ca488da3fb63a Mon Sep 17 00:00:00 2001 From: Eric Bruning Date: Tue, 13 Jun 2017 22:05:56 -0500 Subject: [PATCH 02/16] Update docstrings with new unsigned behavior --- xarray/conventions.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/xarray/conventions.py b/xarray/conventions.py index 04bc371e615..54936939cd5 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -49,6 +49,8 @@ def mask_and_scale(array, fill_value=None, scale_factor=None, add_offset=None, add_offset : number, optional After applying scale_factor, add this number to entries in the original array. + is_unsigned : prior to applying fill, scale, or offset, treat + integer array as unsigned Returns ------- @@ -356,6 +358,8 @@ def __init__(self, array, fill_value=None, scale_factor=None, add_offset : number, optional After applying scale_factor, add this number to entries in the original array. + is_unsigned : prior to applying fill, scale, or offset, treat + integer array as unsigned """ self.array = array self.fill_value = fill_value @@ -770,7 +774,8 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True, example: ['h', 'e', 'l', 'l', 'o'] -> 'hello' mask_and_scale: bool Lazily scale (using scale_factor and add_offset) and mask - (using _FillValue). + (using _FillValue). If the _Unsigned attribute is present + treat integer arrays as unsigned. decode_times : bool Decode cf times ('hours since 2000-01-01') to np.datetime64. decode_endianness : bool From ddbc99df83e84b6d08b24a19a651d9765ffbca24 Mon Sep 17 00:00:00 2001 From: Eric Bruning Date: Tue, 13 Jun 2017 23:25:39 -0500 Subject: [PATCH 03/16] Cast to unsigned with copy instead of view, fixing infinite recursion. 
Move _Unsigned between attributes and encoding --- xarray/conventions.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/xarray/conventions.py b/xarray/conventions.py index 54936939cd5..a1ae4a36277 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -26,7 +26,7 @@ def mask_and_scale(array, fill_value=None, scale_factor=None, add_offset=None, - dtype=float, is_unsigned=False): + dtype=float, is_unsigned=None): """Scale and mask array values according to CF conventions for packed and missing values @@ -62,8 +62,9 @@ def mask_and_scale(array, fill_value=None, scale_factor=None, add_offset=None, http://www.unidata.ucar.edu/software/netcdf/docs/BestPractices.html """ # check for unsigned integers before casting to dtype (by default, float) - if is_unsigned and array.dtype.kind == 'i': - array = array.view('u%s' % array.dtype.itemsize) + if is_unsigned is not None and array.dtype.kind == 'i': + unsigned_dtype = 'u%s' % array.dtype.itemsize + array = np.array(array, dtype=unsigned_dtype, copy=True) # by default, cast to float to ensure NaN is meaningful values = np.array(array, dtype=dtype, copy=True) @@ -648,8 +649,9 @@ def maybe_encode_dtype(var, name=None): 'any _FillValue to use for NaNs' % name, RuntimeWarning, stacklevel=3) data = duck_array_ops.around(data)[...] - if '_Unsigned' in var.encoding: + if '_Unsigned' in encoding: data = data.astype('u%s' % dtype.itemsize) + is_unsigned = pop_to(encoding, attrs, '_Unsigned') if dtype == 'S1' and data.dtype != 'S1': data = string_to_char(np.asarray(data, 'S')) dims = dims + ('string%s' % data.shape[-1],) @@ -815,9 +817,7 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True, "xarray.conventions.decode_cf(ds)") attributes['_FillValue'] = attributes.pop('missing_value') - is_unsigned = False - if '_Unsigned' in attributes: - is_unsigned = attributes['_Unsigned'] + is_unsigned = pop_to(attributes, encoding, '_Unsigned') fill_value = np.array(pop_to(attributes, encoding, '_FillValue')) if fill_value.size > 1: From 11a39d05f9be8436d682c3d6fe91abe86081ed3a Mon Sep 17 00:00:00 2001 From: Eric Bruning Date: Wed, 14 Jun 2017 00:10:48 -0500 Subject: [PATCH 04/16] Fix default argument for is_unsigned --- xarray/conventions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/conventions.py b/xarray/conventions.py index a1ae4a36277..72206cca63a 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -345,7 +345,7 @@ class MaskedAndScaledArray(utils.NDArrayMixin): http://www.unidata.ucar.edu/software/netcdf/docs/BestPractices.html """ def __init__(self, array, fill_value=None, scale_factor=None, - add_offset=None, dtype=float, is_unsigned=False): + add_offset=None, dtype=float, is_unsigned=None): """ Parameters ---------- From e248ae51dbf7363a85ed2c1b844eb5f05690ac52 Mon Sep 17 00:00:00 2001 From: Eric Bruning Date: Wed, 14 Jun 2017 00:40:53 -0500 Subject: [PATCH 05/16] Separate test for unsigned roundtrip --- xarray/conventions.py | 4 +- xarray/tests/test_backends.py | 80 +++++++++++++++++++++++------------ 2 files changed, 54 insertions(+), 30 deletions(-) diff --git a/xarray/conventions.py b/xarray/conventions.py index 72206cca63a..9fede72cf45 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -777,7 +777,7 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True, mask_and_scale: bool Lazily scale (using scale_factor and add_offset) and mask (using _FillValue). 
If the _Unsigned attribute is present - treat integer arrays as unsigned. + treat integer arrays as unsigned. decode_times : bool Decode cf times ('hours since 2000-01-01') to np.datetime64. decode_endianness : bool @@ -833,7 +833,7 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True, else: dtype = float data = MaskedAndScaledArray(data, fill_value, scale_factor, - add_offset, dtype, + add_offset, dtype, is_unsigned=is_unsigned) if decode_times and 'units' in attributes: diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index a1d38061417..9f2c52f0c2e 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -55,24 +55,28 @@ def create_masked_and_scaled_data(): 'scale_factor': np.float32(0.1), 'dtype': 'i2'} return Dataset({'x': ('t', x, {}, encoding)}) + def create_encoded_masked_and_scaled_data(): attributes = {'_FillValue': -1, 'add_offset': 10, 'scale_factor': np.float32(0.1)} return Dataset({'x': ('t', [-1, -1, 0, 1, 2], attributes)}) + def create_unsigned_masked_scaled_data(): - encoding = {'_FillValue':255, '_Unsigned':'true', 'dtype':'i1', - 'add_offset':10, 'scale_factor':np.float32(0.1)} + encoding = {'_FillValue': 255, '_Unsigned': 'true', 'dtype': 'i1', + 'add_offset': 10, 'scale_factor': np.float32(0.1)} x = np.array([10.0, 10.1, 22.7, 22.8, np.nan]) return Dataset({'x': ('t', x, {}, encoding)}) + def create_encoded_unsigned_masked_scaled_data(): - attributes = {'_FillValue':255, '_Unsigned':'true', - 'add_offset':10, 'scale_factor':np.float32(0.1)} + attributes = {'_FillValue': 255, '_Unsigned': 'true', + 'add_offset': 10, 'scale_factor': np.float32(0.1)} # Create signed data corresponding to [0, 1, 127, 128, 255] unsigned sb = np.asarray([0, 1, 127, -128, -1], dtype='i1') return Dataset({'x': ('t', sb, attributes)}) + def create_boolean_data(): attributes = {'units': '-'} return Dataset({'x': ('t', [True, False, False, True], attributes)}) @@ -371,33 +375,53 @@ def test_roundtrip_strings_with_fill_value(self): with self.roundtrip(original) as actual: self.assertDatasetIdentical(expected, actual) + def test_unsigned_roundtrip_mask_and_scale(self): + decoded = create_unsigned_masked_scaled_data() + encoded = create_encoded_unsigned_masked_scaled_data() + with self.roundtrip(decoded) as actual: + self.assertDatasetAllClose(decoded, actual) + with self.roundtrip(decoded, + open_kwargs=dict(decode_cf=False)) as actual: + # TODO: this assumes that all roundtrips will first + # encode. Is that something we want to test for? + self.assertDatasetAllClose(encoded, actual) + with self.roundtrip(encoded, + open_kwargs=dict(decode_cf=False)) as actual: + self.assertDatasetAllClose(encoded, actual) + # make sure roundtrip encoding didn't change the + # original dataset. 
+ self.assertDatasetIdentical(encoded, + create_encoded_unsigned_masked_scaled_data() + ) + with self.roundtrip(encoded) as actual: + self.assertDatasetAllClose(decoded, actual) + with self.roundtrip(encoded, + open_kwargs=dict(decode_cf=False)) as actual: + self.assertDatasetAllClose(encoded, actual) + def test_roundtrip_mask_and_scale(self): decoded = create_masked_and_scaled_data() encoded = create_encoded_masked_and_scaled_data() - ordinary = (decoded, encoded) - decoded = create_unsigned_masked_scaled_data() - encoded = create_encoded_unsigned_masked_scaled_data() - unsigned_attr = (decoded, encoded) - for decoded, encoded in (ordinary, unsigned_attr): - with self.roundtrip(decoded) as actual: - self.assertDatasetAllClose(decoded, actual) - with self.roundtrip(decoded, - open_kwargs=dict(decode_cf=False)) as actual: - # TODO: this assumes that all roundtrips will first - # encode. Is that something we want to test for? - self.assertDatasetAllClose(encoded, actual) - with self.roundtrip(encoded, - open_kwargs=dict(decode_cf=False)) as actual: - self.assertDatasetAllClose(encoded, actual) - # make sure roundtrip encoding didn't change the - # original dataset. - self.assertDatasetIdentical(encoded, - create_encoded_masked_and_scaled_data()) - with self.roundtrip(encoded) as actual: - self.assertDatasetAllClose(decoded, actual) - with self.roundtrip(encoded, - open_kwargs=dict(decode_cf=False)) as actual: - self.assertDatasetAllClose(encoded, actual) + with self.roundtrip(decoded) as actual: + self.assertDatasetAllClose(decoded, actual) + with self.roundtrip(decoded, + open_kwargs=dict(decode_cf=False)) as actual: + # TODO: this assumes that all roundtrips will first + # encode. Is that something we want to test for? + self.assertDatasetAllClose(encoded, actual) + with self.roundtrip(encoded, + open_kwargs=dict(decode_cf=False)) as actual: + self.assertDatasetAllClose(encoded, actual) + # make sure roundtrip encoding didn't change the + # original dataset. + self.assertDatasetIdentical(encoded, + create_encoded_masked_and_scaled_data() + ) + with self.roundtrip(encoded) as actual: + self.assertDatasetAllClose(decoded, actual) + with self.roundtrip(encoded, + open_kwargs=dict(decode_cf=False)) as actual: + self.assertDatasetAllClose(encoded, actual) def test_coordinates_encoding(self): def equals_latlon(obj): From 079a7e581b707b91f302e878e9ee05ffa817eb9d Mon Sep 17 00:00:00 2001 From: Eric Bruning Date: Mon, 19 Jun 2017 20:48:32 -0500 Subject: [PATCH 06/16] Move unsigned support out of mask_and_scale, update whats-new --- doc/whats-new.rst | 4 ++ xarray/conventions.py | 63 ++++++++++++++++++++++---------- xarray/tests/test_backends.py | 3 +- xarray/tests/test_conventions.py | 9 +++++ 4 files changed, 58 insertions(+), 21 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index b55d59077fc..da389fd8e13 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -61,6 +61,10 @@ Enhancements for the values they contain, similar to ``pandas.Series`` (:issue:`358`). By `Daniel Rothenberg `_. +- Support for NetCDF files using an ``_Unsigned`` attribute to indicate that a + signed integer data type should be interpreted as unsigned bytes. + By `Eric Bruning `_. + - Renamed internal dask arrays created by ``open_dataset`` to match new dask conventions (:issue:`1343`). By `Ryan Abernathey `_. 
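For context (this note and sketch are not part of the patch itself): the conventions.py hunk that follows adds UnsignedIntTypeArray, which reinterprets signed on-disk integers as unsigned on access. A minimal numpy sketch of that reinterpretation, using the values from the class's own doctest:

    import numpy as np

    sb = np.asarray([0, 1, 127, -128, -1], dtype='i1')  # signed bytes as stored on disk
    ub = np.asarray(sb, dtype='u1')                     # the cast UnsignedIntTypeArray.__getitem__ applies
    # ub -> array([  0,   1, 127, 128, 255], dtype=uint8)

The result matches a zero-copy sb.view('u1'), but the series uses a converting cast rather than a view: PATCH 03 replaced the earlier view-based approach in mask_and_scale after it triggered infinite recursion.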
diff --git a/xarray/conventions.py b/xarray/conventions.py index 9fede72cf45..89130fedfa0 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -26,7 +26,7 @@ def mask_and_scale(array, fill_value=None, scale_factor=None, add_offset=None, - dtype=float, is_unsigned=None): + dtype=float): """Scale and mask array values according to CF conventions for packed and missing values @@ -49,8 +49,6 @@ def mask_and_scale(array, fill_value=None, scale_factor=None, add_offset=None, add_offset : number, optional After applying scale_factor, add this number to entries in the original array. - is_unsigned : prior to applying fill, scale, or offset, treat - integer array as unsigned Returns ------- @@ -61,11 +59,6 @@ def mask_and_scale(array, fill_value=None, scale_factor=None, add_offset=None, ---------- http://www.unidata.ucar.edu/software/netcdf/docs/BestPractices.html """ - # check for unsigned integers before casting to dtype (by default, float) - if is_unsigned is not None and array.dtype.kind == 'i': - unsigned_dtype = 'u%s' % array.dtype.itemsize - array = np.array(array, dtype=unsigned_dtype, copy=True) - # by default, cast to float to ensure NaN is meaningful values = np.array(array, dtype=dtype, copy=True) @@ -345,7 +338,7 @@ class MaskedAndScaledArray(utils.NDArrayMixin): http://www.unidata.ucar.edu/software/netcdf/docs/BestPractices.html """ def __init__(self, array, fill_value=None, scale_factor=None, - add_offset=None, dtype=float, is_unsigned=None): + add_offset=None, dtype=float): """ Parameters ---------- @@ -359,15 +352,12 @@ def __init__(self, array, fill_value=None, scale_factor=None, add_offset : number, optional After applying scale_factor, add this number to entries in the original array. - is_unsigned : prior to applying fill, scale, or offset, treat - integer array as unsigned """ self.array = array self.fill_value = fill_value self.scale_factor = scale_factor self.add_offset = add_offset self._dtype = dtype - self.is_unsigned = is_unsigned @property def dtype(self): @@ -375,8 +365,7 @@ def dtype(self): def __getitem__(self, key): return mask_and_scale(self.array[key], self.fill_value, - self.scale_factor, self.add_offset, self._dtype, - self.is_unsigned) + self.scale_factor, self.add_offset, self._dtype) def __repr__(self): return ("%s(%r, fill_value=%r, scale_factor=%r, add_offset=%r, " @@ -546,6 +535,34 @@ def __getitem__(self, key): return np.asarray(self.array[key], dtype=self.dtype) +class UnsignedIntTypeArray(utils.NDArrayMixin): + """Decode arrays on the fly from signed integer to unsigned + integer. Typically used when _Unsigned is set at as a netCDF + attribute on a signed integer variable. + + >>> sb = np.asarray([0, 1, 127, -128, -1], dtype='i1') + + >>> sb.dtype + dtype('int8') + + >>> UnsignedIntTypeArray(sb).dtype + dtype('uint8') + + >>> UnsignedIntTypeArray(sb)[:] + array([ 0, 1, 127, 128, 255], dtype=uint8) + """ + def __init__(self, array): + self.array = array + self.unsigned_dtype = np.dtype('u%s' % array.dtype.itemsize) + + @property + def dtype(self): + return self.unsigned_dtype + + def __getitem__(self, key): + return np.asarray(self.array[key], dtype=self.dtype) + + def string_to_char(arr): """Like netCDF4.stringtochar, but faster and more flexible. """ @@ -650,8 +667,8 @@ def maybe_encode_dtype(var, name=None): RuntimeWarning, stacklevel=3) data = duck_array_ops.around(data)[...] 
if '_Unsigned' in encoding: - data = data.astype('u%s' % dtype.itemsize) - is_unsigned = pop_to(encoding, attrs, '_Unsigned') + data = data.astype('i%s' % dtype.itemsize) + pop_to(encoding, attrs, '_Unsigned') if dtype == 'S1' and data.dtype != 'S1': data = string_to_char(np.asarray(data, 'S')) dims = dims + ('string%s' % data.shape[-1],) @@ -802,6 +819,15 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True, dimensions = dimensions[:-1] data = CharToStringArray(data) + is_unsigned = pop_to(attributes, encoding, '_Unsigned') + if is_unsigned is not None: + if data.dtype.kind == 'i': + data = UnsignedIntTypeArray(data) + else: + warnings.warn("variable has _Unsigned attribute but is not " + "of integer type. Ignoring attribute.", + RuntimeWarning, stacklevel=3) + if mask_and_scale: if 'missing_value' in attributes: # missing_value is deprecated, but we still want to support it as @@ -817,8 +843,6 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True, "xarray.conventions.decode_cf(ds)") attributes['_FillValue'] = attributes.pop('missing_value') - is_unsigned = pop_to(attributes, encoding, '_Unsigned') - fill_value = np.array(pop_to(attributes, encoding, '_FillValue')) if fill_value.size > 1: warnings.warn("variable has multiple fill values {0}, decoding " @@ -833,8 +857,7 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True, else: dtype = float data = MaskedAndScaledArray(data, fill_value, scale_factor, - add_offset, dtype, - is_unsigned=is_unsigned) + add_offset, dtype) if decode_times and 'units' in attributes: if 'since' in attributes['units']: diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 9f2c52f0c2e..b639f40a259 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -391,7 +391,8 @@ def test_unsigned_roundtrip_mask_and_scale(self): # make sure roundtrip encoding didn't change the # original dataset. self.assertDatasetIdentical(encoded, - create_encoded_unsigned_masked_scaled_data() + create_encoded_unsigned_masked_scaled_data( + ) ) with self.roundtrip(encoded) as actual: self.assertDatasetAllClose(decoded, actual) diff --git a/xarray/tests/test_conventions.py b/xarray/tests/test_conventions.py index 6c9d791660d..a6230761b86 100644 --- a/xarray/tests/test_conventions.py +++ b/xarray/tests/test_conventions.py @@ -108,6 +108,15 @@ def test_string_to_char(self): self.assertArrayEqual(actual, expected) +class TestUnsignedIntTypeArray(TestCase): + def test_unsignedinttype_array(self): + sb = np.asarray([0, 1, 127, -128, -1], dtype='i1') + ub = conventions.UnsignedIntTypeArray(sb) + self.assertEqual(ub.dtype, np.dtype('u1')) + self.assertArrayEqual(ub, np.array([0, 1, 127, 128, 255], + dtype=np.dtype('u1'))) + + class TestBoolTypeArray(TestCase): def test_booltype_array(self): x = np.array([1, 0, 1, 1, 0], dtype='i1') From e06f6570c23bc96c102a69283a0e2216cfcd4bfb Mon Sep 17 00:00:00 2001 From: Eric Bruning Date: Mon, 19 Jun 2017 20:51:42 -0500 Subject: [PATCH 07/16] Fix what's new date and add issue --- doc/whats-new.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index da389fd8e13..39b57497b31 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -42,7 +42,7 @@ Bug fixes .. 
_whats-new.0.9.6: -v0.9.6 (8 June 2017) +v0.9.6 (19 June 2017) -------------------- This release includes a number of backwards compatible enhancements and bug @@ -61,8 +61,9 @@ Enhancements for the values they contain, similar to ``pandas.Series`` (:issue:`358`). By `Daniel Rothenberg `_. -- Support for NetCDF files using an ``_Unsigned`` attribute to indicate that a - signed integer data type should be interpreted as unsigned bytes. +- Support for NetCDF files using an ``_Unsigned`` attribute to indicate that a + a signed integer data type should be interpreted as unsigned bytes + (:issue:`1444`). By `Eric Bruning `_. - Renamed internal dask arrays created by ``open_dataset`` to match new dask From b56cb757aaa8374f68e6192c2341af6ab68616e9 Mon Sep 17 00:00:00 2001 From: Eric Bruning Date: Mon, 19 Jun 2017 20:59:54 -0500 Subject: [PATCH 08/16] Putting enhancement in correct section of whats-new --- doc/whats-new.rst | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 39b57497b31..8c61d71bf55 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -24,6 +24,10 @@ Enhancements - More attributes available in :py:attr:`~xarray.Dataset.attrs` dictionary when raster files are opened with :py:func:`~xarray.open_rasterio`. By `Greg Brener `_ +- Support for NetCDF files using an ``_Unsigned`` attribute to indicate that a + a signed integer data type should be interpreted as unsigned bytes + (:issue:`1444`). + By `Eric Bruning `_. Bug fixes ~~~~~~~~~ @@ -42,7 +46,7 @@ Bug fixes .. _whats-new.0.9.6: -v0.9.6 (19 June 2017) +v0.9.6 (8 June 2017) -------------------- This release includes a number of backwards compatible enhancements and bug @@ -61,11 +65,6 @@ Enhancements for the values they contain, similar to ``pandas.Series`` (:issue:`358`). By `Daniel Rothenberg `_. -- Support for NetCDF files using an ``_Unsigned`` attribute to indicate that a - a signed integer data type should be interpreted as unsigned bytes - (:issue:`1444`). - By `Eric Bruning `_. - - Renamed internal dask arrays created by ``open_dataset`` to match new dask conventions (:issue:`1343`). By `Ryan Abernathey `_. From 8b7df53d0ab23be95f2486bcd6a07d2be0eeb5a6 Mon Sep 17 00:00:00 2001 From: Eric Bruning Date: Sun, 16 Jul 2017 10:02:47 -0500 Subject: [PATCH 09/16] Turn off _FillValue support provided by PyNIO. Let xarray handle it. --- xarray/backends/pynio_.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/xarray/backends/pynio_.py b/xarray/backends/pynio_.py index 449971a9145..ad70d79a298 100644 --- a/xarray/backends/pynio_.py +++ b/xarray/backends/pynio_.py @@ -42,6 +42,9 @@ def __init__(self, filename, mode='r', autoclose=False): import Nio opener = functools.partial(Nio.open_file, filename, mode=mode) self.ds = opener() + # xarray provides its own support for FillValue, + # so turn off PyNIO's support for the same. 
+ self.ds.set_option('MaskedArrayMode','MaskedNever') self._autoclose = autoclose self._isopen = True self._opener = opener From 466928c9bf2cf84893c1c74759a3b85e5c8f8a7e Mon Sep 17 00:00:00 2001 From: Eric Bruning Date: Sun, 16 Jul 2017 10:08:37 -0500 Subject: [PATCH 10/16] Convert _FillValue when _Unsigned is present --- xarray/conventions.py | 15 +++++++++++---- xarray/tests/test_backends.py | 4 +++- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/xarray/conventions.py b/xarray/conventions.py index 89130fedfa0..03e0fbb6ef0 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -61,7 +61,6 @@ def mask_and_scale(array, fill_value=None, scale_factor=None, add_offset=None, """ # by default, cast to float to ensure NaN is meaningful values = np.array(array, dtype=dtype, copy=True) - if fill_value is not None and not np.all(pd.isnull(fill_value)): if getattr(fill_value, 'size', 1) > 1: fill_values = fill_value # multiple fill values @@ -667,7 +666,11 @@ def maybe_encode_dtype(var, name=None): RuntimeWarning, stacklevel=3) data = duck_array_ops.around(data)[...] if '_Unsigned' in encoding: - data = data.astype('i%s' % dtype.itemsize) + unsigned_dtype = 'i%s' % dtype.itemsize + old_fill = np.asarray(attrs['_FillValue']) + new_fill = old_fill.astype(unsigned_dtype) + attrs['_FillValue'] = new_fill + data = data.astype(unsigned_dtype) pop_to(encoding, attrs, '_Unsigned') if dtype == 'S1' and data.dtype != 'S1': data = string_to_char(np.asarray(data, 'S')) @@ -820,7 +823,7 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True, data = CharToStringArray(data) is_unsigned = pop_to(attributes, encoding, '_Unsigned') - if is_unsigned is not None: + if (is_unsigned is not None) & (mask_and_scale == True): if data.dtype.kind == 'i': data = UnsignedIntTypeArray(data) else: @@ -842,7 +845,6 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True, "and decoding explicitly using " "xarray.conventions.decode_cf(ds)") attributes['_FillValue'] = attributes.pop('missing_value') - fill_value = np.array(pop_to(attributes, encoding, '_FillValue')) if fill_value.size > 1: warnings.warn("variable has multiple fill values {0}, decoding " @@ -856,6 +858,11 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True, dtype = object else: dtype = float + if is_unsigned is not None: + # Need to convert the fill_value to unsigned, too + # According to the CF spec, the fill value is of the same + # type as its variable, i.e. its storage format on disk + fill_value = np.asarray(fill_value, dtype=data.unsigned_dtype) data = MaskedAndScaledArray(data, fill_value, scale_factor, add_offset, dtype) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index b639f40a259..145983c5915 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -70,7 +70,9 @@ def create_unsigned_masked_scaled_data(): def create_encoded_unsigned_masked_scaled_data(): - attributes = {'_FillValue': 255, '_Unsigned': 'true', + # These are values as written to the file: the _FillValue will + # be represented in the signed form. 
+ attributes = {'_FillValue': -1, '_Unsigned': 'true', 'add_offset': 10, 'scale_factor': np.float32(0.1)} # Create signed data corresponding to [0, 1, 127, 128, 255] unsigned sb = np.asarray([0, 1, 127, -128, -1], dtype='i1') From 31b9aeab54e928e2a8141cc7bc3fe31f8f5b6006 Mon Sep 17 00:00:00 2001 From: Eric Bruning Date: Sun, 16 Jul 2017 10:10:42 -0500 Subject: [PATCH 11/16] PEP8 --- xarray/backends/pynio_.py | 2 +- xarray/conventions.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/xarray/backends/pynio_.py b/xarray/backends/pynio_.py index ad70d79a298..f70c286a990 100644 --- a/xarray/backends/pynio_.py +++ b/xarray/backends/pynio_.py @@ -44,7 +44,7 @@ def __init__(self, filename, mode='r', autoclose=False): self.ds = opener() # xarray provides its own support for FillValue, # so turn off PyNIO's support for the same. - self.ds.set_option('MaskedArrayMode','MaskedNever') + self.ds.set_option('MaskedArrayMode', 'MaskedNever') self._autoclose = autoclose self._isopen = True self._opener = opener diff --git a/xarray/conventions.py b/xarray/conventions.py index 03e0fbb6ef0..c2250d0a1aa 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -823,7 +823,7 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True, data = CharToStringArray(data) is_unsigned = pop_to(attributes, encoding, '_Unsigned') - if (is_unsigned is not None) & (mask_and_scale == True): + if (is_unsigned is not None) & (mask_and_scale is True): if data.dtype.kind == 'i': data = UnsignedIntTypeArray(data) else: From 4a7e48139b9619d9a21aba32207c70c631640317 Mon Sep 17 00:00:00 2001 From: Eric Bruning Date: Sun, 16 Jul 2017 10:34:14 -0500 Subject: [PATCH 12/16] No need to convert unsigned fill value if there is not fill value --- xarray/conventions.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/xarray/conventions.py b/xarray/conventions.py index c2250d0a1aa..1df12474c87 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -852,13 +852,14 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True, RuntimeWarning, stacklevel=3) scale_factor = pop_to(attributes, encoding, 'scale_factor') add_offset = pop_to(attributes, encoding, 'add_offset') - if ((fill_value is not None and not np.any(pd.isnull(fill_value))) or - scale_factor is not None or add_offset is not None): + has_fill = (fill_value is not None and + not np.any(pd.isnull(fill_value))) + if (has_fill or scale_factor is not None or add_offset is not None): if fill_value.dtype.kind in ['U', 'S']: dtype = object else: dtype = float - if is_unsigned is not None: + if (is_unsigned is not None) and has_fill: # Need to convert the fill_value to unsigned, too # According to the CF spec, the fill value is of the same # type as its variable, i.e. 
its storage format on disk From d6a90c7ce4c06cdcd02682b6e1cdc07985e28a77 Mon Sep 17 00:00:00 2001 From: Eric Bruning Date: Sun, 16 Jul 2017 10:36:32 -0500 Subject: [PATCH 13/16] PEP8 --- xarray/conventions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/conventions.py b/xarray/conventions.py index 1df12474c87..d19c6e363c0 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -852,7 +852,7 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True, RuntimeWarning, stacklevel=3) scale_factor = pop_to(attributes, encoding, 'scale_factor') add_offset = pop_to(attributes, encoding, 'add_offset') - has_fill = (fill_value is not None and + has_fill = (fill_value is not None and not np.any(pd.isnull(fill_value))) if (has_fill or scale_factor is not None or add_offset is not None): if fill_value.dtype.kind in ['U', 'S']: From bef02b4e596f03c101265c2315fe83c0cf12d21a Mon Sep 17 00:00:00 2001 From: Eric Bruning Date: Mon, 17 Jul 2017 16:37:09 -0500 Subject: [PATCH 14/16] yet more PEP8 --- xarray/tests/test_backends.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 145983c5915..6e118fdca32 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -392,10 +392,8 @@ def test_unsigned_roundtrip_mask_and_scale(self): self.assertDatasetAllClose(encoded, actual) # make sure roundtrip encoding didn't change the # original dataset. - self.assertDatasetIdentical(encoded, - create_encoded_unsigned_masked_scaled_data( - ) - ) + self.assertDatasetIdentical( + encoded, create_encoded_unsigned_masked_scaled_data()) with self.roundtrip(encoded) as actual: self.assertDatasetAllClose(decoded, actual) with self.roundtrip(encoded, @@ -418,8 +416,7 @@ def test_roundtrip_mask_and_scale(self): # make sure roundtrip encoding didn't change the # original dataset. self.assertDatasetIdentical(encoded, - create_encoded_masked_and_scaled_data() - ) + create_encoded_masked_and_scaled_data()) with self.roundtrip(encoded) as actual: self.assertDatasetAllClose(decoded, actual) with self.roundtrip(encoded, From 748a1069888e6958b96e1b6ba75cfb7c8cfbca76 Mon Sep 17 00:00:00 2001 From: Eric Bruning Date: Mon, 17 Jul 2017 17:03:41 -0500 Subject: [PATCH 15/16] Be more careful with _Unsigned attribute check --- xarray/conventions.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/xarray/conventions.py b/xarray/conventions.py index d19c6e363c0..b3a885aab1d 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -665,7 +665,7 @@ def maybe_encode_dtype(var, name=None): 'any _FillValue to use for NaNs' % name, RuntimeWarning, stacklevel=3) data = duck_array_ops.around(data)[...] 
- if '_Unsigned' in encoding: + if encoding.get('_Unsigned', False): unsigned_dtype = 'i%s' % dtype.itemsize old_fill = np.asarray(attrs['_FillValue']) new_fill = old_fill.astype(unsigned_dtype) @@ -822,8 +822,9 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True, dimensions = dimensions[:-1] data = CharToStringArray(data) - is_unsigned = pop_to(attributes, encoding, '_Unsigned') - if (is_unsigned is not None) & (mask_and_scale is True): + pop_to(attributes, encoding, '_Unsigned') + is_unsigned = encoding.get('_Unsigned', False) + if (is_unsigned) and (mask_and_scale is True): if data.dtype.kind == 'i': data = UnsignedIntTypeArray(data) else: @@ -859,7 +860,7 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True, dtype = object else: dtype = float - if (is_unsigned is not None) and has_fill: + if is_unsigned and has_fill: # Need to convert the fill_value to unsigned, too # According to the CF spec, the fill value is of the same # type as its variable, i.e. its storage format on disk From 9bec545bfa668765858c707498a38c3663e20519 Mon Sep 17 00:00:00 2001 From: Eric Bruning Date: Fri, 21 Jul 2017 14:43:09 -0500 Subject: [PATCH 16/16] Better fencing for attribute checks. Style fixes. Test for dtypes. --- xarray/conventions.py | 24 +++++++++++++----------- xarray/tests/test_backends.py | 17 +++++++++++++++-- 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/xarray/conventions.py b/xarray/conventions.py index b3a885aab1d..fac70ac4615 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -666,11 +666,12 @@ def maybe_encode_dtype(var, name=None): RuntimeWarning, stacklevel=3) data = duck_array_ops.around(data)[...] if encoding.get('_Unsigned', False): - unsigned_dtype = 'i%s' % dtype.itemsize - old_fill = np.asarray(attrs['_FillValue']) - new_fill = old_fill.astype(unsigned_dtype) - attrs['_FillValue'] = new_fill - data = data.astype(unsigned_dtype) + signed_dtype = 'i%s' % dtype.itemsize + if '_FillValue' in var.attrs: + old_fill = np.asarray(attrs['_FillValue']) + new_fill = old_fill.astype(signed_dtype) + attrs['_FillValue'] = new_fill + data = data.astype(signed_dtype) pop_to(encoding, attrs, '_Unsigned') if dtype == 'S1' and data.dtype != 'S1': data = string_to_char(np.asarray(data, 'S')) @@ -824,7 +825,7 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True, pop_to(attributes, encoding, '_Unsigned') is_unsigned = encoding.get('_Unsigned', False) - if (is_unsigned) and (mask_and_scale is True): + if is_unsigned and mask_and_scale: if data.dtype.kind == 'i': data = UnsignedIntTypeArray(data) else: @@ -860,11 +861,12 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True, dtype = object else: dtype = float - if is_unsigned and has_fill: - # Need to convert the fill_value to unsigned, too - # According to the CF spec, the fill value is of the same - # type as its variable, i.e. its storage format on disk - fill_value = np.asarray(fill_value, dtype=data.unsigned_dtype) + # According to the CF spec, the fill value is of the same + # type as its variable, i.e. its storage format on disk. + # This handles the case where the fill_value also needs to be + # converted to its unsigned value. 
+ if has_fill: + fill_value = np.asarray(fill_value, dtype=data.dtype) data = MaskedAndScaledArray(data, fill_value, scale_factor, add_offset, dtype) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 6e118fdca32..f5bd615cfe2 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -381,23 +381,36 @@ def test_unsigned_roundtrip_mask_and_scale(self): decoded = create_unsigned_masked_scaled_data() encoded = create_encoded_unsigned_masked_scaled_data() with self.roundtrip(decoded) as actual: + for k in decoded.variables: + self.assertEqual(decoded.variables[k].dtype, + actual.variables[k].dtype) self.assertDatasetAllClose(decoded, actual) with self.roundtrip(decoded, open_kwargs=dict(decode_cf=False)) as actual: - # TODO: this assumes that all roundtrips will first - # encode. Is that something we want to test for? + for k in encoded.variables: + self.assertEqual(encoded.variables[k].dtype, + actual.variables[k].dtype) self.assertDatasetAllClose(encoded, actual) with self.roundtrip(encoded, open_kwargs=dict(decode_cf=False)) as actual: + for k in encoded.variables: + self.assertEqual(encoded.variables[k].dtype, + actual.variables[k].dtype) self.assertDatasetAllClose(encoded, actual) # make sure roundtrip encoding didn't change the # original dataset. self.assertDatasetIdentical( encoded, create_encoded_unsigned_masked_scaled_data()) with self.roundtrip(encoded) as actual: + for k in decoded.variables: + self.assertEqual(decoded.variables[k].dtype, + actual.variables[k].dtype) self.assertDatasetAllClose(decoded, actual) with self.roundtrip(encoded, open_kwargs=dict(decode_cf=False)) as actual: + for k in encoded.variables: + self.assertEqual(encoded.variables[k].dtype, + actual.variables[k].dtype) self.assertDatasetAllClose(encoded, actual) def test_roundtrip_mask_and_scale(self):
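For context (not part of the patch series): a minimal sketch of the end-to-end decoding behaviour these patches provide, using the same values as create_unsigned_masked_scaled_data() and create_encoded_unsigned_masked_scaled_data() above, and assuming an xarray build that includes this series:

    import numpy as np
    import xarray as xr

    # Signed bytes as a netCDF file would store them, plus the CF attributes.
    sb = np.asarray([0, 1, 127, -128, -1], dtype='i1')
    attrs = {'_FillValue': -1, '_Unsigned': 'true',
             'add_offset': 10, 'scale_factor': np.float32(0.1)}
    encoded = xr.Dataset({'x': ('t', sb, attrs)})

    decoded = xr.conventions.decode_cf(encoded)
    # The bytes are reinterpreted as unsigned [0, 1, 127, 128, 255], the
    # converted fill value 255 becomes NaN, and scale/offset are applied:
    print(decoded['x'].values)  # approximately [10.0, 10.1, 22.7, 22.8, nan]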