diff --git a/xray/backends/netCDF4_.py b/xray/backends/netCDF4_.py
index 1094c142039..8a924ab35f8 100644
--- a/xray/backends/netCDF4_.py
+++ b/xray/backends/netCDF4_.py
@@ -86,8 +86,7 @@ class NetCDF4DataStore(AbstractWritableDataStore):
     This store supports NetCDF3, NetCDF4 and OpenDAP datasets.
     """
     def __init__(self, filename, mode='r', clobber=True, diskless=False,
-                 persist=False, format='NETCDF4', group=None,
-                 *args, **kwdargs):
+                 persist=False, format='NETCDF4', group=None):
         import netCDF4 as nc4
         ds = nc4.Dataset(filename, mode=mode, clobber=clobber,
                          diskless=diskless, persist=persist,
@@ -95,15 +94,11 @@ def __init__(self, filename, mode='r', clobber=True, diskless=False,
         self.ds = _nc4_group(ds, group)
         self.format = format
         self._filename = filename
-        self._encoder_args = args
-        self._encoder_kwdargs = kwdargs
 
     def store(self, variables, attributes):
         # All NetCDF files get CF encoded by default, without this attempting
         # to write times, for example, would fail.
-        cf_variables, cf_attrs = cf_encoder(variables, attributes,
-                                            *self._encoder_args,
-                                            **self._encoder_kwdargs)
+        cf_variables, cf_attrs = cf_encoder(variables, attributes)
         AbstractWritableDataStore.store(self, cf_variables, cf_attrs)
 
     def open_store_variable(self, var):
diff --git a/xray/backends/scipy_.py b/xray/backends/scipy_.py
index 25a45374a16..e92232951e9 100644
--- a/xray/backends/scipy_.py
+++ b/xray/backends/scipy_.py
@@ -34,8 +34,7 @@ class ScipyDataStore(AbstractWritableDataStore):
 
     It only supports the NetCDF3 file-format.
     """
-    def __init__(self, filename_or_obj, mode='r', mmap=None,
-                 version=1, *args, **kwdargs):
+    def __init__(self, filename_or_obj, mode='r', mmap=None, version=1):
         import scipy
         if mode != 'r' and scipy.__version__ < '0.13':
             warnings.warn('scipy %s detected; '
@@ -53,15 +52,11 @@ def __init__(self, filename_or_obj, mode='r', mmap=None,
             filename_or_obj = BytesIO(filename_or_obj)
         self.ds = scipy.io.netcdf.netcdf_file(
             filename_or_obj, mode=mode, mmap=mmap, version=version)
-        self._encoder_args = args
-        self._encoder_kwdargs = kwdargs
 
     def store(self, variables, attributes):
         # All Scipy objects get CF encoded by default, without this attempting
         # to write times, for example, would fail.
-        cf_variables, cf_attrs = cf_encoder(variables, attributes,
-                                            *self._encoder_args,
-                                            **self._encoder_kwdargs)
+        cf_variables, cf_attrs = cf_encoder(variables, attributes)
         AbstractWritableDataStore.store(self, cf_variables, cf_attrs)
 
     def open_store_variable(self, var):
diff --git a/xray/conventions.py b/xray/conventions.py
index c30f58d8124..dd33b720253 100644
--- a/xray/conventions.py
+++ b/xray/conventions.py
@@ -1,3 +1,4 @@
+import functools
 import numpy as np
 import pandas as pd
 import warnings
@@ -372,6 +373,80 @@ def pop_to(source, dest, key, default=None):
     return value
 
 
+def _var_as_tuple(var):
+    """Return var's dims, values and copies of its attrs and encoding."""
+    return var.dims, var.values, var.attrs.copy(), var.encoding.copy()
+
+
+def maybe_encode_datetime(var):
+    """Convert datetime values into numbers with 'units'/'calendar' attrs."""
+    if (np.issubdtype(var.dtype, np.datetime64)
+            or (var.dtype.kind == 'O'
+                and isinstance(var.values.flat[0], datetime))):
+
+        dims, values, attrs, encoding = _var_as_tuple(var)
+        if 'units' in attrs or 'calendar' in attrs:
+            raise ValueError(
+                "Failed hard to prevent overwriting 'units' or 'calendar'")
+
+        (values, units, calendar) = encode_cf_datetime(
+            values, encoding.pop('units', None), encoding.pop('calendar', None))
+        attrs['units'] = units
+        attrs['calendar'] = calendar
+        var = Variable(dims, values, attrs, encoding)
+    return var
+
+
+def maybe_encode_offset_and_scale(var, needs_copy=True):
+    """Subtract 'add_offset' and divide by 'scale_factor' if encoded."""
+    if any(k in var.encoding for k in ['add_offset', 'scale_factor']):
+        dims, values, attrs, encoding = _var_as_tuple(var)
+        values = np.array(values, dtype=float, copy=needs_copy)
+        needs_copy = False
+        if 'add_offset' in encoding:
+            values -= pop_to(encoding, attrs, 'add_offset')
+        if 'scale_factor' in encoding:
+            values /= pop_to(encoding, attrs, 'scale_factor')
+        var = Variable(dims, values, attrs, encoding)
+    return var, needs_copy
+
+
+def maybe_encode_fill_value(var, needs_copy=True):
+    """Replace missing values with the '_FillValue' from encoding."""
+    # replace NaN with the fill value
+    if '_FillValue' in var.encoding:
+        dims, values, attrs, encoding = _var_as_tuple(var)
+        fill_value = pop_to(encoding, attrs, '_FillValue')
+        if not pd.isnull(fill_value):
+            missing = pd.isnull(values)
+            if missing.any():
+                if needs_copy:
+                    values = values.copy()
+                    needs_copy = False
+                values[missing] = fill_value
+        var = Variable(dims, values, attrs, encoding)
+    return var, needs_copy
+
+
+def maybe_encode_dtype(var, needs_copy=True):
+    """Cast values to the 'dtype' given in encoding, rounding for ints."""
+    if 'dtype' in var.encoding:
+        dims, values, attrs, encoding = _var_as_tuple(var)
+        dtype = np.dtype(encoding.pop('dtype'))
+        if dtype.kind != 'O':
+            if np.issubdtype(dtype, int):
+                out = np.empty_like(values) if needs_copy else values
+                # np.around fills `out` but leaves `values` untouched, so
+                # rebind it or the rounding is lost when a copy is made.
+                values = np.around(values, out=out)
+            if dtype == 'S1' and values.dtype != 'S1':
+                values = string_to_char(np.asarray(values, 'S'))
+                dims = dims + ('string%s' % values.shape[-1],)
+            values = np.asarray(values, dtype=dtype)
+        var = Variable(dims, values, attrs, encoding)
+    return var
+
+
 def _infer_dtype(array):
     """Given an object array with no missing values, infer its dtype from its
     first element
@@ -390,7 +465,37 @@ def _infer_dtype(array):
     return dtype
 
 
-def encode_cf_variable(var):
+def ensure_dtype_not_object(var):
+    """Convert object arrays to a dtype inferred from their elements."""
+    # TODO: move this from conventions to backends? (it's not CF related)
+    if var.dtype.kind == 'O':
+        dims, values, attrs, encoding = _var_as_tuple(var)
+        missing = pd.isnull(values)
+        if missing.any():
+            non_missing_values = values[~missing]
+            inferred_dtype = _infer_dtype(non_missing_values)
+
+            if inferred_dtype.kind in ['S', 'U']:
+                # There is no safe bit-pattern for NA in typical binary string
+                # formats, so we can't set a fill_value. Unfortunately, this
+                # means we won't be able to restore string arrays with missing
+                # values.
+                fill_value = ''
+            else:
+                # insist on using float for numeric values
+                if not np.issubdtype(inferred_dtype, float):
+                    inferred_dtype = np.dtype(float)
+                fill_value = np.nan
+
+            values = np.array(values, dtype=inferred_dtype, copy=True)
+            values[missing] = fill_value
+        else:
+            values = np.asarray(values, dtype=_infer_dtype(values))
+        var = Variable(dims, values, attrs, encoding)
+    return var
+
+
+def encode_cf_variable(var, needs_copy=True):
     """
     Converts an Variable into an Variable which follows some
     of the CF conventions:
@@ -410,86 +515,12 @@ def encode_cf_variable(var):
     out : xray.Variable
         A variable which has been encoded as described above.
     """
-    dimensions = var.dims
-    data = var.values
-    attributes = var.attrs.copy()
-    encoding = var.encoding.copy()
-
-    # convert datetimes into numbers
-    if (np.issubdtype(data.dtype, np.datetime64)
-            or (data.dtype.kind == 'O'
-                and isinstance(data.reshape(-1)[0], datetime))):
-        if 'units' in attributes or 'calendar' in attributes:
-            raise ValueError(
-                "Failed hard to prevent overwriting 'units' or 'calendar'")
-        (data, units, calendar) = encode_cf_datetime(
-            data, encoding.pop('units', None), encoding.pop('calendar', None))
-        attributes['units'] = units
-        attributes['calendar'] = calendar
-
-    # unscale/mask
-    if any(k in encoding for k in ['add_offset', 'scale_factor']):
-        data = np.array(data, dtype=float, copy=True)
-        if 'add_offset' in encoding:
-            data -= pop_to(encoding, attributes, 'add_offset')
-        if 'scale_factor' in encoding:
-            data /= pop_to(encoding, attributes, 'scale_factor')
-
-    # replace NaN with the fill value
-    if '_FillValue' in encoding:
-        fill_value = pop_to(encoding, attributes, '_FillValue')
-        if not pd.isnull(fill_value):
-            missing = pd.isnull(data)
-            if missing.any():
-                data = data.copy()
-                data[missing] = fill_value
-
-    # replace NaN with the missing_value
-    if 'missing_value' in encoding:
-        missing_value = pop_to(encoding, attributes, 'missing_value')
-        if not pd.isnull(missing_value):
-            missing = pd.isnull(data)
-            if missing.any():
-                data = data.copy()
-                data[missing] = missing_value
-
-    # cast to encoded dtype
-    if 'dtype' in encoding:
-        dtype = np.dtype(encoding.pop('dtype'))
-        if dtype.kind != 'O':
-            if np.issubdtype(dtype, int):
-                data = data.round()
-            if dtype == 'S1' and data.dtype != 'S1':
-                data = string_to_char(np.asarray(data, 'S'))
-                dimensions = dimensions + ('string%s' % data.shape[-1],)
-            data = np.asarray(data, dtype=dtype)
-
-    # infer a valid dtype if necessary
-    # TODO: move this from conventions to backends (it's not CF related)
-    if data.dtype.kind == 'O':
-        missing = pd.isnull(data)
-        if missing.any():
-            non_missing_data = data[~missing]
-            inferred_dtype = _infer_dtype(non_missing_data)
-
-            if inferred_dtype.kind in ['S', 'U']:
-                # There is no safe bit-pattern for NA in typical binary string
-                # formats, we so can't set a fill_value. Unfortunately, this
-                # means we won't be able to restore string arrays with missing
-                # values.
-                fill_value = ''
-            else:
-                # insist on using float for numeric data
-                if not np.issubdtype(inferred_dtype, float):
-                    inferred_dtype = np.dtype(float)
-                fill_value = np.nan
-
-            data = np.array(data, dtype=inferred_dtype, copy=True)
-            data[missing] = fill_value
-        else:
-            data = np.asarray(data, dtype=_infer_dtype(data))
-
-    return Variable(dimensions, data, attributes, encoding=encoding)
+    var = maybe_encode_datetime(var)
+    var, needs_copy = maybe_encode_offset_and_scale(var, needs_copy)
+    var, needs_copy = maybe_encode_fill_value(var, needs_copy)
+    var = maybe_encode_dtype(var, needs_copy)
+    var = ensure_dtype_not_object(var)
+    return var
 
 
 def decode_cf_variable(var, concat_characters=True, mask_and_scale=True,
@@ -539,15 +570,15 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True,
         data = CharToStringArray(data)
 
     if mask_and_scale:
-        # missing_value is deprecated, but we still want to support it.
-        missing_value = pop_to(attributes, encoding, 'missing_value')
+        if 'missing_value' in attributes:
+            # missing_value is deprecated, but we still want to support it as
+            # an alias for _FillValue.
+            assert ('_FillValue' not in attributes
+                    or utils.equivalent(attributes['_FillValue'],
+                                        attributes['missing_value']))
+            attributes['_FillValue'] = attributes.pop('missing_value')
+
         fill_value = pop_to(attributes, encoding, '_FillValue')
-        # if missing_value is given but not fill_value we use missing_value
-        if fill_value is None and missing_value is not None:
-            fill_value = missing_value
-        # if both were given we make sure they are the same.
-        if fill_value is not None and missing_value is not None:
-            assert fill_value == missing_value
         scale_factor = pop_to(attributes, encoding, 'scale_factor')
         add_offset = pop_to(attributes, encoding, 'add_offset')
         if ((fill_value is not None and not pd.isnull(fill_value))
diff --git a/xray/core/dataset.py b/xray/core/dataset.py
index 59f42933986..13cce5b2794 100644
--- a/xray/core/dataset.py
+++ b/xray/core/dataset.py
@@ -84,11 +84,14 @@ def open_dataset(nc, decode_cf=True, mask_and_scale=True, decode_times=True,
         # If nc is a file-like object we read it using
         # the scipy.io.netcdf package
         store = backends.ScipyDataStore(nc, *args, **kwargs)
-    decoder = conventions.cf_decoder if decode_cf else None
-    return Dataset.load_store(store, decoder=decoder,
-                              mask_and_scale=mask_and_scale,
-                              decode_times=decode_times,
-                              concat_characters=concat_characters)
+    if decode_cf:
+        decoder = functools.partial(conventions.cf_decoder,
+                                    mask_and_scale=mask_and_scale,
+                                    decode_times=decode_times,
+                                    concat_characters=concat_characters)
+    else:
+        decoder = None
+    return Dataset.load_store(store, decoder=decoder)
 
 
 # list of attributes of pd.DatetimeIndex that are ndarrays of time info
@@ -399,14 +402,13 @@ def _set_init_vars_and_dims(self, vars, coords):
                                 check_coord_names=False)
 
     @classmethod
-    def load_store(cls, store, decoder=None, *args, **kwdargs):
+    def load_store(cls, store, decoder=None):
         """Create a new dataset from the contents of a backends.*DataStore
         object
         """
         variables, attributes = store.load()
         if decoder:
-            variables, attributes = decoder(variables, attributes,
-                                            *args, **kwdargs)
+            variables, attributes = decoder(variables, attributes)
         obj = cls(variables, attrs=attributes)
         obj._file_obj = store
         return obj
@@ -785,13 +787,11 @@ def reset_coords(self, names=None, drop=False, inplace=False):
                 del obj._arrays[name]
         return obj
 
-    def dump_to_store(self, store, encoder=None,
-                      *args, **kwdargs):
+    def dump_to_store(self, store, encoder=None):
         """Store dataset contents to a backends.*DataStore object."""
         variables, attributes = self, self.attrs
         if encoder:
-            variables, attributes = encoder(variables, attributes,
-                                            *args, **kwdargs)
+            variables, attributes = encoder(variables, attributes)
         store.store(variables, attributes)
         store.sync()
 
diff --git a/xray/test/test_dataset.py b/xray/test/test_dataset.py
index 87e12861456..1531620e6fb 100644
--- a/xray/test/test_dataset.py
+++ b/xray/test/test_dataset.py
@@ -8,7 +8,8 @@
 import numpy as np
 import pandas as pd
 
-from xray import align, concat, backends, Dataset, DataArray, Variable
+from xray import (align, concat, conventions, backends, Dataset, DataArray,
+                  Variable)
 from xray.core import indexing, utils
 from xray.core.pycompat import iteritems, OrderedDict
 
@@ -1020,8 +1021,8 @@ def test_lazy_load(self):
         store = InaccessibleVariableDataStore()
         create_test_data().dump_to_store(store)
 
-        for decode_cf in [False, True]:
-            ds = Dataset.load_store(store, decode_cf=decode_cf)
+        for decoder in [None, conventions.cf_decoder]:
+            ds = Dataset.load_store(store, decoder=decoder)
             with self.assertRaises(UnexpectedDataAccess):
                 ds.load_data()
             with self.assertRaises(UnexpectedDataAccess):