From c185df55d9ccfdb62915c362a09bd724a73de2d4 Mon Sep 17 00:00:00 2001
From: Alex Kleeman
Date: Tue, 24 Jun 2014 05:57:41 -0700
Subject: [PATCH 1/5] Restructured Backends to make CF convention application
 more consistent.

Amongst other things this includes:

- EncodedDataStores which can wrap other stores and allow for
  modular encoding/decoding.
- Trivial indices (ds['x'] = ('x', np.arange(10))) are no longer
  stored on disk and are only created when accessed.
- AbstractDataStore API change. Shouldn't affect external users.
- missing_value attributes now function like _FillValue.

All current tests are passing (though it could use more new ones).

Post-rebase notes (shoyer, Oct 2, 2014):

Most tests are passing, though a couple are broken:
- test_roundtrip_mask_and_scale (because this change needs a fix to not
  break the current API)
- test_roundtrip_strings_with_fill_value on TestCFEncodedDataStore (I
  don't entirely understand why; let's come back to it later)
---
 xray/backends/common.py       | 214 ++++++++++++++++++++++++++++++++--
 xray/backends/memory.py       |  36 +++---
 xray/backends/netCDF4_.py     |  24 ++--
 xray/backends/pydap_.py       |  12 +-
 xray/backends/scipy_.py       |  18 +--
 xray/conventions.py           | 155 +++++++++++++++++++++++-
 xray/core/dataset.py          |  28 +++--
 xray/core/indexing.py         |  41 +++++++
 xray/test/test_backends.py    | 147 ++++++++++++++++-------
 xray/test/test_conventions.py |  70 ++++++++++-
 xray/test/test_dataset.py     |  22 +---
 11 files changed, 646 insertions(+), 121 deletions(-)

diff --git a/xray/backends/common.py b/xray/backends/common.py
index f651002c741..e20515a2383 100644
--- a/xray/backends/common.py
+++ b/xray/backends/common.py
@@ -1,7 +1,11 @@
 import numpy as np
+import inspect
+import itertools
+import functools
 
 from ..core.utils import FrozenOrderedDict
 from ..core.pycompat import iteritems
+from ..core.variable import Coordinate
 
 NONE_VAR_NAME = '__values__'
 
@@ -19,22 +23,52 @@ def _decode_variable_name(name):
     return name
 
 
+def is_trivial_index(var):
+    """
+    Determines if an index is 'trivial' mean that it is
+    equivalent to np.arange(). This is determined by
+    checking if there are any attributes or encodings,
+    if ndims is one, dtype is int and finally by comparing
+    the actual values to np.arange()
+    """
+    # if either attributes or encodings are defined
+    # the index is not trivial.
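+    # e.g. Coordinate('x', np.arange(5)) with empty attrs/encoding is
+    # trivial, while Coordinate('x', [0, 2, 4]) is not (illustrative
+    # examples, not from the original patch).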
+ if len(var.attrs) or len(var.encoding): + return False + # if the index is not a 1d integer array + if var.ndim > 1 or not var.dtype.kind == 'i': + return False + if isinstance(var, Coordinate): + arange = np.arange(var.size, dtype=var.dtype) + if np.any(var.values != arange): + return False + return True + + class AbstractDataStore(object): - def open_store_variable(self, v): + + def get_attrs(self): raise NotImplementedError - @property - def store_variables(self): - return self.ds.variables + def get_variables(self): + raise NotImplementedError + + def get_dimensions(self): + return list(itertools.chain(*[x.dimensions + for x in self.get_variables().values()])) @property def variables(self): - return FrozenOrderedDict((_decode_variable_name(k), - self.open_store_variable(v)) - for k, v in iteritems(self.store_variables)) + return FrozenOrderedDict((_decode_variable_name(k), v) + for k, v in iteritems(self.get_variables())) - def sync(self): - pass + @property + def attrs(self): + return FrozenOrderedDict(self.get_attrs()) + + @property + def dimensions(self): + return self.get_dimensions() def close(self): pass @@ -47,6 +81,30 @@ def __exit__(self, exception_type, exception_value, tracebook): class AbstractWritableDataStore(AbstractDataStore): + + def set_dimension(self, d, l): + raise NotImplementedError + + def set_attribute(self, k, v): + raise NotImplementedError + + def set_variable(self, k, v): + raise NotImplementedError + + def sync(self): + pass + + def store(self, dataset): + self.set_attributes(dataset.attrs) + neccesary_dims = [[d for d in v.dimensions] + for v in dataset.variables.values()] + neccesary_dims = set(itertools.chain(*neccesary_dims)) + # set all non-indexes and any index which is not trivial. + variables = {k: v for k, v in iteritems(dataset.variables) + if not (k in neccesary_dims and is_trivial_index(v))} + self.set_variables(variables) + #self.set_variables(dataset.variables) + def set_dimensions(self, dimensions): for d, l in iteritems(dimensions): self.set_dimension(d, l) @@ -58,8 +116,144 @@ def set_attributes(self, attributes): def set_variables(self, variables): for vn, v in iteritems(variables): self.set_variable(_encode_variable_name(vn), v) + self.set_necessary_dimensions(v) def set_necessary_dimensions(self, variable): for d, l in zip(variable.dims, variable.shape): - if d not in self.ds.dimensions: + if d not in self.dimensions: self.set_dimension(d, l) + + +class AbstractEncodedDataStore(AbstractWritableDataStore): + """ + AbstractEncodedDataStore is an interface for making a + DataStore which wraps another DataStore while first passing + all input/output through an encoding/decoding layer. + This allows more modular application of things such as + conforming to CF Conventions. + + There are no explicity restrictions requiring an + EncodedDataStore to be roundtrip-able, but when this is desired + (probably often) consider passing implementing + classes through test_backends:DatasetIOTestCases. + + Requires Implementation + -------- + encode : function(self, datastore) + + + decode : function(self, datastore) + + """ + def encode(self, datastore): + """ + A function which takes an un-encoded datastore and returns + a new DataStore (or Dataset) which has been encoded. Returning + an InMemoryDataStore for this is encouraged since it avoids + the xray consistency checks making it faster / more flexible. 
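+
+        A minimal sketch (it mirrors the cf_encoder helper added to
+        xray.conventions by this patch; any equivalent implementation
+        works):
+
+            def encode(self, datastore):
+                variables = OrderedDict((k, encode_cf_variable(v))
+                                        for k, v in
+                                        iteritems(datastore.variables))
+                return InMemoryDataStore({'variables': variables,
+                                          'attributes': datastore.attrs})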
+ + """ + raise NotImplementedError + + def decode(self, datastore): + """ + A function which takes an encoded datastore and returns + a new DataStore which has been decoded. Again consider + using an InMemoryDataStore, though returning a Dataset + will work perfectly fine in most situations. + + Also note that directly accessing variable data may cause + remote DataStores to be loaded into memory. + See conventions.decode_cf_variable for examples of wrapping + computations to make them lazy. + """ + raise NotImplementedError + + @property + def decoded(self): + if not hasattr(self, '_decoded'): + self._decoded = self.decode(self.ds) + return self._decoded + + def get_dimensions(self): + return self.decoded.dimensions + + def get_variables(self): + return self.decoded.variables + + def get_attrs(self): + return self.decoded.attrs + + def store(self, dataset): + self.ds.store(self.encode(dataset)) + self.ds.sync() + + def sync(self): + self.ds.sync() + + def close(self): + self.ds.close() + + +def encoding_decorator(encoder, decoder): + """ + This is a Class decorating function which makes wrapping DataStores + in additional encoding layers easier. + + Note that often times the encoders and decoders will require arguments + at class creation time. To handle this, the encoder and decoder args + are first inspected. Any arguments they require are used first, and + any remaining arguments are passed onto the DataStore being wrapped. + + Parameters + ---------- + encoder : function + Takes a Datastore (or Dataset) and returns an encoded Datastore. + decoder : function + Takes a Datastore (or Dataset) and returns a decoded Datastore. + + Returns + ------- + class_wrapper: A function which wraps a DataStore class and turns + it into an EncodingWrappedDataStore. + """ + + def class_wrapper(cls): + class EncodingWrappedDataStore(AbstractEncodedDataStore): + + def __init__(self, *args, **kwdargs): + # NOTE: we assume that any arguments for the encoder + # and decoder are keyword args. All position arguments + # are passed on to the DataStore. + encoder_argnames = set(inspect.getargspec(encoder).args[1:]) + decoder_argnames = set(inspect.getargspec(decoder).args[1:]) + # make sure there aren't any argument collisions, that would + # get pretty confusing. + constructor_args = set(inspect.getargspec(cls.__init__)[1:]) + if constructor_args.intersection(encoder_argnames): + bad_args = constructor_args.intersection(encoder_argnames) + raise ValueError("encoder and class have overlapping args: %s" + % ', '.join(bad_args)) + if constructor_args.intersection(decoder_argnames): + bad_args = constructor_args.intersection(decoder_argnames) + raise ValueError("decoder and class have overlapping args: %s" + % ', '.join(bad_args)) + # create a set of keyword arguments for both the encoder and decoder + encoder_args = {} + decoder_args = {} + for k in encoder_argnames.union(decoder_argnames): + if k in kwdargs: + v = kwdargs.pop(k) + if k in encoder_argnames: + encoder_args[k] = v + if k in decoder_argnames: + decoder_args[k] = v + # create the data store. 
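+                # (any keyword arguments the encoder/decoder did not
+                # claim are forwarded untouched to the wrapped DataStore)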
+ self.ds = cls(*args, **kwdargs) + # set the encode and decode function using the provided args + self.encode = functools.partial(encoder, **encoder_args) + self.decode = functools.partial(decoder, **decoder_args) + + return EncodingWrappedDataStore + + return class_wrapper diff --git a/xray/backends/memory.py b/xray/backends/memory.py index ec8352450ff..95f416d4d3b 100644 --- a/xray/backends/memory.py +++ b/xray/backends/memory.py @@ -1,4 +1,5 @@ from ..core.pycompat import OrderedDict +import copy from .common import AbstractWritableDataStore @@ -7,22 +8,29 @@ class InMemoryDataStore(AbstractWritableDataStore): """ Stores dimensions, variables and attributes in ordered dictionaries, making this store - fast compared to stores which store to disk. + fast compared to stores which save to disk. """ - def __init__(self): - self.dimensions = OrderedDict() - self.variables = OrderedDict() - self.attributes = OrderedDict() + def __init__(self, dict_store=None): + if dict_store is None: + dict_store = {} + dict_store['variables'] = OrderedDict() + dict_store['attributes'] = OrderedDict() + self.ds = dict_store - def set_dimension(self, name, length): - self.dimensions[name] = length + def get_attrs(self): + return self.ds['attributes'] - def set_attribute(self, key, value): - self.attributes[key] = value + def get_variables(self): + return self.ds['variables'] - def set_variable(self, name, variable): - self.variables[name] = variable - return self.variables[name] + def set_variable(self, k, v): + new_var = copy.deepcopy(v) + # we copy the variable and stuff all encodings in the + # attributes to imitate what happens when writting to disk. + new_var.attrs.update(new_var.encoding) + new_var.encoding.clear() + self.ds['variables'][k] = new_var - def del_attribute(self, key): - del self.attributes[key] + def set_attribute(self, k, v): + # copy to imitate writing to disk. + self.ds['attributes'][k] = copy.deepcopy(v) diff --git a/xray/backends/netCDF4_.py b/xray/backends/netCDF4_.py index aaabcf121d7..782b54ca76b 100644 --- a/xray/backends/netCDF4_.py +++ b/xray/backends/netCDF4_.py @@ -3,7 +3,7 @@ import numpy as np from .. import Variable -from ..conventions import encode_cf_variable +from ..conventions import encode_cf_variable, pop_to, cf_encoded from ..core import indexing from ..core.utils import FrozenOrderedDict, NDArrayMixin from ..core.pycompat import iteritems, basestring, OrderedDict @@ -80,6 +80,7 @@ def _ensure_fill_value_valid(data, attributes): attributes['_FillValue'] = np.string_(attributes['_FillValue']) +@cf_encoded class NetCDF4DataStore(AbstractWritableDataStore): """Store for reading and writing data via the Python-NetCDF4 library. 
@@ -118,19 +119,22 @@ def open_store_variable(self, var): # TODO: figure out how to round-trip "endian-ness" without raising # warnings from netCDF4 # encoding['endian'] = var.endian() - encoding['least_significant_digit'] = \ - attributes.pop('least_significant_digit', None) + # encoding['least_significant_digit'] = \ + # attributes.pop('least_significant_digit', None) + pop_to(attributes, encoding, 'least_significant_digit') # save source so __repr__ can detect if it's local or not encoding['source'] = self._filename return Variable(dimensions, data, attributes, encoding) - @property - def attrs(self): + def get_variables(self): + return FrozenOrderedDict((k, self.open_store_variable(v)) + for k, v in iteritems(self.ds.variables)) + + def get_attrs(self): return FrozenOrderedDict((k, self.ds.getncattr(k)) for k in self.ds.ncattrs()) - @property - def dimensions(self): + def get_dimensions(self): return FrozenOrderedDict((k, len(v)) for k, v in iteritems(self.ds.dimensions)) @@ -141,7 +145,7 @@ def set_attribute(self, key, value): self.ds.setncattr(key, value) def set_variable(self, name, variable): - variable = encode_cf_variable(variable) + attrs = variable.attrs.copy() if self.format == 'NETCDF4': variable, datatype = _nc4_values_and_dtype(variable) else: @@ -150,7 +154,7 @@ def set_variable(self, name, variable): self.set_necessary_dimensions(variable) - fill_value = variable.attrs.pop('_FillValue', None) + fill_value = attrs.pop('_FillValue', None) if fill_value in ['', '\x00']: # these are equivalent to the default FillValue, but netCDF4 # doesn't like setting fill_value to an empty string @@ -172,7 +176,7 @@ def set_variable(self, name, variable): fill_value=fill_value) nc4_var.set_auto_maskandscale(False) nc4_var[:] = variable.values - for k, v in iteritems(variable.attrs): + for k, v in iteritems(attrs): # set attributes one-by-one since netCDF4<1.0.10 can't handle # OrderedDict as the input to setncatts nc4_var.setncattr(k, v) diff --git a/xray/backends/pydap_.py b/xray/backends/pydap_.py index 21377cd28d4..49dda7ff065 100644 --- a/xray/backends/pydap_.py +++ b/xray/backends/pydap_.py @@ -50,10 +50,12 @@ def open_store_variable(self, var): data = indexing.LazilyIndexedArray(PydapArrayWrapper(var)) return Variable(var.dimensions, data, var.attributes) - @property - def store_variables(self): - return self.ds + def get_variables(self): + return FrozenOrderedDict((k, self.open_store_variable(v)) + for k, v in self.ds.iteritems()) - @property - def attrs(self): + def get_attrs(self): return Frozen(self.ds.attributes) + + def get_dimensions(self): + return Frozen(self.ds.dimensions) diff --git a/xray/backends/scipy_.py b/xray/backends/scipy_.py index 5d1d15a6c62..20d277881fe 100644 --- a/xray/backends/scipy_.py +++ b/xray/backends/scipy_.py @@ -5,7 +5,7 @@ from .. import conventions, Variable from ..core.pycompat import iteritems, basestring, unicode_type, OrderedDict -from ..core.utils import Frozen +from ..core.utils import Frozen, FrozenOrderedDict from .common import AbstractWritableDataStore from .netcdf3 import is_valid_nc3_name, coerce_nc3_dtype, encode_nc3_variable @@ -23,7 +23,7 @@ def _decode_attrs(d): return OrderedDict((k, v if k == '_FillValue' else _decode_string(v)) for (k, v) in iteritems(d)) - +@conventions.cf_encoded class ScipyDataStore(AbstractWritableDataStore): """Store for reading and writing data via scipy.io.netcdf. 
@@ -55,12 +55,14 @@ def open_store_variable(self, var):
         return Variable(var.dimensions, var.data,
                         _decode_attrs(var._attributes))
 
-    @property
-    def attrs(self):
+    def get_variables(self):
+        return FrozenOrderedDict((k, self.open_store_variable(v))
+                                 for k, v in iteritems(self.ds.variables))
+
+    def get_attrs(self):
         return Frozen(_decode_attrs(self.ds._attributes))
 
-    @property
-    def dimensions(self):
+    def get_dimensions(self):
         return Frozen(self.ds.dimensions)
 
     def set_dimension(self, name, length):
@@ -88,8 +90,8 @@ def set_attribute(self, key, value):
         setattr(self.ds, key, self._cast_attr_value(value))
 
     def set_variable(self, name, variable):
-        variable = encode_nc3_variable(
-            conventions.encode_cf_variable(variable))
+        # TODO: create a netCDF3 encoder
+        variable = encode_nc3_variable(variable)
         self.set_necessary_dimensions(variable)
         data = variable.values
         self.ds.createVariable(name, data.dtype, variable.dims)
diff --git a/xray/conventions.py b/xray/conventions.py
index 94af6fd2f1a..26e49797f24 100644
--- a/xray/conventions.py
+++ b/xray/conventions.py
@@ -8,6 +8,10 @@
 from .core.variable import as_variable, Variable
 from .core.pycompat import iteritems, bytes_type, unicode_type, OrderedDict
 
+from .backends.common import encoding_decorator
+from .backends import InMemoryDataStore
+
+
 # standard calendars recognized by netcdftime
 _STANDARD_CALENDARS = set(['standard', 'gregorian', 'proleptic_gregorian'])
 
@@ -390,8 +394,24 @@ def _infer_dtype(array):
 
 
 def encode_cf_variable(var):
-    """Converts an Variable into an Variable suitable for saving as a netCDF
-    variable
+    """
+    Converts a Variable into a Variable which follows some
+    of the CF conventions:
+
+        - NaNs are masked using _FillValue (or the deprecated missing_value)
+        - Rescaling via scale_factor and add_offset
+        - datetimes are converted to the CF 'units since time' format
+        - dtype encodings are enforced.
+
+    Parameters
+    ----------
+    var : xray.Variable
+        A variable holding un-encoded data.
+
+    Returns
+    -------
+    out : xray.Variable
+        A variable which has been encoded as described above.
     """
     dimensions = var.dims
     data = var.values
@@ -427,6 +447,15 @@ def encode_cf_variable(var):
             data = data.copy()
             data[missing] = fill_value
 
+    # replace NaN with the missing_value
+    if 'missing_value' in encoding:
+        missing_value = pop_to(encoding, attributes, 'missing_value')
+        if not pd.isnull(missing_value):
+            missing = pd.isnull(data)
+            if missing.any():
+                data = data.copy()
+                data[missing] = missing_value
+
     # cast to encoded dtype
     if 'dtype' in encoding:
         dtype = np.dtype(encoding.pop('dtype'))
@@ -448,7 +477,7 @@ def encode_cf_variable(var):
 
         if inferred_dtype.kind in ['S', 'U']:
             # There is no safe bit-pattern for NA in typical binary string
-            # formats, we so can't set a _FillValue. Unfortunately, this
+            # formats, so we can't set a fill_value. Unfortunately, this
             # means we won't be able to restore string arrays with missing
             # values.
             fill_value = ''
@@ -468,6 +497,32 @@ def encode_cf_variable(var):
 
 def decode_cf_variable(var, concat_characters=True, mask_and_scale=True,
                        decode_times=True):
+    """
+    Decodes a variable which may hold CF encoded information.
+
+    This includes variables that have been masked and scaled, that
+    hold CF style time values (this is almost always the case if
+    the dataset has been serialized) and that have strings encoded
+    as character arrays.
+
+    Parameters
+    ----------
+    var : Variable
+        A variable holding potentially CF encoded information.
+    concat_characters : bool
+        Should character arrays be concatenated to strings, for
+        example: ['h', 'e', 'l', 'l', 'o'] -> 'hello'
+    mask_and_scale: bool
+        Lazily scale (using scale_factor and add_offset) and mask
+        (using _FillValue).
+    decode_times : bool
+        Decode cf times ('hours since 2000-01-01') to np.datetime64.
+
+    Returns
+    -------
+    out : Variable
+        A variable holding the decoded equivalent of var
+    """
     # use _data instead of data so as not to trigger loading data
     var = as_variable(var)
     data = var._data
@@ -477,8 +532,9 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True,
 
     if 'dtype' in encoding:
         if data.dtype != encoding['dtype']:
-            raise ValueError("Refused to overwrite dtype")
-    encoding['dtype'] = data.dtype
+            warnings.warn("CF decoding is overwriting dtype")
+        else:
+            encoding['dtype'] = data.dtype
 
     if concat_characters:
         if data.dtype.kind == 'S' and data.dtype.itemsize == 1:
@@ -486,7 +542,15 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True,
             data = CharToStringArray(data)
 
     if mask_and_scale:
+        # missing_value is deprecated, but we still want to support it.
+        missing_value = pop_to(attributes, encoding, 'missing_value')
         fill_value = pop_to(attributes, encoding, '_FillValue')
+        # if missing_value is given but not fill_value we use missing_value
+        if fill_value is None and missing_value is not None:
+            fill_value = missing_value
+        # if both were given we make sure they are the same.
+        if fill_value is not None and missing_value is not None:
+            assert fill_value == missing_value
         scale_factor = pop_to(attributes, encoding, 'scale_factor')
         add_offset = pop_to(attributes, encoding, 'add_offset')
         if ((fill_value is not None and not pd.isnull(fill_value))
@@ -510,7 +574,10 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True,
 
 def decode_cf_variables(variables, concat_characters=True, mask_and_scale=True,
                         decode_times=True):
-    """Decode a bunch of CF variables together.
+    """
+    Decode several CF encoded variables.
+
+    See: decode_cf_variable
     """
     dimensions_used_by = defaultdict(list)
     for v in variables.values():
@@ -534,3 +601,79 @@ def stackable(dim):
             v, concat_characters=concat, mask_and_scale=mask_and_scale,
             decode_times=decode_times)
     return new_vars
+
+
+def cf_decoder(ds, concat_characters=True, mask_and_scale=True,
+               decode_times=True, decode_cf=True):
+    """
+    Decode a data store or Dataset which holds CF encoded variables.
+
+    See Also, decode_cf_variable
+
+    Parameters
+    ----------
+    ds : Datastore
+        This can technically be any object with properties 'variables'
+        and 'attrs' and whose constructor follows type(ds)(variables, attrs)
+    concat_characters : bool
+        Should character arrays be concatenated to strings, for
+        example: ['h', 'e', 'l', 'l', 'o'] -> 'hello'
+    mask_and_scale: bool
+        Lazily scale (using scale_factor and add_offset) and mask
+        (using _FillValue).
+    decode_times : bool
+        Decode cf times ('hours since 2000-01-01') to np.datetime64.
+    decode_cf : bool
+        If false this skips decoding. This is around for backward
+        compatibility.
+    Returns
+    -------
+    ds : DataStore
+        A DataStore holding the decoded variables and attributes
+    """
+    # if decode_cf is false, we do nothing.
+    if not decode_cf:
+        return ds
+    new_vars = decode_cf_variables(ds.variables, concat_characters,
+                                   mask_and_scale, decode_times)
+    # Note that we don't return a Dataset because in some (though
+    # very few) cases the backend.stores are more flexible than
+    # xray. For example a string Index which gets expanded to
+    # a character array during CF encoding would result in the
+    # requirement for multidimensional indexes which xray does
+    # not currently support. Instead we store the variables as
+    # a dictionary of variables and attributes in an in-memory store.
+    return InMemoryDataStore({'variables': new_vars,
+                              'attributes': ds.attrs})
+
+
+def cf_encoder(ds, encode_cf=True):
+    """
+    A function which takes a DataStore (ds) and encodes its
+    variables and attributes to conform to CF conventions as much
+    as possible. This includes masking, scaling, character
+    array handling, and CF-time encoding.
+
+    See also: encode_cf_variable
+    """
+    if not encode_cf:
+        return ds
+    new_vars = OrderedDict((k, encode_cf_variable(v))
+                           for k, v in iteritems(ds.variables))
+    return InMemoryDataStore({'variables': new_vars,
+                              'attributes': ds.attrs})
+
+
+def cf_encoded(*args, **kwdargs):
+    """
+    This class decorator can be used to turn a DataStore into a
+    CF encoded DataStore. For example, to take some DataStore
+    and add a CF encoding layer you can do this:
+
+    @cf_encoded
+    class CFPunchCardDataStore(PunchCardDataStore):
+        pass
+
+    See also: encoding_decorator, cf_encoder, cf_decoder
+    """
+    return encoding_decorator(cf_encoder, cf_decoder)(*args, **kwdargs)
diff --git a/xray/core/dataset.py b/xray/core/dataset.py
index fcfb194a78e..79d0e8758ce 100644
--- a/xray/core/dataset.py
+++ b/xray/core/dataset.py
@@ -338,7 +338,10 @@ def _add_missing_coords(self):
         """
         for dim, size in iteritems(self.dims):
             if dim not in self._arrays:
-                coord = variable.Coordinate(dim, np.arange(size))
+                # This is equivalent to np.arange(size), but
+                # waits to create the array until it's actually accessed.
+                data = indexing.LazyIntegerRange(size)
+                coord = variable.Coordinate(dim, data)
                 self._arrays[dim] = coord
 
     def _update_vars_and_coords(self, new_arrays, new_coord_names={},
@@ -387,17 +390,16 @@ def _set_init_vars_and_dims(self, vars, coords):
                                     check_coord_names=False)
 
     @classmethod
-    def load_store(cls, store, decode_cf=True, mask_and_scale=True,
-                   decode_times=True, concat_characters=True):
+    def load_store(cls, store, decoder=None, *args, **kwdargs):
         """Create a new dataset from the contents of a backends.*DataStore
         object
         """
-        variables = store.variables
-        if decode_cf:
-            variables = conventions.decode_cf_variables(
-                variables, mask_and_scale=mask_and_scale,
-                decode_times=decode_times, concat_characters=concat_characters)
-        obj = cls(variables, attrs=store.attrs)
+        if decoder:
+            # here the new 'store' name is a bit overloaded, it will
+            # typically actually be a Dataset, but still functions
+            # the way a store does.
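+            # e.g. decoder=conventions.cf_decoder applies CF decoding
+            # on load (illustrative; any callable that accepts the store
+            # and returns a store-like object works here).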
+ store = decoder(store, *args, **kwdargs) + obj = cls(store.variables, attrs=store.attrs) obj._file_obj = store return obj @@ -775,10 +777,12 @@ def reset_coords(self, names=None, drop=False, inplace=False): del obj._arrays[name] return obj - def dump_to_store(self, store): + def dump_to_store(self, store, encoder=None): """Store dataset contents to a backends.*DataStore object.""" - store.set_variables(self._arrays) - store.set_attributes(self.attrs) + ds = self + if encoder: + ds = encoder(ds) + store.store(ds) store.sync() def to_netcdf(self, filepath, **kwdargs): diff --git a/xray/core/indexing.py b/xray/core/indexing.py index 74019d4df45..72d381ad389 100644 --- a/xray/core/indexing.py +++ b/xray/core/indexing.py @@ -177,6 +177,47 @@ def _index_indexer_1d(old_indexer, applied_indexer, size): return indexer +class LazyIntegerRange(utils.NDArrayMixin): + + def __init__(self, *args, **kwdargs): + """ + Parameters + ---------- + See np.arange + """ + self.args = args + self.kwdargs = kwdargs + assert 'dtype' not in self.kwdargs + # range will fail if any arguments are not integers + self.array = range(*args, **kwdargs) + + @property + def shape(self): + return (len(self.array),) + + @property + def dtype(self): + return np.dtype('int64') + + @property + def ndim(self): + return 1 + + @property + def size(self): + return len(self.array) + + def __getitem__(self, key): + return np.array(self)[key] + + def __array__(self, dtype=None): + return np.arange(*self.args, **self.kwdargs) + + def __repr__(self): + return ('%s(array=%r)' % + (type(self).__name__, self.array)) + + class LazilyIndexedArray(utils.NDArrayMixin): """Wrap an array that handles orthogonal indexing to make indexing lazy """ diff --git a/xray/test/test_backends.py b/xray/test/test_backends.py index e242ab1a898..668ca8ecf91 100644 --- a/xray/test/test_backends.py +++ b/xray/test/test_backends.py @@ -42,6 +42,10 @@ def create_encoded_masked_and_scaled_data(): return Dataset({'x': ('t', [-1, -1, 0, 1, 2], attributes)}) +class CastUnicodeToBytes(object): + pass + + class DatasetIOTestCases(object): def create_store(self): raise NotImplementedError @@ -120,21 +124,66 @@ def test_roundtrip_object_dtype(self): # see the note under test_zero_dimensional_variable del original['nan'] expected = original.copy(deep=True) - expected['letters_nans'][-1] = '' - if type(self) is not NetCDF4DataTest: + if type(self) in [NetCDF3ViaNetCDF4DataTest, ScipyDataTest]: # for netCDF3 tests, expect the results to come back as characters expected['letters_nans'] = expected['letters_nans'].astype('S') expected['letters'] = expected['letters'].astype('S') with self.roundtrip(original) as actual: - self.assertDatasetIdentical(expected, actual) + try: + self.assertDatasetIdentical(expected, actual) + except: + # Most stores use '' for nans in strings, but some don't + # first try the ideal case (where the store returns exactly) + # the original Dataset), then try a more realistic case. 
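+                # (most on-disk formats have no way to represent NaN in
+                # a string-typed array, so '' is the expected stand-in)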
+ expected['letters_nans'][-1] = '' + self.assertDatasetIdentical(expected, actual) def test_roundtrip_string_data(self): expected = Dataset({'x': ('t', ['ab', 'cdef'])}) with self.roundtrip(expected) as actual: - if type(self) is not NetCDF4DataTest: + if isinstance(self, CastUnicodeToBytes): expected['x'] = expected['x'].astype('S') self.assertDatasetIdentical(expected, actual) + def test_roundtrip_example_1_netcdf(self): + expected = open_example_dataset('example_1.nc') + with self.roundtrip(expected) as actual: + # we allow the attributes to differ since that + # will depend on the encoding used. For example, + # without CF encoding 'actual' will end up with + # a dtype attribute. + self.assertDatasetEqual(expected, actual) + + def test_roundtrip_example_1_netcdf_gz(self): + if sys.version_info[:2] < (2, 7): + with self.assertRaisesRegexp(ValueError, 'gzipped netCDF not supported'): + open_example_dataset('example_1.nc.gz') + else: + with open_example_dataset('example_1.nc.gz') as expected: + with open_example_dataset('example_1.nc') as actual: + self.assertDatasetIdentical(expected, actual) + + def test_orthogonal_indexing(self): + in_memory = create_test_data() + with self.roundtrip(in_memory) as on_disk: + indexers = {'dim1': np.arange(3), 'dim2': np.arange(4), + 'dim3': np.arange(5)} + expected = in_memory.isel(**indexers) + actual = on_disk.isel(**indexers) + self.assertDatasetAllClose(expected, actual) + # do it twice, to make sure we're switched from orthogonal -> numpy + # when we cached the values + actual = on_disk.isel(**indexers) + self.assertDatasetAllClose(expected, actual) + + def test_pickle(self): + on_disk = open_example_dataset('bears.nc') + unpickled = pickle.loads(pickle.dumps(on_disk)) + self.assertDatasetIdentical(on_disk, unpickled) + + +class CFEncodedDataTest(DatasetIOTestCases): + def test_roundtrip_strings_with_fill_value(self): values = np.array(['ab', 'cdef', np.nan], dtype=object) encoding = {'_FillValue': np.string_('X'), 'dtype': np.dtype('S1')} @@ -165,43 +214,17 @@ def test_roundtrip_mask_and_scale(self): self.assertDatasetAllClose(decoded, actual) with self.roundtrip(decoded, decode_cf=False) as actual: self.assertDatasetAllClose(encoded, actual) + with self.roundtrip(encoded, decode_cf=False) as actual: + self.assertDatasetAllClose(encoded, actual) + # make sure roundtrip encoding didn't change the + # original dataset. 
+ self.assertDatasetIdentical(encoded, + create_encoded_masked_and_scaled_data()) with self.roundtrip(encoded) as actual: self.assertDatasetAllClose(decoded, actual) with self.roundtrip(encoded, decode_cf=False) as actual: self.assertDatasetAllClose(encoded, actual) - def test_roundtrip_example_1_netcdf(self): - with open_example_dataset('example_1.nc') as expected: - with self.roundtrip(expected) as actual: - self.assertDatasetIdentical(expected, actual) - - def test_roundtrip_example_1_netcdf_gz(self): - if sys.version_info[:2] < (2, 7): - with self.assertRaisesRegexp(ValueError, 'gzipped netCDF not supported'): - open_example_dataset('example_1.nc.gz') - else: - with open_example_dataset('example_1.nc.gz') as expected: - with open_example_dataset('example_1.nc') as actual: - self.assertDatasetIdentical(expected, actual) - - def test_orthogonal_indexing(self): - in_memory = create_test_data() - with self.roundtrip(in_memory) as on_disk: - indexers = {'dim1': np.arange(3), 'dim2': np.arange(4), - 'dim3': np.arange(5)} - expected = in_memory.isel(**indexers) - actual = on_disk.isel(**indexers) - self.assertDatasetAllClose(expected, actual) - # do it twice, to make sure we're switched from orthogonal -> numpy - # when we cached the values - actual = on_disk.isel(**indexers) - self.assertDatasetAllClose(expected, actual) - - def test_pickle(self): - with open_example_dataset('bears.nc') as on_disk: - unpickled = pickle.loads(pickle.dumps(on_disk)) - self.assertDatasetIdentical(on_disk, unpickled) - @contextlib.contextmanager def create_tmp_file(suffix='.nc'): @@ -214,7 +237,7 @@ def create_tmp_file(suffix='.nc'): @requires_netCDF4 -class NetCDF4DataTest(DatasetIOTestCases, TestCase): +class NetCDF4DataTest(CFEncodedDataTest, TestCase): @contextlib.contextmanager def create_store(self): with create_tmp_file() as tmp_file: @@ -402,10 +425,56 @@ def test_default_to_char_arrays(self): self.assertDatasetIdentical(data, actual) self.assertEqual(actual['x'].dtype, np.dtype('S4')) + def test_open_encodings(self): + # Create a netCDF file with explicit time units + # and make sure it makes it into the encodings + # and survives a round trip + with create_tmp_file() as tmp_file: + with nc4.Dataset(tmp_file, 'w') as ds: + ds.createDimension('time', size=10) + ds.createVariable('time', np.int32, dimensions=('time',)) + units = 'days since 1999-01-01' + ds.variables['time'].setncattr('units', units) + ds.variables['time'][:] = np.arange(10) + 4 + + expected = Dataset() + + time = pd.date_range('1999-01-05', periods=10) + encoding = {'units': units, 'dtype': np.dtype('int32')} + expected['time'] = ('time', time, {}, encoding) + + actual = open_dataset(tmp_file) + + self.assertVariableEqual(actual['time'], expected['time']) + actual_encoding = {k: v for k, v in iteritems(actual['time'].encoding) + if k in expected['time'].encoding} + self.assertDictEqual(actual_encoding, expected['time'].encoding) + + def test_dump_and_open_encodings(self): + # Create a netCDF file with explicit time units + # and make sure it makes it into the encodings + # and survives a round trip + with create_tmp_file() as tmp_file: + with nc4.Dataset(tmp_file, 'w') as ds: + ds.createDimension('time', size=10) + ds.createVariable('time', np.int32, dimensions=('time',)) + units = 'days since 1999-01-01' + ds.variables['time'].setncattr('units', units) + ds.variables['time'][:] = np.arange(10) + 4 + + xray_dataset = open_dataset(tmp_file) + + with create_tmp_file() as tmp_file2: + xray_dataset.dump(tmp_file2) + + with 
nc4.Dataset(tmp_file2, 'r') as ds: + self.assertEqual(ds.variables['time'].getncattr('units'), units) + self.assertArrayEqual(ds.variables['time'], np.arange(10) + 4) + @requires_netCDF4 @requires_scipy -class ScipyDataTest(DatasetIOTestCases, TestCase): +class ScipyDataTest(CFEncodedDataTest, CastUnicodeToBytes, TestCase): @contextlib.contextmanager def create_store(self): fobj = BytesIO() @@ -419,7 +488,7 @@ def roundtrip(self, data, **kwargs): @requires_netCDF4 -class NetCDF3ViaNetCDF4DataTest(DatasetIOTestCases, TestCase): +class NetCDF3ViaNetCDF4DataTest(CFEncodedDataTest, CastUnicodeToBytes, TestCase): @contextlib.contextmanager def create_store(self): with create_tmp_file() as tmp_file: diff --git a/xray/test/test_conventions.py b/xray/test/test_conventions.py index 30db0f93553..0375508da7d 100644 --- a/xray/test/test_conventions.py +++ b/xray/test/test_conventions.py @@ -1,9 +1,13 @@ import numpy as np import pandas as pd import warnings +import contextlib -from xray import conventions, Variable +from xray import conventions, Variable, Dataset +from xray.core import utils, indexing from . import TestCase, requires_netCDF4 +from .test_backends import CFEncodedDataTest, DatasetIOTestCases +from xray.backends.memory import InMemoryDataStore class TestMaskedAndScaledArray(TestCase): @@ -275,3 +279,67 @@ def test_incompatible_attributes(self): for var in invalid_vars: with self.assertRaises(ValueError): conventions.encode_cf_variable(var) + + +@conventions.cf_encoded +class CFEncodedInMemoryStore(InMemoryDataStore): + pass + + +class TestCFEncodedDataStore(CFEncodedDataTest, TestCase): + @contextlib.contextmanager + def create_store(self): + yield CFEncodedInMemoryStore() + + @contextlib.contextmanager + def roundtrip(self, data, **kwargs): + store = CFEncodedInMemoryStore(**kwargs) + store.store(data) + yield Dataset.load_store(store, decoder=None) + + +class NullWrapper(utils.NDArrayMixin): + """ + Just for testing, this lets us create a numpy array directly + but make it look like its not in memory yet. + """ + def __init__(self, array): + self.array = array + + def __getitem__(self, key): + return self.array[indexing.orthogonal_indexer(key, self.shape)] + + +def lazy_identity(x): + """ + Given a data store this wraps each variable in a NullWrapper so that + it appears to be out of memory. + """ + variables = {k: Variable(v.dimensions, + NullWrapper(v.values), + v.attrs) for k, v in x.variables.items()} + return InMemoryDataStore({'variables': variables, + 'attributes': x.attrs}) + + +@conventions.encoding_decorator(lambda x: x, lazy_identity) +class IdentityEncodedInMemoryStore(InMemoryDataStore): + """ + This InMemoryStore does no encoding or decoding, other than + wrapping all variables in NullWrappers, which lets us + test the trivial case of encoding and decoding. 
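+
+    A sketch of the intended round trip (it mirrors the
+    EncodedDataTest.roundtrip helper below; `dataset` stands for any
+    xray.Dataset):
+
+        store = IdentityEncodedInMemoryStore()
+        store.store(dataset)
+        roundtripped = Dataset.load_store(store, decoder=None)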
+ """ + pass + + +class EncodedDataTest(DatasetIOTestCases, TestCase): + + @contextlib.contextmanager + def create_store(self): + yield IdentityEncodedInMemoryStore() + + @contextlib.contextmanager + def roundtrip(self, data, **kwargs): + store = IdentityEncodedInMemoryStore(**kwargs) + store.store(data) + yield Dataset.load_store(store, decoder=None) diff --git a/xray/test/test_dataset.py b/xray/test/test_dataset.py index 81b7766c377..2a15407cc76 100644 --- a/xray/test/test_dataset.py +++ b/xray/test/test_dataset.py @@ -47,22 +47,12 @@ def __getitem__(self, key): class InaccessibleVariableDataStore(backends.InMemoryDataStore): - def __init__(self): - self.dims = OrderedDict() - self._variables = OrderedDict() - self.attrs = OrderedDict() - - def set_variable(self, name, variable): - self._variables[name] = variable - return self._variables[name] - - def open_store_variable(self, var): - data = indexing.LazilyIndexedArray(InaccessibleArray(var.values)) - return Variable(var.dims, data, var.attrs) - - @property - def store_variables(self): - return self._variables + def get_variables(self): + def lazy_inaccessible(x): + data = indexing.LazilyIndexedArray(InaccessibleArray(x.values)) + return Variable(x.dimensions, data, x.attrs) + return dict((k, lazy_inaccessible(v)) for + k, v in iteritems(self.ds['variables'])) class TestDataset(TestCase): From c2e46d3b216d6da143c8f3c066e0ea000dff6ad8 Mon Sep 17 00:00:00 2001 From: Alex Kleeman Date: Wed, 8 Oct 2014 12:34:23 -0700 Subject: [PATCH 2/5] Removed the object oriented encoding/decoding scheme in favor of a model where encoding/decoding happens when a dataset is stored to/ loaded from a DataStore. Conventions can now be enforced at the DataStore level by overwritting the Datastore.store() and Datastore.load() methods, or as an optional arg to Dataset.load_store, Dataset.dump_to_store. Includes miscelanous cleanup. --- xray/backends/common.py | 215 ++++++++++------------------------ xray/backends/memory.py | 7 +- xray/backends/netCDF4_.py | 18 ++- xray/backends/scipy_.py | 19 ++- xray/conventions.py | 85 ++++++-------- xray/core/dataset.py | 64 +++++----- xray/test/test_backends.py | 16 ++- xray/test/test_conventions.py | 80 +++++++------ 8 files changed, 228 insertions(+), 276 deletions(-) diff --git a/xray/backends/common.py b/xray/backends/common.py index e20515a2383..2d804220ed8 100644 --- a/xray/backends/common.py +++ b/xray/backends/common.py @@ -1,7 +1,7 @@ import numpy as np -import inspect import itertools -import functools + +from collections import Mapping from ..core.utils import FrozenOrderedDict from ..core.pycompat import iteritems @@ -25,7 +25,7 @@ def _decode_variable_name(name): def is_trivial_index(var): """ - Determines if in index is 'trivial' mean that it is + Determines if in index is 'trivial' meaning that it is equivalent to np.arange(). This is determined by checking if there are any attributes or encodings, if ndims is one, dtype is int and finally by comparing @@ -45,7 +45,16 @@ def is_trivial_index(var): return True -class AbstractDataStore(object): +class AbstractDataStore(Mapping): + + def __iter__(self): + return iter(self.variables) + + def __getitem__(self, key): + return self.variables[key] + + def __len__(self): + return len(self.variables) def get_attrs(self): raise NotImplementedError @@ -53,18 +62,53 @@ def get_attrs(self): def get_variables(self): raise NotImplementedError + def load(self): + """ + This loads the variables and attributes simultaneously. 
+ A centralized loading function makes it easier to create + data stores that do automatic encoding/decoding. + + For example: + + class SuffixAppendingDataStore(AbstractDataStore): + + def load(self): + variables, attributes = AbstractDataStore.load(self) + variables = {'%s_suffix' % k: v + for k, v in iteritems(variables)} + attributes = {'%s_suffix' % k: v + for k, v in iteritems(attributes)} + return variables, attributes + + This function will be called anytime variables or attributes + are requested, so care should be taken to make sure its fast. + """ + variables = FrozenOrderedDict((_decode_variable_name(k), v) + for k, v in iteritems(self.get_variables())) + attributes = FrozenOrderedDict(self.get_attrs()) + return variables, attributes + def get_dimensions(self): - return list(itertools.chain(*[x.dimensions - for x in self.get_variables().values()])) + return list(itertools.chain(*[x.dims + for x in self.variables.values()])) @property def variables(self): - return FrozenOrderedDict((_decode_variable_name(k), v) - for k, v in iteritems(self.get_variables())) + # Because encoding/decoding might happen which may require both the + # attributes and the variables, and because a store may be updated + # we need to load both the attributes and variables + # anytime either one is requested. + variables, _ = self.load() + return variables @property def attrs(self): - return FrozenOrderedDict(self.get_attrs()) + # Because encoding/decoding might happen which may require both the + # attributes and the variables, and because a store may be updated + # we need to load both the attributes and variables + # anytime either one is requested. + _, attributes = self.load() + return attributes @property def dimensions(self): @@ -94,16 +138,22 @@ def set_variable(self, k, v): def sync(self): pass - def store(self, dataset): - self.set_attributes(dataset.attrs) - neccesary_dims = [[d for d in v.dimensions] - for v in dataset.variables.values()] + def store_dataset(self, dataset): + # in stores variables are all variables AND coordinates + # in xray.Dataset variables are variables NOT coordinates, + # so here we pass the whole dataset in instead of doing + # dataset.variables + self.store(dataset, dataset.attrs) + + def store(self, variables, attributes): + self.set_attributes(attributes) + neccesary_dims = [[d for d in v.dims] + for v in variables.values()] neccesary_dims = set(itertools.chain(*neccesary_dims)) # set all non-indexes and any index which is not trivial. - variables = {k: v for k, v in iteritems(dataset.variables) + variables = {k: v for k, v in iteritems(variables) if not (k in neccesary_dims and is_trivial_index(v))} self.set_variables(variables) - #self.set_variables(dataset.variables) def set_dimensions(self, dimensions): for d, l in iteritems(dimensions): @@ -122,138 +172,3 @@ def set_necessary_dimensions(self, variable): for d, l in zip(variable.dims, variable.shape): if d not in self.dimensions: self.set_dimension(d, l) - - -class AbstractEncodedDataStore(AbstractWritableDataStore): - """ - AbstractEncodedDataStore is an interface for making a - DataStore which wraps another DataStore while first passing - all input/output through an encoding/decoding layer. - This allows more modular application of things such as - conforming to CF Conventions. - - There are no explicity restrictions requiring an - EncodedDataStore to be roundtrip-able, but when this is desired - (probably often) consider passing implementing - classes through test_backends:DatasetIOTestCases. 
- - Requires Implementation - -------- - encode : function(self, datastore) - - - decode : function(self, datastore) - - """ - def encode(self, datastore): - """ - A function which takes an un-encoded datastore and returns - a new DataStore (or Dataset) which has been encoded. Returning - an InMemoryDataStore for this is encouraged since it avoids - the xray consistency checks making it faster / more flexible. - - """ - raise NotImplementedError - - def decode(self, datastore): - """ - A function which takes an encoded datastore and returns - a new DataStore which has been decoded. Again consider - using an InMemoryDataStore, though returning a Dataset - will work perfectly fine in most situations. - - Also note that directly accessing variable data may cause - remote DataStores to be loaded into memory. - See conventions.decode_cf_variable for examples of wrapping - computations to make them lazy. - """ - raise NotImplementedError - - @property - def decoded(self): - if not hasattr(self, '_decoded'): - self._decoded = self.decode(self.ds) - return self._decoded - - def get_dimensions(self): - return self.decoded.dimensions - - def get_variables(self): - return self.decoded.variables - - def get_attrs(self): - return self.decoded.attrs - - def store(self, dataset): - self.ds.store(self.encode(dataset)) - self.ds.sync() - - def sync(self): - self.ds.sync() - - def close(self): - self.ds.close() - - -def encoding_decorator(encoder, decoder): - """ - This is a Class decorating function which makes wrapping DataStores - in additional encoding layers easier. - - Note that often times the encoders and decoders will require arguments - at class creation time. To handle this, the encoder and decoder args - are first inspected. Any arguments they require are used first, and - any remaining arguments are passed onto the DataStore being wrapped. - - Parameters - ---------- - encoder : function - Takes a Datastore (or Dataset) and returns an encoded Datastore. - decoder : function - Takes a Datastore (or Dataset) and returns a decoded Datastore. - - Returns - ------- - class_wrapper: A function which wraps a DataStore class and turns - it into an EncodingWrappedDataStore. - """ - - def class_wrapper(cls): - class EncodingWrappedDataStore(AbstractEncodedDataStore): - - def __init__(self, *args, **kwdargs): - # NOTE: we assume that any arguments for the encoder - # and decoder are keyword args. All position arguments - # are passed on to the DataStore. - encoder_argnames = set(inspect.getargspec(encoder).args[1:]) - decoder_argnames = set(inspect.getargspec(decoder).args[1:]) - # make sure there aren't any argument collisions, that would - # get pretty confusing. - constructor_args = set(inspect.getargspec(cls.__init__)[1:]) - if constructor_args.intersection(encoder_argnames): - bad_args = constructor_args.intersection(encoder_argnames) - raise ValueError("encoder and class have overlapping args: %s" - % ', '.join(bad_args)) - if constructor_args.intersection(decoder_argnames): - bad_args = constructor_args.intersection(decoder_argnames) - raise ValueError("decoder and class have overlapping args: %s" - % ', '.join(bad_args)) - # create a set of keyword arguments for both the encoder and decoder - encoder_args = {} - decoder_args = {} - for k in encoder_argnames.union(decoder_argnames): - if k in kwdargs: - v = kwdargs.pop(k) - if k in encoder_argnames: - encoder_args[k] = v - if k in decoder_argnames: - decoder_args[k] = v - # create the data store. 
- self.ds = cls(*args, **kwdargs) - # set the encode and decode function using the provided args - self.encode = functools.partial(encoder, **encoder_args) - self.decode = functools.partial(decoder, **decoder_args) - - return EncodingWrappedDataStore - - return class_wrapper diff --git a/xray/backends/memory.py b/xray/backends/memory.py index 95f416d4d3b..ff5258c7e63 100644 --- a/xray/backends/memory.py +++ b/xray/backends/memory.py @@ -26,11 +26,16 @@ def get_variables(self): def set_variable(self, k, v): new_var = copy.deepcopy(v) # we copy the variable and stuff all encodings in the - # attributes to imitate what happens when writting to disk. + # attributes to imitate what happens when writing to disk. new_var.attrs.update(new_var.encoding) new_var.encoding.clear() + print self.ds['variables'].keys() self.ds['variables'][k] = new_var def set_attribute(self, k, v): # copy to imitate writing to disk. self.ds['attributes'][k] = copy.deepcopy(v) + + def set_dimension(self, d, l): + # in this model, dimensions are accounted for in the variables + pass \ No newline at end of file diff --git a/xray/backends/netCDF4_.py b/xray/backends/netCDF4_.py index 782b54ca76b..1094c142039 100644 --- a/xray/backends/netCDF4_.py +++ b/xray/backends/netCDF4_.py @@ -3,7 +3,7 @@ import numpy as np from .. import Variable -from ..conventions import encode_cf_variable, pop_to, cf_encoded +from ..conventions import pop_to, cf_encoder from ..core import indexing from ..core.utils import FrozenOrderedDict, NDArrayMixin from ..core.pycompat import iteritems, basestring, OrderedDict @@ -80,14 +80,14 @@ def _ensure_fill_value_valid(data, attributes): attributes['_FillValue'] = np.string_(attributes['_FillValue']) -@cf_encoded class NetCDF4DataStore(AbstractWritableDataStore): """Store for reading and writing data via the Python-NetCDF4 library. This store supports NetCDF3, NetCDF4 and OpenDAP datasets. """ def __init__(self, filename, mode='r', clobber=True, diskless=False, - persist=False, format='NETCDF4', group=None): + persist=False, format='NETCDF4', group=None, + *args, **kwdargs): import netCDF4 as nc4 ds = nc4.Dataset(filename, mode=mode, clobber=clobber, diskless=diskless, persist=persist, @@ -95,6 +95,16 @@ def __init__(self, filename, mode='r', clobber=True, diskless=False, self.ds = _nc4_group(ds, group) self.format = format self._filename = filename + self._encoder_args = args + self._encoder_kwdargs = kwdargs + + def store(self, variables, attributes): + # All NetCDF files get CF encoded by default, without this attempting + # to write times, for example, would fail. + cf_variables, cf_attrs = cf_encoder(variables, attributes, + *self._encoder_args, + **self._encoder_kwdargs) + AbstractWritableDataStore.store(self, cf_variables, cf_attrs) def open_store_variable(self, var): var.set_auto_maskandscale(False) @@ -119,8 +129,6 @@ def open_store_variable(self, var): # TODO: figure out how to round-trip "endian-ness" without raising # warnings from netCDF4 # encoding['endian'] = var.endian() - # encoding['least_significant_digit'] = \ - # attributes.pop('least_significant_digit', None) pop_to(attributes, encoding, 'least_significant_digit') # save source so __repr__ can detect if it's local or not encoding['source'] = self._filename diff --git a/xray/backends/scipy_.py b/xray/backends/scipy_.py index 20d277881fe..25a45374a16 100644 --- a/xray/backends/scipy_.py +++ b/xray/backends/scipy_.py @@ -3,12 +3,14 @@ import numpy as np import warnings -from .. import conventions, Variable +from .. 
import Variable +from ..conventions import cf_encoder from ..core.pycompat import iteritems, basestring, unicode_type, OrderedDict from ..core.utils import Frozen, FrozenOrderedDict from .common import AbstractWritableDataStore from .netcdf3 import is_valid_nc3_name, coerce_nc3_dtype, encode_nc3_variable +from xray.conventions import cf_decoder def _decode_string(s): @@ -23,7 +25,7 @@ def _decode_attrs(d): return OrderedDict((k, v if k == '_FillValue' else _decode_string(v)) for (k, v) in iteritems(d)) -@conventions.cf_encoded + class ScipyDataStore(AbstractWritableDataStore): """Store for reading and writing data via scipy.io.netcdf. @@ -32,7 +34,8 @@ class ScipyDataStore(AbstractWritableDataStore): It only supports the NetCDF3 file-format. """ - def __init__(self, filename_or_obj, mode='r', mmap=None, version=1): + def __init__(self, filename_or_obj, mode='r', mmap=None, + version=1, *args, **kwdargs): import scipy if mode != 'r' and scipy.__version__ < '0.13': warnings.warn('scipy %s detected; ' @@ -50,6 +53,16 @@ def __init__(self, filename_or_obj, mode='r', mmap=None, version=1): filename_or_obj = BytesIO(filename_or_obj) self.ds = scipy.io.netcdf.netcdf_file( filename_or_obj, mode=mode, mmap=mmap, version=version) + self._encoder_args = args + self._encoder_kwdargs = kwdargs + + def store(self, variables, attributes): + # All Scipy objects get CF encoded by default, without this attempting + # to write times, for example, would fail. + cf_variables, cf_attrs = cf_encoder(variables, attributes, + *self._encoder_args, + **self._encoder_kwdargs) + AbstractWritableDataStore.store(self, cf_variables, cf_attrs) def open_store_variable(self, var): return Variable(var.dimensions, var.data, diff --git a/xray/conventions.py b/xray/conventions.py index 26e49797f24..c30f58d8124 100644 --- a/xray/conventions.py +++ b/xray/conventions.py @@ -8,9 +8,6 @@ from .core.variable import as_variable, Variable from .core.pycompat import iteritems, bytes_type, unicode_type, OrderedDict -from .backends.common import encoding_decorator -from .backends import InMemoryDataStore - # standard calendars recognized by netcdftime _STANDARD_CALENDARS = set(['standard', 'gregorian', 'proleptic_gregorian']) @@ -603,18 +600,20 @@ def stackable(dim): return new_vars -def cf_decoder(ds, concat_characters=True, mask_and_scale=True, - decode_times=True, decode_cf=True): +def cf_decoder(variables, attributes, + concat_characters=True, mask_and_scale=True, + decode_times=True): """ - Decode a data store or Dataset which holds CF encoded variables. + Decode a set of CF encoded variables and attributes. See Also, decode_cf_variable Parameters ---------- - ds : Datastore - This can technically be any object with properties 'variables' - and 'attrs' and whose constructor follows type(ds)(variables, attrs) + variables : dict + A dictionary mapping from variable name to xray.Variable + attributes : dict + A dictionary mapping from attribute name to value concat_characters : bool Should character arrays be concatenated to strings, for example: ['h', 'e', 'l', 'l', 'o'] -> 'hello' @@ -626,54 +625,46 @@ def cf_decoder(ds, concat_characters=True, mask_and_scale=True, decode_cf : bool If false this skips decoding. This is around for backward compatibility. 
+
     Returns
     -------
-    ds : DataStore
-        A DataStore holding the decoded variables and attributes
+    decoded_variables : dict
+        A dictionary mapping from variable name to xray.Variable,
+    decoded_attributes : dict
+        A dictionary mapping from attribute name to value
     """
-    # if decode_cf is false, we do nothing.
-    if not decode_cf:
-        return ds
-    new_vars = decode_cf_variables(ds.variables, concat_characters,
+    new_vars = decode_cf_variables(variables, concat_characters,
                                    mask_and_scale, decode_times)
-    # Note that we don't return a Dataset because in some (though
-    # very few) cases the backend.stores are more flexible than
-    # xray. For example a string Index which gets expanded to
-    # a character array during CF encoding would result in the
-    # requirement for multidimensional indexes which xray does
-    # not currently support. Instead we store the variables as
-    # a dictionary of variables and attributes in an in-memory store.
-    return InMemoryDataStore({'variables': new_vars,
-                              'attributes': ds.attrs})
+    return new_vars, attributes
 
 
-def cf_encoder(ds, encode_cf=True):
+def cf_encoder(variables, attributes):
     """
-    A function which takes a DataStore (ds) and encodes its
-    variables and attributes to conform to CF conventions as much
+    A function which takes dicts of variables and attributes
+    and encodes them to conform to CF conventions as much
     as possible. This includes masking, scaling, character
     array handling, and CF-time encoding.
 
-    See also: encode_cf_variable
-    """
-    if not encode_cf:
-        return ds
-    new_vars = OrderedDict((k, encode_cf_variable(v))
-                           for k, v in iteritems(ds.variables))
-    return InMemoryDataStore({'variables': new_vars,
-                              'attributes': ds.attrs})
-
-
-def cf_encoded(*args, **kwdargs):
-    """
-    This class decorator can be used to turn a DataStore into a
-    CF encoded DataStore. For example, to take some DataStore
-    and add a CF encoding layer you can do this:
+    Parameters
+    ----------
+    variables : dict
+        A dictionary mapping from variable name to xray.Variable
+    attributes : dict
+        A dictionary mapping from attribute name to value
 
-    @cf_encoded
-    class CFPunchCardDataStore(PunchCardDataStore):
-        pass
+    Returns
+    -------
+    encoded_variables : dict
+        A dictionary mapping from variable name to xray.Variable,
+    encoded_attributes : dict
+        A dictionary mapping from attribute name to value
 
-    See also: encoding_decorator, cf_encoder, cf_decoder
+    See also: encode_cf_variable
     """
-    return encoding_decorator(cf_encoder, cf_decoder)(*args, **kwdargs)
+    new_vars = OrderedDict((k, encode_cf_variable(v))
+                           for k, v in iteritems(variables))
+    return new_vars, attributes
diff --git a/xray/core/dataset.py b/xray/core/dataset.py
index 79d0e8758ce..1e4c082b603 100644
--- a/xray/core/dataset.py
+++ b/xray/core/dataset.py
@@ -1,27 +1,27 @@
-from collections import Mapping
+import sys
+import gzip
+import warnings
 import functools
 from io import BytesIO
-import warnings
-import sys
+from collections import Mapping
 
 import numpy as np
 import pandas as pd
 
-from .. import backends, conventions
-from . import alignment
+from . import ops
+from . import utils
 from . import common
-from . import formatting
 from . import groupby
 from . import indexing
 from . import variable
-from . import utils
-from . import ops
+from . import alignment
+from . import formatting
+from ..
import backends, conventions from .coordinates import DatasetCoordinates, Indexes from .utils import (Frozen, SortedKeysDict, ChainMap, multi_index_from_product) from .pycompat import iteritems, itervalues, basestring, OrderedDict -import gzip def open_dataset(nc, decode_cf=True, mask_and_scale=True, decode_times=True, concat_characters=True, *args, **kwargs): @@ -63,20 +63,29 @@ def open_dataset(nc, decode_cf=True, mask_and_scale=True, decode_times=True, if isinstance(nc, basestring): # If the initialization nc is a string and if nc.endswith('.gz'): - # the name ends with .gz, then gunzip and open as netcdf file - # FIXME: does ScipyDataStore handle NetCDF4 files? - if sys.version_info[:2] < (2, 7): - raise ValueError('reading a gzipped netCDF not supported on Python 2.6') - store = backends.ScipyDataStore(gzip.open(nc), *args, **kwargs) + # if the string ends with .gz, then gunzip and open as netcdf file + if sys.version_info[:2] < (2, 7): + raise ValueError('reading a gzipped netCDF not ' + 'supported on Python 2.6') + try: + store = backends.ScipyDataStore(gzip.open(nc), *args, **kwargs) + except TypeError, e: + # TODO: gzipped loading only works with NetCDF3 files. + if 'is not a valid NetCDF 3 file' in e.message: + raise TypeError("xray: gzipped file loading only supports NetCDF 3 files.") + else: + raise e elif not nc.startswith('CDF'): - # it does not appear to be the contents of a netcdf file we load - # it using the netCDF4 package - store = backends.NetCDF4DataStore(nc, *args, **kwargs) + # nc does not appear to be a string holding the contents of a + # netcdf file so we treat it as a path and load it using the + # netCDF4 package + store = backends.NetCDF4DataStore(nc, *args, **kwargs) else: # If nc is a file-like object we read it using # the scipy.io.netcdf package store = backends.ScipyDataStore(nc, *args, **kwargs) - return Dataset.load_store(store, decode_cf=decode_cf, + decoder = conventions.cf_decoder if decode_cf else None + return Dataset.load_store(store, decoder=decoder, mask_and_scale=mask_and_scale, decode_times=decode_times, concat_characters=concat_characters) @@ -394,12 +403,11 @@ def load_store(cls, store, decoder=None, *args, **kwdargs): """Create a new dataset from the contents of a backends.*DataStore object """ + variables, attributes = store.load() if decoder: - # here the new 'store' name is a bit overloaded, it will - # typically actually be a Dataset, but still functions - # the way a store does. 
- store = decoder(store, *args, **kwdargs) - obj = cls(store.variables, attrs=store.attrs) + variables, attributes = decoder(variables, attributes, + *args, **kwdargs) + obj = cls(variables, attrs=attributes) obj._file_obj = store return obj @@ -777,12 +785,14 @@ def reset_coords(self, names=None, drop=False, inplace=False): del obj._arrays[name] return obj - def dump_to_store(self, store, encoder=None): + def dump_to_store(self, store, encoder=None, + *args, **kwdargs): """Store dataset contents to a backends.*DataStore object.""" - ds = self + variables, attributes = self, self.attrs if encoder: - ds = encoder(ds) - store.store(ds) + variables, attributes = encoder(variables, attributes, + *args, **kwdargs) + store.store(variables, attributes) store.sync() def to_netcdf(self, filepath, **kwdargs): diff --git a/xray/test/test_backends.py b/xray/test/test_backends.py index 668ca8ecf91..707339b92ec 100644 --- a/xray/test/test_backends.py +++ b/xray/test/test_backends.py @@ -1,3 +1,4 @@ +from xray.conventions import cf_decoder try: import cPickle as pickle except ImportError: @@ -63,14 +64,18 @@ def test_zero_dimensional_variable(self): {'units': 'units of awesome'}) with self.create_store() as store: expected.dump_to_store(store) - actual = Dataset.load_store(store) + # the test data contains times. In case the store + # cf_encodes them we need to cf_decode them. + actual = Dataset.load_store(store, cf_decoder) self.assertDatasetAllClose(expected, actual) def test_write_store(self): expected = create_test_data() with self.create_store() as store: expected.dump_to_store(store) - actual = Dataset.load_store(store) + # the test data contains times. In case the store + # cf_encodes them we need to cf_decode them. + actual = Dataset.load_store(store, cf_decoder) self.assertDatasetAllClose(expected, actual) def test_roundtrip_test_data(self): @@ -156,10 +161,11 @@ def test_roundtrip_example_1_netcdf(self): def test_roundtrip_example_1_netcdf_gz(self): if sys.version_info[:2] < (2, 7): - with self.assertRaisesRegexp(ValueError, 'gzipped netCDF not supported'): + with self.assertRaisesRegexp(ValueError, + 'gzipped netCDF not supported'): open_example_dataset('example_1.nc.gz') else: - with open_example_dataset('example_1.nc.gz') as expected: + with open_example_dataset('example_1.nc.gz') as expected: with open_example_dataset('example_1.nc') as actual: self.assertDatasetIdentical(expected, actual) @@ -213,6 +219,8 @@ def test_roundtrip_mask_and_scale(self): with self.roundtrip(decoded) as actual: self.assertDatasetAllClose(decoded, actual) with self.roundtrip(decoded, decode_cf=False) as actual: + # TODO: this assumes that all roundtrips will first + # encode. Is that something we want to test for? self.assertDatasetAllClose(encoded, actual) with self.roundtrip(encoded, decode_cf=False) as actual: self.assertDatasetAllClose(encoded, actual) diff --git a/xray/test/test_conventions.py b/xray/test/test_conventions.py index 0375508da7d..495b8582000 100644 --- a/xray/test/test_conventions.py +++ b/xray/test/test_conventions.py @@ -6,8 +6,10 @@ from xray import conventions, Variable, Dataset from xray.core import utils, indexing from . 
import TestCase, requires_netCDF4 -from .test_backends import CFEncodedDataTest, DatasetIOTestCases +from .test_backends import CFEncodedDataTest +from xray.core.pycompat import iteritems from xray.backends.memory import InMemoryDataStore +from xray.conventions import cf_encoder, cf_decoder class TestMaskedAndScaledArray(TestCase): @@ -26,7 +28,8 @@ def test(self): x = conventions.MaskedAndScaledArray(np.arange(3), scale_factor=2) self.assertArrayEqual(2 * np.arange(3), x) - x = conventions.MaskedAndScaledArray(np.array([-99, -1, 0, 1, 2]), -99, 0.01, 1) + x = conventions.MaskedAndScaledArray(np.array([-99, -1, 0, 1, 2]), + -99, 0.01, 1) expected = np.array([np.nan, 0.99, 1, 1.01, 1.02]) self.assertArrayEqual(expected, x) @@ -91,8 +94,10 @@ def test_cf_datetime(self): expected = nc4.num2date(num_dates, units, calendar) print(num_dates, units, calendar) with warnings.catch_warnings(): - warnings.filterwarnings('ignore', 'Unable to decode time axis') - actual = conventions.decode_cf_datetime(num_dates, units, calendar) + warnings.filterwarnings('ignore', + 'Unable to decode time axis') + actual = conventions.decode_cf_datetime(num_dates, units, + calendar) if (isinstance(actual, np.ndarray) and np.issubdtype(actual.dtype, np.datetime64)): self.assertEqual(actual.dtype, np.dtype('M8[ns]')) @@ -104,7 +109,8 @@ def test_cf_datetime(self): else: actual_cmp = actual self.assertArrayEqual(expected, actual_cmp) - encoded, _, _ = conventions.encode_cf_datetime(actual, units, calendar) + encoded, _, _ = conventions.encode_cf_datetime(actual, units, + calendar) self.assertArrayEqual(num_dates, np.around(encoded)) if (hasattr(num_dates, 'ndim') and num_dates.ndim == 1 and '1000' not in units): @@ -169,7 +175,8 @@ def test_decode_non_standard_calendar_single_element(self): '366_day']: for num_time in [735368, [735368], [[735368]]]: with warnings.catch_warnings(): - warnings.filterwarnings('ignore', 'Unable to decode time axis') + warnings.filterwarnings('ignore', + 'Unable to decode time axis') actual = conventions.decode_cf_datetime(num_time, units, calendar=calendar) self.assertEqual(actual.dtype, np.dtype('M8[ns]')) @@ -223,7 +230,8 @@ def test_decode_non_standard_calendar_multidim_time(self): @requires_netCDF4 def test_decode_non_standard_calendar_fallback(self): import netCDF4 as nc4 - for year in [2010, 2011, 2012, 2013, 2014]: # insure leap year doesn't matter + # ensure leap year doesn't matter + for year in [2010, 2011, 2012, 2013, 2014]: for calendar in ['360_day', '366_day', 'all_leap']: calendar = '360_day' units = 'days since {0}-01-01'.format(year) @@ -281,21 +289,25 @@ def test_incompatible_attributes(self): conventions.encode_cf_variable(var) -@conventions.cf_encoded class CFEncodedInMemoryStore(InMemoryDataStore): - pass + def __init__(self, *args, **kwdargs): + InMemoryDataStore.__init__(self, dict_store=None) + self._args = args + self._kwdargs = kwdargs -class TestCFEncodedDataStore(CFEncodedDataTest, TestCase): - @contextlib.contextmanager - def create_store(self): - yield CFEncodedInMemoryStore() + def store(self, variables, attributes): + variables, attributes = cf_encoder(variables, attributes) + InMemoryDataStore.store(self, variables, attributes) - @contextlib.contextmanager - def roundtrip(self, data, **kwargs): - store = CFEncodedInMemoryStore(**kwargs) - store.store(data) - yield Dataset.load_store(store, decoder=None) + def load(self): + variables, attributes = InMemoryDataStore.load(self) + if self._kwdargs.get('decode_cf', True): + kwd_args = self._kwdargs.copy() + 
kwd_args.pop('decode_cf', None) + variables, attributes = cf_decoder(variables, attributes, + *self._args, **kwd_args) + return variables, attributes class NullWrapper(utils.NDArrayMixin): @@ -310,36 +322,26 @@ def __getitem__(self, key): return self.array[indexing.orthogonal_indexer(key, self.shape)] -def lazy_identity(x): +def null_wrap(ds): """ Given a data store this wraps each variable in a NullWrapper so that it appears to be out of memory. """ - variables = {k: Variable(v.dimensions, + variables = {k: Variable(v.dims, NullWrapper(v.values), - v.attrs) for k, v in x.variables.items()} + v.attrs) for k, v in iteritems(ds)} return InMemoryDataStore({'variables': variables, - 'attributes': x.attrs}) - - -@conventions.encoding_decorator(lambda x: x, lazy_identity) -class IdentityEncodedInMemoryStore(InMemoryDataStore): - """ - This InMemoryStore does no encoding or decoding, other than - wrapping all variables in NullWrappers, which lets us - test the trivial case of encoding and decoding. - """ - pass + 'attributes': ds.attrs}) -class EncodedDataTest(DatasetIOTestCases, TestCase): - +class TestCFEncodedDataStore(CFEncodedDataTest, TestCase): @contextlib.contextmanager def create_store(self): - yield IdentityEncodedInMemoryStore() + yield CFEncodedInMemoryStore() @contextlib.contextmanager - def roundtrip(self, data, **kwargs): - store = IdentityEncodedInMemoryStore(**kwargs) - store.store(data) - yield Dataset.load_store(store, decoder=None) + def roundtrip(self, data, **kwdargs): + store = CFEncodedInMemoryStore(**kwdargs) + data.dump_to_store(store) + store.store_dataset(data) + yield Dataset.load_store(null_wrap(store)) \ No newline at end of file From 04e019521dc838d8cc4b47718933599fe1d31920 Mon Sep 17 00:00:00 2001 From: Alex Kleeman Date: Thu, 9 Oct 2014 10:58:16 -0700 Subject: [PATCH 3/5] Responded to shoyer's comments --- xray/backends/common.py | 3 +-- xray/backends/memory.py | 27 +++++++++++++++------------ xray/core/dataset.py | 4 ++-- xray/test/test_backends.py | 4 +++- xray/test/test_conventions.py | 6 +++--- xray/test/test_dataset.py | 2 +- 6 files changed, 25 insertions(+), 21 deletions(-) diff --git a/xray/backends/common.py b/xray/backends/common.py index 2d804220ed8..1f00ed16128 100644 --- a/xray/backends/common.py +++ b/xray/backends/common.py @@ -147,8 +147,7 @@ def store_dataset(self, dataset): def store(self, variables, attributes): self.set_attributes(attributes) - neccesary_dims = [[d for d in v.dims] - for v in variables.values()] + neccesary_dims = [v.dims for v in variables.values()] neccesary_dims = set(itertools.chain(*neccesary_dims)) # set all non-indexes and any index which is not trivial. variables = {k: v for k, v in iteritems(variables) diff --git a/xray/backends/memory.py b/xray/backends/memory.py index ff5258c7e63..0878ec8438e 100644 --- a/xray/backends/memory.py +++ b/xray/backends/memory.py @@ -10,18 +10,22 @@ class InMemoryDataStore(AbstractWritableDataStore): in ordered dictionaries, making this store fast compared to stores which save to disk. 
""" - def __init__(self, dict_store=None): - if dict_store is None: - dict_store = {} - dict_store['variables'] = OrderedDict() - dict_store['attributes'] = OrderedDict() - self.ds = dict_store + def __init__(self, variables=None, attributes=None): + if variables is None: + self._variables = OrderedDict() + else: + self._variables = variables + + if attributes is None: + self._attributes = OrderedDict() + else: + self._attributes = attributes def get_attrs(self): - return self.ds['attributes'] + return self._attributes def get_variables(self): - return self.ds['variables'] + return self._variables def set_variable(self, k, v): new_var = copy.deepcopy(v) @@ -29,13 +33,12 @@ def set_variable(self, k, v): # attributes to imitate what happens when writing to disk. new_var.attrs.update(new_var.encoding) new_var.encoding.clear() - print self.ds['variables'].keys() - self.ds['variables'][k] = new_var + self._variables[k] = new_var def set_attribute(self, k, v): # copy to imitate writing to disk. - self.ds['attributes'][k] = copy.deepcopy(v) + self._attributes[k] = copy.deepcopy(v) def set_dimension(self, d, l): # in this model, dimensions are accounted for in the variables - pass \ No newline at end of file + pass diff --git a/xray/core/dataset.py b/xray/core/dataset.py index 1e4c082b603..dca23b9051f 100644 --- a/xray/core/dataset.py +++ b/xray/core/dataset.py @@ -69,10 +69,10 @@ def open_dataset(nc, decode_cf=True, mask_and_scale=True, decode_times=True, 'supported on Python 2.6') try: store = backends.ScipyDataStore(gzip.open(nc), *args, **kwargs) - except TypeError, e: + except TypeError as e: # TODO: gzipped loading only works with NetCDF3 files. if 'is not a valid NetCDF 3 file' in e.message: - raise TypeError("xray: gzipped file loading only supports NetCDF 3 files.") + raise ValueError("xray: gzipped file loading only supports NetCDF 3 files.") else: raise e elif not nc.startswith('CDF'): diff --git a/xray/test/test_backends.py b/xray/test/test_backends.py index 707339b92ec..ace8834c760 100644 --- a/xray/test/test_backends.py +++ b/xray/test/test_backends.py @@ -136,10 +136,12 @@ def test_roundtrip_object_dtype(self): with self.roundtrip(original) as actual: try: self.assertDatasetIdentical(expected, actual) - except: + except AssertionError: # Most stores use '' for nans in strings, but some don't # first try the ideal case (where the store returns exactly) # the original Dataset), then try a more realistic case. + # ScipyDataTest, NetCDF3ViaNetCDF4DataTest and NetCDF4DataTest + # all end up using this case. 
expected['letters_nans'][-1] = '' self.assertDatasetIdentical(expected, actual) diff --git a/xray/test/test_conventions.py b/xray/test/test_conventions.py index 495b8582000..bbe76d12242 100644 --- a/xray/test/test_conventions.py +++ b/xray/test/test_conventions.py @@ -292,7 +292,7 @@ def test_incompatible_attributes(self): class CFEncodedInMemoryStore(InMemoryDataStore): def __init__(self, *args, **kwdargs): - InMemoryDataStore.__init__(self, dict_store=None) + InMemoryDataStore.__init__(self) self._args = args self._kwdargs = kwdargs @@ -330,8 +330,8 @@ def null_wrap(ds): variables = {k: Variable(v.dims, NullWrapper(v.values), v.attrs) for k, v in iteritems(ds)} - return InMemoryDataStore({'variables': variables, - 'attributes': ds.attrs}) + return InMemoryDataStore(variables=variables, + attributes=ds.attrs) class TestCFEncodedDataStore(CFEncodedDataTest, TestCase): diff --git a/xray/test/test_dataset.py b/xray/test/test_dataset.py index 2a15407cc76..94f1948584c 100644 --- a/xray/test/test_dataset.py +++ b/xray/test/test_dataset.py @@ -52,7 +52,7 @@ def lazy_inaccessible(x): data = indexing.LazilyIndexedArray(InaccessibleArray(x.values)) return Variable(x.dimensions, data, x.attrs) return dict((k, lazy_inaccessible(v)) for - k, v in iteritems(self.ds['variables'])) + k, v in iteritems(self._variables)) class TestDataset(TestCase): From e9d4426dc4fe83446c3481e081dcd7bc25e4f8a3 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Fri, 10 Oct 2014 20:53:32 -0700 Subject: [PATCH 4/5] abbreviate InMemoryDataStore.__init__ --- xray/backends/memory.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/xray/backends/memory.py b/xray/backends/memory.py index 0878ec8438e..5cb7c3c5566 100644 --- a/xray/backends/memory.py +++ b/xray/backends/memory.py @@ -11,15 +11,8 @@ class InMemoryDataStore(AbstractWritableDataStore): fast compared to stores which save to disk. """ def __init__(self, variables=None, attributes=None): - if variables is None: - self._variables = OrderedDict() - else: - self._variables = variables - - if attributes is None: - self._attributes = OrderedDict() - else: - self._attributes = attributes + self._variables = OrderedDict() if variables is None else variables + self._attributes = OrderedDict() if attributes is None else attributes def get_attrs(self): return self._attributes From ad7df2b257b1dc6e391d01c2e04af2e1e97eb797 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Fri, 10 Oct 2014 22:25:56 -0700 Subject: [PATCH 5/5] Fix failing tests without deps or not on Python 2.7 --- xray/backends/common.py | 4 ++-- xray/test/test_backends.py | 18 ++++++++++-------- xray/test/test_conventions.py | 11 +++++------ xray/test/test_dataset.py | 2 +- 4 files changed, 18 insertions(+), 17 deletions(-) diff --git a/xray/backends/common.py b/xray/backends/common.py index 1f00ed16128..1c3be326bf0 100644 --- a/xray/backends/common.py +++ b/xray/backends/common.py @@ -150,8 +150,8 @@ def store(self, variables, attributes): neccesary_dims = [v.dims for v in variables.values()] neccesary_dims = set(itertools.chain(*neccesary_dims)) # set all non-indexes and any index which is not trivial. 
- variables = {k: v for k, v in iteritems(variables) - if not (k in neccesary_dims and is_trivial_index(v))} + variables = dict((k, v) for k, v in iteritems(variables) + if not (k in neccesary_dims and is_trivial_index(v))) self.set_variables(variables) def set_dimensions(self, dimensions): diff --git a/xray/test/test_backends.py b/xray/test/test_backends.py index ace8834c760..168e1978a4a 100644 --- a/xray/test/test_backends.py +++ b/xray/test/test_backends.py @@ -43,7 +43,7 @@ def create_encoded_masked_and_scaled_data(): return Dataset({'x': ('t', [-1, -1, 0, 1, 2], attributes)}) -class CastUnicodeToBytes(object): +class CastsUnicodeToBytes(object): pass @@ -148,7 +148,7 @@ def test_roundtrip_object_dtype(self): def test_roundtrip_string_data(self): expected = Dataset({'x': ('t', ['ab', 'cdef'])}) with self.roundtrip(expected) as actual: - if isinstance(self, CastUnicodeToBytes): + if isinstance(self, CastsUnicodeToBytes): expected['x'] = expected['x'].astype('S') self.assertDatasetIdentical(expected, actual) @@ -202,9 +202,10 @@ def test_roundtrip_strings_with_fill_value(self): self.assertDatasetIdentical(expected, actual) original = Dataset({'x': ('t', values, {}, {'_FillValue': '\x00'})}) - if type(self) is NetCDF4DataTest: - # NetCDF4 should still write a VLEN (unicode) string + if not isinstance(self, CastsUnicodeToBytes): + # these stores can save unicode strings expected = original.copy(deep=True) + if type(self) is NetCDF4DataTest: # the netCDF4 library can't keep track of an empty _FillValue for # VLEN variables: expected['x'][-1] = '' @@ -456,8 +457,9 @@ def test_open_encodings(self): actual = open_dataset(tmp_file) self.assertVariableEqual(actual['time'], expected['time']) - actual_encoding = {k: v for k, v in iteritems(actual['time'].encoding) - if k in expected['time'].encoding} + actual_encoding = dict((k, v) for k, v + in iteritems(actual['time'].encoding) + if k in expected['time'].encoding) self.assertDictEqual(actual_encoding, expected['time'].encoding) def test_dump_and_open_encodings(self): @@ -484,7 +486,7 @@ def test_dump_and_open_encodings(self): @requires_netCDF4 @requires_scipy -class ScipyDataTest(CFEncodedDataTest, CastUnicodeToBytes, TestCase): +class ScipyDataTest(CFEncodedDataTest, CastsUnicodeToBytes, TestCase): @contextlib.contextmanager def create_store(self): fobj = BytesIO() @@ -498,7 +500,7 @@ def roundtrip(self, data, **kwargs): @requires_netCDF4 -class NetCDF3ViaNetCDF4DataTest(CFEncodedDataTest, CastUnicodeToBytes, TestCase): +class NetCDF3ViaNetCDF4DataTest(CFEncodedDataTest, CastsUnicodeToBytes, TestCase): @contextlib.contextmanager def create_store(self): with create_tmp_file() as tmp_file: diff --git a/xray/test/test_conventions.py b/xray/test/test_conventions.py index bbe76d12242..3f6f7b52205 100644 --- a/xray/test/test_conventions.py +++ b/xray/test/test_conventions.py @@ -327,13 +327,12 @@ def null_wrap(ds): Given a data store this wraps each variable in a NullWrapper so that it appears to be out of memory. 
""" - variables = {k: Variable(v.dims, - NullWrapper(v.values), - v.attrs) for k, v in iteritems(ds)} - return InMemoryDataStore(variables=variables, - attributes=ds.attrs) + variables = dict((k, Variable(v.dims, NullWrapper(v.values), v.attrs)) + for k, v in iteritems(ds)) + return InMemoryDataStore(variables=variables, attributes=ds.attrs) +@requires_netCDF4 class TestCFEncodedDataStore(CFEncodedDataTest, TestCase): @contextlib.contextmanager def create_store(self): @@ -344,4 +343,4 @@ def roundtrip(self, data, **kwdargs): store = CFEncodedInMemoryStore(**kwdargs) data.dump_to_store(store) store.store_dataset(data) - yield Dataset.load_store(null_wrap(store)) \ No newline at end of file + yield Dataset.load_store(null_wrap(store)) diff --git a/xray/test/test_dataset.py b/xray/test/test_dataset.py index 94f1948584c..82f72adffbb 100644 --- a/xray/test/test_dataset.py +++ b/xray/test/test_dataset.py @@ -50,7 +50,7 @@ class InaccessibleVariableDataStore(backends.InMemoryDataStore): def get_variables(self): def lazy_inaccessible(x): data = indexing.LazilyIndexedArray(InaccessibleArray(x.values)) - return Variable(x.dimensions, data, x.attrs) + return Variable(x.dims, data, x.attrs) return dict((k, lazy_inaccessible(v)) for k, v in iteritems(self._variables))