diff --git a/test/test_backends.py b/test/test_backends.py
index efbf811c0be..a3ad74b67ab 100644
--- a/test/test_backends.py
+++ b/test/test_backends.py
@@ -121,13 +121,19 @@ def test_roundtrip_object_dtype(self):
         # see the note under test_zero_dimensional_variable
         del original['nan']
         expected = original.copy(deep=True)
-        expected['letters_nans'][-1] = ''
-        if type(self) is not NetCDF4DataTest:
+        if type(self) in [NetCDF3ViaNetCDF4DataTest, ScipyDataTest]:
             # for netCDF3 tests, expect the results to come back as characters
             expected['letters_nans'] = expected['letters_nans'].astype('S')
             expected['letters'] = expected['letters'].astype('S')
         with self.roundtrip(original) as actual:
-            self.assertDatasetIdentical(expected, actual)
+            try:
+                # first try the ideal case, where the store returns
+                # exactly the original Dataset
+                self.assertDatasetIdentical(expected, actual)
+            except AssertionError:
+                # most stores use '' for NaNs in strings, but some don't
+                expected['letters_nans'][-1] = ''
+                self.assertDatasetIdentical(expected, actual)

     def test_roundtrip_string_data(self):
         expected = Dataset({'x': ('t', ['ab', 'cdef'])})
@@ -136,6 +142,36 @@ def test_roundtrip_string_data(self):
                 expected['x'] = expected['x'].astype('S')
             self.assertDatasetIdentical(expected, actual)

+    def test_roundtrip_example_1_netcdf(self):
+        expected = open_example_dataset('example_1.nc')
+        with self.roundtrip(expected) as actual:
+            # we allow the attributes to differ since that
+            # will depend on the encoding used.  For example,
+            # without CF encoding 'actual' will end up with
+            # a dtype attribute.
+            self.assertDatasetEqual(expected, actual)
+
+    def test_orthogonal_indexing(self):
+        in_memory = create_test_data()
+        with self.roundtrip(in_memory) as on_disk:
+            indexers = {'dim1': np.arange(3), 'dim2': np.arange(4),
+                        'dim3': np.arange(5)}
+            expected = in_memory.isel(**indexers)
+            actual = on_disk.isel(**indexers)
+            self.assertDatasetAllClose(expected, actual)
+            # do it twice, to make sure we've switched from orthogonal ->
+            # numpy indexing once the values have been cached
+            actual = on_disk.isel(**indexers)
+            self.assertDatasetAllClose(expected, actual)
+
+    def test_pickle(self):
+        on_disk = open_example_dataset('bears.nc')
+        unpickled = pickle.loads(pickle.dumps(on_disk))
+        self.assertDatasetIdentical(on_disk, unpickled)
+
+
+class CFEncodedDataTest(DatasetIOTestCases):
+
     def test_roundtrip_strings_with_fill_value(self):
         values = np.array(['ab', 'cdef', np.nan], dtype=object)
         encoding = {'_FillValue': np.string_('X'), 'dtype': np.dtype('S1')}
@@ -166,34 +202,17 @@ def test_roundtrip_mask_and_scale(self):
             self.assertDatasetAllClose(decoded, actual)
         with self.roundtrip(decoded, decode_cf=False) as actual:
             self.assertDatasetAllClose(encoded, actual)
+        with self.roundtrip(encoded, decode_cf=False) as actual:
+            self.assertDatasetAllClose(encoded, actual)
+            # make sure the roundtrip encoding didn't change
+            # the original dataset
+            self.assertDatasetIdentical(encoded,
+                                        create_encoded_masked_and_scaled_data())
         with self.roundtrip(encoded) as actual:
             self.assertDatasetAllClose(decoded, actual)
         with self.roundtrip(encoded, decode_cf=False) as actual:
             self.assertDatasetAllClose(encoded, actual)

-    def test_roundtrip_example_1_netcdf(self):
-        expected = open_example_dataset('example_1.nc')
-        with self.roundtrip(expected) as actual:
-            self.assertDatasetIdentical(expected, actual)
-
-    def test_orthogonal_indexing(self):
-        in_memory = create_test_data()
-        with self.roundtrip(in_memory) as on_disk:
-            indexers = {'dim1': np.arange(3), 'dim2': np.arange(4),
-                        'dim3': np.arange(5)}
-            expected = in_memory.isel(**indexers)
-            actual = on_disk.isel(**indexers)
-            self.assertDatasetAllClose(expected, actual)
-            # do it twice, to make sure we're switched from orthogonal -> numpy
-            # when we cached the values
-            actual = on_disk.isel(**indexers)
-            self.assertDatasetAllClose(expected, actual)
-
-    def test_pickle(self):
-        on_disk = open_example_dataset('bears.nc')
-        unpickled = pickle.loads(pickle.dumps(on_disk))
-        self.assertDatasetIdentical(on_disk, unpickled)
-

 @contextlib.contextmanager
 def create_tmp_file(suffix='.nc'):
@@ -206,7 +225,7 @@ def create_tmp_file(suffix='.nc'):


 @requires_netCDF4
-class NetCDF4DataTest(DatasetIOTestCases, TestCase):
+class NetCDF4DataTest(CFEncodedDataTest, TestCase):
     @contextlib.contextmanager
     def create_store(self):
         with create_tmp_file() as tmp_file:
@@ -215,33 +234,12 @@ class NetCDF4DataTest(DatasetIOTestCases, TestCase):
     @contextlib.contextmanager
     def roundtrip(self, data, **kwargs):
         with create_tmp_file() as tmp_file:
-            data.dump(tmp_file)
-            yield open_dataset(tmp_file, **kwargs)
-
-    def test_open_encodings(self):
-        # Create a netCDF file with explicit time units
-        # and make sure it makes it into the encodings
-        # and survives a round trip
-        with create_tmp_file() as tmp_file:
-            with nc4.Dataset(tmp_file, 'w') as ds:
-                ds.createDimension('time', size=10)
-                ds.createVariable('time', np.int32, dimensions=('time',))
-                units = 'days since 1999-01-01'
-                ds.variables['time'].setncattr('units', units)
-                ds.variables['time'][:] = np.arange(10) + 4
-
-            expected = Dataset()
-
-            time = pd.date_range('1999-01-05', periods=10)
-            encoding = {'units': units, 'dtype': np.dtype('int32')}
-            expected['time'] = ('time', time, {}, encoding)
-
-            actual = open_dataset(tmp_file)
-
-            self.assertVariableEqual(actual['time'], expected['time'])
-            actual_encoding = {k: v for k, v in iteritems(actual['time'].encoding)
-                               if k in expected['time'].encoding}
-            self.assertDictEqual(actual_encoding, expected['time'].encoding)
+            with backends.NetCDF4DataStore(tmp_file,
+                                           mode='w', **kwargs) as store:
+                store.store(data)
+            with backends.NetCDF4DataStore(tmp_file,
+                                           mode='r', **kwargs) as store:
+                yield Dataset.load_store(store, decoder=None)

     def test_open_group(self):
         # Create a netCDF file with a dataset stored within a group
@@ -288,27 +286,6 @@ def test_open_subgroup(self):
             actual = open_dataset(tmp_file, group=group)
             self.assertVariableEqual(actual['x'], expected['x'])

-    def test_dump_and_open_encodings(self):
-        # Create a netCDF file with explicit time units
-        # and make sure it makes it into the encodings
-        # and survives a round trip
-        with create_tmp_file() as tmp_file:
-            with nc4.Dataset(tmp_file, 'w') as ds:
-                ds.createDimension('time', size=10)
-                ds.createVariable('time', np.int32, dimensions=('time',))
-                units = 'days since 1999-01-01'
-                ds.variables['time'].setncattr('units', units)
-                ds.variables['time'][:] = np.arange(10) + 4
-
-            xray_dataset = open_dataset(tmp_file)
-
-        with create_tmp_file() as tmp_file2:
-            xray_dataset.dump(tmp_file2)
-
-            with nc4.Dataset(tmp_file2, 'r') as ds:
-                self.assertEqual(ds.variables['time'].getncattr('units'), units)
-                self.assertArrayEqual(ds.variables['time'], np.arange(10) + 4)
-
     def test_compression_encoding(self):
         data = create_test_data()
         data['var2'].encoding.update({'zlib': True,
@@ -391,10 +368,56 @@ def test_roundtrip_character_array(self):
         with self.roundtrip(actual) as roundtripped:
             self.assertDatasetIdentical(expected, roundtripped)

+    def test_open_encodings(self):
+        # Create a netCDF file with explicit time units
+        # and make sure it makes it into the encodings
+        # and survives a round trip
+        with create_tmp_file() as tmp_file:
+            with nc4.Dataset(tmp_file, 'w') as ds:
+                ds.createDimension('time', size=10)
+                ds.createVariable('time', np.int32, dimensions=('time',))
+                units = 'days since 1999-01-01'
+                ds.variables['time'].setncattr('units', units)
+                ds.variables['time'][:] = np.arange(10) + 4
+
+            expected = Dataset()
+
+            time = pd.date_range('1999-01-05', periods=10)
+            encoding = {'units': units, 'dtype': np.dtype('int32')}
+            expected['time'] = ('time', time, {}, encoding)
+
+            actual = open_dataset(tmp_file)
+
+            self.assertVariableEqual(actual['time'], expected['time'])
+            actual_encoding = {k: v for k, v in iteritems(actual['time'].encoding)
+                               if k in expected['time'].encoding}
+            self.assertDictEqual(actual_encoding, expected['time'].encoding)
+
+    def test_dump_and_open_encodings(self):
+        # Create a netCDF file with explicit time units
+        # and make sure it makes it into the encodings
+        # and survives a round trip
+        with create_tmp_file() as tmp_file:
+            with nc4.Dataset(tmp_file, 'w') as ds:
+                ds.createDimension('time', size=10)
+                ds.createVariable('time', np.int32, dimensions=('time',))
+                units = 'days since 1999-01-01'
+                ds.variables['time'].setncattr('units', units)
+                ds.variables['time'][:] = np.arange(10) + 4
+
+            xray_dataset = open_dataset(tmp_file)
+
+            with create_tmp_file() as tmp_file2:
+                xray_dataset.dump(tmp_file2)
+
+                with nc4.Dataset(tmp_file2, 'r') as ds:
+                    self.assertEqual(ds.variables['time'].getncattr('units'),
+                                     units)
+                    self.assertArrayEqual(ds.variables['time'], np.arange(10) + 4)
+

 @requires_netCDF4
 @requires_scipy
-class ScipyDataTest(DatasetIOTestCases, TestCase):
+class ScipyDataTest(CFEncodedDataTest, TestCase):
     @contextlib.contextmanager
     def create_store(self):
         fobj = BytesIO()
@@ -402,12 +425,16 @@ class ScipyDataTest(DatasetIOTestCases, TestCase):
     @contextlib.contextmanager
     def roundtrip(self, data, **kwargs):
-        serialized = data.dumps()
-        yield open_dataset(BytesIO(serialized), **kwargs)
+        fobj = BytesIO()
+        with backends.ScipyDataStore(fobj, 'w', **kwargs) as store:
+            store.store(data)
+        serialized = fobj.getvalue()
+        with backends.ScipyDataStore(serialized, 'r', **kwargs) as store:
+            yield Dataset.load_store(store, decoder=None)


 @requires_netCDF4
-class NetCDF3ViaNetCDF4DataTest(DatasetIOTestCases, TestCase):
+class NetCDF3ViaNetCDF4DataTest(CFEncodedDataTest, TestCase):
     @contextlib.contextmanager
     def create_store(self):
         with create_tmp_file() as tmp_file:
@@ -417,8 +444,12 @@ class NetCDF3ViaNetCDF4DataTest(DatasetIOTestCases, TestCase):
     @contextlib.contextmanager
     def roundtrip(self, data, **kwargs):
         with create_tmp_file() as tmp_file:
-            data.dump(tmp_file, format='NETCDF3_CLASSIC')
-            yield open_dataset(tmp_file, **kwargs)
+            with backends.NetCDF4DataStore(tmp_file, mode='w',
+                                           format='NETCDF3_CLASSIC',
+                                           **kwargs) as store:
+                store.store(data)
+            with backends.NetCDF4DataStore(tmp_file,
+                                           mode='r', **kwargs) as store:
+                yield Dataset.load_store(store, decoder=None)


 @requires_netCDF4
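(For orientation, here is the round-trip pattern these test helpers now exercise, sketched with a hypothetical file name rather than the temp files the tests use. Stores persist a whole dataset via store(), and Dataset.load_store takes an optional decoder. Because NetCDF4DataStore is wrapped with @cf_encoded further down in this diff, decoding already happens inside the store and the tests can pass decoder=None; an undecorated store would pass conventions.cf_decoder instead.)

    from xray import Dataset, backends

    ds = Dataset({'x': ('t', [1, 2, 3])})
    with backends.NetCDF4DataStore('out.nc', mode='w') as store:
        store.store(ds)  # encoding happens inside the store
    with backends.NetCDF4DataStore('out.nc', mode='r') as store:
        # the @cf_encoded wrapper already decodes, so no decoder is needed
        roundtripped = Dataset.load_store(store, decoder=None)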
diff --git a/test/test_conventions.py b/test/test_conventions.py
index 30db0f93553..f3c849beb70 100644
--- a/test/test_conventions.py
+++ b/test/test_conventions.py
@@ -1,9 +1,12 @@
 import numpy as np
 import pandas as pd
 import warnings
+import contextlib

-from xray import conventions, Variable
+from xray import conventions, Variable, Dataset, utils, indexing
 from . import TestCase, requires_netCDF4
+from .test_backends import CFEncodedDataTest, DatasetIOTestCases
+from xray.backends.memory import InMemoryDataStore


 class TestMaskedAndScaledArray(TestCase):
@@ -275,3 +278,67 @@ def test_incompatible_attributes(self):
         for var in invalid_vars:
             with self.assertRaises(ValueError):
                 conventions.encode_cf_variable(var)
+
+
+@conventions.cf_encoded
+class CFEncodedInMemoryStore(InMemoryDataStore):
+    pass
+
+
+class TestCFEncodedDataStore(CFEncodedDataTest, TestCase):
+    @contextlib.contextmanager
+    def create_store(self):
+        yield CFEncodedInMemoryStore()
+
+    @contextlib.contextmanager
+    def roundtrip(self, data, **kwargs):
+        store = CFEncodedInMemoryStore(**kwargs)
+        store.store(data)
+        yield Dataset.load_store(store, decoder=None)
+
+
+class NullWrapper(utils.NDArrayMixin):
+    """
+    Just for testing, this lets us create a numpy array directly
+    but make it look like it's not in memory yet.
+    """
+    def __init__(self, array):
+        self.array = array
+
+    def __getitem__(self, key):
+        return self.array[indexing.orthogonal_indexer(key, self.shape)]
+
+
+def lazy_identity(x):
+    """
+    Given a data store, this wraps each variable in a NullWrapper
+    so that it appears to be out of memory.
+    """
+    variables = {k: Variable(v.dimensions,
+                             NullWrapper(v.values),
+                             v.attrs) for k, v in x.variables.iteritems()}
+    return InMemoryDataStore({'variables': variables,
+                              'attributes': x.attrs})
+
+
+@conventions.encoding_decorator(lambda x: x, lazy_identity)
+class IdentityEncodedInMemoryStore(InMemoryDataStore):
+    """
+    This InMemoryStore does no encoding or decoding, other than
+    wrapping all variables in NullWrappers, which lets us
+    test the trivial case of encoding and decoding.
+    """
+    pass
+
+
+class EncodedDataTest(DatasetIOTestCases, TestCase):
+
+    @contextlib.contextmanager
+    def create_store(self):
+        yield IdentityEncodedInMemoryStore()
+
+    @contextlib.contextmanager
+    def roundtrip(self, data, **kwargs):
+        store = IdentityEncodedInMemoryStore(**kwargs)
+        store.store(data)
+        yield Dataset.load_store(store, decoder=None)
diff --git a/test/test_dataset.py b/test/test_dataset.py
index 9f669134705..a79c0e152ec 100644
--- a/test/test_dataset.py
+++ b/test/test_dataset.py
@@ -51,22 +51,13 @@ def __getitem__(self, key):


 class InaccessibleVariableDataStore(backends.InMemoryDataStore):
-    def __init__(self):
-        self.dimensions = OrderedDict()
-        self._variables = OrderedDict()
-        self.attrs = OrderedDict()
-
-    def set_variable(self, name, variable):
-        self._variables[name] = variable
-        return self._variables[name]
-
-    def open_store_variable(self, var):
-        data = indexing.LazilyIndexedArray(InaccessibleArray(var.values))
-        return Variable(var.dimensions, data, var.attrs)
-
-    @property
-    def store_variables(self):
-        return self._variables
+
+    def get_variables(self):
+        def lazy_inaccessible(x):
+            data = indexing.LazilyIndexedArray(InaccessibleArray(x.values))
+            return Variable(x.dimensions, data, x.attrs)
+        return {k: lazy_inaccessible(v) for
+                k, v in self.ds['variables'].iteritems()}


 class TestDataset(TestCase):
diff --git a/xray/backends/common.py b/xray/backends/common.py
index 0d83b63ab66..1e235903caa 100644
--- a/xray/backends/common.py
+++ b/xray/backends/common.py
@@ -1,8 +1,11 @@
 import numpy as np
+import inspect
+import itertools
+import functools

 from xray.utils import FrozenOrderedDict
 from xray.pycompat import iteritems
-
+from xray.variable import Index

 NONE_VAR_NAME = '__values__'

@@ -19,22 +22,52 @@ def _decode_variable_name(name):
     return name


+def is_trivial_index(var):
+    """
+    Determines whether an index is 'trivial', meaning that it is
+    equivalent to np.arange().  This is determined by checking
+    that there are no attributes or encodings, that ndim is one
+    and the dtype is integer, and finally by comparing the actual
+    values to np.arange().
+    """
+    # if either attributes or encodings are defined,
+    # the index is not trivial
+    if len(var.attrs) or len(var.encoding):
+        return False
+    # if the index is not a 1d integer array
+    if var.ndim > 1 or not var.dtype.kind == 'i':
+        return False
+    if isinstance(var, Index):
+        arange = np.arange(var.size, dtype=var.dtype)
+        if np.any(var.values != arange):
+            return False
+    return True
+
+
 class AbstractDataStore(object):
-    def open_store_variable(self, v):
+
+    def get_attrs(self):
         raise NotImplementedError

-    @property
-    def store_variables(self):
-        return self.ds.variables
+    def get_variables(self):
+        raise NotImplementedError
+
+    def get_dimensions(self):
+        return list(itertools.chain(*[x.dimensions
+                                      for x in self.get_variables().values()]))

     @property
     def variables(self):
-        return FrozenOrderedDict((_decode_variable_name(k),
-                                  self.open_store_variable(v))
-                                 for k, v in iteritems(self.store_variables))
+        return FrozenOrderedDict((_decode_variable_name(k), v)
+                                 for k, v in iteritems(self.get_variables()))

-    def sync(self):
-        pass
+    @property
+    def attrs(self):
+        return FrozenOrderedDict(self.get_attrs())
+
+    @property
+    def dimensions(self):
+        return self.get_dimensions()

     def close(self):
         pass
@@ -47,6 +80,30 @@ def __exit__(self, exception_type, exception_value, tracebook):


 class AbstractWritableDataStore(AbstractDataStore):
+
+    def set_dimension(self, d, l):
+        raise NotImplementedError
+
+    def set_attribute(self, k, v):
+        raise NotImplementedError
+
+    def set_variable(self, k, v):
+        raise NotImplementedError
+
+    def sync(self):
+        pass
+
+    def store(self, dataset):
+        self.set_attributes(dataset.attrs)
+        necessary_dims = [[d for d in v.dimensions]
+                          for v in dataset.variables.values()]
+        necessary_dims = set(itertools.chain(*necessary_dims))
+        # set all non-indexes and any index which is not trivial
+        variables = {k: v for k, v in iteritems(dataset.variables)
+                     if not (k in necessary_dims and is_trivial_index(v))}
+        self.set_variables(variables)
+
     def set_dimensions(self, dimensions):
         for d, l in iteritems(dimensions):
             self.set_dimension(d, l)
@@ -58,8 +115,144 @@ def set_attributes(self, attributes):
     def set_variables(self, variables):
         for vn, v in iteritems(variables):
             self.set_variable(_encode_variable_name(vn), v)
+            self.set_necessary_dimensions(v)

     def set_necessary_dimensions(self, variable):
         for d, l in zip(variable.dimensions, variable.shape):
-            if d not in self.ds.dimensions:
+            if d not in self.dimensions:
                 self.set_dimension(d, l)
+
+
+class AbstractEncodedDataStore(AbstractWritableDataStore):
+    """
+    AbstractEncodedDataStore is an interface for making a
+    DataStore which wraps another DataStore while first passing
+    all input/output through an encoding/decoding layer.
+    This allows more modular application of things such as
+    conforming to CF Conventions.
+
+    There are no explicit restrictions requiring an
+    EncodedDataStore to be round-trippable, but when this is
+    desired (probably often), consider running implementing
+    classes through test_backends:DatasetIOTestCases.
+
+    Requires Implementation
+    -----------------------
+    encode : function(self, datastore)
+
+    decode : function(self, datastore)
+
+    """
+    def encode(self, datastore):
+        """
+        A function which takes an un-encoded datastore and returns
+        a new DataStore (or Dataset) which has been encoded.  Returning
+        an InMemoryDataStore for this is encouraged since it avoids
+        the xray consistency checks, making it faster and more flexible.
+ + """ + raise NotImplementedError + + def decode(self, datastore): + """ + A function which takes an encoded datastore and returns + a new DataStore which has been decoded. Again consider + using an InMemoryDataStore, though returning a Dataset + will work perfectly fine in most situations. + + Also note that directly accessing variable data may cause + remote DataStores to be loaded into memory. + See conventions.decode_cf_variable for examples of wrapping + computations to make them lazy. + """ + raise NotImplementedError + + @property + def decoded(self): + if not hasattr(self, '_decoded'): + self._decoded = self.decode(self.ds) + return self._decoded + + def get_dimensions(self): + return self.decoded.dimensions + + def get_variables(self): + return self.decoded.variables + + def get_attrs(self): + return self.decoded.attrs + + def store(self, dataset): + self.ds.store(self.encode(dataset)) + self.ds.sync() + + def sync(self): + self.ds.sync() + + def close(self): + self.ds.close() + + +def encoding_decorator(encoder, decoder): + """ + This is a Class decorating function which makes wrapping DataStores + in additional encoding layers easier. + + Note that often times the encoders and decoders will require arguments + at class creation time. To handle this, the encoder and decoder args + are first inspected. Any arguments they require are used first, and + any remaining arguments are passed onto the DataStore being wrapped. + + Parameters + ---------- + encoder : function + Takes a Datastore (or Dataset) and returns an encoded Datastore. + decoder : function + Takes a Datastore (or Dataset) and returns a decoded Datastore. + + Returns + ------- + class_wrapper: A function which wraps a DataStore class and turns + it into an EncodingWrappedDataStore. + """ + + def class_wrapper(cls): + class EncodingWrappedDataStore(AbstractEncodedDataStore): + + def __init__(self, *args, **kwdargs): + # NOTE: we assume that any arguments for the encoder + # and decoder are keyword args. All position arguments + # are passed on to the DataStore. + encoder_argnames = set(inspect.getargspec(encoder).args[1:]) + decoder_argnames = set(inspect.getargspec(decoder).args[1:]) + # make sure there aren't any argument collisions, that would + # get pretty confusing. + constructor_args = set(inspect.getargspec(cls.__init__)[1:]) + if constructor_args.intersection(encoder_argnames): + bad_args = constructor_args.intersection(encoder_argnames) + raise ValueError("encoder and class have overlapping args: %s" + % ', '.join(bad_args)) + if constructor_args.intersection(decoder_argnames): + bad_args = constructor_args.intersection(decoder_argnames) + raise ValueError("decoder and class have overlapping args: %s" + % ', '.join(bad_args)) + # create a set of keyword arguments for both the encoder and decoder + encoder_args = {} + decoder_args = {} + for k in encoder_argnames.union(decoder_argnames): + if k in kwdargs: + v = kwdargs.pop(k) + if k in encoder_argnames: + encoder_args[k] = v + if k in decoder_argnames: + decoder_args[k] = v + # create the data store. 
+                self.ds = cls(*args, **kwdargs)
+                # set the encode and decode functions using the
+                # provided args
+                self.encode = functools.partial(encoder, **encoder_args)
+                self.decode = functools.partial(decoder, **decoder_args)
+
+        return EncodingWrappedDataStore
+
+    return class_wrapper
\ No newline at end of file
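(A usage sketch for encoding_decorator follows; the encoder/decoder pair here is invented purely for illustration, while real pairs look like cf_encoder/cf_decoder in xray/conventions.py. Any keyword arguments required by the encoder or decoder are peeled off at construction time, and everything else is forwarded to the wrapped store's own constructor.)

    from xray.backends.common import encoding_decorator
    from xray.backends.memory import InMemoryDataStore

    def shout_attrs(ds):
        # toy encoder: uppercase the attribute keys on the way in
        return InMemoryDataStore(
            {'variables': dict(ds.variables),
             'attributes': dict((k.upper(), v) for k, v in ds.attrs.items())})

    def hush_attrs(ds):
        # toy decoder: lowercase them again on the way out
        return InMemoryDataStore(
            {'variables': dict(ds.variables),
             'attributes': dict((k.lower(), v) for k, v in ds.attrs.items())})

    @encoding_decorator(shout_attrs, hush_attrs)
    class ShoutingInMemoryStore(InMemoryDataStore):
        pass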
diff --git a/xray/backends/memory.py b/xray/backends/memory.py
index 1a180b21c6a..4215526ce24 100644
--- a/xray/backends/memory.py
+++ b/xray/backends/memory.py
@@ -1,3 +1,6 @@
+import copy
+import itertools
+
 from collections import OrderedDict

 from .common import AbstractWritableDataStore
@@ -7,22 +10,29 @@ class InMemoryDataStore(AbstractWritableDataStore):
     """
     Stores dimensions, variables and attributes
     in ordered dictionaries, making this store
-    fast compared to stores which store to disk.
+    fast compared to stores which save to disk.
     """
-    def __init__(self):
-        self.dimensions = OrderedDict()
-        self.variables = OrderedDict()
-        self.attributes = OrderedDict()
+    def __init__(self, dict_store=None):
+        if dict_store is None:
+            dict_store = {}
+            dict_store['variables'] = OrderedDict()
+            dict_store['attributes'] = OrderedDict()
+        self.ds = dict_store

-    def set_dimension(self, name, length):
-        self.dimensions[name] = length
+    def get_attrs(self):
+        return self.ds['attributes']

-    def set_attribute(self, key, value):
-        self.attributes[key] = value
+    def get_variables(self):
+        return self.ds['variables']

-    def set_variable(self, name, variable):
-        self.variables[name] = variable
-        return self.variables[name]
+    def set_variable(self, k, v):
+        # we copy the variable and stuff all encodings into the
+        # attributes to imitate what happens when writing to disk
+        new_var = copy.deepcopy(v)
+        new_var.attrs.update(new_var.encoding)
+        new_var.encoding.clear()
+        self.ds['variables'][k] = new_var

-    def del_attribute(self, key):
-        del self.attributes[key]
+    def set_attribute(self, k, v):
+        # copy to imitate writing to disk
+        self.ds['attributes'][k] = copy.deepcopy(v)
\ No newline at end of file
diff --git a/xray/backends/netCDF4_.py b/xray/backends/netCDF4_.py
index 1a39e310d1e..5645b4702bc 100644
--- a/xray/backends/netCDF4_.py
+++ b/xray/backends/netCDF4_.py
@@ -6,7 +6,7 @@
 from .common import AbstractWritableDataStore
 from .netcdf3 import encode_nc3_variable
 import xray
-from xray.conventions import encode_cf_variable
+from xray.conventions import cf_encoded, pop_to
 from xray.utils import FrozenOrderedDict, NDArrayMixin
 from xray import indexing
 from xray.pycompat import iteritems, basestring, bytes_type
@@ -88,6 +88,7 @@ def _ensure_fill_value_valid(data, attributes):
         attributes['_FillValue'] = np.string_(attributes['_FillValue'])


+@cf_encoded
 class NetCDF4DataStore(AbstractWritableDataStore):
     """Store for reading and writing data via the Python-NetCDF4 library.
@@ -131,17 +132,18 @@ def open_store_variable(self, var):
         # TODO: figure out how to round-trip "endian-ness" without raising
         # warnings from netCDF4
         # encoding['endian'] = var.endian()
-        encoding['least_significant_digit'] = \
-            attributes.pop('least_significant_digit', None)
+        pop_to(attributes, encoding, 'least_significant_digit')
         return xray.Variable(dimensions, data, attributes, encoding)

-    @property
-    def attrs(self):
+    def get_variables(self):
+        return FrozenOrderedDict((k, self.open_store_variable(v))
+                                 for k, v in iteritems(self.ds.variables))
+
+    def get_attrs(self):
         return FrozenOrderedDict((k, self.ds.getncattr(k))
                                  for k in self.ds.ncattrs())

-    @property
-    def dimensions(self):
+    def get_dimensions(self):
         return FrozenOrderedDict((k, len(v)) for k, v in
                                  iteritems(self.ds.dimensions))

@@ -152,7 +154,7 @@ def set_attribute(self, key, value):
         self.ds.setncattr(key, value)

     def set_variable(self, name, variable):
-        variable = encode_cf_variable(variable)
+        attrs = variable.attrs.copy()
         if self.format == 'NETCDF4':
             values, datatype = _nc4_values_and_dtype(variable)
         else:
@@ -162,7 +164,7 @@ def set_variable(self, name, variable):

         self.set_necessary_dimensions(variable)

-        fill_value = variable.attrs.pop('_FillValue', None)
+        fill_value = attrs.pop('_FillValue', None)
         if fill_value in ['', '\x00']:
             # these are equivalent to the default FillValue, but netCDF4
             # doesn't like setting fill_value to an empty string
@@ -184,7 +186,7 @@ def set_variable(self, name, variable):
                                          fill_value=fill_value)
         nc4_var.set_auto_maskandscale(False)
         nc4_var[:] = values
-        for k, v in iteritems(variable.attrs):
+        for k, v in iteritems(attrs):
             # set attributes one-by-one since netCDF4<1.0.10 can't handle
             # OrderedDict as the input to setncatts
             nc4_var.setncattr(k, v)
@@ -196,4 +198,4 @@ def sync(self):
         self.ds.sync()

     def close(self):
-        self.ds.close()
+        self.ds.close()
\ No newline at end of file
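(An aside on pop_to, used in open_store_variable above and throughout conventions.py: as this PR uses it, it pops a key from the source dict into the destination dict and returns the value, roughly like this illustration with made-up values.)

    attrs = {'least_significant_digit': 2, 'units': 'm'}
    encoding = {}
    value = pop_to(attrs, encoding, 'least_significant_digit')
    # attrs is now {'units': 'm'}, encoding is
    # {'least_significant_digit': 2}, and value == 2; when the key
    # is absent, pop_to simply returns None.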
diff --git a/xray/backends/pydap_.py b/xray/backends/pydap_.py
index 4717c3fc4ad..4320261b744 100644
--- a/xray/backends/pydap_.py
+++ b/xray/backends/pydap_.py
@@ -50,10 +50,12 @@ def open_store_variable(self, var):
         data = indexing.LazilyIndexedArray(PydapArrayWrapper(var))
         return xray.Variable(var.dimensions, data, var.attributes)

-    @property
-    def store_variables(self):
-        return self.ds
+    def get_variables(self):
+        return FrozenOrderedDict((k, self.open_store_variable(v))
+                                 for k, v in self.ds.iteritems())

-    @property
-    def attrs(self):
+    def get_attrs(self):
         return Frozen(self.ds.attributes)
+
+    def get_dimensions(self):
+        return Frozen(self.ds.dimensions)
diff --git a/xray/backends/scipy_.py b/xray/backends/scipy_.py
index 0162fc93f9b..0335b4d62a7 100644
--- a/xray/backends/scipy_.py
+++ b/xray/backends/scipy_.py
@@ -8,7 +8,7 @@
 import xray

 from xray.backends.common import AbstractWritableDataStore
-from xray.utils import Frozen
+from xray.utils import Frozen, FrozenOrderedDict
 from xray.pycompat import iteritems, basestring, unicode_type

 from .. import conventions
@@ -27,7 +27,7 @@ def _decode_attrs(d):
     return OrderedDict((k, v if k == '_FillValue' else _decode_string(v))
                        for (k, v) in iteritems(d))

-
+@conventions.cf_encoded
 class ScipyDataStore(AbstractWritableDataStore):
     """Store for reading and writing data via scipy.io.netcdf.
@@ -55,16 +55,15 @@ def __init__(self, filename_or_obj, mode='r', mmap=None, version=1):
         self.ds = scipy.io.netcdf.netcdf_file(
             filename_or_obj, mode=mode, mmap=mmap, version=version)

-    def open_store_variable(self, var):
-        return xray.Variable(var.dimensions, var.data,
-                             _decode_attrs(var._attributes))
+    def get_variables(self):
+        return FrozenOrderedDict((k, xray.Variable(v.dimensions, v.data,
+                                                   _decode_attrs(v._attributes)))
+                                 for k, v in self.ds.variables.iteritems())

-    @property
-    def attrs(self):
+    def get_attrs(self):
         return Frozen(_decode_attrs(self.ds._attributes))

-    @property
-    def dimensions(self):
+    def get_dimensions(self):
         return Frozen(self.ds.dimensions)

     def set_dimension(self, name, length):
@@ -92,8 +91,8 @@ def set_attribute(self, key, value):
         setattr(self.ds, key, self._cast_attr_value(value))

     def set_variable(self, name, variable):
-        variable = encode_nc3_variable(
-            conventions.encode_cf_variable(variable))
+        # TODO: create a netCDF3 encoder
+        variable = encode_nc3_variable(variable)
         self.set_necessary_dimensions(variable)
         data = variable.values
         self.ds.createVariable(name, data.dtype, variable.dimensions)
diff --git a/xray/conventions.py b/xray/conventions.py
index b36025d4466..da224a68643 100644
--- a/xray/conventions.py
+++ b/xray/conventions.py
@@ -8,6 +8,8 @@
 from . import utils
 from .pycompat import iteritems, bytes_type, unicode_type
 import xray
+from xray.backends.common import encoding_decorator
+from xray.backends import InMemoryDataStore

 # standard calendars recognized by netcdftime
 _STANDARD_CALENDARS = {'standard', 'gregorian', 'proleptic_gregorian'}
@@ -391,8 +393,24 @@ def _infer_dtype(array):


 def encode_cf_variable(var):
-    """Converts an Variable into an Variable suitable for saving as a netCDF
-    variable
+    """
+    Converts a Variable into a Variable which follows some
+    of the CF conventions:
+
+        - NaNs are masked using _FillValue (or the deprecated
+          missing_value)
+        - rescaling is applied via scale_factor and add_offset
+        - datetimes are converted to the CF 'units since a reference
+          time' format (e.g. 'days since 1970-01-01')
+        - dtype encodings are enforced
+
+    Parameters
+    ----------
+    var : xray.Variable
+        A variable holding un-encoded data.
+
+    Returns
+    -------
+    out : xray.Variable
+        A variable which has been encoded as described above.
     """
     dimensions = var.dimensions
     data = var.values
@@ -428,6 +446,15 @@ def encode_cf_variable(var):
             data = data.copy()
             data[missing] = fill_value

+    # replace NaN with the missing_value
+    if 'missing_value' in encoding:
+        missing_value = pop_to(encoding, attributes, 'missing_value')
+        if not pd.isnull(missing_value):
+            missing = pd.isnull(data)
+            if missing.any():
+                data = data.copy()
+                data[missing] = missing_value
+
     # cast to encoded dtype
     if 'dtype' in encoding:
         dtype = np.dtype(encoding.pop('dtype'))
@@ -449,7 +476,7 @@ def encode_cf_variable(var):

         if inferred_dtype.kind in ['S', 'U']:
             # There is no safe bit-pattern for NA in typical binary string
-            # formats, we so can't set a _FillValue. Unfortunately, this
+            # formats, so we can't set a fill_value.  Unfortunately, this
             # means we won't be able to restore string arrays with missing
             # values.
             fill_value = ''
@@ -463,12 +490,37 @@ def encode_cf_variable(var):
                 data[missing] = fill_value
         else:
             data = np.asarray(data, dtype=_infer_dtype(data))
-
     return xray.Variable(dimensions, data, attributes, encoding=encoding)


 def decode_cf_variable(var, concat_characters=True, mask_and_scale=True,
                        decode_times=True):
+    """
+    Decodes a variable which may hold CF encoded information.
+
+    This includes variables that have been masked and scaled, that
+    hold CF style time variables (this is almost always the case if
+    the dataset has been serialized) and that have strings encoded
+    as character arrays.
+
+    Parameters
+    ----------
+    var : Variable
+        A variable holding potentially CF encoded information.
+    concat_characters : bool
+        Should character arrays be concatenated to strings, for
+        example: ['h', 'e', 'l', 'l', 'o'] -> 'hello'
+    mask_and_scale : bool
+        Lazily scale (using scale_factor and add_offset) and mask
+        (using _FillValue).
+    decode_times : bool
+        Decode CF times ('hours since 2000-01-01') to np.datetime64.
+
+    Returns
+    -------
+    out : Variable
+        A variable holding the decoded equivalent of var.
+    """
     # use _data instead of data so as not to trigger loading data
     var = xray.variable.as_variable(var)
     data = var._data
@@ -478,8 +530,9 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True,

     if 'dtype' in encoding:
         if data.dtype != encoding['dtype']:
-            raise ValueError("Refused to overwrite dtype")
-    encoding['dtype'] = data.dtype
+            warnings.warn("CF decoding is overwriting dtype")
+    else:
+        encoding['dtype'] = data.dtype

     if concat_characters:
         if data.dtype.kind == 'S' and data.dtype.itemsize == 1:
@@ -487,7 +540,15 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True,
             data = CharToStringArray(data)

     if mask_and_scale:
+        # missing_value is deprecated, but we still want to support it
+        missing_value = pop_to(attributes, encoding, 'missing_value')
         fill_value = pop_to(attributes, encoding, '_FillValue')
+        # if missing_value is given but not fill_value, use missing_value
+        if fill_value is None and missing_value is not None:
+            fill_value = missing_value
+        # if both were given, make sure they are the same
+        if fill_value is not None and missing_value is not None:
+            assert fill_value == missing_value
         scale_factor = pop_to(attributes, encoding, 'scale_factor')
         add_offset = pop_to(attributes, encoding, 'add_offset')
         if ((fill_value is not None and not pd.isnull(fill_value))
@@ -511,7 +572,10 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True,

 def decode_cf_variables(variables, concat_characters=True,
                         mask_and_scale=True, decode_times=True):
-    """Decode a bunch of CF variables together.
+    """
+    Decode several CF encoded variables together.
+
+    See: decode_cf_variable
     """
     dimensions_used_by = defaultdict(list)
     for v in variables.values():
@@ -535,3 +599,79 @@ def stackable(dim):
                 v, concat_characters=concat, mask_and_scale=mask_and_scale,
                 decode_times=decode_times)
     return new_vars
+
+
+def cf_decoder(ds, concat_characters=True, mask_and_scale=True,
+               decode_times=True, decode_cf=True):
+    """
+    Decode a data store or Dataset which holds CF encoded variables.
+
+    See also: decode_cf_variable
+
+    Parameters
+    ----------
+    ds : DataStore
+        This can technically be any object with properties 'variables'
+        and 'attrs' and whose constructor follows type(ds)(variables, attrs).
+    concat_characters : bool
+        Should character arrays be concatenated to strings, for
+        example: ['h', 'e', 'l', 'l', 'o'] -> 'hello'
+    mask_and_scale : bool
+        Lazily scale (using scale_factor and add_offset) and mask
+        (using _FillValue).
+    decode_times : bool
+        Decode CF times ('hours since 2000-01-01') to np.datetime64.
+    decode_cf : bool
+        If False, skip decoding.  This is retained for backward
+        compatibility.
+
+    Returns
+    -------
+    ds : DataStore
+        A DataStore holding the decoded variables and attributes.
+    """
+    # if decode_cf is false, we do nothing
+    if not decode_cf:
+        return ds
+    new_vars = decode_cf_variables(ds.variables, concat_characters,
+                                   mask_and_scale, decode_times)
+    # Note that we don't return a Dataset because in some (though
+    # very few) cases the backend stores are more flexible than
+    # xray.  For example, a string Index which gets expanded to a
+    # character array during CF encoding would require the
+    # multidimensional indexes which xray does not currently support.
+    # Instead we store the variables and attributes as a dictionary
+    # in an in-memory store.
+    return InMemoryDataStore({'variables': new_vars,
+                              'attributes': ds.attrs})
+
+
+def cf_encoder(ds, encode_cf=True):
+    """
+    A function which takes a DataStore (ds) and encodes its
+    variables and attributes to conform to CF conventions as much
+    as possible.  This includes masking, scaling, character
+    array handling, and CF-time encoding.
+
+    See also: encode_cf_variable
+    """
+    if not encode_cf:
+        return ds
+    new_vars = OrderedDict((k, encode_cf_variable(v))
+                           for k, v in iteritems(ds.variables))
+    return InMemoryDataStore({'variables': new_vars,
+                              'attributes': ds.attrs})
+
+
+def cf_encoded(*args, **kwdargs):
+    """
+    This class decorator can be used to turn a DataStore into a
+    CF encoded DataStore.  For example, to take some DataStore
+    and add a CF encoding layer you can do this:
+
+    @cf_encoded
+    class CFPunchCardDataStore(PunchCardDataStore):
+        pass
+
+    See also: encoding_decorator, cf_encoder, cf_decoder
+    """
+    return encoding_decorator(cf_encoder, cf_decoder)(*args, **kwdargs)
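(To make the mask-and-scale decoding described above concrete, with illustrative numbers only: a raw value decodes to NaN where it equals the fill value, and to raw * scale_factor + add_offset everywhere else.)

    import numpy as np

    raw = np.array([-999, 10, 20], dtype='int16')
    scale_factor, add_offset, fill_value = 0.5, 100.0, -999
    decoded = np.where(raw == fill_value, np.nan,
                       raw * scale_factor + add_offset)
    # decoded -> array([ nan, 105., 110.])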
diff --git a/xray/dataset.py b/xray/dataset.py
index e6f96308ac3..e0c4c658d35 100644
--- a/xray/dataset.py
+++ b/xray/dataset.py
@@ -166,7 +166,7 @@ def _expand_variables(raw_variables, old_variables={}, compat='identical'):

     This includes converting tuples (dimensions, data) into Variable objects,
     converting index variables into Index objects and expanding DataArray
-    objects into Variables plus Indexs.
+    objects into Variables plus Indexes.

     Raises ValueError if any conflicting values are found, between any of the
     new or old variables.
@@ -307,7 +310,10 @@ def _add_missing_indexes(self):
         """
         for dim, size in iteritems(self._dimensions):
             if dim not in self._variables:
-                coord = variable.Index(dim, np.arange(size))
+                # This is equivalent to np.arange(size), but waits to
+                # create the array until it's actually accessed.
+                data = indexing.LazyIntegerRange(size)
+                coord = variable.Index(dim, data)
                 self._variables[dim] = coord

     def _update_vars_and_dims(self, new_variables, needs_copy=True):
@@ -337,17 +340,16 @@ def _set_init_vars_and_dims(self, variables):
         self._update_vars_and_dims(new_variables, needs_copy=False)

     @classmethod
-    def load_store(cls, store, decode_cf=True, mask_and_scale=True,
-                   decode_times=True, concat_characters=True):
+    def load_store(cls, store, decoder=None, *args, **kwdargs):
         """Create a new dataset from the contents of a backends.*DataStore
         object
         """
-        variables = store.variables
-        if decode_cf:
-            variables = conventions.decode_cf_variables(
-                store.variables, mask_and_scale=mask_and_scale,
-                decode_times=decode_times, concat_characters=concat_characters)
-        return cls(variables, store.attrs)
+        if decoder:
+            # here the name 'store' is a bit overloaded: after decoding
+            # it will typically actually be a Dataset, but it still
+            # functions the way a store does
+            store = decoder(store, *args, **kwdargs)
+        return cls(store.variables, store.attrs)

     @property
     def variables(self):
@@ -562,10 +564,12 @@ def nonindexes(self):
         return FrozenOrderedDict((name, self[name]) for name in self
                                  if name not in self.dimensions)

-    def dump_to_store(self, store):
+    def dump_to_store(self, store, encoder=None):
         """Store dataset contents to a backends.*DataStore object."""
-        store.set_variables(self.variables)
-        store.set_attributes(self.attrs)
+        ds = self
+        if encoder:
+            ds = encoder(ds)
+        store.store(ds)
         store.sync()

     def to_netcdf(self, filepath, **kwdargs):
diff --git a/xray/indexing.py b/xray/indexing.py
index 454f191e71e..a952b00d428 100644
--- a/xray/indexing.py
+++ b/xray/indexing.py
@@ -174,6 +174,47 @@ def _index_indexer_1d(old_indexer, applied_indexer, size):
     return indexer


+class LazyIntegerRange(utils.NDArrayMixin):
+
+    def __init__(self, *args, **kwdargs):
+        """
+        Parameters
+        ----------
+        See np.arange
+        """
+        self.args = args
+        self.kwdargs = kwdargs
+        assert 'dtype' not in self.kwdargs
+        # xrange will fail if any arguments are not integers
+        self.array = xrange(*args, **kwdargs)
+
+    @property
+    def shape(self):
+        return (len(self.array),)
+
+    @property
+    def dtype(self):
+        return np.dtype('int64')
+
+    @property
+    def ndim(self):
+        return 1
+
+    @property
+    def size(self):
+        return len(self.array)
+
+    def __getitem__(self, key):
+        return np.array(self)[key]
+
+    def __array__(self, dtype=None):
+        return np.arange(*self.args, **self.kwdargs)
+
+    def __repr__(self):
+        return ('%s(array=%r)' %
+                (type(self).__name__, self.array))
+
+
 class LazilyIndexedArray(utils.NDArrayMixin):
     """Wrap an array that handles orthogonal indexing to make indexing lazy
     """
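(A quick illustration of what LazyIntegerRange buys, as a hypothetical session: the range is backed by an xrange and only materialized, via np.arange in __array__, when something actually asks for the values.)

    >>> r = LazyIntegerRange(5)
    >>> r.shape, r.dtype, r.size
    ((5,), dtype('int64'), 5)
    >>> r[2:4]  # materializes only at this point
    array([2, 3])
    >>> np.asarray(r)
    array([0, 1, 2, 3, 4])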