
Modular encoding #175

Closed · wants to merge 3 commits

193 changes: 112 additions & 81 deletions test/test_backends.py
@@ -121,13 +121,19 @@ def test_roundtrip_object_dtype(self):
# see the note under test_zero_dimensional_variable
del original['nan']
expected = original.copy(deep=True)
expected['letters_nans'][-1] = ''
if type(self) is not NetCDF4DataTest:
if type(self) in [NetCDF3ViaNetCDF4DataTest, ScipyDataTest]:
# for netCDF3 tests, expect the results to come back as characters
expected['letters_nans'] = expected['letters_nans'].astype('S')
expected['letters'] = expected['letters'].astype('S')
with self.roundtrip(original) as actual:
self.assertDatasetIdentical(expected, actual)
try:
self.assertDatasetIdentical(expected, actual)
except AssertionError:
# Most stores use '' for nans in strings, but some don't.
# First try the ideal case (where the store returns exactly
# the original Dataset), then try a more realistic case.
expected['letters_nans'][-1] = ''
self.assertDatasetIdentical(expected, actual)

def test_roundtrip_string_data(self):
expected = Dataset({'x': ('t', ['ab', 'cdef'])})
@@ -136,6 +142,36 @@ def test_roundtrip_string_data(self):
expected['x'] = expected['x'].astype('S')
self.assertDatasetIdentical(expected, actual)

def test_roundtrip_example_1_netcdf(self):
expected = open_example_dataset('example_1.nc')
with self.roundtrip(expected) as actual:
# we allow the attributes to differ since that
# will depend on the encoding used. For example,
# without CF encoding 'actual' will end up with
# a dtype attribute.
self.assertDatasetEqual(expected, actual)

def test_orthogonal_indexing(self):
in_memory = create_test_data()
with self.roundtrip(in_memory) as on_disk:
indexers = {'dim1': np.arange(3), 'dim2': np.arange(4),
'dim3': np.arange(5)}
expected = in_memory.isel(**indexers)
actual = on_disk.isel(**indexers)
self.assertDatasetAllClose(expected, actual)
# do it twice, to make sure we switch from orthogonal -> numpy
# indexing once the values are cached
actual = on_disk.isel(**indexers)
self.assertDatasetAllClose(expected, actual)

def test_pickle(self):
on_disk = open_example_dataset('bears.nc')
unpickled = pickle.loads(pickle.dumps(on_disk))
self.assertDatasetIdentical(on_disk, unpickled)


class CFEncodedDataTest(DatasetIOTestCases):

def test_roundtrip_strings_with_fill_value(self):
values = np.array(['ab', 'cdef', np.nan], dtype=object)
encoding = {'_FillValue': np.string_('X'), 'dtype': np.dtype('S1')}
@@ -166,34 +202,17 @@ def test_roundtrip_mask_and_scale(self):
self.assertDatasetAllClose(decoded, actual)
with self.roundtrip(decoded, decode_cf=False) as actual:
self.assertDatasetAllClose(encoded, actual)
with self.roundtrip(encoded, decode_cf=False) as actual:
self.assertDatasetAllClose(encoded, actual)
# make sure roundtrip encoding didn't change the
# original dataset.
self.assertDatasetIdentical(encoded,
create_encoded_masked_and_scaled_data())
with self.roundtrip(encoded) as actual:
self.assertDatasetAllClose(decoded, actual)
with self.roundtrip(encoded, decode_cf=False) as actual:
self.assertDatasetAllClose(encoded, actual)

def test_roundtrip_example_1_netcdf(self):
expected = open_example_dataset('example_1.nc')
with self.roundtrip(expected) as actual:
self.assertDatasetIdentical(expected, actual)

def test_orthogonal_indexing(self):
in_memory = create_test_data()
with self.roundtrip(in_memory) as on_disk:
indexers = {'dim1': np.arange(3), 'dim2': np.arange(4),
'dim3': np.arange(5)}
expected = in_memory.isel(**indexers)
actual = on_disk.isel(**indexers)
self.assertDatasetAllClose(expected, actual)
# do it twice, to make sure we're switched from orthogonal -> numpy
# when we cached the values
actual = on_disk.isel(**indexers)
self.assertDatasetAllClose(expected, actual)

def test_pickle(self):
on_disk = open_example_dataset('bears.nc')
unpickled = pickle.loads(pickle.dumps(on_disk))
self.assertDatasetIdentical(on_disk, unpickled)


@contextlib.contextmanager
def create_tmp_file(suffix='.nc'):
@@ -206,7 +225,7 @@ def create_tmp_file(suffix='.nc'):


@requires_netCDF4
class NetCDF4DataTest(DatasetIOTestCases, TestCase):
class NetCDF4DataTest(CFEncodedDataTest, TestCase):
@contextlib.contextmanager
def create_store(self):
with create_tmp_file() as tmp_file:
@@ -215,33 +234,12 @@ def create_store(self):
@contextlib.contextmanager
def roundtrip(self, data, **kwargs):
with create_tmp_file() as tmp_file:
data.dump(tmp_file)
yield open_dataset(tmp_file, **kwargs)

def test_open_encodings(self):
# Create a netCDF file with explicit time units
# and make sure it makes it into the encodings
# and survives a round trip
with create_tmp_file() as tmp_file:
with nc4.Dataset(tmp_file, 'w') as ds:
ds.createDimension('time', size=10)
ds.createVariable('time', np.int32, dimensions=('time',))
units = 'days since 1999-01-01'
ds.variables['time'].setncattr('units', units)
ds.variables['time'][:] = np.arange(10) + 4

expected = Dataset()

time = pd.date_range('1999-01-05', periods=10)
encoding = {'units': units, 'dtype': np.dtype('int32')}
expected['time'] = ('time', time, {}, encoding)

actual = open_dataset(tmp_file)

self.assertVariableEqual(actual['time'], expected['time'])
actual_encoding = {k: v for k, v in iteritems(actual['time'].encoding)
if k in expected['time'].encoding}
self.assertDictEqual(actual_encoding, expected['time'].encoding)
with backends.NetCDF4DataStore(tmp_file,
mode='w', **kwargs) as store:
store.store(data)
with backends.NetCDF4DataStore(tmp_file,
mode='r', **kwargs) as store:
yield Dataset.load_store(store, decoder=None)

def test_open_group(self):
# Create a netCDF file with a dataset stored within a group
@@ -288,27 +286,6 @@ def test_open_subgroup(self):
actual = open_dataset(tmp_file, group=group)
self.assertVariableEqual(actual['x'], expected['x'])

def test_dump_and_open_encodings(self):
# Create a netCDF file with explicit time units
# and make sure it makes it into the encodings
# and survives a round trip
with create_tmp_file() as tmp_file:
with nc4.Dataset(tmp_file, 'w') as ds:
ds.createDimension('time', size=10)
ds.createVariable('time', np.int32, dimensions=('time',))
units = 'days since 1999-01-01'
ds.variables['time'].setncattr('units', units)
ds.variables['time'][:] = np.arange(10) + 4

xray_dataset = open_dataset(tmp_file)

with create_tmp_file() as tmp_file2:
xray_dataset.dump(tmp_file2)

with nc4.Dataset(tmp_file2, 'r') as ds:
self.assertEqual(ds.variables['time'].getncattr('units'), units)
self.assertArrayEqual(ds.variables['time'], np.arange(10) + 4)

def test_compression_encoding(self):
data = create_test_data()
data['var2'].encoding.update({'zlib': True,
@@ -391,23 +368,73 @@ def test_roundtrip_character_array(self):
with self.roundtrip(actual) as roundtripped:
self.assertDatasetIdentical(expected, roundtripped)

def test_open_encodings(self):
# Create a netCDF file with explicit time units
# and make sure it makes it into the encodings
# and survives a round trip
with create_tmp_file() as tmp_file:
with nc4.Dataset(tmp_file, 'w') as ds:
ds.createDimension('time', size=10)
ds.createVariable('time', np.int32, dimensions=('time',))
units = 'days since 1999-01-01'
ds.variables['time'].setncattr('units', units)
ds.variables['time'][:] = np.arange(10) + 4

expected = Dataset()

time = pd.date_range('1999-01-05', periods=10)
encoding = {'units': units, 'dtype': np.dtype('int32')}
expected['time'] = ('time', time, {}, encoding)

actual = open_dataset(tmp_file)

self.assertVariableEqual(actual['time'], expected['time'])
actual_encoding = {k: v for k, v in iteritems(actual['time'].encoding)
if k in expected['time'].encoding}
self.assertDictEqual(actual_encoding, expected['time'].encoding)

def test_dump_and_open_encodings(self):
# Create a netCDF file with explicit time units
# and make sure it makes it into the encodings
# and survives a round trip
with create_tmp_file() as tmp_file:
with nc4.Dataset(tmp_file, 'w') as ds:
ds.createDimension('time', size=10)
ds.createVariable('time', np.int32, dimensions=('time',))
units = 'days since 1999-01-01'
ds.variables['time'].setncattr('units', units)
ds.variables['time'][:] = np.arange(10) + 4

xray_dataset = open_dataset(tmp_file)

with create_tmp_file() as tmp_file2:
xray_dataset.dump(tmp_file2)

with nc4.Dataset(tmp_file2, 'r') as ds:
self.assertEqual(ds.variables['time'].getncattr('units'), units)
self.assertArrayEqual(ds.variables['time'], np.arange(10) + 4)


@requires_netCDF4
@requires_scipy
class ScipyDataTest(DatasetIOTestCases, TestCase):
class ScipyDataTest(CFEncodedDataTest, TestCase):
@contextlib.contextmanager
def create_store(self):
fobj = BytesIO()
yield backends.ScipyDataStore(fobj, 'w')

@contextlib.contextmanager
def roundtrip(self, data, **kwargs):
serialized = data.dumps()
yield open_dataset(BytesIO(serialized), **kwargs)
fobj = BytesIO()
with backends.ScipyDataStore(fobj, 'w', **kwargs) as store:
store.store(data)
serialized = fobj.getvalue()
with backends.ScipyDataStore(serialized, 'r', **kwargs) as store:
yield Dataset.load_store(store, decoder=None)


@requires_netCDF4
class NetCDF3ViaNetCDF4DataTest(DatasetIOTestCases, TestCase):
class NetCDF3ViaNetCDF4DataTest(CFEncodedDataTest, TestCase):
@contextlib.contextmanager
def create_store(self):
with create_tmp_file() as tmp_file:
@@ -417,8 +444,12 @@ def create_store(self):
@contextlib.contextmanager
def roundtrip(self, data, **kwargs):
with create_tmp_file() as tmp_file:
data.dump(tmp_file, format='NETCDF3_CLASSIC')
yield open_dataset(tmp_file, **kwargs)
with backends.NetCDF4DataStore(tmp_file, mode='w',
format='NETCDF3_CLASSIC', **kwargs) as store:
store.store(data)
with backends.NetCDF4DataStore(tmp_file,
mode='r', **kwargs) as store:
yield Dataset.load_store(store, decoder=None)


@requires_netCDF4
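The three `roundtrip` helpers above now share one idiom: write through the backend's DataStore, then re-open it and load with `decoder=None` so each test controls decoding explicitly. A generic sketch of that pattern (the `roundtrip_via_store` helper and its `open_store(mode)` factory argument are hypothetical, not code from this PR; `Dataset.load_store` is the call used throughout this diff):

```python
import contextlib

from xray import Dataset


@contextlib.contextmanager
def roundtrip_via_store(data, open_store):
    # Write through the backend's DataStore, then re-open it read-only and
    # load with decoder=None, so each test controls decoding explicitly.
    with open_store(mode='w') as store:
        store.store(data)
    with open_store(mode='r') as store:
        yield Dataset.load_store(store, decoder=None)
```
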
69 changes: 68 additions & 1 deletion test/test_conventions.py
@@ -1,9 +1,12 @@
import numpy as np
import pandas as pd
import warnings
import contextlib

from xray import conventions, Variable
from xray import conventions, Variable, Dataset, utils, indexing
from . import TestCase, requires_netCDF4
from .test_backends import CFEncodedDataTest, DatasetIOTestCases
from xray.backends.memory import InMemoryDataStore


class TestMaskedAndScaledArray(TestCase):
@@ -275,3 +278,67 @@ def test_incompatible_attributes(self):
for var in invalid_vars:
with self.assertRaises(ValueError):
conventions.encode_cf_variable(var)


@conventions.cf_encoded
class CFEncodedInMemoryStore(InMemoryDataStore):
pass


class TestCFEncodedDataStore(CFEncodedDataTest, TestCase):
@contextlib.contextmanager
def create_store(self):
yield CFEncodedInMemoryStore()

@contextlib.contextmanager
def roundtrip(self, data, **kwargs):
store = CFEncodedInMemoryStore(**kwargs)
store.store(data)
yield Dataset.load_store(store, decoder=None)


class NullWrapper(utils.NDArrayMixin):
"""
Just for testing, this lets us create a numpy array directly
but make it look like it's not in memory yet.
"""
def __init__(self, array):
self.array = array

def __getitem__(self, key):
return self.array[indexing.orthogonal_indexer(key, self.shape)]


def lazy_identity(x):
"""
Given a data store, this wraps each variable in a NullWrapper so that
it appears not to be loaded in memory.
"""
variables = {k: Variable(v.dimensions,
NullWrapper(v.values),
v.attrs) for k, v in x.variables.iteritems()}
return InMemoryDataStore({'variables': variables,
'attributes': x.attrs})


@conventions.encoding_decorator(lambda x: x, lazy_identity)
Review comment (Member):
decorator + inheritance feels like one more step than necessary...
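A minimal sketch of the composition-based alternative the reviewer may have in mind (hypothetical names, not code from this PR): hand the encode/decode callables to the store at construction time instead of pairing a class decorator with an empty subclass.

```python
class EncodingStore(object):
    """Wrap a base store, applying `encode` on write and `decode` on read."""

    def __init__(self, base_store, encode=lambda ds: ds, decode=lambda ds: ds):
        self.base_store = base_store
        self.encode = encode
        self.decode = decode

    def store(self, dataset):
        # Encode just before handing the dataset to the underlying store.
        self.base_store.store(self.encode(dataset))

    def load(self):
        # Decode just after the underlying store produces a dataset.
        return self.decode(self.base_store.load())
```

An identity-encoded store would then be built as `EncodingStore(InMemoryDataStore())` with the identity defaults, rather than by declaring a new class.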

class IdentityEncodedInMemoryStore(InMemoryDataStore):
"""
This InMemoryStore does no encoding or decoding, other than
wrapping all variables in NullWrappers, which lets us
test the trivial case of encoding and decoding.
"""
pass


class EncodedDataTest(DatasetIOTestCases, TestCase):

@contextlib.contextmanager
def create_store(self):
yield IdentityEncodedInMemoryStore()

@contextlib.contextmanager
def roundtrip(self, data, **kwargs):
store = IdentityEncodedInMemoryStore(**kwargs)
store.store(data)
yield Dataset.load_store(store, decoder=None)
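NullWrapper exists purely so a plain numpy array behaves like lazily loaded, on-disk data: every access must go through `__getitem__`. A self-contained stand-in illustrating the idea (not xray's exact class; in the real version `utils.NDArrayMixin` presumably supplies `shape` and related attributes):

```python
import numpy as np

class ArrayWrapper(object):
    """Hold a real ndarray but route all access through __getitem__,
    mimicking a lazily loaded, on-disk variable."""

    def __init__(self, array):
        self.array = array

    @property
    def shape(self):
        return self.array.shape

    def __getitem__(self, key):
        return self.array[key]

wrapped = ArrayWrapper(np.arange(6).reshape(2, 3))
assert np.array_equal(wrapped[0], np.arange(3))
```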
23 changes: 7 additions & 16 deletions test/test_dataset.py
@@ -51,22 +51,13 @@ def __getitem__(self, key):


class InaccessibleVariableDataStore(backends.InMemoryDataStore):
def __init__(self):
self.dimensions = OrderedDict()
self._variables = OrderedDict()
self.attrs = OrderedDict()

def set_variable(self, name, variable):
self._variables[name] = variable
return self._variables[name]

def open_store_variable(self, var):
data = indexing.LazilyIndexedArray(InaccessibleArray(var.values))
return Variable(var.dimensions, data, var.attrs)

@property
def store_variables(self):
return self._variables

def get_variables(self):
def lazy_inaccessible(x):
data = indexing.LazilyIndexedArray(InaccessibleArray(x.values))
return Variable(x.dimensions, data, x.attrs)
return {k: lazy_inaccessible(v) for
k, v in self.ds['variables'].iteritems()}
Review comment (Member):
this should be pycompat.iteritems for python 3
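A minimal stand-in for the compat helper being suggested (illustrative only; xray's actual pycompat module may differ):

```python
import sys

# Dict iteration that works on Python 2 and Python 3: py3 dicts have no
# .iteritems(), so fall back to an iterator over d.items().
if sys.version_info[0] >= 3:
    def iteritems(d):
        return iter(d.items())
else:
    def iteritems(d):
        return d.iteritems()
```

`get_variables` above would then call `iteritems(self.ds['variables'])` instead of invoking `.iteritems()` on the dict directly.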



class TestDataset(TestCase):