Skip to content

Modular encodings (rebased) #245

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Oct 11, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
130 changes: 119 additions & 11 deletions xray/backends/common.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import numpy as np
import itertools

from collections import Mapping

from ..core.utils import FrozenOrderedDict
from ..core.pycompat import iteritems
from ..core.variable import Coordinate


NONE_VAR_NAME = '__values__'
Expand All @@ -19,22 +23,96 @@ def _decode_variable_name(name):
return name


class AbstractDataStore(object):
def open_store_variable(self, v):
def is_trivial_index(var):
    """
    Determine whether an index is 'trivial', meaning that it is
    equivalent to np.arange().

    An index is trivial only when it carries no attributes or
    encodings, is a one-dimensional signed-integer array, and its
    values equal ``np.arange(var.size)``.

    Parameters
    ----------
    var : variable-like
        Object exposing ``attrs``, ``encoding``, ``ndim``, ``dtype``,
        ``size`` and ``values``.

    Returns
    -------
    bool
        True if the index holds nothing beyond 0..size-1 in order.
    """
    # if either attributes or encodings are defined
    # the index is not trivial.
    if len(var.attrs) or len(var.encoding):
        return False
    # if the index is not a 1d integer array
    if var.ndim != 1 or var.dtype.kind != 'i':
        return False
    # Compare the values for every variable type, not only Coordinate
    # instances; otherwise a plain variable holding arbitrary integers
    # would be reported as trivial even though its values differ from
    # np.arange().
    arange = np.arange(var.size, dtype=var.dtype)
    return bool(np.all(var.values == arange))


class AbstractDataStore(Mapping):

def __iter__(self):
return iter(self.variables)

def __getitem__(self, key):
return self.variables[key]

def __len__(self):
return len(self.variables)

def get_attrs(self):
raise NotImplementedError

@property
def store_variables(self):
return self.ds.variables
def get_variables(self):
raise NotImplementedError

def load(self):
    """
    Load the variables and attributes of the store in a single call.

    Keeping loading in one place makes it easier to create data
    stores that do automatic encoding/decoding, since a subclass
    only has to wrap this method.

    For example:

        class SuffixAppendingDataStore(AbstractDataStore):

            def load(self):
                variables, attributes = AbstractDataStore.load(self)
                variables = {'%s_suffix' % k: v
                             for k, v in iteritems(variables)}
                attributes = {'%s_suffix' % k: v
                              for k, v in iteritems(attributes)}
                return variables, attributes

    This function will be called anytime variables or attributes
    are requested, so care should be taken to make sure it's fast.

    Returns
    -------
    (variables, attributes) : tuple of FrozenOrderedDict
        ``variables`` is keyed by the decoded variable names.
    """
    raw_variables = self.get_variables()
    # stored names may be encoded; expose the decoded names
    decoded = FrozenOrderedDict(
        (_decode_variable_name(name), var)
        for name, var in iteritems(raw_variables))
    attrs = FrozenOrderedDict(self.get_attrs())
    return decoded, attrs

def get_dimensions(self):
    """
    Return the dimension names of every variable in the store as one
    flat list, in variable order. A name shared by several variables
    appears once per variable that uses it.
    """
    per_variable_dims = [var.dims for var in self.variables.values()]
    return list(itertools.chain.from_iterable(per_variable_dims))

@property
def variables(self):
return FrozenOrderedDict((_decode_variable_name(k),
self.open_store_variable(v))
for k, v in iteritems(self.store_variables))
# Because encoding/decoding might happen which may require both the
# attributes and the variables, and because a store may be updated
# we need to load both the attributes and variables
# anytime either one is requested.
variables, _ = self.load()
return variables

def sync(self):
pass
@property
def attrs(self):
# Because encoding/decoding might happen which may require both the
# attributes and the variables, and because a store may be updated
# we need to load both the attributes and variables
# anytime either one is requested.
_, attributes = self.load()
return attributes

@property
def dimensions(self):
return self.get_dimensions()

def close(self):
pass
Expand All @@ -47,6 +125,35 @@ def __exit__(self, exception_type, exception_value, tracebook):


class AbstractWritableDataStore(AbstractDataStore):

def set_dimension(self, d, l):
raise NotImplementedError

def set_attribute(self, k, v):
raise NotImplementedError

def set_variable(self, k, v):
raise NotImplementedError

def sync(self):
pass

def store_dataset(self, dataset):
# in stores variables are all variables AND coordinates
# in xray.Dataset variables are variables NOT coordinates,
# so here we pass the whole dataset in instead of doing
# dataset.variables
self.store(dataset, dataset.attrs)

def store(self, variables, attributes):
    """
    Write ``attributes`` and ``variables`` to this store.

    Attributes are set first. A variable is skipped only when its
    name is one of the dimension names in use and
    ``is_trivial_index`` reports it as trivial; every other
    variable is passed on to ``set_variables``.
    """
    self.set_attributes(attributes)
    # every dimension name used by at least one variable
    dims_in_use = set()
    for var in variables.values():
        dims_in_use.update(var.dims)
    # set all non-indexes and any index which is not trivial.
    to_write = {}
    for name, var in iteritems(variables):
        if name in dims_in_use and is_trivial_index(var):
            continue
        to_write[name] = var
    self.set_variables(to_write)

def set_dimensions(self, dimensions):
for d, l in iteritems(dimensions):
self.set_dimension(d, l)
Expand All @@ -58,8 +165,9 @@ def set_attributes(self, attributes):
def set_variables(self, variables):
for vn, v in iteritems(variables):
self.set_variable(_encode_variable_name(vn), v)
self.set_necessary_dimensions(v)

def set_necessary_dimensions(self, variable):
for d, l in zip(variable.dims, variable.shape):
if d not in self.ds.dimensions:
if d not in self.dimensions:
self.set_dimension(d, l)
37 changes: 23 additions & 14 deletions xray/backends/memory.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from ..core.pycompat import OrderedDict
import copy

from .common import AbstractWritableDataStore

Expand All @@ -7,22 +8,30 @@ class InMemoryDataStore(AbstractWritableDataStore):
"""
Stores dimensions, variables and attributes
in ordered dictionaries, making this store
fast compared to stores which store to disk.
fast compared to stores which save to disk.
"""
def __init__(self):
self.dimensions = OrderedDict()
self.variables = OrderedDict()
self.attributes = OrderedDict()
def __init__(self, variables=None, attributes=None):
self._variables = OrderedDict() if variables is None else variables
self._attributes = OrderedDict() if attributes is None else attributes

def set_dimension(self, name, length):
self.dimensions[name] = length
def get_attrs(self):
return self._attributes

def set_attribute(self, key, value):
self.attributes[key] = value
def get_variables(self):
return self._variables

def set_variable(self, name, variable):
self.variables[name] = variable
return self.variables[name]
def set_variable(self, k, v):
    """
    Store a deep copy of ``v`` under the name ``k``.

    The copy's encoding entries are merged into its attributes and
    its encoding is then emptied; ``v`` itself is left unmodified.
    """
    # we copy the variable and stuff all encodings in the
    # attributes to imitate what happens when writing to disk.
    stored = copy.deepcopy(v)
    encoding = stored.encoding
    stored.attrs.update(encoding)
    encoding.clear()
    self._variables[k] = stored

def del_attribute(self, key):
del self.attributes[key]
def set_attribute(self, k, v):
# copy to imitate writing to disk.
self._attributes[k] = copy.deepcopy(v)

def set_dimension(self, d, l):
# in this model, dimensions are accounted for in the variables
pass
34 changes: 23 additions & 11 deletions xray/backends/netCDF4_.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import numpy as np

from .. import Variable
from ..conventions import encode_cf_variable
from ..conventions import pop_to, cf_encoder
from ..core import indexing
from ..core.utils import FrozenOrderedDict, NDArrayMixin
from ..core.pycompat import iteritems, basestring, OrderedDict
Expand Down Expand Up @@ -86,14 +86,25 @@ class NetCDF4DataStore(AbstractWritableDataStore):
This store supports NetCDF3, NetCDF4 and OpenDAP datasets.
"""
def __init__(self, filename, mode='r', clobber=True, diskless=False,
persist=False, format='NETCDF4', group=None):
persist=False, format='NETCDF4', group=None,
*args, **kwdargs):
import netCDF4 as nc4
ds = nc4.Dataset(filename, mode=mode, clobber=clobber,
diskless=diskless, persist=persist,
format=format)
self.ds = _nc4_group(ds, group)
self.format = format
self._filename = filename
self._encoder_args = args
self._encoder_kwdargs = kwdargs

def store(self, variables, attributes):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we move this method to a mixin class (something like AlwaysWriteCFEncoded) and use multiple inheritance to add it to NetCDF4DataStore and ScipyDataStore?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doing that would get a little awkward since the encoding is now embedded within a store, so arguments to the encoder are passed into the DataStore constructor. As a result the mixin class would need to implement a constructor which stored arguments which are intended for the encoder, and the DataStore that extends the mixin would need to distinguish between arguments intended for the store/mixin. Doing this in any sort of automated way leads to the nasty bit of logic (encoding_decorator) that I just removed.

One alteration which might make that all less nasty is to have any encoding/decoding arguments be directly passed into DataStore store/load. Though, in the interest of not further bloating this PR I'd like to save that future change.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I do like the idea of passing encoding/decoding arguments directly into store/load.

On second thought, I do agree with you -- this is not enough redundant code to worry about factoring out.

# All NetCDF files get CF encoded by default; without this, attempting
# to write times, for example, would fail.
cf_variables, cf_attrs = cf_encoder(variables, attributes,
*self._encoder_args,
**self._encoder_kwdargs)
AbstractWritableDataStore.store(self, cf_variables, cf_attrs)

def open_store_variable(self, var):
var.set_auto_maskandscale(False)
Expand All @@ -118,19 +129,20 @@ def open_store_variable(self, var):
# TODO: figure out how to round-trip "endian-ness" without raising
# warnings from netCDF4
# encoding['endian'] = var.endian()
encoding['least_significant_digit'] = \
attributes.pop('least_significant_digit', None)
pop_to(attributes, encoding, 'least_significant_digit')
# save source so __repr__ can detect if it's local or not
encoding['source'] = self._filename
return Variable(dimensions, data, attributes, encoding)

@property
def attrs(self):
def get_variables(self):
return FrozenOrderedDict((k, self.open_store_variable(v))
for k, v in iteritems(self.ds.variables))

def get_attrs(self):
return FrozenOrderedDict((k, self.ds.getncattr(k))
for k in self.ds.ncattrs())

@property
def dimensions(self):
def get_dimensions(self):
return FrozenOrderedDict((k, len(v))
for k, v in iteritems(self.ds.dimensions))

Expand All @@ -141,7 +153,7 @@ def set_attribute(self, key, value):
self.ds.setncattr(key, value)

def set_variable(self, name, variable):
variable = encode_cf_variable(variable)
attrs = variable.attrs.copy()
if self.format == 'NETCDF4':
variable, datatype = _nc4_values_and_dtype(variable)
else:
Expand All @@ -150,7 +162,7 @@ def set_variable(self, name, variable):

self.set_necessary_dimensions(variable)

fill_value = variable.attrs.pop('_FillValue', None)
fill_value = attrs.pop('_FillValue', None)
if fill_value in ['', '\x00']:
# these are equivalent to the default FillValue, but netCDF4
# doesn't like setting fill_value to an empty string
Expand All @@ -172,7 +184,7 @@ def set_variable(self, name, variable):
fill_value=fill_value)
nc4_var.set_auto_maskandscale(False)
nc4_var[:] = variable.values
for k, v in iteritems(variable.attrs):
for k, v in iteritems(attrs):
# set attributes one-by-one since netCDF4<1.0.10 can't handle
# OrderedDict as the input to setncatts
nc4_var.setncattr(k, v)
Expand Down
12 changes: 7 additions & 5 deletions xray/backends/pydap_.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,12 @@ def open_store_variable(self, var):
data = indexing.LazilyIndexedArray(PydapArrayWrapper(var))
return Variable(var.dimensions, data, var.attributes)

@property
def store_variables(self):
return self.ds
def get_variables(self):
return FrozenOrderedDict((k, self.open_store_variable(v))
for k, v in self.ds.iteritems())

@property
def attrs(self):
def get_attrs(self):
return Frozen(self.ds.attributes)

def get_dimensions(self):
return Frozen(self.ds.dimensions)
33 changes: 24 additions & 9 deletions xray/backends/scipy_.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,14 @@
import numpy as np
import warnings

from .. import conventions, Variable
from .. import Variable
from ..conventions import cf_encoder
from ..core.pycompat import iteritems, basestring, unicode_type, OrderedDict
from ..core.utils import Frozen
from ..core.utils import Frozen, FrozenOrderedDict

from .common import AbstractWritableDataStore
from .netcdf3 import is_valid_nc3_name, coerce_nc3_dtype, encode_nc3_variable
from xray.conventions import cf_decoder


def _decode_string(s):
Expand All @@ -32,7 +34,8 @@ class ScipyDataStore(AbstractWritableDataStore):

It only supports the NetCDF3 file-format.
"""
def __init__(self, filename_or_obj, mode='r', mmap=None, version=1):
def __init__(self, filename_or_obj, mode='r', mmap=None,
version=1, *args, **kwdargs):
import scipy
if mode != 'r' and scipy.__version__ < '0.13':
warnings.warn('scipy %s detected; '
Expand All @@ -50,17 +53,29 @@ def __init__(self, filename_or_obj, mode='r', mmap=None, version=1):
filename_or_obj = BytesIO(filename_or_obj)
self.ds = scipy.io.netcdf.netcdf_file(
filename_or_obj, mode=mode, mmap=mmap, version=version)
self._encoder_args = args
self._encoder_kwdargs = kwdargs

def store(self, variables, attributes):
# All Scipy objects get CF encoded by default; without this, attempting
# to write times, for example, would fail.
cf_variables, cf_attrs = cf_encoder(variables, attributes,
*self._encoder_args,
**self._encoder_kwdargs)
AbstractWritableDataStore.store(self, cf_variables, cf_attrs)

def open_store_variable(self, var):
return Variable(var.dimensions, var.data,
_decode_attrs(var._attributes))

@property
def attrs(self):
def get_variables(self):
return FrozenOrderedDict((k, self.open_store_variable(v))
for k, v in iteritems(self.ds.variables))

def get_attrs(self):
return Frozen(_decode_attrs(self.ds._attributes))

@property
def dimensions(self):
def get_dimensions(self):
return Frozen(self.ds.dimensions)

def set_dimension(self, name, length):
Expand Down Expand Up @@ -88,8 +103,8 @@ def set_attribute(self, key, value):
setattr(self.ds, key, self._cast_attr_value(value))

def set_variable(self, name, variable):
variable = encode_nc3_variable(
conventions.encode_cf_variable(variable))
# TODO, create a netCDF3 encoder
variable = encode_nc3_variable(variable)
self.set_necessary_dimensions(variable)
data = variable.values
self.ds.createVariable(name, data.dtype, variable.dims)
Expand Down
Loading