Skip to content

Encoding improvements #251

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Oct 14, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 2 additions & 7 deletions xray/backends/netCDF4_.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,24 +86,19 @@ class NetCDF4DataStore(AbstractWritableDataStore):
This store supports NetCDF3, NetCDF4 and OpenDAP datasets.
"""
def __init__(self, filename, mode='r', clobber=True, diskless=False,
persist=False, format='NETCDF4', group=None,
*args, **kwdargs):
persist=False, format='NETCDF4', group=None):
import netCDF4 as nc4
ds = nc4.Dataset(filename, mode=mode, clobber=clobber,
diskless=diskless, persist=persist,
format=format)
self.ds = _nc4_group(ds, group)
self.format = format
self._filename = filename
self._encoder_args = args
self._encoder_kwdargs = kwdargs

def store(self, variables, attributes):
# All NetCDF files get CF encoded by default, without this attempting
# to write times, for example, would fail.
cf_variables, cf_attrs = cf_encoder(variables, attributes,
*self._encoder_args,
**self._encoder_kwdargs)
cf_variables, cf_attrs = cf_encoder(variables, attributes)
AbstractWritableDataStore.store(self, cf_variables, cf_attrs)

def open_store_variable(self, var):
Expand Down
9 changes: 2 additions & 7 deletions xray/backends/scipy_.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,7 @@ class ScipyDataStore(AbstractWritableDataStore):

It only supports the NetCDF3 file-format.
"""
def __init__(self, filename_or_obj, mode='r', mmap=None,
version=1, *args, **kwdargs):
def __init__(self, filename_or_obj, mode='r', mmap=None, version=1):
import scipy
if mode != 'r' and scipy.__version__ < '0.13':
warnings.warn('scipy %s detected; '
Expand All @@ -53,15 +52,11 @@ def __init__(self, filename_or_obj, mode='r', mmap=None,
filename_or_obj = BytesIO(filename_or_obj)
self.ds = scipy.io.netcdf.netcdf_file(
filename_or_obj, mode=mode, mmap=mmap, version=version)
self._encoder_args = args
self._encoder_kwdargs = kwdargs

def store(self, variables, attributes):
# All Scipy objects get CF encoded by default, without this attempting
# to write times, for example, would fail.
cf_variables, cf_attrs = cf_encoder(variables, attributes,
*self._encoder_args,
**self._encoder_kwdargs)
cf_variables, cf_attrs = cf_encoder(variables, attributes)
AbstractWritableDataStore.store(self, cf_variables, cf_attrs)

def open_store_variable(self, var):
Expand Down
201 changes: 112 additions & 89 deletions xray/conventions.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import functools
import numpy as np
import pandas as pd
import warnings
Expand Down Expand Up @@ -372,6 +373,73 @@ def pop_to(source, dest, key, default=None):
return value


def _var_as_tuple(var):
return var.dims, var.values, var.attrs.copy(), var.encoding.copy()


def maybe_encode_datetime(var):
    """Encode datetime-valued variables as numeric CF times.

    If *var* holds ``datetime64`` data, or an object array whose first
    element is a ``datetime`` instance, run its values through
    ``encode_cf_datetime`` and record the resulting 'units' and 'calendar'
    in the attributes (taking any explicit choices from the encoding).
    Any other variable is returned unchanged.

    Raises ValueError if 'units' or 'calendar' is already present in the
    attributes, to avoid silently overwriting them.

    NOTE(review): ``var.values.flat[0]`` would raise on a size-0 object
    array — presumably such variables never reach this point; confirm.
    """
    is_np_datetime = np.issubdtype(var.dtype, np.datetime64)
    is_object_datetime = (var.dtype.kind == 'O'
                          and isinstance(var.values.flat[0], datetime))
    if not (is_np_datetime or is_object_datetime):
        return var

    dims, data, attrs, encoding = _var_as_tuple(var)
    if 'units' in attrs or 'calendar' in attrs:
        raise ValueError(
            "Failed hard to prevent overwriting 'units' or 'calendar'")

    data, units, calendar = encode_cf_datetime(
        data, encoding.pop('units', None), encoding.pop('calendar', None))
    attrs['units'] = units
    attrs['calendar'] = calendar
    return Variable(dims, data, attrs, encoding)


def maybe_encode_offset_and_scale(var, needs_copy=True):
    """Pack a variable by undoing CF 'add_offset'/'scale_factor' decoding.

    If either key is present in the encoding, the values are converted to
    float (copied when *needs_copy* is true), the offset is subtracted and
    the scale factor divided out, and the keys are moved from the encoding
    into the attributes via ``pop_to``.

    Returns the (possibly new) variable and an updated *needs_copy* flag;
    the flag is False once a float copy has been made.
    """
    has_packing = any(key in var.encoding
                      for key in ('add_offset', 'scale_factor'))
    if not has_packing:
        return var, needs_copy

    dims, data, attrs, encoding = _var_as_tuple(var)
    # work on a float array; copy only if the caller still owns the data
    data = np.array(data, dtype=float, copy=needs_copy)
    needs_copy = False
    if 'add_offset' in encoding:
        data = data - pop_to(encoding, attrs, 'add_offset')
    if 'scale_factor' in encoding:
        data = data / pop_to(encoding, attrs, 'scale_factor')
    return Variable(dims, data, attrs, encoding), needs_copy


def maybe_encode_fill_value(var, needs_copy=True):
    """Replace NaNs with the encoded '_FillValue', if one is set.

    The '_FillValue' key is moved from the encoding to the attributes via
    ``pop_to``.  Values are only copied (and *needs_copy* cleared) when
    there are actually missing entries to fill and the fill value itself
    is not null.

    Returns the (possibly new) variable and an updated *needs_copy* flag.
    """
    if '_FillValue' not in var.encoding:
        return var, needs_copy

    dims, data, attrs, encoding = _var_as_tuple(var)
    fill = pop_to(encoding, attrs, '_FillValue')
    if not pd.isnull(fill):
        nan_mask = pd.isnull(data)
        if nan_mask.any():
            if needs_copy:
                data = data.copy()
                needs_copy = False
            data[nan_mask] = fill
    return Variable(dims, data, attrs, encoding), needs_copy


def maybe_encode_dtype(var, needs_copy=True):
    """Cast a variable to the dtype stored in its encoding, if any.

    For integer target dtypes the values are rounded first; for 'S1'
    targets, string values are exploded into a trailing character
    dimension via ``string_to_char``.  Object ('O') target dtypes are
    ignored.  Returns the (possibly new) variable.
    """
    if 'dtype' in var.encoding:
        dims, values, attrs, encoding = _var_as_tuple(var)
        dtype = np.dtype(encoding.pop('dtype'))
        if dtype.kind != 'O':
            # np.integer matches every integer dtype; a bare `int` only
            # matches the platform default integer in modern NumPy.
            if np.issubdtype(dtype, np.integer):
                out = np.empty_like(values) if needs_copy else values
                np.around(values, out=out)
                # BUG FIX: previously the rounded copy in `out` was
                # discarded when needs_copy was true, so the *unrounded*
                # values were cast to the integer dtype below.
                values = out
            if dtype == 'S1' and values.dtype != 'S1':
                values = string_to_char(np.asarray(values, 'S'))
                dims = dims + ('string%s' % values.shape[-1],)
            values = np.asarray(values, dtype=dtype)
        var = Variable(dims, values, attrs, encoding)
    return var


def _infer_dtype(array):
"""Given an object array with no missing values, infer its dtype from its
first element
Expand All @@ -390,7 +458,36 @@ def _infer_dtype(array):
return dtype


def encode_cf_variable(var):
def ensure_dtype_not_object(var):
    """Convert an object-dtype variable to a concrete inferred dtype.

    Non-object variables pass through unchanged.  For object arrays the
    dtype is inferred from the (non-missing) elements; missing values are
    filled with ``''`` for string dtypes or ``np.nan`` for numeric ones
    (numeric dtypes are widened to float so NaN is representable).
    """
    # TODO: move this from conventions to backends? (it's not CF related)
    if var.dtype.kind != 'O':
        return var

    dims, data, attrs, encoding = _var_as_tuple(var)
    missing = pd.isnull(data)
    if missing.any():
        inferred_dtype = _infer_dtype(data[~missing])
        if inferred_dtype.kind in ('S', 'U'):
            # There is no safe bit-pattern for NA in typical binary
            # string formats, so we can't set a fill_value.
            # Unfortunately, this means we won't be able to restore
            # string arrays with missing values.
            fill_value = ''
        else:
            # insist on using float for numeric values
            if not np.issubdtype(inferred_dtype, float):
                inferred_dtype = np.dtype(float)
            fill_value = np.nan
        data = np.array(data, dtype=inferred_dtype, copy=True)
        data[missing] = fill_value
    else:
        data = np.asarray(data, dtype=_infer_dtype(data))
    return Variable(dims, data, attrs, encoding)


def encode_cf_variable(var, needs_copy=True):
"""
Converts an Variable into an Variable which follows some
of the CF conventions:
Expand All @@ -410,86 +507,12 @@ def encode_cf_variable(var):
out : xray.Variable
A variable which has been encoded as described above.
"""
dimensions = var.dims
data = var.values
attributes = var.attrs.copy()
encoding = var.encoding.copy()

# convert datetimes into numbers
if (np.issubdtype(data.dtype, np.datetime64)
or (data.dtype.kind == 'O'
and isinstance(data.reshape(-1)[0], datetime))):
if 'units' in attributes or 'calendar' in attributes:
raise ValueError(
"Failed hard to prevent overwriting 'units' or 'calendar'")
(data, units, calendar) = encode_cf_datetime(
data, encoding.pop('units', None), encoding.pop('calendar', None))
attributes['units'] = units
attributes['calendar'] = calendar

# unscale/mask
if any(k in encoding for k in ['add_offset', 'scale_factor']):
data = np.array(data, dtype=float, copy=True)
if 'add_offset' in encoding:
data -= pop_to(encoding, attributes, 'add_offset')
if 'scale_factor' in encoding:
data /= pop_to(encoding, attributes, 'scale_factor')

# replace NaN with the fill value
if '_FillValue' in encoding:
fill_value = pop_to(encoding, attributes, '_FillValue')
if not pd.isnull(fill_value):
missing = pd.isnull(data)
if missing.any():
data = data.copy()
data[missing] = fill_value

# replace NaN with the missing_value
if 'missing_value' in encoding:
missing_value = pop_to(encoding, attributes, 'missing_value')
if not pd.isnull(missing_value):
missing = pd.isnull(data)
if missing.any():
data = data.copy()
data[missing] = missing_value

# cast to encoded dtype
if 'dtype' in encoding:
dtype = np.dtype(encoding.pop('dtype'))
if dtype.kind != 'O':
if np.issubdtype(dtype, int):
data = data.round()
if dtype == 'S1' and data.dtype != 'S1':
data = string_to_char(np.asarray(data, 'S'))
dimensions = dimensions + ('string%s' % data.shape[-1],)
data = np.asarray(data, dtype=dtype)

# infer a valid dtype if necessary
# TODO: move this from conventions to backends (it's not CF related)
if data.dtype.kind == 'O':
missing = pd.isnull(data)
if missing.any():
non_missing_data = data[~missing]
inferred_dtype = _infer_dtype(non_missing_data)

if inferred_dtype.kind in ['S', 'U']:
# There is no safe bit-pattern for NA in typical binary string
# formats, so we can't set a fill_value. Unfortunately, this
# means we won't be able to restore string arrays with missing
# values.
fill_value = ''
else:
# insist on using float for numeric data
if not np.issubdtype(inferred_dtype, float):
inferred_dtype = np.dtype(float)
fill_value = np.nan

data = np.array(data, dtype=inferred_dtype, copy=True)
data[missing] = fill_value
else:
data = np.asarray(data, dtype=_infer_dtype(data))

return Variable(dimensions, data, attributes, encoding=encoding)
var = maybe_encode_datetime(var)
var, needs_copy = maybe_encode_offset_and_scale(var, needs_copy)
var, needs_copy = maybe_encode_fill_value(var, needs_copy)
var = maybe_encode_dtype(var, needs_copy)
var = ensure_dtype_not_object(var)
return var


def decode_cf_variable(var, concat_characters=True, mask_and_scale=True,
Expand Down Expand Up @@ -539,15 +562,15 @@ def decode_cf_variable(var, concat_characters=True, mask_and_scale=True,
data = CharToStringArray(data)

if mask_and_scale:
# missing_value is deprecated, but we still want to support it.
missing_value = pop_to(attributes, encoding, 'missing_value')
if 'missing_value' in attributes:
# missing_value is deprecated, but we still want to support it as
# an alias for _FillValue.
assert ('_FillValue' not in attributes
or utils.equivalent(attributes['_FillValue'],
attributes['missing_value']))
attributes['_FillValue'] = attributes.pop('missing_value')

fill_value = pop_to(attributes, encoding, '_FillValue')
# if missing_value is given but not fill_value we use missing_value
if fill_value is None and missing_value is not None:
fill_value = missing_value
# if both were given we make sure they are the same.
if fill_value is not None and missing_value is not None:
assert fill_value == missing_value
scale_factor = pop_to(attributes, encoding, 'scale_factor')
add_offset = pop_to(attributes, encoding, 'add_offset')
if ((fill_value is not None and not pd.isnull(fill_value))
Expand Down
24 changes: 12 additions & 12 deletions xray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,11 +84,14 @@ def open_dataset(nc, decode_cf=True, mask_and_scale=True, decode_times=True,
# If nc is a file-like object we read it using
# the scipy.io.netcdf package
store = backends.ScipyDataStore(nc, *args, **kwargs)
decoder = conventions.cf_decoder if decode_cf else None
return Dataset.load_store(store, decoder=decoder,
mask_and_scale=mask_and_scale,
decode_times=decode_times,
concat_characters=concat_characters)
if decode_cf:
decoder = functools.partial(conventions.cf_decoder,
mask_and_scale=mask_and_scale,
decode_times=decode_times,
concat_characters=concat_characters)
else:
decoder = None
return Dataset.load_store(store, decoder=decoder)


# list of attributes of pd.DatetimeIndex that are ndarrays of time info
Expand Down Expand Up @@ -399,14 +402,13 @@ def _set_init_vars_and_dims(self, vars, coords):
check_coord_names=False)

@classmethod
def load_store(cls, store, decoder=None, *args, **kwdargs):
def load_store(cls, store, decoder=None):
"""Create a new dataset from the contents of a backends.*DataStore
object
"""
variables, attributes = store.load()
if decoder:
variables, attributes = decoder(variables, attributes,
*args, **kwdargs)
variables, attributes = decoder(variables, attributes)
obj = cls(variables, attrs=attributes)
obj._file_obj = store
return obj
Expand Down Expand Up @@ -785,13 +787,11 @@ def reset_coords(self, names=None, drop=False, inplace=False):
del obj._arrays[name]
return obj

def dump_to_store(self, store, encoder=None,
*args, **kwdargs):
def dump_to_store(self, store, encoder=None):
"""Store dataset contents to a backends.*DataStore object."""
variables, attributes = self, self.attrs
if encoder:
variables, attributes = encoder(variables, attributes,
*args, **kwdargs)
variables, attributes = encoder(variables, attributes)
store.store(variables, attributes)
store.sync()

Expand Down
7 changes: 4 additions & 3 deletions xray/test/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
import numpy as np
import pandas as pd

from xray import align, concat, backends, Dataset, DataArray, Variable
from xray import (align, concat, conventions, backends, Dataset, DataArray,
Variable)
from xray.core import indexing, utils
from xray.core.pycompat import iteritems, OrderedDict

Expand Down Expand Up @@ -1020,8 +1021,8 @@ def test_lazy_load(self):
store = InaccessibleVariableDataStore()
create_test_data().dump_to_store(store)

for decode_cf in [False, True]:
ds = Dataset.load_store(store, decode_cf=decode_cf)
for decoder in [None, conventions.cf_decoder]:
ds = Dataset.load_store(store, decoder=decoder)
with self.assertRaises(UnexpectedDataAccess):
ds.load_data()
with self.assertRaises(UnexpectedDataAccess):
Expand Down