From 34d4852ff8e015dc15e5fe0f5db939d6c512ed1d Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Wed, 14 Jan 2015 13:22:15 -0800 Subject: [PATCH 1/3] Automatic alignment on indexes in binary arithmetic --- xray/core/alignment.py | 10 ++++++---- xray/core/dataarray.py | 8 ++++++++ xray/core/dataset.py | 7 +++++++ xray/test/test_dataarray.py | 26 +++++++++++++++++--------- xray/test/test_dataset.py | 15 +++++++++++++++ 5 files changed, 53 insertions(+), 13 deletions(-) diff --git a/xray/core/alignment.py b/xray/core/alignment.py index a1025b6d64d..d3bfb58fa5c 100644 --- a/xray/core/alignment.py +++ b/xray/core/alignment.py @@ -27,8 +27,7 @@ def align(*objects, **kwargs): objects with aligned indexes. Array from the aligned objects are suitable as input to mathematical - operators, because along each dimension they are indexed by the same - indexes. + operators, because along each dimension they have the same indexes. Missing values (if ``join != 'inner'``) are filled with NaN. @@ -44,8 +43,8 @@ def align(*objects, **kwargs): - 'left': use indexes from the first object with each dimension - 'right': use indexes from the last object with each dimension copy : bool, optional - If `copy=True`, the returned objects contain all new variables. If - `copy=False` and no reindexing is required then the aligned objects + If ``copy=True``, the returned objects contain all new variables. If + ``copy=False`` and no reindexing is required then the aligned objects will include original variables. Returns @@ -55,6 +54,9 @@ def align(*objects, **kwargs): """ join = kwargs.pop('join', 'inner') copy = kwargs.pop('copy', True) + if kwargs: + raise TypeError('align() got unexpected keyword arguments: %s' + % list(kwargs)) if join == 'outer': join_indices = functools.partial(functools.reduce, operator.or_) diff --git a/xray/core/dataarray.py b/xray/core/dataarray.py index 85ce7e2cd77..44259d038d4 100644 --- a/xray/core/dataarray.py +++ b/xray/core/dataarray.py @@ -9,6 +9,7 @@ from . 
import ops from . import utils from . import variable +from .alignment import align from .common import AbstractArray, AttrAccessMixin from .coordinates import DataArrayCoordinates, Indexes from .dataset import Dataset @@ -950,6 +951,13 @@ def _binary_op(f, reflexive=False): def func(self, other): if isinstance(other, (Dataset, groupby.GroupBy)): return NotImplemented + if hasattr(other, 'indexes'): + self, other = align(self, other, join='inner', copy=False) + empty_indexes = [d for d, s in zip(self.dims, self.shape) + if s == 0] + if empty_indexes: + raise ValueError('no overlapping labels for some ' + 'dimensions: %s' % empty_indexes) other_coords = getattr(other, 'coords', None) other_variable = getattr(other, 'variable', other) ds = self.coords.merge(other_coords) diff --git a/xray/core/dataset.py b/xray/core/dataset.py index 9fb18c79f0a..d3361df00fa 100644 --- a/xray/core/dataset.py +++ b/xray/core/dataset.py @@ -17,6 +17,7 @@ from . import alignment from . import formatting from .. 
import backends, conventions +from .alignment import align from .coordinates import DatasetCoordinates, Indexes from .common import ImplementsDatasetReduce, AttrAccessMixin from .utils import Frozen, SortedKeysDict, ChainMap @@ -1651,6 +1652,12 @@ def _binary_op(f, reflexive=False): def func(self, other): if isinstance(other, groupby.GroupBy): return NotImplemented + if hasattr(other, 'indexes'): + self, other = align(self, other, join='inner', copy=False) + empty_indexes = [d for d, s in self.dims.items() if s == 0] + if empty_indexes: + raise ValueError('no overlapping labels for some ' + 'dimensions: %s' % empty_indexes) other_coords = getattr(other, 'coords', None) ds = self.coords.merge(other_coords) g = f if not reflexive else lambda x, y: f(y, x) diff --git a/xray/test/test_dataarray.py b/xray/test/test_dataarray.py index 77571f22c38..1956805aaf3 100644 --- a/xray/test/test_dataarray.py +++ b/xray/test/test_dataarray.py @@ -530,13 +530,15 @@ def test_math(self): self.assertDataArrayEqual(a, 0 * x + a) self.assertDataArrayEqual(a, a + 0 * a) self.assertDataArrayEqual(a, 0 * a + a) - # test different indices - b = a.copy() - b.coords['x'] = 3 + np.arange(10) - with self.assertRaisesRegexp(ValueError, 'not aligned'): - a + b - with self.assertRaisesRegexp(ValueError, 'not aligned'): - b + a + + def test_math_automatic_alignment(self): + a = DataArray(range(5), [('x', range(5))]) + b = DataArray(range(5), [('x', range(1, 6))]) + expected = DataArray(np.ones(4), [('x', [1, 2, 3, 4])]) + self.assertDataArrayIdentical(a - b, expected) + + with self.assertRaisesRegexp(ValueError, 'no overlapping labels'): + a.isel(x=slice(2)) + a.isel(x=slice(2, None)) def test_inplace_math_basics(self): x = self.x @@ -550,6 +552,14 @@ def test_inplace_math_basics(self): self.assertIs(source_ndarray(b.values), x) self.assertDatasetIdentical(b._dataset, self.ds) + def test_inplace_math_automatic_alignment(self): + a = DataArray(range(5), [('x', range(5))]) + b = DataArray(range(1, 
6), [('x', range(1, 6))]) + with self.assertRaisesRegexp(ValueError, 'not aligned'): + a += b + with self.assertRaisesRegexp(ValueError, 'not aligned'): + b += a + def test_math_name(self): # Verify that name is preserved only when it can be done unambiguously. # The rule (copied from pandas.Series) is keep the current name only if @@ -902,8 +912,6 @@ def test_concat(self): def test_align(self): self.ds['x'] = ('x', np.array(list('abcdefghij'))) - with self.assertRaises(ValueError): - self.dv + self.dv[:5] dv1, dv2 = align(self.dv, self.dv[:5], join='inner') self.assertDataArrayIdentical(dv1, self.dv[:5]) self.assertDataArrayIdentical(dv2, self.dv[:5]) diff --git a/xray/test/test_dataset.py b/xray/test/test_dataset.py index fb4ec3fad51..b77495318c7 100644 --- a/xray/test/test_dataset.py +++ b/xray/test/test_dataset.py @@ -1481,6 +1481,21 @@ def test_dataset_dataset_math(self): self.assertDatasetIdentical(ds == ds, ds.notnull()) + subsampled = ds.isel(y=slice(2)) + expected = 2 * subsampled + self.assertDatasetIdentical(expected, subsampled + ds) + self.assertDatasetIdentical(expected, ds + subsampled) + + def test_dataset_math_automatic_alignment(self): + ds = self.make_example_math_dataset() + subset = ds.isel(x=slice(2), y=[1, 3]) + expected = 2 * subset + actual = ds + subset + self.assertDatasetIdentical(expected, actual) + + with self.assertRaisesRegexp(ValueError, 'no overlapping labels'): + ds.isel(x=slice(1)) + ds.isel(x=slice(1, None)) + def test_dataset_math_errors(self): ds = self.make_example_math_dataset() From a0c0c89a3147c8c94d2e93f22832c7c90a9cf177 Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Fri, 13 Feb 2015 00:25:10 -0800 Subject: [PATCH 2/3] Auto-align Dataset __init__, __setitem__, merge, update --- doc/whats-new.rst | 8 ++ xray/core/alignment.py | 64 +++++++++++----- xray/core/dataset.py | 157 ++++++++++++++++++++++---------------- xray/test/test_dataset.py | 111 +++++++++++++++++++++++++-- 4 files changed, 250 insertions(+), 90 
deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index d20bd79fc03..798bbef040e 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -9,6 +9,14 @@ What's New import xray np.random.seed(123456) +v0.4 (unreleased) +----------------- + +.. These need tests: +.. ``Dataset.update`` no longer updates attributes as well as variables. +.. ``Dataset.__init__`` is not longer as strict by default +.. TODO: auto-align Dataset arithmetic + v0.3.2 (23 December, 2014) -------------------------- diff --git a/xray/core/alignment.py b/xray/core/alignment.py index d3bfb58fa5c..e25941c18e7 100644 --- a/xray/core/alignment.py +++ b/xray/core/alignment.py @@ -12,14 +12,39 @@ from .variable import as_variable, Variable, Coordinate, broadcast_variables -def _get_all_indexes(objects): +def _get_joiner(join): + if join == 'outer': + return functools.partial(functools.reduce, operator.or_) + elif join == 'inner': + return functools.partial(functools.reduce, operator.and_) + elif join == 'left': + return operator.itemgetter(0) + elif join == 'right': + return operator.itemgetter(-1) + else: + raise ValueError('invalid value for join: %s' % join) + + +def _get_all_indexes(objects, exclude=set()): all_indexes = defaultdict(list) for obj in objects: for k, v in iteritems(obj.indexes): - all_indexes[k].append(v) + if k not in exclude: + all_indexes[k].append(v) return all_indexes +def _join_indexes(join, objects, exclude=set()): + joiner = _get_joiner(join) + indexes = _get_all_indexes(objects, exclude=exclude) + # exclude dimensions with all equal indices (the usual case) to avoid + # unnecessary reindexing work. 
+ # TODO: don't bother to check equals for left or right joins + joined_indexes = dict((k, joiner(v)) for k, v in iteritems(indexes) + if any(not v[0].equals(idx) for idx in v[1:])) + return joined_indexes + + def align(*objects, **kwargs): """align(*objects, join='inner', copy=True) @@ -38,10 +63,10 @@ def align(*objects, **kwargs): join : {'outer', 'inner', 'left', 'right'}, optional Method for joining the indexes of the passed objects along each dimension: - - 'outer': use the union of object indexes - - 'inner': use the intersection of object indexes - - 'left': use indexes from the first object with each dimension - - 'right': use indexes from the last object with each dimension + - 'outer': use the union of object indexes + - 'inner': use the intersection of object indexes + - 'left': use indexes from the first object with each dimension + - 'right': use indexes from the last object with each dimension copy : bool, optional If ``copy=True``, the returned objects contain all new variables. If ``copy=False`` and no reindexing is required then the aligned objects @@ -58,22 +83,23 @@ def align(*objects, **kwargs): raise TypeError('align() got unexpected keyword arguments: %s' % list(kwargs)) - if join == 'outer': - join_indices = functools.partial(functools.reduce, operator.or_) - elif join == 'inner': - join_indices = functools.partial(functools.reduce, operator.and_) - elif join == 'left': - join_indices = operator.itemgetter(0) - elif join == 'right': - join_indices = operator.itemgetter(-1) + joined_indexes = _join_indexes(join, objects) + return tuple(obj.reindex(copy=copy, **joined_indexes) for obj in objects) - all_indexes = _get_all_indexes(objects) - # Exclude dimensions with all equal indices to avoid unnecessary reindexing - # work. 
- joined_indexes = dict((k, join_indices(v)) for k, v in iteritems(all_indexes) - if any(not v[0].equals(idx) for idx in v[1:])) +def partial_align(*objects, **kwargs): + """partial_align(*objects, join='inner', copy=True, exclude=set() + + Like align, but don't align along dimensions in exclude. Not public API. + """ + join = kwargs.pop('join', 'inner') + copy = kwargs.pop('copy', True) + exclude = kwargs.pop('exclude', set()) + if kwargs: + raise TypeError('align() got unexpected keyword arguments: %s' + % list(kwargs)) + joined_indexes = _join_indexes(join, objects, exclude=exclude) return tuple(obj.reindex(copy=copy, **joined_indexes) for obj in objects) diff --git a/xray/core/dataset.py b/xray/core/dataset.py index d3361df00fa..c68e7b07409 100644 --- a/xray/core/dataset.py +++ b/xray/core/dataset.py @@ -17,7 +17,7 @@ from . import alignment from . import formatting from .. import backends, conventions -from .alignment import align +from .alignment import align, partial_align from .coordinates import DatasetCoordinates, Indexes from .common import ImplementsDatasetReduce, AttrAccessMixin from .utils import Frozen, SortedKeysDict, ChainMap @@ -172,7 +172,18 @@ def _as_dataset_variable(name, var): return var -def _expand_arrays(raw_variables, old_variables={}, compat='identical'): +def _align_variables(arrays, join='outer'): + """Align all DataArrays in the provided dict, leaving other values alone. + """ + alignable = [k for k, v in arrays.items() if hasattr(v, 'indexes')] + aligned = align(*[arrays[a] for a in alignable], + join=join, copy=False) + new_arrays = OrderedDict(arrays) + new_arrays.update(zip(alignable, aligned)) + return new_arrays + + +def _expand_variables(raw_variables, old_variables={}, compat='identical'): """Expand a dictionary of variables. 
Returns a dictionary of Variable objects suitable for inserting into a @@ -190,15 +201,16 @@ def _expand_arrays(raw_variables, old_variables={}, compat='identical'): variables = ChainMap(new_variables, old_variables) def add_variable(name, var): + var = _as_dataset_variable(name, var) if name not in variables: - variables[name] = _as_dataset_variable(name, var) + variables[name] = var + new_coord_names.update(variables[name].dims) else: if not getattr(variables[name], compat)(var): raise ValueError('conflicting value for variable %s:\n' 'first value: %r\nsecond value: %r' % (name, variables[name], var)) if compat == 'broadcast_equals': - new_dims = _as_dataset_variable(name, var).dims common_dims = OrderedDict(zip(variables[name].dims, variables[name].shape)) common_dims.update(zip(var.dims, var.shape)) @@ -210,8 +222,10 @@ def add_variable(name, var): new_coord_names.update(var.coords) for dim, coord in iteritems(var.coords): if dim != name: - add_variable(dim, coord) + add_variable(dim, coord.variable) + var = var.variable add_variable(name, var) + return new_variables, new_coord_names @@ -239,19 +253,38 @@ def _calculate_dims(variables): return dims -class _DatasetLike(object): - """A Dataset-like object that only contains a few private attributes +def _merge_expand(aligned_self, other, overwrite_vars, compat): + possible_conflicts = dict((k, v) for k, v in aligned_self._arrays.items() + if k not in overwrite_vars) + new_vars, new_coord_names = _expand_variables(other, possible_conflicts, compat) + replace_vars = aligned_self._arrays.copy() + replace_vars.update(new_vars) + return replace_vars, new_vars, new_coord_names - Like `as_dataset`, handles DataArrays, Datasets and dictionaries of - variables. The difference is that this method never creates a new Dataset - object, and hence is much more lightweight, avoiding any consistency - checks on the variables (that should be handled later). 
- """ - def __init__(self, obj): - obj = getattr(obj, '_dataset', obj) - self._arrays = getattr(obj, '_arrays', obj) - self._coord_names = getattr(obj, '_coord_names', set()) - self.attrs = getattr(obj, 'attrs', {}) + +def _merge_dataset(self, other, overwrite_vars, compat, join): + aligned_self, other = partial_align(self, other, join=join, copy=False) + + replace_vars, new_vars, new_coord_names = _merge_expand( + aligned_self, other._arrays, overwrite_vars, compat) + new_coord_names.update(other._coord_names) + + return replace_vars, new_vars, new_coord_names + + +def _merge_dict(self, other, overwrite_vars, compat, join): + other = _align_variables(other, join='outer') + + alignable = [k for k, v in other.items() if hasattr(v, 'indexes')] + aligned = partial_align(self, *[other[a] for a in alignable], + join=join, copy=False, exclude=overwrite_vars) + + aligned_self = aligned[0] + + other = OrderedDict(other) + other.update(zip(alignable, aligned[1:])) + + return _merge_expand(aligned_self, other, overwrite_vars, compat) def _assert_empty(args, msg='%s'): @@ -324,7 +357,8 @@ class Dataset(Mapping, ImplementsDatasetReduce, AttrAccessMixin): _attrs = None _arrays = Frozen({}) - def __init__(self, variables=None, coords=None, attrs=None): + def __init__(self, variables=None, coords=None, attrs=None, + compat='broadcast_equals'): """To load data from a file or file-like object, use the `open_dataset` function. 
@@ -359,12 +393,12 @@ def __init__(self, variables=None, coords=None, attrs=None): if coords is None: coords = set() if variables or coords: - self._set_init_vars_and_dims(variables, coords) + self._set_init_vars_and_dims(variables, coords, compat) if attrs is not None: self.attrs = attrs - def _add_missing_coords(self): - """Add missing coordinates IN-PLACE to _arrays + def _add_missing_coords_inplace(self): + """Add missing coordinates to self._arrays """ for dim, size in iteritems(self.dims): if dim not in self._arrays: @@ -399,25 +433,23 @@ def _update_vars_and_coords(self, new_arrays, new_coord_names={}, # all checks are complete: it's safe to update self._arrays = arrays self._dims = dims - self._add_missing_coords() - self._coord_names.update(dims) + self._add_missing_coords_inplace() self._coord_names.update(new_coord_names) - def _set_init_vars_and_dims(self, vars, coords): + def _set_init_vars_and_dims(self, vars, coords, compat): """Set the initial value of Dataset arrays and dimensions """ _assert_empty([k for k in vars if k in coords], 'redundant variables and coordinates: %s') arrays = ChainMap(vars, coords) - new_arrays, new_coord_names = _expand_arrays(arrays) - _assert_empty([k for k in new_coord_names if k not in new_arrays], - 'no matching variables exist for some coordinates: %s') + aligned = _align_variables(arrays) + new_variables, new_coord_names = _expand_variables(aligned, + compat=compat) new_coord_names.update(coords) - self._update_vars_and_coords(new_arrays, new_coord_names, - needs_copy=False, - check_coord_names=False) + self._update_vars_and_coords(new_variables, new_coord_names, + needs_copy=False, check_coord_names=False) @classmethod def load_store(cls, store, decoder=None): @@ -682,7 +714,7 @@ def __setitem__(self, key, value): if utils.is_dict_like(key): raise NotImplementedError('cannot yet use a dictionary as a key ' 'to set Dataset values') - self.merge({key: value}, inplace=True, overwrite_vars=[key]) + self.update({key: 
value}) def __delitem__(self, key): """Remove a variable from this dataset. @@ -1079,8 +1111,7 @@ def rename(self, name_dict, inplace=False): return obj def update(self, other, inplace=True): - """Update this dataset's variables and attributes with those from - another dataset. + """Update this dataset's variables with those from another dataset. Parameters ---------- @@ -1101,14 +1132,11 @@ def update(self, other, inplace=True): If any dimensions would have inconsistent sizes in the updated dataset. """ - other = _DatasetLike(other) - obj = self.merge(other, inplace=inplace, - overwrite_vars=other._arrays) - obj.attrs.update(other.attrs) - return obj + return self.merge( + other, inplace=inplace, overwrite_vars=list(other), join='left') def merge(self, other, inplace=False, overwrite_vars=set(), - compat='broadcast_equals'): + compat='broadcast_equals', join='outer'): """Merge the arrays of two datasets into a single dataset. This method generally not allow for overriding data, with the exception @@ -1135,6 +1163,13 @@ def merge(self, other, inplace=False, overwrite_vars=set(), - 'equals': all values and dimensions must be the same. - 'identical': all values, dimensions and attributes must be the same. 
+ join : {'outer', 'inner', 'left', 'right'}, optional + Method for joining ``self`` and ``other`` along shared dimensions: + + - 'outer': use the union of the indexes + - 'inner': use the intersection of the indexes + - 'left': use indexes from ``self`` + - 'right': use indexes from ``other`` Returns ------- @@ -1149,34 +1184,26 @@ def merge(self, other, inplace=False, overwrite_vars=set(), if compat not in ['broadcast_equals', 'equals', 'identical']: raise ValueError("compat=%r invalid: must be 'broadcast_equals', " "'equals' or 'identical'" % compat) - other = _DatasetLike(other) - # determine variables to check for conflicts - if not overwrite_vars: - potential_conflicts = self._arrays - else: - if isinstance(overwrite_vars, basestring): - overwrite_vars = set([overwrite_vars]) - else: - overwrite_vars = set(overwrite_vars) - potential_conflicts = dict((k, v) for k, v in iteritems(self._arrays) - if k not in overwrite_vars) - - new_variables, new_coord_names = _expand_arrays( - other._arrays, potential_conflicts, compat) - new_coord_names |= other._coord_names - - _assert_empty([k for k in other._arrays - if k in potential_conflicts - and k not in new_coord_names - and k in self.coords], - 'variables with these names already exist as ' - 'coordinates: %s') - - # update variables + if isinstance(overwrite_vars, basestring): + overwrite_vars = [overwrite_vars] + overwrite_vars = set(overwrite_vars) + + merge = _merge_dataset if isinstance(other, Dataset) else _merge_dict + + replace_vars, new_vars, new_coord_names = merge( + self, other, overwrite_vars, compat=compat, join=join) + + newly_coords = new_coord_names & (set(self) - set(self.coords)) + no_longer_coords = set(self.coords) & (set(new_vars) - new_coord_names) + ambiguous_coords = (newly_coords | no_longer_coords) - overwrite_vars + if ambiguous_coords: + raise ValueError('cannot merge: the following variables are ' + 'coordinates on one dataset but not the other: %s' + % list(ambiguous_coords)) + obj = 
self if inplace else self.copy() - obj._update_vars_and_coords(new_variables, new_coord_names, - needs_copy=inplace) + obj._update_vars_and_coords(replace_vars, new_coord_names) return obj def _assert_all_in_dataset(self, names, virtual_okay=False): diff --git a/xray/test/test_dataset.py b/xray/test/test_dataset.py index b77495318c7..5b25f99b27f 100644 --- a/xray/test/test_dataset.py +++ b/xray/test/test_dataset.py @@ -141,6 +141,46 @@ def test_constructor_0d(self): actual = Dataset({'x': arg}) self.assertDatasetIdentical(expected, actual) + def test_constructor_auto_align(self): + a = DataArray([1, 2], [('x', [0, 1])]) + b = DataArray([3, 4], [('x', [1, 2])]) + + # verify align uses outer join + expected = Dataset({'a': ('x', [1, 2, np.nan]), + 'b': ('x', [np.nan, 3, 4])}) + actual = Dataset({'a': a, 'b': b}) + self.assertDatasetIdentical(expected, actual) + + # var with different dimensions + c = ('y', [3, 4]) + expected2 = expected.merge({'c': c}) + actual = Dataset({'a': a, 'b': b, 'c': c}) + self.assertDatasetIdentical(expected2, actual) + + # var that is only aligned against the aligned variables + d = ('x', [3, 2, 1]) + expected3 = expected.merge({'d': d}) + actual = Dataset({'a': a, 'b': b, 'd': d}) + self.assertDatasetIdentical(expected3, actual) + + e = ('x', [0, 0]) + with self.assertRaisesRegexp(ValueError, 'conflicting sizes'): + Dataset({'a': a, 'b': b, 'e': e}) + + def test_constructor_compat(self): + data = {'x': DataArray(0, coords={'y': 1}), 'y': ('z', [1, 1, 1])} + with self.assertRaisesRegexp(ValueError, 'conflicting value'): + Dataset(data, compat='equals') + expected = Dataset({'x': 0}, {'y': ('z', [1, 1, 1])}) + actual = Dataset(data) + self.assertDatasetIdentical(expected, actual) + actual = Dataset(data, compat='broadcast_equals') + self.assertDatasetIdentical(expected, actual) + + data = {'x': DataArray(0, coords={'y': 3}), 'y': ('z', [1, 1, 1])} + with self.assertRaisesRegexp(ValueError, 'conflicting value'): + Dataset(data) + def 
test_constructor_with_coords(self): with self.assertRaisesRegexp(ValueError, 'redundant variables and co'): Dataset({'a': ('x', [1])}, {'a': ('x', [1])}) @@ -731,6 +771,28 @@ def test_update(self): self.assertIsNot(actual, expected) self.assertDatasetIdentical(expected, actual) + other = Dataset(attrs={'new': 'attr'}) + actual = data.copy() + actual.update(other) + self.assertDatasetIdentical(expected, actual) + + def test_update_auto_align(self): + ds = Dataset({'x': ('t', [3, 4])}) + + expected = Dataset({'x': ('t', [3, 4]), 'y': ('t', [np.nan, 5])}) + actual = ds.copy() + other = {'y': ('t', [5]), 't': [1]} + with self.assertRaisesRegexp(ValueError, 'conflicting sizes'): + actual.update(other) + actual.update(Dataset(other)) + self.assertDatasetIdentical(expected, actual) + + actual = ds.copy() + other = Dataset({'y': ('t', [5]), 't': [100]}) + actual.update(other) + expected = Dataset({'x': ('t', [3, 4]), 'y': ('t', [np.nan] * 2)}) + self.assertDatasetIdentical(expected, actual) + def test_merge(self): data = create_test_data() ds1 = data[['var1']] @@ -749,13 +811,11 @@ def test_merge(self): actual = data.merge(data.reset_coords(drop=True)) self.assertDatasetIdentical(data, actual) - with self.assertRaises(ValueError): - ds1.merge(ds2.isel(dim1=slice(2))) with self.assertRaises(ValueError): ds1.merge(ds2.rename({'var3': 'var1'})) - with self.assertRaisesRegexp(ValueError, 'coordinates with these'): + with self.assertRaisesRegexp(ValueError, 'cannot merge'): data.reset_coords().merge(data) - with self.assertRaisesRegexp(ValueError, 'variables with these'): + with self.assertRaisesRegexp(ValueError, 'cannot merge'): data.merge(data.reset_coords()) def test_merge_broadcast_equals(self): @@ -795,6 +855,22 @@ def test_merge_compat(self): with self.assertRaisesRegexp(ValueError, 'compat=\S+ invalid'): ds1.merge(ds2, compat='foobar') + def test_merge_auto_align(self): + ds1 = Dataset({'a': ('x', [1, 2])}) + ds2 = Dataset({'b': ('x', [3, 4]), 'x': [1, 2]}) + expected = 
Dataset({'a': ('x', [1, 2, np.nan]), + 'b': ('x', [np.nan, 3, 4])}) + self.assertDatasetIdentical(expected, ds1.merge(ds2)) + self.assertDatasetIdentical(expected, ds2.merge(ds1)) + + expected = expected.isel(x=slice(2)) + self.assertDatasetIdentical(expected, ds1.merge(ds2, join='left')) + self.assertDatasetIdentical(expected, ds2.merge(ds1, join='right')) + + expected = expected.isel(x=slice(1, 2)) + self.assertDatasetIdentical(expected, ds1.merge(ds2, join='inner')) + self.assertDatasetIdentical(expected, ds2.merge(ds1, join='inner')) + def test_getitem(self): data = create_test_data() self.assertIsInstance(data['var1'], DataArray) @@ -869,7 +945,7 @@ def test_setitem(self): data2['scalar'] = ([], 0) self.assertDatasetIdentical(data1, data2) # can't use the same dimension name as a scalar var - with self.assertRaisesRegexp(ValueError, 'already exists as a scalar'): + with self.assertRaisesRegexp(ValueError, 'cannot merge'): data1['newvar'] = ('scalar', [3, 4, 5]) # can't resize a used dimension with self.assertRaisesRegexp(ValueError, 'conflicting sizes'): @@ -881,6 +957,29 @@ def test_setitem(self): with self.assertRaises(NotImplementedError): data1[{'x': 0}] = 0 + def test_setitem_auto_align(self): + ds = Dataset() + ds['x'] = ('y', range(3)) + ds['y'] = 1 + np.arange(3) + expected = Dataset({'x': ('y', range(3)), 'y': 1 + np.arange(3)}) + self.assertDatasetIdentical(ds, expected) + + ds['y'] = DataArray(range(3), dims='y') + expected = Dataset({'x': ('y', range(3))}) + self.assertDatasetIdentical(ds, expected) + + ds['x'] = DataArray([1, 2], dims='y') + expected = Dataset({'x': ('y', [1, 2, np.nan])}) + self.assertDatasetIdentical(ds, expected) + + ds['x'] = 42 + expected = Dataset({'x': 42, 'y': range(3)}) + self.assertDatasetIdentical(ds, expected) + + ds['x'] = DataArray([4, 5, 6, 7], dims='y') + expected = Dataset({'x': ('y', [4, 5, 6])}) + self.assertDatasetIdentical(ds, expected) + def test_delitem(self): data = create_test_data() all_items = set(data) 
@@ -1486,7 +1585,7 @@ def test_dataset_dataset_math(self): self.assertDatasetIdentical(expected, subsampled + ds) self.assertDatasetIdentical(expected, ds + subsampled) - def test_dataset_math_automatic_alignment(self): + def test_dataset_math_auto_align(self): ds = self.make_example_math_dataset() subset = ds.isel(x=slice(2), y=[1, 3]) expected = 2 * subset From 6ccf629511dce9ef3fb6074fe2133d3277b506da Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Fri, 13 Feb 2015 00:54:27 -0800 Subject: [PATCH 3/3] Auto-alignment of variables for Dataset binary ops --- doc/whats-new.rst | 11 +++++++---- xray/core/dataset.py | 21 +++++++++++++++------ xray/test/test_dataset.py | 17 +++++++++++++++-- 3 files changed, 37 insertions(+), 12 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 798bbef040e..ebff3eba79d 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -12,10 +12,13 @@ What's New v0.4 (unreleased) ----------------- -.. These need tests: -.. ``Dataset.update`` no longer updates attributes as well as variables. -.. ``Dataset.__init__`` is not longer as strict by default -.. TODO: auto-align Dataset arithmetic +Highlights +~~~~~~~~~~ + +- Automatic alignment of index labels in arithmetic, dataset construction and + merging. +- Aggregation operations skip missing values by default. +- Lots of bug fixes. 
v0.3.2 (23 December, 2014) -------------------------- diff --git a/xray/core/dataset.py b/xray/core/dataset.py index c68e7b07409..d9ac8c0e48b 100644 --- a/xray/core/dataset.py +++ b/xray/core/dataset.py @@ -215,6 +215,7 @@ def add_variable(name, var): variables[name].shape)) common_dims.update(zip(var.dims, var.shape)) variables[name] = variables[name].set_dims(common_dims) + new_coord_names.update(var.dims) for name, var in iteritems(raw_variables): if hasattr(var, 'coords'): @@ -1718,15 +1719,23 @@ def _calculate_binary_op(f, dataset, other, dest_vars): if utils.is_dict_like(other): other_arrays = getattr(other, '_arrays', other) other_vars = getattr(other, 'vars', other) - if set(dataset_vars) != set(other_vars): - raise ValueError('Datasets do not have the same variables: ' - '%s, %s' % (list(dataset_vars), list(other_vars))) + performed_op = False for k in dataset_vars: - dest_vars[k] = f(dataset_arrays[k], other_arrays[k]) + if k in other_vars: + dest_vars[k] = f(dataset_arrays[k], other_arrays[k]) + performed_op = True + elif k in dest_vars: + # we are doing an in-place operation + raise ValueError('datasets must have the same variables for ' + 'in-place arithmetic operations: %s, %s' + % (list(dataset_vars), list(other_vars))) + if not performed_op: + raise ValueError('datasets have no overlapping variables: %s, %s' + % (list(dataset_vars), list(other_vars))) else: - other_arrays = getattr(other, 'variable', other) + other_variable = getattr(other, 'variable', other) for k in dataset_vars: - dest_vars[k] = f(dataset_arrays[k], other_arrays) + dest_vars[k] = f(dataset_arrays[k], other_variable) ops.inject_all_ops_and_reduce_methods(Dataset, array_only=False) diff --git a/xray/test/test_dataset.py b/xray/test/test_dataset.py index 5b25f99b27f..4e918460e72 100644 --- a/xray/test/test_dataset.py +++ b/xray/test/test_dataset.py @@ -1595,6 +1595,19 @@ def test_dataset_math_auto_align(self): with self.assertRaisesRegexp(ValueError, 'no overlapping labels'): 
ds.isel(x=slice(1)) + ds.isel(x=slice(1, None)) + actual = ds + ds[['bar']] + expected = (2 * ds[['bar']]).merge(ds.coords) + self.assertDatasetIdentical(expected, actual) + + with self.assertRaisesRegexp(ValueError, 'no overlapping variables'): + ds + Dataset() + + with self.assertRaisesRegexp(ValueError, 'no overlapping variables'): + Dataset() + Dataset() + + # maybe unary arithmetic with empty datasets should raise instead? + self.assertDatasetIdentical(Dataset() + 1, Dataset()) + def test_dataset_math_errors(self): ds = self.make_example_math_dataset() @@ -1602,8 +1615,8 @@ def test_dataset_math_errors(self): ds['foo'] += ds with self.assertRaises(TypeError): ds['foo'].variable += ds - with self.assertRaisesRegexp(ValueError, 'do not have the same'): - ds + ds[['bar']] + with self.assertRaisesRegexp(ValueError, 'must have the same'): + ds += ds[['bar']] # verify we can rollback in-place operations if something goes wrong # nb. inplace datetime64 math actually will work with an integer array