-
-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Add methods for combining variables of differing dimensionality #1597
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 19 commits
8c947e7
e997f7f
3d757da
151dc71
8a1a8ef
e8594f1
0f1ba22
1e1f4d9
35e0ecf
23d9246
099d440
e40b6a2
35a2365
35715dc
5ca9a1d
2979c75
c17dc09
ce3b52e
4ade43d
6d520c2
2669797
24b2237
13587c2
95e2da9
7aa7095
e08622a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -103,6 +103,8 @@ Enhancements | |
- Allow ``expand_dims`` method to support inserting/broadcasting dimensions | ||
with size > 1. (:issue:`2710`) | ||
By `Martin Pletcher <https://github.com/pletchm>`_. | ||
- New methods for reshaping Datasets of variables with different dimensions | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this needs to move up to 0.12.3 now -- sorry for the churn here! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure. I moved this up to that release and added a new section header
|
||
(:issue:`1317`). By `Noah Brenowitz <https://github.com/nbren12>`_. | ||
|
||
Bug fixes | ||
~~~~~~~~~ | ||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -1408,6 +1408,72 @@ def unstack(self, dim=None): | |||||
ds = self._to_temp_dataset().unstack(dim) | ||||||
return self._from_temp_dataset(ds) | ||||||
|
||||||
def to_unstacked_dataset(self, dim, level=0): | ||||||
"""Unstack DataArray expanding to Dataset along a given level of a | ||||||
stacked coordinate. | ||||||
|
||||||
This is the inverse operation of Dataset.to_stacked_array. | ||||||
|
||||||
Parameters | ||||||
---------- | ||||||
dim : str | ||||||
Name of existing dimension to unstack | ||||||
level : int or str | ||||||
The MultiIndex level to expand to a dataset along. Can either be | ||||||
the integer index of the level or its name. | ||||||
label : int, optional | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Generally I think we've said There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Okay. I changed this to your suggestion |
||||||
Label of the level to expand dataset along. Overrides the label | ||||||
argument if given. | ||||||
|
||||||
benbovy marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
Returns | ||||||
------- | ||||||
unstacked: Dataset | ||||||
|
||||||
rabernat marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
Examples | ||||||
-------- | ||||||
>>> import xarray as xr | ||||||
>>> arr = DataArray(np.arange(6).reshape(2, 3), | ||||||
... coords=[('x', ['a', 'b']), ('y', [0, 1, 2])]) | ||||||
>>> data = xr.Dataset({'a': arr, 'b': arr.isel(y=0)}) | ||||||
>>> data | ||||||
<xarray.Dataset> | ||||||
Dimensions: (x: 2, y: 3) | ||||||
Coordinates: | ||||||
* x (x) <U1 'a' 'b' | ||||||
* y (y) int64 0 1 2 | ||||||
Data variables: | ||||||
a (x, y) int64 0 1 2 3 4 5 | ||||||
b (x) int64 0 3 | ||||||
>>> stacked = data.to_stacked_array("z", ['y']) | ||||||
>>> stacked.indexes['z'] | ||||||
benbovy marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
MultiIndex(levels=[['a', 'b'], [0, 1, 2]], | ||||||
labels=[[0, 0, 0, 1], [0, 1, 2, -1]], | ||||||
names=['variable', 'y']) | ||||||
>>> roundtripped = stacked.to_unstacked_dataset(dim='z') | ||||||
>>> data.identical(roundtripped) | ||||||
True | ||||||
|
||||||
See Also | ||||||
-------- | ||||||
Dataset.to_stacked_array | ||||||
""" | ||||||
|
||||||
idx = self.indexes[dim] | ||||||
if not isinstance(idx, pd.MultiIndex): | ||||||
raise ValueError(dim, "is not a stacked coordinate") | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. please add test coverage for this error There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. sure. done |
||||||
|
||||||
level_number = idx._get_level_number(level) | ||||||
variables = idx.levels[level_number] | ||||||
variable_dim = idx.names[level_number] | ||||||
|
||||||
# pull variables out of DataArray | ||||||
data_dict = OrderedDict() | ||||||
for k in variables: | ||||||
data_dict[k] = self.sel(**{variable_dim: k}).squeeze(drop=True) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Rather than sending as kwargs, if we send as a dict then this will work with non-str keys (though dim names is only partially supported anyway atm)
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have made this change. |
||||||
|
||||||
# unstacked dataset | ||||||
return Dataset(data_dict) | ||||||
|
||||||
def transpose(self, *dims, transpose_coords=None) -> 'DataArray': | ||||||
"""Return a new DataArray object with transposed dimensions. | ||||||
|
||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2650,6 +2650,119 @@ def stack(self, dimensions=None, **dimensions_kwargs): | |
result = result._stack_once(dims, new_dim) | ||
return result | ||
|
||
def to_stacked_array(self, new_dim, sample_dims, variable_dim='variable', | ||
name=None): | ||
"""Combine variables of differing dimensionality into a DataArray | ||
without broadcasting. | ||
|
||
This method is similar to Dataset.to_array but does not broadcast the | ||
variables. | ||
|
||
Parameters | ||
---------- | ||
new_dim : str | ||
Name of the new stacked coordinate | ||
sample_dims : Sequence[str] | ||
Dimensions that **will not** be stacked. Each array in the dataset | ||
must share these dimensions. For machine learning applications, | ||
these define the dimensions over which samples are drawn. | ||
variable_dim : str, optional | ||
Name of the level in the stacked coordinate which corresponds to | ||
the variables. | ||
dcherian marked this conversation as resolved.
Show resolved
Hide resolved
|
||
name : str, optional | ||
Name of the new data array. | ||
|
||
Returns | ||
------- | ||
stacked : DataArray | ||
DataArray with the specified dimensions and data variables | ||
stacked together. The stacked coordinate is named ``new_dim`` | ||
and represented by a MultiIndex object with a level containing the | ||
data variable names. The name of this level is controlled using | ||
the ``variable_dim`` argument. | ||
|
||
See Also | ||
-------- | ||
Dataset.to_array | ||
Dataset.stack | ||
DataArray.to_unstacked_dataset | ||
|
||
Examples | ||
-------- | ||
>>> data = Dataset( | ||
... data_vars={'a': (('x', 'y'), [[0, 1, 2], [3, 4, 5]]), | ||
... 'b': ('x', [6, 7])}, | ||
... coords={'y': ['u', 'v', 'w']} | ||
... ) | ||
|
||
>>> data | ||
<xarray.Dataset> | ||
Dimensions: (x: 2, y: 3) | ||
Coordinates: | ||
* y (y) <U1 'u' 'v' 'w' | ||
Dimensions without coordinates: x | ||
Data variables: | ||
a (x, y) int64 0 1 2 3 4 5 | ||
b (x) int64 6 7 | ||
|
||
>>> data.to_stacked_array("z", ['x']) | ||
jhamman marked this conversation as resolved.
Show resolved
Hide resolved
|
||
<xarray.DataArray (x: 2, z: 4)> | ||
array([[0, 1, 2, 6], | ||
[3, 4, 5, 7]]) | ||
Coordinates: | ||
* z (z) MultiIndex | ||
- variable (z) object 'a' 'a' 'a' 'b' | ||
- y (z) object 'u' 'v' 'w' nan | ||
Dimensions without coordinates: x | ||
|
||
""" | ||
stacking_dims = tuple(dim for dim in self.dims | ||
if dim not in sample_dims) | ||
|
||
for variable in self: | ||
dims = self[variable].dims | ||
dims_include_sample_dims = set(sample_dims) <= set(dims) | ||
if not dims_include_sample_dims: | ||
raise ValueError( | ||
"All DataArrays must share the dims: {}. ".format(dims) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would change "All DataArrays" by "All data variables in Dataset" for this error message. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Taking the example from the docs/docstrings: # the line below gives "ValueError: All DataArrays must share the dims: ('x',)."
data.to_stacked_array('z', ['x', 'y'])
# the line below gives "ValueError: All DataArrays must share the dims: ('x', 'y')."
data.to_stacked_array('z', ['foo']) Those error messages are still a bit confusing to me. For the second I would expect a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I can see how that is confusing, but technically it is true. I think it would be a little unwieldy to have different error messages for different numbers of sample_dimensions. I changed the message to "All variables in the dataset must contain the dimensions {}." Hopefully, that is better. |
||
) | ||
|
||
def f(val): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. please give this a sensible name rather than There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure. I changed it to |
||
# ensure square output | ||
|
||
assign_coords = {variable_dim: val.name} | ||
for dim in stacking_dims: | ||
if (dim not in val.dims): | ||
jhamman marked this conversation as resolved.
Show resolved
Hide resolved
|
||
assign_coords[dim] = None | ||
|
||
expand_dims = set(stacking_dims).difference(set(val.dims)) | ||
expand_dims.add(variable_dim) | ||
# must be list for .expand_dims | ||
expand_dims = list(expand_dims) | ||
|
||
return val.assign_coords(**assign_coords) \ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: here and below, per PEP8, prefer using parentheses rather than return (val.assign_coords(**assign_coords)
.expand_dims(expand_dims)
.stack(**{new_dim: (variable_dim,) + stacking_dims})) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure. done. |
||
.expand_dims(expand_dims) \ | ||
.stack(**{new_dim: (variable_dim,) + stacking_dims}) | ||
|
||
# concatenate the arrays | ||
Xs = [f(self[key]) for key in self.data_vars] | ||
dataset = xr.concat(Xs, dim=new_dim) | ||
jhamman marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
# coerce the levels of the MultiIndex to have the same type as the | ||
# input dimensions. This code is messy, so it might be better to just | ||
# input a dummy value for the singleton dimension. | ||
idx = dataset.indexes[new_dim] | ||
levels = [idx.levels[0]]\ | ||
+ [level.astype(self[level.name].dtype) | ||
for level in idx.levels[1:]] | ||
new_idx = idx.set_levels(levels) | ||
dataset[new_dim] = IndexVariable(new_dim, new_idx) | ||
|
||
if name is not None: | ||
dataset.name = name | ||
|
||
return dataset | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: "data_array" might be a better name for this variable. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I changed this |
||
|
||
def _unstack_once(self, dim): | ||
index = self.get_index(dim) | ||
# GH2619. For MultiIndex, we need to call remove_unused. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -58,6 +58,14 @@ def create_test_multiindex(): | |
return Dataset({}, {'x': mindex}) | ||
|
||
|
||
def create_test_stacked_array(): | ||
x = DataArray(pd.Index(np.r_[:10], name='x')) | ||
y = DataArray(pd.Index(np.r_[:20], name='y')) | ||
a = x * y | ||
b = x * y * y | ||
return a, b | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No need to change, but this would be ideal as a test fixture |
||
|
||
|
||
class InaccessibleVariableDataStore(backends.InMemoryDataStore): | ||
def __init__(self): | ||
super(InaccessibleVariableDataStore, self).__init__() | ||
|
@@ -2397,6 +2405,61 @@ def test_stack_unstack_slow(self): | |
actual = stacked.isel(z=slice(None, None, -1)).unstack('z') | ||
assert actual.identical(ds[['b']]) | ||
|
||
def test_to_stacked_array_invalid_sample_dims(self): | ||
data = xr.Dataset( | ||
data_vars={'a': (('x', 'y'), [[0, 1, 2], [3, 4, 5]]), | ||
'b': ('x', [6, 7])}, | ||
coords={'y': ['u', 'v', 'w']} | ||
) | ||
with pytest.raises(ValueError): | ||
data.to_stacked_array("features", sample_dims=['y']) | ||
|
||
def test_to_stacked_array_name(self): | ||
name = 'adf9d' | ||
|
||
# make a two dimensional dataset | ||
a, b = create_test_stacked_array() | ||
D = xr.Dataset({'a': a, 'b': b}) | ||
sample_dims = ['x'] | ||
|
||
y = D.to_stacked_array('features', sample_dims, name=name) | ||
assert y.name == name | ||
|
||
def test_to_stacked_array_dtype_dims(self): | ||
# make a two dimensional dataset | ||
a, b = create_test_stacked_array() | ||
D = xr.Dataset({'a': a, 'b': b}) | ||
sample_dims = ['x'] | ||
y = D.to_stacked_array('features', sample_dims) | ||
assert y.indexes['features'].levels[1].dtype == D.y.dtype | ||
assert y.dims == ('x', 'features') | ||
|
||
def test_to_stacked_array_to_unstacked_dataset(self): | ||
# make a two dimensional dataset | ||
a, b = create_test_stacked_array() | ||
D = xr.Dataset({'a': a, 'b': b}) | ||
sample_dims = ['x'] | ||
y = D.to_stacked_array('features', sample_dims)\ | ||
.transpose("x", "features") | ||
|
||
x = y.to_unstacked_dataset("features") | ||
assert_identical(D, x) | ||
|
||
# test on just one sample | ||
x0 = y[0].to_unstacked_dataset("features") | ||
d0 = D.isel(x=0) | ||
assert_identical(d0, x0) | ||
|
||
def test_to_stacked_array_to_unstacked_dataset_different_dimension(self): | ||
# test when variables have different dimensionality | ||
a, b = create_test_stacked_array() | ||
sample_dims = ['x'] | ||
D = xr.Dataset({'a': a, 'b': b.isel(y=0)}) | ||
|
||
y = D.to_stacked_array('features', sample_dims) | ||
x = y.to_unstacked_dataset('features') | ||
assert_identical(D, x) | ||
|
||
def test_update(self): | ||
data = create_test_data(seed=0) | ||
expected = data.copy() | ||
|
Uh oh!
There was an error while loading. Please reload this page.