pydata · shoyer · Jul 5, 2019 · Sep 27, 2017 · Apr 1, 2019 · Apr 1, 2019
diff --git a/doc/api.rst b/doc/api.rst
@@ -199,6 +199,7 @@ Reshaping and reorganizing
    Dataset.transpose
    Dataset.stack
    Dataset.unstack
+   Dataset.to_stacked_array
    Dataset.shift
    Dataset.roll
    Dataset.sortby
@@ -371,6 +372,7 @@ Reshaping and reorganizing
    DataArray.transpose
    DataArray.stack
    DataArray.unstack
+   DataArray.to_unstacked_dataset
    DataArray.shift
    DataArray.roll
    DataArray.sortby

diff --git a/doc/reshaping.rst b/doc/reshaping.rst
@@ -133,6 +133,48 @@ pandas, it does not automatically drop missing values. Compare:
 We departed from pandas's behavior here because predictable shapes for new
 array dimensions is necessary for :ref:`dask`.
 
+Stacking different variables together
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+These stacking and unstacking operations are particularly useful for reshaping
+xarray objects for use in machine learning packages, such as `scikit-learn
+<http://scikit-learn.org/stable/>`_, that usually require two-dimensional numpy
+arrays as inputs. For datasets with only one variable, we only need ``stack``
+and ``unstack``, but combining multiple variables in a
+:py:class:`xarray.Dataset` is more complicated. If the variables in the dataset
+have matching numbers of dimensions, we can call
+:py:meth:`~xarray.Dataset.to_array` and then stack along the new coordinate.
+But :py:meth:`~xarray.Dataset.to_array` will broadcast the dataarrays together,
+which will effectively tile the lower dimensional variable along the missing
+dimensions. The method :py:meth:`xarray.Dataset.to_stacked_array` allows
+combining variables of differing dimensions without this wasteful copying while
+:py:meth:`xarray.DataArray.to_unstacked_dataset` reverses this operation.
+Just as with :py:meth:`xarray.Dataset.stack` the stacked coordinate is
+represented by a :py:class:`pandas.MultiIndex` object. These methods are used
+like this:
+
+.. ipython:: python
+
+        data = xr.Dataset(
+            data_vars={'a': (('x', 'y'), [[0, 1, 2], [3, 4, 5]]),
+                      'b': ('x', [6, 7])},
+            coords={'y': ['u', 'v', 'w']}
+        )
+        stacked = data.to_stacked_array("z", sample_dims=['x'])
+        stacked
+        unstacked = stacked.to_unstacked_dataset("z")
+        unstacked
+
+In this example, ``stacked`` is a two dimensional array that we can easily pass to scikit-learn or another generic
+numerical method.
+
+.. note::
+
+    Unlike with ``stack``, in ``to_stacked_array`` the user specifies the dimensions they **do not** want stacked.
+    For a machine learning task, these unstacked dimensions can be interpreted as the dimensions over which samples are
+    drawn, whereas the stacked coordinates are the features. Naturally, all variables should possess these sampling
+    dimensions.
+
+
 .. _reshape.set_index:
 
 Set and reset index

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -103,6 +103,8 @@ Enhancements
 - Allow ``expand_dims`` method to support inserting/broadcasting dimensions
   with size > 1. (:issue:`2710`)
   By `Martin Pletcher <https://github.com/pletchm>`_.
+- New methods :py:meth:`Dataset.to_stacked_array` and
+  :py:meth:`DataArray.to_unstacked_dataset` for reshaping Datasets of
+  variables with different dimensions
+  (:issue:`1317`). By `Noah Brenowitz <https://github.com/nbren12>`_.
 
 Bug fixes
 ~~~~~~~~~

diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
@@ -1408,6 +1408,72 @@ def unstack(self, dim=None):
         ds = self._to_temp_dataset().unstack(dim)
         return self._from_temp_dataset(ds)
 
+    def to_unstacked_dataset(self, dim, level=0):
+        """Unstack DataArray expanding to Dataset along a given level of a
+        stacked coordinate.
+
+        This is the inverse operation of Dataset.to_stacked_array.
+
+        Parameters
+        ----------
+        dim : str
+            Name of existing dimension to unstack
+        level : int or str
+            The MultiIndex level to expand to a dataset along. Can either be
+            the integer index of the level or its name.
+
+        Returns
+        -------
+        unstacked: Dataset
+
+        Raises
+        ------
+        ValueError
+            If the index of ``dim`` is not a ``pandas.MultiIndex``.
+
+        Examples
+        --------
+        >>> import numpy as np
+        >>> import xarray as xr
+        >>> arr = xr.DataArray(np.arange(6).reshape(2, 3),
+        ...                    coords=[('x', ['a', 'b']), ('y', [0, 1, 2])])
+        >>> data = xr.Dataset({'a': arr, 'b': arr.isel(y=0)})
+        >>> data
+        <xarray.Dataset>
+        Dimensions:  (x: 2, y: 3)
+        Coordinates:
+          * x        (x) <U1 'a' 'b'
+          * y        (y) int64 0 1 2
+        Data variables:
+            a        (x, y) int64 0 1 2 3 4 5
+            b        (x) int64 0 3
+        >>> stacked = data.to_stacked_array("z", ['x'])
+        >>> stacked.indexes['z']
+        MultiIndex(levels=[['a', 'b'], [0, 1, 2]],
+                labels=[[0, 0, 0, 1], [0, 1, 2, -1]],
+                names=['variable', 'y'])
+        >>> roundtripped = stacked.to_unstacked_dataset(dim='z')
+        >>> data.identical(roundtripped)
+        True
+
+        See Also
+        --------
+        Dataset.to_stacked_array
+        """
+
+        idx = self.indexes[dim]
+        if not isinstance(idx, pd.MultiIndex):
+            raise ValueError(
+                "'{}' is not a stacked coordinate".format(dim))
+
+        level_number = idx._get_level_number(level)
+        variables = idx.levels[level_number]
+        variable_dim = idx.names[level_number]
+
+        # pull variables out of datarray; a dict indexer is used instead of
+        # **kwargs so the level name need not be a valid keyword argument
+        data_dict = OrderedDict()
+        for k in variables:
+            data_dict[k] = self.sel({variable_dim: k}).squeeze(drop=True)
+
+        # unstacked dataset
+        return Dataset(data_dict)
+
     def transpose(self, *dims, transpose_coords=None) -> 'DataArray':
         """Return a new DataArray object with transposed dimensions.
 

diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
@@ -2650,6 +2650,119 @@ def stack(self, dimensions=None, **dimensions_kwargs):
             result = result._stack_once(dims, new_dim)
         return result
 
+    def to_stacked_array(self, new_dim, sample_dims, variable_dim='variable',
+                         name=None):
+        """Combine variables of differing dimensionality into a DataArray
+        without broadcasting.
+
+        This method is similar to Dataset.to_array but does not broadcast the
+        variables.
+
+        Parameters
+        ----------
+        new_dim : str
+            Name of the new stacked coordinate
+        sample_dims : Sequence[str]
+            Dimensions that **will not** be stacked. Each array in the dataset
+            must share these dimensions. For machine learning applications,
+            these define the dimensions over which samples are drawn.
+        variable_dim : str, optional
+            Name of the level in the stacked coordinate which corresponds to
+            the variables.
+        name : str, optional
+            Name of the new data array.
+
+        Returns
+        -------
+        stacked : DataArray
+            DataArray with the specified dimensions and data variables
+            stacked together. The stacked coordinate is named ``new_dim``
+            and represented by a MultiIndex object with a level containing the
+            data variable names. The name of this level is controlled using
+            the ``variable_dim`` argument.
+
+        Raises
+        ------
+        ValueError
+            If any variable does not contain every dimension listed in
+            ``sample_dims``.
+
+        See Also
+        --------
+        Dataset.to_array
+        Dataset.stack
+        DataArray.to_unstacked_dataset
+
+        Examples
+        --------
+        >>> data = Dataset(
+        ...     data_vars={'a': (('x', 'y'), [[0, 1, 2], [3, 4, 5]]),
+        ...                'b': ('x', [6, 7])},
+        ...     coords={'y': ['u', 'v', 'w']}
+        ... )
+
+        >>> data
+        <xarray.Dataset>
+        Dimensions:  (x: 2, y: 3)
+        Coordinates:
+          * y        (y) <U1 'u' 'v' 'w'
+        Dimensions without coordinates: x
+        Data variables:
+            a        (x, y) int64 0 1 2 3 4 5
+            b        (x) int64 6 7
+
+        >>> data.to_stacked_array("z", ['x'])
+        <xarray.DataArray (x: 2, z: 4)>
+        array([[0, 1, 2, 6],
+               [3, 4, 5, 7]])
+        Coordinates:
+          * z         (z) MultiIndex
+          - variable  (z) object 'a' 'a' 'a' 'b'
+          - y         (z) object 'u' 'v' 'w' nan
+        Dimensions without coordinates: x
+
+        """
+        stacking_dims = tuple(dim for dim in self.dims
+                              if dim not in sample_dims)
+
+        # every variable must carry all of the sample dimensions
+        for variable in self:
+            dims = self[variable].dims
+            dims_include_sample_dims = set(sample_dims) <= set(dims)
+            if not dims_include_sample_dims:
+                raise ValueError(
+                    "All DataArrays must share the dims: {}. "
+                    "Variable {!r} has dims {}."
+                    .format(sample_dims, variable, dims)
+                )
+
+        def ensure_stackable(val):
+            # give ``val`` every stacking dim (plus the variable level) so
+            # that all variables can be stacked onto the same MultiIndex
+
+            assign_coords = {variable_dim: val.name}
+            for dim in stacking_dims:
+                if dim not in val.dims:
+                    assign_coords[dim] = None
+
+            dims_to_expand = set(stacking_dims).difference(set(val.dims))
+            dims_to_expand.add(variable_dim)
+            # must be list for .expand_dims
+            dims_to_expand = list(dims_to_expand)
+
+            return val.assign_coords(**assign_coords) \
+                .expand_dims(dims_to_expand) \
+                .stack(**{new_dim: (variable_dim,) + stacking_dims})
+
+        # concatenate the arrays
+        stackable_data_vars = [ensure_stackable(self[key])
+                               for key in self.data_vars]
+        data_array = xr.concat(stackable_data_vars, dim=new_dim)
+
+        # coerce the levels of the MultiIndex to have the same type as the
+        # input dimensions. This code is messy, so it might be better to just
+        # input a dummy value for the singleton dimension.
+        idx = data_array.indexes[new_dim]
+        levels = [idx.levels[0]]\
+            + [level.astype(self[level.name].dtype)
+               for level in idx.levels[1:]]
+        new_idx = idx.set_levels(levels)
+        data_array[new_dim] = IndexVariable(new_dim, new_idx)
+
+        if name is not None:
+            data_array.name = name
+
+        return data_array
+
     def _unstack_once(self, dim):
         index = self.get_index(dim)
         # GH2619. For MultiIndex, we need to call remove_unused.

diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
@@ -58,6 +58,14 @@ def create_test_multiindex():
     return Dataset({}, {'x': mindex})
 
 
+def create_test_stacked_array():
+    """Return the pair ``(x * y, x * y * y)`` built from two DataArrays
+    wrapping integer indexes named 'x' (length 10) and 'y' (length 20)."""
+    x_index = pd.Index(np.arange(10), name='x')
+    y_index = pd.Index(np.arange(20), name='y')
+    x, y = DataArray(x_index), DataArray(y_index)
+    return x * y, x * y * y
+
+
 class InaccessibleVariableDataStore(backends.InMemoryDataStore):
     def __init__(self):
         super(InaccessibleVariableDataStore, self).__init__()
@@ -2397,6 +2405,61 @@ def test_stack_unstack_slow(self):
         actual = stacked.isel(z=slice(None, None, -1)).unstack('z')
         assert actual.identical(ds[['b']])
 
+    def test_to_stacked_array_invalid_sample_dims(self):
+        # 'b' only has dim 'x', so asking for 'y' as a sample dim must fail
+        variables = {'a': (('x', 'y'), [[0, 1, 2], [3, 4, 5]]),
+                     'b': ('x', [6, 7])}
+        ds = xr.Dataset(data_vars=variables, coords={'y': ['u', 'v', 'w']})
+        with pytest.raises(ValueError):
+            ds.to_stacked_array("features", sample_dims=['y'])
+
+    def test_to_stacked_array_name(self):
+        # the ``name`` argument should end up on the returned array
+        a, b = create_test_stacked_array()
+        ds = xr.Dataset({'a': a, 'b': b})
+        expected_name = 'adf9d'
+
+        stacked = ds.to_stacked_array('features', ['x'], name=expected_name)
+        assert stacked.name == expected_name
+
+    def test_to_stacked_array_dtype_dims(self):
+        # the second MultiIndex level keeps the dtype of ``y`` and the result
+        # has dims ('x', 'features')
+        a, b = create_test_stacked_array()
+        ds = xr.Dataset({'a': a, 'b': b})
+        stacked = ds.to_stacked_array('features', ['x'])
+        second_level = stacked.indexes['features'].levels[1]
+        assert second_level.dtype == ds.y.dtype
+        assert stacked.dims == ('x', 'features')
+
+    def test_to_stacked_array_to_unstacked_dataset(self):
+        # stacking followed by unstacking should give back the dataset
+        a, b = create_test_stacked_array()
+        ds = xr.Dataset({'a': a, 'b': b})
+        stacked = ds.to_stacked_array('features', ['x'])
+        stacked = stacked.transpose("x", "features")
+
+        # full round trip
+        roundtripped = stacked.to_unstacked_dataset("features")
+        assert_identical(ds, roundtripped)
+
+        # round trip of just one sample along x
+        first_sample = stacked[0].to_unstacked_dataset("features")
+        assert_identical(ds.isel(x=0), first_sample)
+
+    def test_to_stacked_array_to_unstacked_dataset_different_dimension(self):
+        # round trip when 'b' has fewer dims than 'a' (its y dim is selected
+        # away with isel)
+        a, b = create_test_stacked_array()
+        ds = xr.Dataset({'a': a, 'b': b.isel(y=0)})
+
+        stacked = ds.to_stacked_array('features', ['x'])
+        roundtripped = stacked.to_unstacked_dataset('features')
+        assert_identical(ds, roundtripped)
+
     def test_update(self):
         data = create_test_data(seed=0)
         expected = data.copy()