From 2a31c1ae9f37e3c63303ce789fb113360a0f01ff Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Mon, 15 Feb 2016 21:14:43 -0800 Subject: [PATCH] Fix converting a dataframe with categorical column and a multiindex Fixes GH737 --- doc/whats-new.rst | 4 +++- xarray/core/dataset.py | 2 +- xarray/test/test_dataset.py | 26 +++++++++++++++++++++++++- 3 files changed, 29 insertions(+), 3 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 8b034ec3a0a..e7c674da142 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -29,7 +29,7 @@ Bug fixes - Single dimension variables no longer transpose as part of a broader ``.transpose``. This behavior was causing ``pandas.PeriodIndex`` dimensions to lose their type (:issue:`749`) -- `~xarray.Dataset` labels remain as their native type on ``.to_dataset``. +- :py:class:`~xarray.Dataset` labels remain as their native type on ``.to_dataset``. Previously they were coerced to strings (:issue:`745`) - Fixed a bug where replacing a ``DataArray`` index coordinate would improperly align the coordinate (:issue:`725`). @@ -37,6 +37,8 @@ Bug fixes reindexing leads to NaN values (:issue:`738`). - ``Dataset.rename`` and ``DataArray.rename`` support the old and new names being the same (:issue:`724`). +- Fix :py:meth:`~xarray.Dataset.from_dataframe` for DataFrames with Categorical + column and a MultiIndex index (:issue:`737`). - Fixes to ensure xarray works properly after the upcoming pandas v0.18 and NumPy v1.11 releases. 
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index f650e21661a..d5f5e91b8a8 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1905,7 +1905,7 @@ def from_dataframe(cls, dataframe): shape = -1 for name, series in iteritems(dataframe): - data = series.values.reshape(shape) + data = np.asarray(series).reshape(shape) obj[name] = (dims, data) return obj diff --git a/xarray/test/test_dataset.py b/xarray/test/test_dataset.py index 928bfd23c00..1e5b93c243f 100644 --- a/xarray/test/test_dataset.py +++ b/xarray/test/test_dataset.py @@ -1722,7 +1722,6 @@ def test_to_and_from_dataframe(self): expected = Dataset({'A': DataArray([], dims=('index',))}) self.assertDatasetIdentical(expected, actual) - # regression test for GH278 # use int64 to ensure consistent results for the pandas .equals method # on windows (which requires the same dtype) @@ -1741,12 +1740,37 @@ def test_to_and_from_dataframe(self): expected = pd.DataFrame([[]], index=idx) assert expected.equals(actual), (expected, actual) + def test_from_dataframe_non_unique_columns(self): # regression test for GH449 df = pd.DataFrame(np.zeros((2, 2))) df.columns = ['foo', 'foo'] with self.assertRaisesRegexp(ValueError, 'non-unique columns'): Dataset.from_dataframe(df) + def test_convert_dataframe_with_many_types_and_multiindex(self): + # regression test for GH737 + df = pd.DataFrame({'a': list('abc'), + 'b': list(range(1, 4)), + 'c': np.arange(3, 6).astype('u1'), + 'd': np.arange(4.0, 7.0, dtype='float64'), + 'e': [True, False, True], + 'f': pd.Categorical(list('abc')), + 'g': pd.date_range('20130101', periods=3), + 'h': pd.date_range('20130101', + periods=3, + tz='US/Eastern')}) + df.index = pd.MultiIndex.from_product([['a'], range(3)], + names=['one', 'two']) + roundtripped = Dataset.from_dataframe(df).to_dataframe() + # we can't do perfectly, but we should be at least as faithful as + # np.asarray + expected = df.apply(np.asarray) + if pd.__version__ < '0.17': + # datetime with timezone 
dtype is not consistent on old pandas + roundtripped = roundtripped.drop(['h'], axis=1) + expected = expected.drop(['h'], axis=1) + assert roundtripped.equals(expected) + def test_pickle(self): data = create_test_data() roundtripped = pickle.loads(pickle.dumps(data))