From 61ec6e05db477d37160b20330e55cf7c66180e28 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 27 Mar 2018 10:52:38 -0500 Subject: [PATCH 1/3] ENH: Support ExtensionArray in Groupby ```python In [1]: import pandas as pd In [2]: from cyberpandas import IPArray In [3]: df = pd.DataFrame({"A": IPArray([0, 0, 1, 2, 2]), "B": [1, 5, 1, 1, 3]}) In [4]: df Out[4]: A B 0 0.0.0.0 1 1 0.0.0.0 5 2 0.0.0.1 1 3 0.0.0.2 1 4 0.0.0.2 3 In [5]: df.groupby("A").B.mean() Out[5]: A 0.0.0.1 1 0.0.0.2 2 Name: B, dtype: int64 ``` --- pandas/core/groupby.py | 4 +- pandas/tests/extension/base/__init__.py | 1 + pandas/tests/extension/base/groupby.py | 69 +++++++++++++++++++ .../tests/extension/decimal/test_decimal.py | 4 ++ pandas/tests/extension/json/test_json.py | 26 +++++++ 5 files changed, 102 insertions(+), 2 deletions(-) create mode 100644 pandas/tests/extension/base/groupby.py diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 601acac20c96d..7c89cab6b1428 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -44,7 +44,7 @@ DataError, SpecificationError) from pandas.core.index import (Index, MultiIndex, CategoricalIndex, _ensure_index) -from pandas.core.arrays import Categorical +from pandas.core.arrays import ExtensionArray, Categorical from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame, _shared_docs from pandas.core.internals import BlockManager, make_block @@ -2968,7 +2968,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, # no level passed elif not isinstance(self.grouper, - (Series, Index, Categorical, np.ndarray)): + (Series, Index, ExtensionArray, np.ndarray)): if getattr(self.grouper, 'ndim', 1) != 1: t = self.name or str(type(self.grouper)) raise ValueError("Grouper for '%s' not 1-dimensional" % t) diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py index 27c106efd0524..f8078d2798b32 100644 --- a/pandas/tests/extension/base/__init__.py +++ b/pandas/tests/extension/base/__init__.py @@ -44,6 +44,7 @@ class TestMyDtype(BaseDtypeTests): from .constructors import BaseConstructorsTests # noqa from .dtype import BaseDtypeTests # noqa from .getitem import BaseGetitemTests # noqa +from .groupby import BaseGroupbyTests # noqa from .interface import BaseInterfaceTests # noqa from .methods import BaseMethodsTests # noqa from .missing import BaseMissingTests # noqa diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py new file mode 100644 index 0000000000000..a29ef2a509a63 --- /dev/null +++ b/pandas/tests/extension/base/groupby.py @@ -0,0 +1,69 @@ +import pytest + +import pandas.util.testing as tm +import pandas as pd +from .base import BaseExtensionTests + + +class BaseGroupbyTests(BaseExtensionTests): + """Groupby-specific tests.""" + + def test_grouping_grouper(self, data_for_grouping): + df = pd.DataFrame({ + "A": ["B", "B", None, None, "A", "A", "B", "C"], + "B": data_for_grouping + }) + gr1 = df.groupby("A").grouper.groupings[0] + gr2 = df.groupby("B").grouper.groupings[0] + + tm.assert_numpy_array_equal(gr1.grouper, df.A.values) + tm.assert_extension_array_equal(gr2.grouper, data_for_grouping) + + @pytest.mark.parametrize('as_index', [True, False]) + def test_groupby_extension_agg(self, as_index, data_for_grouping): + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], + "B": data_for_grouping}) + result = df.groupby("B", as_index=as_index).A.mean() + _, index = pd.factorize(data_for_grouping, sort=True) + # TODO(ExtensionIndex): remove astype + index = pd.Index(index.astype(object), name="B") + expected = pd.Series([3, 1, 4], index=index, name="A") + if as_index: + self.assert_series_equal(result, expected) + else: + expected = expected.reset_index() + self.assert_frame_equal(result, expected) + + def test_groupby_extension_no_sort(self, data_for_grouping): + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], + "B": data_for_grouping}) + result = df.groupby("B", sort=False).A.mean() + _, index = pd.factorize(data_for_grouping, sort=False) + # TODO(ExtensionIndex): remove astype + index = pd.Index(index.astype(object), name="B") + expected = pd.Series([1, 3, 4], index=index, name="A") + self.assert_series_equal(result, expected) + + def test_groupby_extension_transform(self, data_for_grouping): + valid = data_for_grouping[~data_for_grouping.isna()] + df = pd.DataFrame({"A": [1, 1, 3, 3, 1, 4], + "B": valid}) + + result = df.groupby("B").A.transform(len) + expected = pd.Series([3, 3, 2, 2, 3, 1], name="A") + + self.assert_series_equal(result, expected) + + @pytest.mark.parametrize('op', [ + lambda x: 1, + lambda x: [1] * len(x), + lambda x: pd.Series([1] * len(x)), + lambda x: x, + ], ids=['scalar', 'list', 'series', 'object']) + def test_groupby_extension_apply(self, data_for_grouping, op): + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], + "B": data_for_grouping}) + df.groupby("B").apply(op) + df.groupby("B").A.apply(op) + df.groupby("A").apply(op) + df.groupby("A").B.apply(op) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 22c1a67a0d60d..d509170565e1a 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -127,6 +127,10 @@ class TestCasting(BaseDecimal, base.BaseCastingTests): pass +class TestGroupby(BaseDecimal, base.BaseGroupbyTests): + pass + + def test_series_constructor_coerce_data_to_extension_dtype_raises(): xpr = ("Cannot cast data to extension dtype 'decimal'. Pass the " "extension array directly.") diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 63d97d5e7a2c5..a0416cbeb287c 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -126,3 +126,29 @@ def test_sort_values_missing(self, data_missing_for_sorting, ascending): class TestCasting(base.BaseCastingTests): pass + + +class TestGroupby(base.BaseGroupbyTests): + unhashable = pytest.mark.skip(reason="Unhashable") + incomparable = pytest.mark.skip(reason="Incomparable") + + @unhashable + def test_groupby_extension_transform(self): + """ + This currently fails in Series.name.setter, since the + name must be hashable, but the value is a dictionary. + I think this is what we want, i.e. `.name` should be the original + values, and not the values for factorization. + """ + + @unhashable + def test_groupby_extension_apply(self): + """ + This fails in Index._do_unique_check with + + > hash(val) + E TypeError: unhashable type: 'UserDict' with + + I suspect that once we support Index[ExtensionArray], + we'll be able to dispatch unique. + """ From e3fed38e2685e4f66038666f934422b591307831 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 27 Mar 2018 11:06:33 -0500 Subject: [PATCH 2/3] REF: Reuse in factorize --- pandas/tests/extension/json/array.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 51a68a3701046..d9ae49d87804a 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -113,8 +113,8 @@ def _concat_same_type(cls, to_concat): return cls(data) def _values_for_factorize(self): - frozen = tuple(tuple(x.items()) for x in self) - return np.array(frozen, dtype=object), () + frozen = self._values_for_argsort() + return frozen, () def _values_for_argsort(self): # Disable NumPy's shape inference by including an empty tuple... From 98a3a8530b4d25658c93e4994c415bd9f1146d54 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 27 Mar 2018 12:22:37 -0500 Subject: [PATCH 3/3] Test relies on ordered dictionaries --- pandas/tests/extension/json/test_json.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index a0416cbeb287c..5e9639c487c37 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -89,11 +89,12 @@ def test_fillna_frame(self): """We treat dictionaries as a mapping in fillna, not a scalar.""" -class TestMethods(base.BaseMethodsTests): - unhashable = pytest.mark.skip(reason="Unhashable") - unstable = pytest.mark.skipif(not PY36, # 3.6 or higher - reason="Dictionary order unstable") +unhashable = pytest.mark.skip(reason="Unhashable") +unstable = pytest.mark.skipif(not PY36, # 3.6 or higher + reason="Dictionary order unstable") + +class TestMethods(base.BaseMethodsTests): @unhashable def test_value_counts(self, all_data, dropna): pass @@ -118,6 +119,7 @@ def test_sort_values(self, data_for_sorting, ascending): super(TestMethods, self).test_sort_values( data_for_sorting, ascending) + @unstable @pytest.mark.parametrize('ascending', [True, False]) def test_sort_values_missing(self, data_missing_for_sorting, ascending): super(TestMethods, self).test_sort_values_missing( @@ -129,8 +131,6 @@ class TestCasting(base.BaseCastingTests): class TestGroupby(base.BaseGroupbyTests): - unhashable = pytest.mark.skip(reason="Unhashable") - incomparable = pytest.mark.skip(reason="Incomparable") @unhashable def test_groupby_extension_transform(self): @@ -152,3 +152,10 @@ def test_groupby_extension_apply(self): I suspect that once we support Index[ExtensionArray], we'll be able to dispatch unique. """ + + @unstable + @pytest.mark.parametrize('as_index', [True, False]) + def test_groupby_extension_agg(self, as_index, data_for_grouping): + super(TestGroupby, self).test_groupby_extension_agg( + as_index, data_for_grouping + )