From 5f8477f474c8f44ef495061737fe76d9c9e9373b Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 27 Mar 2020 19:58:06 -0500 Subject: [PATCH 01/14] Don't check isinstance --- pandas/core/dtypes/cast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index da9646aa8c46f..f2b97c46c232a 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -278,7 +278,7 @@ def maybe_cast_result( if is_extension_array_dtype(dtype) and dtype.kind != "M": # The result may be of any type, cast back to original # type if it's compatible. - if len(result) and isinstance(result[0], dtype.type): + if len(result): cls = dtype.construct_array_type() result = maybe_cast_to_extension_array(cls, result, dtype=dtype) From 359231ccedf21449806422ed8ef451a1d3b62195 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 27 Mar 2020 20:11:54 -0500 Subject: [PATCH 02/14] Fix categorical test --- pandas/tests/groupby/aggregate/test_aggregate.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index e860ea1a3d052..c415e1dd6200d 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -463,9 +463,12 @@ def test_agg_cython_category_not_implemented_fallback(): df = pd.DataFrame({"col_num": [1, 1, 2, 3]}) df["col_cat"] = df["col_num"].astype("category") - result = df.groupby("col_num").col_cat.first() + result = df.groupby("col_num")["col_cat"].first() expected = pd.Series( - [1, 2, 3], index=pd.Index([1, 2, 3], name="col_num"), name="col_cat" + [1, 2, 3], + dtype="category", + index=pd.Index([1, 2, 3], name="col_num"), + name="col_cat", ) tm.assert_series_equal(result, expected) From 5311de94f6134b5d15de50d4f5cbaff3af88912d Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 27 Mar 2020 20:12:18 -0500 Subject: [PATCH 03/14] Take out len --- pandas/core/dtypes/cast.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index f2b97c46c232a..379b5f0a8c5d3 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -278,9 +278,8 @@ def maybe_cast_result( if is_extension_array_dtype(dtype) and dtype.kind != "M": # The result may be of any type, cast back to original # type if it's compatible. - if len(result): - cls = dtype.construct_array_type() - result = maybe_cast_to_extension_array(cls, result, dtype=dtype) + cls = dtype.construct_array_type() + result = maybe_cast_to_extension_array(cls, result, dtype=dtype) elif numeric_only and is_numeric_dtype(dtype) or not numeric_only: result = maybe_downcast_to_dtype(result, dtype) From 0aa91f6c632973e0148bb7f925917ebd4913a3ae Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 27 Mar 2020 20:41:51 -0500 Subject: [PATCH 04/14] Special case categorical --- pandas/core/dtypes/cast.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 379b5f0a8c5d3..0813b9959e697 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -32,6 +32,7 @@ ensure_str, is_bool, is_bool_dtype, + is_categorical_dtype, is_complex, is_complex_dtype, is_datetime64_dtype, @@ -275,9 +276,13 @@ def maybe_cast_result( dtype = maybe_cast_result_dtype(dtype, how) if not is_scalar(result): - if is_extension_array_dtype(dtype) and dtype.kind != "M": - # The result may be of any type, cast back to original - # type if it's compatible. + if ( + is_extension_array_dtype(dtype) + and not is_categorical_dtype(dtype) + and dtype.kind != "M" + ): + # We have to special case categorical so as not to upcast + # things like counts back to categorical cls = dtype.construct_array_type() result = maybe_cast_to_extension_array(cls, result, dtype=dtype) From 8546e29775d5da984b8130901733c443eb887ef0 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 27 Mar 2020 20:42:20 -0500 Subject: [PATCH 05/14] Tests --- pandas/tests/groupby/test_function.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 9c33843cdcecc..4778c8b62524c 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1636,3 +1636,14 @@ def test_apply_to_nullable_integer_returns_float(values, function): result = groups.agg([function]) expected.columns = MultiIndex.from_tuples([("b", function)]) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("function", ["first", "last", "min", "max"]) +def test_aggregate_nullable_boolean_keeps_dtype(function): + df = pd.DataFrame({"a": [1, 2], "b": pd.array([True, False], dtype="boolean")}) + grouped = df.groupby("a")["b"] + result = getattr(grouped, function)() + expected = pd.Series( + [True, False], dtype="boolean", name="b", index=pd.Int64Index([1, 2], name="a") + ) + tm.assert_series_equal(result, expected) From cf29e3158c1876f2c6cc7142eadb88e4320d60ef Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 27 Mar 2020 20:42:43 -0500 Subject: [PATCH 06/14] Revert "Fix categorical test" This reverts commit 359231ccedf21449806422ed8ef451a1d3b62195. --- pandas/tests/groupby/aggregate/test_aggregate.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index c415e1dd6200d..e860ea1a3d052 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -463,12 +463,9 @@ def test_agg_cython_category_not_implemented_fallback(): df = pd.DataFrame({"col_num": [1, 1, 2, 3]}) df["col_cat"] = df["col_num"].astype("category") - result = df.groupby("col_num")["col_cat"].first() + result = df.groupby("col_num").col_cat.first() expected = pd.Series( - [1, 2, 3], - dtype="category", - index=pd.Index([1, 2, 3], name="col_num"), - name="col_cat", + [1, 2, 3], index=pd.Index([1, 2, 3], name="col_num"), name="col_cat" ) tm.assert_series_equal(result, expected) From 62de8355d34b762cff32c1103744232eab06c2cc Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 27 Mar 2020 20:47:19 -0500 Subject: [PATCH 07/14] Note --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 20415bba99476..7971169386120 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -436,7 +436,7 @@ Sparse ExtensionArray ^^^^^^^^^^^^^^ -- +- Fixed bug where :meth:`SeriesGroupBy.first`, :meth:`SeriesGroupBy.last`, :meth:`SeriesGroupBy.min`, and :meth:`SeriesGroupBy.max` would return floats when applied to nullable Booleans (:issue:`33071`) - From 89505ac7cb1012e0bf0894009fc8260035655f2b Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 27 Mar 2020 21:24:05 -0500 Subject: [PATCH 08/14] Hack test --- pandas/tests/resample/test_datetime_index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 3ad82b9e075a8..3602e347f7bb6 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -124,7 +124,7 @@ def test_resample_integerarray(): expected = Series( [1, 4, 7], index=pd.date_range("1/1/2000", periods=3, freq="3T"), - dtype="float64", + dtype="Int64", ) tm.assert_series_equal(result, expected) From a426f9d77ea18e748bb8fac4207a6e40cb8c3e2d Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Fri, 27 Mar 2020 22:01:17 -0500 Subject: [PATCH 09/14] Black --- pandas/tests/resample/test_datetime_index.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index 3602e347f7bb6..ab6985b11ba9a 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -122,9 +122,7 @@ def test_resample_integerarray(): result = ts.resample("3T").mean() expected = Series( - [1, 4, 7], - index=pd.date_range("1/1/2000", periods=3, freq="3T"), - dtype="Int64", + [1, 4, 7], index=pd.date_range("1/1/2000", periods=3, freq="3T"), dtype="Int64", ) tm.assert_series_equal(result, expected) From 85fe2508585d51c0e80337cfc6e6b52638e409d1 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sat, 28 Mar 2020 09:29:52 -0500 Subject: [PATCH 10/14] Add tests --- pandas/tests/groupby/test_function.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 4778c8b62524c..f72ce4a39211e 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1638,12 +1638,19 @@ def test_apply_to_nullable_integer_returns_float(values, function): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "values", + [ + pd.array([True, False], dtype="boolean"), + pd.array([1, 2], dtype="Int64"), + pd.to_datetime(["2020-01-01", "2020-02-01"]), + pd.TimedeltaIndex([1, 2], unit="D"), + ], +) @pytest.mark.parametrize("function", ["first", "last", "min", "max"]) -def test_aggregate_nullable_boolean_keeps_dtype(function): - df = pd.DataFrame({"a": [1, 2], "b": pd.array([True, False], dtype="boolean")}) +def test_aggregate_extension_array_keeps_dtype(values, function): + df = pd.DataFrame({"a": [1, 2], "b": values}) grouped = df.groupby("a")["b"] result = getattr(grouped, function)() - expected = pd.Series( - [True, False], dtype="boolean", name="b", index=pd.Int64Index([1, 2], name="a") - ) + expected = pd.Series(values, name="b", index=pd.Int64Index([1, 2], name="a")) tm.assert_series_equal(result, expected) From d55a74f780c090f2a510012d512e1bd502e52249 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sat, 28 Mar 2020 09:40:18 -0500 Subject: [PATCH 11/14] Include dict case --- pandas/tests/groupby/test_function.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index f72ce4a39211e..558926b4a63cf 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1650,7 +1650,13 @@ def test_apply_to_nullable_integer_returns_float(values, function): @pytest.mark.parametrize("function", ["first", "last", "min", "max"]) def test_aggregate_extension_array_keeps_dtype(values, function): df = pd.DataFrame({"a": [1, 2], "b": values}) - grouped = df.groupby("a")["b"] - result = getattr(grouped, function)() - expected = pd.Series(values, name="b", index=pd.Int64Index([1, 2], name="a")) + grouped = df.groupby("a") + idx = pd.Int64Index([1, 2], name="a") + + result = getattr(grouped["b"], function)() + expected = pd.Series(values, name="b", index=idx) tm.assert_series_equal(result, expected) + + result = grouped.agg({"b": function}) + expected = pd.DataFrame({"b": values}, index=idx) + tm.assert_frame_equal(result, expected) From c7e38b0e95c5d9aecfc9609b9be4413b4bd55acf Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sat, 28 Mar 2020 09:51:41 -0500 Subject: [PATCH 12/14] Issue numbers --- pandas/tests/groupby/test_function.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 558926b4a63cf..4c63e30e70c29 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1649,6 +1649,8 @@ def test_apply_to_nullable_integer_returns_float(values, function): ) @pytest.mark.parametrize("function", ["first", "last", "min", "max"]) def test_aggregate_extension_array_keeps_dtype(values, function): + # https://github.com/pandas-dev/pandas/issues/33071 + # https://github.com/pandas-dev/pandas/issues/32194 df = pd.DataFrame({"a": [1, 2], "b": values}) grouped = df.groupby("a") idx = pd.Int64Index([1, 2], name="a") From fc25876483eeb479bedc6d27f6d38f636705f385 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 29 Mar 2020 10:16:21 -0500 Subject: [PATCH 13/14] Move note --- doc/source/whatsnew/v1.1.0.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 93ab03df18ff3..9d605f6530e4c 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -404,6 +404,8 @@ Groupby/resample/rolling - Bug in :meth:`GroupBy.apply` raises ``ValueError`` when the ``by`` axis is not sorted and has duplicates and the applied ``func`` does not mutate passed in objects (:issue:`30667`) - Bug in :meth:`DataFrameGroupby.transform` produces incorrect result with transformation functions (:issue:`30918`) - Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` produces inconsistent type when aggregating Boolean series (:issue:`32894`) +- Bug in :meth:`SeriesGroupBy.first`, :meth:`SeriesGroupBy.last`, :meth:`SeriesGroupBy.min`, and :meth:`SeriesGroupBy.max` returning floats when applied to nullable Booleans (:issue:`33071`) +- Bug in :meth:`DataFrameGroupBy.agg` with dictionary input losing ``ExtensionArray`` dtypes (:issue:`32194`) Reshaping @@ -437,7 +439,6 @@ Sparse ExtensionArray ^^^^^^^^^^^^^^ -- Fixed bug where :meth:`SeriesGroupBy.first`, :meth:`SeriesGroupBy.last`, :meth:`SeriesGroupBy.min`, and :meth:`SeriesGroupBy.max` would return floats when applied to nullable Booleans (:issue:`33071`) - From db4fb5eb1e28a7aed1e47eba2a0a23beb778c3e3 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Sun, 29 Mar 2020 10:17:54 -0500 Subject: [PATCH 14/14] Update tests --- pandas/tests/groupby/test_function.py | 26 -------------------------- pandas/tests/groupby/test_nth.py | 26 ++++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 4c63e30e70c29..9c33843cdcecc 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1636,29 +1636,3 @@ def test_apply_to_nullable_integer_returns_float(values, function): result = groups.agg([function]) expected.columns = MultiIndex.from_tuples([("b", function)]) tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "values", - [ - pd.array([True, False], dtype="boolean"), - pd.array([1, 2], dtype="Int64"), - pd.to_datetime(["2020-01-01", "2020-02-01"]), - pd.TimedeltaIndex([1, 2], unit="D"), - ], -) -@pytest.mark.parametrize("function", ["first", "last", "min", "max"]) -def test_aggregate_extension_array_keeps_dtype(values, function): - # https://github.com/pandas-dev/pandas/issues/33071 - # https://github.com/pandas-dev/pandas/issues/32194 - df = pd.DataFrame({"a": [1, 2], "b": values}) - grouped = df.groupby("a") - idx = pd.Int64Index([1, 2], name="a") - - result = getattr(grouped["b"], function)() - expected = pd.Series(values, name="b", index=idx) - tm.assert_series_equal(result, expected) - - result = grouped.agg({"b": function}) - expected = pd.DataFrame({"b": values}, index=idx) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index b1476f1059d84..e1bc058508bee 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -384,6 +384,32 @@ def test_first_last_tz_multi_column(method, ts, alpha): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "values", + [ + pd.array([True, False], dtype="boolean"), + pd.array([1, 2], dtype="Int64"), + pd.to_datetime(["2020-01-01", "2020-02-01"]), + pd.to_timedelta([1, 2], unit="D"), + ], +) +@pytest.mark.parametrize("function", ["first", "last", "min", "max"]) +def test_first_last_extension_array_keeps_dtype(values, function): + # https://github.com/pandas-dev/pandas/issues/33071 + # https://github.com/pandas-dev/pandas/issues/32194 + df = DataFrame({"a": [1, 2], "b": values}) + grouped = df.groupby("a") + idx = Index([1, 2], name="a") + expected_series = Series(values, name="b", index=idx) + expected_frame = DataFrame({"b": values}, index=idx) + + result_series = getattr(grouped["b"], function)() + tm.assert_series_equal(result_series, expected_series) + + result_frame = grouped.agg({"b": function}) + tm.assert_frame_equal(result_frame, expected_frame) + + def test_nth_multi_index_as_expected(): # PR 9090, related to issue 8979 # test nth on MultiIndex