From 42e864c525dfb4a0dd8e46b32ce3a53907b54fc3 Mon Sep 17 00:00:00 2001 From: Anna Daglis Date: Thu, 21 Jan 2021 18:23:55 +0000 Subject: [PATCH 1/4] BUG: SeriesGroupBy.value_counts raising error on an empty series --- doc/source/whatsnew/v1.3.0.rst | 1 + pandas/core/groupby/generic.py | 4 ++ pandas/tests/groupby/test_value_counts.py | 87 ++++++++++++++--------- 3 files changed, 59 insertions(+), 33 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index cbbba84da6ae6..973f95ebb3510 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -340,6 +340,7 @@ Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :meth:`SeriesGroupBy.value_counts` where unobserved categories in a grouped categorical series were not tallied (:issue:`38672`) +- Bug in :meth:`SeriesGroupBy.value_counts` where error was raised on an empty series (:issue:`39172`) - Bug in :meth:`.GroupBy.indices` would contain non-existent indices when null values were present in the groupby keys (:issue:`9304`) - Fixed bug in :meth:`DataFrameGroupBy.sum` and :meth:`SeriesGroupBy.sum` causing loss of precision through using Kahan summation (:issue:`38778`) - Fixed bug in :meth:`DataFrameGroupBy.cumsum`, :meth:`SeriesGroupBy.cumsum`, :meth:`DataFrameGroupBy.mean` and :meth:`SeriesGroupBy.mean` causing loss of precision through using Kahan summation (:issue:`38934`) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 50dbfe2596a77..ec59cdb475da9 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -731,10 +731,14 @@ def apply_series_value_counts(): # group boundaries are where group ids change idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]] + if len(ids) == 0: + idx = ids # new values are where sorted labels change lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1)) inc = np.r_[True, lchanges] + if len(lchanges) == 0: + inc = lchanges inc[idx] = True # group boundaries are also new values out = np.diff(np.nonzero(np.r_[inc, True])[0]) # value counts diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index afb648d8527ca..881073f116d2a 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -59,39 +59,39 @@ def seed_df(seed_nans, n, m): ids.append(f"{k}-{n}-{m}") -@pytest.mark.slow -@pytest.mark.parametrize("df, keys, bins, n, m", binned, ids=ids) -@pytest.mark.parametrize("isort", [True, False]) -@pytest.mark.parametrize("normalize", [True, False]) -@pytest.mark.parametrize("sort", [True, False]) -@pytest.mark.parametrize("ascending", [True, False]) -@pytest.mark.parametrize("dropna", [True, False]) -def test_series_groupby_value_counts( - df, keys, bins, n, m, isort, normalize, sort, ascending, dropna -): - def rebuild_index(df): - arr = list(map(df.index.get_level_values, range(df.index.nlevels))) - df.index = MultiIndex.from_arrays(arr, names=df.index.names) - return df - - kwargs = { - "normalize": normalize, - "sort": sort, - "ascending": ascending, - "dropna": dropna, - "bins": bins, - } - - gr = df.groupby(keys, sort=isort) - left = gr["3rd"].value_counts(**kwargs) - - gr = df.groupby(keys, sort=isort) - right = gr["3rd"].apply(Series.value_counts, **kwargs) - right.index.names = right.index.names[:-1] + ["3rd"] - - # have to sort on index because of unstable sort on values - left, right = map(rebuild_index, (left, right)) # xref GH9212 - tm.assert_series_equal(left.sort_index(), right.sort_index()) +# @pytest.mark.slow +# @pytest.mark.parametrize("df, keys, bins, n, m", binned, ids=ids) +# @pytest.mark.parametrize("isort", [True, False]) +# @pytest.mark.parametrize("normalize", [True, False]) +# @pytest.mark.parametrize("sort", [True, False]) +# @pytest.mark.parametrize("ascending", [True, False]) +# @pytest.mark.parametrize("dropna", [True, False]) +# def test_series_groupby_value_counts( +# df, keys, bins, n, m, isort, normalize, sort, ascending, dropna +# ): +# def rebuild_index(df): +# arr = list(map(df.index.get_level_values, range(df.index.nlevels))) +# df.index = MultiIndex.from_arrays(arr, names=df.index.names) +# return df +# +# kwargs = { +# "normalize": normalize, +# "sort": sort, +# "ascending": ascending, +# "dropna": dropna, +# "bins": bins, +# } +# +# gr = df.groupby(keys, sort=isort) +# left = gr["3rd"].value_counts(**kwargs) +# +# gr = df.groupby(keys, sort=isort) +# right = gr["3rd"].apply(Series.value_counts, **kwargs) +# right.index.names = right.index.names[:-1] + ["3rd"] +# +# # have to sort on index because of unstable sort on values +# left, right = map(rebuild_index, (left, right)) # xref GH9212 +# tm.assert_series_equal(left.sort_index(), right.sort_index()) def test_series_groupby_value_counts_with_grouper(): @@ -122,6 +122,27 @@ def test_series_groupby_value_counts_with_grouper(): tm.assert_series_equal(result, expected) +def test_series_groupby_value_counts_empty(): + # GH39172 + df = DataFrame(columns=["A", "B"]) + dfg = df.groupby("A") + + result = dfg["B"].value_counts() + expected = Series([], name="B", dtype=result.dtype) + expected.index = MultiIndex.from_arrays([[]] * 2, names=["A", "B"]) + + tm.assert_series_equal(result, expected) + + df = DataFrame(columns=["A", "B", "C"]) + dfg = df.groupby(["A", "B"]) + + result = dfg["C"].value_counts() + expected = Series([], name="C", dtype=result.dtype) + expected.index = MultiIndex.from_arrays([[]] * 3, names=["A", "B", "C"]) + + tm.assert_series_equal(result, expected) + + def test_series_groupby_value_counts_on_categorical(): # GH38672 From 466a100eeb73f8085da1b96c495f173dce150ec9 Mon Sep 17 00:00:00 2001 From: Anna Daglis Date: Thu, 21 Jan 2021 19:05:08 +0000 Subject: [PATCH 2/4] Fix bug --- pandas/tests/groupby/test_value_counts.py | 66 +++++++++++------------ 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/pandas/tests/groupby/test_value_counts.py b/pandas/tests/groupby/test_value_counts.py index 881073f116d2a..8bb07b7163f2e 100644 --- a/pandas/tests/groupby/test_value_counts.py +++ b/pandas/tests/groupby/test_value_counts.py @@ -59,39 +59,39 @@ def seed_df(seed_nans, n, m): ids.append(f"{k}-{n}-{m}") -# @pytest.mark.slow -# @pytest.mark.parametrize("df, keys, bins, n, m", binned, ids=ids) -# @pytest.mark.parametrize("isort", [True, False]) -# @pytest.mark.parametrize("normalize", [True, False]) -# @pytest.mark.parametrize("sort", [True, False]) -# @pytest.mark.parametrize("ascending", [True, False]) -# @pytest.mark.parametrize("dropna", [True, False]) -# def test_series_groupby_value_counts( -# df, keys, bins, n, m, isort, normalize, sort, ascending, dropna -# ): -# def rebuild_index(df): -# arr = list(map(df.index.get_level_values, range(df.index.nlevels))) -# df.index = MultiIndex.from_arrays(arr, names=df.index.names) -# return df -# -# kwargs = { -# "normalize": normalize, -# "sort": sort, -# "ascending": ascending, -# "dropna": dropna, -# "bins": bins, -# } -# -# gr = df.groupby(keys, sort=isort) -# left = gr["3rd"].value_counts(**kwargs) -# -# gr = df.groupby(keys, sort=isort) -# right = gr["3rd"].apply(Series.value_counts, **kwargs) -# right.index.names = right.index.names[:-1] + ["3rd"] -# -# # have to sort on index because of unstable sort on values -# left, right = map(rebuild_index, (left, right)) # xref GH9212 -# tm.assert_series_equal(left.sort_index(), right.sort_index()) +@pytest.mark.slow +@pytest.mark.parametrize("df, keys, bins, n, m", binned, ids=ids) +@pytest.mark.parametrize("isort", [True, False]) +@pytest.mark.parametrize("normalize", [True, False]) +@pytest.mark.parametrize("sort", [True, False]) +@pytest.mark.parametrize("ascending", [True, False]) +@pytest.mark.parametrize("dropna", [True, False]) +def test_series_groupby_value_counts( + df, keys, bins, n, m, isort, normalize, sort, ascending, dropna +): + def rebuild_index(df): + arr = list(map(df.index.get_level_values, range(df.index.nlevels))) + df.index = MultiIndex.from_arrays(arr, names=df.index.names) + return df + + kwargs = { + "normalize": normalize, + "sort": sort, + "ascending": ascending, + "dropna": dropna, + "bins": bins, + } + + gr = df.groupby(keys, sort=isort) + left = gr["3rd"].value_counts(**kwargs) + + gr = df.groupby(keys, sort=isort) + right = gr["3rd"].apply(Series.value_counts, **kwargs) + right.index.names = right.index.names[:-1] + ["3rd"] + + # have to sort on index because of unstable sort on values + left, right = map(rebuild_index, (left, right)) # xref GH9212 + tm.assert_series_equal(left.sort_index(), right.sort_index()) def test_series_groupby_value_counts_with_grouper(): From a16f99d8127e421db3e5558e814385fbcdc9dad6 Mon Sep 17 00:00:00 2001 From: Anna Daglis Date: Thu, 21 Jan 2021 20:40:00 +0000 Subject: [PATCH 3/4] Trigger From ee9ac9bb8b8e992ea9bf0905314a7eaa2fb870be Mon Sep 17 00:00:00 2001 From: Anna Daglis Date: Fri, 22 Jan 2021 19:39:02 +0000 Subject: [PATCH 4/4] Make sure the data type of idx is correct --- pandas/core/groupby/generic.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ec59cdb475da9..812b1470be9a7 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -730,14 +730,15 @@ def apply_series_value_counts(): ids, lab = ids[sorter], lab[sorter] # group boundaries are where group ids change - idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]] - if len(ids) == 0: - idx = ids + idchanges = 1 + np.nonzero(ids[1:] != ids[:-1])[0] + idx = np.r_[0, idchanges] + if not len(ids): + idx = idchanges # new values are where sorted labels change lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1)) inc = np.r_[True, lchanges] - if len(lchanges) == 0: + if not len(lchanges): inc = lchanges inc[idx] = True # group boundaries are also new values out = np.diff(np.nonzero(np.r_[inc, True])[0]) # value counts