Skip to content

(wip) BUG: groupby with sort=False creates a buggy MultiIndex #32506

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 4 additions & 24 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1185,8 +1185,6 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
if len(keys) == 0:
return DataFrame(index=keys)

key_names = self.grouper.names

# GH12824.
def first_not_none(values):
try:
Expand All @@ -1203,27 +1201,9 @@ def first_not_none(values):
elif isinstance(v, DataFrame):
return self._concat_objects(keys, values, not_indexed_same=not_indexed_same)
elif self.grouper.groupings is not None:
if len(self.grouper.groupings) > 1:
key_index = self.grouper.result_index

else:
ping = self.grouper.groupings[0]
if len(keys) == ping.ngroups:
key_index = ping.group_index
key_index.name = key_names[0]

key_lookup = Index(keys)
indexer = key_lookup.get_indexer(key_index)

# reorder the values
values = [values[i] for i in indexer]
else:

key_index = Index(keys, name=key_names[0])

# don't use the key indexer
if not self.as_index:
key_index = None
key_index = self.grouper.result_index
if not self.as_index:
key_index = None

# make Nones an empty object
v = first_not_none(values)
Expand Down Expand Up @@ -1635,7 +1615,7 @@ def _gotitem(self, key, ndim: int, subset=None):
raise AssertionError("invalid ndim for _gotitem")

def _wrap_frame_output(self, result, obj) -> DataFrame:
result_index = self.grouper.levels[0]
result_index = self.grouper.result_index

if self.axis == 0:
return DataFrame(result, index=obj.columns, columns=result_index).T
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/groupby/grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -412,7 +412,7 @@ def _make_codes(self) -> None:
codes = self.grouper.codes_info
uniques = self.grouper.result_index
else:
codes, uniques = algorithms.factorize(self.grouper, sort=self.sort)
codes, uniques = algorithms.factorize(self.grouper, sort=True)
uniques = Index(uniques, name=self.name)
self._codes = codes
self._group_index = uniques
Expand Down
17 changes: 9 additions & 8 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,13 @@
from pandas.core.dtypes.missing import _maybe_fill, isna

import pandas.core.algorithms as algorithms
from pandas.core.arrays import Categorical
from pandas.core.base import SelectionMixin
import pandas.core.common as com
from pandas.core.frame import DataFrame
from pandas.core.generic import NDFrame
from pandas.core.groupby import base, grouper
from pandas.core.indexes.api import Index, MultiIndex, ensure_index
from pandas.core.indexes.api import CategoricalIndex, Index, MultiIndex, ensure_index
from pandas.core.series import Series
from pandas.core.sorting import (
compress_group_index,
Expand Down Expand Up @@ -141,7 +142,7 @@ def _get_grouper(self):

def _get_group_keys(self):
if len(self.groupings) == 1:
return self.levels[0]
return self.result_index
else:
comp_ids, _, ngroups = self.group_info

Expand Down Expand Up @@ -277,12 +278,13 @@ def codes_info(self) -> np.ndarray:
return codes

def _get_compressed_codes(self) -> Tuple[np.ndarray, np.ndarray]:
ping = self.groupings[0]
all_codes = self.codes
if len(all_codes) > 1:
if len(all_codes) > 1 or not isinstance(
ping.grouper, (Categorical, CategoricalIndex, BinGrouper)
):
Comment on lines +283 to +285
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I feel really uneasy about this special-casing; I'm just seeing if it works.

group_index = get_group_index(all_codes, self.shape, sort=True, xnull=True)
return compress_group_index(group_index, sort=self.sort)

ping = self.groupings[0]
return ping.codes, np.arange(len(ping.group_index))

@cache_readonly
Expand All @@ -297,14 +299,13 @@ def reconstructed_codes(self) -> List[np.ndarray]:

@cache_readonly
def result_index(self) -> Index:
if not self.compressed and len(self.groupings) == 1:
return self.groupings[0].result_index.rename(self.names[0])

codes = self.reconstructed_codes
levels = [ping.result_index for ping in self.groupings]
result = MultiIndex(
levels=levels, codes=codes, verify_integrity=False, names=self.names
)
if not self.compressed and len(self.groupings) == 1:
return result.get_level_values(0)
return result

def get_group_levels(self):
Expand Down
24 changes: 24 additions & 0 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2057,3 +2057,27 @@ def test_groups_repr_truncates(max_seq_items, expected):

result = df.groupby(np.array(df.a)).groups.__repr__()
assert result == expected


def test_sort_false_multiindex_lexsorted():
# GH 32259
# Checks that the MultiIndex produced by a two-key groupby reports
# is_lexsorted() consistently with the ``sort`` argument: not lexsorted
# for sort=False, lexsorted for sort=True.
#
# The dates are deliberately listed out of chronological order (and
# "2019-11-13" appears twice), so first-seen group order differs from
# sorted order.
d = pd.to_datetime(
[
"2020-11-02",
"2019-01-02",
"2020-01-02",
"2020-02-04",
"2020-11-03",
"2019-11-03",
"2019-11-13",
"2019-11-13",
]
)
# "a" is a distinct integer per row, so every (d, a) pair is its own group.
a = np.arange(len(d))
# "b" is the aggregated column; only the result's index is asserted on
# below, so its (random, unseeded) values are never checked.
b = np.random.rand(len(d))
df = pd.DataFrame({"d": d, "a": a, "b": b})
# sort=False: the resulting index must not claim to be lexsorted.
t = df.groupby(["d", "a"], sort=False).mean()
assert not t.index.is_lexsorted()

# sort=True: the resulting index must be lexsorted.
t = df.groupby(["d", "a"], sort=True).mean()
assert t.index.is_lexsorted()
10 changes: 3 additions & 7 deletions pandas/tests/groupby/test_grouping.py
Original file line number Diff line number Diff line change
Expand Up @@ -575,16 +575,12 @@ def test_groupby_args(self, mframe):
frame.groupby(by=None, level=None)

@pytest.mark.parametrize(
"sort,labels",
[
[True, [2, 2, 2, 0, 0, 1, 1, 3, 3, 3]],
[False, [0, 0, 0, 1, 1, 2, 2, 3, 3, 3]],
],
"sort", [True, False],
)
def test_level_preserve_order(self, sort, labels, mframe):
def test_level_preserve_order(self, sort, mframe):
# GH 17537
grouped = mframe.groupby(level=0, sort=sort)
exp_labels = np.array(labels, np.intp)
exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3], np.intp)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am deliberately changing the codes, so IMO it's right to change this test as well.

tm.assert_almost_equal(grouped.grouper.codes[0], exp_labels)

def test_grouping_labels(self, mframe):
Expand Down