-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
Inconsistencies between python/cython groupby.agg behavior #35928
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
4c5eddd
c632c9f
9e64be3
42649fb
47121dd
1decb3e
57c5dd3
a358463
ffa7ad7
e5e98d4
408db5a
d3493cf
75a805a
9f61070
2d10f6e
3e20187
51205a5
f453c5b
2ae2124
98a91a3
5f73b03
065fc69
c230f72
bf2e171
a4bcf43
607aea8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -75,7 +75,14 @@ | |
group_selection_context, | ||
) | ||
from pandas.core.groupby.numba_ import generate_numba_func, split_for_numba | ||
from pandas.core.indexes.api import Index, MultiIndex, all_indexes_same | ||
from pandas.core.indexes.api import ( | ||
DatetimeIndex, | ||
Index, | ||
MultiIndex, | ||
PeriodIndex, | ||
TimedeltaIndex, | ||
all_indexes_same, | ||
) | ||
import pandas.core.indexes.base as ibase | ||
from pandas.core.internals import BlockManager | ||
from pandas.core.series import Series | ||
|
@@ -257,17 +264,27 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) | |
if self.grouper.nkeys > 1: | ||
return self._python_agg_general(func, *args, **kwargs) | ||
|
||
try: | ||
return self._python_agg_general(func, *args, **kwargs) | ||
except (ValueError, KeyError): | ||
# TODO: KeyError is raised in _python_agg_general, | ||
# see see test_groupby.test_basic | ||
result = self._aggregate_named(func, *args, **kwargs) | ||
if isinstance( | ||
self._selected_obj.index, (DatetimeIndex, TimedeltaIndex, PeriodIndex) | ||
): | ||
# using _python_agg_general would end up incorrectly patching | ||
# _index_data in reduction.pyx | ||
result = self._aggregate_maybe_named(func, *args, **kwargs) | ||
else: | ||
try: | ||
return self._python_agg_general(func, *args, **kwargs) | ||
except (ValueError, KeyError): | ||
# TODO: KeyError is raised in _python_agg_general, | ||
# see see test_groupby.test_basic | ||
result = self._aggregate_maybe_named(func, *args, **kwargs) | ||
|
||
index = self.grouper.result_index | ||
assert index.name == self.grouper.names[0] | ||
|
||
index = Index(sorted(result), name=self.grouper.names[0]) | ||
ret = create_series_with_explicit_dtype( | ||
result, index=index, dtype_if_empty=object | ||
) | ||
ret.name = self._selected_obj.name # test_metadata_propagation_indiv | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you pass the name in L284 |
||
|
||
if not self.as_index: # pragma: no cover | ||
print("Warning, ignoring as_index=True") | ||
|
@@ -470,14 +487,34 @@ def _get_index() -> Index: | |
) | ||
return self._reindex_output(result) | ||
|
||
def _aggregate_named(self, func, *args, **kwargs): | ||
def _aggregate_maybe_named(self, func, *args, **kwargs): | ||
""" | ||
Try the named-aggregator first, then unnamed, which better matches | ||
what libreduction does. | ||
""" | ||
try: | ||
return self._aggregate_named(func, *args, named=True, **kwargs) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what is this actually doing? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. trying to track down an answer to why i did this, may take a bit There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK now i recall. in e.g. test_apply_columns_multilevel we do:
so the function we are passing to apply is But the user could also pass a function that depends on the non-patched Series name, so we end up guessing which regime we're in, which this is doing. If it were up to me I'd shoot this name-patching thing into the sun. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. grr, this is really complex. |
||
except KeyError: | ||
return self._aggregate_named(func, *args, named=False, **kwargs) | ||
|
||
def _aggregate_named(self, func, *args, named: bool = True, **kwargs): | ||
result = {} | ||
|
||
for name, group in self: | ||
group.name = name | ||
for name, group in self: # TODO: could we have duplicate names? | ||
if named: | ||
group.name = name | ||
|
||
output = func(group, *args, **kwargs) | ||
if isinstance(output, (Series, Index, np.ndarray)): | ||
raise ValueError("Must produce aggregated value") | ||
if ( | ||
isinstance(output, Series) | ||
and len(output) == 1 | ||
and name in output.index | ||
): | ||
# FIXME: kludge for test_resampler_grouper.test_apply | ||
output = output.iloc[0] | ||
else: | ||
raise ValueError("Must produce aggregated value") | ||
result[name] = output | ||
|
||
return result | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
i guess this would fail if we had a name that was NA