
CLN: Unify signatures in _libs.groupby #34372


Merged: 8 commits, Jun 18, 2020
Changes from all commits
pandas/_libs/groupby.pyx: 6 additions, 5 deletions
@@ -378,8 +378,8 @@ def group_fillna_indexer(ndarray[int64_t] out, ndarray[int64_t] labels,
@cython.boundscheck(False)
@cython.wraparound(False)
def group_any_all(uint8_t[:] out,
const int64_t[:] labels,
const uint8_t[:] values,
const int64_t[:] labels,
const uint8_t[:] mask,
object val_test,
bint skipna):
@@ -560,7 +560,8 @@ def _group_var(floating[:, :] out,
int64_t[:] counts,
floating[:, :] values,
const int64_t[:] labels,
Py_ssize_t min_count=-1):
Py_ssize_t min_count=-1,
int64_t ddof=1):
cdef:
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
floating val, ct, oldmean
@@ -600,10 +601,10 @@ def _group_var(floating[:, :] out,
for i in range(ncounts):
for j in range(K):
ct = nobs[i, j]
if ct < 2:
if ct <= ddof:
out[i, j] = NAN
else:
out[i, j] /= (ct - 1)
out[i, j] /= (ct - ddof)


group_var_float32 = _group_var['float']
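
The hunk above is what lets the variance kernel serve any degrees-of-freedom setting: the divisor is now count - ddof rather than a hard-coded count - 1, and any group with count <= ddof yields NaN. A minimal NumPy sketch of the same per-group logic (function name and example data are illustrative, not pandas code):

import numpy as np

def group_var_sketch(values, labels, ngroups, ddof=1):
    # Welford-style accumulation of count, mean, and sum of squared
    # deviations per group, mirroring what _group_var does per column.
    count = np.zeros(ngroups)
    mean = np.zeros(ngroups)
    m2 = np.zeros(ngroups)
    for val, lab in zip(values, labels):
        if lab < 0 or np.isnan(val):
            continue
        count[lab] += 1
        delta = val - mean[lab]
        mean[lab] += delta / count[lab]
        m2[lab] += delta * (val - mean[lab])
    out = np.full(ngroups, np.nan)
    keep = count > ddof              # ct <= ddof -> NaN, as in the diff
    out[keep] = m2[keep] / (count[keep] - ddof)
    return out

# ddof=1 gives the sample variance; a single-observation group becomes NaN.
print(group_var_sketch(np.array([1.0, 2.0, 3.0, 10.0]), np.array([0, 0, 0, 1]), 2))
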
@@ -715,8 +716,8 @@ group_ohlc_float64 = _group_ohlc['double']
@cython.boundscheck(False)
@cython.wraparound(False)
def group_quantile(ndarray[float64_t] out,
ndarray[int64_t] labels,
numeric[:] values,
ndarray[int64_t] labels,
ndarray[uint8_t] mask,
float64_t q,
object interpolation):
pandas/core/groupby/generic.py: 2 additions, 1 deletion
@@ -1736,7 +1736,8 @@ def _wrap_aggregated_output(
DataFrame
"""
indexed_output = {key.position: val for key, val in output.items()}
columns = Index(key.label for key in output)
name = self._obj_with_exclusions._get_axis(1 - self.axis).name
columns = Index([key.label for key in output], name=name)

result = self.obj._constructor(indexed_output)
result.columns = columns
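
The two-line change above threads the name of the non-grouped axis (the columns, for the default axis=0) onto the aggregated output's column Index. A hedged usage sketch of the behaviour this enables (the frame and names below are made up for illustration):

import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "x": [1, 2, 3], "y": [4.0, 5.0, 6.0]})
df.columns.name = "fields"          # name on the column axis

res = df.groupby("key").std()
print(res.columns.name)             # expected to be "fields" after this change
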
pandas/core/groupby/groupby.py: 50 additions, 14 deletions
@@ -1277,6 +1277,7 @@ def result_to_bool(result: np.ndarray, inference: Type) -> np.ndarray:
return self._get_cythonized_result(
"group_any_all",
aggregate=True,
numeric_only=False,
cython_dtype=np.dtype(np.uint8),
needs_values=True,
needs_mask=True,
@@ -1433,18 +1434,16 @@ def std(self, ddof: int = 1):
Series or DataFrame
Standard deviation of values within each group.
"""
result = self.var(ddof=ddof)
if result.ndim == 1:
result = np.sqrt(result)
else:
cols = result.columns.get_indexer_for(
result.columns.difference(self.exclusions).unique()
)
# TODO(GH-22046) - setting with iloc broken if labels are not unique
# .values to remove labels
result.iloc[:, cols] = np.sqrt(result.iloc[:, cols]).values

return result
return self._get_cythonized_result(
"group_var_float64",
aggregate=True,
needs_counts=True,
needs_values=True,
needs_2d=True,
cython_dtype=np.dtype(np.float64),
post_processing=lambda vals, inference: np.sqrt(vals),
ddof=ddof,
)

@Substitution(name="groupby")
@Appender(_common_see_also)
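
With ddof plumbed through to the Cython kernel, std becomes a single _get_cythonized_result call: the variance kernel runs with the requested ddof and post_processing applies the square root, so the old iloc workaround for non-unique labels is no longer needed. A rough consistency check one would expect to hold (illustrative data, not from the PR):

import numpy as np
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b", "b", "b"], "x": [1.0, 3.0, 2.0, 4.0, 6.0]})
g = df.groupby("key")["x"]

for ddof in (0, 1):
    # std should simply be the square root of the ddof-adjusted variance
    assert np.allclose(g.std(ddof=ddof), np.sqrt(g.var(ddof=ddof)))
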
@@ -1778,6 +1777,7 @@ def _fill(self, direction, limit=None):

return self._get_cythonized_result(
"group_fillna_indexer",
numeric_only=False,
needs_mask=True,
cython_dtype=np.dtype(np.int64),
result_is_index=True,
@@ -2078,6 +2078,7 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray:
return self._get_cythonized_result(
"group_quantile",
aggregate=True,
numeric_only=False,
needs_values=True,
needs_mask=True,
cython_dtype=np.dtype(np.float64),
@@ -2367,7 +2368,11 @@ def _get_cythonized_result(
how: str,
cython_dtype: np.dtype,
aggregate: bool = False,
numeric_only: bool = True,
needs_counts: bool = False,
needs_values: bool = False,
needs_2d: bool = False,
min_count: Optional[int] = None,
needs_mask: bool = False,
needs_ngroups: bool = False,
result_is_index: bool = False,
@@ -2386,9 +2391,18 @@
aggregate : bool, default False
Whether the result should be aggregated to match the number of
groups
numeric_only : bool, default True
Whether only numeric datatypes should be computed
needs_counts : bool, default False
Whether the counts should be a part of the Cython call
needs_values : bool, default False
Whether the values should be a part of the Cython call
signature
needs_2d : bool, default False
Whether the values and result of the Cython call signature
are at least 2-dimensional.
min_count : int, default None
When not None, min_count for the Cython call
needs_mask : bool, default False
Whether boolean mask needs to be part of the Cython call
signature
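
Taken together, these flags describe which positional arguments a kernel expects, in the single argument order this PR standardizes across _libs.groupby. A schematic of that convention (a sketch only, not an actual pandas function):

def group_kernel(out,        # result buffer; 2-D (n, 1) when needs_2d
                 counts,     # present only when needs_counts
                 values,     # present only when needs_values
                 labels,     # group labels, always passed
                 min_count,  # present only when min_count is not None
                 mask,       # present only when needs_mask
                 ngroups,    # present only when needs_ngroups
                 **kwargs):  # remaining keyword args, e.g. ddof or q
    ...
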
@@ -2418,7 +2432,7 @@ def _get_cythonized_result(
if result_is_index and aggregate:
raise ValueError("'result_is_index' and 'aggregate' cannot both be True!")
if post_processing:
if not callable(pre_processing):
if not callable(post_processing):
raise ValueError("'post_processing' must be a callable!")
if pre_processing:
if not callable(pre_processing):
@@ -2438,21 +2452,39 @@
name = obj.name
values = obj._values

if numeric_only and not is_numeric_dtype(values):
continue

if aggregate:
result_sz = ngroups
else:
result_sz = len(values)

result = np.zeros(result_sz, dtype=cython_dtype)
func = partial(base_func, result, labels)
if needs_2d:
Review comments on the needs_2d block:

Contributor:
can you use at_least2d or just reshape here?

Contributor:
can you do this here

Member Author (@rhshadrach, Jun 18, 2020):
I think you're asking to replace

if needs_2d:
    result = np.zeros((result_sz, 1), dtype=cython_dtype)
else:
    result = np.zeros(result_sz, dtype=cython_dtype)

with

result = np.zeros(result_sz, dtype=cython_dtype)
if needs_2d:
    result = result.reshape((-1, 1))

I think the reshape version is less performant when needs_2d is True, no?

Member Author:
I don't believe at_least2d is applicable; it will turn a 1d into a single row (1xn) whereas we need a column (nx1).

Contributor:

result = np.zeros(result_sz, dtype=cython_dtype)
if needs_2d:
    result = result.reshape((-1, 1))

yes, this would be an improvement
result = result.reshape((-1, 1))
func = partial(base_func, result)

inferences = None

if needs_counts:
counts = np.zeros(self.ngroups, dtype=np.int64)
func = partial(func, counts)

if needs_values:
vals = values
if pre_processing:
vals, inferences = pre_processing(vals)
if needs_2d:
vals = vals.reshape((-1, 1))
vals = vals.astype(cython_dtype, copy=False)
func = partial(func, vals)

func = partial(func, labels)

if min_count is not None:
func = partial(func, min_count)

if needs_mask:
mask = isna(values).view(np.uint8)
func = partial(func, mask)
@@ -2462,6 +2494,9 @@

func(**kwargs) # Call func to modify indexer values in place

if needs_2d:
result = result.reshape(-1)

if result_is_index:
result = algorithms.take_nd(values, result)
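
On the review question above about np.atleast_2d versus reshape: atleast_2d promotes a 1-D array to a single row, while the kernels want a single column, which is what reshape((-1, 1)) produces. A quick illustrative check (not part of the PR):

import numpy as np

a = np.zeros(4)
print(np.atleast_2d(a).shape)     # (1, 4): a single row
print(a.reshape((-1, 1)).shape)   # (4, 1): the column layout the kernels expect
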

@@ -2512,6 +2547,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):

return self._get_cythonized_result(
"group_shift_indexer",
numeric_only=False,
cython_dtype=np.dtype(np.int64),
needs_ngroups=True,
result_is_index=True,