Skip to content

Commit ae71dc1

Browse files
authored
REF: split describe categorical function (#39287)
1 parent 917b734 commit ae71dc1

File tree

1 file changed

+82
-71
lines changed

1 file changed

+82
-71
lines changed

pandas/core/describe.py

+82 -71
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@
66
from __future__ import annotations
77

88
from abc import ABC, abstractmethod
9-
from typing import TYPE_CHECKING, List, Optional, Sequence, Union, cast
9+
from typing import TYPE_CHECKING, Callable, List, Optional, Sequence, Union, cast
1010
import warnings
1111

1212
import numpy as np
@@ -113,12 +113,11 @@ class SeriesDescriber(NDFrameDescriberAbstract):
113113
obj: "Series"
114114

115115
def describe(self, percentiles: Sequence[float]) -> Series:
116-
return describe_1d(
116+
describe_func = select_describe_func(
117117
self.obj,
118-
percentiles=percentiles,
119-
datetime_is_numeric=self.datetime_is_numeric,
120-
is_series=True,
118+
self.datetime_is_numeric,
121119
)
120+
return describe_func(self.obj, percentiles)
122121

123122

124123
class DataFrameDescriber(NDFrameDescriberAbstract):
@@ -155,15 +154,10 @@ def __init__(
155154
def describe(self, percentiles: Sequence[float]) -> DataFrame:
156155
data = self._select_data()
157156

158-
ldesc = [
159-
describe_1d(
160-
series,
161-
percentiles=percentiles,
162-
datetime_is_numeric=self.datetime_is_numeric,
163-
is_series=False,
164-
)
165-
for _, series in data.items()
166-
]
157+
ldesc: List["Series"] = []
158+
for _, series in data.items():
159+
describe_func = select_describe_func(series, self.datetime_is_numeric)
160+
ldesc.append(describe_func(series, percentiles))
167161

168162
col_names = reorder_columns(ldesc)
169163
d = concat(
@@ -231,55 +225,73 @@ def describe_numeric_1d(series: "Series", percentiles: Sequence[float]) -> Serie
231225
return Series(d, index=stat_index, name=series.name)
232226

233227

234-
def describe_categorical_1d(data: "Series", is_series: bool) -> Series:
228+
def describe_categorical_1d(
229+
data: "Series",
230+
percentiles_ignored: Sequence[float],
231+
) -> Series:
235232
"""Describe series containing categorical data.
236233
237234
Parameters
238235
----------
239236
data : Series
240237
Series to be described.
241-
is_series : bool
242-
True if the original object is a Series.
243-
False if the one column of the DataFrame is described.
238+
percentiles_ignored : list-like of numbers
239+
Ignored, but in place to unify interface.
240+
"""
241+
names = ["count", "unique", "top", "freq"]
242+
objcounts = data.value_counts()
243+
count_unique = len(objcounts[objcounts != 0])
244+
if count_unique > 0:
245+
top, freq = objcounts.index[0], objcounts.iloc[0]
246+
dtype = None
247+
else:
248+
# If the Series has no counted values, set 'top' and 'freq' to NaN
249+
# to maintain output shape consistency
250+
top, freq = np.nan, np.nan
251+
dtype = "object"
252+
253+
result = [data.count(), count_unique, top, freq]
254+
255+
from pandas import Series
256+
257+
return Series(result, index=names, name=data.name, dtype=dtype)
258+
259+
260+
def describe_timestamp_as_categorical_1d(
261+
data: "Series",
262+
percentiles_ignored: Sequence[float],
263+
) -> Series:
264+
"""Describe series containing timestamp data treated as categorical.
265+
266+
Parameters
267+
----------
268+
data : Series
269+
Series to be described.
270+
percentiles_ignored : list-like of numbers
271+
Ignored, but in place to unify interface.
244272
"""
245273
names = ["count", "unique"]
246274
objcounts = data.value_counts()
247275
count_unique = len(objcounts[objcounts != 0])
248276
result = [data.count(), count_unique]
249277
dtype = None
250-
if result[1] > 0:
278+
if count_unique > 0:
251279
top, freq = objcounts.index[0], objcounts.iloc[0]
252-
if is_datetime64_any_dtype(data.dtype):
253-
if is_series:
254-
stacklevel = 6
255-
else:
256-
stacklevel = 7
257-
warnings.warn(
258-
"Treating datetime data as categorical rather than numeric in "
259-
"`.describe` is deprecated and will be removed in a future "
260-
"version of pandas. Specify `datetime_is_numeric=True` to "
261-
"silence this warning and adopt the future behavior now.",
262-
FutureWarning,
263-
stacklevel=stacklevel,
264-
)
265-
tz = data.dt.tz
266-
asint = data.dropna().values.view("i8")
267-
top = Timestamp(top)
268-
if top.tzinfo is not None and tz is not None:
269-
# Don't tz_localize(None) if key is already tz-aware
270-
top = top.tz_convert(tz)
271-
else:
272-
top = top.tz_localize(tz)
273-
names += ["top", "freq", "first", "last"]
274-
result += [
275-
top,
276-
freq,
277-
Timestamp(asint.min(), tz=tz),
278-
Timestamp(asint.max(), tz=tz),
279-
]
280+
tz = data.dt.tz
281+
asint = data.dropna().values.view("i8")
282+
top = Timestamp(top)
283+
if top.tzinfo is not None and tz is not None:
284+
# Don't tz_localize(None) if key is already tz-aware
285+
top = top.tz_convert(tz)
280286
else:
281-
names += ["top", "freq"]
282-
result += [top, freq]
287+
top = top.tz_localize(tz)
288+
names += ["top", "freq", "first", "last"]
289+
result += [
290+
top,
291+
freq,
292+
Timestamp(asint.min(), tz=tz),
293+
Timestamp(asint.max(), tz=tz),
294+
]
283295

284296
# If the DataFrame is empty, set 'top' and 'freq' to None
285297
# to maintain output shape consistency
@@ -317,41 +329,40 @@ def describe_timestamp_1d(data: "Series", percentiles: Sequence[float]) -> Serie
317329
return Series(d, index=stat_index, name=data.name)
318330

319331

320-
def describe_1d(
332+
def select_describe_func(
321333
data: "Series",
322-
percentiles: Sequence[float],
323334
datetime_is_numeric: bool,
324-
*,
325-
is_series: bool,
326-
) -> Series:
327-
"""Describe series.
335+
) -> Callable:
336+
"""Select proper function for describing series based on data type.
328337
329338
Parameters
330339
----------
331340
data : Series
332341
Series to be described.
333-
percentiles : list-like of numbers
334-
The percentiles to include in the output.
335-
datetime_is_numeric : bool, default False
342+
datetime_is_numeric : bool
336343
Whether to treat datetime dtypes as numeric.
337-
is_series : bool
338-
True if the original object is a Series.
339-
False if the one column of the DataFrame is described.
340-
341-
Returns
342-
-------
343-
Series
344344
"""
345345
if is_bool_dtype(data.dtype):
346-
return describe_categorical_1d(data, is_series)
346+
return describe_categorical_1d
347347
elif is_numeric_dtype(data):
348-
return describe_numeric_1d(data, percentiles)
349-
elif is_datetime64_any_dtype(data.dtype) and datetime_is_numeric:
350-
return describe_timestamp_1d(data, percentiles)
348+
return describe_numeric_1d
349+
elif is_datetime64_any_dtype(data.dtype):
350+
if datetime_is_numeric:
351+
return describe_timestamp_1d
352+
else:
353+
warnings.warn(
354+
"Treating datetime data as categorical rather than numeric in "
355+
"`.describe` is deprecated and will be removed in a future "
356+
"version of pandas. Specify `datetime_is_numeric=True` to "
357+
"silence this warning and adopt the future behavior now.",
358+
FutureWarning,
359+
stacklevel=5,
360+
)
361+
return describe_timestamp_as_categorical_1d
351362
elif is_timedelta64_dtype(data.dtype):
352-
return describe_numeric_1d(data, percentiles)
363+
return describe_numeric_1d
353364
else:
354-
return describe_categorical_1d(data, is_series)
365+
return describe_categorical_1d
355366

356367

357368
def refine_percentiles(percentiles: Optional[Sequence[float]]) -> Sequence[float]:

0 commit comments

Comments
 (0)