REF: split describe categorical function (#39287)

ivanovmg · web-flow · commit ae71dc1c5a69 · 2021-01-20T09:52:45.000-05:00
diff --git a/pandas/core/describe.py b/pandas/core/describe.py
@@ -6,7 +6,7 @@
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, List, Optional, Sequence, Union, cast
+from typing import TYPE_CHECKING, Callable, List, Optional, Sequence, Union, cast
 import warnings
 
 import numpy as np
@@ -113,12 +113,11 @@ class SeriesDescriber(NDFrameDescriberAbstract):
     obj: "Series"
 
     def describe(self, percentiles: Sequence[float]) -> Series:
-        return describe_1d(
+        describe_func = select_describe_func(
             self.obj,
-            percentiles=percentiles,
-            datetime_is_numeric=self.datetime_is_numeric,
-            is_series=True,
+            self.datetime_is_numeric,
         )
+        return describe_func(self.obj, percentiles)
 
 
 class DataFrameDescriber(NDFrameDescriberAbstract):
@@ -155,15 +154,10 @@ def __init__(
     def describe(self, percentiles: Sequence[float]) -> DataFrame:
         data = self._select_data()
 
-        ldesc = [
-            describe_1d(
-                series,
-                percentiles=percentiles,
-                datetime_is_numeric=self.datetime_is_numeric,
-                is_series=False,
-            )
-            for _, series in data.items()
-        ]
+        ldesc: List["Series"] = []
+        for _, series in data.items():
+            describe_func = select_describe_func(series, self.datetime_is_numeric)
+            ldesc.append(describe_func(series, percentiles))
 
         col_names = reorder_columns(ldesc)
         d = concat(
@@ -231,55 +225,73 @@ def describe_numeric_1d(series: "Series", percentiles: Sequence[float]) -> Serie
     return Series(d, index=stat_index, name=series.name)
 
 
-def describe_categorical_1d(data: "Series", is_series: bool) -> Series:
+def describe_categorical_1d(
+    data: "Series",
+    percentiles_ignored: Sequence[float],
+) -> Series:
     """Describe series containing categorical data.
 
     Parameters
     ----------
     data : Series
         Series to be described.
-    is_series : bool
-        True if the original object is a Series.
-        False if the one column of the DataFrame is described.
+    percentiles_ignored : list-like of numbers
+        Ignored, but in place to unify interface.
+    """
+    names = ["count", "unique", "top", "freq"]
+    objcounts = data.value_counts()
+    count_unique = len(objcounts[objcounts != 0])
+    if count_unique > 0:
+        top, freq = objcounts.index[0], objcounts.iloc[0]
+        dtype = None
+    else:
+        # If the DataFrame is empty, set 'top' and 'freq' to None
+        # to maintain output shape consistency
+        top, freq = np.nan, np.nan
+        dtype = "object"
+
+    result = [data.count(), count_unique, top, freq]
+
+    from pandas import Series
+
+    return Series(result, index=names, name=data.name, dtype=dtype)
+
+
+def describe_timestamp_as_categorical_1d(
+    data: "Series",
+    percentiles_ignored: Sequence[float],
+) -> Series:
+    """Describe series containing timestamp data treated as categorical.
+
+    Parameters
+    ----------
+    data : Series
+        Series to be described.
+    percentiles_ignored : list-like of numbers
+        Ignored, but in place to unify interface.
     """
     names = ["count", "unique"]
     objcounts = data.value_counts()
     count_unique = len(objcounts[objcounts != 0])
     result = [data.count(), count_unique]
     dtype = None
-    if result[1] > 0:
+    if count_unique > 0:
         top, freq = objcounts.index[0], objcounts.iloc[0]
-        if is_datetime64_any_dtype(data.dtype):
-            if is_series:
-                stacklevel = 6
-            else:
-                stacklevel = 7
-            warnings.warn(
-                "Treating datetime data as categorical rather than numeric in "
-                "`.describe` is deprecated and will be removed in a future "
-                "version of pandas. Specify `datetime_is_numeric=True` to "
-                "silence this warning and adopt the future behavior now.",
-                FutureWarning,
-                stacklevel=stacklevel,
-            )
-            tz = data.dt.tz
-            asint = data.dropna().values.view("i8")
-            top = Timestamp(top)
-            if top.tzinfo is not None and tz is not None:
-                # Don't tz_localize(None) if key is already tz-aware
-                top = top.tz_convert(tz)
-            else:
-                top = top.tz_localize(tz)
-            names += ["top", "freq", "first", "last"]
-            result += [
-                top,
-                freq,
-                Timestamp(asint.min(), tz=tz),
-                Timestamp(asint.max(), tz=tz),
-            ]
+        tz = data.dt.tz
+        asint = data.dropna().values.view("i8")
+        top = Timestamp(top)
+        if top.tzinfo is not None and tz is not None:
+            # Don't tz_localize(None) if key is already tz-aware
+            top = top.tz_convert(tz)
         else:
-            names += ["top", "freq"]
-            result += [top, freq]
+            top = top.tz_localize(tz)
+        names += ["top", "freq", "first", "last"]
+        result += [
+            top,
+            freq,
+            Timestamp(asint.min(), tz=tz),
+            Timestamp(asint.max(), tz=tz),
+        ]
 
     # If the DataFrame is empty, set 'top' and 'freq' to None
     # to maintain output shape consistency
@@ -317,41 +329,40 @@ def describe_timestamp_1d(data: "Series", percentiles: Sequence[float]) -> Serie
     return Series(d, index=stat_index, name=data.name)
 
 
-def describe_1d(
+def select_describe_func(
     data: "Series",
-    percentiles: Sequence[float],
     datetime_is_numeric: bool,
-    *,
-    is_series: bool,
-) -> Series:
-    """Describe series.
+) -> Callable:
+    """Select proper function for describing series based on data type.
 
     Parameters
     ----------
     data : Series
         Series to be described.
-    percentiles : list-like of numbers
-        The percentiles to include in the output.
-    datetime_is_numeric : bool, default False
+    datetime_is_numeric : bool
         Whether to treat datetime dtypes as numeric.
-    is_series : bool
-        True if the original object is a Series.
-        False if the one column of the DataFrame is described.
-
-    Returns
-    -------
-    Series
     """
     if is_bool_dtype(data.dtype):
-        return describe_categorical_1d(data, is_series)
+        return describe_categorical_1d
     elif is_numeric_dtype(data):
-        return describe_numeric_1d(data, percentiles)
-    elif is_datetime64_any_dtype(data.dtype) and datetime_is_numeric:
-        return describe_timestamp_1d(data, percentiles)
+        return describe_numeric_1d
+    elif is_datetime64_any_dtype(data.dtype):
+        if datetime_is_numeric:
+            return describe_timestamp_1d
+        else:
+            warnings.warn(
+                "Treating datetime data as categorical rather than numeric in "
+                "`.describe` is deprecated and will be removed in a future "
+                "version of pandas. Specify `datetime_is_numeric=True` to "
+                "silence this warning and adopt the future behavior now.",
+                FutureWarning,
+                stacklevel=5,
+            )
+            return describe_timestamp_as_categorical_1d
     elif is_timedelta64_dtype(data.dtype):
-        return describe_numeric_1d(data, percentiles)
+        return describe_numeric_1d
     else:
-        return describe_categorical_1d(data, is_series)
+        return describe_categorical_1d
 
 
 def refine_percentiles(percentiles: Optional[Sequence[float]]) -> Sequence[float]: