|
6 | 6 | from __future__ import annotations
|
7 | 7 |
|
8 | 8 | from abc import ABC, abstractmethod
|
9 |
| -from typing import TYPE_CHECKING, List, Optional, Sequence, Union, cast |
| 9 | +from typing import TYPE_CHECKING, Callable, List, Optional, Sequence, Union, cast |
10 | 10 | import warnings
|
11 | 11 |
|
12 | 12 | import numpy as np
|
@@ -113,12 +113,11 @@ class SeriesDescriber(NDFrameDescriberAbstract):
|
113 | 113 | obj: "Series"
|
114 | 114 |
|
def describe(self, percentiles: Sequence[float]) -> Series:
    """Describe the wrapped series.

    Parameters
    ----------
    percentiles : list-like of numbers
        Forwarded unchanged to the selected describe function.

    Returns
    -------
    Series
        Whatever the describe function chosen by ``select_describe_func``
        returns for ``self.obj``.
    """
    series = self.obj
    # The describing routine is picked from the series and the
    # ``datetime_is_numeric`` flag, then applied to the same series.
    describer = select_describe_func(series, self.datetime_is_numeric)
    return describer(series, percentiles)
122 | 121 |
|
123 | 122 |
|
124 | 123 | class DataFrameDescriber(NDFrameDescriberAbstract):
|
@@ -155,15 +154,10 @@ def __init__(
|
155 | 154 | def describe(self, percentiles: Sequence[float]) -> DataFrame:
|
156 | 155 | data = self._select_data()
|
157 | 156 |
|
158 |
| - ldesc = [ |
159 |
| - describe_1d( |
160 |
| - series, |
161 |
| - percentiles=percentiles, |
162 |
| - datetime_is_numeric=self.datetime_is_numeric, |
163 |
| - is_series=False, |
164 |
| - ) |
165 |
| - for _, series in data.items() |
166 |
| - ] |
| 157 | + ldesc: List["Series"] = [] |
| 158 | + for _, series in data.items(): |
| 159 | + describe_func = select_describe_func(series, self.datetime_is_numeric) |
| 160 | + ldesc.append(describe_func(series, percentiles)) |
167 | 161 |
|
168 | 162 | col_names = reorder_columns(ldesc)
|
169 | 163 | d = concat(
|
@@ -231,55 +225,73 @@ def describe_numeric_1d(series: "Series", percentiles: Sequence[float]) -> Serie
|
231 | 225 | return Series(d, index=stat_index, name=series.name)
|
232 | 226 |
|
233 | 227 |
|
234 |
| -def describe_categorical_1d(data: "Series", is_series: bool) -> Series: |
def describe_categorical_1d(
    data: "Series",
    percentiles_ignored: Sequence[float],
) -> Series:
    """Describe series containing categorical data.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles_ignored : list-like of numbers
        Ignored, but in place to unify interface.

    Returns
    -------
    Series
        Indexed by ``"count"``, ``"unique"``, ``"top"`` and ``"freq"`` and
        named after ``data``.
    """
    from pandas import Series

    counts = data.value_counts()
    # Entries with a zero count are not counted as unique values.
    n_unique = len(counts[counts != 0])

    if n_unique > 0:
        # 'top' and 'freq' come from the first entry of value_counts().
        top, freq = counts.index[0], counts.iloc[0]
        dtype = None
    else:
        # Nothing to report: fill 'top' and 'freq' with NaN and force object
        # dtype so the output keeps the same four rows.
        top = freq = np.nan
        dtype = "object"

    return Series(
        [data.count(), n_unique, top, freq],
        index=["count", "unique", "top", "freq"],
        name=data.name,
        dtype=dtype,
    )
| 258 | + |
| 259 | + |
| 260 | +def describe_timestamp_as_categorical_1d( |
| 261 | + data: "Series", |
| 262 | + percentiles_ignored: Sequence[float], |
| 263 | +) -> Series: |
| 264 | + """Describe series containing timestamp data treated as categorical. |
| 265 | +
|
| 266 | + Parameters |
| 267 | + ---------- |
| 268 | + data : Series |
| 269 | + Series to be described. |
| 270 | + percentiles_ignored : list-like of numbers |
| 271 | + Ignored, but in place to unify interface. |
244 | 272 | """
|
245 | 273 | names = ["count", "unique"]
|
246 | 274 | objcounts = data.value_counts()
|
247 | 275 | count_unique = len(objcounts[objcounts != 0])
|
248 | 276 | result = [data.count(), count_unique]
|
249 | 277 | dtype = None
|
250 |
| - if result[1] > 0: |
| 278 | + if count_unique > 0: |
251 | 279 | top, freq = objcounts.index[0], objcounts.iloc[0]
|
252 |
| - if is_datetime64_any_dtype(data.dtype): |
253 |
| - if is_series: |
254 |
| - stacklevel = 6 |
255 |
| - else: |
256 |
| - stacklevel = 7 |
257 |
| - warnings.warn( |
258 |
| - "Treating datetime data as categorical rather than numeric in " |
259 |
| - "`.describe` is deprecated and will be removed in a future " |
260 |
| - "version of pandas. Specify `datetime_is_numeric=True` to " |
261 |
| - "silence this warning and adopt the future behavior now.", |
262 |
| - FutureWarning, |
263 |
| - stacklevel=stacklevel, |
264 |
| - ) |
265 |
| - tz = data.dt.tz |
266 |
| - asint = data.dropna().values.view("i8") |
267 |
| - top = Timestamp(top) |
268 |
| - if top.tzinfo is not None and tz is not None: |
269 |
| - # Don't tz_localize(None) if key is already tz-aware |
270 |
| - top = top.tz_convert(tz) |
271 |
| - else: |
272 |
| - top = top.tz_localize(tz) |
273 |
| - names += ["top", "freq", "first", "last"] |
274 |
| - result += [ |
275 |
| - top, |
276 |
| - freq, |
277 |
| - Timestamp(asint.min(), tz=tz), |
278 |
| - Timestamp(asint.max(), tz=tz), |
279 |
| - ] |
| 280 | + tz = data.dt.tz |
| 281 | + asint = data.dropna().values.view("i8") |
| 282 | + top = Timestamp(top) |
| 283 | + if top.tzinfo is not None and tz is not None: |
| 284 | + # Don't tz_localize(None) if key is already tz-aware |
| 285 | + top = top.tz_convert(tz) |
280 | 286 | else:
|
281 |
| - names += ["top", "freq"] |
282 |
| - result += [top, freq] |
| 287 | + top = top.tz_localize(tz) |
| 288 | + names += ["top", "freq", "first", "last"] |
| 289 | + result += [ |
| 290 | + top, |
| 291 | + freq, |
| 292 | + Timestamp(asint.min(), tz=tz), |
| 293 | + Timestamp(asint.max(), tz=tz), |
| 294 | + ] |
283 | 295 |
|
284 | 296 | # If the DataFrame is empty, set 'top' and 'freq' to None
|
285 | 297 | # to maintain output shape consistency
|
@@ -317,41 +329,40 @@ def describe_timestamp_1d(data: "Series", percentiles: Sequence[float]) -> Serie
|
317 | 329 | return Series(d, index=stat_index, name=data.name)
|
318 | 330 |
|
319 | 331 |
|
320 |
| -def describe_1d( |
def select_describe_func(
    data: "Series",
    datetime_is_numeric: bool,
) -> Callable:
    """Select proper function for describing series based on data type.

    Parameters
    ----------
    data : Series
        Series to be described.
    datetime_is_numeric : bool
        Whether to treat datetime dtypes as numeric.

    Returns
    -------
    Callable
        One of ``describe_categorical_1d``, ``describe_numeric_1d``,
        ``describe_timestamp_1d`` or ``describe_timestamp_as_categorical_1d``.
        The function itself is returned; it is not called here.

    Warns
    -----
    FutureWarning
        When ``data`` has a datetime dtype and ``datetime_is_numeric`` is False.
    """
    dtype = data.dtype

    # Bool is tested before the numeric check and is described as categorical.
    if is_bool_dtype(dtype):
        return describe_categorical_1d

    if is_numeric_dtype(data):
        return describe_numeric_1d

    if is_datetime64_any_dtype(dtype):
        if datetime_is_numeric:
            return describe_timestamp_1d
        # NOTE(review): stacklevel=5 presumably targets the user's
        # `.describe` call -- confirm against the actual call chain.
        warnings.warn(
            "Treating datetime data as categorical rather than numeric in "
            "`.describe` is deprecated and will be removed in a future "
            "version of pandas. Specify `datetime_is_numeric=True` to "
            "silence this warning and adopt the future behavior now.",
            FutureWarning,
            stacklevel=5,
        )
        return describe_timestamp_as_categorical_1d

    # Timedelta data shares the numeric describer.
    if is_timedelta64_dtype(dtype):
        return describe_numeric_1d

    # Every other dtype is described as categorical.
    return describe_categorical_1d
355 | 366 |
|
356 | 367 |
|
357 | 368 | def refine_percentiles(percentiles: Optional[Sequence[float]]) -> Sequence[float]:
|
|
0 commit comments