|
10 | 10 | import pandas.core.common as com
|
11 | 11 | import pandas.algos as algos
|
12 | 12 | import pandas.hashtable as htable
|
| 13 | +from pandas.types import api as gt |
13 | 14 | from pandas.compat import string_types
|
14 | 15 | from pandas.tslib import iNaT
|
15 | 16 |
|
@@ -253,84 +254,101 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
|
253 | 254 |
|
254 | 255 | """
|
255 | 256 | from pandas.core.series import Series
|
256 |
| - from pandas.tools.tile import cut |
257 |
| - from pandas import Index, PeriodIndex, DatetimeIndex |
258 |
| - |
259 | 257 | name = getattr(values, 'name', None)
|
260 |
| - values = Series(values).values |
261 | 258 |
|
262 | 259 | if bins is not None:
|
263 | 260 | try:
|
| 261 | + from pandas.tools.tile import cut |
| 262 | + values = Series(values).values |
264 | 263 | cat, bins = cut(values, bins, retbins=True)
|
265 | 264 | except TypeError:
|
266 | 265 | raise TypeError("bins argument only works with numeric data.")
|
267 | 266 | values = cat.codes
|
268 | 267 |
|
269 |
| - if com.is_categorical_dtype(values.dtype): |
270 |
| - result = values.value_counts(dropna) |
271 |
| - |
| 268 | + if com.is_extension_type(values) and not com.is_datetimetz(values): |
| 269 | + # handle Categorical and sparse, |
| 270 | + # datetime tz can be handled in ndarray path |
| 271 | + result = Series(values).values.value_counts(dropna=dropna) |
| 272 | + result.name = name |
| 273 | + counts = result.values |
272 | 274 | else:
|
| 275 | + # ndarray path. pass original to handle DatetimeTzBlock |
| 276 | + keys, counts = _value_counts_arraylike(values, dropna=dropna) |
273 | 277 |
|
274 |
| - dtype = values.dtype |
275 |
| - is_period = com.is_period_arraylike(values) |
276 |
| - is_datetimetz = com.is_datetimetz(values) |
| 278 | + from pandas import Index, Series |
| 279 | + if not isinstance(keys, Index): |
| 280 | + keys = Index(keys) |
| 281 | + result = Series(counts, index=keys, name=name) |
277 | 282 |
|
278 |
| - if com.is_datetime_or_timedelta_dtype(dtype) or is_period or \ |
279 |
| - is_datetimetz: |
| 283 | + if bins is not None: |
| 284 | + # TODO: This next line should be more efficient |
| 285 | + result = result.reindex(np.arange(len(cat.categories)), |
| 286 | + fill_value=0) |
| 287 | + result.index = bins[:-1] |
280 | 288 |
|
281 |
| - if is_period: |
282 |
| - values = PeriodIndex(values) |
283 |
| - elif is_datetimetz: |
284 |
| - tz = getattr(values, 'tz', None) |
285 |
| - values = DatetimeIndex(values).tz_localize(None) |
| 289 | + if sort: |
| 290 | + result = result.sort_values(ascending=ascending) |
286 | 291 |
|
287 |
| - values = values.view(np.int64) |
288 |
| - keys, counts = htable.value_count_scalar64(values, dropna) |
| 292 | + if normalize: |
| 293 | + result = result / float(counts.sum()) |
289 | 294 |
|
290 |
| - if dropna: |
291 |
| - msk = keys != iNaT |
292 |
| - keys, counts = keys[msk], counts[msk] |
| 295 | + return result |
293 | 296 |
|
294 |
| - # localize to the original tz if necessary |
295 |
| - if is_datetimetz: |
296 |
| - keys = DatetimeIndex(keys).tz_localize(tz) |
297 | 297 |
|
298 |
| - # convert the keys back to the dtype we came in |
299 |
| - else: |
300 |
| - keys = keys.astype(dtype) |
| 298 | +def _value_counts_arraylike(values, dropna=True): |
| 299 | + is_datetimetz = com.is_datetimetz(values) |
| 300 | + is_period = (isinstance(values, gt.ABCPeriodIndex) or |
| 301 | + com.is_period_arraylike(values)) |
301 | 302 |
|
302 |
| - elif com.is_integer_dtype(dtype): |
303 |
| - values = com._ensure_int64(values) |
304 |
| - keys, counts = htable.value_count_scalar64(values, dropna) |
305 |
| - elif com.is_float_dtype(dtype): |
306 |
| - values = com._ensure_float64(values) |
307 |
| - keys, counts = htable.value_count_scalar64(values, dropna) |
| 303 | + orig = values |
308 | 304 |
|
309 |
| - else: |
310 |
| - values = com._ensure_object(values) |
311 |
| - mask = com.isnull(values) |
312 |
| - keys, counts = htable.value_count_object(values, mask) |
313 |
| - if not dropna and mask.any(): |
314 |
| - keys = np.insert(keys, 0, np.NaN) |
315 |
| - counts = np.insert(counts, 0, mask.sum()) |
| 305 | + from pandas.core.series import Series |
| 306 | + values = Series(values).values |
| 307 | + dtype = values.dtype |
316 | 308 |
|
317 |
| - if not isinstance(keys, Index): |
318 |
| - keys = Index(keys) |
319 |
| - result = Series(counts, index=keys, name=name) |
| 309 | + if com.is_datetime_or_timedelta_dtype(dtype) or is_period: |
| 310 | + from pandas.tseries.index import DatetimeIndex |
| 311 | + from pandas.tseries.period import PeriodIndex |
320 | 312 |
|
321 |
| - if bins is not None: |
322 |
| - # TODO: This next line should be more efficient |
323 |
| - result = result.reindex(np.arange(len(cat.categories)), |
324 |
| - fill_value=0) |
325 |
| - result.index = bins[:-1] |
| 313 | + if is_period: |
| 314 | + values = PeriodIndex(values) |
| 315 | + freq = values.freq |
326 | 316 |
|
327 |
| - if sort: |
328 |
| - result = result.sort_values(ascending=ascending) |
| 317 | + values = values.view(np.int64) |
| 318 | + keys, counts = htable.value_count_scalar64(values, dropna) |
329 | 319 |
|
330 |
| - if normalize: |
331 |
| - result = result / float(counts.sum()) |
| 320 | + if dropna: |
| 321 | + msk = keys != iNaT |
| 322 | + keys, counts = keys[msk], counts[msk] |
332 | 323 |
|
333 |
| - return result |
| 324 | + # convert the keys back to the dtype we came in |
| 325 | + keys = keys.astype(dtype) |
| 326 | + |
| 327 | + # dtype handling |
| 328 | + if is_datetimetz: |
| 329 | + if isinstance(orig, gt.ABCDatetimeIndex): |
| 330 | + tz = orig.tz |
| 331 | + else: |
| 332 | + tz = orig.dt.tz |
| 333 | + keys = DatetimeIndex._simple_new(keys, tz=tz) |
| 334 | + if is_period: |
| 335 | + keys = PeriodIndex._simple_new(keys, freq=freq) |
| 336 | + |
| 337 | + elif com.is_integer_dtype(dtype): |
| 338 | + values = com._ensure_int64(values) |
| 339 | + keys, counts = htable.value_count_scalar64(values, dropna) |
| 340 | + elif com.is_float_dtype(dtype): |
| 341 | + values = com._ensure_float64(values) |
| 342 | + keys, counts = htable.value_count_scalar64(values, dropna) |
| 343 | + else: |
| 344 | + values = com._ensure_object(values) |
| 345 | + mask = com.isnull(values) |
| 346 | + keys, counts = htable.value_count_object(values, mask) |
| 347 | + if not dropna and mask.any(): |
| 348 | + keys = np.insert(keys, 0, np.NaN) |
| 349 | + counts = np.insert(counts, 0, mask.sum()) |
| 350 | + |
| 351 | + return keys, counts |
334 | 352 |
|
335 | 353 |
|
336 | 354 | def mode(values):
|
|
0 commit comments