Skip to content

Commit 8439d28

Browse files
sinhrks authored and jreback committed
BUG: SparseSeries.value_counts ignores fill_value
closes #6749 Author: sinhrks <[email protected]> Closes #12835 from sinhrks/sparse_valuecounts and squashes the following commits: 2392e7c [sinhrks] Move dtype handling to algorithm 2f46f73 [sinhrks] BUG: SparseSeries.value_counts ignores fill_value
1 parent 7bbd031 commit 8439d28

File tree

8 files changed

+413
-80
lines changed

8 files changed

+413
-80
lines changed

doc/source/whatsnew/v0.18.1.txt

+4
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,7 @@ These changes conform sparse handling to return the correct types and work to ma
259259
- Bug in ``SparseSeries`` and ``SparseArray`` may have different ``dtype`` from its dense values (:issue:`12908`)
260260
- Bug in ``SparseSeries.reindex`` incorrectly handle ``fill_value`` (:issue:`12797`)
261261
- Bug in ``SparseArray.to_frame()`` results in ``DataFrame``, rather than ``SparseDataFrame`` (:issue:`9850`)
262+
- Bug in ``SparseSeries.value_counts()`` does not count ``fill_value`` (:issue:`6749`)
262263
- Bug in ``SparseArray.to_dense()`` does not preserve ``dtype`` (:issue:`10648`)
263264
- Bug in ``SparseArray.to_dense()`` incorrectly handle ``fill_value`` (:issue:`12797`)
264265
- Bug in ``pd.concat()`` of ``SparseSeries`` results in dense (:issue:`10536`)
@@ -536,6 +537,9 @@ Bug Fixes
536537

537538

538539
- Bug in ``value_counts`` when ``normalize=True`` and ``dropna=True`` where nulls still contributed to the normalized count (:issue:`12558`)
540+
- Bug in ``Series.value_counts()`` loses name if its dtype is category (:issue:`12835`)
541+
- Bug in ``Series.value_counts()`` loses timezone info (:issue:`12835`)
542+
- Bug in ``Series.value_counts(normalize=True)`` with ``Categorical`` raises ``UnboundLocalError`` (:issue:`12835`)
539543
- Bug in ``Panel.fillna()`` ignoring ``inplace=True`` (:issue:`12633`)
540544
- Bug in ``read_csv`` when specifying ``names``, ``usecols``, and ``parse_dates`` simultaneously with the C engine (:issue:`9755`)
541545
- Bug in ``read_csv`` when specifying ``delim_whitespace=True`` and ``lineterminator`` simultaneously with the C engine (:issue:`12912`)

pandas/core/algorithms.py

+72-54
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import pandas.core.common as com
1111
import pandas.algos as algos
1212
import pandas.hashtable as htable
13+
from pandas.types import api as gt
1314
from pandas.compat import string_types
1415
from pandas.tslib import iNaT
1516

@@ -253,84 +254,101 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
253254
254255
"""
255256
from pandas.core.series import Series
256-
from pandas.tools.tile import cut
257-
from pandas import Index, PeriodIndex, DatetimeIndex
258-
259257
name = getattr(values, 'name', None)
260-
values = Series(values).values
261258

262259
if bins is not None:
263260
try:
261+
from pandas.tools.tile import cut
262+
values = Series(values).values
264263
cat, bins = cut(values, bins, retbins=True)
265264
except TypeError:
266265
raise TypeError("bins argument only works with numeric data.")
267266
values = cat.codes
268267

269-
if com.is_categorical_dtype(values.dtype):
270-
result = values.value_counts(dropna)
271-
268+
if com.is_extension_type(values) and not com.is_datetimetz(values):
269+
# handle Categorical and sparse,
270+
# datetime tz can be handeled in ndarray path
271+
result = Series(values).values.value_counts(dropna=dropna)
272+
result.name = name
273+
counts = result.values
272274
else:
275+
# ndarray path. pass original to handle DatetimeTzBlock
276+
keys, counts = _value_counts_arraylike(values, dropna=dropna)
273277

274-
dtype = values.dtype
275-
is_period = com.is_period_arraylike(values)
276-
is_datetimetz = com.is_datetimetz(values)
278+
from pandas import Index, Series
279+
if not isinstance(keys, Index):
280+
keys = Index(keys)
281+
result = Series(counts, index=keys, name=name)
277282

278-
if com.is_datetime_or_timedelta_dtype(dtype) or is_period or \
279-
is_datetimetz:
283+
if bins is not None:
284+
# TODO: This next line should be more efficient
285+
result = result.reindex(np.arange(len(cat.categories)),
286+
fill_value=0)
287+
result.index = bins[:-1]
280288

281-
if is_period:
282-
values = PeriodIndex(values)
283-
elif is_datetimetz:
284-
tz = getattr(values, 'tz', None)
285-
values = DatetimeIndex(values).tz_localize(None)
289+
if sort:
290+
result = result.sort_values(ascending=ascending)
286291

287-
values = values.view(np.int64)
288-
keys, counts = htable.value_count_scalar64(values, dropna)
292+
if normalize:
293+
result = result / float(counts.sum())
289294

290-
if dropna:
291-
msk = keys != iNaT
292-
keys, counts = keys[msk], counts[msk]
295+
return result
293296

294-
# localize to the original tz if necessary
295-
if is_datetimetz:
296-
keys = DatetimeIndex(keys).tz_localize(tz)
297297

298-
# convert the keys back to the dtype we came in
299-
else:
300-
keys = keys.astype(dtype)
298+
def _value_counts_arraylike(values, dropna=True):
299+
is_datetimetz = com.is_datetimetz(values)
300+
is_period = (isinstance(values, gt.ABCPeriodIndex) or
301+
com.is_period_arraylike(values))
301302

302-
elif com.is_integer_dtype(dtype):
303-
values = com._ensure_int64(values)
304-
keys, counts = htable.value_count_scalar64(values, dropna)
305-
elif com.is_float_dtype(dtype):
306-
values = com._ensure_float64(values)
307-
keys, counts = htable.value_count_scalar64(values, dropna)
303+
orig = values
308304

309-
else:
310-
values = com._ensure_object(values)
311-
mask = com.isnull(values)
312-
keys, counts = htable.value_count_object(values, mask)
313-
if not dropna and mask.any():
314-
keys = np.insert(keys, 0, np.NaN)
315-
counts = np.insert(counts, 0, mask.sum())
305+
from pandas.core.series import Series
306+
values = Series(values).values
307+
dtype = values.dtype
316308

317-
if not isinstance(keys, Index):
318-
keys = Index(keys)
319-
result = Series(counts, index=keys, name=name)
309+
if com.is_datetime_or_timedelta_dtype(dtype) or is_period:
310+
from pandas.tseries.index import DatetimeIndex
311+
from pandas.tseries.period import PeriodIndex
320312

321-
if bins is not None:
322-
# TODO: This next line should be more efficient
323-
result = result.reindex(np.arange(len(cat.categories)),
324-
fill_value=0)
325-
result.index = bins[:-1]
313+
if is_period:
314+
values = PeriodIndex(values)
315+
freq = values.freq
326316

327-
if sort:
328-
result = result.sort_values(ascending=ascending)
317+
values = values.view(np.int64)
318+
keys, counts = htable.value_count_scalar64(values, dropna)
329319

330-
if normalize:
331-
result = result / float(counts.sum())
320+
if dropna:
321+
msk = keys != iNaT
322+
keys, counts = keys[msk], counts[msk]
332323

333-
return result
324+
# convert the keys back to the dtype we came in
325+
keys = keys.astype(dtype)
326+
327+
# dtype handling
328+
if is_datetimetz:
329+
if isinstance(orig, gt.ABCDatetimeIndex):
330+
tz = orig.tz
331+
else:
332+
tz = orig.dt.tz
333+
keys = DatetimeIndex._simple_new(keys, tz=tz)
334+
if is_period:
335+
keys = PeriodIndex._simple_new(keys, freq=freq)
336+
337+
elif com.is_integer_dtype(dtype):
338+
values = com._ensure_int64(values)
339+
keys, counts = htable.value_count_scalar64(values, dropna)
340+
elif com.is_float_dtype(dtype):
341+
values = com._ensure_float64(values)
342+
keys, counts = htable.value_count_scalar64(values, dropna)
343+
else:
344+
values = com._ensure_object(values)
345+
mask = com.isnull(values)
346+
keys, counts = htable.value_count_object(values, mask)
347+
if not dropna and mask.any():
348+
keys = np.insert(keys, 0, np.NaN)
349+
counts = np.insert(counts, 0, mask.sum())
350+
351+
return keys, counts
334352

335353

336354
def mode(values):

pandas/core/base.py

+7-15
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from pandas.util.decorators import (Appender, cache_readonly,
1111
deprecate_kwarg, Substitution)
1212
from pandas.core.common import AbstractMethodError
13+
from pandas.types import api as gt
1314
from pandas.formats.printing import pprint_thing
1415

1516
_shared_docs = dict()
@@ -291,15 +292,15 @@ def name(self):
291292

292293
@property
293294
def _selection_list(self):
294-
if not isinstance(self._selection, (list, tuple, com.ABCSeries,
295-
com.ABCIndex, np.ndarray)):
295+
if not isinstance(self._selection, (list, tuple, gt.ABCSeries,
296+
gt.ABCIndex, np.ndarray)):
296297
return [self._selection]
297298
return self._selection
298299

299300
@cache_readonly
300301
def _selected_obj(self):
301302

302-
if self._selection is None or isinstance(self.obj, com.ABCSeries):
303+
if self._selection is None or isinstance(self.obj, gt.ABCSeries):
303304
return self.obj
304305
else:
305306
return self.obj[self._selection]
@@ -311,7 +312,7 @@ def ndim(self):
311312
@cache_readonly
312313
def _obj_with_exclusions(self):
313314
if self._selection is not None and isinstance(self.obj,
314-
com.ABCDataFrame):
315+
gt.ABCDataFrame):
315316
return self.obj.reindex(columns=self._selection_list)
316317

317318
if len(self.exclusions) > 0:
@@ -323,7 +324,7 @@ def __getitem__(self, key):
323324
if self._selection is not None:
324325
raise Exception('Column(s) %s already selected' % self._selection)
325326

326-
if isinstance(key, (list, tuple, com.ABCSeries, com.ABCIndex,
327+
if isinstance(key, (list, tuple, gt.ABCSeries, gt.ABCIndex,
327328
np.ndarray)):
328329
if len(self.obj.columns.intersection(key)) != len(key):
329330
bad_keys = list(set(key).difference(self.obj.columns))
@@ -551,7 +552,7 @@ def _agg(arg, func):
551552
if isinstance(result, list):
552553
result = concat(result, keys=keys, axis=1)
553554
elif isinstance(list(compat.itervalues(result))[0],
554-
com.ABCDataFrame):
555+
gt.ABCDataFrame):
555556
result = concat([result[k] for k in keys], keys=keys, axis=1)
556557
else:
557558
from pandas import DataFrame
@@ -940,17 +941,8 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
940941
counts : Series
941942
"""
942943
from pandas.core.algorithms import value_counts
943-
from pandas.tseries.api import DatetimeIndex, PeriodIndex
944944
result = value_counts(self, sort=sort, ascending=ascending,
945945
normalize=normalize, bins=bins, dropna=dropna)
946-
947-
if isinstance(self, PeriodIndex):
948-
# preserve freq
949-
result.index = self._simple_new(result.index.values,
950-
freq=self.freq)
951-
elif isinstance(self, DatetimeIndex):
952-
result.index = self._simple_new(result.index.values,
953-
tz=getattr(self, 'tz', None))
954946
return result
955947

956948
def unique(self):

pandas/sparse/array.py

+38
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from numpy import nan, ndarray
88
import numpy as np
99

10+
import pandas as pd
1011
from pandas.core.base import PandasObject
1112
import pandas.core.common as com
1213

@@ -16,6 +17,7 @@
1617
from pandas._sparse import SparseIndex, BlockIndex, IntIndex
1718
import pandas._sparse as splib
1819
import pandas.index as _index
20+
import pandas.core.algorithms as algos
1921
import pandas.core.ops as ops
2022
import pandas.formats.printing as printing
2123
from pandas.util.decorators import Appender
@@ -503,6 +505,42 @@ def mean(self, axis=None, dtype=None, out=None):
503505
nsparse = self.sp_index.ngaps
504506
return (sp_sum + self.fill_value * nsparse) / (ct + nsparse)
505507

508+
def value_counts(self, dropna=True):
509+
"""
510+
Returns a Series containing counts of unique values.
511+
512+
Parameters
513+
----------
514+
dropna : boolean, default True
515+
Don't include counts of NaN, even if NaN is in sp_values.
516+
517+
Returns
518+
-------
519+
counts : Series
520+
"""
521+
keys, counts = algos._value_counts_arraylike(self.sp_values,
522+
dropna=dropna)
523+
fcounts = self.sp_index.ngaps
524+
if fcounts > 0:
525+
if self._null_fill_value and dropna:
526+
pass
527+
else:
528+
if self._null_fill_value:
529+
mask = pd.isnull(keys)
530+
else:
531+
mask = keys == self.fill_value
532+
533+
if mask.any():
534+
counts[mask] += fcounts
535+
else:
536+
keys = np.insert(keys, 0, self.fill_value)
537+
counts = np.insert(counts, 0, fcounts)
538+
539+
if not isinstance(keys, pd.Index):
540+
keys = pd.Index(keys)
541+
result = pd.Series(counts, index=keys)
542+
return result
543+
506544

507545
def _maybe_to_dense(obj):
508546
""" try to convert to dense """

0 commit comments

Comments
 (0)