Skip to content

Commit 2f46f73

Browse files
committed
BUG: SparseSeries.value_counts ignores fill_value
1 parent a959bd5 commit 2f46f73

File tree

8 files changed

+386
-73
lines changed

8 files changed

+386
-73
lines changed

doc/source/whatsnew/v0.18.1.txt

+4
Original file line numberDiff line numberDiff line change
@@ -197,6 +197,7 @@ These changes conform sparse handling to return the correct types and work to ma
197197
- Bug in ``SparseSeries`` and ``SparseArray`` may have different ``dtype`` from its dense values (:issue:`12908`)
198198
- Bug in ``SparseSeries.reindex`` where ``fill_value`` is handled incorrectly (:issue:`12797`)
199199
- Bug in ``SparseArray.to_frame()`` results in ``DataFrame``, rather than ``SparseDataFrame`` (:issue:`9850`)
200+
- Bug in ``SparseSeries.value_counts()`` where occurrences of ``fill_value`` are not counted (:issue:`6749`)
200201
- Bug in ``SparseArray.to_dense()`` does not preserve ``dtype`` (:issue:`10648`)
201202
- Bug in ``SparseArray.to_dense()`` where ``fill_value`` is handled incorrectly (:issue:`12797`)
202203
- Bug in ``pd.concat()`` of ``SparseSeries`` results in dense (:issue:`10536`)
@@ -474,6 +475,9 @@ Bug Fixes
474475

475476

476477
- Bug in ``value_counts`` when ``normalize=True`` and ``dropna=True`` where nulls still contributed to the normalized count (:issue:`12558`)
478+
- Bug in ``Series.value_counts()`` loses name if its dtype is category (:issue:`12835`)
479+
- Bug in ``Series.value_counts()`` loses timezone info (:issue:`12835`)
480+
- Bug in ``Series.value_counts(normalize=True)`` with ``Categorical`` raises ``UnboundLocalError`` (:issue:`12835`)
477481
- Bug in ``Panel.fillna()`` ignoring ``inplace=True`` (:issue:`12633`)
478482
- Bug in ``read_csv`` when specifying ``names``, ``usecols``, and ``parse_dates`` simultaneously with the C engine (:issue:`9755`)
479483
- Bug in ``read_csv`` when specifying ``delim_whitespace=True`` and ``lineterminator`` simultaneously with the C engine (:issue:`12912`)

pandas/core/algorithms.py

+60-51
Original file line numberDiff line numberDiff line change
@@ -254,7 +254,6 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
254254
"""
255255
from pandas.core.series import Series
256256
from pandas.tools.tile import cut
257-
from pandas import Index, PeriodIndex, DatetimeIndex
258257

259258
name = getattr(values, 'name', None)
260259
values = Series(values).values
@@ -266,71 +265,81 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
266265
raise TypeError("bins argument only works with numeric data.")
267266
values = cat.codes
268267

269-
if com.is_categorical_dtype(values.dtype):
270-
result = values.value_counts(dropna)
271-
268+
if com.is_extension_type(values):
269+
result = values.value_counts(dropna=dropna)
270+
result.name = name
271+
counts = result.values
272272
else:
273+
# ndarray path
274+
keys, counts = _value_counts_arraylike(values, dropna=dropna)
273275

274-
dtype = values.dtype
275-
is_period = com.is_period_arraylike(values)
276-
is_datetimetz = com.is_datetimetz(values)
276+
from pandas import Index
277+
if not isinstance(keys, Index):
278+
keys = Index(keys)
279+
result = Series(counts, index=keys, name=name)
277280

278-
if com.is_datetime_or_timedelta_dtype(dtype) or is_period or \
279-
is_datetimetz:
281+
if bins is not None:
282+
# TODO: This next line should be more efficient
283+
result = result.reindex(np.arange(len(cat.categories)),
284+
fill_value=0)
285+
result.index = bins[:-1]
280286

281-
if is_period:
282-
values = PeriodIndex(values)
283-
elif is_datetimetz:
284-
tz = getattr(values, 'tz', None)
285-
values = DatetimeIndex(values).tz_localize(None)
287+
if sort:
288+
result = result.sort_values(ascending=ascending)
286289

287-
values = values.view(np.int64)
288-
keys, counts = htable.value_count_scalar64(values, dropna)
290+
if normalize:
291+
result = result / float(counts.sum())
289292

290-
if dropna:
291-
msk = keys != iNaT
292-
keys, counts = keys[msk], counts[msk]
293+
return result
293294

294-
# localize to the original tz if necessary
295-
if is_datetimetz:
296-
keys = DatetimeIndex(keys).tz_localize(tz)
297295

298-
# convert the keys back to the dtype we came in
299-
else:
300-
keys = keys.astype(dtype)
296+
def _value_counts_arraylike(values, dropna=True):
    """
    Count the occurrences of each distinct value in an array-like.

    Parameters
    ----------
    values : array-like
        Must expose a ``dtype`` attribute; the dtype selects the counting
        path (datetime-like, integer, float, or object).
    dropna : boolean, default True
        Forwarded to the hashtable counters. On the datetime-like path,
        keys equal to ``iNaT`` are additionally removed when True. On the
        object path, a NaN key holding the null count is prepended when
        False and at least one null is present.

    Returns
    -------
    (keys, counts) : tuple
        The distinct values and their corresponding counts.
    """
    from pandas import PeriodIndex, DatetimeIndex

    dtype = values.dtype
    period_like = com.is_period_arraylike(values)
    tz_aware = com.is_datetimetz(values)

    datetime_like = (com.is_datetime_or_timedelta_dtype(dtype) or
                     period_like or tz_aware)

    if datetime_like:
        # NOTE(review): ``tz`` is only bound on the tz-aware branch; this
        # presumably relies on period-like and tz-aware being mutually
        # exclusive -- confirm.
        if period_like:
            values = PeriodIndex(values)
        elif tz_aware:
            tz = getattr(values, 'tz', None)
            values = DatetimeIndex(values).tz_localize(None)

        # count on the underlying int64 representation
        as_int64 = values.view(np.int64)
        keys, counts = htable.value_count_scalar64(as_int64, dropna)

        if dropna:
            keep = keys != iNaT
            keys = keys[keep]
            counts = counts[keep]

        if tz_aware:
            # localize to the original tz
            keys = DatetimeIndex(keys).tz_localize(tz)
        else:
            # convert the keys back to the dtype we came in with
            keys = keys.astype(dtype)

        return keys, counts

    if com.is_integer_dtype(dtype):
        as_int64 = com._ensure_int64(values)
        keys, counts = htable.value_count_scalar64(as_int64, dropna)
        return keys, counts

    if com.is_float_dtype(dtype):
        as_float64 = com._ensure_float64(values)
        keys, counts = htable.value_count_scalar64(as_float64, dropna)
        return keys, counts

    # everything else is counted as python objects, with nulls masked out
    values = com._ensure_object(values)
    null_mask = com.isnull(values)
    keys, counts = htable.value_count_object(values, null_mask)
    if not dropna and null_mask.any():
        keys = np.insert(keys, 0, np.NaN)
        counts = np.insert(counts, 0, null_mask.sum())

    return keys, counts
334343

335344

336345
def mode(values):

pandas/core/base.py

+15-11
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
from pandas.util.decorators import (Appender, cache_readonly,
1111
deprecate_kwarg, Substitution)
1212
from pandas.core.common import AbstractMethodError
13+
from pandas.types import api as gt
1314
from pandas.formats.printing import pprint_thing
1415

1516
_shared_docs = dict()
@@ -291,15 +292,15 @@ def name(self):
291292

292293
@property
293294
def _selection_list(self):
294-
if not isinstance(self._selection, (list, tuple, com.ABCSeries,
295-
com.ABCIndex, np.ndarray)):
295+
if not isinstance(self._selection, (list, tuple, gt.ABCSeries,
296+
gt.ABCIndex, np.ndarray)):
296297
return [self._selection]
297298
return self._selection
298299

299300
@cache_readonly
300301
def _selected_obj(self):
301302

302-
if self._selection is None or isinstance(self.obj, com.ABCSeries):
303+
if self._selection is None or isinstance(self.obj, gt.ABCSeries):
303304
return self.obj
304305
else:
305306
return self.obj[self._selection]
@@ -311,7 +312,7 @@ def ndim(self):
311312
@cache_readonly
312313
def _obj_with_exclusions(self):
313314
if self._selection is not None and isinstance(self.obj,
314-
com.ABCDataFrame):
315+
gt.ABCDataFrame):
315316
return self.obj.reindex(columns=self._selection_list)
316317

317318
if len(self.exclusions) > 0:
@@ -323,7 +324,7 @@ def __getitem__(self, key):
323324
if self._selection is not None:
324325
raise Exception('Column(s) %s already selected' % self._selection)
325326

326-
if isinstance(key, (list, tuple, com.ABCSeries, com.ABCIndex,
327+
if isinstance(key, (list, tuple, gt.ABCSeries, gt.ABCIndex,
327328
np.ndarray)):
328329
if len(self.obj.columns.intersection(key)) != len(key):
329330
bad_keys = list(set(key).difference(self.obj.columns))
@@ -551,7 +552,7 @@ def _agg(arg, func):
551552
if isinstance(result, list):
552553
result = concat(result, keys=keys, axis=1)
553554
elif isinstance(list(compat.itervalues(result))[0],
554-
com.ABCDataFrame):
555+
gt.ABCDataFrame):
555556
result = concat([result[k] for k in keys], keys=keys, axis=1)
556557
else:
557558
from pandas import DataFrame
@@ -940,17 +941,20 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
940941
counts : Series
941942
"""
942943
from pandas.core.algorithms import value_counts
943-
from pandas.tseries.api import DatetimeIndex, PeriodIndex
944944
result = value_counts(self, sort=sort, ascending=ascending,
945945
normalize=normalize, bins=bins, dropna=dropna)
946946

947-
if isinstance(self, PeriodIndex):
947+
if isinstance(self, gt.ABCPeriodIndex):
948948
# preserve freq
949949
result.index = self._simple_new(result.index.values,
950950
freq=self.freq)
951-
elif isinstance(self, DatetimeIndex):
952-
result.index = self._simple_new(result.index.values,
953-
tz=getattr(self, 'tz', None))
951+
elif com.is_datetimetz(self):
952+
if isinstance(self, gt.ABCDatetimeIndex):
953+
tz = self.tz
954+
else:
955+
tz = self.dt.tz
956+
result.index = result.index._simple_new(result.index.values,
957+
tz=tz)
954958
return result
955959

956960
def unique(self):

pandas/sparse/array.py

+38
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from numpy import nan, ndarray
88
import numpy as np
99

10+
import pandas as pd
1011
from pandas.core.base import PandasObject
1112
import pandas.core.common as com
1213

@@ -16,6 +17,7 @@
1617
from pandas._sparse import SparseIndex, BlockIndex, IntIndex
1718
import pandas._sparse as splib
1819
import pandas.index as _index
20+
import pandas.core.algorithms as algos
1921
import pandas.core.ops as ops
2022
import pandas.formats.printing as printing
2123
from pandas.util.decorators import Appender
@@ -503,6 +505,42 @@ def mean(self, axis=None, dtype=None, out=None):
503505
nsparse = self.sp_index.ngaps
504506
return (sp_sum + self.fill_value * nsparse) / (ct + nsparse)
505507

508+
def value_counts(self, dropna=True):
    """
    Returns a Series containing counts of unique values.

    The explicitly stored values (``sp_values``) are counted first; the
    number of gaps in the sparse index (``sp_index.ngaps``) is then
    credited to ``fill_value``.

    Parameters
    ----------
    dropna : boolean, default True
        Don't include counts of NaN, even if NaN is in sp_values.

    Returns
    -------
    counts : Series
        Indexed by the distinct values.
    """
    keys, counts = algos._value_counts_arraylike(self.sp_values,
                                                 dropna=dropna)
    gap_count = self.sp_index.ngaps

    # Gaps stand for ``fill_value``; they are ignored only when that fill
    # value is null and nulls are being dropped.
    if gap_count > 0 and not (self._null_fill_value and dropna):
        if self._null_fill_value:
            is_fill = pd.isnull(keys)
        else:
            is_fill = keys == self.fill_value

        if is_fill.any():
            # fill_value also occurs among the stored values: add the gaps
            counts[is_fill] += gap_count
        else:
            # fill_value not seen yet: prepend it as a new key
            keys = np.insert(keys, 0, self.fill_value)
            counts = np.insert(counts, 0, gap_count)

    index = keys if isinstance(keys, pd.Index) else pd.Index(keys)
    return pd.Series(counts, index=index)
543+
506544

507545
def _maybe_to_dense(obj):
508546
""" try to convert to dense """

0 commit comments

Comments
 (0)