
Commit faf471d

Merge pull request #7424 from hayd/value_counts_NaT
FIX value_counts should skip NaT
2 parents ad40979 + 26139f3 commit faf471d

6 files changed (+53, -24 lines changed)


doc/source/v0.14.1.txt

Lines changed: 2 additions & 2 deletions

@@ -83,7 +83,7 @@ Enhancements
+- Add ``dropna`` argument to ``value_counts`` and ``nunique`` (:issue:`5569`).

@@ -159,7 +159,7 @@ Bug Fixes
+- Bug in ``value_counts`` where ``NaT`` did not qualify as missing (``NaN``) (:issue:`7423`)

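As a quick illustration of the two entries above (a minimal sketch, assuming pandas >= 0.14.1; the sample data is made up for the example):

    import pandas as pd

    s = pd.Series(pd.to_datetime(['2014-01-01', '2014-01-01', 'NaT']))

    # NaT now counts as missing and is skipped by default ...
    s.value_counts()              # 2014-01-01    2
    # ... but can still be included via the new dropna argument
    s.value_counts(dropna=False)  # 2014-01-01    2, NaT    1

    s.nunique()                   # 1
    s.nunique(dropna=False)       # 2
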
pandas/core/algorithms.py

Lines changed: 14 additions & 6 deletions

@@ -168,7 +168,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):
 
 
 def value_counts(values, sort=True, ascending=False, normalize=False,
-                 bins=None):
+                 bins=None, dropna=True):
     """
     Compute a histogram of the counts of non-null values
 

@@ -184,6 +184,8 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
     bins : integer, optional
         Rather than count values, group them into half-open bins,
         convenience for pd.cut, only works with numeric data
+    dropna : boolean, default True
+        Don't include counts of NaN
 
     Returns
     -------

@@ -202,25 +204,31 @@ def value_counts(values, sort=True, ascending=False, normalize=False,
             raise TypeError("bins argument only works with numeric data.")
         values = cat.labels
 
-    if com.is_integer_dtype(values.dtype):
+    dtype = values.dtype
+    if com.is_integer_dtype(dtype):
         values = com._ensure_int64(values)
         keys, counts = htable.value_count_int64(values)
 
     elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
-        dtype = values.dtype
         values = values.view(np.int64)
         keys, counts = htable.value_count_int64(values)
 
+        if dropna:
+            from pandas.tslib import iNaT
+            msk = keys != iNaT
+            keys, counts = keys[msk], counts[msk]
         # convert the keys back to the dtype we came in
-        keys = Series(keys, dtype=dtype)
+        keys = keys.astype(dtype)
 
     else:
-        mask = com.isnull(values)
         values = com._ensure_object(values)
+        mask = com.isnull(values)
         keys, counts = htable.value_count_object(values, mask)
+        if not dropna:
+            keys = np.insert(keys, 0, np.NaN)
+            counts = np.insert(counts, 0, mask.sum())
 
     result = Series(counts, index=com._values_from_object(keys))
-
     if bins is not None:
         # TODO: This next line should be more efficient
         result = result.reindex(np.arange(len(cat.levels)), fill_value=0)

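The key point in the datetime/timedelta branch above is that, once the values are viewed as int64, NaT becomes the iNaT sentinel (the minimum int64), so dropping it is a single boolean mask over the keys. A rough standalone sketch of that idea, using np.unique in place of the internal htable.value_count_int64:

    import numpy as np

    values = np.array(['2014-01-01', 'NaT', '2014-01-01'], dtype='datetime64[ns]')

    ivalues = values.view(np.int64)   # datetimes as nanoseconds since the epoch
    inat = np.iinfo(np.int64).min     # the integer sentinel NaT maps to (iNaT)

    keys, counts = np.unique(ivalues, return_counts=True)

    dropna = True
    if dropna:
        msk = keys != inat            # drop the NaT bucket
        keys, counts = keys[msk], counts[msk]

    keys = keys.astype(values.dtype)  # convert the keys back to datetime64[ns]
    print(dict(zip(keys, counts)))    # one key, 2014-01-01, with a count of 2
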
pandas/core/base.py

Lines changed: 6 additions & 4 deletions

@@ -245,7 +245,7 @@ def min(self):
         return pandas.core.nanops.nanmin(self.values)
 
     def value_counts(self, normalize=False, sort=True, ascending=False,
-                     bins=None):
+                     bins=None, dropna=True):
         """
         Returns object containing counts of unique values. The resulting object
         will be in descending order so that the first element is the most

@@ -263,14 +263,16 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
         bins : integer, optional
             Rather than count values, group them into half-open bins,
             a convenience for pd.cut, only works with numeric data
+        dropna : boolean, default True
+            Don't include counts of NaN
 
         Returns
        -------
         counts : Series
         """
         from pandas.core.algorithms import value_counts
         return value_counts(self.values, sort=sort, ascending=ascending,
-                            normalize=normalize, bins=bins)
+                            normalize=normalize, bins=bins, dropna=dropna)
 
     def unique(self):
         """

@@ -284,15 +286,15 @@ def unique(self):
         from pandas.core.nanops import unique1d
         return unique1d(self.values)
 
-    def nunique(self):
+    def nunique(self, dropna=True):
         """
         Return count of unique elements in the object. Excludes NA values.
 
         Returns
         -------
         nunique : int
         """
-        return len(self.value_counts())
+        return len(self.value_counts(dropna=dropna))
 
     def factorize(self, sort=False, na_sentinel=-1):
         """

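Because value_counts and nunique are defined on this shared base class, the dropna argument is available on both Series and Index. A minimal sketch of the resulting behaviour for ordinary NaN values:

    import numpy as np
    import pandas as pd

    s = pd.Series([1, 2, 2, np.nan])

    s.value_counts()              # 2.0: 2, 1.0: 1          (NaN dropped by default)
    s.value_counts(dropna=False)  # 2.0: 2, 1.0: 1, NaN: 1
    s.nunique()                   # 2
    s.nunique(dropna=False)       # 3
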
pandas/tests/test_algos.py

Lines changed: 13 additions & 0 deletions

@@ -237,6 +237,19 @@ def test_value_counts_dtypes(self):
 
         self.assertRaises(TypeError, lambda s: algos.value_counts(s, bins=1), ['1', 1])
 
+    def test_value_counts_nat(self):
+        td = Series([np.timedelta64(10000), pd.NaT], dtype='timedelta64[ns]')
+        dt = pd.to_datetime(['NaT', '2014-01-01'])
+
+        for s in [td, dt]:
+            vc = algos.value_counts(s)
+            vc_with_na = algos.value_counts(s, dropna=False)
+            self.assertEqual(len(vc), 1)
+            self.assertEqual(len(vc_with_na), 2)
+
+        exp_dt = pd.Series({pd.Timestamp('2014-01-01 00:00:00'): 1})
+        tm.assert_series_equal(algos.value_counts(dt), exp_dt)
+        # TODO same for (timedelta)
 
 def test_quantile():
     s = Series(np.random.randn(100))

pandas/tests/test_base.py

Lines changed: 12 additions & 9 deletions

@@ -292,12 +292,13 @@ def test_value_counts_unique_nunique(self):
             o = klass(np.repeat(values, range(1, len(o) + 1)))
 
             if isinstance(o, DatetimeIndex):
-                # DatetimeIndex: nan is casted to Nat and included
-                expected_s = Series(list(range(10, 2, -1)) + [3], index=values[9:0:-1])
+                expected_s_na = Series(list(range(10, 2, -1)) + [3], index=values[9:0:-1])
+                expected_s = Series(list(range(10, 2, -1)), index=values[9:1:-1])
             else:
-                # nan is excluded
-                expected_s = Series(range(10, 2, -1), index=values[9:1:-1], dtype='int64')
+                expected_s_na = Series(list(range(10, 2, -1)) + [3], index=values[9:0:-1], dtype='int64')
+                expected_s = Series(list(range(10, 2, -1)), index=values[9:1:-1], dtype='int64')
 
+            tm.assert_series_equal(o.value_counts(dropna=False), expected_s_na)
             tm.assert_series_equal(o.value_counts(), expected_s)
 
             # numpy_array_equal cannot compare arrays includes nan

@@ -309,10 +310,8 @@ def test_value_counts_unique_nunique(self):
             else:
                 self.assertTrue(pd.isnull(result[0]))
 
-            if isinstance(o, DatetimeIndex):
-                self.assertEqual(o.nunique(), 9)
-            else:
-                self.assertEqual(o.nunique(), 8)
+            self.assertEqual(o.nunique(), 8)
+            self.assertEqual(o.nunique(dropna=False), 9)
 
     def test_value_counts_inferred(self):
         klasses = [Index, Series]

@@ -406,6 +405,9 @@ def test_value_counts_inferred(self):
 
         result = s.value_counts()
         self.assertEqual(result.index.dtype, 'datetime64[ns]')
+        tm.assert_series_equal(result, expected_s)
+
+        result = s.value_counts(dropna=False)
         expected_s[pd.NaT] = 1
         tm.assert_series_equal(result, expected_s)
 

@@ -415,7 +417,8 @@ def test_value_counts_inferred(self):
         self.assert_numpy_array_equal(unique[:3], expected)
         self.assertTrue(unique[3] is pd.NaT or unique[3].astype('int64') == pd.tslib.iNaT)
 
-        self.assertEqual(s.nunique(), 4)
+        self.assertEqual(s.nunique(), 3)
+        self.assertEqual(s.nunique(dropna=False), 4)
 
         # timedelta64[ns]
         td = df.dt - df.dt + timedelta(1)

pandas/tseries/tests/test_timeseries.py

Lines changed: 6 additions & 3 deletions

@@ -106,16 +106,19 @@ def test_index_unique(self):
         self.assertEqual(result.name, 'foo')
         self.assertTrue(result.equals(expected))
 
-        # NaT
+        # NaT, note this is excluded
         arr = [ 1370745748 + t for t in range(20) ] + [iNaT]
         idx = DatetimeIndex(arr * 3)
         self.assertTrue(idx.unique().equals(DatetimeIndex(arr)))
-        self.assertEqual(idx.nunique(), 21)
+        self.assertEqual(idx.nunique(), 20)
+        self.assertEqual(idx.nunique(dropna=False), 21)
 
         arr = [ Timestamp('2013-06-09 02:42:28') + timedelta(seconds=t) for t in range(20) ] + [NaT]
         idx = DatetimeIndex(arr * 3)
         self.assertTrue(idx.unique().equals(DatetimeIndex(arr)))
-        self.assertEqual(idx.nunique(), 21)
+        self.assertEqual(idx.nunique(), 20)
+        self.assertEqual(idx.nunique(dropna=False), 21)
+
 
     def test_index_dupes_contains(self):
         d = datetime(2011, 12, 5, 20, 30)

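The behaviour change these test updates exercise, shown with a small DatetimeIndex (a sketch, not part of the commit):

    import pandas as pd

    idx = pd.DatetimeIndex(['2013-06-09', '2013-06-10', pd.NaT] * 3)

    idx.unique()               # the two timestamps plus NaT
    idx.nunique()              # 2 -- NaT no longer counted
    idx.nunique(dropna=False)  # 3 -- previous behaviour, NaT counted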