
Commit 19c3d40

API: replace dropna=False option with na_sentinel=None in factorize (#35852)
* remove \n from docstring
* fix issue 17038
* revert change
* revert change
* add dropna doc for factorize
* rephrase the doc
* flake8
* fixup
* use NaN
* add dropna in series.factorize
* black
* add test
* linting
* linting
* doct
* fix black
* fixup
* fix doctest
* add whatsnew
* linting
* fix test
* try one time
* hide dropna and use na_sentinel=None
* update whatsnew
* rename test function
* remove dropna from factorize
* update doc
* docstring
* update doc
* add comment
* code change on review
* update doc
* code change on review
* minor move in whatsnew
* add default example
* doc
* one more try
* explicit doc
* add space
1 parent 1fc244f commit 19c3d40

File tree

6 files changed (+66 / -41 lines)

doc/source/whatsnew/v1.1.2.rst (+8)

@@ -35,6 +35,14 @@ Bug fixes

 .. ---------------------------------------------------------------------------

+.. _whatsnew_112.other:
+
+Other
+~~~~~
+- :meth:`factorize` now supports ``na_sentinel=None`` to include NaN in the uniques of the values; the ``dropna`` keyword, which was unintentionally exposed in the public API of :meth:`factorize` in version 1.1, has been removed (:issue:`35667`)
+
+.. ---------------------------------------------------------------------------
+
 .. _whatsnew_112.contributors:

 Contributors
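
For context, a minimal illustration of the behavior change this entry describes (assuming pandas as of this commit, i.e. 1.1.2; later major versions deprecated ``na_sentinel`` itself in favor of ``use_na_sentinel``):

    import numpy as np
    import pandas as pd

    values = np.array(["a", np.nan, "b", "a"], dtype=object)

    # Default sentinel: NaN is coded as -1 and excluded from the uniques.
    codes, uniques = pd.factorize(values)
    print(codes)    # [ 0 -1  1  0]
    print(uniques)  # ['a' 'b']

    # na_sentinel=None: NaN receives its own code and appears in the uniques.
    codes, uniques = pd.factorize(values, na_sentinel=None)
    print(codes)    # [0 2 1 0]
    print(uniques)  # ['a' 'b' nan]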

pandas/core/algorithms.py (+29 / -4)

@@ -526,9 +526,8 @@ def _factorize_array(
 def factorize(
     values,
     sort: bool = False,
-    na_sentinel: int = -1,
+    na_sentinel: Optional[int] = -1,
     size_hint: Optional[int] = None,
-    dropna: bool = True,
 ) -> Tuple[np.ndarray, Union[np.ndarray, ABCIndex]]:
     """
     Encode the object as an enumerated type or categorical variable.

@@ -541,8 +540,11 @@ def factorize(
     Parameters
     ----------
     {values}{sort}
-    na_sentinel : int, default -1
-        Value to mark "not found".
+    na_sentinel : int or None, default -1
+        Value to mark "not found". If None, NaN is not dropped from the
+        uniques of the values.
+
+        .. versionchanged:: 1.1.2
     {size_hint}\

     Returns

@@ -620,6 +622,22 @@ def factorize(
     array([0, 0, 1]...)
     >>> uniques
     Index(['a', 'c'], dtype='object')
+
+    If NaN is in the values, and we want to include NaN in the uniques of
+    the values, set ``na_sentinel=None``.
+
+    >>> values = np.array([1, 2, 1, np.nan])
+    >>> codes, uniques = pd.factorize(values)  # default: na_sentinel=-1
+    >>> codes
+    array([ 0,  1,  0, -1])
+    >>> uniques
+    array([1., 2.])
+
+    >>> codes, uniques = pd.factorize(values, na_sentinel=None)
+    >>> codes
+    array([0, 1, 0, 2])
+    >>> uniques
+    array([ 1.,  2., nan])
     """
     # Implementation notes: This method is responsible for 3 things
     # 1.) coercing data to array-like (ndarray, Index, extension array)

@@ -633,6 +651,13 @@ def factorize(
     values = _ensure_arraylike(values)
     original = values

+    # GH35667: if na_sentinel=None, do not drop NaN from the uniques of the
+    # values; reassign na_sentinel=-1 internally so NaN can still be coded.
+    dropna = True
+    if na_sentinel is None:
+        na_sentinel = -1
+        dropna = False
+
     if is_extension_array_dtype(values.dtype):
         values = extract_array(values)
         codes, uniques = values.factorize(na_sentinel=na_sentinel)
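
The last hunk maps the public ``na_sentinel=None`` onto the pre-existing internal ``dropna`` flag. A standalone sketch of that translation, with an illustrative helper name that is not part of pandas itself:

    from typing import Optional, Tuple


    def resolve_na_sentinel(na_sentinel: Optional[int]) -> Tuple[int, bool]:
        """Translate the public na_sentinel argument into the internal pair
        (sentinel code used while factorizing, whether NaN is dropped from
        the uniques)."""
        if na_sentinel is None:
            # Keep NaN in the uniques; -1 still serves as the placeholder
            # code for NaN until it is assigned a real code.
            return -1, False
        return na_sentinel, True


    print(resolve_na_sentinel(-1))    # (-1, True):  default behavior
    print(resolve_na_sentinel(None))  # (-1, False): keep NaN in uniques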

pandas/core/base.py (+1 / -1)

@@ -1398,7 +1398,7 @@ def memory_usage(self, deep=False):
         """
         ),
     )
-    def factorize(self, sort=False, na_sentinel=-1):
+    def factorize(self, sort: bool = False, na_sentinel: Optional[int] = -1):
         return algorithms.factorize(self, sort=sort, na_sentinel=na_sentinel)

     _shared_docs[
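
Because this mixin method simply forwards to ``algorithms.factorize``, the new keyword works identically on a ``Series`` or ``Index``. A quick usage sketch mirroring the test added below in pandas/tests/base/test_factorize.py:

    import numpy as np
    import pandas as pd

    ser = pd.Series([1, 2, 1, np.nan])
    codes, uniques = ser.factorize(na_sentinel=None)
    print(codes)    # [0 1 0 2]
    print(uniques)  # an Index containing [1.0, 2.0, nan]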

pandas/core/groupby/grouper.py (+6 / -1)

@@ -587,8 +587,13 @@ def _make_codes(self) -> None:
             codes = self.grouper.codes_info
             uniques = self.grouper.result_index
         else:
+            # GH35667, replace dropna=False with na_sentinel=None
+            if not self.dropna:
+                na_sentinel = None
+            else:
+                na_sentinel = -1
             codes, uniques = algorithms.factorize(
-                self.grouper, sort=self.sort, dropna=self.dropna
+                self.grouper, sort=self.sort, na_sentinel=na_sentinel
             )
             uniques = Index(uniques, name=self.name)
             self._codes = codes
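
This grouper change is what backs the user-facing ``groupby(..., dropna=False)`` introduced in pandas 1.1: the ``dropna`` flag is now translated to ``na_sentinel=None`` before the group keys are factorized. A short demonstration of the observable effect:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"key": ["a", np.nan, "b", "a"], "val": [1, 2, 3, 4]})

    # dropna=True (default): rows with a NaN key are excluded from groups.
    print(df.groupby("key")["val"].sum())                 # a -> 5, b -> 3

    # dropna=False: NaN forms its own group (na_sentinel=None internally).
    print(df.groupby("key", dropna=False)["val"].sum())   # a -> 5, b -> 3, NaN -> 2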

pandas/tests/base/test_factorize.py (+13)

@@ -26,3 +26,16 @@ def test_factorize(index_or_series_obj, sort):

     tm.assert_numpy_array_equal(result_codes, expected_codes)
     tm.assert_index_equal(result_uniques, expected_uniques)
+
+
+def test_series_factorize_na_sentinel_none():
+    # GH35667
+    values = np.array([1, 2, 1, np.nan])
+    ser = pd.Series(values)
+    codes, uniques = ser.factorize(na_sentinel=None)
+
+    expected_codes = np.array([0, 1, 0, 2], dtype="int64")
+    expected_uniques = pd.Index([1.0, 2.0, np.nan])
+
+    tm.assert_numpy_array_equal(codes, expected_codes)
+    tm.assert_index_equal(uniques, expected_uniques)

pandas/tests/test_algos.py (+9 / -35)

@@ -340,73 +340,47 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques):
         tm.assert_extension_array_equal(uniques, expected_uniques)

     @pytest.mark.parametrize(
-        "data, dropna, expected_codes, expected_uniques",
+        "data, expected_codes, expected_uniques",
         [
             (
                 ["a", None, "b", "a"],
-                True,
-                np.array([0, -1, 1, 0], dtype=np.dtype("intp")),
-                np.array(["a", "b"], dtype=object),
-            ),
-            (
-                ["a", np.nan, "b", "a"],
-                True,
-                np.array([0, -1, 1, 0], dtype=np.dtype("intp")),
-                np.array(["a", "b"], dtype=object),
-            ),
-            (
-                ["a", None, "b", "a"],
-                False,
                 np.array([0, 2, 1, 0], dtype=np.dtype("intp")),
                 np.array(["a", "b", np.nan], dtype=object),
             ),
             (
                 ["a", np.nan, "b", "a"],
-                False,
                 np.array([0, 2, 1, 0], dtype=np.dtype("intp")),
                 np.array(["a", "b", np.nan], dtype=object),
             ),
         ],
     )
-    def test_object_factorize_dropna(
-        self, data, dropna, expected_codes, expected_uniques
+    def test_object_factorize_na_sentinel_none(
+        self, data, expected_codes, expected_uniques
     ):
-        codes, uniques = algos.factorize(data, dropna=dropna)
+        codes, uniques = algos.factorize(data, na_sentinel=None)

         tm.assert_numpy_array_equal(uniques, expected_uniques)
         tm.assert_numpy_array_equal(codes, expected_codes)

     @pytest.mark.parametrize(
-        "data, dropna, expected_codes, expected_uniques",
+        "data, expected_codes, expected_uniques",
         [
             (
                 [1, None, 1, 2],
-                True,
-                np.array([0, -1, 0, 1], dtype=np.dtype("intp")),
-                np.array([1, 2], dtype="O"),
-            ),
-            (
-                [1, np.nan, 1, 2],
-                True,
-                np.array([0, -1, 0, 1], dtype=np.dtype("intp")),
-                np.array([1, 2], dtype=np.float64),
-            ),
-            (
-                [1, None, 1, 2],
-                False,
                 np.array([0, 2, 0, 1], dtype=np.dtype("intp")),
                 np.array([1, 2, np.nan], dtype="O"),
             ),
             (
                 [1, np.nan, 1, 2],
-                False,
                 np.array([0, 2, 0, 1], dtype=np.dtype("intp")),
                 np.array([1, 2, np.nan], dtype=np.float64),
             ),
         ],
     )
-    def test_int_factorize_dropna(self, data, dropna, expected_codes, expected_uniques):
-        codes, uniques = algos.factorize(data, dropna=dropna)
+    def test_int_factorize_na_sentinel_none(
+        self, data, expected_codes, expected_uniques
+    ):
+        codes, uniques = algos.factorize(data, na_sentinel=None)

         tm.assert_numpy_array_equal(uniques, expected_uniques)
         tm.assert_numpy_array_equal(codes, expected_codes)
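
Both parametrizations assert that ``None`` and ``np.nan`` are treated as the same missing value by ``factorize``; only the dtype of the resulting uniques differs. A quick check of that equivalence (assuming this commit's pandas):

    import numpy as np
    import pandas as pd

    for data in ([1, None, 1, 2], [1, np.nan, 1, 2]):
        codes, uniques = pd.factorize(data, na_sentinel=None)
        print(codes, uniques)
    # Both print codes [0 2 0 1]; uniques are [1, 2, nan] with object
    # dtype for the None input and float64 for the np.nan input.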
