Skip to content

REF: rename 'labels' in pd.factorize to 'codes' #29509

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 11, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 30 additions & 24 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
intended for public consumption
"""
from textwrap import dedent
from typing import Dict
from typing import Dict, Optional, Tuple, Union
from warnings import catch_warnings, simplefilter, warn

import numpy as np
Expand Down Expand Up @@ -501,9 +501,9 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non

Returns
-------
labels : ndarray
codes : ndarray
An integer ndarray that's an indexer into `uniques`.
``uniques.take(labels)`` will have the same values as `values`.
``uniques.take(codes)`` will have the same values as `values`.
uniques : ndarray, Index, or Categorical
The unique valid values. When `values` is Categorical, `uniques`
is a Categorical. When `values` is some other pandas object, an
Expand All @@ -525,27 +525,27 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
``pd.factorize(values)``. The results are identical for methods like
:meth:`Series.factorize`.

>>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'])
>>> labels
>>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'])
>>> codes
array([0, 0, 1, 2, 0])
>>> uniques
array(['b', 'a', 'c'], dtype=object)

With ``sort=True``, the `uniques` will be sorted, and `labels` will be
With ``sort=True``, the `uniques` will be sorted, and `codes` will be
shuffled so that the relationship is maintained.

>>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True)
>>> labels
>>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True)
>>> codes
array([1, 1, 0, 2, 1])
>>> uniques
array(['a', 'b', 'c'], dtype=object)

Missing values are indicated in `labels` with `na_sentinel`
Missing values are indicated in `codes` with `na_sentinel`
(``-1`` by default). Note that missing values are never
included in `uniques`.

>>> labels, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
>>> labels
>>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
>>> codes
array([ 0, -1, 1, 2, 0])
>>> uniques
array(['b', 'a', 'c'], dtype=object)
Expand All @@ -555,8 +555,8 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
will differ. For Categoricals, a `Categorical` is returned.

>>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])
>>> labels, uniques = pd.factorize(cat)
>>> labels
>>> codes, uniques = pd.factorize(cat)
>>> codes
array([0, 0, 1])
>>> uniques
[a, c]
Expand All @@ -569,8 +569,8 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
returned.

>>> cat = pd.Series(['a', 'a', 'c'])
>>> labels, uniques = pd.factorize(cat)
>>> labels
>>> codes, uniques = pd.factorize(cat)
>>> codes
array([0, 0, 1])
>>> uniques
Index(['a', 'c'], dtype='object')
Expand All @@ -596,7 +596,7 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
sort=dedent(
"""\
sort : bool, default False
Sort `uniques` and shuffle `labels` to maintain the
Sort `uniques` and shuffle `codes` to maintain the
relationship.
"""
),
Expand All @@ -609,11 +609,17 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
)
@Appender(_shared_docs["factorize"])
@deprecate_kwarg(old_arg_name="order", new_arg_name=None)
def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint=None):
def factorize(
values,
sort: bool = False,
order=None,
na_sentinel: int = -1,
size_hint: Optional[int] = None,
) -> Tuple[np.ndarray, Union[np.ndarray, ABCIndex]]:
# Implementation notes: This method is responsible for 3 things
# 1.) coercing data to array-like (ndarray, Index, extension array)
# 2.) factorizing labels and uniques
# 3.) Maybe boxing the output in an Index
# 2.) factorizing codes and uniques
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this should remain labels

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We're factorizing the values into codes and uniques, so it should be codes?

Maybe the sentence should actually be worded as "2) factorizing values into codes and uniques" (more explicit)?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`codes` is correct; we are changing it on purpose to conform with other usages in the codebase.

# 3.) Maybe boxing the uniques in an Index
#
# Step 2 is dispatched to extension types (like Categorical). They are
# responsible only for factorization. All data coercion, sorting and boxing
Expand All @@ -624,7 +630,7 @@ def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint=

if is_extension_array_dtype(values):
values = extract_array(values)
labels, uniques = values.factorize(na_sentinel=na_sentinel)
codes, uniques = values.factorize(na_sentinel=na_sentinel)
dtype = original.dtype
else:
values, dtype = _ensure_data(values)
Expand All @@ -634,13 +640,13 @@ def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint=
else:
na_value = None

labels, uniques = _factorize_array(
codes, uniques = _factorize_array(
values, na_sentinel=na_sentinel, size_hint=size_hint, na_value=na_value
)

if sort and len(uniques) > 0:
uniques, labels = safe_sort(
uniques, labels, na_sentinel=na_sentinel, assume_unique=True, verify=False
uniques, codes = safe_sort(
uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False
)

uniques = _reconstruct_data(uniques, dtype, original)
Expand All @@ -653,7 +659,7 @@ def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint=

uniques = Index(uniques)

return labels, uniques
return codes, uniques


def value_counts(
Expand Down
8 changes: 4 additions & 4 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -690,11 +690,11 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ABCExtensionArra
Parameters
----------
na_sentinel : int, default -1
Value to use in the `labels` array to indicate missing values.
Value to use in the `codes` array to indicate missing values.

Returns
-------
labels : ndarray
codes : ndarray
An integer NumPy array that's an indexer into the original
ExtensionArray.
uniques : ExtensionArray
Expand Down Expand Up @@ -724,12 +724,12 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ABCExtensionArra
# Complete control over factorization.
arr, na_value = self._values_for_factorize()

labels, uniques = _factorize_array(
codes, uniques = _factorize_array(
arr, na_sentinel=na_sentinel, na_value=na_value
)

uniques = self._from_factorized(uniques, self)
return labels, uniques
return codes, uniques

_extension_array_shared_docs[
"repeat"
Expand Down
6 changes: 3 additions & 3 deletions pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -710,11 +710,11 @@ def factorize(self, na_sentinel=-1):
# Currently, ExtensionArray.factorize -> Tuple[ndarray, EA]
# The sparsity on this is backwards from what Sparse would want. Want
# ExtensionArray.factorize -> Tuple[EA, EA]
# Given that we have to return a dense array of labels, why bother
# Given that we have to return a dense array of codes, why bother
# implementing an efficient factorize?
labels, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel)
codes, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel)
uniques = SparseArray(uniques, dtype=self.dtype)
return labels, uniques
return codes, uniques

def value_counts(self, dropna=True):
"""
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1518,7 +1518,7 @@ def memory_usage(self, deep=False):
sort=textwrap.dedent(
"""\
sort : bool, default False
Sort `uniques` and shuffle `labels` to maintain the
Sort `uniques` and shuffle `codes` to maintain the
relationship.
"""
),
Expand Down
18 changes: 9 additions & 9 deletions pandas/tests/arrays/categorical/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,23 +11,23 @@ def test_factorize(categories, ordered):
cat = pd.Categorical(
["b", "b", "a", "c", None], categories=categories, ordered=ordered
)
labels, uniques = pd.factorize(cat)
expected_labels = np.array([0, 0, 1, 2, -1], dtype=np.intp)
codes, uniques = pd.factorize(cat)
expected_codes = np.array([0, 0, 1, 2, -1], dtype=np.intp)
expected_uniques = pd.Categorical(
["b", "a", "c"], categories=categories, ordered=ordered
)

tm.assert_numpy_array_equal(labels, expected_labels)
tm.assert_numpy_array_equal(codes, expected_codes)
tm.assert_categorical_equal(uniques, expected_uniques)


def test_factorized_sort():
cat = pd.Categorical(["b", "b", None, "a"])
labels, uniques = pd.factorize(cat, sort=True)
expected_labels = np.array([1, 1, -1, 0], dtype=np.intp)
codes, uniques = pd.factorize(cat, sort=True)
expected_codes = np.array([1, 1, -1, 0], dtype=np.intp)
expected_uniques = pd.Categorical(["a", "b"])

tm.assert_numpy_array_equal(labels, expected_labels)
tm.assert_numpy_array_equal(codes, expected_codes)
tm.assert_categorical_equal(uniques, expected_uniques)


Expand All @@ -36,13 +36,13 @@ def test_factorized_sort_ordered():
["b", "b", None, "a"], categories=["c", "b", "a"], ordered=True
)

labels, uniques = pd.factorize(cat, sort=True)
expected_labels = np.array([0, 0, -1, 1], dtype=np.intp)
codes, uniques = pd.factorize(cat, sort=True)
expected_codes = np.array([0, 0, -1, 1], dtype=np.intp)
expected_uniques = pd.Categorical(
["b", "a"], categories=["c", "b", "a"], ordered=True
)

tm.assert_numpy_array_equal(labels, expected_labels)
tm.assert_numpy_array_equal(codes, expected_codes)
tm.assert_categorical_equal(uniques, expected_uniques)


Expand Down
20 changes: 10 additions & 10 deletions pandas/tests/extension/base/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,29 +113,29 @@ def test_unique(self, data, box, method):

@pytest.mark.parametrize("na_sentinel", [-1, -2])
def test_factorize(self, data_for_grouping, na_sentinel):
labels, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel)
expected_labels = np.array(
codes, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel)
expected_codes = np.array(
[0, 0, na_sentinel, na_sentinel, 1, 1, 0, 2], dtype=np.intp
)
expected_uniques = data_for_grouping.take([0, 4, 7])

tm.assert_numpy_array_equal(labels, expected_labels)
tm.assert_numpy_array_equal(codes, expected_codes)
self.assert_extension_array_equal(uniques, expected_uniques)

@pytest.mark.parametrize("na_sentinel", [-1, -2])
def test_factorize_equivalence(self, data_for_grouping, na_sentinel):
l1, u1 = pd.factorize(data_for_grouping, na_sentinel=na_sentinel)
l2, u2 = data_for_grouping.factorize(na_sentinel=na_sentinel)
codes_1, uniques_1 = pd.factorize(data_for_grouping, na_sentinel=na_sentinel)
codes_2, uniques_2 = data_for_grouping.factorize(na_sentinel=na_sentinel)

tm.assert_numpy_array_equal(l1, l2)
self.assert_extension_array_equal(u1, u2)
tm.assert_numpy_array_equal(codes_1, codes_2)
self.assert_extension_array_equal(uniques_1, uniques_2)

def test_factorize_empty(self, data):
labels, uniques = pd.factorize(data[:0])
expected_labels = np.array([], dtype=np.intp)
codes, uniques = pd.factorize(data[:0])
expected_codes = np.array([], dtype=np.intp)
expected_uniques = type(data)._from_sequence([], dtype=data[:0].dtype)

tm.assert_numpy_array_equal(labels, expected_labels)
tm.assert_numpy_array_equal(codes, expected_codes)
self.assert_extension_array_equal(uniques, expected_uniques)

def test_fillna_copy_frame(self, data_missing):
Expand Down
Loading