REF: Add ExtensionArray.map #51809

Merged: 6 commits, Mar 13, 2023
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.0.rst
@@ -32,6 +32,7 @@ Other enhancements
- :meth:`MultiIndex.sortlevel` and :meth:`Index.sortlevel` gained a new keyword ``na_position`` (:issue:`51612`)
- Improve error message when setting :class:`DataFrame` with wrong number of columns through :meth:`DataFrame.isetitem` (:issue:`51701`)
- Let :meth:`DataFrame.to_feather` accept a non-default :class:`Index` and non-string column names (:issue:`51787`)
- :class:`api.extensions.ExtensionArray` now has a :meth:`~api.extensions.ExtensionArray.map` method (:issue:`51809`).

.. ---------------------------------------------------------------------------
.. _whatsnew_210.notable_bug_fixes:
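A minimal sketch of the new method announced by this entry, assuming a nullable Int64 array that simply inherits the default implementation added below:

    import pandas as pd

    # Every extension array now exposes .map(); the inherited default maps
    # element-wise and typically returns a NumPy array.
    arr = pd.array([1, 2, 3], dtype="Int64")
    print(arr.map(lambda x: x + 1))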
73 changes: 73 additions & 0 deletions pandas/core/algorithms.py
@@ -46,6 +46,7 @@
is_categorical_dtype,
is_complex_dtype,
is_datetime64_dtype,
is_dict_like,
is_extension_array_dtype,
is_float_dtype,
is_integer,
@@ -1678,3 +1679,75 @@ def union_with_duplicates(
unique_vals = ensure_wrapped_if_datetimelike(unique_vals)
repeats = final_count.reindex(unique_vals).values
return np.repeat(unique_vals, repeats)


def map_array(
arr: ArrayLike, mapper, na_action: Literal["ignore"] | None = None
) -> np.ndarray | ExtensionArray | Index:
"""
Map values using an input mapping or function.

Parameters
----------
mapper : function, dict, or Series
Mapping correspondence.
na_action : {None, 'ignore'}, default None
If 'ignore', propagate NA values, without passing them to the
mapping correspondence.

Returns
-------
Union[ndarray, Index, ExtensionArray]
The output of the mapping function applied to the array.
If the function returns a tuple with more than one element
a MultiIndex will be returned.
"""
if na_action not in (None, "ignore"):
msg = f"na_action must either be 'ignore' or None, {na_action} was passed"
raise ValueError(msg)

# we can fastpath dict/Series to an efficient map
# as we know that we are not going to have to yield
# python types
if is_dict_like(mapper):
if isinstance(mapper, dict) and hasattr(mapper, "__missing__"):
# If a dictionary subclass defines a default value method,
# convert mapper to a lookup function (GH #15999).
dict_with_default = mapper
mapper = lambda x: dict_with_default[
np.nan if isinstance(x, float) and np.isnan(x) else x
]
else:
# Dictionary does not have a default. Thus it's safe to
# convert to an Series for efficiency.
# we specify the keys here to handle the
# possibility that they are tuples

# The return value of mapping with an empty mapper is
# expected to be pd.Series(np.nan, ...). As np.nan is
# of dtype float64 the return value of this method should
# be float64 as well
from pandas import Series

if len(mapper) == 0:
mapper = Series(mapper, dtype=np.float64)
else:
mapper = Series(mapper)

if isinstance(mapper, ABCSeries):
if na_action == "ignore":
mapper = mapper[mapper.index.notna()]

# Since values were input this means we came from either
# a dict or a series and mapper should be an index
indexer = mapper.index.get_indexer(arr)
new_values = take_nd(mapper._values, indexer)

return new_values

# we must convert to python types
values = arr.astype(object, copy=False)
if na_action is None:
return lib.map_infer(values, mapper)
else:
return lib.map_infer_mask(values, mapper, isna(values).view(np.uint8))
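The dict/Series fast path above avoids calling a Python function once per element: the mapping is turned into a Series, each value is located in the mapping's index with get_indexer, and the mapped values are gathered with take_nd (missing positions become NaN). A rough illustration of that lookup using public APIs only, not the internal take_nd:

    import numpy as np
    import pandas as pd

    arr = np.array(["a", "b", "c", "b"], dtype=object)
    mapper = pd.Series({"a": 1, "b": 2})

    # Position of each value in the mapping's index; -1 means "not found".
    indexer = mapper.index.get_indexer(arr)  # array([ 0,  1, -1,  1])

    # Gather the mapped values and turn the -1 positions into NaN;
    # take_nd does this in a single step inside map_array.
    mapped = np.where(indexer == -1, np.nan, mapper.to_numpy().take(indexer))
    print(mapped)  # 1.0, 2.0, nan, 2.0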
3 changes: 1 addition & 2 deletions pandas/core/apply.py
@@ -1068,8 +1068,7 @@ def apply_standard(self) -> DataFrame | Series:
return f(obj)

# row-wise access
if is_extension_array_dtype(obj.dtype) and hasattr(obj._values, "map"):
# GH#23179 some EAs do not have `map`
if is_extension_array_dtype(obj.dtype):
mapped = obj._values.map(f)
else:
values = obj.astype(object)._values
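With map now guaranteed on the ExtensionArray interface, the hasattr check is gone: Series.apply on any extension dtype delegates straight to the array's map. A small example, assuming the string dtype relies on the inherited default:

    import pandas as pd

    ser = pd.Series(pd.array(["x", "y", pd.NA], dtype="string"))

    # apply() now always routes through ser.array.map(f) for extension dtypes.
    print(ser.apply(lambda v: v if v is pd.NA else v.upper()))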
24 changes: 24 additions & 0 deletions pandas/core/arrays/base.py
@@ -78,6 +78,7 @@
from pandas.core.algorithms import (
factorize_array,
isin,
map_array,
mode,
rank,
unique,
@@ -180,6 +181,7 @@ class ExtensionArray:
* factorize / _values_for_factorize
* argsort, argmax, argmin / _values_for_argsort
* searchsorted
* map

The remaining methods implemented on this class should be performant,
as they only compose abstract methods. Still, a more efficient
@@ -1706,6 +1708,28 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):

return arraylike.default_array_ufunc(self, ufunc, method, *inputs, **kwargs)

def map(self, mapper, na_action=None):
"""
Map values using an input mapping or function.

Parameters
----------
mapper : function, dict, or Series
Mapping correspondence.
na_action : {None, 'ignore'}, default None
If 'ignore', propagate NA values, without passing them to the
mapping correspondence. If 'ignore' is not supported, a
``NotImplementedError`` should be raised.

Returns
-------
Union[ndarray, Index, ExtensionArray]
The output of the mapping function applied to the array.
If the function returns a tuple with more than one element
a MultiIndex will be returned.
"""
return map_array(self, mapper, na_action=na_action)


class ExtensionArraySupportsAnyAll(ExtensionArray):
def any(self, *, skipna: bool = True) -> bool:
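A quick sketch of the new default in action: any extension array that does not override map inherits this behaviour and accepts a function, dict, or Series as the mapper.

    import pandas as pd

    arr = pd.array([1, 2, pd.NA], dtype="Int64")

    # Function mapper: with na_action=None, every element (including NA)
    # is passed to the function.
    print(arr.map(lambda x: x * 10))

    # Dict mapper: values are looked up; anything missing maps to NaN.
    print(arr.map({1: "one", 2: "two"}))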
5 changes: 4 additions & 1 deletion pandas/core/arrays/categorical.py
@@ -1198,7 +1198,7 @@ def remove_unused_categories(self) -> Categorical:

# ------------------------------------------------------------------

def map(self, mapper):
def map(self, mapper, na_action=None):
"""
Map categories using an input mapping or function.

@@ -1267,6 +1267,9 @@ def map(self, mapper):
>>> cat.map({'a': 'first', 'b': 'second'})
Index(['first', 'second', nan], dtype='object')
"""
if na_action is not None:
raise NotImplementedError

new_categories = self.categories.map(mapper)
try:
return self.from_codes(
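The only behavioural change for Categorical is the new keyword: mapping with the default na_action=None is unchanged (categories are mapped rather than every value), while na_action="ignore" raises for now. Reusing the docstring's example:

    import pandas as pd

    cat = pd.Categorical(["a", "b", "c"])

    # Unchanged behaviour, as shown in the docstring above.
    print(cat.map({"a": "first", "b": "second"}))
    # Index(['first', 'second', nan], dtype='object')

    # na_action="ignore" is not supported on Categorical yet.
    try:
        cat.map(str.upper, na_action="ignore")
    except NotImplementedError:
        print("na_action='ignore' is not implemented for Categorical")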
5 changes: 4 additions & 1 deletion pandas/core/arrays/datetimelike.py
@@ -750,7 +750,10 @@ def _unbox(self, other) -> np.int64 | np.datetime64 | np.timedelta64 | np.ndarra
# pandas assumes they're there.

@ravel_compat
def map(self, mapper):
def map(self, mapper, na_action=None):
if na_action is not None:
raise NotImplementedError

# TODO(GH-23179): Add ExtensionArray.map
# Need to figure out if we want ExtensionArray.map first.
# If so, then we can refactor IndexOpsMixin._map_values to
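The datetime-like arrays follow the same pattern: the signature gains na_action, but only None is implemented for now. A brief sketch, assuming pd.array on a DatetimeIndex yields a DatetimeArray:

    import pandas as pd

    arr = pd.array(pd.date_range("2023-01-01", periods=3))  # DatetimeArray

    # The default na_action=None keeps the existing behaviour.
    print(arr.map(lambda ts: ts.month))

    # na_action="ignore" raises for datetime-like arrays.
    try:
        arr.map(lambda ts: ts.month, na_action="ignore")
    except NotImplementedError:
        print("na_action='ignore' is not implemented for DatetimeArray")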
8 changes: 7 additions & 1 deletion pandas/core/arrays/sparse/array.py
@@ -1270,14 +1270,17 @@ def astype(self, dtype: AstypeArg | None = None, copy: bool = True):

return self._simple_new(sp_values, self.sp_index, dtype)

def map(self: SparseArrayT, mapper) -> SparseArrayT:
def map(self: SparseArrayT, mapper, na_action=None) -> SparseArrayT:
"""
Map categories using an input mapping or function.

Parameters
----------
mapper : dict, Series, callable
The correspondence from old values to new.
na_action : {None, 'ignore'}, default None
If 'ignore', propagate NA values, without passing them to the
mapping correspondence.

Returns
-------
@@ -1307,6 +1310,9 @@ def map(self: SparseArrayT, mapper) -> SparseArrayT:
IntIndex
Indices: array([1, 2], dtype=int32)
"""
if na_action is not None:
raise NotImplementedError

# this is used in apply.
# We get hit since we're an "is_extension_array_dtype" but regular extension
# types are not hit. This may be worth adding to the interface.
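SparseArray behaves the same way: the mapper is applied to both the fill value and the stored values (as the docstring's examples show), and na_action="ignore" raises. A brief sketch:

    import pandas as pd
    from pandas.arrays import SparseArray

    sp = SparseArray([0, 1, 2])

    # Both the fill value and the stored values are mapped,
    # mirroring the docstring example above.
    print(sp.map(lambda x: x + 10))

    # na_action="ignore" is not supported on SparseArray yet.
    try:
        sp.map(lambda x: x + 10, na_action="ignore")
    except NotImplementedError:
        print("na_action='ignore' is not implemented for SparseArray")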
97 changes: 13 additions & 84 deletions pandas/core/base.py
@@ -40,8 +40,6 @@

from pandas.core.dtypes.cast import can_hold_element
from pandas.core.dtypes.common import (
is_categorical_dtype,
is_dict_like,
is_extension_array_dtype,
is_object_dtype,
is_scalar,
@@ -78,7 +76,6 @@
)

from pandas import (
Categorical,
Index,
Series,
)
@@ -882,87 +879,19 @@ def _map_values(self, mapper, na_action=None):
If the function returns a tuple with more than one element
a MultiIndex will be returned.
"""
# we can fastpath dict/Series to an efficient map
# as we know that we are not going to have to yield
# python types
if is_dict_like(mapper):
if isinstance(mapper, dict) and hasattr(mapper, "__missing__"):
# If a dictionary subclass defines a default value method,
# convert mapper to a lookup function (GH #15999).
dict_with_default = mapper
mapper = lambda x: dict_with_default[
np.nan if isinstance(x, float) and np.isnan(x) else x
]
else:
# Dictionary does not have a default. Thus it's safe to
# convert to an Series for efficiency.
# we specify the keys here to handle the
# possibility that they are tuples

# The return value of mapping with an empty mapper is
# expected to be pd.Series(np.nan, ...). As np.nan is
# of dtype float64 the return value of this method should
# be float64 as well
from pandas import Series

if len(mapper) == 0:
mapper = Series(mapper, dtype=np.float64)
else:
mapper = Series(mapper)

if isinstance(mapper, ABCSeries):
if na_action not in (None, "ignore"):
msg = (
"na_action must either be 'ignore' or None, "
f"{na_action} was passed"
)
raise ValueError(msg)

if na_action == "ignore":
mapper = mapper[mapper.index.notna()]

# Since values were input this means we came from either
# a dict or a series and mapper should be an index
if is_categorical_dtype(self.dtype):
# use the built in categorical series mapper which saves
# time by mapping the categories instead of all values

cat = cast("Categorical", self._values)
return cat.map(mapper)

values = self._values

indexer = mapper.index.get_indexer(values)
new_values = algorithms.take_nd(mapper._values, indexer)

return new_values

# we must convert to python types
if is_extension_array_dtype(self.dtype) and hasattr(self._values, "map"):
# GH#23179 some EAs do not have `map`
values = self._values
if na_action is not None:
raise NotImplementedError
map_f = lambda values, f: values.map(f)
else:
values = self._values.astype(object)
if na_action == "ignore":
map_f = lambda values, f: lib.map_infer_mask(
values, f, isna(values).view(np.uint8)
)
elif na_action is None:
map_f = lib.map_infer
else:
msg = (
"na_action must either be 'ignore' or None, "
f"{na_action} was passed"
)
raise ValueError(msg)

# mapper is a function
new_values = map_f(values, mapper)

return new_values
arr = extract_array(self, extract_numpy=True, extract_range=True)

if is_extension_array_dtype(arr.dtype):
# Item "IndexOpsMixin" of "Union[IndexOpsMixin, ExtensionArray,
# ndarray[Any, Any]]" has no attribute "map"
return arr.map(mapper, na_action=na_action) # type: ignore[union-attr]

# Argument 1 to "map_array" has incompatible type
# "Union[IndexOpsMixin, ExtensionArray, ndarray[Any, Any]]";
# expected "Union[ExtensionArray, ndarray[Any, Any]]"
return algorithms.map_array(
arr, mapper, na_action=na_action # type: ignore[arg-type]
)

@final
def value_counts(
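After this refactor, _map_values is a thin dispatcher: it extracts the underlying array, sends extension arrays to their own map, and sends NumPy-backed values to algorithms.map_array, so Series.map and Index.map share one code path. A short example exercising both branches:

    import pandas as pd

    s_ea = pd.Series(pd.array([1, 2, pd.NA], dtype="Int64"))  # extension-array branch
    s_np = pd.Series([1.0, 2.0, float("nan")])                # NumPy branch

    print(s_ea.map({1: "a", 2: "b"}))
    print(s_np.map(lambda x: x * 2, na_action="ignore"))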
6 changes: 6 additions & 0 deletions pandas/tests/extension/base/methods.py
@@ -88,6 +88,12 @@ def test_apply_simple_series(self, data):
result = pd.Series(data).apply(id)
assert isinstance(result, pd.Series)

@pytest.mark.parametrize("na_action", [None, "ignore"])
def test_map(self, data, na_action):
result = data.map(lambda x: x, na_action=na_action)
expected = data.to_numpy()
tm.assert_numpy_array_equal(result, expected)

def test_argsort(self, data_for_sorting):
result = pd.Series(data_for_sorting).argsort()
# argsort result gets passed to take, so should be np.intp
9 changes: 9 additions & 0 deletions pandas/tests/extension/test_categorical.py
@@ -184,6 +184,15 @@ def test_combine_add(self, data_repeated):
expected = pd.Series([a + val for a in list(orig_data1)])
self.assert_series_equal(result, expected)

@pytest.mark.parametrize("na_action", [None, "ignore"])
def test_map(self, data, na_action):
if na_action is not None:
with pytest.raises(NotImplementedError, match=""):
data.map(lambda x: x, na_action=na_action)
else:
result = data.map(lambda x: x, na_action=na_action)
self.assert_extension_array_equal(result, data)


class TestCasting(base.BaseCastingTests):
@pytest.mark.parametrize("cls", [Categorical, CategoricalIndex])
9 changes: 9 additions & 0 deletions pandas/tests/extension/test_datetime.py
@@ -116,6 +116,15 @@ def test_combine_add(self, data_repeated):
# Timestamp.__add__(Timestamp) not defined
pass

@pytest.mark.parametrize("na_action", [None, "ignore"])
def test_map(self, data, na_action):
if na_action is not None:
with pytest.raises(NotImplementedError, match=""):
data.map(lambda x: x, na_action=na_action)
else:
result = data.map(lambda x: x, na_action=na_action)
self.assert_extension_array_equal(result, data)


class TestInterface(BaseDatetimeTests, base.BaseInterfaceTests):
pass
9 changes: 9 additions & 0 deletions pandas/tests/extension/test_period.py
@@ -105,6 +105,15 @@ def test_diff(self, data, periods):
else:
super().test_diff(data, periods)

@pytest.mark.parametrize("na_action", [None, "ignore"])
def test_map(self, data, na_action):
if na_action is not None:
with pytest.raises(NotImplementedError, match=""):
data.map(lambda x: x, na_action=na_action)
else:
result = data.map(lambda x: x, na_action=na_action)
self.assert_extension_array_equal(result, data)


class TestInterface(BasePeriodTests, base.BaseInterfaceTests):
pass