diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index c12807304f74d..2dea25d6f10f4 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -32,6 +32,7 @@ Other enhancements - :meth:`MultiIndex.sortlevel` and :meth:`Index.sortlevel` gained a new keyword ``na_position`` (:issue:`51612`) - Improve error message when setting :class:`DataFrame` with wrong number of columns through :meth:`DataFrame.isetitem` (:issue:`51701`) - Let :meth:`DataFrame.to_feather` accept a non-default :class:`Index` and non-string column names (:issue:`51787`) +- :class:`api.extensions.ExtensionArray` now has a :meth:`~api.extensions.ExtensionArray.map` method (:issue:`51809`). .. --------------------------------------------------------------------------- .. _whatsnew_210.notable_bug_fixes: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c82b47867fbb3..e07d6c8298bca 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -46,6 +46,7 @@ is_categorical_dtype, is_complex_dtype, is_datetime64_dtype, + is_dict_like, is_extension_array_dtype, is_float_dtype, is_integer, @@ -1678,3 +1679,75 @@ def union_with_duplicates( unique_vals = ensure_wrapped_if_datetimelike(unique_vals) repeats = final_count.reindex(unique_vals).values return np.repeat(unique_vals, repeats) + + +def map_array( + arr: ArrayLike, mapper, na_action: Literal["ignore"] | None = None +) -> np.ndarray | ExtensionArray | Index: + """ + Map values using an input mapping or function. + + Parameters + ---------- + mapper : function, dict, or Series + Mapping correspondence. + na_action : {None, 'ignore'}, default None + If 'ignore', propagate NA values, without passing them to the + mapping correspondence. + + Returns + ------- + Union[ndarray, Index, ExtensionArray] + The output of the mapping function applied to the array. + If the function returns a tuple with more than one element + a MultiIndex will be returned. + """ + if na_action not in (None, "ignore"): + msg = f"na_action must either be 'ignore' or None, {na_action} was passed" + raise ValueError(msg) + + # we can fastpath dict/Series to an efficient map + # as we know that we are not going to have to yield + # python types + if is_dict_like(mapper): + if isinstance(mapper, dict) and hasattr(mapper, "__missing__"): + # If a dictionary subclass defines a default value method, + # convert mapper to a lookup function (GH #15999). + dict_with_default = mapper + mapper = lambda x: dict_with_default[ + np.nan if isinstance(x, float) and np.isnan(x) else x + ] + else: + # Dictionary does not have a default. Thus it's safe to + # convert to an Series for efficiency. + # we specify the keys here to handle the + # possibility that they are tuples + + # The return value of mapping with an empty mapper is + # expected to be pd.Series(np.nan, ...). As np.nan is + # of dtype float64 the return value of this method should + # be float64 as well + from pandas import Series + + if len(mapper) == 0: + mapper = Series(mapper, dtype=np.float64) + else: + mapper = Series(mapper) + + if isinstance(mapper, ABCSeries): + if na_action == "ignore": + mapper = mapper[mapper.index.notna()] + + # Since values were input this means we came from either + # a dict or a series and mapper should be an index + indexer = mapper.index.get_indexer(arr) + new_values = take_nd(mapper._values, indexer) + + return new_values + + # we must convert to python types + values = arr.astype(object, copy=False) + if na_action is None: + return lib.map_infer(values, mapper) + else: + return lib.map_infer_mask(values, mapper, isna(values).view(np.uint8)) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 4790d00a071cb..f23b05a5c41b8 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -1068,8 +1068,7 @@ def apply_standard(self) -> DataFrame | Series: return f(obj) # row-wise access - if is_extension_array_dtype(obj.dtype) and hasattr(obj._values, "map"): - # GH#23179 some EAs do not have `map` + if is_extension_array_dtype(obj.dtype): mapped = obj._values.map(f) else: values = obj.astype(object)._values diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 1a082a7579dc3..9313d49ec87de 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -78,6 +78,7 @@ from pandas.core.algorithms import ( factorize_array, isin, + map_array, mode, rank, unique, @@ -180,6 +181,7 @@ class ExtensionArray: * factorize / _values_for_factorize * argsort, argmax, argmin / _values_for_argsort * searchsorted + * map The remaining methods implemented on this class should be performant, as they only compose abstract methods. Still, a more efficient @@ -1706,6 +1708,28 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): return arraylike.default_array_ufunc(self, ufunc, method, *inputs, **kwargs) + def map(self, mapper, na_action=None): + """ + Map values using an input mapping or function. + + Parameters + ---------- + mapper : function, dict, or Series + Mapping correspondence. + na_action : {None, 'ignore'}, default None + If 'ignore', propagate NA values, without passing them to the + mapping correspondence. If 'ignore' is not supported, a + ``NotImplementedError`` should be raised. + + Returns + ------- + Union[ndarray, Index, ExtensionArray] + The output of the mapping function applied to the array. + If the function returns a tuple with more than one element + a MultiIndex will be returned. + """ + return map_array(self, mapper, na_action=na_action) + class ExtensionArraySupportsAnyAll(ExtensionArray): def any(self, *, skipna: bool = True) -> bool: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index be6c8493963ea..bdd9e1727ca10 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1198,7 +1198,7 @@ def remove_unused_categories(self) -> Categorical: # ------------------------------------------------------------------ - def map(self, mapper): + def map(self, mapper, na_action=None): """ Map categories using an input mapping or function. @@ -1267,6 +1267,9 @@ def map(self, mapper): >>> cat.map({'a': 'first', 'b': 'second'}) Index(['first', 'second', nan], dtype='object') """ + if na_action is not None: + raise NotImplementedError + new_categories = self.categories.map(mapper) try: return self.from_codes( diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 1e9b5641aa5e0..5295f89626e04 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -750,7 +750,10 @@ def _unbox(self, other) -> np.int64 | np.datetime64 | np.timedelta64 | np.ndarra # pandas assumes they're there. @ravel_compat - def map(self, mapper): + def map(self, mapper, na_action=None): + if na_action is not None: + raise NotImplementedError + # TODO(GH-23179): Add ExtensionArray.map # Need to figure out if we want ExtensionArray.map first. # If so, then we can refactor IndexOpsMixin._map_values to diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 78153890745d7..600fb92a6b279 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -1270,7 +1270,7 @@ def astype(self, dtype: AstypeArg | None = None, copy: bool = True): return self._simple_new(sp_values, self.sp_index, dtype) - def map(self: SparseArrayT, mapper) -> SparseArrayT: + def map(self: SparseArrayT, mapper, na_action=None) -> SparseArrayT: """ Map categories using an input mapping or function. @@ -1278,6 +1278,9 @@ def map(self: SparseArrayT, mapper) -> SparseArrayT: ---------- mapper : dict, Series, callable The correspondence from old values to new. + na_action : {None, 'ignore'}, default None + If 'ignore', propagate NA values, without passing them to the + mapping correspondence. Returns ------- @@ -1307,6 +1310,9 @@ def map(self: SparseArrayT, mapper) -> SparseArrayT: IntIndex Indices: array([1, 2], dtype=int32) """ + if na_action is not None: + raise NotImplementedError + # this is used in apply. # We get hit since we're an "is_extension_array_dtype" but regular extension # types are not hit. This may be worth adding to the interface. diff --git a/pandas/core/base.py b/pandas/core/base.py index d9b2647d19f93..2d3338d7c9f10 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -40,8 +40,6 @@ from pandas.core.dtypes.cast import can_hold_element from pandas.core.dtypes.common import ( - is_categorical_dtype, - is_dict_like, is_extension_array_dtype, is_object_dtype, is_scalar, @@ -78,7 +76,6 @@ ) from pandas import ( - Categorical, Index, Series, ) @@ -882,87 +879,19 @@ def _map_values(self, mapper, na_action=None): If the function returns a tuple with more than one element a MultiIndex will be returned. """ - # we can fastpath dict/Series to an efficient map - # as we know that we are not going to have to yield - # python types - if is_dict_like(mapper): - if isinstance(mapper, dict) and hasattr(mapper, "__missing__"): - # If a dictionary subclass defines a default value method, - # convert mapper to a lookup function (GH #15999). - dict_with_default = mapper - mapper = lambda x: dict_with_default[ - np.nan if isinstance(x, float) and np.isnan(x) else x - ] - else: - # Dictionary does not have a default. Thus it's safe to - # convert to an Series for efficiency. - # we specify the keys here to handle the - # possibility that they are tuples - - # The return value of mapping with an empty mapper is - # expected to be pd.Series(np.nan, ...). As np.nan is - # of dtype float64 the return value of this method should - # be float64 as well - from pandas import Series - - if len(mapper) == 0: - mapper = Series(mapper, dtype=np.float64) - else: - mapper = Series(mapper) - - if isinstance(mapper, ABCSeries): - if na_action not in (None, "ignore"): - msg = ( - "na_action must either be 'ignore' or None, " - f"{na_action} was passed" - ) - raise ValueError(msg) - - if na_action == "ignore": - mapper = mapper[mapper.index.notna()] - - # Since values were input this means we came from either - # a dict or a series and mapper should be an index - if is_categorical_dtype(self.dtype): - # use the built in categorical series mapper which saves - # time by mapping the categories instead of all values - - cat = cast("Categorical", self._values) - return cat.map(mapper) - - values = self._values - - indexer = mapper.index.get_indexer(values) - new_values = algorithms.take_nd(mapper._values, indexer) - - return new_values - - # we must convert to python types - if is_extension_array_dtype(self.dtype) and hasattr(self._values, "map"): - # GH#23179 some EAs do not have `map` - values = self._values - if na_action is not None: - raise NotImplementedError - map_f = lambda values, f: values.map(f) - else: - values = self._values.astype(object) - if na_action == "ignore": - map_f = lambda values, f: lib.map_infer_mask( - values, f, isna(values).view(np.uint8) - ) - elif na_action is None: - map_f = lib.map_infer - else: - msg = ( - "na_action must either be 'ignore' or None, " - f"{na_action} was passed" - ) - raise ValueError(msg) - - # mapper is a function - new_values = map_f(values, mapper) - - return new_values + arr = extract_array(self, extract_numpy=True, extract_range=True) + + if is_extension_array_dtype(arr.dtype): + # Item "IndexOpsMixin" of "Union[IndexOpsMixin, ExtensionArray, + # ndarray[Any, Any]]" has no attribute "map" + return arr.map(mapper, na_action=na_action) # type: ignore[union-attr] + + # Argument 1 to "map_array" has incompatible type + # "Union[IndexOpsMixin, ExtensionArray, ndarray[Any, Any]]"; + # expected "Union[ExtensionArray, ndarray[Any, Any]]" + return algorithms.map_array( + arr, mapper, na_action=na_action # type: ignore[arg-type] + ) @final def value_counts( diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 173edfbdcd7a7..9f556b47937f7 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -88,6 +88,12 @@ def test_apply_simple_series(self, data): result = pd.Series(data).apply(id) assert isinstance(result, pd.Series) + @pytest.mark.parametrize("na_action", [None, "ignore"]) + def test_map(self, data, na_action): + result = data.map(lambda x: x, na_action=na_action) + expected = data.to_numpy() + tm.assert_numpy_array_equal(result, expected) + def test_argsort(self, data_for_sorting): result = pd.Series(data_for_sorting).argsort() # argsort result gets passed to take, so should be np.intp diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 9a363c6a0f022..34a23315fd9fa 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -184,6 +184,15 @@ def test_combine_add(self, data_repeated): expected = pd.Series([a + val for a in list(orig_data1)]) self.assert_series_equal(result, expected) + @pytest.mark.parametrize("na_action", [None, "ignore"]) + def test_map(self, data, na_action): + if na_action is not None: + with pytest.raises(NotImplementedError, match=""): + data.map(lambda x: x, na_action=na_action) + else: + result = data.map(lambda x: x, na_action=na_action) + self.assert_extension_array_equal(result, data) + class TestCasting(base.BaseCastingTests): @pytest.mark.parametrize("cls", [Categorical, CategoricalIndex]) diff --git a/pandas/tests/extension/test_datetime.py b/pandas/tests/extension/test_datetime.py index 92796c604333d..21f4f949cdfa9 100644 --- a/pandas/tests/extension/test_datetime.py +++ b/pandas/tests/extension/test_datetime.py @@ -116,6 +116,15 @@ def test_combine_add(self, data_repeated): # Timestamp.__add__(Timestamp) not defined pass + @pytest.mark.parametrize("na_action", [None, "ignore"]) + def test_map(self, data, na_action): + if na_action is not None: + with pytest.raises(NotImplementedError, match=""): + data.map(lambda x: x, na_action=na_action) + else: + result = data.map(lambda x: x, na_action=na_action) + self.assert_extension_array_equal(result, data) + class TestInterface(BaseDatetimeTests, base.BaseInterfaceTests): pass diff --git a/pandas/tests/extension/test_period.py b/pandas/tests/extension/test_period.py index cb1ebd87875e1..4be761358ba15 100644 --- a/pandas/tests/extension/test_period.py +++ b/pandas/tests/extension/test_period.py @@ -105,6 +105,15 @@ def test_diff(self, data, periods): else: super().test_diff(data, periods) + @pytest.mark.parametrize("na_action", [None, "ignore"]) + def test_map(self, data, na_action): + if na_action is not None: + with pytest.raises(NotImplementedError, match=""): + data.map(lambda x: x, na_action=na_action) + else: + result = data.map(lambda x: x, na_action=na_action) + self.assert_extension_array_equal(result, data) + class TestInterface(BasePeriodTests, base.BaseInterfaceTests): pass diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 836e644affbda..e14de81d6fbd6 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -351,6 +351,15 @@ def test_equals(self, data, na_value, as_series, box): self._check_unsupported(data) super().test_equals(data, na_value, as_series, box) + @pytest.mark.parametrize("na_action", [None, "ignore"]) + def test_map(self, data, na_action): + if na_action is not None: + with pytest.raises(NotImplementedError, match=""): + data.map(lambda x: x, na_action=na_action) + else: + result = data.map(lambda x: x, na_action=na_action) + self.assert_extension_array_equal(result, data) + class TestCasting(BaseSparseTests, base.BaseCastingTests): def test_astype_str(self, data):