diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py
index a5bba8e5592c7..19ef536a636ea 100644
--- a/asv_bench/benchmarks/indexing.py
+++ b/asv_bench/benchmarks/indexing.py
@@ -8,6 +8,7 @@
 import numpy as np
 
 from pandas import (
+    NA,
     CategoricalIndex,
     DataFrame,
     Index,
@@ -83,6 +84,37 @@ def time_loc_slice(self, index, index_structure):
         self.data.loc[:800000]
 
 
+class NumericMaskedIndexing:
+    monotonic_list = list(range(10**6))
+    non_monotonic_list = (
+        list(range(50)) + [54, 53, 52, 51] + list(range(55, 10**6 - 1))
+    )
+
+    params = [
+        ("Int64", "UInt64", "Float64"),
+        (True, False),
+    ]
+    param_names = ["dtype", "monotonic"]
+
+    def setup(self, dtype, monotonic):
+
+        indices = {
+            True: Index(self.monotonic_list, dtype=dtype),
+            False: Index(self.non_monotonic_list, dtype=dtype).append(
+                Index([NA], dtype=dtype)
+            ),
+        }
+        self.data = indices[monotonic]
+        self.indexer = np.arange(300, 1_000)
+        self.data_dups = self.data.append(self.data)
+
+    def time_get_indexer(self, dtype, monotonic):
+        self.data.get_indexer(self.indexer)
+
+    def time_get_indexer_dups(self, dtype, monotonic):
+        self.data_dups.get_indexer_for(self.indexer)
+
+
 class NonNumericSeriesIndexing:
 
     params = [
diff --git a/asv_bench/benchmarks/indexing_engines.py b/asv_bench/benchmarks/indexing_engines.py
index 0c6cb89f49da1..ce208761638c5 100644
--- a/asv_bench/benchmarks/indexing_engines.py
+++ b/asv_bench/benchmarks/indexing_engines.py
@@ -1,5 +1,8 @@
 """
-Benchmarks in this file depend exclusively on code in _libs/
+Benchmarks in this file depend mostly on code in _libs/
+
+We have to create masked arrays to test the masked engine, though. The
+array is unpacked on the Cython level.
 
 If a PR does not edit anything in _libs, it is very unlikely that
 benchmarks in this file will be affected.
@@ -9,6 +12,8 @@
 
 from pandas._libs import index as libindex
 
+from pandas.core.arrays import BaseMaskedArray
+
 
 def _get_numeric_engines():
     engine_names = [
@@ -30,6 +35,26 @@ def _get_numeric_engines():
     ]
 
 
+def _get_masked_engines():
+    engine_names = [
+        ("MaskedInt64Engine", "Int64"),
+        ("MaskedInt32Engine", "Int32"),
+        ("MaskedInt16Engine", "Int16"),
+        ("MaskedInt8Engine", "Int8"),
+        ("MaskedUInt64Engine", "UInt64"),
+        ("MaskedUInt32Engine", "UInt32"),
+        ("MaskedUInt16Engine", "UInt16"),
+        ("MaskedUInt8Engine", "UInt8"),
+        ("MaskedFloat64Engine", "Float64"),
+        ("MaskedFloat32Engine", "Float32"),
+    ]
+    return [
+        (getattr(libindex, engine_name), dtype)
+        for engine_name, dtype in engine_names
+        if hasattr(libindex, engine_name)
+    ]
+
+
 class NumericEngineIndexing:
 
     params = [
@@ -80,6 +105,61 @@ def time_get_loc_near_middle(self, engine_and_dtype, index_type, unique, N):
         self.data.get_loc(self.key_middle)
 
 
+class MaskedNumericEngineIndexing:
+
+    params = [
+        _get_masked_engines(),
+        ["monotonic_incr", "monotonic_decr", "non_monotonic"],
+        [True, False],
+        [10**5, 2 * 10**6],  # 2e6 is above SIZE_CUTOFF
+    ]
+    param_names = ["engine_and_dtype", "index_type", "unique", "N"]
+
+    def setup(self, engine_and_dtype, index_type, unique, N):
+        engine, dtype = engine_and_dtype
+
+        if index_type == "monotonic_incr":
+            if unique:
+                arr = np.arange(N * 3, dtype=dtype.lower())
+            else:
+                values = list([1] * N + [2] * N + [3] * N)
+                arr = np.array(values, dtype=dtype.lower())
+            mask = np.zeros(N * 3, dtype=np.bool_)
+        elif index_type == "monotonic_decr":
+            if unique:
+                arr = np.arange(N * 3, dtype=dtype.lower())[::-1]
+            else:
+                values = list([1] * N + [2] * N + [3] * N)
+                arr = np.array(values, dtype=dtype.lower())[::-1]
+            mask = np.zeros(N * 3, dtype=np.bool_)
+        else:
+            assert index_type == "non_monotonic"
+            if unique:
+                arr = np.zeros(N * 3, dtype=dtype.lower())
+                arr[:N] = np.arange(N * 2, N * 3, dtype=dtype.lower())
+                arr[N:] = np.arange(N * 2, dtype=dtype.lower())
+
+            else:
+                arr = np.array([1, 2, 3] * N, dtype=dtype.lower())
+            mask = np.zeros(N * 3, dtype=np.bool_)
+            mask[-1] = True
+
+        self.data = engine(BaseMaskedArray(arr, mask))
+        # code below avoids populating the mapping etc. while timing.
+ self.data.get_loc(2) + + self.key_middle = arr[len(arr) // 2] + self.key_early = arr[2] + + def time_get_loc(self, engine_and_dtype, index_type, unique, N): + self.data.get_loc(self.key_early) + + def time_get_loc_near_middle(self, engine_and_dtype, index_type, unique, N): + # searchsorted performance may be different near the middle of a range + # vs near an endpoint + self.data.get_loc(self.key_middle) + + class ObjectEngineIndexing: params = [("monotonic_incr", "monotonic_decr", "non_monotonic")] diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index dc05745c8c0e5..f8ac9645f758d 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -884,6 +884,7 @@ Performance improvements - Performance improvement in :func:`to_datetime` when parsing strings with timezone offsets (:issue:`50107`) - Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`48384`) - Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`) +- Performance improvement for indexing operations with nullable dtypes (:issue:`49420`) - Performance improvement for :func:`concat` with extension array backed indexes (:issue:`49128`, :issue:`49178`) - Reduce memory usage of :meth:`DataFrame.to_pickle`/:meth:`Series.to_pickle` when using BZ2 or LZMA (:issue:`49068`) - Performance improvement for :class:`~arrays.StringArray` constructor passing a numpy array with type ``np.str_`` (:issue:`49109`) diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi index af47e6c408c05..e9b78ad53380f 100644 --- a/pandas/_libs/hashtable.pyi +++ b/pandas/_libs/hashtable.pyi @@ -165,10 +165,12 @@ class HashTable: def map_locations( self, values: np.ndarray, # np.ndarray[subclass-specific] + mask: npt.NDArray[np.bool_] | None = ..., ) -> None: ... def lookup( self, values: np.ndarray, # np.ndarray[subclass-specific] + mask: npt.NDArray[np.bool_] | None = ..., ) -> npt.NDArray[np.intp]: ... 
def get_labels( self, diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index 06ad614b4f963..d4d3117a32ac9 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -1005,8 +1005,9 @@ cdef class StringHashTable(HashTable): return labels @cython.boundscheck(False) - def lookup(self, ndarray[object] values) -> ndarray: + def lookup(self, ndarray[object] values, object mask = None) -> ndarray: # -> np.ndarray[np.intp] + # mask not yet implemented cdef: Py_ssize_t i, n = len(values) int ret = 0 @@ -1041,7 +1042,8 @@ cdef class StringHashTable(HashTable): return np.asarray(locs) @cython.boundscheck(False) - def map_locations(self, ndarray[object] values) -> None: + def map_locations(self, ndarray[object] values, object mask = None) -> None: + # mask not yet implemented cdef: Py_ssize_t i, n = len(values) int ret = 0 @@ -1314,7 +1316,8 @@ cdef class PyObjectHashTable(HashTable): else: raise KeyError(key) - def map_locations(self, ndarray[object] values) -> None: + def map_locations(self, ndarray[object] values, object mask = None) -> None: + # mask not yet implemented cdef: Py_ssize_t i, n = len(values) int ret = 0 @@ -1328,8 +1331,9 @@ cdef class PyObjectHashTable(HashTable): k = kh_put_pymap(self.table, val, &ret) self.table.vals[k] = i - def lookup(self, ndarray[object] values) -> ndarray: + def lookup(self, ndarray[object] values, object mask = None) -> ndarray: # -> np.ndarray[np.intp] + # mask not yet implemented cdef: Py_ssize_t i, n = len(values) int ret = 0 diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi index 8fff335352617..4b4c4d65d1ea4 100644 --- a/pandas/_libs/index.pyi +++ b/pandas/_libs/index.pyi @@ -29,6 +29,12 @@ class IndexEngine: targets: np.ndarray, ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... +class MaskedIndexEngine(IndexEngine): + def __init__(self, values: object) -> None: ... + def get_indexer_non_unique( + self, targets: object + ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... + class Float64Engine(IndexEngine): ... class Float32Engine(IndexEngine): ... class Complex128Engine(IndexEngine): ... @@ -46,6 +52,19 @@ class DatetimeEngine(Int64Engine): ... class TimedeltaEngine(DatetimeEngine): ... class PeriodEngine(Int64Engine): ... class BoolEngine(UInt8Engine): ... +class MaskedBoolEngine(MaskedUInt8Engine): ... +class MaskedFloat64Engine(MaskedIndexEngine): ... +class MaskedFloat32Engine(MaskedIndexEngine): ... +class MaskedComplex128Engine(MaskedIndexEngine): ... +class MaskedComplex64Engine(MaskedIndexEngine): ... +class MaskedInt64Engine(MaskedIndexEngine): ... +class MaskedInt32Engine(MaskedIndexEngine): ... +class MaskedInt16Engine(MaskedIndexEngine): ... +class MaskedInt8Engine(MaskedIndexEngine): ... +class MaskedUInt64Engine(MaskedIndexEngine): ... +class MaskedUInt32Engine(MaskedIndexEngine): ... +class MaskedUInt16Engine(MaskedIndexEngine): ... +class MaskedUInt8Engine(MaskedIndexEngine): ... class BaseMultiIndexCodesEngine: levels: list[np.ndarray] @@ -57,10 +76,7 @@ class BaseMultiIndexCodesEngine: labels: list[np.ndarray], # all entries integer-dtyped offsets: np.ndarray, # np.ndarray[np.uint64, ndim=1] ) -> None: ... - def get_indexer( - self, - target: npt.NDArray[np.object_], - ) -> npt.NDArray[np.intp]: ... + def get_indexer(self, target: npt.NDArray[np.object_]) -> npt.NDArray[np.intp]: ... def _extract_level_codes(self, target: MultiIndex) -> np.ndarray: ... 
def get_indexer_with_fill( self, diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 9e2adee407b1a..94d21f39dc61a 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -32,6 +32,7 @@ from pandas._libs import ( from pandas._libs.lib cimport eq_NA_compat from pandas._libs.missing cimport ( + C_NA, checknull, is_matching_na, ) @@ -48,7 +49,7 @@ cdef bint is_definitely_invalid_key(object val): return False -cdef ndarray _get_bool_indexer(ndarray values, object val): +cdef ndarray _get_bool_indexer(ndarray values, object val, ndarray mask = None): """ Return a ndarray[bool] of locations where val matches self.values. @@ -61,6 +62,7 @@ cdef ndarray _get_bool_indexer(ndarray values, object val): object item if values.descr.type_num == cnp.NPY_OBJECT: + assert mask is None # no mask for object dtype # i.e. values.dtype == object if not checknull(val): indexer = eq_NA_compat(values, val) @@ -74,10 +76,16 @@ cdef ndarray _get_bool_indexer(ndarray values, object val): indexer[i] = is_matching_na(item, val) else: - if util.is_nan(val): - indexer = np.isnan(values) + if mask is not None: + if val is C_NA: + indexer = mask == 1 + else: + indexer = (values == val) & ~mask else: - indexer = values == val + if util.is_nan(val): + indexer = np.isnan(values) + else: + indexer = values == val return indexer.view(bool) @@ -111,6 +119,7 @@ cdef class IndexEngine: cdef readonly: ndarray values + ndarray mask HashTable mapping bint over_size_threshold @@ -121,6 +130,7 @@ cdef class IndexEngine: def __init__(self, ndarray values): self.values = values + self.mask = None self.over_size_threshold = len(values) >= _SIZE_CUTOFF self.clear_mapping() @@ -159,6 +169,8 @@ cdef class IndexEngine: self._ensure_mapping_populated() if not self.unique: return self._get_loc_duplicates(val) + if self.mask is not None and val is C_NA: + return self.mapping.get_na() try: return self.mapping.get_item(val) @@ -206,7 +218,7 @@ cdef class IndexEngine: cdef: ndarray[uint8_t, ndim=1, cast=True] indexer - indexer = _get_bool_indexer(self.values, val) + indexer = _get_bool_indexer(self.values, val, self.mask) return _unpack_bool_indexer(indexer, val) def sizeof(self, deep: bool = False) -> int: @@ -247,21 +259,25 @@ cdef class IndexEngine: cdef _do_monotonic_check(self): cdef: bint is_unique - try: - values = self.values - self.monotonic_inc, self.monotonic_dec, is_unique = \ - self._call_monotonic(values) - except TypeError: + if self.mask is not None and np.any(self.mask): self.monotonic_inc = 0 self.monotonic_dec = 0 - is_unique = 0 + else: + try: + values = self.values + self.monotonic_inc, self.monotonic_dec, is_unique = \ + self._call_monotonic(values) + except TypeError: + self.monotonic_inc = 0 + self.monotonic_dec = 0 + is_unique = 0 - self.need_monotonic_check = 0 + self.need_monotonic_check = 0 - # we can only be sure of uniqueness if is_unique=1 - if is_unique: - self.unique = 1 - self.need_unique_check = 0 + # we can only be sure of uniqueness if is_unique=1 + if is_unique: + self.unique = 1 + self.need_unique_check = 0 cdef _call_monotonic(self, values): return algos.is_monotonic(values, timelike=False) @@ -286,7 +302,7 @@ cdef class IndexEngine: values = self.values self.mapping = self._make_hash_table(len(values)) - self.mapping.map_locations(values) + self.mapping.map_locations(values, self.mask) if len(self.mapping) == len(values): self.unique = 1 @@ -841,6 +857,15 @@ cdef class BoolEngine(UInt8Engine): return val +cdef class MaskedBoolEngine(MaskedUInt8Engine): + cdef _check_type(self, 
object val): + if val is C_NA: + return val + if not util.is_bool_object(val): + raise KeyError(val) + return val + + @cython.internal @cython.freelist(32) cdef class SharedEngine: @@ -1110,3 +1135,130 @@ cdef class ExtensionEngine(SharedEngine): cdef _check_type(self, object val): hash(val) + + +cdef class MaskedIndexEngine(IndexEngine): + def __init__(self, object values): + super().__init__(values._data) + self.mask = values._mask + + def get_indexer(self, object values) -> np.ndarray: + self._ensure_mapping_populated() + return self.mapping.lookup(values._data, values._mask) + + def get_indexer_non_unique(self, object targets): + """ + Return an indexer suitable for taking from a non unique index + return the labels in the same order as the target + and a missing indexer into the targets (which correspond + to the -1 indices in the results + + Returns + ------- + indexer : np.ndarray[np.intp] + missing : np.ndarray[np.intp] + """ + # TODO: Unify with parent class + cdef: + ndarray values, mask, target_vals, target_mask + ndarray[intp_t] result, missing + set stargets + list na_pos + dict d = {} + object val + Py_ssize_t count = 0, count_missing = 0 + Py_ssize_t i, j, n, n_t, n_alloc, start, end, na_idx + + target_vals = targets._data + target_mask = targets._mask + + values = self.values + assert not values.dtype == object # go through object path instead + + mask = self.mask + stargets = set(target_vals[~target_mask]) + + n = len(values) + n_t = len(target_vals) + if n > 10_000: + n_alloc = 10_000 + else: + n_alloc = n + + result = np.empty(n_alloc, dtype=np.intp) + missing = np.empty(n_t, dtype=np.intp) + + # map each starget to its position in the index + if ( + stargets and + len(stargets) < 5 and + not np.any(target_mask) and + self.is_monotonic_increasing + ): + # if there are few enough stargets and the index is monotonically + # increasing, then use binary search for each starget + for starget in stargets: + start = values.searchsorted(starget, side="left") + end = values.searchsorted(starget, side="right") + if start != end: + d[starget] = list(range(start, end)) + + stargets = set() + + if stargets: + # otherwise, map by iterating through all items in the index + + na_pos = [] + + for i in range(n): + val = values[i] + + if mask[i]: + na_pos.append(i) + + else: + if val in stargets: + if val not in d: + d[val] = [] + d[val].append(i) + + for i in range(n_t): + val = target_vals[i] + + if target_mask[i]: + if na_pos: + for na_idx in na_pos: + # realloc if needed + if count >= n_alloc: + n_alloc += 10_000 + result = np.resize(result, n_alloc) + + result[count] = na_idx + count += 1 + continue + + elif val in d: + # found + key = val + + for j in d[key]: + + # realloc if needed + if count >= n_alloc: + n_alloc += 10_000 + result = np.resize(result, n_alloc) + + result[count] = j + count += 1 + continue + + # value not found + if count >= n_alloc: + n_alloc += 10_000 + result = np.resize(result, n_alloc) + result[count] = -1 + count += 1 + missing[count_missing] = i + count_missing += 1 + + return result[0:count], missing[0:count_missing] diff --git a/pandas/_libs/index_class_helper.pxi.in b/pandas/_libs/index_class_helper.pxi.in index b9c02ba64f69c..bf3d88edd9386 100644 --- a/pandas/_libs/index_class_helper.pxi.in +++ b/pandas/_libs/index_class_helper.pxi.in @@ -24,17 +24,29 @@ dtypes = [('Float64', 'float64'), ('Complex64', 'complex64'), ('Complex128', 'complex128'), ] + +engines = [('', 'IndexEngine'), ('Masked', 'MaskedIndexEngine')] + }} {{for name, dtype in dtypes}} +{{for 
prefix, engine in engines}} -cdef class {{name}}Engine(IndexEngine): +cdef class {{prefix}}{{name}}Engine({{engine}}): cdef _make_hash_table(self, Py_ssize_t n): + {{if engine == 'MaskedIndexEngine'}} + return _hash.{{name}}HashTable(n, uses_mask=True) + {{else}} return _hash.{{name}}HashTable(n) + {{endif}} cdef _check_type(self, object val): + {{if engine == 'MaskedIndexEngine'}} + if val is C_NA: + return val + {{endif}} {{if name not in {'Float64', 'Float32', 'Complex64', 'Complex128'} }} if not util.is_integer_object(val): if util.is_float_object(val): @@ -61,5 +73,6 @@ cdef class {{name}}Engine(IndexEngine): {{endif}} return val +{{endfor}} {{endfor}} diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 2948bb81d0b6a..df98df1a11557 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -206,6 +206,22 @@ _dtype_obj = np.dtype("object") +_masked_engines = { + "Complex128": libindex.MaskedComplex128Engine, + "Complex64": libindex.MaskedComplex64Engine, + "Float64": libindex.MaskedFloat64Engine, + "Float32": libindex.MaskedFloat32Engine, + "UInt64": libindex.MaskedUInt64Engine, + "UInt32": libindex.MaskedUInt32Engine, + "UInt16": libindex.MaskedUInt16Engine, + "UInt8": libindex.MaskedUInt8Engine, + "Int64": libindex.MaskedInt64Engine, + "Int32": libindex.MaskedInt32Engine, + "Int16": libindex.MaskedInt16Engine, + "Int8": libindex.MaskedInt8Engine, + "boolean": libindex.MaskedBoolEngine, +} + def _maybe_return_indexers(meth: F) -> F: """ @@ -755,14 +771,15 @@ def _cleanup(self) -> None: @cache_readonly def _engine( self, - ) -> libindex.IndexEngine | libindex.ExtensionEngine: + ) -> libindex.IndexEngine | libindex.ExtensionEngine | libindex.MaskedIndexEngine: # For base class (object dtype) we get ObjectEngine target_values = self._get_engine_target() - if ( - isinstance(target_values, ExtensionArray) - and self._engine_type is libindex.ObjectEngine - ): - return libindex.ExtensionEngine(target_values) + if isinstance(target_values, ExtensionArray): + + if isinstance(target_values, BaseMaskedArray): + return _masked_engines[target_values.dtype.name](target_values) + elif self._engine_type is libindex.ObjectEngine: + return libindex.ExtensionEngine(target_values) target_values = cast(np.ndarray, target_values) # to avoid a reference cycle, bind `target_values` to a local variable, so @@ -4836,7 +4853,11 @@ def _get_engine_target(self) -> ArrayLike: if isinstance(vals, StringArray): # GH#45652 much more performant than ExtensionEngine return vals._ndarray - if type(self) is Index and isinstance(self._values, ExtensionArray): + if ( + type(self) is Index + and isinstance(self._values, ExtensionArray) + and not isinstance(self._values, BaseMaskedArray) + ): # TODO(ExtensionIndex): remove special-case, just use self._values return self._values.astype(object) return vals diff --git a/pandas/tests/indexes/numeric/test_indexing.py b/pandas/tests/indexes/numeric/test_indexing.py index 5c4596b0d9503..eec1df8b44f33 100644 --- a/pandas/tests/indexes/numeric/test_indexing.py +++ b/pandas/tests/indexes/numeric/test_indexing.py @@ -4,12 +4,14 @@ from pandas.errors import InvalidIndexError from pandas import ( + NA, Index, RangeIndex, Series, Timestamp, ) import pandas._testing as tm +from pandas.core.arrays import FloatingArray @pytest.fixture @@ -314,6 +316,76 @@ def test_get_indexer_uint64(self, index_large): expected = np.array([0, 1, 1, 2, 3, 4, -1, -1, -1, -1], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected) + 
@pytest.mark.parametrize("val, val2", [(4, 5), (4, 4), (4, NA), (NA, NA)]) + def test_get_loc_masked(self, val, val2, any_numeric_ea_dtype): + # GH#39133 + idx = Index([1, 2, 3, val, val2], dtype=any_numeric_ea_dtype) + result = idx.get_loc(2) + assert result == 1 + + with pytest.raises(KeyError, match="9"): + idx.get_loc(9) + + def test_get_loc_masked_na(self, any_numeric_ea_dtype): + # GH#39133 + idx = Index([1, 2, NA], dtype=any_numeric_ea_dtype) + result = idx.get_loc(NA) + assert result == 2 + + idx = Index([1, 2, NA, NA], dtype=any_numeric_ea_dtype) + result = idx.get_loc(NA) + tm.assert_numpy_array_equal(result, np.array([False, False, True, True])) + + idx = Index([1, 2, 3], dtype=any_numeric_ea_dtype) + with pytest.raises(KeyError, match="NA"): + idx.get_loc(NA) + + def test_get_loc_masked_na_and_nan(self): + # GH#39133 + idx = Index( + FloatingArray( + np.array([1, 2, 1, np.nan]), mask=np.array([False, False, True, False]) + ) + ) + result = idx.get_loc(NA) + assert result == 2 + result = idx.get_loc(np.nan) + assert result == 3 + + idx = Index( + FloatingArray(np.array([1, 2, 1.0]), mask=np.array([False, False, True])) + ) + result = idx.get_loc(NA) + assert result == 2 + with pytest.raises(KeyError, match="nan"): + idx.get_loc(np.nan) + + idx = Index( + FloatingArray( + np.array([1, 2, np.nan]), mask=np.array([False, False, False]) + ) + ) + result = idx.get_loc(np.nan) + assert result == 2 + with pytest.raises(KeyError, match="NA"): + idx.get_loc(NA) + + @pytest.mark.parametrize("val", [4, 2]) + def test_get_indexer_masked_na(self, any_numeric_ea_dtype, val): + # GH#39133 + idx = Index([1, 2, NA, 3, val], dtype=any_numeric_ea_dtype) + result = idx.get_indexer_for([1, NA, 5]) + expected = np.array([0, 2, -1]) + tm.assert_numpy_array_equal(result, expected, check_dtype=False) + + def test_get_indexer_masked_na_boolean(self): + # GH#39133 + idx = Index([True, False, NA], dtype="boolean") + result = idx.get_loc(False) + assert result == 1 + result = idx.get_loc(NA) + assert result == 2 + class TestWhere: @pytest.mark.parametrize(
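
For context, the user-facing behaviour exercised by the new tests above can be reproduced with public API only. This is a minimal illustrative sketch, assuming a pandas build that includes the masked engines from this diff; it is not part of the patch:

# Illustrative only -- mirrors test_get_loc_masked_na and
# test_get_indexer_masked_na above.
import numpy as np
import pandas as pd

idx = pd.Index([1, 2, pd.NA, 3], dtype="Int64")

# get_loc resolves pd.NA through the masked engine
assert idx.get_loc(pd.NA) == 2
assert idx.get_loc(2) == 1

# pd.NA in the targets maps to the NA position; misses map to -1
indexer = idx.get_indexer_for([1, pd.NA, 5])
np.testing.assert_array_equal(indexer, np.array([0, 2, -1], dtype=np.intp))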
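
The core loop of MaskedIndexEngine.get_indexer_non_unique in pandas/_libs/index.pyx above is easier to follow without the buffer reallocation and the monotonic searchsorted fast path. The pure-Python model below is a simplified sketch under those assumptions; the helper name is hypothetical and the Cython code remains the authoritative implementation:

# Simplified model of the Cython loop: an NA target maps to every masked
# slot of the index, other targets map to every unmasked position holding
# that value, and anything unmatched becomes -1 and is reported as missing.
import numpy as np

def masked_get_indexer_non_unique(values, mask, target_vals, target_mask):
    na_pos = [i for i in range(len(values)) if mask[i]]
    positions = {}
    for i in range(len(values)):
        if not mask[i]:
            positions.setdefault(values[i], []).append(i)

    result, missing = [], []
    for i in range(len(target_vals)):
        if target_mask[i] and na_pos:
            result.extend(na_pos)
        elif not target_mask[i] and target_vals[i] in positions:
            result.extend(positions[target_vals[i]])
        else:
            result.append(-1)
            missing.append(i)
    return np.array(result, dtype=np.intp), np.array(missing, dtype=np.intp)

res, miss = masked_get_indexer_non_unique(
    np.array([1, 2, 1]), np.array([False, False, True]),
    np.array([1, 0, 0]), np.array([False, True, False]),
)
# res -> [0, 2, -1]; miss -> [2]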