Skip to content

Commit f79b072

Browse files
committed
REF: move implementation to ArrowStringArrayMixin
1 parent 272d7ba commit f79b072

File tree

3 files changed

+28
-24
lines changed

3 files changed

+28
-24
lines changed

pandas/core/arrays/_arrow_string_mixins.py

+27
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@ class ArrowStringArrayMixin:
2323
def __init__(self, *args, **kwargs) -> None:
2424
raise NotImplementedError
2525

26+
def _convert_int_dtype(self, result):
27+
# Convert an int-dtype arrow result to an appropriate output type.
28+
raise NotImplementedError
29+
2630
def _str_pad(
2731
self,
2832
width: int,
@@ -89,3 +93,26 @@ def _str_removesuffix(self, suffix: str):
8993
removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix))
9094
result = pc.if_else(ends_with, removed, self._pa_array)
9195
return type(self)(result)
96+
97+
def _str_find(self, sub: str, start: int = 0, end: int | None = None):
    """
    Search every element of ``self._pa_array`` for ``sub``.

    Parameters
    ----------
    sub : str
        Substring to look for.
    start : int, default 0
        Left bound of the search window. ``None`` is treated like 0.
    end : int or None, default None
        Right bound of the search window; ``None`` means no right bound.

    Returns
    -------
    Whatever ``self._convert_int_dtype`` produces for the computed result.

    Notes
    -----
    Three code paths are used:

    * no effective bounds: a single ``pc.find_substring`` over the array;
    * empty ``sub`` with bounds: element-wise Python ``str.find`` (GH#56792);
    * otherwise: slice each element, search inside the slice, then shift
      matches back by the start offset, keeping -1 where nothing was found.
    """
    values = self._pa_array

    has_no_bounds = (start == 0 or start is None) and end is None
    if has_no_bounds:
        return self._convert_int_dtype(pc.find_substring(values, sub))

    if sub == "":
        # GH#56792
        per_element = self._apply_elementwise(
            lambda val: val.find(sub, start, end)
        )
        return self._convert_int_dtype(pa.chunked_array(per_element))

    # Work out how far positions found inside the slice must be shifted so
    # they refer to the original (unsliced) element.
    if start is None:
        start = 0
        shift = 0
    elif start < 0:
        # Negative start counts from the end of each element; never let the
        # resulting offset drop below zero.
        shift = pc.add(start, pc.utf8_length(values))
        shift = pc.if_else(pc.less(shift, 0), 0, shift)
    else:
        shift = start

    window = pc.utf8_slice_codeunits(values, start, stop=end)
    positions = pc.find_substring(window, sub)
    not_found_marker = pa.scalar(-1, type=positions.type)
    was_found = pc.not_equal(positions, not_found_marker)
    shifted = pc.add(positions, shift)
    return self._convert_int_dtype(pc.if_else(was_found, shifted, -1))

pandas/core/arrays/arrow/array.py

-23
Original file line numberDiff line numberDiff line change
@@ -2390,29 +2390,6 @@ def _str_fullmatch(
23902390
def _convert_int_dtype(self, result):
23912391
return type(self)(result)
23922392

2393-
def _str_find(self, sub: str, start: int = 0, end: int | None = None) -> Self:
    """
    Search every element of ``self._pa_array`` for ``sub``.

    Parameters
    ----------
    sub : str
        Substring to look for.
    start : int, default 0
        Left bound of the search window. ``None`` is treated like 0.
    end : int or None, default None
        Right bound of the search window; ``None`` means no right bound.

    Returns
    -------
    The computed result passed through ``self._convert_int_dtype``.
    """
    arr = self._pa_array

    if (start == 0 or start is None) and end is None:
        # No effective bounds: search the whole array in one call.
        return self._convert_int_dtype(pc.find_substring(arr, sub))

    if sub == "":
        # GH 56792
        found_list = self._apply_elementwise(
            lambda val: val.find(sub, start, end)
        )
        return self._convert_int_dtype(pa.chunked_array(found_list))

    # Offset needed to map positions inside the slice back onto the
    # original element.
    if start is None:
        start = 0
        offset = 0
    elif start < 0:
        # Negative start is relative to each element's length; clamp at 0.
        offset = pc.add(start, pc.utf8_length(arr))
        offset = pc.if_else(pc.less(offset, 0), 0, offset)
    else:
        offset = start

    sliced = pc.utf8_slice_codeunits(arr, start, stop=end)
    hits = pc.find_substring(sliced, sub)
    is_hit = pc.not_equal(hits, pa.scalar(-1, type=hits.type))
    relocated = pc.add(hits, offset)
    # Keep -1 where the substring was not found inside the slice.
    return self._convert_int_dtype(pc.if_else(is_hit, relocated, -1))
2415-
24162393
def _str_join(self, sep: str) -> Self:
24172394
if pa.types.is_string(self._pa_array.type) or pa.types.is_large_string(
24182395
self._pa_array.type

pandas/core/arrays/string_arrow.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -482,7 +482,7 @@ def _str_find(self, sub: str, start: int = 0, end: int | None = None):
482482
):
483483
# https://github.com/pandas-dev/pandas/pull/59562/files#r1725688888
484484
return super()._str_find(sub, start, end)
485-
return ArrowExtensionArray._str_find(self, sub, start, end)
485+
return ArrowStringArrayMixin._str_find(self, sub, start, end)
486486

487487
def _str_get_dummies(self, sep: str = "|"):
488488
dummies_pa, labels = ArrowExtensionArray(self._pa_array)._str_get_dummies(sep)

0 commit comments

Comments
 (0)