Skip to content

Commit 62c873b

Browse files
committed
ENH: Add on_bad_lines for pyarrow (SQUASHED)
1 parent 9d70a49 commit 62c873b

File tree

3 files changed

+43
-6
lines changed

3 files changed

+43
-6
lines changed

pandas/io/parsers/arrow_parser_wrapper.py

+26
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
11
from __future__ import annotations
22

33
from typing import TYPE_CHECKING
4+
import warnings
45

56
from pandas._config import using_pyarrow_string_dtype
67

78
from pandas._libs import lib
89
from pandas.compat._optional import import_optional_dependency
10+
from pandas.errors import ParserWarning
11+
from pandas.util._exceptions import find_stack_level
912

1013
from pandas.core.dtypes.inference import is_integer
1114

@@ -85,6 +88,29 @@ def _get_pyarrow_options(self) -> None:
8588
and option_name
8689
in ("delimiter", "quote_char", "escape_char", "ignore_empty_lines")
8790
}
91+
92+
if "on_bad_lines" in self.kwds:
93+
if callable(self.kwds["on_bad_lines"]):
94+
self.parse_options["invalid_row_handler"] = self.kwds["on_bad_lines"]
95+
elif self.kwds["on_bad_lines"] == ParserBase.BadLineHandleMethod.ERROR:
96+
self.parse_options[
97+
"invalid_row_handler"
98+
] = None # PyArrow raises an exception by default
99+
elif self.kwds["on_bad_lines"] == ParserBase.BadLineHandleMethod.WARN:
100+
101+
def handle_warning(invalid_row):
102+
warnings.warn(
103+
f"Expected {invalid_row.expected_columns} columns, but found "
104+
f"{invalid_row.actual_columns}: {invalid_row.text}",
105+
ParserWarning,
106+
stacklevel=find_stack_level(),
107+
)
108+
return "skip"
109+
110+
self.parse_options["invalid_row_handler"] = handle_warning
111+
elif self.kwds["on_bad_lines"] == ParserBase.BadLineHandleMethod.SKIP:
112+
self.parse_options["invalid_row_handler"] = lambda _: "skip"
113+
88114
self.convert_options = {
89115
option_name: option_value
90116
for option_name, option_value in self.kwds.items()

pandas/io/parsers/readers.py

+10-3
Original file line numberDiff line numberDiff line change
@@ -390,6 +390,13 @@
390390
expected, a ``ParserWarning`` will be emitted while dropping extra elements.
391391
Only supported when ``engine='python'``
392392
393+
.. versionchanged:: 1.4.1
394+
395+
- Callable, function with signature
396+
as described in `pyarrow documentation
397+
<https://arrow.apache.org/docs/python/generated/pyarrow.csv.ParseOptions.html
398+
#pyarrow.csv.ParseOptions.invalid_row_handler>`_ when ``engine='pyarrow'``
399+
393400
delim_whitespace : bool, default False
394401
Specifies whether or not whitespace (e.g. ``' '`` or ``'\\t'``) will be
395402
used as the ``sep`` delimiter. Equivalent to setting ``sep='\\s+'``. If this option
@@ -483,7 +490,6 @@ class _Fwf_Defaults(TypedDict):
483490
"thousands",
484491
"memory_map",
485492
"dialect",
486-
"on_bad_lines",
487493
"delim_whitespace",
488494
"quoting",
489495
"lineterminator",
@@ -2038,9 +2044,10 @@ def _refine_defaults_read(
20382044
elif on_bad_lines == "skip":
20392045
kwds["on_bad_lines"] = ParserBase.BadLineHandleMethod.SKIP
20402046
elif callable(on_bad_lines):
2041-
if engine != "python":
2047+
if engine not in ["python", "pyarrow"]:
20422048
raise ValueError(
2043-
"on_bad_line can only be a callable function if engine='python'"
2049+
"on_bad_line can only be a callable function "
2050+
"if engine='python' or 'pyarrow'"
20442051
)
20452052
kwds["on_bad_lines"] = on_bad_lines
20462053
else:

pandas/tests/io/parser/test_unsupported.py

+7-3
Original file line numberDiff line numberDiff line change
@@ -156,13 +156,17 @@ def test_pyarrow_engine(self):
156156
with pytest.raises(ValueError, match=msg):
157157
read_csv(StringIO(data), engine="pyarrow", **kwargs)
158158

159-
def test_on_bad_lines_callable_python_only(self, all_parsers):
159+
def test_on_bad_lines_callable_python_or_pyarrow(self, all_parsers):
160160
# GH 5686
161+
# GH 54643
161162
sio = StringIO("a,b\n1,2")
162163
bad_lines_func = lambda x: x
163164
parser = all_parsers
164-
if all_parsers.engine != "python":
165-
msg = "on_bad_line can only be a callable function if engine='python'"
165+
if all_parsers.engine not in ["python", "pyarrow"]:
166+
msg = (
167+
"on_bad_line can only be a callable "
168+
"function if engine='python' or 'pyarrow'"
169+
)
166170
with pytest.raises(ValueError, match=msg):
167171
parser.read_csv(sio, on_bad_lines=bad_lines_func)
168172
else:

0 commit comments

Comments
 (0)