Skip to content

Commit 2663213

Browse files
Backport PR #52548 on branch 2.0.x (PERF: Improve performance for arrow engine and dtype_backend=pyarrow for datetime conversion) (#52592)
Backport PR #52548: PERF: Improve performance for arrow engine and dtype_backend=pyarrow for datetime conversion Co-authored-by: Patrick Hoefler <[email protected]>
1 parent 33dfa6c commit 2663213

File tree

3 files changed

+29
-0
lines changed

3 files changed

+29
-0
lines changed

asv_bench/benchmarks/io/csv.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -555,4 +555,19 @@ def time_read_csv_index_col(self):
555555
read_csv(self.StringIO_input, index_col="a")
556556

557557

558+
class ReadCSVDatePyarrowEngine(StringIORewind):
559+
def setup(self):
560+
count_elem = 100_000
561+
data = "a\n" + "2019-12-31\n" * count_elem
562+
self.StringIO_input = StringIO(data)
563+
564+
def time_read_csv_index_col(self):
565+
read_csv(
566+
self.StringIO_input,
567+
parse_dates=["a"],
568+
engine="pyarrow",
569+
dtype_backend="pyarrow",
570+
)
571+
572+
558573
from ..pandas_vb_common import setup # noqa: F401 isort:skip

doc/source/whatsnew/v2.0.1.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ Bug fixes
2828
- Fixed segfault in :meth:`Series.to_numpy` with ``null[pyarrow]`` dtype (:issue:`52443`)
2929
- Bug in :func:`pandas.testing.assert_series_equal` where ``check_dtype=False`` would still raise for datetime or timedelta types with different resolutions (:issue:`52449`)
3030
- Bug in :meth:`ArrowDtype.__from_arrow__` not respecting if dtype is explicitly given (:issue:`52533`)
31+
- Bug in :func:`read_csv` casting PyArrow datetimes to NumPy when ``dtype_backend="pyarrow"`` and ``parse_dates`` is set, causing a performance bottleneck in the process (:issue:`52546`)
3132

3233
.. ---------------------------------------------------------------------------
3334
.. _whatsnew_201.other:

pandas/io/parsers/base_parser.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@
6767
from pandas.core.dtypes.missing import isna
6868

6969
from pandas import (
70+
ArrowDtype,
7071
DatetimeIndex,
7172
StringDtype,
7273
)
@@ -866,6 +867,7 @@ def _do_date_conversions(
866867
self.index_names,
867868
names,
868869
keep_date_col=self.keep_date_col,
870+
dtype_backend=self.dtype_backend,
869871
)
870872

871873
return names, data
@@ -1202,6 +1204,7 @@ def _process_date_conversion(
12021204
index_names,
12031205
columns,
12041206
keep_date_col: bool = False,
1207+
dtype_backend=lib.no_default,
12051208
):
12061209
def _isindex(colspec):
12071210
return (isinstance(index_col, list) and colspec in index_col) or (
@@ -1227,6 +1230,16 @@ def _isindex(colspec):
12271230
colspec = orig_names[colspec]
12281231
if _isindex(colspec):
12291232
continue
1233+
elif dtype_backend == "pyarrow":
1234+
import pyarrow as pa
1235+
1236+
dtype = data_dict[colspec].dtype
1237+
if isinstance(dtype, ArrowDtype) and (
1238+
pa.types.is_timestamp(dtype.pyarrow_dtype)
1239+
or pa.types.is_date(dtype.pyarrow_dtype)
1240+
):
1241+
continue
1242+
12301243
# Pyarrow engine returns Series which we need to convert to
12311244
# numpy array before converter; it's a no-op for other parsers
12321245
data_dict[colspec] = converter(

0 commit comments

Comments
 (0)