Skip to content

BUG: read_csv with engine pyarrow parsing multiple date columns #50056

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
May 18, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -745,7 +745,7 @@ I/O
- Bug in :func:`DataFrame.to_string` with ``header=False`` that printed the index name on the same line as the first row of the data (:issue:`49230`)
- Fixed memory leak which stemmed from the initialization of the internal JSON module (:issue:`49222`)
- Fixed issue where :func:`json_normalize` would incorrectly remove leading characters from column names that matched the ``sep`` argument (:issue:`49861`)
-
- Fixed issue where :func:`read_csv` would error when ``parse_dates`` was set to a list or dictionary with ``engine="pyarrow"`` (:issue:`47961`)

Period
^^^^^^
Expand Down
2 changes: 1 addition & 1 deletion pandas/io/parsers/arrow_parser_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
multi_index_named = False
frame.columns = self.names
# we only need the frame not the names
frame.columns, frame = self._do_date_conversions(frame.columns, frame)
_, frame = self._do_date_conversions(frame.columns, frame)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this give us back the frame with the column names already changed?

Copy link
Member Author

@lithomas1 lithomas1 Dec 5, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes — _do_date_conversions mutates the column names in the data_dict/frame as well.

I'm actually not sure why names is returned from this function at all; it may date from before dicts preserved insertion order.
EDIT: It's probably related to building a MultiIndex from the columns for the other engines. Since _do_date_conversions can always fix the frame directly, the returned names aren't needed for the pyarrow engine.

if self.index_col is not None:
for i, item in enumerate(self.index_col):
if is_integer(item):
Expand Down
15 changes: 9 additions & 6 deletions pandas/io/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
from enum import Enum
import itertools
from typing import (
TYPE_CHECKING,
Any,
Callable,
DefaultDict,
Expand Down Expand Up @@ -71,7 +70,11 @@
)
from pandas.core.dtypes.missing import isna

from pandas import StringDtype
from pandas import (
DataFrame,
StringDtype,
concat,
)
from pandas.core import algorithms
from pandas.core.arrays import (
ArrowExtensionArray,
Expand All @@ -89,9 +92,6 @@
from pandas.core.series import Series
from pandas.core.tools import datetimes as tools

if TYPE_CHECKING:
from pandas import DataFrame


class ParserBase:
class BadLineHandleMethod(Enum):
Expand Down Expand Up @@ -1264,7 +1264,10 @@ def _isindex(colspec):
new_cols.append(new_name)
date_cols.update(old_names)

data_dict.update(new_data)
if isinstance(data_dict, DataFrame):
data_dict = concat([DataFrame(new_data), data_dict], axis=1)
else:
data_dict.update(new_data)
new_cols.extend(columns)

if not keep_date_col:
Expand Down
31 changes: 19 additions & 12 deletions pandas/tests/io/parser/test_parse_dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,9 +138,8 @@ def test_separator_date_conflict(all_parsers):
tm.assert_frame_equal(df, expected)


@xfail_pyarrow
@pytest.mark.parametrize("keep_date_col", [True, False])
def test_multiple_date_col_custom(all_parsers, keep_date_col):
def test_multiple_date_col_custom(all_parsers, keep_date_col, request):
data = """\
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
Expand All @@ -151,6 +150,14 @@ def test_multiple_date_col_custom(all_parsers, keep_date_col):
"""
parser = all_parsers

if keep_date_col and parser.engine == "pyarrow":
            # For this test to pass, we would need to disable auto-inference on the
            # date columns in parse_dates, but there is currently no way to do so.
mark = pytest.mark.xfail(
reason="pyarrow doesn't support disabling auto-inference on column numbers."
)
request.node.add_marker(mark)

def date_parser(*date_cols):
"""
Test date parser.
Expand Down Expand Up @@ -293,9 +300,8 @@ def test_concat_date_col_fail(container, dim):
parsing.concat_date_cols(date_cols)


@xfail_pyarrow
@pytest.mark.parametrize("keep_date_col", [True, False])
def test_multiple_date_col(all_parsers, keep_date_col):
def test_multiple_date_col(all_parsers, keep_date_col, request):
data = """\
KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000
KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000
Expand All @@ -305,6 +311,15 @@ def test_multiple_date_col(all_parsers, keep_date_col):
KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000
"""
parser = all_parsers

if keep_date_col and parser.engine == "pyarrow":
        # For this test to pass, we would need to disable auto-inference on the
        # date columns in parse_dates, but there is currently no way to do so.
mark = pytest.mark.xfail(
reason="pyarrow doesn't support disabling auto-inference on column numbers."
)
request.node.add_marker(mark)

kwds = {
"header": None,
"parse_dates": [[1, 2], [1, 3]],
Expand Down Expand Up @@ -461,7 +476,6 @@ def test_date_col_as_index_col(all_parsers):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_multiple_date_cols_int_cast(all_parsers):
data = (
"KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
Expand Down Expand Up @@ -1132,7 +1146,6 @@ def test_multiple_date_cols_chunked(all_parsers):
tm.assert_frame_equal(chunks[2], expected[4:])


@xfail_pyarrow
def test_multiple_date_col_named_index_compat(all_parsers):
parser = all_parsers
data = """\
Expand All @@ -1156,7 +1169,6 @@ def test_multiple_date_col_named_index_compat(all_parsers):
tm.assert_frame_equal(with_indices, with_names)


@xfail_pyarrow
def test_multiple_date_col_multiple_index_compat(all_parsers):
parser = all_parsers
data = """\
Expand Down Expand Up @@ -1301,7 +1313,6 @@ def test_parse_date_time_multi_level_column_name(all_parsers):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
@pytest.mark.parametrize(
"data,kwargs,expected",
[
Expand Down Expand Up @@ -1385,7 +1396,6 @@ def test_parse_date_time(all_parsers, data, kwargs, expected):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_parse_date_fields(all_parsers):
parser = all_parsers
data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11."
Expand All @@ -1403,7 +1413,6 @@ def test_parse_date_fields(all_parsers):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_parse_date_all_fields(all_parsers):
parser = all_parsers
data = """\
Expand All @@ -1427,7 +1436,6 @@ def test_parse_date_all_fields(all_parsers):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_datetime_fractional_seconds(all_parsers):
parser = all_parsers
data = """\
Expand All @@ -1451,7 +1459,6 @@ def test_datetime_fractional_seconds(all_parsers):
tm.assert_frame_equal(result, expected)


@xfail_pyarrow
def test_generic(all_parsers):
parser = all_parsers
data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11."
Expand Down