diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index e1ac9e3309de7..67f5484044cd5 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -386,6 +386,7 @@ I/O - :meth:`DataFrame.to_orc` now raising ``ValueError`` when non-default :class:`Index` is given (:issue:`51828`) - :meth:`DataFrame.to_sql` now raising ``ValueError`` when the name param is left empty while using SQLAlchemy to connect (:issue:`52675`) - Bug in :func:`json_normalize`, fix json_normalize cannot parse metadata fields list type (:issue:`37782`) +- Bug in :func:`read_csv` where it would error when ``parse_dates`` was set to a list or dictionary with ``engine="pyarrow"`` (:issue:`47961`) - Bug in :func:`read_hdf` not properly closing store after a ``IndexError`` is raised (:issue:`52781`) - Bug in :func:`read_html`, style elements were read into DataFrames (:issue:`52197`) - Bug in :func:`read_html`, tail texts were removed together with elements containing ``display:none`` style (:issue:`51629`) diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index e106db224c3dc..d34b3ae1372fd 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -58,6 +58,21 @@ def _get_pyarrow_options(self) -> None: if pandas_name in self.kwds and self.kwds.get(pandas_name) is not None: self.kwds[pyarrow_name] = self.kwds.pop(pandas_name) + # Date format handling + # If we get a string, we need to convert it into a list for pyarrow + # If we get a dict, we want to parse those separately + date_format = self.date_format + if isinstance(date_format, str): + date_format = [date_format] + else: + # In case of dict, we don't want to propagate through, so + # just set to pyarrow default of None + + # Ideally, in the future we disable pyarrow dtype inference (read in as string) + # to prevent misreads.
+ date_format = None + self.kwds["timestamp_parsers"] = date_format + self.parse_options = { option_name: option_value for option_name, option_value in self.kwds.items() @@ -76,6 +91,7 @@ def _get_pyarrow_options(self) -> None: "true_values", "false_values", "decimal_point", + "timestamp_parsers", ) } self.convert_options["strings_can_be_null"] = "" in self.kwds["null_values"] @@ -116,7 +132,7 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame: multi_index_named = False frame.columns = self.names # we only need the frame not the names - frame.columns, frame = self._do_date_conversions(frame.columns, frame) + _, frame = self._do_date_conversions(frame.columns, frame) if self.index_col is not None: index_to_set = self.index_col.copy() for i, item in enumerate(self.index_col): diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 564339cefa3aa..3208286489fbe 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -61,8 +61,10 @@ from pandas import ( ArrowDtype, + DataFrame, DatetimeIndex, StringDtype, + concat, ) from pandas.core import algorithms from pandas.core.arrays import ( @@ -92,8 +94,6 @@ Scalar, ) - from pandas import DataFrame - class ParserBase: class BadLineHandleMethod(Enum): @@ -1304,7 +1304,10 @@ def _isindex(colspec): new_cols.append(new_name) date_cols.update(old_names) - data_dict.update(new_data) + if isinstance(data_dict, DataFrame): + data_dict = concat([DataFrame(new_data), data_dict], axis=1, copy=False) + else: + data_dict.update(new_data) new_cols.extend(columns) if not keep_date_col: diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 81de4f13de81d..b354f7d9da94d 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -139,9 +139,8 @@ def test_separator_date_conflict(all_parsers): tm.assert_frame_equal(df, expected) -@xfail_pyarrow 
@pytest.mark.parametrize("keep_date_col", [True, False]) -def test_multiple_date_col_custom(all_parsers, keep_date_col): +def test_multiple_date_col_custom(all_parsers, keep_date_col, request): data = """\ KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 @@ -152,6 +151,14 @@ def test_multiple_date_col_custom(all_parsers, keep_date_col): """ parser = all_parsers + if keep_date_col and parser.engine == "pyarrow": + # For this to pass, we need to disable auto-inference on the date columns + # in parse_dates. We have no way of doing this though + mark = pytest.mark.xfail( + reason="pyarrow doesn't support disabling auto-inference on column numbers." + ) + request.node.add_marker(mark) + def date_parser(*date_cols): """ Test date parser. @@ -301,9 +308,8 @@ def test_concat_date_col_fail(container, dim): parsing.concat_date_cols(date_cols) -@xfail_pyarrow @pytest.mark.parametrize("keep_date_col", [True, False]) -def test_multiple_date_col(all_parsers, keep_date_col): +def test_multiple_date_col(all_parsers, keep_date_col, request): data = """\ KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 @@ -313,6 +319,15 @@ def test_multiple_date_col(all_parsers, keep_date_col): KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 """ parser = all_parsers + + if keep_date_col and parser.engine == "pyarrow": + # For this to pass, we need to disable auto-inference on the date columns + # in parse_dates. We have no way of doing this though + mark = pytest.mark.xfail( + reason="pyarrow doesn't support disabling auto-inference on column numbers." 
+ ) + request.node.add_marker(mark) + kwds = { "header": None, "parse_dates": [[1, 2], [1, 3]], @@ -469,7 +484,6 @@ def test_date_col_as_index_col(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow def test_multiple_date_cols_int_cast(all_parsers): data = ( "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" @@ -530,7 +544,6 @@ def test_multiple_date_cols_int_cast(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow def test_multiple_date_col_timestamp_parse(all_parsers): parser = all_parsers data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25 @@ -1168,7 +1181,6 @@ def test_multiple_date_cols_chunked(all_parsers): tm.assert_frame_equal(chunks[2], expected[4:]) -@xfail_pyarrow def test_multiple_date_col_named_index_compat(all_parsers): parser = all_parsers data = """\ @@ -1192,7 +1204,6 @@ def test_multiple_date_col_named_index_compat(all_parsers): tm.assert_frame_equal(with_indices, with_names) -@xfail_pyarrow def test_multiple_date_col_multiple_index_compat(all_parsers): parser = all_parsers data = """\ @@ -1408,7 +1419,6 @@ def test_parse_date_time_multi_level_column_name(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow @pytest.mark.parametrize( "data,kwargs,expected", [ @@ -1498,9 +1508,6 @@ def test_parse_date_time(all_parsers, data, kwargs, expected): tm.assert_frame_equal(result, expected) -@xfail_pyarrow -# From date_parser fallback behavior -@pytest.mark.filterwarnings("ignore:elementwise comparison:FutureWarning") def test_parse_date_fields(all_parsers): parser = all_parsers data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11." 
@@ -1510,7 +1517,7 @@ def test_parse_date_fields(all_parsers): StringIO(data), header=0, parse_dates={"ymd": [0, 1, 2]}, - date_parser=pd.to_datetime, + date_parser=lambda x: x, ) expected = DataFrame( @@ -1520,7 +1527,6 @@ def test_parse_date_fields(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow @pytest.mark.parametrize( ("key", "value", "warn"), [ @@ -1557,7 +1563,6 @@ def test_parse_date_all_fields(all_parsers, key, value, warn): tm.assert_frame_equal(result, expected) -@xfail_pyarrow @pytest.mark.parametrize( ("key", "value", "warn"), [ @@ -1594,7 +1599,6 @@ def test_datetime_fractional_seconds(all_parsers, key, value, warn): tm.assert_frame_equal(result, expected) -@xfail_pyarrow def test_generic(all_parsers): parser = all_parsers data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11."