diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index dae6107db4d92..ff0ccffced0f3 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -445,16 +445,6 @@ def setup(self, engine): data = data.format(*two_cols) self.StringIO_input = StringIO(data) - def time_multiple_date(self, engine): - read_csv( - self.data(self.StringIO_input), - engine=engine, - sep=",", - header=None, - names=list(string.digits[:9]), - parse_dates=[[1, 2], [1, 3]], - ) - def time_baseline(self, engine): read_csv( self.data(self.StringIO_input), diff --git a/asv_bench/benchmarks/io/parsers.py b/asv_bench/benchmarks/io/parsers.py index 1078837a8e395..d3fd5075a4707 100644 --- a/asv_bench/benchmarks/io/parsers.py +++ b/asv_bench/benchmarks/io/parsers.py @@ -1,10 +1,5 @@ -import numpy as np - try: - from pandas._libs.tslibs.parsing import ( - _does_string_look_like_datetime, - concat_date_cols, - ) + from pandas._libs.tslibs.parsing import _does_string_look_like_datetime except ImportError: # Avoid whole benchmark suite import failure on asv (currently 0.4) pass @@ -20,21 +15,3 @@ def setup(self, value): def time_check_datetimes(self, value): for obj in self.objects: _does_string_look_like_datetime(obj) - - -class ConcatDateCols: - params = ([1234567890, "AAAA"], [1, 2]) - param_names = ["value", "dim"] - - def setup(self, value, dim): - count_elem = 10000 - if dim == 1: - self.object = (np.array([value] * count_elem),) - if dim == 2: - self.object = ( - np.array([value] * count_elem), - np.array([value] * count_elem), - ) - - def time_check_concat(self, value, dim): - concat_date_cols(self.object) diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 30bbbbdc2926b..6776786885169 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -270,15 +270,9 @@ parse_dates : boolean or list of ints or names or list of lists or dict, default * If ``True`` -> try parsing the index. * If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 each as a separate date column. - * If ``[[1, 3]]`` -> combine columns 1 and 3 and parse as a single date - column. - * If ``{'foo': [1, 3]}`` -> parse columns 1, 3 as date and call result 'foo'. .. note:: A fast-path exists for iso8601-formatted dates. -keep_date_col : boolean, default ``False`` - If ``True`` and parse_dates specifies combining multiple columns then keep the - original columns. date_format : str or dict of column -> format, default ``None`` If used in conjunction with ``parse_dates``, will parse dates according to this format. For anything more complex, @@ -810,71 +804,8 @@ The simplest case is to just pass in ``parse_dates=True``: It is often the case that we may want to store date and time data separately, or store various date fields separately. the ``parse_dates`` keyword can be -used to specify a combination of columns to parse the dates and/or times from. - -You can specify a list of column lists to ``parse_dates``, the resulting date -columns will be prepended to the output (so as to not affect the existing column -order) and the new column names will be the concatenation of the component -column names: - -.. 
ipython:: python - :okwarning: - - data = ( - "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" - "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" - "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" - "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" - "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" - "KORD,19990127, 23:00:00, 22:56:00, -0.5900" - ) - - with open("tmp.csv", "w") as fh: - fh.write(data) - - df = pd.read_csv("tmp.csv", header=None, parse_dates=[[1, 2], [1, 3]]) - df - -By default the parser removes the component date columns, but you can choose -to retain them via the ``keep_date_col`` keyword: - -.. ipython:: python - :okwarning: - - df = pd.read_csv( - "tmp.csv", header=None, parse_dates=[[1, 2], [1, 3]], keep_date_col=True - ) - df +used to specify columns to parse the dates and/or times. -Note that if you wish to combine multiple columns into a single date column, a -nested list must be used. In other words, ``parse_dates=[1, 2]`` indicates that -the second and third columns should each be parsed as separate date columns -while ``parse_dates=[[1, 2]]`` means the two columns should be parsed into a -single column. - -You can also use a dict to specify custom name columns: - -.. ipython:: python - :okwarning: - - date_spec = {"nominal": [1, 2], "actual": [1, 3]} - df = pd.read_csv("tmp.csv", header=None, parse_dates=date_spec) - df - -It is important to remember that if multiple text columns are to be parsed into -a single date column, then a new column is prepended to the data. The ``index_col`` -specification is based off of this new set of columns rather than the original -data columns: - - -.. ipython:: python - :okwarning: - - date_spec = {"nominal": [1, 2], "actual": [1, 3]} - df = pd.read_csv( - "tmp.csv", header=None, parse_dates=date_spec, index_col=0 - ) # index is the nominal column - df .. note:: If a column or index contains an unparsable date, the entire column or @@ -888,10 +819,6 @@ data columns: for your data to store datetimes in this format, load times will be significantly faster, ~20x has been observed. -.. deprecated:: 2.2.0 - Combining date columns inside read_csv is deprecated. Use ``pd.to_datetime`` - on the relevant result columns instead. - Date parsing functions ++++++++++++++++++++++ @@ -907,12 +834,6 @@ Performance-wise, you should try these methods of parsing dates in order: then use ``to_datetime``. -.. ipython:: python - :suppress: - - os.remove("tmp.csv") - - .. _io.csv.mixed_timezones: Parsing a CSV with mixed timezones diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 53b6179dbae93..ce44050a99a79 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -260,8 +260,10 @@ Removal of prior version deprecations/changes - Enforced deprecation of ``axis=None`` acting the same as ``axis=0`` in the DataFrame reductions ``sum``, ``prod``, ``std``, ``var``, and ``sem``, passing ``axis=None`` will now reduce over both axes; this is particularly the case when doing e.g. 
``numpy.sum(df)`` (:issue:`21597`) - Enforced deprecation of ``core.internals`` members ``Block``, ``ExtensionBlock``, and ``DatetimeTZBlock`` (:issue:`58467`) - Enforced deprecation of ``date_parser`` in :func:`read_csv`, :func:`read_table`, :func:`read_fwf`, and :func:`read_excel` in favour of ``date_format`` (:issue:`50601`) +- Enforced deprecation of ``keep_date_col`` keyword in :func:`read_csv` (:issue:`55569`) - Enforced deprecation of ``quantile`` keyword in :meth:`.Rolling.quantile` and :meth:`.Expanding.quantile`, renamed to ``q`` instead. (:issue:`52550`) - Enforced deprecation of argument ``infer_datetime_format`` in :func:`read_csv`, as a strict version of it is now the default (:issue:`48621`) +- Enforced deprecation of combining parsed datetime columns in :func:`read_csv` via the ``parse_dates`` keyword (:issue:`55569`) - Enforced deprecation of non-standard (``np.ndarray``, :class:`ExtensionArray`, :class:`Index`, or :class:`Series`) argument to :func:`api.extensions.take` (:issue:`52981`) - Enforced deprecation of parsing system timezone strings to ``tzlocal``, which depended on system timezone, pass the 'tz' keyword instead (:issue:`50791`) - Enforced deprecation of passing a dictionary to :meth:`SeriesGroupBy.agg` (:issue:`52268`) diff --git a/pandas/_libs/tslibs/parsing.pyi b/pandas/_libs/tslibs/parsing.pyi index 40394f915d4b0..845bd9a5a5635 100644 --- a/pandas/_libs/tslibs/parsing.pyi +++ b/pandas/_libs/tslibs/parsing.pyi @@ -27,7 +27,4 @@ def guess_datetime_format( dt_str: str, dayfirst: bool | None = ..., ) -> str | None: ... -def concat_date_cols( - date_cols: tuple, -) -> npt.NDArray[np.object_]: ... def get_rule_month(source: str) -> str: ... diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 85ef3fd93ff09..c448a7e7c01b5 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -7,7 +7,6 @@ import warnings from pandas.util._exceptions import find_stack_level -cimport cython from cpython.datetime cimport ( datetime, datetime_new, @@ -18,7 +17,6 @@ from cpython.datetime cimport ( from datetime import timezone -from cpython.object cimport PyObject_Str from cpython.unicode cimport PyUnicode_AsUTF8AndSize from cython cimport Py_ssize_t from libc.string cimport strchr @@ -28,15 +26,7 @@ import_datetime() import numpy as np cimport numpy as cnp -from numpy cimport ( - PyArray_GETITEM, - PyArray_ITER_DATA, - PyArray_ITER_NEXT, - PyArray_IterNew, - flatiter, - float64_t, - int64_t, -) +from numpy cimport int64_t cnp.import_array() @@ -75,8 +65,6 @@ import_pandas_datetime() from pandas._libs.tslibs.strptime import array_strptime -from pandas._libs.tslibs.util cimport is_array - cdef extern from "pandas/portable.h": int getdigit_ascii(char c, int default) nogil @@ -1097,115 +1085,6 @@ cdef void _maybe_warn_about_dayfirst(format: str, bint dayfirst) noexcept: ) -@cython.wraparound(False) -@cython.boundscheck(False) -cdef object convert_to_unicode(object item, bint keep_trivial_numbers): - """ - Convert `item` to str. 
- - Parameters - ---------- - item : object - keep_trivial_numbers : bool - if True, then conversion (to string from integer/float zero) - is not performed - - Returns - ------- - str or int or float - """ - cdef: - float64_t float_item - - if keep_trivial_numbers: - if isinstance(item, int): - if item == 0: - return item - elif isinstance(item, float): - float_item = item - if float_item == 0.0 or float_item != float_item: - return item - - if not isinstance(item, str): - item = PyObject_Str(item) - - return item - - -@cython.wraparound(False) -@cython.boundscheck(False) -def concat_date_cols(tuple date_cols) -> np.ndarray: - """ - Concatenates elements from numpy arrays in `date_cols` into strings. - - Parameters - ---------- - date_cols : tuple[ndarray] - - Returns - ------- - arr_of_rows : ndarray[object] - - Examples - -------- - >>> dates=np.array(['3/31/2019', '4/31/2019'], dtype=object) - >>> times=np.array(['11:20', '10:45'], dtype=object) - >>> result = concat_date_cols((dates, times)) - >>> result - array(['3/31/2019 11:20', '4/31/2019 10:45'], dtype=object) - """ - cdef: - Py_ssize_t rows_count = 0, col_count = len(date_cols) - Py_ssize_t col_idx, row_idx - list list_to_join - cnp.ndarray[object] iters - object[::1] iters_view - flatiter it - cnp.ndarray[object] result - object[::1] result_view - - if col_count == 0: - return np.zeros(0, dtype=object) - - if not all(is_array(array) for array in date_cols): - raise ValueError("not all elements from date_cols are numpy arrays") - - rows_count = min(len(array) for array in date_cols) - result = np.zeros(rows_count, dtype=object) - result_view = result - - if col_count == 1: - array = date_cols[0] - it = PyArray_IterNew(array) - for row_idx in range(rows_count): - item = PyArray_GETITEM(array, PyArray_ITER_DATA(it)) - result_view[row_idx] = convert_to_unicode(item, True) - PyArray_ITER_NEXT(it) - else: - # create fixed size list - more efficient memory allocation - list_to_join = [None] * col_count - iters = np.zeros(col_count, dtype=object) - - # create memoryview of iters ndarray, that will contain some - # flatiter's for each array in `date_cols` - more efficient indexing - iters_view = iters - for col_idx, array in enumerate(date_cols): - iters_view[col_idx] = PyArray_IterNew(array) - - # array elements that are on the same line are converted to one string - for row_idx in range(rows_count): - for col_idx, array in enumerate(date_cols): - # this cast is needed, because we did not find a way - # to efficiently store `flatiter` type objects in ndarray - it = iters_view[col_idx] - item = PyArray_GETITEM(array, PyArray_ITER_DATA(it)) - list_to_join[col_idx] = convert_to_unicode(item, False) - PyArray_ITER_NEXT(it) - result_view[row_idx] = " ".join(list_to_join) - - return result - - cpdef str get_rule_month(str source): """ Return starting month of given freq, default is December. 
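The removed ``concat_date_cols`` helper joined the row-wise string form of its input arrays with a single space before datetime parsing, as its docstring above shows. A minimal user-side sketch of the equivalent operation with public APIs (the sketch is an assumption for illustration, not part of this diff), reusing the values from the removed docstring with the impossible date ``4/31/2019`` swapped for a real one:

import numpy as np
import pandas as pd

dates = np.array(["3/31/2019", "4/30/2019"], dtype=object)
times = np.array(["11:20", "10:45"], dtype=object)

# Join each row's fields with a single space, as the removed helper did...
combined = np.array([" ".join(row) for row in zip(dates, times)], dtype=object)
# ...then parse all of the joined strings in one to_datetime call.
parsed = pd.to_datetime(combined, format="%m/%d/%Y %H:%M")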
diff --git a/pandas/io/parsers/arrow_parser_wrapper.py b/pandas/io/parsers/arrow_parser_wrapper.py index f8263a65ef5c7..8b6f7d5750ffe 100644 --- a/pandas/io/parsers/arrow_parser_wrapper.py +++ b/pandas/io/parsers/arrow_parser_wrapper.py @@ -174,8 +174,8 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame: self.names = list(range(num_cols - len(self.names))) + self.names multi_index_named = False frame.columns = self.names - # we only need the frame not the names - _, frame = self._do_date_conversions(frame.columns, frame) + + frame = self._do_date_conversions(frame.columns, frame) if self.index_col is not None: index_to_set = self.index_col.copy() for i, item in enumerate(self.index_col): diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index c9ed653babd7a..c6cc85b9f722b 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -4,7 +4,6 @@ from copy import copy import csv from enum import Enum -import itertools from typing import ( TYPE_CHECKING, Any, @@ -23,7 +22,6 @@ ) import pandas._libs.ops as libops from pandas._libs.parsers import STR_NA_VALUES -from pandas._libs.tslibs import parsing from pandas.compat._optional import import_optional_dependency from pandas.errors import ( ParserError, @@ -33,7 +31,6 @@ from pandas.core.dtypes.astype import astype_array from pandas.core.dtypes.common import ( - ensure_object, is_bool_dtype, is_dict_like, is_extension_array_dtype, @@ -42,7 +39,6 @@ is_integer_dtype, is_list_like, is_object_dtype, - is_scalar, is_string_dtype, pandas_dtype, ) @@ -57,7 +53,6 @@ DataFrame, DatetimeIndex, StringDtype, - concat, ) from pandas.core import algorithms from pandas.core.arrays import ( @@ -110,7 +105,6 @@ class BadLineHandleMethod(Enum): keep_default_na: bool dayfirst: bool cache_dates: bool - keep_date_col: bool usecols_dtype: str | None def __init__(self, kwds) -> None: @@ -124,11 +118,19 @@ def __init__(self, kwds) -> None: self.index_names: Sequence[Hashable] | None = None self.col_names: Sequence[Hashable] | None = None - self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False)) - self._parse_date_cols: Iterable = [] + parse_dates = kwds.pop("parse_dates", False) + if parse_dates is None or lib.is_bool(parse_dates): + parse_dates = bool(parse_dates) + elif not isinstance(parse_dates, list): + raise TypeError( + "Only booleans and lists are accepted " + "for the 'parse_dates' parameter" + ) + self.parse_dates: bool | list = parse_dates + self._parse_date_cols: set = set() + self.date_parser = kwds.pop("date_parser", lib.no_default) self.date_format = kwds.pop("date_format", None) self.dayfirst = kwds.pop("dayfirst", False) - self.keep_date_col = kwds.pop("keep_date_col", False) self.na_values = kwds.get("na_values") self.na_fvalues = kwds.get("na_fvalues") @@ -177,8 +179,6 @@ def __init__(self, kwds) -> None: else: self.index_col = list(self.index_col) - self._name_processed = False - self._first_chunk = True self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"]) @@ -187,7 +187,7 @@ def __init__(self, kwds) -> None: # Normally, this arg would get pre-processed earlier on self.on_bad_lines = kwds.get("on_bad_lines", self.BadLineHandleMethod.ERROR) - def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> Iterable: + def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> set: """ Check if parse_dates are in columns. 
@@ -201,7 +201,7 @@ def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> Iterabl Returns ------- - The names of the columns which will get parsed later if a dict or list + The names of the columns which will get parsed later if a list is given as specification. Raises @@ -210,30 +210,15 @@ def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> Iterabl If column to parse_date is not in dataframe. """ - cols_needed: Iterable - if is_dict_like(self.parse_dates): - cols_needed = itertools.chain(*self.parse_dates.values()) - elif is_list_like(self.parse_dates): - # a column in parse_dates could be represented - # ColReference = Union[int, str] - # DateGroups = List[ColReference] - # ParseDates = Union[DateGroups, List[DateGroups], - # Dict[ColReference, DateGroups]] - cols_needed = itertools.chain.from_iterable( - col if is_list_like(col) and not isinstance(col, tuple) else [col] - for col in self.parse_dates - ) - else: - cols_needed = [] - - cols_needed = list(cols_needed) + if not isinstance(self.parse_dates, list): + return set() # get only columns that are references using names (str), not by index missing_cols = ", ".join( sorted( { col - for col in cols_needed + for col in self.parse_dates if isinstance(col, str) and col not in columns } ) @@ -243,27 +228,18 @@ def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> Iterabl f"Missing column provided to 'parse_dates': '{missing_cols}'" ) # Convert positions to actual column names - return [ + return { col if (isinstance(col, str) or col in columns) else columns[col] - for col in cols_needed - ] + for col in self.parse_dates + } def close(self) -> None: pass - @final - @property - def _has_complex_date_col(self) -> bool: - return isinstance(self.parse_dates, dict) or ( - isinstance(self.parse_dates, list) - and len(self.parse_dates) > 0 - and isinstance(self.parse_dates[0], list) - ) - @final def _should_parse_dates(self, i: int) -> bool: - if lib.is_bool(self.parse_dates): - return bool(self.parse_dates) + if isinstance(self.parse_dates, bool): + return self.parse_dates else: if self.index_names is not None: name = self.index_names[i] @@ -365,18 +341,9 @@ def _make_index( index: Index | None if not is_index_col(self.index_col) or not self.index_col: index = None - - elif not self._has_complex_date_col: + else: simple_index = self._get_simple_index(alldata, columns) index = self._agg_index(simple_index) - elif self._has_complex_date_col: - if not self._name_processed: - (self.index_names, _, self.index_col) = self._clean_index_names( - list(columns), self.index_col - ) - self._name_processed = True - date_index = self._get_complex_date_index(data, columns) - index = self._agg_index(date_index, try_parse_dates=False) # add names for the index if indexnamerow: @@ -412,34 +379,6 @@ def ix(col): return index - @final - def _get_complex_date_index(self, data, col_names): - def _get_name(icol): - if isinstance(icol, str): - return icol - - if col_names is None: - raise ValueError(f"Must supply column order to use {icol!s} as index") - - for i, c in enumerate(col_names): - if i == icol: - return c - - to_remove = [] - index = [] - for idx in self.index_col: - name = _get_name(idx) - to_remove.append(name) - index.append(data[name]) - - # remove index items from content and columns, don't pop in - # loop - for c in sorted(to_remove, reverse=True): - data.pop(c) - col_names.remove(c) - - return index - @final def _clean_mapping(self, mapping): """converts col numbers to names""" @@ -642,19 
+581,7 @@ def _set(x) -> int: if isinstance(self.parse_dates, list): for val in self.parse_dates: - if isinstance(val, list): - for k in val: - noconvert_columns.add(_set(k)) - else: - noconvert_columns.add(_set(val)) - - elif isinstance(self.parse_dates, dict): - for val in self.parse_dates.values(): - if isinstance(val, list): - for k in val: - noconvert_columns.add(_set(k)) - else: - noconvert_columns.add(_set(val)) + noconvert_columns.add(_set(val)) elif self.parse_dates: if isinstance(self.index_col, list): @@ -855,36 +782,33 @@ def _do_date_conversions( self, names: Index, data: DataFrame, - ) -> tuple[Sequence[Hashable] | Index, DataFrame]: ... + ) -> DataFrame: ... @overload def _do_date_conversions( self, names: Sequence[Hashable], data: Mapping[Hashable, ArrayLike], - ) -> tuple[Sequence[Hashable], Mapping[Hashable, ArrayLike]]: ... + ) -> Mapping[Hashable, ArrayLike]: ... @final def _do_date_conversions( self, names: Sequence[Hashable] | Index, data: Mapping[Hashable, ArrayLike] | DataFrame, - ) -> tuple[Sequence[Hashable] | Index, Mapping[Hashable, ArrayLike] | DataFrame]: - # returns data, columns - - if self.parse_dates is not None: - data, names = _process_date_conversion( + ) -> Mapping[Hashable, ArrayLike] | DataFrame: + if isinstance(self.parse_dates, list): + return _process_date_conversion( data, self._date_conv, self.parse_dates, self.index_col, self.index_names, names, - keep_date_col=self.keep_date_col, dtype_backend=self.dtype_backend, ) - return names, data + return data @final def _check_data_length( @@ -1121,17 +1045,15 @@ def _make_date_converter( cache_dates: bool = True, date_format: dict[Hashable, str] | str | None = None, ): - def converter(*date_cols, col: Hashable): - if len(date_cols) == 1 and date_cols[0].dtype.kind in "Mm": - return date_cols[0] - # TODO: Can we remove concat_date_cols after deprecation of parsing - # multiple cols? 
- strs = parsing.concat_date_cols(date_cols) + def converter(date_col, col: Hashable): + if date_col.dtype.kind in "Mm": + return date_col + date_fmt = ( date_format.get(col) if isinstance(date_format, dict) else date_format ) - str_objs = ensure_object(strs) + str_objs = lib.ensure_string_array(date_col) try: result = tools.to_datetime( str_objs, @@ -1180,7 +1102,6 @@ def converter(*date_cols, col: Hashable): "decimal": ".", # 'engine': 'c', "parse_dates": False, - "keep_date_col": False, "dayfirst": False, "date_format": None, "usecols": None, @@ -1196,125 +1117,39 @@ def converter(*date_cols, col: Hashable): def _process_date_conversion( - data_dict, + data_dict: Mapping[Hashable, ArrayLike] | DataFrame, converter: Callable, - parse_spec, + parse_spec: list, index_col, index_names, - columns, - keep_date_col: bool = False, + columns: Sequence[Hashable] | Index, dtype_backend=lib.no_default, -) -> tuple[dict, list]: - def _isindex(colspec): - return (isinstance(index_col, list) and colspec in index_col) or ( +) -> Mapping[Hashable, ArrayLike] | DataFrame: + for colspec in parse_spec: + if isinstance(colspec, int) and colspec not in data_dict: + colspec = columns[colspec] + if (isinstance(index_col, list) and colspec in index_col) or ( isinstance(index_names, list) and colspec in index_names - ) - - new_cols = [] - new_data = {} - - orig_names = columns - columns = list(columns) - - date_cols = set() - - if parse_spec is None or isinstance(parse_spec, bool): - return data_dict, columns - - if isinstance(parse_spec, list): - # list of column lists - for colspec in parse_spec: - if is_scalar(colspec) or isinstance(colspec, tuple): - if isinstance(colspec, int) and colspec not in data_dict: - colspec = orig_names[colspec] - if _isindex(colspec): - continue - elif dtype_backend == "pyarrow": - import pyarrow as pa - - dtype = data_dict[colspec].dtype - if isinstance(dtype, ArrowDtype) and ( - pa.types.is_timestamp(dtype.pyarrow_dtype) - or pa.types.is_date(dtype.pyarrow_dtype) - ): - continue - - # Pyarrow engine returns Series which we need to convert to - # numpy array before converter, its a no-op for other parsers - data_dict[colspec] = converter( - np.asarray(data_dict[colspec]), col=colspec - ) - else: - new_name, col, old_names = _try_convert_dates( - converter, colspec, data_dict, orig_names - ) - if new_name in data_dict: - raise ValueError(f"New date column already in dict {new_name}") - new_data[new_name] = col - new_cols.append(new_name) - date_cols.update(old_names) - - elif isinstance(parse_spec, dict): - # dict of new name to column list - for new_name, colspec in parse_spec.items(): - if new_name in data_dict: - raise ValueError(f"Date column {new_name} already in dict") - - _, col, old_names = _try_convert_dates( - converter, - colspec, - data_dict, - orig_names, - target_name=new_name, - ) - - new_data[new_name] = col - - # If original column can be converted to date we keep the converted values - # This can only happen if values are from single column - if len(colspec) == 1: - new_data[colspec[0]] = col - - new_cols.append(new_name) - date_cols.update(old_names) - - if isinstance(data_dict, DataFrame): - data_dict = concat([DataFrame(new_data), data_dict], axis=1) - else: - data_dict.update(new_data) - new_cols.extend(columns) - - if not keep_date_col: - for c in list(date_cols): - data_dict.pop(c) - new_cols.remove(c) - - return data_dict, new_cols - - -def _try_convert_dates( - parser: Callable, colspec, data_dict, columns, target_name: str | None = None -): - colset = 
set(columns) - colnames = [] - - for c in colspec: - if c in colset: - colnames.append(c) - elif isinstance(c, int) and c not in columns: - colnames.append(columns[c]) - else: - colnames.append(c) + ): + continue + elif dtype_backend == "pyarrow": + import pyarrow as pa + + dtype = data_dict[colspec].dtype + if isinstance(dtype, ArrowDtype) and ( + pa.types.is_timestamp(dtype.pyarrow_dtype) + or pa.types.is_date(dtype.pyarrow_dtype) + ): + continue - new_name: tuple | str - if all(isinstance(x, tuple) for x in colnames): - new_name = tuple(map("_".join, zip(*colnames))) - else: - new_name = "_".join([str(x) for x in colnames]) - to_parse = [np.asarray(data_dict[c]) for c in colnames if c in data_dict] + # Pyarrow engine returns Series which we need to convert to + # numpy array before converter, it's a no-op for other parsers + result = converter(np.asarray(data_dict[colspec]), col=colspec) + # error: Unsupported target for indexed assignment + # ("Mapping[Hashable, ExtensionArray | ndarray[Any, Any]] | DataFrame") + data_dict[colspec] = result # type: ignore[index] - new_col = parser(*to_parse, col=new_name if target_name is None else target_name) - return new_name, new_col, colnames + return data_dict def _get_na_values(col, na_values, na_fvalues, keep_default_na: bool): @@ -1352,26 +1187,5 @@ def _get_na_values(col, na_values, na_fvalues, keep_default_na: bool): return na_values, na_fvalues -def _validate_parse_dates_arg(parse_dates): - """ - Check whether or not the 'parse_dates' parameter - is a non-boolean scalar. Raises a ValueError if - that is the case. - """ - msg = ( - "Only booleans, lists, and dictionaries are accepted " - "for the 'parse_dates' parameter" - ) - - if not ( - parse_dates is None - or lib.is_bool(parse_dates) - or isinstance(parse_dates, (list, dict)) - ): - raise TypeError(msg) - - return parse_dates - - def is_index_col(col) -> bool: return col is not None and col is not False diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 6e5d36ad39c8a..4de626288aa41 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -166,30 +166,28 @@ def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None: # error: Cannot determine type of 'names' self.orig_names = self.names # type: ignore[has-type] - if not self._has_complex_date_col: - # error: Cannot determine type of 'index_col' - if self._reader.leading_cols == 0 and is_index_col( - self.index_col # type: ignore[has-type] - ): - self._name_processed = True - ( - index_names, - # error: Cannot determine type of 'names' - self.names, # type: ignore[has-type] - self.index_col, - ) = self._clean_index_names( - # error: Cannot determine type of 'names' - self.names, # type: ignore[has-type] - # error: Cannot determine type of 'index_col' - self.index_col, # type: ignore[has-type] - ) + # error: Cannot determine type of 'index_col' + if self._reader.leading_cols == 0 and is_index_col( + self.index_col # type: ignore[has-type] + ): + ( + index_names, + # error: Cannot determine type of 'names' + self.names, # type: ignore[has-type] + self.index_col, + ) = self._clean_index_names( + # error: Cannot determine type of 'names' + self.names, # type: ignore[has-type] + # error: Cannot determine type of 'index_col' + self.index_col, # type: ignore[has-type] + ) - if self.index_names is None: - self.index_names = index_names + if self.index_names is None: + self.index_names = index_names - if self._reader.header is None and not passed_names: - assert 
self.index_names is not None - self.index_names = [None] * len(self.index_names) + if self._reader.header is None and not passed_names: + assert self.index_names is not None + self.index_names = [None] * len(self.index_names) self._implicit_index = self._reader.leading_cols > 0 @@ -274,9 +272,6 @@ def read( names = self.names # type: ignore[has-type] if self._reader.leading_cols: - if self._has_complex_date_col: - raise NotImplementedError("file structure not yet supported") - # implicit index, no index names arrays = [] @@ -307,12 +302,10 @@ def read( data_tups = sorted(data.items()) data = {k: v for k, (i, v) in zip(names, data_tups)} - column_names, date_data = self._do_date_conversions(names, data) + date_data = self._do_date_conversions(names, data) # maybe create a mi on the columns - column_names = self._maybe_make_multi_index_columns( - column_names, self.col_names - ) + column_names = self._maybe_make_multi_index_columns(names, self.col_names) else: # rename dict keys @@ -335,7 +328,7 @@ def read( data = {k: v for k, (i, v) in zip(names, data_tups)} - names, date_data = self._do_date_conversions(names, data) + date_data = self._do_date_conversions(names, data) index, column_names = self._make_index(date_data, alldata, names) return index, column_names, date_data diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index e2456b165fe60..f7d2aa2419429 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -150,14 +150,9 @@ def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None: # get popped off for index self.orig_names: list[Hashable] = list(self.columns) - # needs to be cleaned/refactored - # multiple date column thing turning into a real spaghetti factory - - if not self._has_complex_date_col: - (index_names, self.orig_names, self.columns) = self._get_index_name() - self._name_processed = True - if self.index_names is None: - self.index_names = index_names + index_names, self.orig_names, self.columns = self._get_index_name() + if self.index_names is None: + self.index_names = index_names if self._col_indices is None: self._col_indices = list(range(len(self.columns))) @@ -294,7 +289,7 @@ def read( data, columns = self._exclude_implicit_index(alldata) conv_data = self._convert_data(data) - columns, conv_data = self._do_date_conversions(columns, conv_data) + conv_data = self._do_date_conversions(columns, conv_data) index, result_columns = self._make_index( conv_data, alldata, columns, indexnamerow diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 648e5108df77a..970ddaacc6e65 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -40,7 +40,6 @@ from pandas.core.dtypes.common import ( is_file_like, is_float, - is_hashable, is_integer, is_list_like, pandas_dtype, @@ -118,7 +117,6 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): na_filter: bool skip_blank_lines: bool parse_dates: bool | Sequence[Hashable] | None - keep_date_col: bool | lib.NoDefault date_format: str | dict[Hashable, str] | None dayfirst: bool cache_dates: bool @@ -300,18 +298,13 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): performance of reading a large file. skip_blank_lines : bool, default True If ``True``, skip over blank lines rather than interpreting as ``NaN`` values. 
-parse_dates : bool, None, list of Hashable, list of lists or dict of {{Hashable : \ -list}}, default None +parse_dates : bool, None, list of Hashable, default None The behavior is as follows: * ``bool``. If ``True`` -> try parsing the index. * ``None``. Behaves like ``True`` if ``date_format`` is specified. * ``list`` of ``int`` or names. e.g. If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 each as a separate date column. - * ``list`` of ``list``. e.g. If ``[[1, 3]]`` -> combine columns 1 and 3 and parse - as a single date column. Values are joined with a space before parsing. - * ``dict``, e.g. ``{{'foo' : [1, 3]}}`` -> parse columns 1, 3 as date and call - result 'foo'. Values are joined with a space before parsing. If a column or index cannot be represented as an array of ``datetime``, say because of an unparsable value or a mixture of timezones, the column @@ -320,9 +313,6 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): :func:`~pandas.read_csv`. Note: A fast-path exists for iso8601-formatted dates. -keep_date_col : bool, default False - If ``True`` and ``parse_dates`` specifies combining multiple columns then - keep the original columns. date_format : str or dict of column -> format, optional Format to use for parsing dates when used in conjunction with ``parse_dates``. The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See @@ -729,7 +719,6 @@ def read_csv( skip_blank_lines: bool = True, # Datetime Handling parse_dates: bool | Sequence[Hashable] | None = None, - keep_date_col: bool | lib.NoDefault = lib.no_default, date_format: str | dict[Hashable, str] | None = None, dayfirst: bool = False, cache_dates: bool = True, @@ -759,38 +748,6 @@ def read_csv( storage_options: StorageOptions | None = None, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, ) -> DataFrame | TextFileReader: - if keep_date_col is not lib.no_default: - # GH#55569 - warnings.warn( - "The 'keep_date_col' keyword in pd.read_csv is deprecated and " - "will be removed in a future version. Explicitly remove unwanted " - "columns after parsing instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - keep_date_col = False - - if lib.is_list_like(parse_dates): - # GH#55569 - depr = False - # error: Item "bool" of "bool | Sequence[Hashable] | None" has no - # attribute "__iter__" (not iterable) - if not all(is_hashable(x) for x in parse_dates): # type: ignore[union-attr] - depr = True - elif isinstance(parse_dates, dict) and any( - lib.is_list_like(x) for x in parse_dates.values() - ): - depr = True - if depr: - warnings.warn( - "Support for nested sequences for 'parse_dates' in pd.read_csv " - "is deprecated. Combine the desired columns with pd.to_datetime " - "after parsing instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - if delim_whitespace is not lib.no_default: # GH#55569 warnings.warn( @@ -907,7 +864,6 @@ def read_table( skip_blank_lines: bool = True, # Datetime Handling parse_dates: bool | Sequence[Hashable] | None = None, - keep_date_col: bool | lib.NoDefault = lib.no_default, date_format: str | dict[Hashable, str] | None = None, dayfirst: bool = False, cache_dates: bool = True, @@ -937,29 +893,6 @@ def read_table( storage_options: StorageOptions | None = None, dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default, ) -> DataFrame | TextFileReader: - if keep_date_col is not lib.no_default: - # GH#55569 - warnings.warn( - "The 'keep_date_col' keyword in pd.read_table is deprecated and " - "will be removed in a future version. 
Explicitly remove unwanted " - "columns after parsing instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - else: - keep_date_col = False - - # error: Item "bool" of "bool | Sequence[Hashable]" has no attribute "__iter__" - if lib.is_list_like(parse_dates) and not all(is_hashable(x) for x in parse_dates): # type: ignore[union-attr] - # GH#55569 - warnings.warn( - "Support for nested sequences for 'parse_dates' in pd.read_table " - "is deprecated. Combine the desired columns with pd.to_datetime " - "after parsing instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - if delim_whitespace is not lib.no_default: # GH#55569 warnings.warn( @@ -1616,7 +1549,6 @@ def TextParser(*args, **kwds) -> TextFileReader: comment : str, optional Comment out remainder of line parse_dates : bool, default False - keep_date_col : bool, default False date_format : str or dict of column -> format, default ``None`` .. versionadded:: 2.0.0 diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 485680d9de48c..d79e0c34edaab 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -22,14 +22,10 @@ from pandas import ( DataFrame, Index, - Timestamp, compat, ) import pandas._testing as tm -from pandas.io.parsers import TextFileReader -from pandas.io.parsers.c_parser_wrapper import CParserWrapper - pytestmark = pytest.mark.filterwarnings( "ignore:Passing a BlockManager to DataFrame:DeprecationWarning" ) @@ -38,53 +34,6 @@ skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") -def test_override_set_noconvert_columns(): - # see gh-17351 - # - # Usecols needs to be sorted in _set_noconvert_columns based - # on the test_usecols_with_parse_dates test from test_usecols.py - class MyTextFileReader(TextFileReader): - def __init__(self) -> None: - self._currow = 0 - self.squeeze = False - - class MyCParserWrapper(CParserWrapper): - def _set_noconvert_columns(self): - if self.usecols_dtype == "integer": - # self.usecols is a set, which is documented as unordered - # but in practice, a CPython set of integers is sorted. - # In other implementations this assumption does not hold. 
- # The following code simulates a different order, which - # before GH 17351 would cause the wrong columns to be - # converted via the parse_dates parameter - self.usecols = list(self.usecols) - self.usecols.reverse() - return CParserWrapper._set_noconvert_columns(self) - - data = """a,b,c,d,e -0,1,2014-01-01,09:00,4 -0,1,2014-01-02,10:00,4""" - - parse_dates = [[1, 2]] - cols = { - "a": [0, 0], - "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], - } - expected = DataFrame(cols, columns=["c_d", "a"]) - - parser = MyTextFileReader() - parser.options = { - "usecols": [0, 2, 3], - "parse_dates": parse_dates, - "delimiter": ",", - } - parser.engine = "c" - parser._engine = MyCParserWrapper(StringIO(data), **parser.options) - - result = parser.read() - tm.assert_frame_equal(result, expected) - - def test_read_csv_local(all_parsers, csv1): prefix = "file:///" if compat.is_platform_windows() else "file://" parser = all_parsers diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 9c9a10f206a47..3bb3d793606e1 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -14,8 +14,6 @@ import pytest import pytz -from pandas._libs.tslibs import parsing - import pandas as pd from pandas import ( DataFrame, @@ -39,181 +37,6 @@ skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip") -@xfail_pyarrow -def test_separator_date_conflict(all_parsers): - # Regression test for gh-4678 - # - # Make sure thousands separator and - # date parsing do not conflict. - parser = all_parsers - data = "06-02-2013;13:00;1-000.215" - expected = DataFrame( - [[datetime(2013, 6, 2, 13, 0, 0), 1000.215]], columns=["Date", 2] - ) - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - df = parser.read_csv( - StringIO(data), - sep=";", - thousands="-", - parse_dates={"Date": [0, 1]}, - header=None, - ) - tm.assert_frame_equal(df, expected) - - -@pytest.mark.parametrize("container", [list, tuple, Index, Series]) -@pytest.mark.parametrize("dim", [1, 2]) -def test_concat_date_col_fail(container, dim): - msg = "not all elements from date_cols are numpy arrays" - value = "19990127" - - date_cols = tuple(container([value]) for _ in range(dim)) - - with pytest.raises(ValueError, match=msg): - parsing.concat_date_cols(date_cols) - - -@pytest.mark.parametrize("keep_date_col", [True, False]) -def test_multiple_date_col(all_parsers, keep_date_col, request): - data = """\ -KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 -""" - parser = all_parsers - - if keep_date_col and parser.engine == "pyarrow": - # For this to pass, we need to disable auto-inference on the date columns - # in parse_dates. We have no way of doing this though - mark = pytest.mark.xfail( - reason="pyarrow doesn't support disabling auto-inference on column numbers." 
- ) - request.applymarker(mark) - - depr_msg = "The 'keep_date_col' keyword in pd.read_csv is deprecated" - - kwds = { - "header": None, - "parse_dates": [[1, 2], [1, 3]], - "keep_date_col": keep_date_col, - "names": ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8"], - } - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv(StringIO(data), **kwds) - - expected = DataFrame( - [ - [ - datetime(1999, 1, 27, 19, 0), - datetime(1999, 1, 27, 18, 56), - "KORD", - "19990127", - " 19:00:00", - " 18:56:00", - 0.81, - 2.81, - 7.2, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 20, 0), - datetime(1999, 1, 27, 19, 56), - "KORD", - "19990127", - " 20:00:00", - " 19:56:00", - 0.01, - 2.21, - 7.2, - 0.0, - 260.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 20, 56), - "KORD", - "19990127", - " 21:00:00", - " 20:56:00", - -0.59, - 2.21, - 5.7, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - datetime(1999, 1, 27, 21, 18), - "KORD", - "19990127", - " 21:00:00", - " 21:18:00", - -0.99, - 2.01, - 3.6, - 0.0, - 270.0, - ], - [ - datetime(1999, 1, 27, 22, 0), - datetime(1999, 1, 27, 21, 56), - "KORD", - "19990127", - " 22:00:00", - " 21:56:00", - -0.59, - 1.71, - 5.1, - 0.0, - 290.0, - ], - [ - datetime(1999, 1, 27, 23, 0), - datetime(1999, 1, 27, 22, 56), - "KORD", - "19990127", - " 23:00:00", - " 22:56:00", - -0.59, - 1.71, - 4.6, - 0.0, - 280.0, - ], - ], - columns=[ - "X1_X2", - "X1_X3", - "X0", - "X1", - "X2", - "X3", - "X4", - "X5", - "X6", - "X7", - "X8", - ], - ) - - if not keep_date_col: - expected = expected.drop(["X1", "X2", "X3"], axis=1) - - tm.assert_frame_equal(result, expected) - - def test_date_col_as_index_col(all_parsers): data = """\ KORD,19990127 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 @@ -260,136 +83,6 @@ def test_date_col_as_index_col(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow -def test_multiple_date_cols_with_header(all_parsers): - parser = all_parsers - data = """\ -ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir -KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]}) - expected = DataFrame( - [ - [ - datetime(1999, 1, 27, 19, 0), - "KORD", - " 18:56:00", - 0.81, - 2.81, - 7.2, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 20, 0), - "KORD", - " 19:56:00", - 0.01, - 2.21, - 7.2, - 0.0, - 260.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - "KORD", - " 20:56:00", - -0.59, - 2.21, - 5.7, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - "KORD", - " 21:18:00", - -0.99, - 2.01, - 3.6, - 0.0, - 270.0, - ], - [ - datetime(1999, 1, 27, 22, 0), - "KORD", - " 21:56:00", - -0.59, - 1.71, - 5.1, - 0.0, - 290.0, - ], - [ - datetime(1999, 1, 27, 23, 0), - "KORD", - " 22:56:00", - -0.59, - 1.71, - 4.6, - 0.0, - 280.0, - ], - ], - columns=[ - 
"nominal", - "ID", - "ActualTime", - "TDew", - "TAir", - "Windspeed", - "Precip", - "WindDir", - ], - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize( - "data,parse_dates,msg", - [ - ( - """\ -date_NominalTime,date,NominalTime -KORD1,19990127, 19:00:00 -KORD2,19990127, 20:00:00""", - [[1, 2]], - ("New date column already in dict date_NominalTime"), - ), - ( - """\ -ID,date,nominalTime -KORD,19990127, 19:00:00 -KORD,19990127, 20:00:00""", - {"ID": [1, 2]}, - "Date column ID already in dict", - ), - ], -) -def test_multiple_date_col_name_collision(all_parsers, data, parse_dates, msg): - parser = all_parsers - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - parser.read_csv(StringIO(data), parse_dates=parse_dates) - - @xfail_pyarrow def test_nat_parse(all_parsers): # see gh-3062 @@ -441,37 +134,6 @@ def test_parse_dates_string(all_parsers): tm.assert_frame_equal(result, expected) -# Bug in https://github.com/dateutil/dateutil/issues/217 -# has been addressed, but we just don't pass in the `yearfirst` -@pytest.mark.xfail(reason="yearfirst is not surfaced in read_*") -@pytest.mark.parametrize("parse_dates", [[["date", "time"]], [[0, 1]]]) -def test_yy_format_with_year_first(all_parsers, parse_dates): - data = """date,time,B,C -090131,0010,1,2 -090228,1020,3,4 -090331,0830,5,6 -""" - parser = all_parsers - result = parser.read_csv_check_warnings( - UserWarning, - "Could not infer format", - StringIO(data), - index_col=0, - parse_dates=parse_dates, - ) - index = DatetimeIndex( - [ - datetime(2009, 1, 31, 0, 10, 0), - datetime(2009, 2, 28, 10, 20, 0), - datetime(2009, 3, 31, 8, 30, 0), - ], - dtype=object, - name="date_time", - ) - expected = DataFrame({"B": [1, 3, 5], "C": [2, 4, 6]}, index=index) - tm.assert_frame_equal(result, expected) - - @xfail_pyarrow @pytest.mark.parametrize("parse_dates", [[0, 2], ["a", "c"]]) def test_parse_dates_column_list(all_parsers, parse_dates): @@ -561,282 +223,11 @@ def test_parse_tz_aware(all_parsers): assert result.index.tz is expected_tz -@xfail_pyarrow -@pytest.mark.parametrize( - "parse_dates,index_col", - [({"nominal": [1, 2]}, "nominal"), ({"nominal": [1, 2]}, 0), ([[1, 2]], 0)], -) -def test_multiple_date_cols_index(all_parsers, parse_dates, index_col): - parser = all_parsers - data = """ -ID,date,NominalTime,ActualTime,TDew,TAir,Windspeed,Precip,WindDir -KORD1,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD2,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 -""" - expected = DataFrame( - [ - [ - datetime(1999, 1, 27, 19, 0), - "KORD1", - " 18:56:00", - 0.81, - 2.81, - 7.2, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 20, 0), - "KORD2", - " 19:56:00", - 0.01, - 2.21, - 7.2, - 0.0, - 260.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - "KORD3", - " 20:56:00", - -0.59, - 2.21, - 5.7, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - "KORD4", - " 21:18:00", - -0.99, - 2.01, - 3.6, - 0.0, - 270.0, - ], - [ - datetime(1999, 1, 27, 22, 0), - "KORD5", - " 
21:56:00", - -0.59, - 1.71, - 5.1, - 0.0, - 290.0, - ], - [ - datetime(1999, 1, 27, 23, 0), - "KORD6", - " 22:56:00", - -0.59, - 1.71, - 4.6, - 0.0, - 280.0, - ], - ], - columns=[ - "nominal", - "ID", - "ActualTime", - "TDew", - "TAir", - "Windspeed", - "Precip", - "WindDir", - ], - ) - expected = expected.set_index("nominal") - - if not isinstance(parse_dates, dict): - expected.index.name = "date_NominalTime" - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(data), parse_dates=parse_dates, index_col=index_col - ) - tm.assert_frame_equal(result, expected) - - -@xfail_pyarrow -def test_multiple_date_cols_chunked(all_parsers): - parser = all_parsers - data = """\ -ID,date,nominalTime,actualTime,A,B,C,D,E -KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 -""" - - expected = DataFrame( - [ - [ - datetime(1999, 1, 27, 19, 0), - "KORD", - " 18:56:00", - 0.81, - 2.81, - 7.2, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 20, 0), - "KORD", - " 19:56:00", - 0.01, - 2.21, - 7.2, - 0.0, - 260.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - "KORD", - " 20:56:00", - -0.59, - 2.21, - 5.7, - 0.0, - 280.0, - ], - [ - datetime(1999, 1, 27, 21, 0), - "KORD", - " 21:18:00", - -0.99, - 2.01, - 3.6, - 0.0, - 270.0, - ], - [ - datetime(1999, 1, 27, 22, 0), - "KORD", - " 21:56:00", - -0.59, - 1.71, - 5.1, - 0.0, - 290.0, - ], - [ - datetime(1999, 1, 27, 23, 0), - "KORD", - " 22:56:00", - -0.59, - 1.71, - 4.6, - 0.0, - 280.0, - ], - ], - columns=["nominal", "ID", "actualTime", "A", "B", "C", "D", "E"], - ) - expected = expected.set_index("nominal") - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - with parser.read_csv( - StringIO(data), - parse_dates={"nominal": [1, 2]}, - index_col="nominal", - chunksize=2, - ) as reader: - chunks = list(reader) - - tm.assert_frame_equal(chunks[0], expected[:2]) - tm.assert_frame_equal(chunks[1], expected[2:4]) - tm.assert_frame_equal(chunks[2], expected[4:]) - - -def test_multiple_date_col_named_index_compat(all_parsers): - parser = all_parsers - data = """\ -ID,date,nominalTime,actualTime,A,B,C,D,E -KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 -""" - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - with_indices = parser.read_csv( - StringIO(data), parse_dates={"nominal": [1, 2]}, 
index_col="nominal" - ) - - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - with_names = parser.read_csv( - StringIO(data), - index_col="nominal", - parse_dates={"nominal": ["date", "nominalTime"]}, - ) - tm.assert_frame_equal(with_indices, with_names) - - -def test_multiple_date_col_multiple_index_compat(all_parsers): - parser = all_parsers - data = """\ -ID,date,nominalTime,actualTime,A,B,C,D,E -KORD,19990127, 19:00:00, 18:56:00, 0.8100, 2.8100, 7.2000, 0.0000, 280.0000 -KORD,19990127, 20:00:00, 19:56:00, 0.0100, 2.2100, 7.2000, 0.0000, 260.0000 -KORD,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 -KORD,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 -KORD,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000 -""" - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(data), index_col=["nominal", "ID"], parse_dates={"nominal": [1, 2]} - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - expected = parser.read_csv(StringIO(data), parse_dates={"nominal": [1, 2]}) - - expected = expected.set_index(["nominal", "ID"]) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("kwargs", [{}, {"index_col": "C"}]) def test_read_with_parse_dates_scalar_non_bool(all_parsers, kwargs): # see gh-5636 parser = all_parsers - msg = ( - "Only booleans, lists, and dictionaries " - "are accepted for the 'parse_dates' parameter" - ) + msg = "Only booleans and lists " "are accepted for the 'parse_dates' parameter" data = """A,B,C 1,2,2003-11-1""" @@ -847,15 +238,12 @@ def test_read_with_parse_dates_scalar_non_bool(all_parsers, kwargs): @pytest.mark.parametrize("parse_dates", [(1,), np.array([4, 5]), {1, 3}]) def test_read_with_parse_dates_invalid_type(all_parsers, parse_dates): parser = all_parsers - msg = ( - "Only booleans, lists, and dictionaries " - "are accepted for the 'parse_dates' parameter" - ) + msg = "Only booleans and lists " "are accepted for the 'parse_dates' parameter" data = """A,B,C 1,2,2003-11-1""" with pytest.raises(TypeError, match=msg): - parser.read_csv(StringIO(data), parse_dates=(1,)) + parser.read_csv(StringIO(data), parse_dates=parse_dates) @pytest.mark.parametrize("value", ["nan", ""]) @@ -905,7 +293,6 @@ def test_bad_date_parse_with_warning(all_parsers, cache, value): ) -@xfail_pyarrow def test_parse_dates_empty_string(all_parsers): # see gh-2263 parser = all_parsers @@ -1118,11 +505,6 @@ def test_parse_multiple_delimited_dates_with_swap_warnings(): [ (None, ["val"], ["date", "time"], "date, time"), (None, ["val"], [0, "time"], "time"), - (None, ["val"], [["date", "time"]], "date, time"), - (None, ["val"], [[0, "time"]], "time"), - (None, ["val"], {"date": [0, "time"]}, "time"), - (None, ["val"], {"date": ["date", "time"]}, "date, time"), - (None, ["val"], [["date", "time"], "date"], "date, time"), (["date1", "time1", "temperature"], None, ["date", "time"], "date, time"), ( ["date1", "time1", "temperature"], @@ -1140,20 +522,10 @@ def test_missing_parse_dates_column_raises( content = StringIO("date,time,val\n2020-01-31,04:20:32,32\n") msg = f"Missing column provided to 'parse_dates': '{missing_cols}'" - depr_msg = ( - "Support for nested sequences for 
'parse_dates' in pd.read_csv is deprecated" - ) - warn = FutureWarning - if isinstance(parse_dates, list) and all( - isinstance(x, (int, str)) for x in parse_dates - ): - warn = None - with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(warn, match=depr_msg, check_stacklevel=False): - parser.read_csv( - content, sep=",", names=names, usecols=usecols, parse_dates=parse_dates - ) + parser.read_csv( + content, sep=",", names=names, usecols=usecols, parse_dates=parse_dates + ) @xfail_pyarrow # mismatched shape @@ -1189,37 +561,6 @@ def test_date_parser_multiindex_columns(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # TypeError: an integer is required -@pytest.mark.parametrize( - "parse_spec, col_name", - [ - ([[("a", "1"), ("b", "2")]], ("a_b", "1_2")), - ({("foo", "1"): [("a", "1"), ("b", "2")]}, ("foo", "1")), - ], -) -def test_date_parser_multiindex_columns_combine_cols(all_parsers, parse_spec, col_name): - parser = all_parsers - data = """a,b,c -1,2,3 -2019-12,-31,6""" - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(data), - parse_dates=parse_spec, - header=[0, 1], - ) - expected = DataFrame( - {col_name: Timestamp("2019-12-31").as_unit("ns"), ("c", "3"): [6]} - ) - tm.assert_frame_equal(result, expected) - - def test_date_parser_usecols_thousands(all_parsers): # GH#39365 data = """A,B,C @@ -1253,26 +594,6 @@ def test_date_parser_usecols_thousands(all_parsers): tm.assert_frame_equal(result, expected) -@xfail_pyarrow # mismatched shape -def test_parse_dates_and_keep_original_column(all_parsers): - # GH#13378 - parser = all_parsers - data = """A -20150908 -20150909 -""" - depr_msg = "The 'keep_date_col' keyword in pd.read_csv is deprecated" - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(data), parse_dates={"date": ["A"]}, keep_date_col=True - ) - expected_data = [Timestamp("2015-09-08"), Timestamp("2015-09-09")] - expected = DataFrame({"date": expected_data, "A": expected_data}) - tm.assert_frame_equal(result, expected) - - def test_dayfirst_warnings(): # GH 12585 @@ -1467,33 +788,6 @@ def test_parse_dates_dict_format(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize( - "key, parse_dates", [("a_b", [[0, 1]]), ("foo", {"foo": [0, 1]})] -) -def test_parse_dates_dict_format_two_columns(all_parsers, key, parse_dates): - # GH#51240 - parser = all_parsers - data = """a,b -31-,12-2019 -31-,12-2020""" - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(data), date_format={key: "%d- %m-%Y"}, parse_dates=parse_dates - ) - expected = DataFrame( - { - key: [Timestamp("2019-12-31"), Timestamp("2020-12-31")], - } - ) - tm.assert_frame_equal(result, expected) - - @xfail_pyarrow # object dtype index def test_parse_dates_dict_format_index(all_parsers): # GH#51240 diff --git a/pandas/tests/io/parser/usecols/test_parse_dates.py b/pandas/tests/io/parser/usecols/test_parse_dates.py index ab98857e0c178..0cf3fe894c916 100644 --- a/pandas/tests/io/parser/usecols/test_parse_dates.py +++ b/pandas/tests/io/parser/usecols/test_parse_dates.py @@ -26,42 +26,6 @@ ) 
-@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) -def test_usecols_with_parse_dates(all_parsers, usecols): - # see gh-9755 - data = """a,b,c,d,e -0,1,2014-01-01,09:00,4 -0,1,2014-01-02,10:00,4""" - parser = all_parsers - parse_dates = [[1, 2]] - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - - cols = { - "a": [0, 0], - "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")], - } - expected = DataFrame(cols, columns=["c_d", "a"]) - if parser.engine == "pyarrow": - with pytest.raises(ValueError, match=_msg_pyarrow_requires_names): - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - parser.read_csv( - StringIO(data), usecols=usecols, parse_dates=parse_dates - ) - return - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(data), usecols=usecols, parse_dates=parse_dates - ) - tm.assert_frame_equal(result, expected) - - @skip_pyarrow # pyarrow.lib.ArrowKeyError: Column 'fdate' in include_columns def test_usecols_with_parse_dates2(all_parsers): # see gh-13604 @@ -121,75 +85,3 @@ def test_usecols_with_parse_dates3(all_parsers): result = parser.read_csv(StringIO(data), usecols=usecols, parse_dates=parse_dates) tm.assert_frame_equal(result, expected) - - -def test_usecols_with_parse_dates4(all_parsers): - data = "a,b,c,d,e,f,g,h,i,j\n2016/09/21,1,1,2,3,4,5,6,7,8" - usecols = list("abcdefghij") - parse_dates = [[0, 1]] - parser = all_parsers - - cols = { - "a_b": "2016/09/21 1", - "c": [1], - "d": [2], - "e": [3], - "f": [4], - "g": [5], - "h": [6], - "i": [7], - "j": [8], - } - expected = DataFrame(cols, columns=["a_b"] + list("cdefghij")) - - depr_msg = ( - "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated" - ) - with tm.assert_produces_warning( - FutureWarning, match=depr_msg, check_stacklevel=False - ): - result = parser.read_csv( - StringIO(data), - usecols=usecols, - parse_dates=parse_dates, - ) - tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("usecols", [[0, 2, 3], [3, 0, 2]]) -@pytest.mark.parametrize( - "names", - [ - list("abcde"), # Names span all columns in original data. - list("acd"), # Names span only the selected columns. 
-    ],
-)
-def test_usecols_with_parse_dates_and_names(all_parsers, usecols, names, request):
-    # see gh-9755
-    s = """0,1,2014-01-01,09:00,4
-0,1,2014-01-02,10:00,4"""
-    parse_dates = [[1, 2]]
-    parser = all_parsers
-
-    if parser.engine == "pyarrow" and not (len(names) == 3 and usecols[0] == 0):
-        mark = pytest.mark.xfail(
-            reason="Length mismatch in some cases, UserWarning in other"
-        )
-        request.applymarker(mark)
-
-    cols = {
-        "a": [0, 0],
-        "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")],
-    }
-    expected = DataFrame(cols, columns=["c_d", "a"])
-
-    depr_msg = (
-        "Support for nested sequences for 'parse_dates' in pd.read_csv is deprecated"
-    )
-    with tm.assert_produces_warning(
-        FutureWarning, match=depr_msg, check_stacklevel=False
-    ):
-        result = parser.read_csv(
-            StringIO(s), names=names, parse_dates=parse_dates, usecols=usecols
-        )
-    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index f16f3a2a5c775..594c1d02b94cc 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -1050,20 +1050,6 @@ def test_parse_dates_list(self, flavor_read_html):
         res = flavor_read_html(StringIO(expected), parse_dates=["date"], index_col=0)
         tm.assert_frame_equal(df, res[0])

-    def test_parse_dates_combine(self, flavor_read_html):
-        raw_dates = Series(date_range("1/1/2001", periods=10))
-        df = DataFrame(
-            {
-                "date": raw_dates.map(lambda x: str(x.date())),
-                "time": raw_dates.map(lambda x: str(x.time())),
-            }
-        )
-        res = flavor_read_html(
-            StringIO(df.to_html()), parse_dates={"datetime": [1, 2]}, index_col=1
-        )
-        newdf = DataFrame({"datetime": raw_dates})
-        tm.assert_frame_equal(newdf, res[0])
-
     def test_wikipedia_states_table(self, datapath, flavor_read_html):
         data = datapath("io", "data", "html", "wikipedia_states.html")
         assert os.path.isfile(data), f"{data!r} is not a file"
diff --git a/pandas/tests/io/xml/test_xml_dtypes.py b/pandas/tests/io/xml/test_xml_dtypes.py
index a85576ff13f5c..96ef50f9d7149 100644
--- a/pandas/tests/io/xml/test_xml_dtypes.py
+++ b/pandas/tests/io/xml/test_xml_dtypes.py
@@ -378,58 +378,6 @@ def test_parse_dates_true(parser):
     tm.assert_frame_equal(df_iter, df_expected)


-def test_parse_dates_dictionary(parser):
-    xml = """<?xml version='1.0' encoding='utf-8'?>
-<data>
-  <row>
-    <shape>square</shape>
-    <degrees>360</degrees>
-    <sides>4.0</sides>
-    <year>2020</year>
-    <month>12</month>
-    <day>31</day>
-  </row>
-  <row>
-    <shape>circle</shape>
-    <degrees>360</degrees>
-    <sides/>
-    <year>2021</year>
-    <month>12</month>
-    <day>31</day>
-  </row>
-  <row>
-    <shape>triangle</shape>
-    <degrees>180</degrees>
-    <sides>3.0</sides>
-    <year>2022</year>
-    <month>12</month>
-    <day>31</day>
-  </row>
-</data>"""
-
-    df_result = read_xml(
-        StringIO(xml), parse_dates={"date_end": ["year", "month", "day"]}, parser=parser
-    )
-    df_iter = read_xml_iterparse(
-        xml,
-        parser=parser,
-        parse_dates={"date_end": ["year", "month", "day"]},
-        iterparse={"row": ["shape", "degrees", "sides", "year", "month", "day"]},
-    )
-
-    df_expected = DataFrame(
-        {
-            "date_end": to_datetime(["2020-12-31", "2021-12-31", "2022-12-31"]),
-            "shape": ["square", "circle", "triangle"],
-            "degrees": [360, 360, 180],
-            "sides": [4.0, float("nan"), 3.0],
-        }
-    )
-
-    tm.assert_frame_equal(df_result, df_expected)
-    tm.assert_frame_equal(df_iter, df_expected)
-
-
 def test_day_first_parse_dates(parser):
     xml = """\
 <?xml version='1.0' encoding='utf-8'?>
@@ -479,7 +427,5 @@ def test_wrong_parse_dates_type(xml_books, parser, iterparse):
-    with pytest.raises(
-        TypeError, match=("Only booleans, lists, and dictionaries are accepted")
-    ):
+    with pytest.raises(TypeError, match="Only booleans and lists are accepted"):
         read_xml(xml_books, parse_dates={"date"}, parser=parser, iterparse=iterparse)
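
Editor's migration note (not part of the patch): the tests deleted above exercised the removed reader-side behaviour of combining several text columns into one datetime column (`parse_dates=[[1, 2]]`, `parse_dates={"name": [...]}`, and `keep_date_col`). The supported replacement is to read the columns unchanged and combine them afterwards with `pd.to_datetime`. A minimal sketch follows, with column names modelled on the deleted KORD fixtures; the names and the format string are illustrative assumptions, not anything this patch adds.

    from io import StringIO

    import pandas as pd

    # Hypothetical data in the shape of the deleted KORD test fixtures.
    data = (
        "ID,date,nominalTime,actualTime,A\n"
        "KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
        "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"
    )
    df = pd.read_csv(StringIO(data))

    # Was: parse_dates={"nominal": ["date", "nominalTime"]} inside read_csv.
    # Now: concatenate the raw columns and parse them explicitly. The time
    # values carry a leading space from the CSV, which the format absorbs.
    df["nominal"] = pd.to_datetime(
        df["date"].astype(str) + df["nominalTime"], format="%Y%m%d %H:%M:%S"
    )

Unlike the old reader path, the combined column is appended rather than prepended, and the source columns are kept (the effect `keep_date_col` used to opt into), so drop them explicitly if they are no longer needed.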