Skip to content

Commit a17d449

Browse files
authored
CLN: Enforce read_csv(keep_date_col, parse_dates) deprecations (#58622)
* CLN: Enforce read_csv(keep_date_col, parse_dates) deprecations * Add whatsnew, address other tests * Remove unnecessary reference * inline function * Remove os.remove * Address html and xml tests * Typo * Simplify _process_date_conversion * Remove _get_complex_date_index * Remove concat arrays for csv * Un-xfail test * Remove convert to unicode
1 parent 24182c2 commit a17d449

File tree

16 files changed

+102
-1535
lines changed

16 files changed

+102
-1535
lines changed

asv_bench/benchmarks/io/csv.py

-10
Original file line numberDiff line numberDiff line change
@@ -445,16 +445,6 @@ def setup(self, engine):
445445
data = data.format(*two_cols)
446446
self.StringIO_input = StringIO(data)
447447

448-
def time_multiple_date(self, engine):
449-
read_csv(
450-
self.data(self.StringIO_input),
451-
engine=engine,
452-
sep=",",
453-
header=None,
454-
names=list(string.digits[:9]),
455-
parse_dates=[[1, 2], [1, 3]],
456-
)
457-
458448
def time_baseline(self, engine):
459449
read_csv(
460450
self.data(self.StringIO_input),

asv_bench/benchmarks/io/parsers.py

+1-24
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,5 @@
1-
import numpy as np
2-
31
try:
4-
from pandas._libs.tslibs.parsing import (
5-
_does_string_look_like_datetime,
6-
concat_date_cols,
7-
)
2+
from pandas._libs.tslibs.parsing import _does_string_look_like_datetime
83
except ImportError:
94
# Avoid whole benchmark suite import failure on asv (currently 0.4)
105
pass
@@ -20,21 +15,3 @@ def setup(self, value):
2015
def time_check_datetimes(self, value):
2116
for obj in self.objects:
2217
_does_string_look_like_datetime(obj)
23-
24-
25-
class ConcatDateCols:
26-
params = ([1234567890, "AAAA"], [1, 2])
27-
param_names = ["value", "dim"]
28-
29-
def setup(self, value, dim):
30-
count_elem = 10000
31-
if dim == 1:
32-
self.object = (np.array([value] * count_elem),)
33-
if dim == 2:
34-
self.object = (
35-
np.array([value] * count_elem),
36-
np.array([value] * count_elem),
37-
)
38-
39-
def time_check_concat(self, value, dim):
40-
concat_date_cols(self.object)

doc/source/user_guide/io.rst

+1-80
Original file line numberDiff line numberDiff line change
@@ -262,15 +262,9 @@ parse_dates : boolean or list of ints or names or list of lists or dict, default
262262
* If ``True`` -> try parsing the index.
263263
* If ``[1, 2, 3]`` -> try parsing columns 1, 2, 3 each as a separate date
264264
column.
265-
* If ``[[1, 3]]`` -> combine columns 1 and 3 and parse as a single date
266-
column.
267-
* If ``{'foo': [1, 3]}`` -> parse columns 1, 3 as date and call result 'foo'.
268265

269266
.. note::
270267
A fast-path exists for iso8601-formatted dates.
271-
keep_date_col : boolean, default ``False``
272-
If ``True`` and parse_dates specifies combining multiple columns then keep the
273-
original columns.
274268
date_format : str or dict of column -> format, default ``None``
275269
If used in conjunction with ``parse_dates``, will parse dates according to this
276270
format. For anything more complex,
@@ -802,71 +796,8 @@ The simplest case is to just pass in ``parse_dates=True``:
802796
803797
It is often the case that we may want to store date and time data separately,
804798
or store various date fields separately. The ``parse_dates`` keyword can be
805-
used to specify a combination of columns to parse the dates and/or times from.
806-
807-
You can specify a list of column lists to ``parse_dates``, the resulting date
808-
columns will be prepended to the output (so as to not affect the existing column
809-
order) and the new column names will be the concatenation of the component
810-
column names:
811-
812-
.. ipython:: python
813-
:okwarning:
814-
815-
data = (
816-
"KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
817-
"KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"
818-
"KORD,19990127, 21:00:00, 20:56:00, -0.5900\n"
819-
"KORD,19990127, 21:00:00, 21:18:00, -0.9900\n"
820-
"KORD,19990127, 22:00:00, 21:56:00, -0.5900\n"
821-
"KORD,19990127, 23:00:00, 22:56:00, -0.5900"
822-
)
823-
824-
with open("tmp.csv", "w") as fh:
825-
fh.write(data)
826-
827-
df = pd.read_csv("tmp.csv", header=None, parse_dates=[[1, 2], [1, 3]])
828-
df
829-
830-
By default the parser removes the component date columns, but you can choose
831-
to retain them via the ``keep_date_col`` keyword:
832-
833-
.. ipython:: python
834-
:okwarning:
835-
836-
df = pd.read_csv(
837-
"tmp.csv", header=None, parse_dates=[[1, 2], [1, 3]], keep_date_col=True
838-
)
839-
df
799+
used to specify columns to parse the dates and/or times.
840800

841-
Note that if you wish to combine multiple columns into a single date column, a
842-
nested list must be used. In other words, ``parse_dates=[1, 2]`` indicates that
843-
the second and third columns should each be parsed as separate date columns
844-
while ``parse_dates=[[1, 2]]`` means the two columns should be parsed into a
845-
single column.
846-
847-
You can also use a dict to specify custom name columns:
848-
849-
.. ipython:: python
850-
:okwarning:
851-
852-
date_spec = {"nominal": [1, 2], "actual": [1, 3]}
853-
df = pd.read_csv("tmp.csv", header=None, parse_dates=date_spec)
854-
df
855-
856-
It is important to remember that if multiple text columns are to be parsed into
857-
a single date column, then a new column is prepended to the data. The ``index_col``
858-
specification is based off of this new set of columns rather than the original
859-
data columns:
860-
861-
862-
.. ipython:: python
863-
:okwarning:
864-
865-
date_spec = {"nominal": [1, 2], "actual": [1, 3]}
866-
df = pd.read_csv(
867-
"tmp.csv", header=None, parse_dates=date_spec, index_col=0
868-
) # index is the nominal column
869-
df
870801

871802
.. note::
872803
If a column or index contains an unparsable date, the entire column or
@@ -880,10 +811,6 @@ data columns:
880811
for your data to store datetimes in this format, load times will be
881812
significantly faster, ~20x has been observed.
882813

883-
.. deprecated:: 2.2.0
884-
Combining date columns inside read_csv is deprecated. Use ``pd.to_datetime``
885-
on the relevant result columns instead.
886-
887814

888815
Date parsing functions
889816
++++++++++++++++++++++
@@ -899,12 +826,6 @@ Performance-wise, you should try these methods of parsing dates in order:
899826
then use ``to_datetime``.
900827

901828

902-
.. ipython:: python
903-
:suppress:
904-
905-
os.remove("tmp.csv")
906-
907-
908829
.. _io.csv.mixed_timezones:
909830

910831
Parsing a CSV with mixed timezones

doc/source/whatsnew/v3.0.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -260,8 +260,10 @@ Removal of prior version deprecations/changes
260260
- Enforced deprecation of ``axis=None`` acting the same as ``axis=0`` in the DataFrame reductions ``sum``, ``prod``, ``std``, ``var``, and ``sem``, passing ``axis=None`` will now reduce over both axes; this is particularly the case when doing e.g. ``numpy.sum(df)`` (:issue:`21597`)
261261
- Enforced deprecation of ``core.internals`` members ``Block``, ``ExtensionBlock``, and ``DatetimeTZBlock`` (:issue:`58467`)
262262
- Enforced deprecation of ``date_parser`` in :func:`read_csv`, :func:`read_table`, :func:`read_fwf`, and :func:`read_excel` in favour of ``date_format`` (:issue:`50601`)
263+
- Enforced deprecation of ``keep_date_col`` keyword in :func:`read_csv` (:issue:`55569`)
263264
- Enforced deprecation of ``quantile`` keyword in :meth:`.Rolling.quantile` and :meth:`.Expanding.quantile`, renamed to ``q`` instead. (:issue:`52550`)
264265
- Enforced deprecation of argument ``infer_datetime_format`` in :func:`read_csv`, as a strict version of it is now the default (:issue:`48621`)
266+
- Enforced deprecation of combining parsed datetime columns in :func:`read_csv` in ``parse_dates`` (:issue:`55569`)
265267
- Enforced deprecation of non-standard (``np.ndarray``, :class:`ExtensionArray`, :class:`Index`, or :class:`Series`) argument to :func:`api.extensions.take` (:issue:`52981`)
266268
- Enforced deprecation of parsing system timezone strings to ``tzlocal``, which depended on system timezone, pass the 'tz' keyword instead (:issue:`50791`)
267269
- Enforced deprecation of passing a dictionary to :meth:`SeriesGroupBy.agg` (:issue:`52268`)

pandas/_libs/tslibs/parsing.pyi

-3
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,4 @@ def guess_datetime_format(
2727
dt_str: str,
2828
dayfirst: bool | None = ...,
2929
) -> str | None: ...
30-
def concat_date_cols(
31-
date_cols: tuple,
32-
) -> npt.NDArray[np.object_]: ...
3330
def get_rule_month(source: str) -> str: ...

pandas/_libs/tslibs/parsing.pyx

+1-122
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ import warnings
77

88
from pandas.util._exceptions import find_stack_level
99

10-
cimport cython
1110
from cpython.datetime cimport (
1211
datetime,
1312
datetime_new,
@@ -18,7 +17,6 @@ from cpython.datetime cimport (
1817

1918
from datetime import timezone
2019

21-
from cpython.object cimport PyObject_Str
2220
from cpython.unicode cimport PyUnicode_AsUTF8AndSize
2321
from cython cimport Py_ssize_t
2422
from libc.string cimport strchr
@@ -28,15 +26,7 @@ import_datetime()
2826
import numpy as np
2927

3028
cimport numpy as cnp
31-
from numpy cimport (
32-
PyArray_GETITEM,
33-
PyArray_ITER_DATA,
34-
PyArray_ITER_NEXT,
35-
PyArray_IterNew,
36-
flatiter,
37-
float64_t,
38-
int64_t,
39-
)
29+
from numpy cimport int64_t
4030

4131
cnp.import_array()
4232

@@ -75,8 +65,6 @@ import_pandas_datetime()
7565

7666
from pandas._libs.tslibs.strptime import array_strptime
7767

78-
from pandas._libs.tslibs.util cimport is_array
79-
8068

8169
cdef extern from "pandas/portable.h":
8270
int getdigit_ascii(char c, int default) nogil
@@ -1097,115 +1085,6 @@ cdef void _maybe_warn_about_dayfirst(format: str, bint dayfirst) noexcept:
10971085
)
10981086

10991087

1100-
@cython.wraparound(False)
1101-
@cython.boundscheck(False)
1102-
cdef object convert_to_unicode(object item, bint keep_trivial_numbers):
1103-
"""
1104-
Convert `item` to str.
1105-
1106-
Parameters
1107-
----------
1108-
item : object
1109-
keep_trivial_numbers : bool
1110-
if True, then conversion (to string from integer/float zero)
1111-
is not performed
1112-
1113-
Returns
1114-
-------
1115-
str or int or float
1116-
"""
1117-
cdef:
1118-
float64_t float_item
1119-
1120-
if keep_trivial_numbers:
1121-
if isinstance(item, int):
1122-
if <int>item == 0:
1123-
return item
1124-
elif isinstance(item, float):
1125-
float_item = item
1126-
if float_item == 0.0 or float_item != float_item:
1127-
return item
1128-
1129-
if not isinstance(item, str):
1130-
item = PyObject_Str(item)
1131-
1132-
return item
1133-
1134-
1135-
@cython.wraparound(False)
1136-
@cython.boundscheck(False)
1137-
def concat_date_cols(tuple date_cols) -> np.ndarray:
1138-
"""
1139-
Concatenates elements from numpy arrays in `date_cols` into strings.
1140-
1141-
Parameters
1142-
----------
1143-
date_cols : tuple[ndarray]
1144-
1145-
Returns
1146-
-------
1147-
arr_of_rows : ndarray[object]
1148-
1149-
Examples
1150-
--------
1151-
>>> dates=np.array(['3/31/2019', '4/31/2019'], dtype=object)
1152-
>>> times=np.array(['11:20', '10:45'], dtype=object)
1153-
>>> result = concat_date_cols((dates, times))
1154-
>>> result
1155-
array(['3/31/2019 11:20', '4/31/2019 10:45'], dtype=object)
1156-
"""
1157-
cdef:
1158-
Py_ssize_t rows_count = 0, col_count = len(date_cols)
1159-
Py_ssize_t col_idx, row_idx
1160-
list list_to_join
1161-
cnp.ndarray[object] iters
1162-
object[::1] iters_view
1163-
flatiter it
1164-
cnp.ndarray[object] result
1165-
object[::1] result_view
1166-
1167-
if col_count == 0:
1168-
return np.zeros(0, dtype=object)
1169-
1170-
if not all(is_array(array) for array in date_cols):
1171-
raise ValueError("not all elements from date_cols are numpy arrays")
1172-
1173-
rows_count = min(len(array) for array in date_cols)
1174-
result = np.zeros(rows_count, dtype=object)
1175-
result_view = result
1176-
1177-
if col_count == 1:
1178-
array = date_cols[0]
1179-
it = <flatiter>PyArray_IterNew(array)
1180-
for row_idx in range(rows_count):
1181-
item = PyArray_GETITEM(array, PyArray_ITER_DATA(it))
1182-
result_view[row_idx] = convert_to_unicode(item, True)
1183-
PyArray_ITER_NEXT(it)
1184-
else:
1185-
# create fixed size list - more efficient memory allocation
1186-
list_to_join = [None] * col_count
1187-
iters = np.zeros(col_count, dtype=object)
1188-
1189-
# create memoryview of iters ndarray, that will contain some
1190-
# flatiter's for each array in `date_cols` - more efficient indexing
1191-
iters_view = iters
1192-
for col_idx, array in enumerate(date_cols):
1193-
iters_view[col_idx] = PyArray_IterNew(array)
1194-
1195-
# array elements that are on the same line are converted to one string
1196-
for row_idx in range(rows_count):
1197-
for col_idx, array in enumerate(date_cols):
1198-
# this cast is needed, because we did not find a way
1199-
# to efficiently store `flatiter` type objects in ndarray
1200-
it = <flatiter>iters_view[col_idx]
1201-
item = PyArray_GETITEM(array, PyArray_ITER_DATA(it))
1202-
list_to_join[col_idx] = convert_to_unicode(item, False)
1203-
PyArray_ITER_NEXT(it)
1204-
result_view[row_idx] = " ".join(list_to_join)
1205-
1206-
return result
1207-
1208-
12091088
cpdef str get_rule_month(str source):
12101089
"""
12111090
Return starting month of given freq, default is December.

pandas/io/parsers/arrow_parser_wrapper.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -174,8 +174,8 @@ def _finalize_pandas_output(self, frame: DataFrame) -> DataFrame:
174174
self.names = list(range(num_cols - len(self.names))) + self.names
175175
multi_index_named = False
176176
frame.columns = self.names
177-
# we only need the frame not the names
178-
_, frame = self._do_date_conversions(frame.columns, frame)
177+
178+
frame = self._do_date_conversions(frame.columns, frame)
179179
if self.index_col is not None:
180180
index_to_set = self.index_col.copy()
181181
for i, item in enumerate(self.index_col):

0 commit comments

Comments
 (0)