Skip to content

Commit d37e743

Browse files
author
MarcoGorelli
committed
share paths and fix bugs
1 parent ca3e0c8 commit d37e743

File tree

9 files changed

+179
-140
lines changed

9 files changed

+179
-140
lines changed

doc/source/whatsnew/v2.0.0.rst

+7
Original file line numberDiff line numberDiff line change
@@ -768,6 +768,7 @@ Performance improvements
768768
- Performance improvement in :func:`merge` when not merging on the index - the new index will now be :class:`RangeIndex` instead of :class:`Int64Index` (:issue:`49478`)
769769
- Performance improvement in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` when using any non-object dtypes (:issue:`46470`)
770770
- Performance improvement in :func:`read_html` when there are multiple tables (:issue:`49929`)
771+
- Performance improvement in :func:`to_datetime` when using ``'%Y%m%d'`` format (:issue:`17410`)
771772

772773
.. ---------------------------------------------------------------------------
773774
.. _whatsnew_200.bug_fixes:
@@ -794,6 +795,12 @@ Datetimelike
794795
- Bug in :func:`to_datetime` was raising ``ValueError`` when parsing empty string and non-ISO8601 format was passed. Now, empty strings will be parsed as :class:`NaT`, for compatibility with how this is done for ISO8601 formats (:issue:`50251`)
795796
- Bug in :class:`Timestamp` was showing ``UserWarning``, which was not actionable by users, when parsing non-ISO8601 delimited date strings (:issue:`50232`)
796797
- Bug in :func:`to_datetime` was showing misleading ``ValueError`` when parsing dates with format containing ISO week directive and ISO weekday directive (:issue:`50308`)
798+
- Bug in :func:`to_datetime` was not raising ``ValueError`` when parsing string with decimal date with format ``'%Y%m%d'`` (:issue:`50051`)
799+
- Bug in :func:`to_datetime` was not converting ``None`` to ``NaT`` when parsing mixed-offset date strings with ISO8601 format (:issue:`50071`)
800+
- Bug in :func:`to_datetime` was returning ``datetime.datetime`` object when parsing out-of-bounds date string with ``errors='ignore'`` and ``format='%Y%m%d'`` (:issue:`14487`)
801+
- Bug in :func:`to_datetime` was converting timezone-naive ``datetime.datetime`` when parsing with timezone-aware strings and ISO8601 format (:issue:`50254`)
802+
- Bug in :func:`to_datetime` was throwing ``ValueError`` when parsing dates with ISO8601 format where some values were not zero-padded (:issue:`21422`)
803+
- Bug in :func:`to_datetime` was giving incorrect results when using ``format='%Y%m%d'`` and ``errors='ignore'`` (:issue:`26493`)
797804
-
798805

799806
Timedelta

pandas/_libs/tslib.pyi

-3
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,6 @@ def array_to_datetime(
2323
dayfirst: bool = ...,
2424
yearfirst: bool = ...,
2525
utc: bool = ...,
26-
require_iso8601: bool = ...,
27-
format: str | None = ...,
28-
exact: bool = ...,
2926
) -> tuple[np.ndarray, tzinfo | None]: ...
3027

3128
# returned ndarray may be object dtype or datetime64[ns]

pandas/_libs/tslib.pyx

+1-40
Original file line numberDiff line numberDiff line change
@@ -409,9 +409,6 @@ cpdef array_to_datetime(
409409
bint dayfirst=False,
410410
bint yearfirst=False,
411411
bint utc=False,
412-
bint require_iso8601=False,
413-
format: str | None=None,
414-
bint exact=True,
415412
):
416413
"""
417414
Converts a 1D array of date-like values to a numpy array of either:
@@ -438,8 +435,6 @@ cpdef array_to_datetime(
438435
yearfirst parsing behavior when encountering datetime strings
439436
utc : bool, default False
440437
indicator whether the dates should be UTC
441-
require_iso8601 : bool, default False
442-
indicator whether the datetime string should be iso8601
443438
444439
Returns
445440
-------
@@ -510,16 +505,6 @@ cpdef array_to_datetime(
510505
iresult[i] = get_datetime64_nanos(val, NPY_FR_ns)
511506

512507
elif is_integer_object(val) or is_float_object(val):
513-
if require_iso8601:
514-
if is_coerce:
515-
iresult[i] = NPY_NAT
516-
continue
517-
elif is_raise:
518-
raise ValueError(
519-
f"time data \"{val}\" at position {i} doesn't "
520-
f"match format \"{format}\""
521-
)
522-
return values, tz_out
523508
# these must be ns unit by-definition
524509
seen_integer = True
525510

@@ -550,25 +535,13 @@ cpdef array_to_datetime(
550535

551536
string_to_dts_failed = string_to_dts(
552537
val, &dts, &out_bestunit, &out_local,
553-
&out_tzoffset, False, format, exact
538+
&out_tzoffset, False, None, False
554539
)
555540
if string_to_dts_failed:
556541
# An error at this point is a _parsing_ error
557542
# specifically _not_ OutOfBoundsDatetime
558543
if _parse_today_now(val, &iresult[i], utc):
559544
continue
560-
elif require_iso8601:
561-
# if requiring iso8601 strings, skip trying
562-
# other formats
563-
if is_coerce:
564-
iresult[i] = NPY_NAT
565-
continue
566-
elif is_raise:
567-
raise ValueError(
568-
f"time data \"{val}\" at position {i} doesn't "
569-
f"match format \"{format}\""
570-
)
571-
return values, tz_out
572545

573546
try:
574547
py_dt = parse_datetime_string(val,
@@ -631,18 +604,6 @@ cpdef array_to_datetime(
631604
if is_coerce:
632605
iresult[i] = NPY_NAT
633606
continue
634-
elif require_iso8601 and isinstance(val, str):
635-
# GH#19382 for just-barely-OutOfBounds falling back to
636-
# dateutil parser will return incorrect result because
637-
# it will ignore nanoseconds
638-
if is_raise:
639-
640-
# Still raise OutOfBoundsDatetime,
641-
# as error message is informative.
642-
raise
643-
644-
assert is_ignore
645-
return values, tz_out
646607
raise
647608

648609
except OutOfBoundsDatetime:

pandas/_libs/tslibs/parsing.pyx

+1-1
Original file line numberDiff line numberDiff line change
@@ -846,7 +846,7 @@ def format_is_iso(f: str) -> bint:
846846
but must be consistent. Leading 0s in dates and times are optional.
847847
"""
848848
iso_template = "%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S{micro_or_tz}".format
849-
excluded_formats = ["%Y%m%d", "%Y%m", "%Y"]
849+
excluded_formats = ["%Y%m", "%Y"]
850850

851851
for date_sep in [" ", "/", "\\", "-", ".", ""]:
852852
for time_sep in [" ", "T"]:

pandas/_libs/tslibs/strptime.pyx

+37
Original file line numberDiff line numberDiff line change
@@ -34,12 +34,14 @@ from pandas._libs.tslibs.nattype cimport (
3434
c_nat_strings as nat_strings,
3535
)
3636
from pandas._libs.tslibs.np_datetime cimport (
37+
NPY_DATETIMEUNIT,
3738
NPY_FR_ns,
3839
check_dts_bounds,
3940
npy_datetimestruct,
4041
npy_datetimestruct_to_datetime,
4142
pydate_to_dt64,
4243
pydatetime_to_dt64,
44+
string_to_dts,
4345
)
4446
from pandas._libs.tslibs.timestamps cimport _Timestamp
4547
from pandas._libs.util cimport (
@@ -93,6 +95,7 @@ def array_strptime(
9395
exact : matches must be exact if True, search if False
9496
errors : string specifying error handling, {'raise', 'ignore', 'coerce'}
9597
"""
98+
from pandas._libs.tslibs.parsing import format_is_iso
9699

97100
cdef:
98101
Py_ssize_t i, n = len(values)
@@ -110,6 +113,9 @@ def array_strptime(
110113
bint found_naive = False
111114
bint found_tz = False
112115
tzinfo tz_out = None
116+
bint iso_format = fmt is not None and format_is_iso(fmt)
117+
NPY_DATETIMEUNIT out_bestunit
118+
int out_local = 0, out_tzoffset = 0
113119

114120
assert is_raise or is_ignore or is_coerce
115121

@@ -230,6 +236,37 @@ def array_strptime(
230236
else:
231237
val = str(val)
232238

239+
if iso_format:
240+
string_to_dts_failed = string_to_dts(
241+
val, &dts, &out_bestunit, &out_local,
242+
&out_tzoffset, False, fmt, exact
243+
)
244+
if not string_to_dts_failed:
245+
# No error reported by string_to_dts, pick back up
246+
# where we left off
247+
value = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts)
248+
if out_local == 1:
249+
# Store the out_tzoffset in seconds
250+
# since we store the total_seconds of
251+
# dateutil.tz.tzoffset objects
252+
tz = timezone(timedelta(minutes=out_tzoffset))
253+
result_timezone[i] = tz
254+
out_local = 0
255+
out_tzoffset = 0
256+
iresult[i] = value
257+
try:
258+
check_dts_bounds(&dts)
259+
except ValueError:
260+
if is_coerce:
261+
iresult[i] = NPY_NAT
262+
continue
263+
raise
264+
continue
265+
266+
# Some ISO formats can't be parsed by string_to_dts
267+
# For example, 6-digit YYYYMD. So, if there's an error,
268+
# try the string-matching code below.
269+
233270
# exact matching
234271
if exact:
235272
found = format_regex.match(val)

pandas/core/arrays/datetimes.py

-7
Original file line numberDiff line numberDiff line change
@@ -2118,10 +2118,7 @@ def objects_to_datetime64ns(
21182118
yearfirst,
21192119
utc: bool = False,
21202120
errors: DateTimeErrorChoices = "raise",
2121-
require_iso8601: bool = False,
21222121
allow_object: bool = False,
2123-
format: str | None = None,
2124-
exact: bool = True,
21252122
):
21262123
"""
21272124
Convert data to array of timestamps.
@@ -2134,7 +2131,6 @@ def objects_to_datetime64ns(
21342131
utc : bool, default False
21352132
Whether to convert/localize timestamps to UTC.
21362133
errors : {'raise', 'ignore', 'coerce'}
2137-
require_iso8601 : bool, default False
21382134
allow_object : bool
21392135
Whether to return an object-dtype ndarray instead of raising if the
21402136
data contains more than one timezone.
@@ -2165,9 +2161,6 @@ def objects_to_datetime64ns(
21652161
utc=utc,
21662162
dayfirst=dayfirst,
21672163
yearfirst=yearfirst,
2168-
require_iso8601=require_iso8601,
2169-
format=format,
2170-
exact=exact,
21712164
)
21722165
result = result.reshape(data.shape, order=order)
21732166
except OverflowError as err:

pandas/core/tools/datetimes.py

+6-59
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@
3636
from pandas._libs.tslibs.conversion import precision_from_unit
3737
from pandas._libs.tslibs.parsing import (
3838
DateParseError,
39-
format_is_iso,
4039
guess_datetime_format,
4140
)
4241
from pandas._libs.tslibs.strptime import array_strptime
@@ -419,7 +418,6 @@ def _convert_listlike_datetimes(
419418

420419
# warn if passing timedelta64, raise for PeriodDtype
421420
# NB: this must come after unit transformation
422-
orig_arg = arg
423421
try:
424422
arg, _ = maybe_convert_dtype(arg, copy=False, tz=libtimezones.maybe_get_tz(tz))
425423
except TypeError:
@@ -432,35 +430,20 @@ def _convert_listlike_datetimes(
432430
raise
433431

434432
arg = ensure_object(arg)
435-
require_iso8601 = False
436433

437434
if format is None:
438435
format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)
439436

440-
# There is a special fast-path for iso8601 formatted datetime strings
441-
require_iso8601 = format is not None and format_is_iso(format)
442-
443-
if format is not None and not require_iso8601:
444-
return _to_datetime_with_format(
445-
arg,
446-
orig_arg,
447-
name,
448-
utc,
449-
format,
450-
exact,
451-
errors,
452-
)
437+
if format is not None:
438+
return _array_strptime_with_fallback(arg, name, utc, format, exact, errors)
453439

454440
result, tz_parsed = objects_to_datetime64ns(
455441
arg,
456442
dayfirst=dayfirst,
457443
yearfirst=yearfirst,
458444
utc=utc,
459445
errors=errors,
460-
require_iso8601=require_iso8601,
461446
allow_object=True,
462-
format=format,
463-
exact=exact,
464447
)
465448

466449
if tz_parsed is not None:
@@ -512,40 +495,6 @@ def _array_strptime_with_fallback(
512495
return _box_as_indexlike(result, utc=utc, name=name)
513496

514497

515-
def _to_datetime_with_format(
516-
arg,
517-
orig_arg,
518-
name,
519-
utc: bool,
520-
fmt: str,
521-
exact: bool,
522-
errors: str,
523-
) -> Index:
524-
"""
525-
Try parsing with the given format.
526-
"""
527-
result = None
528-
529-
# shortcut formatting here
530-
if fmt == "%Y%m%d":
531-
# pass orig_arg as float-dtype may have been converted to
532-
# datetime64[ns]
533-
orig_arg = ensure_object(orig_arg)
534-
try:
535-
# may return None without raising
536-
result = _attempt_YYYYMMDD(orig_arg, errors=errors)
537-
except (ValueError, TypeError, OutOfBoundsDatetime) as err:
538-
raise ValueError(
539-
"cannot convert the input to '%Y%m%d' date format"
540-
) from err
541-
if result is not None:
542-
return _box_as_indexlike(result, utc=utc, name=name)
543-
544-
# fallback
545-
res = _array_strptime_with_fallback(arg, name, utc, fmt, exact, errors)
546-
return res
547-
548-
549498
def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index:
550499
"""
551500
to_datetime specialized to the case where a 'unit' is passed.
@@ -1000,7 +949,7 @@ def to_datetime(
1000949
in addition to forcing non-dates (or non-parseable dates) to :const:`NaT`.
1001950
1002951
>>> pd.to_datetime('13000101', format='%Y%m%d', errors='ignore')
1003-
datetime.datetime(1300, 1, 1, 0, 0)
952+
'13000101'
1004953
>>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce')
1005954
NaT
1006955
@@ -1033,14 +982,12 @@ def to_datetime(
1033982
Index([2020-10-25 02:00:00+02:00, 2020-10-25 04:00:00+01:00],
1034983
dtype='object')
1035984
1036-
- A mix of timezone-aware and timezone-naive inputs is converted to
1037-
a timezone-aware :class:`DatetimeIndex` if the offsets of the timezone-aware
1038-
are constant:
985+
- A mix of timezone-aware and timezone-naive inputs is also converted to
986+
a simple :class:`Index` containing :class:`datetime.datetime` objects:
1039987
1040988
>>> from datetime import datetime
1041989
>>> pd.to_datetime(["2020-01-01 01:00:00-01:00", datetime(2020, 1, 1, 3, 0)])
1042-
DatetimeIndex(['2020-01-01 01:00:00-01:00', '2020-01-01 02:00:00-01:00'],
1043-
dtype='datetime64[ns, UTC-01:00]', freq=None)
990+
Index([2020-01-01 01:00:00-01:00, 2020-01-01 03:00:00], dtype='object')
1044991
1045992
|
1046993

0 commit comments

Comments
 (0)