Skip to content

Commit fc419d5

Browse files
author
MarcoGorelli
committed
make iso8601 fastpath respect exact
1 parent cc307ab commit fc419d5

File tree

11 files changed

+396
-22
lines changed

11 files changed

+396
-22
lines changed

f.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
from typing import NamedTuple
2+
3+
4+
class ISO8601Info(NamedTuple):
    """
    Description of a datetime format string, broken into its pieces.

    The text fields hold ``bytes`` rather than ``str``: their defaults are
    empty byte strings and ``format_is_iso`` fills them with UTF-8 encoded
    values. An instance built with no arguments (every text field empty and
    every flag ``False``) is what ``format_is_iso`` returns for a format it
    does not recognise.
    """

    # The complete format string, UTF-8 encoded.
    format: bytes = b""
    # Separator placed between the year, month and day directives.
    date_sep: bytes = b""
    # Separator placed between the date part and the time part.
    time_sep: bytes = b""
    # Directive(s) following the seconds: fractional seconds and/or timezone.
    micro_or_tz: bytes = b""
    # Flags recording which components the format contains.
    year: bool = False
    month: bool = False
    day: bool = False
    hour: bool = False
    minute: bool = False
    second: bool = False
15+
16+
17+
def format_is_iso(f: str):
    """
    Match ``f`` against the ISO 8601-style formats known to this module.

    A format is recognised when it equals ``%Y<d>%m<d>%d<t>%H:%M:%S<x>`` or
    that pattern truncated after the month, day, hour or minute, where

    * ``<d>`` is one date separator used consistently: space, ``/``, ``\\``,
      ``-``, ``.`` or nothing;
    * ``<t>`` is a space or ``T``;
    * ``<x>`` (only after the seconds) is empty or one of ``%z``, ``%Z``,
      ``.%f``, ``.%f%z``, ``.%f%Z``.

    The separator-free formats ``%Y%m%d`` and ``%Y%m``, and the bare ``%Y``,
    are rejected up front.

    Returns
    -------
    ISO8601Info
        For a recognised format: the encoded format and separators plus a
        ``True`` flag for every component present. Formats that stop before
        the seconds report an empty ``micro_or_tz``, and formats without a
        time part report ``time_sep`` as ``b" "`` because that is the first
        time separator tried. For anything else, a default ``ISO8601Info()``.
    """
    if f in ["%Y%m%d", "%Y%m", "%Y"]:
        return ISO8601Info()

    for date_sep in (" ", "/", "\\", "-", ".", ""):
        # Build each truncation once per separator instead of per comparison.
        upto_month = "%Y" + date_sep + "%m"
        upto_day = upto_month + date_sep + "%d"
        for time_sep in (" ", "T"):
            upto_hour = upto_day + time_sep + "%H"
            upto_minute = upto_hour + ":%M"
            upto_second = upto_minute + ":%S"
            for micro_or_tz in ("", "%z", "%Z", ".%f", ".%f%z", ".%f%Z"):
                # (candidate format, number of leading components present),
                # checked from the most complete form to the least complete.
                candidates = (
                    (upto_second + micro_or_tz, 6),
                    (upto_minute, 5),
                    (upto_hour, 4),
                    (upto_day, 3),
                    (upto_month, 2),
                )
                for candidate, n_present in candidates:
                    if candidate == f:
                        # Flags go in field order: year, month, day, hour,
                        # minute, second.
                        return ISO8601Info(
                            f.encode("utf-8"),
                            date_sep.encode("utf-8"),
                            time_sep.encode("utf-8"),
                            micro_or_tz.encode("utf-8"),
                            *(idx < n_present for idx in range(6)),
                        )
    return ISO8601Info()
88+
89+
90+
if __name__ == "__main__":
    # Manual smoke check: print the description of a timezone-aware format.
    sample_format = "%Y-%m-%d %H:%M:%S%z"
    print(format_is_iso(sample_format))
92+
# print(format_is_iso('%Y%m%d %H'))

pandas/_libs/tslib.pyx

Lines changed: 47 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,19 @@ def _test_parse_iso8601(ts: str):
9393
elif ts == 'today':
9494
return Timestamp.now().normalize()
9595

96-
string_to_dts(ts, &obj.dts, &out_bestunit, &out_local, &out_tzoffset, True)
96+
string_to_dts(ts, &obj.dts, &out_bestunit, &out_local, &out_tzoffset, True,
97+
format='',
98+
date_sep='',
99+
time_sep='',
100+
micro_or_tz='',
101+
year=False,
102+
month=False,
103+
day=False,
104+
hour=False,
105+
minute=False,
106+
second=False,
107+
exact=False,
108+
)
97109
obj.value = npy_datetimestruct_to_datetime(NPY_FR_ns, &obj.dts)
98110
check_dts_bounds(&obj.dts)
99111
if out_local == 1:
@@ -449,6 +461,17 @@ cpdef array_to_datetime(
449461
bint utc=False,
450462
bint require_iso8601=False,
451463
bint allow_mixed=False,
464+
const char *format='',
465+
const char *date_sep='',
466+
const char *time_sep='',
467+
const char *micro_or_tz='',
468+
bint year=False,
469+
bint month=False,
470+
bint day=False,
471+
bint hour=False,
472+
bint minute=False,
473+
bint second=False,
474+
bint exact=False,
452475
):
453476
"""
454477
Converts a 1D array of date-like values to a numpy array of either:
@@ -568,6 +591,16 @@ cpdef array_to_datetime(
568591
iresult[i] = get_datetime64_nanos(val, NPY_FR_ns)
569592

570593
elif is_integer_object(val) or is_float_object(val):
594+
if require_iso8601:
595+
if is_coerce:
596+
iresult[i] = NPY_NAT
597+
continue
598+
elif is_raise:
599+
raise ValueError(
600+
f"time data \"{val}\" at position {i} doesn't match format {format.decode('utf-8')}"
601+
)
602+
return values, tz_out
603+
571604
# these must be ns unit by-definition
572605
seen_integer = True
573606

@@ -598,7 +631,18 @@ cpdef array_to_datetime(
598631

599632
string_to_dts_failed = string_to_dts(
600633
val, &dts, &out_bestunit, &out_local,
601-
&out_tzoffset, False
634+
&out_tzoffset, False,
635+
format,
636+
date_sep=date_sep,
637+
time_sep=time_sep,
638+
micro_or_tz=micro_or_tz,
639+
year=year,
640+
month=month,
641+
day=day,
642+
hour=hour,
643+
minute=minute,
644+
second=second,
645+
exact=exact,
602646
)
603647
if string_to_dts_failed:
604648
# An error at this point is a _parsing_ error
@@ -613,7 +657,7 @@ cpdef array_to_datetime(
613657
continue
614658
elif is_raise:
615659
raise ValueError(
616-
f"time data \"{val}\" at position {i} doesn't match format specified"
660+
f"time data \"{val}\" at position {i} doesn't match format {format.decode('utf-8')}"
617661
)
618662
return values, tz_out
619663

pandas/_libs/tslibs/conversion.pyx

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -488,7 +488,18 @@ cdef _TSObject _convert_str_to_tsobject(object ts, tzinfo tz, str unit,
488488
else:
489489
string_to_dts_failed = string_to_dts(
490490
ts, &dts, &out_bestunit, &out_local,
491-
&out_tzoffset, False
491+
&out_tzoffset, False,
492+
'',
493+
date_sep='',
494+
time_sep='',
495+
micro_or_tz='',
496+
year=False,
497+
month=False,
498+
day=False,
499+
hour=False,
500+
minute=False,
501+
second=False,
502+
exact=False,
492503
)
493504
if not string_to_dts_failed:
494505
try:

pandas/_libs/tslibs/np_datetime.pxd

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,17 @@ cdef int string_to_dts(
9595
int* out_local,
9696
int* out_tzoffset,
9797
bint want_exc,
98+
const char *format,
99+
const char *date_sep,
100+
const char *time_sep,
101+
const char *micro_or_tz,
102+
bint year,
103+
bint month,
104+
bint day,
105+
bint hour,
106+
bint minute,
107+
bint second,
108+
bint exact,
98109
) except? -1
99110

100111
cdef NPY_DATETIMEUNIT get_unit_from_dtype(cnp.dtype dtype)

pandas/_libs/tslibs/np_datetime.pyx

Lines changed: 41 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,19 @@ cdef extern from "src/datetime/np_datetime_strings.h":
5252
int parse_iso_8601_datetime(const char *str, int len, int want_exc,
5353
npy_datetimestruct *out,
5454
NPY_DATETIMEUNIT *out_bestunit,
55-
int *out_local, int *out_tzoffset)
55+
int *out_local, int *out_tzoffset,
56+
int format,
57+
const char *date_sep,
58+
const char *time_sep,
59+
const char *micro_or_tz,
60+
int year,
61+
int month,
62+
int day,
63+
int hour,
64+
int minute,
65+
int second,
66+
int exact
67+
)
5668

5769

5870
# ----------------------------------------------------------------------
@@ -273,14 +285,40 @@ cdef inline int string_to_dts(
273285
int* out_local,
274286
int* out_tzoffset,
275287
bint want_exc,
288+
const char *format,
289+
const char *date_sep,
290+
const char *time_sep,
291+
const char *micro_or_tz,
292+
bint year,
293+
bint month,
294+
bint day,
295+
bint hour,
296+
bint minute,
297+
bint second,
298+
bint exact,
276299
) except? -1:
277300
cdef:
278301
Py_ssize_t length
302+
Py_ssize_t format_length
279303
const char* buf
280304

281305
buf = get_c_string_buf_and_size(val, &length)
282-
return parse_iso_8601_datetime(buf, length, want_exc,
283-
dts, out_bestunit, out_local, out_tzoffset)
306+
format_length = len(format)
307+
result = parse_iso_8601_datetime(buf, length, want_exc,
308+
dts, out_bestunit, out_local, out_tzoffset,
309+
format_length,
310+
date_sep,
311+
time_sep,
312+
micro_or_tz,
313+
year,
314+
month,
315+
day,
316+
hour,
317+
minute,
318+
second,
319+
exact,
320+
)
321+
return result
284322

285323

286324
cpdef ndarray astype_overflowsafe(

pandas/_libs/tslibs/parsing.pyx

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -409,7 +409,18 @@ cdef parse_datetime_string_with_reso(
409409
# TODO: does this render some/all of parse_delimited_date redundant?
410410
string_to_dts_failed = string_to_dts(
411411
date_string, &dts, &out_bestunit, &out_local,
412-
&out_tzoffset, False
412+
&out_tzoffset, False,
413+
'',
414+
'',
415+
'',
416+
'',
417+
False,
418+
False,
419+
False,
420+
False,
421+
False,
422+
False,
423+
False,
413424
)
414425
if not string_to_dts_failed:
415426
if dts.ps != 0 or out_local:

0 commit comments

Comments
 (0)