Skip to content

Commit 9c455c7

Browse files
committed
Hand written ISO parser for Timedelta construction
1 parent 8acdf80 commit 9c455c7

File tree

1 file changed

+78
-25
lines changed

1 file changed

+78
-25
lines changed

pandas/_libs/tslibs/timedeltas.pyx

+78-25
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
# -*- coding: utf-8 -*-
22
# cython: profile=False
33
import collections
4-
import re
54

65
import sys
76
cdef bint PY3 = (sys.version_info[0] >= 3)
@@ -507,18 +506,6 @@ def _binary_op_method_timedeltalike(op, name):
507506
# ----------------------------------------------------------------------
508507
# Timedelta Construction
509508

510-
iso_pater = re.compile(r"""P
511-
(?P<days>-?[0-9]*)DT
512-
(?P<hours>[0-9]{1,2})H
513-
(?P<minutes>[0-9]{1,2})M
514-
(?P<seconds>[0-9]{0,2})
515-
(\.
516-
(?P<milliseconds>[0-9]{1,3})
517-
(?P<microseconds>[0-9]{0,3})
518-
(?P<nanoseconds>[0-9]{0,3})
519-
)?S""", re.VERBOSE)
520-
521-
522509
cdef int64_t parse_iso_format_string(object iso_fmt) except? -1:
523510
"""
524511
Extracts and cleanses the appropriate values from a match object with
@@ -540,22 +527,88 @@ cdef int64_t parse_iso_format_string(object iso_fmt) except? -1:
540527
If ``iso_fmt`` cannot be parsed
541528
"""
542529

543-
cdef int64_t ns = 0
530+
cdef:
531+
unicode c
532+
int64_t result = 0, r
533+
int p=0
534+
object dec_unit = 'ms', err_msg
535+
bint have_dot=0, have_value=0, neg=0
536+
list number=[], unit=[]
544537

545-
match = re.match(iso_pater, iso_fmt)
546-
if match:
547-
match_dict = match.groupdict(default='0')
548-
for comp in ['milliseconds', 'microseconds', 'nanoseconds']:
549-
match_dict[comp] = '{:0<3}'.format(match_dict[comp])
538+
err_msg = "Invalid ISO 8601 Duration format - {}".format(iso_fmt)
550539

551-
for k, v in match_dict.items():
552-
ns += timedelta_from_spec(v, '0', k)
540+
for c in iso_fmt:
541+
# number (ascii codes)
542+
if ord(c) >= 48 and ord(c) <= 57:
553543

554-
else:
555-
raise ValueError("Invalid ISO 8601 Duration format - "
556-
"{}".format(iso_fmt))
544+
have_value = 1
545+
if have_dot:
546+
if p == 3 and dec_unit != 'ns':
547+
unit.append(dec_unit)
548+
if dec_unit == 'ms':
549+
dec_unit = 'us'
550+
elif dec_unit == 'us':
551+
dec_unit = 'ns'
552+
p = 0
553+
p += 1
554+
555+
if not len(unit):
556+
number.append(c)
557+
else:
558+
# if in days, pop trailing T
559+
if unit[-1] == 'T':
560+
unit.pop()
561+
elif 'H' in unit or 'M' in unit:
562+
if len(number) > 2:
563+
raise ValueError(err_msg)
564+
r = timedelta_from_spec(number, '0', unit)
565+
result += timedelta_as_neg(r, neg)
557566

558-
return ns
567+
neg = 0
568+
unit, number = [], [c]
569+
else:
570+
if c == 'P':
571+
pass # ignore leading character
572+
elif c == '-':
573+
if neg or have_value:
574+
raise ValueError(err_msg)
575+
else:
576+
neg = 1
577+
elif c in ['D', 'T', 'H', 'M']:
578+
unit.append(c)
579+
elif c == '.':
580+
# append any seconds
581+
if len(number):
582+
r = timedelta_from_spec(number, '0', 'S')
583+
result += timedelta_as_neg(r, neg)
584+
unit, number = [], []
585+
have_dot = 1
586+
elif c == 'S':
587+
if have_dot: # ms, us, or ns
588+
if not len(number) or p > 3:
589+
raise ValueError(err_msg)
590+
# pad to 3 digits as required
591+
pad = 3 - p
592+
while pad > 0:
593+
number.append('0')
594+
pad -= 1
595+
596+
r = timedelta_from_spec(number, '0', dec_unit)
597+
result += timedelta_as_neg(r, neg)
598+
else: # seconds
599+
if len(number) <= 2:
600+
r = timedelta_from_spec(number, '0', 'S')
601+
result += timedelta_as_neg(r, neg)
602+
else:
603+
raise ValueError(err_msg)
604+
else:
605+
raise ValueError(err_msg)
606+
607+
if not have_value:
608+
# Received string only - never parsed any values
609+
raise ValueError(err_msg)
610+
611+
return result
559612

560613

561614
cdef _to_py_int_float(v):

0 commit comments

Comments
 (0)