Skip to content

Hand written ISO parser for Timedelta construction #19191

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jan 12, 2018
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
123 changes: 92 additions & 31 deletions pandas/_libs/tslibs/timedeltas.pyx
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# -*- coding: utf-8 -*-
# cython: profile=False
import collections
import re

import sys
cdef bint PY3 = (sys.version_info[0] >= 3)
Expand Down Expand Up @@ -236,6 +235,14 @@ cpdef inline int64_t cast_from_unit(object ts, object unit) except? -1:
return <int64_t> (base *m) + <int64_t> (frac *m)


cdef inline _decode_if_necessary(object ts):
    """
    Return ``ts`` unchanged when it is already unicode or when running on
    Python 3; otherwise return ``str(ts)`` decoded as UTF-8.
    """
    if PyUnicode_Check(ts) or PY3:
        # nothing to decode
        return ts

    return str(ts).decode('utf-8')


cdef inline parse_timedelta_string(object ts):
"""
Parse a regular format timedelta string. Return an int64_t (in ns)
Expand All @@ -258,9 +265,7 @@ cdef inline parse_timedelta_string(object ts):
if len(ts) == 0 or ts in nat_strings:
return NPY_NAT

# decode ts if necessary
if not PyUnicode_Check(ts) and not PY3:
ts = str(ts).decode('utf-8')
ts = _decode_if_necessary(ts)

for c in ts:

Expand Down Expand Up @@ -507,26 +512,14 @@ def _binary_op_method_timedeltalike(op, name):
# ----------------------------------------------------------------------
# Timedelta Construction

# Verbose-mode regular expression for an ISO 8601 duration string:
#   * a literal ``P``
#   * ``days``: optional ``-`` and any number of digits, followed by ``DT``
#   * ``hours`` and ``minutes``: one or two digits each, followed by ``H``
#     and ``M`` respectively
#   * ``seconds``: zero to two digits
#   * an optional ``.`` fraction split into ``milliseconds`` (1-3 digits),
#     ``microseconds`` (0-3 digits) and ``nanoseconds`` (0-3 digits)
#   * a closing ``S``
# NOTE(review): this listing is a rendered diff; the pattern may belong to
# the removed side of the change -- confirm before relying on it.
iso_pater = re.compile(r"""P
(?P<days>-?[0-9]*)DT
(?P<hours>[0-9]{1,2})H
(?P<minutes>[0-9]{1,2})M
(?P<seconds>[0-9]{0,2})
(\.
(?P<milliseconds>[0-9]{1,3})
(?P<microseconds>[0-9]{0,3})
(?P<nanoseconds>[0-9]{0,3})
)?S""", re.VERBOSE)


cdef inline int64_t parse_iso_format_string(object ts) except? -1:
    """
    Parse an ISO 8601 duration string one character at a time.

    Digits are collected in ``number`` and the designators ``D``, ``T``,
    ``H`` and ``M`` in ``unit``.  A pending (number, unit) pair is converted
    with ``timedelta_from_spec`` and added to the running total when the
    first digit of the following component is read.  The seconds and the
    fractional part are converted when ``.`` or ``S`` is read.

    Parameters
    ----------
    ts:
        ISO 8601 Duration formatted string

    Returns
    -------
    int64_t
        Sum of the values returned by ``timedelta_from_spec`` for every
        converted component, each passed through ``timedelta_as_neg``.
        Presumably nanoseconds, as documented for
        ``parse_timedelta_string`` -- TODO confirm.

    Raises
    ------
    ValueError
        If ``ts`` cannot be parsed
    """

    cdef:
        unicode c
        int64_t result = 0, r
        int p=0
        object dec_unit = 'ms', err_msg
        bint have_dot=0, have_value=0, neg=0
        list number=[], unit=[]

    ts = _decode_if_necessary(ts)

    err_msg = "Invalid ISO 8601 Duration format - {}".format(ts)

    for c in ts:
        # number (ascii codes)
        if ord(c) >= 48 and ord(c) <= 57:

            have_value = 1
            if have_dot:
                # ``p`` counts the digits of the current 3-digit fractional
                # group.  When a 4th digit arrives the finished group is
                # tagged with its unit (so the branch below flushes it) and
                # the next group moves on: ms -> us -> ns.
                if p == 3 and dec_unit != 'ns':
                    unit.append(dec_unit)
                    if dec_unit == 'ms':
                        dec_unit = 'us'
                    elif dec_unit == 'us':
                        dec_unit = 'ns'
                    p = 0
                p += 1

            if not len(unit):
                number.append(c)
            else:
                # A designator was read since the last digit, so the
                # collected digits form a complete component: flush it.
                # if in days, pop trailing T
                if unit[-1] == 'T':
                    unit.pop()
                elif 'H' in unit or 'M' in unit:
                    # hours and minutes accept at most two digits
                    if len(number) > 2:
                        raise ValueError(err_msg)
                r = timedelta_from_spec(number, '0', unit)
                result += timedelta_as_neg(r, neg)

                neg = 0
                # the current digit starts the next component
                unit, number = [], [c]
        else:
            if c == 'P':
                pass  # ignore leading character
            elif c == '-':
                # only a single sign, and only before any digit
                if neg or have_value:
                    raise ValueError(err_msg)
                else:
                    neg = 1
            elif c in ['D', 'T', 'H', 'M']:
                unit.append(c)
            elif c == '.':
                # append any seconds
                if len(number):
                    r = timedelta_from_spec(number, '0', 'S')
                    result += timedelta_as_neg(r, neg)
                    unit, number = [], []
                have_dot = 1
            elif c == 'S':
                if have_dot:  # ms, us, or ns
                    # need at least one fractional digit; more than three
                    # digits can only remain once ``dec_unit`` is 'ns'
                    if not len(number) or p > 3:
                        raise ValueError(err_msg)
                    # pad to 3 digits as required
                    pad = 3 - p
                    while pad > 0:
                        number.append('0')
                        pad -= 1

                    r = timedelta_from_spec(number, '0', dec_unit)
                    result += timedelta_as_neg(r, neg)
                else:  # seconds
                    if len(number) <= 2:
                        r = timedelta_from_spec(number, '0', 'S')
                        result += timedelta_as_neg(r, neg)
                    else:
                        raise ValueError(err_msg)
            else:
                raise ValueError(err_msg)

    # NOTE(review): D/H/M components are flushed only when a later digit is
    # read, so a component whose designator is the last character of ``ts``
    # is not added to ``result`` here -- confirm whether that is intended.

    if not have_value:
        # Received string only - never parsed any values
        raise ValueError(err_msg)

    return result


cdef _to_py_int_float(v):
Expand Down