Skip to content

PERF: parse and timedelta ops improvements, #6755 #10396

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jun 22, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v0.17.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@ Removal of prior version deprecations/changes
Performance Improvements
~~~~~~~~~~~~~~~~~~~~~~~~

- 4x improvement in ``timedelta`` string parsing (:issue:`6755`)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

to_timedelta ?

- 8x improvement in ``timedelta64`` and ``datetime64`` ops (:issue:`6755`)

.. _whatsnew_0170.bug_fixes:

Bug Fixes
Expand Down
5 changes: 4 additions & 1 deletion pandas/tseries/tests/test_timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,9 @@ def test_construction(self):
# currently invalid as it has a - on the hhmmss part (only allowed on the days)
self.assertRaises(ValueError, lambda : Timedelta('-10 days -1 h 1.5m 1s 3us'))

# only leading neg signs are allowed
self.assertRaises(ValueError, lambda : Timedelta('10 days -1 h 1.5m 1s 3us'))

# roundtripping both for string and value
for v in ['1s',
'-1s',
Expand Down Expand Up @@ -151,7 +154,7 @@ def test_construction(self):
"cannot construct a TimeDelta",
lambda : Timedelta())
tm.assertRaisesRegexp(ValueError,
"cannot create timedelta string convert",
"unit abbreviation w/o a number",
lambda : Timedelta('foo'))
tm.assertRaisesRegexp(ValueError,
"cannot construct a TimeDelta from the passed arguments, allowed keywords are ",
Expand Down
163 changes: 3 additions & 160 deletions pandas/tseries/timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,22 +34,13 @@ def _convert_listlike(arg, box, unit):
if isinstance(arg, (list,tuple)) or ((hasattr(arg,'__iter__') and not hasattr(arg,'dtype'))):
arg = np.array(list(arg), dtype='O')

# these are shortcutable
if is_timedelta64_dtype(arg):
value = arg.astype('timedelta64[ns]')
elif is_integer_dtype(arg):

# these are shortcutable
value = arg.astype('timedelta64[{0}]'.format(unit)).astype('timedelta64[ns]')
value = arg.astype('timedelta64[{0}]'.format(unit)).astype('timedelta64[ns]', copy=False)
else:
try:
value = tslib.array_to_timedelta64(_ensure_object(arg), unit=unit, coerce=coerce)
except:

# try to process strings fast; may need to fallback
try:
value = np.array([ _get_string_converter(r, unit=unit)() for r in arg ],dtype='m8[ns]')
except:
value = np.array([ _coerce_scalar_to_timedelta_type(r, unit=unit, coerce=coerce) for r in arg ])
value = tslib.array_to_timedelta64(_ensure_object(arg), unit=unit, coerce=coerce)
value = value.astype('timedelta64[ns]', copy=False)

if box:
Expand Down Expand Up @@ -95,15 +86,6 @@ def _convert_listlike(arg, box, unit):
'NS' : 'ns',
'ns' : 'ns',
}
_unit_scale = {
'd' : 86400*1e9,
'h' : 3600*1e9,
'm' : 60*1e9,
's' : 1e9,
'ms' : 1e6,
'us' : 1e3,
'ns' : 1,
}

def _validate_timedelta_unit(arg):
""" provide validation / translation for timedelta short units """
Expand All @@ -114,150 +96,11 @@ def _validate_timedelta_unit(arg):
return 'ns'
raise ValueError("invalid timedelta unit {0} provided".format(arg))

_short_search = re.compile(
"^\s*(?P<neg>-?)\s*(?P<value>\d*\.?\d*)\s*(?P<unit>d|s|ms|us|ns)?\s*$",re.IGNORECASE)
_full_search = re.compile(
"^\s*(?P<neg>-?)\s*(?P<days>\d*?\.?\d*?)?\s*(days|d|day)?,?\s*\+?(?P<time>\d{1,2}:\d{2}:\d{2})?(?P<frac>\.\d+)?\s*$",re.IGNORECASE)
_nat_search = re.compile(
"^\s*(nat|nan)\s*$",re.IGNORECASE)
_whitespace = re.compile('^\s*$')
_number_split = re.compile("^(\d+\.?\d*)")

# construct the full2_search
abbrevs = [('d' ,'days|d|day'),
('h' ,'hours|h|hour'),
('m' ,'minutes|min|minute|m'),
('s' ,'seconds|sec|second|s'),
('ms','milliseconds|milli|millis|millisecond|ms'),
('us','microseconds|micro|micros|microsecond|us'),
('ns','nanoseconds|nano|nanos|nanosecond|ns')]

_full_search2 = re.compile(''.join(
["^\s*(?P<neg>-?)\s*"] + [ "(?P<" + p + ">\\d+\.?\d*\s*(" + ss + "))?\\s*" for p, ss in abbrevs ] + ['$']))

def _coerce_scalar_to_timedelta_type(r, unit='ns', box=True, coerce=False):
    """
    Convert a scalar to a timedelta value.

    Strings are first run through the converter returned by
    ``_get_string_converter``; after that the value is handled with
    ``unit='ns'``. The (possibly pre-converted) value is then passed to
    ``tslib.convert_to_timedelta`` together with ``unit`` and ``coerce``.

    Parameters
    ----------
    r : scalar
        value to convert
    unit : str, default 'ns'
        unit of ``r``; replaced by 'ns' once a string has been converted
    box : bool, default True
        wrap the converted value in ``tslib.Timedelta`` when true, otherwise
        return the result of ``tslib.convert_to_timedelta`` as is
    coerce : bool, default False
        forwarded unchanged to ``tslib.convert_to_timedelta``
    """
    if isinstance(r, compat.string_types):
        # the string converter already yields nanoseconds
        r = _get_string_converter(r, unit=unit)()
        unit = 'ns'

    converted = tslib.convert_to_timedelta(r, unit, coerce)
    return tslib.Timedelta(converted) if box else converted

def _get_string_converter(r, unit='ns'):
    """
    Return a converter callable for the timedelta string ``r``.

    The layout of ``r`` is detected by trying, in order: null / blank input,
    ``_short_search``, ``_full_search``, ``_full_search2`` and
    ``_nat_search``. The returned callable can be invoked without arguments
    to convert ``r`` itself; passing another string as its first argument
    re-applies the same pattern to that string.

    Parameters
    ----------
    r : str or null
        the value to build a converter for
    unit : str, default 'ns'
        unit used by the short-format converter when the string itself does
        not carry a unit suffix

    Returns
    -------
    callable
        yields ``tslib.iNaT`` for null / blank / nat-like input, otherwise
        the result of ``tslib.cast_from_unit``

    Raises
    ------
    ValueError
        if ``r`` matches none of the supported layouts
    """

    def convert_nat(r=None, unit=None):
        return tslib.iNaT

    # nulls and blank strings are treated as missing
    if isnull(r) or _whitespace.search(r):
        return convert_nat

    # "<number>[unit]"
    m = _short_search.search(r)
    if m:
        def convert(r=None, unit=unit, m=m):
            if r is not None:
                m = _short_search.search(r)

            gd = m.groupdict()

            number = float(gd['value'])
            u = gd.get('unit')
            if u is not None:
                # a unit in the string overrides the default unit
                unit = u.lower()
            result = tslib.cast_from_unit(number, unit)
            if gd['neg']:
                result *= -1
            return result
        return convert

    # "[days] [hh:mm:ss][.frac]"
    m = _full_search.search(r)
    if m:
        def convert(r=None, unit=None, m=m):
            if r is not None:
                m = _full_search.search(r)

            gd = m.groupdict()

            # handle time
            value = 0
            time = gd['time']
            if time:
                (hh, mm, ss) = time.split(':')
                value += int((float(hh)*3600 + float(mm)*60 + float(ss))*1e9)

            # handle frac
            frac = gd['frac']
            if frac:
                value += round(float(frac)*1e9)

            # the sign applies to the days component when one is present
            # (the time part is then added as is), otherwise to the time
            is_neg = gd['neg']
            if gd['days']:
                days = int((float(gd['days']) * 86400)*1e9)
                if is_neg:
                    days *= -1
                value += days
            elif is_neg:
                value *= -1
            return tslib.cast_from_unit(value, 'ns')
        return convert

    # look for combo strings, e.g. "1 days 2 h 3m"
    m = _full_search2.search(r)
    if m:
        def convert(r=None, unit=None, m=m):
            if r is not None:
                m = _full_search2.search(r)

            gd = m.groupdict()

            # every remaining group is named after a key of _unit_scale and
            # holds "<number><unit>" (or None when that unit is absent)
            neg = gd.pop('neg', None)
            value = 0
            for k, v in gd.items():
                if v is not None:
                    number = float(_number_split.search(v).group())
                    value += int(number*_unit_scale[k])

            # a leading sign negates the whole combined value
            if neg:
                value *= -1

            return tslib.cast_from_unit(value, 'ns')
        return convert

    # "nat" / "nan"
    if _nat_search.search(r):
        return convert_nat

    # no converter
    raise ValueError("cannot create timedelta string converter for [{0}]".format(r))

7 changes: 7 additions & 0 deletions pandas/tseries/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,7 @@ def _convert_listlike(arg, box, format):
if isinstance(arg, (list,tuple)):
arg = np.array(arg, dtype='O')

# these are shortcutable
if com.is_datetime64_ns_dtype(arg):
if box and not isinstance(arg, DatetimeIndex):
try:
Expand All @@ -271,6 +272,12 @@ def _convert_listlike(arg, box, format):
pass

return arg
elif format is None and com.is_integer_dtype(arg) and unit=='ns':
result = arg.astype('datetime64[ns]')
if box:
return DatetimeIndex(result, tz='utc' if utc else None)

return result

arg = com._ensure_object(arg)

Expand Down
Loading