Skip to content

Commit bd0369f

Browse files
committed
ENH: support timedelta64[ns] as a serialization type in HDFStore for query and append (GH3577)
1 parent 4577064 commit bd0369f

File tree

7 files changed

+121
-8
lines changed

7 files changed

+121
-8
lines changed

doc/source/io.rst

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2009,6 +2009,22 @@ space. These are in terms of the total number of rows in a table.
20092009
Term('minor_axis', '=', ['A','B']) ],
20102010
start=0, stop=10)
20112011
2012+
**Using timedelta64[ns]**
2013+
2014+
.. versionadded:: 0.13
2015+
2016+
Beginning in 0.13.0, you can store and query using the ``timedelta64[ns]`` type. Terms can be
2017+
specified in the format: ``<float>(<unit>)``, where float may be signed (and fractional), and unit can be
2018+
``D,s,ms,us,ns`` for the timedelta. Here's an example:
2019+
2020+
.. ipython:: python
2021+
2022+
from datetime import timedelta
2023+
dftd = DataFrame(dict(A = Timestamp('20130101'), B = [ Timestamp('20130101') + timedelta(days=i,seconds=10) for i in range(10) ]))
2024+
dftd['C'] = dftd['A']-dftd['B']
2025+
dftd
2026+
store.append('dftd',dftd,data_columns=True)
2027+
store.select('dftd',Term("C","<","-3.5D"))
20122028
20132029
Indexing
20142030
~~~~~~~~

doc/source/release.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,7 @@ API Changes
156156
- a column multi-index will be recreated properly (:issue:`4710`); raise on trying to use a multi-index
157157
with data_columns on the same axis
158158
- ``select_as_coordinates`` will now return an ``Int64Index`` of the resultant selection set
159+
- support ``timedelta64[ns]`` as a serialization type (:issue:`3577`)
159160
- ``JSON``
160161

161162
- added ``date_unit`` parameter to specify resolution of timestamps. Options

doc/source/v0.13.0.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ API changes
8080
See :ref:`here<io.hdf5-selecting_coordinates>` for an example.
8181
- allow a passed locations array or mask as a ``where`` condition (:issue:`4467`).
8282
See :ref:`here<io.hdf5-where_mask>` for an example.
83-
83+
- support ``timedelta64[ns]`` as a serialization type (:issue:`3577`)
8484
- the ``format`` keyword now replaces the ``table`` keyword; allowed values are ``fixed(f)`` or ``table(t)``
8585
the same defaults as prior to 0.13.0 remain, e.g. ``put`` implies 'fixed' or 'f' (Fixed) format
8686
and ``append`` implies 'table' or 't' (Table) format

pandas/core/common.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919
from pandas.core.config import get_option
2020
from pandas.core import array as pa
2121

22-
2322
# XXX: HACK for NumPy 1.5.1 to suppress warnings
2423
try:
2524
np.seterr(all='ignore')
@@ -704,13 +703,29 @@ def diff(arr, n, axis=0):
704703

705704
return out_arr
706705

706+
timedelta_search = re.compile(
707+
"^(?P<value>-?\d*\.?\d*)(?P<unit>D|s|ms|us|ns)?$")
707708

708-
def _coerce_scalar_to_timedelta_type(r):
709+
def _coerce_scalar_to_timedelta_type(r, unit='ns'):
709710
# kludgy here until we have a timedelta scalar
710711
# handle the numpy < 1.7 case
711712

713+
if isinstance(r, compat.string_types):
714+
m = timedelta_search.search(r)
715+
if m:
716+
r = float(m.groupdict()['value'])
717+
u = m.groupdict().get('unit')
718+
if u is not None:
719+
unit = u
720+
else:
721+
raise ValueError("cannot convert timedelta scalar value!")
722+
723+
r = tslib.cast_from_unit(unit, r)
724+
r = timedelta(microseconds=int(r)/1000)
725+
712726
if is_integer(r):
713-
r = timedelta(microseconds=r/1000)
727+
r = tslib.cast_from_unit(unit, r)
728+
r = timedelta(microseconds=int(r)/1000)
714729

715730
if _np_version_under1p7:
716731
if not isinstance(r, timedelta):

pandas/io/pytables.py

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525
from pandas.core.common import adjoin, is_list_like, pprint_thing
2626
from pandas.core.algorithms import match, unique
2727
from pandas.core.categorical import Categorical
28-
from pandas.core.common import _asarray_tuplesafe
28+
from pandas.core.common import _asarray_tuplesafe, _np_version_under1p7
2929
from pandas.core.internals import BlockManager, make_block
3030
from pandas.core.reshape import block2d_to_blocknd, factor_indexer
3131
from pandas.core.index import _ensure_index
@@ -1527,6 +1527,8 @@ def set_kind(self):
15271527
self.kind = 'integer'
15281528
elif dtype.startswith(u('date')):
15291529
self.kind = 'datetime'
1530+
elif dtype.startswith(u('timedelta')):
1531+
self.kind = 'timedelta'
15301532
elif dtype.startswith(u('bool')):
15311533
self.kind = 'bool'
15321534
else:
@@ -1547,6 +1549,11 @@ def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, encoding=No
15471549

15481550
if inferred_type == 'datetime64':
15491551
self.set_atom_datetime64(block)
1552+
elif dtype == 'timedelta64[ns]':
1553+
if _np_version_under1p7:
1554+
raise TypeError(
1555+
"timdelta64 is not supported under under numpy < 1.7")
1556+
self.set_atom_timedelta64(block)
15501557
elif inferred_type == 'date':
15511558
raise TypeError(
15521559
"[date] is not implemented as a table column")
@@ -1667,6 +1674,16 @@ def set_atom_datetime64(self, block, values=None):
16671674
values = block.values.view('i8')
16681675
self.set_data(values, 'datetime64')
16691676

1677+
def get_atom_timedelta64(self, block):
1678+
return _tables().Int64Col(shape=block.shape[0])
1679+
1680+
def set_atom_timedelta64(self, block, values=None):
1681+
self.kind = 'timedelta64'
1682+
self.typ = self.get_atom_timedelta64(block)
1683+
if values is None:
1684+
values = block.values.view('i8')
1685+
self.set_data(values, 'timedelta64')
1686+
16701687
@property
16711688
def shape(self):
16721689
return getattr(self.data, 'shape', None)
@@ -1719,6 +1736,8 @@ def convert(self, values, nan_rep, encoding):
17191736
else:
17201737
self.data = np.asarray(self.data, dtype='M8[ns]')
17211738

1739+
elif dtype == u('timedelta64'):
1740+
self.data = np.asarray(self.data, dtype='m8[ns]')
17221741
elif dtype == u('date'):
17231742
self.data = np.array(
17241743
[date.fromtimestamp(v) for v in self.data], dtype=object)
@@ -1767,6 +1786,9 @@ def get_atom_data(self, block):
17671786
def get_atom_datetime64(self, block):
17681787
return _tables().Int64Col()
17691788

1789+
def get_atom_timedelta64(self, block):
1790+
return _tables().Int64Col()
1791+
17701792

17711793
class GenericDataIndexableCol(DataIndexableCol):
17721794

@@ -2007,6 +2029,11 @@ def read_array(self, key):
20072029

20082030
if dtype == u('datetime64'):
20092031
ret = np.array(ret, dtype='M8[ns]')
2032+
elif dtype == u('timedelta64'):
2033+
if _np_version_under1p7:
2034+
raise TypeError(
2035+
"timedelta64 is not supported under under numpy < 1.7")
2036+
ret = np.array(ret, dtype='m8[ns]')
20102037

20112038
if transposed:
20122039
return ret.T
@@ -2214,6 +2241,9 @@ def write_array(self, key, value, items=None):
22142241
elif value.dtype.type == np.datetime64:
22152242
self._handle.createArray(self.group, key, value.view('i8'))
22162243
getattr(self.group, key)._v_attrs.value_type = 'datetime64'
2244+
elif value.dtype.type == np.timedelta64:
2245+
self._handle.createArray(self.group, key, value.view('i8'))
2246+
getattr(self.group, key)._v_attrs.value_type = 'timedelta64'
22172247
else:
22182248
if empty_array:
22192249
self.write_array_empty(key, value)
@@ -4000,7 +4030,9 @@ def eval(self):
40004030
""" set the numexpr expression for this term """
40014031

40024032
if not self.is_valid:
4003-
raise ValueError("query term is not valid [%s]" % str(self))
4033+
raise ValueError("query term is not valid [{0}]\n"
4034+
" all queries terms must include a reference to\n"
4035+
" either an axis (e.g. index or column), or a data_columns\n".format(str(self)))
40044036

40054037
# convert values if we are in the table
40064038
if self.is_in_table:
@@ -4060,6 +4092,9 @@ def stringify(value):
40604092
if v.tz is not None:
40614093
v = v.tz_convert('UTC')
40624094
return TermValue(v, v.value, kind)
4095+
elif kind == u('timedelta64') or kind == u('timedelta'):
4096+
v = com._coerce_scalar_to_timedelta_type(v,unit='s').item()
4097+
return TermValue(int(v), v, kind)
40634098
elif (isinstance(v, datetime) or hasattr(v, 'timetuple')
40644099
or kind == u('date')):
40654100
v = time.mktime(v.timetuple())

pandas/io/tests/test_pytables.py

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
assert_series_equal)
2424
from pandas import concat, Timestamp
2525
from pandas import compat
26+
from pandas.core import common as com
2627

2728
from numpy.testing.decorators import slow
2829

@@ -1732,7 +1733,7 @@ def test_unimplemented_dtypes_table_columns(self):
17321733
# this fails because we have a date in the object block......
17331734
self.assertRaises(TypeError, store.append, 'df_unimplemented', df)
17341735

1735-
def test_table_append_with_timezones(self):
1736+
def test_append_with_timezones(self):
17361737

17371738
from datetime import timedelta
17381739

@@ -1798,6 +1799,51 @@ def compare(a,b):
17981799
result = store.select('df')
17991800
assert_frame_equal(result,df)
18001801

1802+
def test_append_with_timedelta(self):
1803+
if com._np_version_under1p7:
1804+
raise nose.SkipTest("requires numpy >= 1.7")
1805+
1806+
# GH 3577
1807+
# append timedelta
1808+
1809+
from datetime import timedelta
1810+
df = DataFrame(dict(A = Timestamp('20130101'), B = [ Timestamp('20130101') + timedelta(days=i,seconds=10) for i in range(10) ]))
1811+
df['C'] = df['A']-df['B']
1812+
df.ix[3:5,'C'] = np.nan
1813+
1814+
with ensure_clean(self.path) as store:
1815+
1816+
# table
1817+
_maybe_remove(store, 'df')
1818+
store.append('df',df,data_columns=True)
1819+
result = store.select('df')
1820+
assert_frame_equal(result,df)
1821+
1822+
result = store.select('df',Term("C<100000"))
1823+
assert_frame_equal(result,df)
1824+
1825+
result = store.select('df',Term("C","<",-3*86400))
1826+
assert_frame_equal(result,df.iloc[3:])
1827+
1828+
result = store.select('df',Term("C","<",'-3D'))
1829+
assert_frame_equal(result,df.iloc[3:])
1830+
1831+
# a bit hacky here as we don't really deal with the NaT properly
1832+
1833+
result = store.select('df',Term("C","<",'-500000s'))
1834+
result = result.dropna(subset=['C'])
1835+
assert_frame_equal(result,df.iloc[6:])
1836+
1837+
result = store.select('df',Term("C","<",'-3.5D'))
1838+
result = result.iloc[1:]
1839+
assert_frame_equal(result,df.iloc[4:])
1840+
1841+
# fixed
1842+
_maybe_remove(store, 'df2')
1843+
store.put('df2',df)
1844+
result = store.select('df2')
1845+
assert_frame_equal(result,df)
1846+
18011847
def test_remove(self):
18021848

18031849
with ensure_clean(self.path) as store:

pandas/tslib.pyx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1276,7 +1276,7 @@ cdef inline _get_datetime64_nanos(object val):
12761276
else:
12771277
return ival
12781278

1279-
cdef inline int64_t cast_from_unit(object unit, object ts) except -1:
1279+
cpdef inline int64_t cast_from_unit(object unit, object ts) except -1:
12801280
""" return a casting of the unit represented to nanoseconds
12811281
round the fractional part of a float to our precision, p """
12821282
if unit == 'D':

0 commit comments

Comments
 (0)