From bd0369fffc2667ecf9876952ecf458aa88c598e1 Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 11 Sep 2013 19:59:29 -0400 Subject: [PATCH 1/4] ENH: support timedelta64[ns] as a serialization type in HDFStore for query and append (GH3577) --- doc/source/io.rst | 16 +++++++++++ doc/source/release.rst | 1 + doc/source/v0.13.0.txt | 2 +- pandas/core/common.py | 21 ++++++++++++-- pandas/io/pytables.py | 39 ++++++++++++++++++++++++-- pandas/io/tests/test_pytables.py | 48 +++++++++++++++++++++++++++++++- pandas/tslib.pyx | 2 +- 7 files changed, 121 insertions(+), 8 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index 1d3980e216587..94c8343bfec1d 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2009,6 +2009,22 @@ space. These are in terms of the total number of rows in a table. Term('minor_axis', '=', ['A','B']) ], start=0, stop=10) +**Using timedelta64[ns]** + +.. versionadded:: 0.13 + +Beginning in 0.13.0, you can store and query using the ``timedelta64[ns]`` type. Terms can be +specified in the format: ``()``, where float may be signed (and fractional), and unit can be +``D,s,ms,us,ns`` for the timedelta. Here's an example: + +.. 
ipython:: python + + from datetime import timedelta + dftd = DataFrame(dict(A = Timestamp('20130101'), B = [ Timestamp('20130101') + timedelta(days=i,seconds=10) for i in range(10) ])) + dftd['C'] = dftd['A']-dftd['B'] + dftd + store.append('dftd',dftd,data_columns=True) + store.select('dftd',Term("C","<","-3.5D")) Indexing ~~~~~~~~ diff --git a/doc/source/release.rst b/doc/source/release.rst index 087d2880511d2..d50438cd08058 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -156,6 +156,7 @@ API Changes - a column multi-index will be recreated properly (:issue:`4710`); raise on trying to use a multi-index with data_columns on the same axis - ``select_as_coordinates`` will now return an ``Int64Index`` of the resultant selection set + - support ``timedelta64[ns]`` as a serialization type (:issue:`3577`) - ``JSON`` - added ``date_unit`` parameter to specify resolution of timestamps. Options diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt index caf218747bdfb..536b0badeec00 100644 --- a/doc/source/v0.13.0.txt +++ b/doc/source/v0.13.0.txt @@ -80,7 +80,7 @@ API changes See :ref:`here` for an example. - allow a passed locations array or mask as a ``where`` condition (:issue:`4467`). See :ref:`here` for an example. - + - support ``timedelta64[ns]`` as a serialization type (:issue:`3577`) - the ``format`` keyword now replaces the ``table`` keyword; allowed values are ``fixed(f)`` or ``table(t)`` the same defaults as prior < 0.13.0 remain, e.g. 
``put`` implies 'fixed` or 'f' (Fixed) format and ``append`` imples 'table' or 't' (Table) format diff --git a/pandas/core/common.py b/pandas/core/common.py index ba7c6cc511933..2f153e88f4dc6 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -19,7 +19,6 @@ from pandas.core.config import get_option from pandas.core import array as pa - # XXX: HACK for NumPy 1.5.1 to suppress warnings try: np.seterr(all='ignore') @@ -704,13 +703,29 @@ def diff(arr, n, axis=0): return out_arr +timedelta_search = re.compile( + "^(?P-?\d*\.?\d*)(?PD|s|ms|us|ns)?$") -def _coerce_scalar_to_timedelta_type(r): +def _coerce_scalar_to_timedelta_type(r, unit='ns'): # kludgy here until we have a timedelta scalar # handle the numpy < 1.7 case + if isinstance(r, compat.string_types): + m = timedelta_search.search(r) + if m: + r = float(m.groupdict()['value']) + u = m.groupdict().get('unit') + if u is not None: + unit = u + else: + raise ValueError("cannot convert timedelta scalar value!") + + r = tslib.cast_from_unit(unit, r) + r = timedelta(microseconds=int(r)/1000) + if is_integer(r): - r = timedelta(microseconds=r/1000) + r = tslib.cast_from_unit(unit, r) + r = timedelta(microseconds=int(r)/1000) if _np_version_under1p7: if not isinstance(r, timedelta): diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 6759e07ed7935..bf45fbf96c0e5 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -25,7 +25,7 @@ from pandas.core.common import adjoin, is_list_like, pprint_thing from pandas.core.algorithms import match, unique from pandas.core.categorical import Categorical -from pandas.core.common import _asarray_tuplesafe +from pandas.core.common import _asarray_tuplesafe, _np_version_under1p7 from pandas.core.internals import BlockManager, make_block from pandas.core.reshape import block2d_to_blocknd, factor_indexer from pandas.core.index import _ensure_index @@ -1527,6 +1527,8 @@ def set_kind(self): self.kind = 'integer' elif dtype.startswith(u('date')): 
self.kind = 'datetime' + elif dtype.startswith(u('timedelta')): + self.kind = 'timedelta' elif dtype.startswith(u('bool')): self.kind = 'bool' else: @@ -1547,6 +1549,11 @@ def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, encoding=No if inferred_type == 'datetime64': self.set_atom_datetime64(block) + elif dtype == 'timedelta64[ns]': + if _np_version_under1p7: + raise TypeError( + "timdelta64 is not supported under under numpy < 1.7") + self.set_atom_timedelta64(block) elif inferred_type == 'date': raise TypeError( "[date] is not implemented as a table column") @@ -1667,6 +1674,16 @@ def set_atom_datetime64(self, block, values=None): values = block.values.view('i8') self.set_data(values, 'datetime64') + def get_atom_timedelta64(self, block): + return _tables().Int64Col(shape=block.shape[0]) + + def set_atom_timedelta64(self, block, values=None): + self.kind = 'timedelta64' + self.typ = self.get_atom_timedelta64(block) + if values is None: + values = block.values.view('i8') + self.set_data(values, 'timedelta64') + @property def shape(self): return getattr(self.data, 'shape', None) @@ -1719,6 +1736,8 @@ def convert(self, values, nan_rep, encoding): else: self.data = np.asarray(self.data, dtype='M8[ns]') + elif dtype == u('timedelta64'): + self.data = np.asarray(self.data, dtype='m8[ns]') elif dtype == u('date'): self.data = np.array( [date.fromtimestamp(v) for v in self.data], dtype=object) @@ -1767,6 +1786,9 @@ def get_atom_data(self, block): def get_atom_datetime64(self, block): return _tables().Int64Col() + def get_atom_timedelta64(self, block): + return _tables().Int64Col() + class GenericDataIndexableCol(DataIndexableCol): @@ -2007,6 +2029,11 @@ def read_array(self, key): if dtype == u('datetime64'): ret = np.array(ret, dtype='M8[ns]') + elif dtype == u('timedelta64'): + if _np_version_under1p7: + raise TypeError( + "timedelta64 is not supported under under numpy < 1.7") + ret = np.array(ret, dtype='m8[ns]') if transposed: return ret.T @@ 
-2214,6 +2241,9 @@ def write_array(self, key, value, items=None): elif value.dtype.type == np.datetime64: self._handle.createArray(self.group, key, value.view('i8')) getattr(self.group, key)._v_attrs.value_type = 'datetime64' + elif value.dtype.type == np.timedelta64: + self._handle.createArray(self.group, key, value.view('i8')) + getattr(self.group, key)._v_attrs.value_type = 'timedelta64' else: if empty_array: self.write_array_empty(key, value) @@ -4000,7 +4030,9 @@ def eval(self): """ set the numexpr expression for this term """ if not self.is_valid: - raise ValueError("query term is not valid [%s]" % str(self)) + raise ValueError("query term is not valid [{0}]\n" + " all queries terms must include a reference to\n" + " either an axis (e.g. index or column), or a data_columns\n".format(str(self))) # convert values if we are in the table if self.is_in_table: @@ -4060,6 +4092,9 @@ def stringify(value): if v.tz is not None: v = v.tz_convert('UTC') return TermValue(v, v.value, kind) + elif kind == u('timedelta64') or kind == u('timedelta'): + v = com._coerce_scalar_to_timedelta_type(v,unit='s').item() + return TermValue(int(v), v, kind) elif (isinstance(v, datetime) or hasattr(v, 'timetuple') or kind == u('date')): v = time.mktime(v.timetuple()) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 7e5c3f9fff061..3667dff994232 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -23,6 +23,7 @@ assert_series_equal) from pandas import concat, Timestamp from pandas import compat +from pandas.core import common as com from numpy.testing.decorators import slow @@ -1732,7 +1733,7 @@ def test_unimplemented_dtypes_table_columns(self): # this fails because we have a date in the object block...... 
self.assertRaises(TypeError, store.append, 'df_unimplemented', df) - def test_table_append_with_timezones(self): + def test_append_with_timezones(self): from datetime import timedelta @@ -1798,6 +1799,51 @@ def compare(a,b): result = store.select('df') assert_frame_equal(result,df) + def test_append_with_timedelta(self): + if com._np_version_under1p7: + raise nose.SkipTest("requires numpy >= 1.7") + + # GH 3577 + # append timedelta + + from datetime import timedelta + df = DataFrame(dict(A = Timestamp('20130101'), B = [ Timestamp('20130101') + timedelta(days=i,seconds=10) for i in range(10) ])) + df['C'] = df['A']-df['B'] + df.ix[3:5,'C'] = np.nan + + with ensure_clean(self.path) as store: + + # table + _maybe_remove(store, 'df') + store.append('df',df,data_columns=True) + result = store.select('df') + assert_frame_equal(result,df) + + result = store.select('df',Term("C<100000")) + assert_frame_equal(result,df) + + result = store.select('df',Term("C","<",-3*86400)) + assert_frame_equal(result,df.iloc[3:]) + + result = store.select('df',Term("C","<",'-3D')) + assert_frame_equal(result,df.iloc[3:]) + + # a bit hacky here as we don't really deal with the NaT properly + + result = store.select('df',Term("C","<",'-500000s')) + result = result.dropna(subset=['C']) + assert_frame_equal(result,df.iloc[6:]) + + result = store.select('df',Term("C","<",'-3.5D')) + result = result.iloc[1:] + assert_frame_equal(result,df.iloc[4:]) + + # fixed + _maybe_remove(store, 'df2') + store.put('df2',df) + result = store.select('df2') + assert_frame_equal(result,df) + def test_remove(self): with ensure_clean(self.path) as store: diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 983d3385e8f85..b145bd8fba507 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -1276,7 +1276,7 @@ cdef inline _get_datetime64_nanos(object val): else: return ival -cdef inline int64_t cast_from_unit(object unit, object ts) except -1: +cpdef inline int64_t cast_from_unit(object unit, object ts) except -1: 
""" return a casting of the unit represented to nanoseconds round the fractional part of a float to our precision, p """ if unit == 'D': From d8280c12a3449b66beaf0459bf86e71e99a44b2b Mon Sep 17 00:00:00 2001 From: jreback Date: Thu, 12 Sep 2013 07:41:47 -0400 Subject: [PATCH 2/4] CLN: refactored locations of timedeltas to core/tseries/timedeltas (from a series of functions in core/common) --- pandas/__init__.py | 13 ++++ pandas/core/common.py | 119 ++----------------------------- pandas/core/generic.py | 4 +- pandas/core/series.py | 16 +++-- pandas/io/pytables.py | 7 +- pandas/io/tests/test_pytables.py | 4 +- pandas/tests/test_frame.py | 5 +- pandas/tests/test_series.py | 14 ++-- pandas/tseries/index.py | 3 +- pandas/tseries/timedeltas.py | 111 ++++++++++++++++++++++++++++ 10 files changed, 156 insertions(+), 140 deletions(-) create mode 100644 pandas/tseries/timedeltas.py diff --git a/pandas/__init__.py b/pandas/__init__.py index a0edb397c28c1..03681d3fa5a3f 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -18,6 +18,19 @@ from datetime import datetime import numpy as np +# XXX: HACK for NumPy 1.5.1 to suppress warnings +try: + np.seterr(all='ignore') + # np.set_printoptions(suppress=True) +except Exception: # pragma: no cover + pass + +# numpy versioning +from distutils.version import LooseVersion +_np_version = np.version.short_version +_np_version_under1p6 = LooseVersion(_np_version) < '1.6' +_np_version_under1p7 = LooseVersion(_np_version) < '1.7' + from pandas.version import version as __version__ from pandas.info import __doc__ diff --git a/pandas/core/common.py b/pandas/core/common.py index 2f153e88f4dc6..b58bd92a4fd1f 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -11,7 +11,6 @@ import pandas.algos as algos import pandas.lib as lib import pandas.tslib as tslib -from distutils.version import LooseVersion from pandas import compat from pandas.compat import StringIO, BytesIO, range, long, u, zip, map from datetime import timedelta 
@@ -19,14 +18,6 @@ from pandas.core.config import get_option from pandas.core import array as pa -# XXX: HACK for NumPy 1.5.1 to suppress warnings -try: - np.seterr(all='ignore') - # np.set_printoptions(suppress=True) -except Exception: # pragma: no cover - pass - - class PandasError(Exception): pass @@ -34,11 +25,6 @@ class PandasError(Exception): class AmbiguousIndexError(PandasError, KeyError): pass -# versioning -_np_version = np.version.short_version -_np_version_under1p6 = LooseVersion(_np_version) < '1.6' -_np_version_under1p7 = LooseVersion(_np_version) < '1.7' - _POSSIBLY_CAST_DTYPES = set([np.dtype(t) for t in ['M8[ns]', 'm8[ns]', 'O', 'int8', 'uint8', 'int16', 'uint16', 'int32', 'uint32', 'int64', 'uint64']]) @@ -703,50 +689,13 @@ def diff(arr, n, axis=0): return out_arr -timedelta_search = re.compile( - "^(?P-?\d*\.?\d*)(?PD|s|ms|us|ns)?$") - -def _coerce_scalar_to_timedelta_type(r, unit='ns'): - # kludgy here until we have a timedelta scalar - # handle the numpy < 1.7 case - - if isinstance(r, compat.string_types): - m = timedelta_search.search(r) - if m: - r = float(m.groupdict()['value']) - u = m.groupdict().get('unit') - if u is not None: - unit = u - else: - raise ValueError("cannot convert timedelta scalar value!") - - r = tslib.cast_from_unit(unit, r) - r = timedelta(microseconds=int(r)/1000) - - if is_integer(r): - r = tslib.cast_from_unit(unit, r) - r = timedelta(microseconds=int(r)/1000) - - if _np_version_under1p7: - if not isinstance(r, timedelta): - raise AssertionError("Invalid type for timedelta scalar: %s" % type(r)) - if compat.PY3: - # convert to microseconds in timedelta64 - r = np.timedelta64(int(r.total_seconds()*1e9 + r.microseconds*1000)) - else: - return r - - if isinstance(r, timedelta): - r = np.timedelta64(r) - elif not isinstance(r, np.timedelta64): - raise AssertionError("Invalid type for timedelta scalar: %s" % type(r)) - return r.astype('timedelta64[ns]') - def _coerce_to_dtypes(result, dtypes): """ given a dtypes and a 
result set, coerce the result elements to the dtypes """ if len(result) != len(dtypes): raise AssertionError("_coerce_to_dtypes requires equal len arrays") + from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type + def conv(r,dtype): try: if isnull(r): @@ -1339,68 +1288,6 @@ def _possibly_convert_platform(values): return values - -def _possibly_cast_to_timedelta(value, coerce=True): - """ try to cast to timedelta64, if already a timedeltalike, then make - sure that we are [ns] (as numpy 1.6.2 is very buggy in this regards, - don't force the conversion unless coerce is True - - if coerce='compat' force a compatibilty coercerion (to timedeltas) if needeed - """ - - # coercion compatability - if coerce == 'compat' and _np_version_under1p7: - - def convert(td, dtype): - - # we have an array with a non-object dtype - if hasattr(td,'item'): - td = td.astype(np.int64).item() - if td == tslib.iNaT: - return td - if dtype == 'm8[us]': - td *= 1000 - return td - - if td == tslib.compat_NaT: - return tslib.iNaT - - # convert td value to a nanosecond value - d = td.days - s = td.seconds - us = td.microseconds - - if dtype == 'object' or dtype == 'm8[ns]': - td = 1000*us + (s + d * 24 * 3600) * 10 ** 9 - else: - raise ValueError("invalid conversion of dtype in np < 1.7 [%s]" % dtype) - - return td - - # < 1.7 coercion - if not is_list_like(value): - value = np.array([ value ]) - - dtype = value.dtype - return np.array([ convert(v,dtype) for v in value ], dtype='m8[ns]') - - # deal with numpy not being able to handle certain timedelta operations - if isinstance(value, (ABCSeries, np.ndarray)) and value.dtype.kind == 'm': - if value.dtype != 'timedelta64[ns]': - value = value.astype('timedelta64[ns]') - return value - - # we don't have a timedelta, but we want to try to convert to one (but - # don't force it) - if coerce: - new_value = tslib.array_to_timedelta64( - _values_from_object(value).astype(object), coerce=False) - if new_value.dtype == 'i8': - value = 
np.array(new_value, dtype='timedelta64[ns]') - - return value - - def _possibly_cast_to_datetime(value, dtype, coerce=False): """ try to cast the array/value to a datetimelike dtype, converting float nan to iNaT """ @@ -1438,6 +1325,7 @@ def _possibly_cast_to_datetime(value, dtype, coerce=False): from pandas.tseries.tools import to_datetime value = to_datetime(value, coerce=coerce).values elif is_timedelta64: + from pandas.tseries.timedeltas import _possibly_cast_to_timedelta value = _possibly_cast_to_timedelta(value) except: pass @@ -1463,6 +1351,7 @@ def _possibly_cast_to_datetime(value, dtype, coerce=False): except: pass elif inferred_type in ['timedelta', 'timedelta64']: + from pandas.tseries.timedeltas import _possibly_cast_to_timedelta value = _possibly_cast_to_timedelta(value) return value diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 212e2bad563b6..b9ffe788d183d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -13,7 +13,7 @@ from pandas.tseries.index import DatetimeIndex from pandas.core.internals import BlockManager import pandas.core.common as com -from pandas import compat +from pandas import compat, _np_version_under1p7 from pandas.compat import map, zip, lrange from pandas.core.common import (isnull, notnull, is_list_like, _values_from_object, @@ -1908,7 +1908,7 @@ def abs(self): obj = np.abs(self) # suprimo numpy 1.6 hacking - if com._np_version_under1p7: + if _np_version_under1p7: if self.ndim == 1: if obj.dtype == 'm8[us]': obj = obj.astype('m8[ns]') diff --git a/pandas/core/series.py b/pandas/core/series.py index 4516fcfbaee8e..8d6591c3acd60 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -19,6 +19,7 @@ _asarray_tuplesafe, is_integer_dtype, _NS_DTYPE, _TD_DTYPE, _infer_dtype_from_scalar, is_list_like, _values_from_object, + _possibly_cast_to_datetime, _possibly_castable, _possibly_convert_platform, ABCSparseArray) from pandas.core.index import (Index, MultiIndex, InvalidIndexError, 
_ensure_index, _handle_legacy_indexes) @@ -32,6 +33,7 @@ from pandas.tseries.index import DatetimeIndex from pandas.tseries.period import PeriodIndex, Period from pandas.tseries.offsets import DateOffset +from pandas.tseries.timedeltas import _possibly_cast_to_timedelta from pandas import compat from pandas.util.terminal import get_terminal_size from pandas.compat import zip, lzip, u, OrderedDict @@ -142,7 +144,7 @@ def _convert_to_array(self, values, name=None): values = values.to_series() elif inferred_type in ('timedelta', 'timedelta64'): # have a timedelta, convert to to ns here - values = com._possibly_cast_to_timedelta(values, coerce=coerce) + values = _possibly_cast_to_timedelta(values, coerce=coerce) elif inferred_type == 'integer': # py3 compat where dtype is 'm' but is an integer if values.dtype.kind == 'm': @@ -160,7 +162,7 @@ def _convert_to_array(self, values, name=None): raise TypeError("cannot use a non-absolute DateOffset in " "datetime/timedelta operations [{0}]".format( ','.join([ com.pprint_thing(v) for v in values[mask] ]))) - values = com._possibly_cast_to_timedelta(os, coerce=coerce) + values = _possibly_cast_to_timedelta(os, coerce=coerce) else: raise TypeError("incompatible type [{0}] for a datetime/timedelta operation".format(pa.array(values).dtype)) @@ -3215,11 +3217,11 @@ def _try_cast(arr, take_fast_path): # perf shortcut as this is the most common case if take_fast_path: - if com._possibly_castable(arr) and not copy and dtype is None: + if _possibly_castable(arr) and not copy and dtype is None: return arr try: - arr = com._possibly_cast_to_datetime(arr, dtype) + arr = _possibly_cast_to_datetime(arr, dtype) subarr = pa.array(arr, dtype=dtype, copy=copy) except (ValueError, TypeError): if dtype is not None and raise_cast_failure: @@ -3266,9 +3268,9 @@ def _try_cast(arr, take_fast_path): subarr = lib.maybe_convert_objects(subarr) else: - subarr = com._possibly_convert_platform(data) + subarr = _possibly_convert_platform(data) - subarr = 
com._possibly_cast_to_datetime(subarr, dtype) + subarr = _possibly_cast_to_datetime(subarr, dtype) else: subarr = _try_cast(data, False) @@ -3285,7 +3287,7 @@ def _try_cast(arr, take_fast_path): dtype, value = _infer_dtype_from_scalar(value) else: # need to possibly convert the value here - value = com._possibly_cast_to_datetime(value, dtype) + value = _possibly_cast_to_datetime(value, dtype) subarr = pa.empty(len(index), dtype=dtype) subarr.fill(value) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index bf45fbf96c0e5..9b6a230f6a551 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -17,7 +17,7 @@ import numpy as np import pandas from pandas import (Series, TimeSeries, DataFrame, Panel, Panel4D, Index, - MultiIndex, Int64Index, Timestamp) + MultiIndex, Int64Index, Timestamp, _np_version_under1p7) from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel from pandas.sparse.array import BlockIndex, IntIndex from pandas.tseries.api import PeriodIndex, DatetimeIndex @@ -25,10 +25,11 @@ from pandas.core.common import adjoin, is_list_like, pprint_thing from pandas.core.algorithms import match, unique from pandas.core.categorical import Categorical -from pandas.core.common import _asarray_tuplesafe, _np_version_under1p7 +from pandas.core.common import _asarray_tuplesafe from pandas.core.internals import BlockManager, make_block from pandas.core.reshape import block2d_to_blocknd, factor_indexer from pandas.core.index import _ensure_index +from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type import pandas.core.common as com from pandas.tools.merge import concat from pandas import compat @@ -4093,7 +4094,7 @@ def stringify(value): v = v.tz_convert('UTC') return TermValue(v, v.value, kind) elif kind == u('timedelta64') or kind == u('timedelta'): - v = com._coerce_scalar_to_timedelta_type(v,unit='s').item() + v = _coerce_scalar_to_timedelta_type(v,unit='s').item() return TermValue(int(v), v, kind) elif (isinstance(v, 
datetime) or hasattr(v, 'timetuple') or kind == u('date')): diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 3667dff994232..3f4ce72198215 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -22,7 +22,7 @@ assert_frame_equal, assert_series_equal) from pandas import concat, Timestamp -from pandas import compat +from pandas import compat, _np_version_under1p7 from pandas.core import common as com from numpy.testing.decorators import slow @@ -1800,7 +1800,7 @@ def compare(a,b): assert_frame_equal(result,df) def test_append_with_timedelta(self): - if com._np_version_under1p7: + if _np_version_under1p7: raise nose.SkipTest("requires numpy >= 1.7") # GH 3577 diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 723810a19d140..c5af0b0d4d5c8 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -3248,9 +3248,10 @@ def test_operators_timedelta64(self): mixed['F'] = Timestamp('20130101') # results in an object array + from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type result = mixed.min() - expected = Series([com._coerce_scalar_to_timedelta_type(timedelta(seconds=5*60+5)), - com._coerce_scalar_to_timedelta_type(timedelta(days=-1)), + expected = Series([_coerce_scalar_to_timedelta_type(timedelta(seconds=5*60+5)), + _coerce_scalar_to_timedelta_type(timedelta(days=-1)), 'foo', 1, 1.0, diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 1f008354756bc..7a993cbcf07f4 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -14,7 +14,7 @@ import pandas as pd from pandas import (Index, Series, DataFrame, isnull, notnull, - bdate_range, date_range) + bdate_range, date_range, _np_version_under1p7) from pandas.core.index import MultiIndex from pandas.tseries.index import Timestamp, DatetimeIndex import pandas.core.config as cf @@ -2188,7 +2188,7 @@ def test_timedeltas_with_DateOffset(self): 
[Timestamp('20130101 9:06:00.005'), Timestamp('20130101 9:07:00.005')]) assert_series_equal(result, expected) - if not com._np_version_under1p7: + if not _np_version_under1p7: # operate with np.timedelta64 correctly result = s + np.timedelta64(1, 's') @@ -2292,7 +2292,7 @@ def test_timedelta64_operations_with_integers(self): self.assertRaises(TypeError, sop, s2.values) def test_timedelta64_conversions(self): - if com._np_version_under1p7: + if _np_version_under1p7: raise nose.SkipTest("cannot use 2 argument form of timedelta64 conversions with numpy < 1.7") startdate = Series(date_range('2013-01-01', '2013-01-03')) @@ -2317,7 +2317,7 @@ def test_timedelta64_equal_timedelta_supported_ops(self): 'm': 60 * 1000000, 's': 1000000, 'us': 1} def timedelta64(*args): - if com._np_version_under1p7: + if _np_version_under1p7: coeffs = np.array(args) terms = np.array([npy16_mappings[interval] for interval in intervals]) @@ -2426,7 +2426,7 @@ def test_timedelta64_functions(self): assert_series_equal(result, expected) def test_timedelta_fillna(self): - if com._np_version_under1p7: + if _np_version_under1p7: raise nose.SkipTest("timedelta broken in np 1.6.1") #GH 3371 @@ -2498,12 +2498,12 @@ def test_datetime64_fillna(self): assert_series_equal(result,expected) def test_sub_of_datetime_from_TimeSeries(self): - from pandas.core import common as com + from pandas.tseries.timedeltas import _possibly_cast_to_timedelta from datetime import datetime a = Timestamp(datetime(1993, 0o1, 0o7, 13, 30, 00)) b = datetime(1993, 6, 22, 13, 30) a = Series([a]) - result = com._possibly_cast_to_timedelta(np.abs(a - b)) + result = _possibly_cast_to_timedelta(np.abs(a - b)) self.assert_(result.dtype == 'timedelta64[ns]') def test_datetime64_with_index(self): diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index e91cad62e7dce..1572ca481d8a4 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -7,8 +7,7 @@ import numpy as np from pandas.core.common import (isnull, 
_NS_DTYPE, _INT64_DTYPE, - is_list_like,_possibly_cast_to_timedelta, - _values_from_object, _maybe_box) + is_list_like,_values_from_object, _maybe_box) from pandas.core.index import Index, Int64Index import pandas.compat as compat from pandas.compat import u diff --git a/pandas/tseries/timedeltas.py b/pandas/tseries/timedeltas.py new file mode 100644 index 0000000000000..990dd9c351e27 --- /dev/null +++ b/pandas/tseries/timedeltas.py @@ -0,0 +1,111 @@ +""" +timedelta support tools +""" + +import re +from datetime import timedelta + +import numpy as np +import pandas.tslib as tslib +from pandas import compat, _np_version_under1p7 +from pandas.core.common import ABCSeries, is_integer, _values_from_object + +timedelta_search = re.compile( + "^(?P-?\d*\.?\d*)(?PD|s|ms|us|ns)?$") + +def _coerce_scalar_to_timedelta_type(r, unit='ns'): + # kludgy here until we have a timedelta scalar + # handle the numpy < 1.7 case + + if isinstance(r, compat.string_types): + m = timedelta_search.search(r) + if m: + r = float(m.groupdict()['value']) + u = m.groupdict().get('unit') + if u is not None: + unit = u + else: + raise ValueError("cannot convert timedelta scalar value!") + + r = tslib.cast_from_unit(unit, r) + r = timedelta(microseconds=int(r)/1000) + + if is_integer(r): + r = tslib.cast_from_unit(unit, r) + r = timedelta(microseconds=int(r)/1000) + + if _np_version_under1p7: + if not isinstance(r, timedelta): + raise AssertionError("Invalid type for timedelta scalar: %s" % type(r)) + if compat.PY3: + # convert to microseconds in timedelta64 + r = np.timedelta64(int(r.total_seconds()*1e9 + r.microseconds*1000)) + else: + return r + + if isinstance(r, timedelta): + r = np.timedelta64(r) + elif not isinstance(r, np.timedelta64): + raise AssertionError("Invalid type for timedelta scalar: %s" % type(r)) + return r.astype('timedelta64[ns]') + +def _possibly_cast_to_timedelta(value, coerce=True): + """ try to cast to timedelta64, if already a timedeltalike, then make + sure that we are 
[ns] (as numpy 1.6.2 is very buggy in this regards, + don't force the conversion unless coerce is True + + if coerce='compat' force a compatibilty coercerion (to timedeltas) if needeed + """ + + # coercion compatability + if coerce == 'compat' and _np_version_under1p7: + + def convert(td, dtype): + + # we have an array with a non-object dtype + if hasattr(td,'item'): + td = td.astype(np.int64).item() + if td == tslib.iNaT: + return td + if dtype == 'm8[us]': + td *= 1000 + return td + + if td == tslib.compat_NaT: + return tslib.iNaT + + # convert td value to a nanosecond value + d = td.days + s = td.seconds + us = td.microseconds + + if dtype == 'object' or dtype == 'm8[ns]': + td = 1000*us + (s + d * 24 * 3600) * 10 ** 9 + else: + raise ValueError("invalid conversion of dtype in np < 1.7 [%s]" % dtype) + + return td + + # < 1.7 coercion + if not is_list_like(value): + value = np.array([ value ]) + + dtype = value.dtype + return np.array([ convert(v,dtype) for v in value ], dtype='m8[ns]') + + # deal with numpy not being able to handle certain timedelta operations + if isinstance(value, (ABCSeries, np.ndarray)) and value.dtype.kind == 'm': + if value.dtype != 'timedelta64[ns]': + value = value.astype('timedelta64[ns]') + return value + + # we don't have a timedelta, but we want to try to convert to one (but + # don't force it) + if coerce: + new_value = tslib.array_to_timedelta64( + _values_from_object(value).astype(object), coerce=False) + if new_value.dtype == 'i8': + value = np.array(new_value, dtype='timedelta64[ns]') + + return value + From 30d91c75919ae6cd9ecbe821ceb3c3b4036688d8 Mon Sep 17 00:00:00 2001 From: jreback Date: Thu, 12 Sep 2013 08:52:16 -0400 Subject: [PATCH 3/4] TST: add pandas/tseries/tests/test_timedeltas.py API: add full timedelta parsing and conversion to np.timedelta64[ns] --- doc/source/release.rst | 1 + pandas/tseries/tests/test_timedeltas.py | 128 ++++++++++++++++++++++++ pandas/tseries/timedeltas.py | 78 ++++++++++++--- pandas/tslib.pyx 
| 12 +-- 4 files changed, 197 insertions(+), 22 deletions(-) create mode 100644 pandas/tseries/tests/test_timedeltas.py diff --git a/doc/source/release.rst b/doc/source/release.rst index d50438cd08058..2ed866e1fdeda 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -191,6 +191,7 @@ API Changes - provide automatic dtype conversions on _reduce operations (:issue:`3371`) - exclude non-numerics if mixed types with datelike in _reduce operations (:issue:`3371`) - default for ``tupleize_cols`` is now ``False`` for both ``to_csv`` and ``read_csv``. Fair warning in 0.12 (:issue:`3604`) + - moved timedeltas support to pandas.tseries.timedeltas.py; add timedeltas string parsing Internal Refactoring ~~~~~~~~~~~~~~~~~~~~ diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py new file mode 100644 index 0000000000000..8b88da1d7ef28 --- /dev/null +++ b/pandas/tseries/tests/test_timedeltas.py @@ -0,0 +1,128 @@ +# pylint: disable-msg=E1101,W0612 + +from datetime import datetime, timedelta +import nose +import unittest + +import numpy as np +import pandas as pd + +from pandas import (Index, Series, DataFrame, isnull, notnull, + bdate_range, date_range, _np_version_under1p7) +import pandas.core.common as com +from pandas.compat import StringIO, lrange, range, zip, u, OrderedDict, long +from pandas import compat +from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type as ct +from pandas.util.testing import (assert_series_equal, + assert_frame_equal, + assert_almost_equal, + ensure_clean) +import pandas.util.testing as tm + +def _skip_if_numpy_not_friendly(): + # not friendly for < 1.7 + if _np_version_under1p7: + raise nose.SkipTest("numpy < 1.7") + +class TestTimedeltas(unittest.TestCase): + _multiprocess_can_split_ = True + + def setUp(self): + pass + + def test_numeric_conversions(self): + _skip_if_numpy_not_friendly() + + # ns not converted properly + self.assert_(ct(0) == np.timedelta64(0,'ns')) + 
self.assert_(ct(10) == np.timedelta64(0,'ns')) + self.assert_(ct(10,unit='ns') == np.timedelta64(0,'ns').astype('m8[ns]')) + + self.assert_(ct(10,unit='us') == np.timedelta64(10,'us').astype('m8[ns]')) + self.assert_(ct(10,unit='ms') == np.timedelta64(10,'ms').astype('m8[ns]')) + self.assert_(ct(10,unit='s') == np.timedelta64(10,'s').astype('m8[ns]')) + self.assert_(ct(10,unit='d') == np.timedelta64(10,'D').astype('m8[ns]')) + + def test_timedelta_conversions(self): + _skip_if_numpy_not_friendly() + + self.assert_(ct(timedelta(seconds=1)) == np.timedelta64(1,'s').astype('m8[ns]')) + self.assert_(ct(timedelta(microseconds=1)) == np.timedelta64(1,'us').astype('m8[ns]')) + self.assert_(ct(timedelta(days=1)) == np.timedelta64(1,'D').astype('m8[ns]')) + + def test_short_format_converters(self): + _skip_if_numpy_not_friendly() + + def conv(v): + return v.astype('m8[ns]') + + # ns not converted properly + self.assert_(ct('10') == np.timedelta64(0,'ns')) + self.assert_(ct('10ns') == np.timedelta64(0,'ns')) + self.assert_(ct('100') == np.timedelta64(0,'ns')) + self.assert_(ct('100ns') == np.timedelta64(0,'ns')) + + self.assert_(ct('1000') == np.timedelta64(1000,'ns')) + self.assert_(ct('1000ns') == np.timedelta64(1000,'ns')) + self.assert_(ct('1000NS') == np.timedelta64(1000,'ns')) + + self.assert_(ct('10us') == np.timedelta64(10000,'ns')) + self.assert_(ct('100us') == np.timedelta64(100000,'ns')) + self.assert_(ct('1000us') == np.timedelta64(1000000,'ns')) + self.assert_(ct('1000Us') == np.timedelta64(1000000,'ns')) + self.assert_(ct('1000uS') == np.timedelta64(1000000,'ns')) + + self.assert_(ct('1ms') == np.timedelta64(1000000,'ns')) + self.assert_(ct('10ms') == np.timedelta64(10000000,'ns')) + self.assert_(ct('100ms') == np.timedelta64(100000000,'ns')) + self.assert_(ct('1000ms') == np.timedelta64(1000000000,'ns')) + + self.assert_(ct('-1s') == -np.timedelta64(1000000000,'ns')) + self.assert_(ct('1s') == np.timedelta64(1000000000,'ns')) + self.assert_(ct('10s') == 
np.timedelta64(10000000000,'ns')) + self.assert_(ct('100s') == np.timedelta64(100000000000,'ns')) + self.assert_(ct('1000s') == np.timedelta64(1000000000000,'ns')) + + self.assert_(ct('1d') == conv(np.timedelta64(1,'D'))) + self.assert_(ct('-1d') == -conv(np.timedelta64(1,'D'))) + self.assert_(ct('1D') == conv(np.timedelta64(1,'D'))) + self.assert_(ct('10D') == conv(np.timedelta64(10,'D'))) + self.assert_(ct('100D') == conv(np.timedelta64(100,'D'))) + self.assert_(ct('1000D') == conv(np.timedelta64(1000,'D'))) + self.assert_(ct('10000D') == conv(np.timedelta64(10000,'D'))) + + # space + self.assert_(ct(' 10000D ') == conv(np.timedelta64(10000,'D'))) + self.assert_(ct(' - 10000D ') == -conv(np.timedelta64(10000,'D'))) + + # invalid + self.assertRaises(ValueError, ct, '1foo') + self.assertRaises(ValueError, ct, 'foo') + + def test_full_format_converters(self): + _skip_if_numpy_not_friendly() + + def conv(v): + return v.astype('m8[ns]') + d1 = np.timedelta64(1,'D') + + self.assert_(ct('1days') == conv(d1)) + self.assert_(ct('1days,') == conv(d1)) + self.assert_(ct('- 1days,') == -conv(d1)) + + self.assert_(ct('00:00:01') == conv(np.timedelta64(1,'s'))) + self.assert_(ct('06:00:01') == conv(np.timedelta64(6*3600+1,'s'))) + self.assert_(ct('06:00:01.0') == conv(np.timedelta64(6*3600+1,'s'))) + self.assert_(ct('06:00:01.01') == conv(np.timedelta64(1000*(6*3600+1)+10,'ms'))) + + self.assert_(ct('- 1days, 00:00:01') == -conv(d1+np.timedelta64(1,'s'))) + self.assert_(ct('1days, 06:00:01') == conv(d1+np.timedelta64(6*3600+1,'s'))) + self.assert_(ct('1days, 06:00:01.01') == conv(d1+np.timedelta64(1000*(6*3600+1)+10,'ms'))) + + # invalid + self.assertRaises(ValueError, ct, '- 1days, 00') + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/tseries/timedeltas.py b/pandas/tseries/timedeltas.py index 990dd9c351e27..d12791755859e 100644 --- a/pandas/tseries/timedeltas.py +++ 
b/pandas/tseries/timedeltas.py @@ -8,31 +8,29 @@ import numpy as np import pandas.tslib as tslib from pandas import compat, _np_version_under1p7 -from pandas.core.common import ABCSeries, is_integer, _values_from_object +from pandas.core.common import (ABCSeries, is_integer, + _values_from_object, is_list_like) -timedelta_search = re.compile( - "^(?P<value>-?\d*\.?\d*)(?P<unit>D|s|ms|us|ns)?$") +repr_timedelta = tslib.repr_timedelta64 +repr_timedelta64 = tslib.repr_timedelta64 + +_short_search = re.compile( + "^\s*(?P<neg>-?)\s*(?P<value>\d*\.?\d*)\s*(?P<unit>d|s|ms|us|ns)?\s*$",re.IGNORECASE) +_full_search = re.compile( + "^\s*(?P-?)\s*(?P\d+)?\s*(days|d)?,?\s*(?P