From f1164210248e6be262cfbf08bf899f1d1781c0fc Mon Sep 17 00:00:00 2001 From: Stephan Hoyer Date: Tue, 17 Feb 2015 11:57:40 -0800 Subject: [PATCH] API/ENH: add method='nearest' for Index.get_indexer and reindex Fixes GH8845 --- doc/source/basics.rst | 24 ++- doc/source/whatsnew/v0.16.0.txt | 28 +++ pandas/core/common.py | 18 +- pandas/core/generic.py | 36 ++-- pandas/core/index.py | 225 +++++++++++++++--------- pandas/tests/test_frame.py | 46 +++-- pandas/tests/test_index.py | 218 ++++++++++++++++++++++- pandas/tests/test_series.py | 15 +- pandas/tseries/index.py | 18 +- pandas/tseries/period.py | 10 +- pandas/tseries/tdi.py | 10 +- pandas/tseries/tests/test_period.py | 2 +- pandas/tseries/tests/test_timeseries.py | 3 +- pandas/tseries/tests/test_tslib.py | 8 +- pandas/tslib.pyx | 4 +- 15 files changed, 508 insertions(+), 157 deletions(-) diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 7ee82cd69a257..dc43c1177f8c3 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -948,15 +948,9 @@ chosen from the following table: pad / ffill, Fill values forward bfill / backfill, Fill values backward + nearest, Fill from the nearest index value -Other fill methods could be added, of course, but these are the two most -commonly used for time series data. In a way they only make sense for time -series or otherwise ordered data, but you may have an application on non-time -series data where this sort of "interpolation" logic is the correct thing to -do. More sophisticated interpolation of missing values would be an obvious -extension. - -We illustrate these fill methods on a simple TimeSeries: +We illustrate these fill methods on a simple Series: .. ipython:: python @@ -969,18 +963,22 @@ We illustrate these fill methods on a simple TimeSeries: ts2.reindex(ts.index) ts2.reindex(ts.index, method='ffill') ts2.reindex(ts.index, method='bfill') + ts2.reindex(ts.index, method='nearest') -Note these methods require that the indexes are **order increasing**. 
+These methods require that the indexes are **ordered** increasing or +decreasing. -Note the same result could have been achieved using :ref:`fillna -`: +Note that the same result could have been achieved using +:ref:`fillna ` (except for ``method='nearest'``) or +:ref:`interpolate `: .. ipython:: python ts2.reindex(ts.index).fillna(method='ffill') -Note that ``reindex`` will raise a ValueError if the index is not -monotonic. ``fillna`` will not make any checks on the order of the index. +``reindex`` will raise a ValueError if the index is not monotonic increasing or +decreasing. ``fillna`` and ``interpolate`` will not make any checks on the +order of the index. .. _basics.drop: diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt index 63606cb830cbe..bf050dfabea9b 100644 --- a/doc/source/whatsnew/v0.16.0.txt +++ b/doc/source/whatsnew/v0.16.0.txt @@ -20,6 +20,15 @@ users upgrade to this version. New features ~~~~~~~~~~~~ +- Reindex now supports ``method='nearest'`` for frames or series with a monotonic increasing or decreasing index (:issue:`9258`): + + .. ipython:: python + + df = pd.DataFrame({'x': range(5)}) + df.reindex([0.2, 1.8, 3.5], method='nearest') + + This method is also exposed by the lower level ``Index.get_indexer`` and ``Index.get_loc`` methods. + .. _whatsnew_0160.api: .. _whatsnew_0160.api_breaking: @@ -189,6 +198,9 @@ Enhancements - Added ``StringMethods.find()`` and ``rfind()`` which behave as the same as standard ``str`` (:issue:`9386`) +- ``Index.get_indexer`` now supports ``method='pad'`` and ``method='backfill'`` for any target array, not just monotonic targets. These methods also work for monotonic decreasing as well as monotonic increasing indexes (:issue:`9258`). +- ``Index.asof`` now works on all index types (:issue:`9258`).
+ - Added ``StringMethods.isnumeric`` and ``isdecimal`` which behave as the same as standard ``str`` (:issue:`9439`) - Added ``StringMethods.ljust()`` and ``rjust()`` which behave as the same as standard ``str`` (:issue:`9352`) - ``StringMethods.pad()`` and ``center()`` now accept ``fillchar`` option to specify filling character (:issue:`9352`) @@ -244,6 +256,22 @@ Bug Fixes - Fixed character encoding bug in ``read_stata`` and ``StataReader`` when loading data from a URL (:issue:`9231`). +- Looking up a partial string label with ``DatetimeIndex.asof`` now includes values that match the string, even if they are after the start of the partial string label (:issue:`9258`). Old behavior: + + .. ipython:: python + :verbatim: + + In [4]: pd.to_datetime(['2000-01-31', '2000-02-28']).asof('2000-02') + Out[4]: Timestamp('2000-01-31 00:00:00') + + Fixed behavior: + + .. ipython:: python + + pd.to_datetime(['2000-01-31', '2000-02-28']).asof('2000-02') + + To reproduce the old behavior, simply add more precision to the label (e.g., use ``2000-02-01`` instead of ``2000-02``). + - Bug in adding ``offsets.Nano`` to other offets raises ``TypeError`` (:issue:`9284`) diff --git a/pandas/core/common.py b/pandas/core/common.py index 7ab88edd77d4b..78c0c6c5dbd0f 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -2682,7 +2682,7 @@ def _astype_nansafe(arr, dtype, copy=True): return arr.view(dtype) -def _clean_fill_method(method): +def _clean_fill_method(method, allow_nearest=False): if method is None: return None method = method.lower() @@ -2690,13 +2690,23 @@ def _clean_fill_method(method): method = 'pad' if method == 'bfill': method = 'backfill' - if method not in ['pad', 'backfill']: - msg = ('Invalid fill method. Expecting pad (ffill) or backfill ' - '(bfill). 
Got %s' % method) + + valid_methods = ['pad', 'backfill'] + expecting = 'pad (ffill) or backfill (bfill)' + if allow_nearest: + valid_methods.append('nearest') + expecting = 'pad (ffill), backfill (bfill) or nearest' + if method not in valid_methods: + msg = ('Invalid fill method. Expecting %s. Got %s' + % (expecting, method)) raise ValueError(msg) return method +def _clean_reindex_fill_method(method): + return _clean_fill_method(method, allow_nearest=True) + + def _all_none(*args): for arg in args: if arg is not None: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f454af5df9c90..336b29dfb8572 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1672,10 +1672,12 @@ def sort_index(self, axis=0, ascending=True): keywords) New labels / index to conform to. Preferably an Index object to avoid duplicating data - method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None - Method to use for filling holes in reindexed DataFrame - pad / ffill: propagate last valid observation forward to next valid - backfill / bfill: use NEXT valid observation to fill gap + method : {None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'}, optional + Method to use for filling holes in reindexed DataFrame: + * default: don't fill gaps + * pad / ffill: propagate last valid observation forward to next valid + * backfill / bfill: use next valid observation to fill gap + * nearest: use nearest valid observations to fill gap copy : boolean, default True Return a new object, even if the passed indexes are the same level : int or name @@ -1703,7 +1705,7 @@ def reindex(self, *args, **kwargs): # construct the args axes, kwargs = self._construct_axes_from_arguments(args, kwargs) - method = com._clean_fill_method(kwargs.get('method')) + method = com._clean_reindex_fill_method(kwargs.get('method')) level = kwargs.get('level') copy = kwargs.get('copy', True) limit = kwargs.get('limit') @@ -1744,9 +1746,8 @@ def _reindex_axes(self, axes, level, limit, method, 
fill_value, copy): axis = self._get_axis_number(a) obj = obj._reindex_with_indexers( - {axis: [new_index, indexer]}, method=method, - fill_value=fill_value, limit=limit, copy=copy, - allow_dups=False) + {axis: [new_index, indexer]}, + fill_value=fill_value, copy=copy, allow_dups=False) return obj @@ -1770,10 +1771,12 @@ def _reindex_multi(self, axes, copy, fill_value): New labels / index to conform to. Preferably an Index object to avoid duplicating data axis : %(axes_single_arg)s - method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None - Method to use for filling holes in reindexed object. - pad / ffill: propagate last valid observation forward to next valid - backfill / bfill: use NEXT valid observation to fill gap + method : {None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'}, optional + Method to use for filling holes in reindexed DataFrame: + * default: don't fill gaps + * pad / ffill: propagate last valid observation forward to next valid + * backfill / bfill: use next valid observation to fill gap + * nearest: use nearest valid observations to fill gap copy : boolean, default True Return a new object, even if the passed indexes are the same level : int or name @@ -1802,15 +1805,14 @@ def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True, axis_name = self._get_axis_name(axis) axis_values = self._get_axis(axis_name) - method = com._clean_fill_method(method) + method = com._clean_reindex_fill_method(method) new_index, indexer = axis_values.reindex(labels, method, level, limit=limit) return self._reindex_with_indexers( - {axis: [new_index, indexer]}, method=method, fill_value=fill_value, - limit=limit, copy=copy) + {axis: [new_index, indexer]}, fill_value=fill_value, copy=copy) - def _reindex_with_indexers(self, reindexers, method=None, - fill_value=np.nan, limit=None, copy=False, + def _reindex_with_indexers(self, reindexers, + fill_value=np.nan, copy=False, allow_dups=False): """ allow_dups indicates an internal call here """ 
diff --git a/pandas/core/index.py b/pandas/core/index.py index 2444014ac9779..0cad537855857 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -1089,22 +1089,20 @@ def identical(self, other): def asof(self, label): """ For a sorted index, return the most recent label up to and including - the passed label. Return NaN if not found - """ - if isinstance(label, (Index, ABCSeries, np.ndarray)): - raise TypeError('%s' % type(label)) - - if not isinstance(label, Timestamp): - label = Timestamp(label) - - if label not in self: - loc = self.searchsorted(label, side='left') - if loc > 0: - return self[loc - 1] - else: - return np.nan + the passed label. Return NaN if not found. - return label + See also + -------- + get_loc : asof is a thin wrapper around get_loc with method='pad' + """ + try: + loc = self.get_loc(label, method='pad') + except KeyError: + return _get_na_value(self.dtype) + else: + if isinstance(loc, slice): + loc = loc.indices(len(self))[-1] + return self[loc] def asof_locs(self, where, mask): """ @@ -1402,15 +1400,34 @@ def sym_diff(self, other, result_name=None): the_diff = sorted(set((self.difference(other)).union(other.difference(self)))) return Index(the_diff, name=result_name) - def get_loc(self, key): + def get_loc(self, key, method=None): """ Get integer location for requested label + Parameters + ---------- + key : label + method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'} + * default: exact matches only. + * pad / ffill: find the PREVIOUS index value if no exact match. + * backfill / bfill: use NEXT index value if no exact match + * nearest: use the NEAREST index value if no exact match. Tied + distances are broken by preferring the larger index value. 
+ Returns ------- loc : int if unique index, possibly slice or mask if not """ - return self._engine.get_loc(_values_from_object(key)) + if method is None: + return self._engine.get_loc(_values_from_object(key)) + + indexer = self.get_indexer([key], method=method) + if indexer.ndim > 1 or indexer.size > 1: + raise TypeError('get_loc requires scalar valued input') + loc = indexer.item() + if loc == -1: + raise KeyError(key) + return loc def get_value(self, series, key): """ @@ -1477,19 +1494,20 @@ def get_indexer(self, target, method=None, limit=None): """ Compute indexer and mask for new index given the current index. The indexer should be then used as an input to ndarray.take to align the - current data to the new index. The mask determines whether labels are - found or not in the current index + current data to the new index. Parameters ---------- target : Index - method : {'pad', 'ffill', 'backfill', 'bfill'} - pad / ffill: propagate LAST valid observation forward to next valid - backfill / bfill: use NEXT valid observation to fill gap - - Notes - ----- - This is a low-level method and probably should be used at your own risk + method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'} + * default: exact matches only. + * pad / ffill: find the PREVIOUS index value if no exact match. + * backfill / bfill: use NEXT index value if no exact match + * nearest: use the NEAREST index value if no exact match. Tied + distances are broken by preferring the larger index value. + limit : int + Maximum number of consecutive labels in ``target`` to match for + inexact matches. Examples -------- @@ -1498,9 +1516,12 @@ def get_indexer(self, target, method=None, limit=None): Returns ------- - indexer : ndarray + indexer : ndarray of int + Integers from 0 to n - 1 indicating that the index at these + positions matches the corresponding target values. Missing values + in the target are marked by -1.
""" - method = self._get_method(method) + method = com._clean_reindex_fill_method(method) target = _ensure_index(target) pself, ptarget = self._possibly_promote(target) @@ -1516,21 +1537,73 @@ def get_indexer(self, target, method=None, limit=None): raise InvalidIndexError('Reindexing only valid with uniquely' ' valued Index objects') - if method == 'pad': - if not self.is_monotonic or not target.is_monotonic: - raise ValueError('Must be monotonic for forward fill') - indexer = self._engine.get_pad_indexer(target.values, limit) - elif method == 'backfill': - if not self.is_monotonic or not target.is_monotonic: - raise ValueError('Must be monotonic for backward fill') - indexer = self._engine.get_backfill_indexer(target.values, limit) - elif method is None: - indexer = self._engine.get_indexer(target.values) + if method == 'pad' or method == 'backfill': + indexer = self._get_fill_indexer(target, method, limit) + elif method == 'nearest': + indexer = self._get_nearest_indexer(target, limit) else: - raise ValueError('unrecognized method: %s' % method) + indexer = self._engine.get_indexer(target.values) return com._ensure_platform_int(indexer) + def _get_fill_indexer(self, target, method, limit=None): + if self.is_monotonic_increasing and target.is_monotonic_increasing: + method = (self._engine.get_pad_indexer if method == 'pad' + else self._engine.get_backfill_indexer) + indexer = method(target.values, limit) + else: + indexer = self._get_fill_indexer_searchsorted(target, method, limit) + return indexer + + def _get_fill_indexer_searchsorted(self, target, method, limit=None): + """ + Fallback pad/backfill get_indexer that works for monotonic decreasing + indexes and non-monotonic targets + """ + if limit is not None: + raise ValueError('limit argument for %r method only well-defined ' + 'if index and target are monotonic' % method) + + side = 'left' if method == 'pad' else 'right' + target = np.asarray(target) + + # find exact matches first (this simplifies the 
algorithm) + indexer = self.get_indexer(target) + nonexact = (indexer == -1) + indexer[nonexact] = self._searchsorted_monotonic(target[nonexact], side) + if side == 'left': + # searchsorted returns "indices into a sorted array such that, + # if the corresponding elements in v were inserted before the + # indices, the order of a would be preserved". + # Thus, we need to subtract 1 to find values to the left. + indexer[nonexact] -= 1 + # This also mapped not found values (values of 0 from + # np.searchsorted) to -1, which conveniently is also our + # sentinel for missing values + else: + # Mark indices to the right of the largest value as not found + indexer[indexer == len(self)] = -1 + return indexer + + def _get_nearest_indexer(self, target, limit): + """ + Get the indexer for the nearest index labels; requires an index with + values that can be subtracted from each other (e.g., not strings or + tuples). + """ + left_indexer = self.get_indexer(target, 'pad', limit=limit) + right_indexer = self.get_indexer(target, 'backfill', limit=limit) + + target = np.asarray(target) + left_distances = abs(self.values[left_indexer] - target) + right_distances = abs(self.values[right_indexer] - target) + + op = operator.lt if self.is_monotonic_increasing else operator.le + indexer = np.where(op(left_distances, right_distances) + | (right_indexer == -1), + left_indexer, right_indexer) + return indexer + def get_indexer_non_unique(self, target, **kwargs): """ return an indexer suitable for taking from a non unique index return the labels in the same order as the target, and @@ -1616,16 +1689,6 @@ def isin(self, values, level=None): self._validate_index_level(level) return lib.ismember(self._array_values(), value_set) - def _get_method(self, method): - if method: - method = method.lower() - - aliases = { - 'ffill': 'pad', - 'bfill': 'backfill' - } - return aliases.get(method, method) - def reindex(self, target, method=None, level=None, limit=None): """ Create index with target's 
values (move/add/delete values as necessary) @@ -2063,6 +2126,19 @@ def _maybe_cast_slice_bound(self, label, side): """ return label + def _searchsorted_monotonic(self, label, side='left'): + if self.is_monotonic_increasing: + return self.searchsorted(label, side=side) + elif self.is_monotonic_decreasing: + # np.searchsorted expects ascending sort order, have to reverse + # everything for it to work (element ordering, search side and + # resulting value). + pos = self[::-1].searchsorted( + label, side='right' if side == 'left' else 'right') + return len(self) - pos + + raise ValueError('index must be monotonic increasing or decreasing') + def get_slice_bound(self, label, side): """ Calculate slice bound that corresponds to given label. @@ -2088,19 +2164,12 @@ def get_slice_bound(self, label, side): try: slc = self.get_loc(label) - except KeyError: - if self.is_monotonic_increasing: - return self.searchsorted(label, side=side) - elif self.is_monotonic_decreasing: - # np.searchsorted expects ascending sort order, have to reverse - # everything for it to work (element ordering, search side and - # resulting value). 
- pos = self[::-1].searchsorted( - label, side='right' if side == 'left' else 'right') - return len(self) - pos - - # In all other cases, just re-raise the KeyError - raise + except KeyError as err: + try: + return self._searchsorted_monotonic(label, side) + except ValueError: + # raise the original KeyError + raise err if isinstance(slc, np.ndarray): # get_loc may return a boolean array or an array of indices, which @@ -2664,7 +2733,7 @@ def __contains__(self, other): except: return False - def get_loc(self, key): + def get_loc(self, key, method=None): try: if np.all(np.isnan(key)): nan_idxs = self._nan_idxs @@ -2676,7 +2745,7 @@ def get_loc(self, key): return nan_idxs except (TypeError, NotImplementedError): pass - return super(Float64Index, self).get_loc(key) + return super(Float64Index, self).get_loc(key, method=method) @property def is_all_dates(self): @@ -3932,7 +4001,7 @@ def get_indexer(self, target, method=None, limit=None): ------- (indexer, mask) : (ndarray, ndarray) """ - method = self._get_method(method) + method = com._clean_reindex_fill_method(method) target = _ensure_index(target) @@ -3949,20 +4018,13 @@ def get_indexer(self, target, method=None, limit=None): self_index = self._tuple_index - if method == 'pad': - if not self.is_unique or not self.is_monotonic: - raise AssertionError(('Must be unique and monotonic to ' - 'use forward fill getting the indexer')) - indexer = self_index._engine.get_pad_indexer(target_index.values, - limit=limit) - elif method == 'backfill': - if not self.is_unique or not self.is_monotonic: - raise AssertionError(('Must be unique and monotonic to ' - 'use backward fill getting the indexer')) - indexer = self_index._engine.get_backfill_indexer(target_index.values, - limit=limit) + if method == 'pad' or method == 'backfill': + indexer = self_index._get_fill_indexer(target, method, limit) + elif method == 'nearest': + raise NotImplementedError("method='nearest' not implemented yet " + 'for MultiIndex; see GitHub issue 
9365') else: - indexer = self_index._engine.get_indexer(target_index.values) + indexer = self_index._engine.get_indexer(target.values) return com._ensure_platform_int(indexer) @@ -4099,7 +4161,7 @@ def _partial_tup_index(self, tup, side='left'): else: return start + section.searchsorted(idx, side=side) - def get_loc(self, key): + def get_loc(self, key, method=None): """ Get integer location, slice or boolean mask for requested label or tuple If the key is past the lexsort depth, the return may be a boolean mask @@ -4108,11 +4170,16 @@ def get_loc(self, key): Parameters ---------- key : label or tuple + method : None Returns ------- loc : int, slice object or boolean mask """ + if method is not None: + raise NotImplementedError('only the default get_loc method is ' + 'currently supported for MultiIndex') + def _maybe_to_slice(loc): '''convert integer indexer to boolean mask or slice if possible''' if not isinstance(loc, np.ndarray) or loc.dtype != 'int64': diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 66e008aa16b3e..0f015843fcb0f 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -1875,27 +1875,43 @@ def test_nested_exception(self): except Exception as e: self.assertNotEqual(type(e), UnboundLocalError) - def test_reverse_reindex_ffill_raises(self): + def test_reindex_methods(self): + df = pd.DataFrame({'x': range(5)}) + target = np.array([-0.1, 0.9, 1.1, 1.5]) + + for method, expected_values in [('nearest', [0, 1, 1, 2]), + ('pad', [np.nan, 0, 1, 1]), + ('backfill', [0, 1, 2, 2])]: + expected = pd.DataFrame({'x': expected_values}, index=target) + actual = df.reindex(target, method=method) + assert_frame_equal(expected, actual) + + e2 = expected[::-1] + actual = df.reindex(target[::-1], method=method) + assert_frame_equal(e2, actual) + + new_order = [3, 0, 2, 1] + e2 = expected.iloc[new_order] + actual = df.reindex(target[new_order], method=method) + assert_frame_equal(e2, actual) + + switched_method = ('pad' if 
method == 'backfill' + else 'backfill' if method == 'pad' + else method) + actual = df[::-1].reindex(target, method=switched_method) + assert_frame_equal(expected, actual) + + def test_non_monotonic_reindex_methods(self): dr = pd.date_range('2013-08-01', periods=6, freq='B') data = np.random.randn(6,1) df = pd.DataFrame(data, index=dr, columns=list('A')) - df['A'][3] = np.nan - df_rev = pd.DataFrame(data, index=dr[::-1], columns=list('A')) - # Reverse index is not 'monotonic' + df_rev = pd.DataFrame(data, index=dr[[3, 4, 5] + [0, 1, 2]], + columns=list('A')) + # index is not monotonic increasing or decreasing self.assertRaises(ValueError, df_rev.reindex, df.index, method='pad') self.assertRaises(ValueError, df_rev.reindex, df.index, method='ffill') self.assertRaises(ValueError, df_rev.reindex, df.index, method='bfill') - - def test_reversed_reindex_ffill_raises(self): - dr = pd.date_range('2013-08-01', periods=6, freq='B') - data = np.random.randn(6,1) - df = pd.DataFrame(data, index=dr, columns=list('A')) - df['A'][3] = np.nan - df = pd.DataFrame(data, index=dr, columns=list('A')) - # Reversed reindex is not 'monotonic' - self.assertRaises(ValueError, df.reindex, dr[::-1], method='pad') - self.assertRaises(ValueError, df.reindex, dr[::-1], method='ffill') - self.assertRaises(ValueError, df.reindex, dr[::-1], method='bfill') + self.assertRaises(ValueError, df_rev.reindex, df.index, method='nearest') def test_reindex_level(self): from itertools import permutations diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index 65e42f128564e..75c28681ecde5 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # pylint: disable=E1101,E1103,W0232 -from datetime import datetime, timedelta +from datetime import datetime, timedelta, time from pandas.compat import range, lrange, lzip, u, zip import operator import re @@ -95,6 +95,15 @@ def f(): pass tm.assertRaisesRegexp(ValueError,'The truth value of 
a',f) + def test_reindex_base(self): + idx = self.create_index() + expected = np.arange(idx.size) + actual = idx.get_indexer(idx) + assert_array_equal(expected, actual) + + with tm.assertRaisesRegexp(ValueError, 'Invalid fill method'): + idx.get_indexer(idx, method='invalid') + def test_ndarray_compat_properties(self): idx = self.create_index() @@ -109,6 +118,7 @@ def test_ndarray_compat_properties(self): idx.nbytes idx.values.nbytes + class TestIndex(Base, tm.TestCase): _holder = Index _multiprocess_can_split_ = True @@ -421,7 +431,7 @@ def test_is_(self): def test_asof(self): d = self.dateIndex[0] - self.assertIs(self.dateIndex.asof(d), d) + self.assertEqual(self.dateIndex.asof(d), d) self.assertTrue(np.isnan(self.dateIndex.asof(d - timedelta(1)))) d = self.dateIndex[-1] @@ -432,9 +442,10 @@ def test_asof(self): def test_asof_datetime_partial(self): idx = pd.date_range('2010-01-01', periods=2, freq='m') - expected = Timestamp('2010-01-31') + expected = Timestamp('2010-02-28') result = idx.asof('2010-02') self.assertEqual(result, expected) + self.assertFalse(isinstance(result, Index)) def test_nanosecond_index_access(self): s = Series([Timestamp('20130101')]).values.view('i8')[0] @@ -855,17 +866,81 @@ def test_get_indexer(self): assert_almost_equal(r1, [1, 3, -1]) r1 = idx2.get_indexer(idx1, method='pad') - assert_almost_equal(r1, [-1, 0, 0, 1, 1]) + e1 = [-1, 0, 0, 1, 1] + assert_almost_equal(r1, e1) + + r2 = idx2.get_indexer(idx1[::-1], method='pad') + assert_almost_equal(r2, e1[::-1]) rffill1 = idx2.get_indexer(idx1, method='ffill') assert_almost_equal(r1, rffill1) r1 = idx2.get_indexer(idx1, method='backfill') - assert_almost_equal(r1, [0, 0, 1, 1, 2]) + e1 = [0, 0, 1, 1, 2] + assert_almost_equal(r1, e1) rbfill1 = idx2.get_indexer(idx1, method='bfill') assert_almost_equal(r1, rbfill1) + r2 = idx2.get_indexer(idx1[::-1], method='backfill') + assert_almost_equal(r2, e1[::-1]) + + def test_get_indexer_nearest(self): + idx = Index(np.arange(10)) + + all_methods = 
['pad', 'backfill', 'nearest'] + for method in all_methods: + actual = idx.get_indexer([0, 5, 9], method=method) + self.assert_array_equal(actual, [0, 5, 9]) + + for method, expected in zip(all_methods, [[0, 1, 8], [1, 2, 9], [0, 2, 9]]): + actual = idx.get_indexer([0.2, 1.8, 8.5], method=method) + self.assert_array_equal(actual, expected) + + with tm.assertRaisesRegexp(ValueError, 'limit argument'): + idx.get_indexer([1, 0], method='nearest', limit=1) + + def test_get_indexer_nearest_decreasing(self): + idx = Index(np.arange(10))[::-1] + + all_methods = ['pad', 'backfill', 'nearest'] + for method in all_methods: + actual = idx.get_indexer([0, 5, 9], method=method) + self.assert_array_equal(actual, [9, 4, 0]) + + for method, expected in zip(all_methods, [[8, 7, 0], [9, 8, 1], [9, 7, 0]]): + actual = idx.get_indexer([0.2, 1.8, 8.5], method=method) + self.assert_array_equal(actual, expected) + + def test_get_indexer_strings(self): + idx = pd.Index(['b', 'c']) + + actual = idx.get_indexer(['a', 'b', 'c', 'd'], method='pad') + expected = [-1, 0, 1, 1] + self.assert_array_equal(actual, expected) + + actual = idx.get_indexer(['a', 'b', 'c', 'd'], method='backfill') + expected = [0, 0, 1, -1] + self.assert_array_equal(actual, expected) + + with tm.assertRaises(TypeError): + idx.get_indexer(['a', 'b', 'c', 'd'], method='nearest') + + def test_get_loc(self): + idx = pd.Index([0, 1, 2]) + all_methods = [None, 'pad', 'backfill', 'nearest'] + for method in all_methods: + self.assertEqual(idx.get_loc(1, method=method), 1) + with tm.assertRaises(TypeError): + idx.get_loc([1, 2], method=method) + + for method, loc in [('pad', 1), ('backfill', 2), ('nearest', 1)]: + self.assertEqual(idx.get_loc(1.1, method), loc) + + idx = pd.Index(['a', 'c']) + with tm.assertRaises(TypeError): + idx.get_loc('a', method='nearest') + def test_slice_locs(self): for dtype in [int, float]: idx = Index(np.array([0, 1, 2, 5, 6, 7, 9, 10], dtype=dtype)) @@ -1247,6 +1322,7 @@ def test_ufunc_compat(self): 
expected = Float64Index(np.sin(np.arange(5,dtype='int64'))) tm.assert_index_equal(result, expected) + class TestFloat64Index(Numeric, tm.TestCase): _holder = Float64Index _multiprocess_can_split_ = True @@ -1360,6 +1436,26 @@ def test_equals(self): i2 = Float64Index([1.0,np.nan]) self.assertTrue(i.equals(i2)) + def test_get_indexer(self): + idx = Float64Index([0.0, 1.0, 2.0]) + self.assert_array_equal(idx.get_indexer(idx), [0, 1, 2]) + + target = [-0.1, 0.5, 1.1] + self.assert_array_equal(idx.get_indexer(target, 'pad'), [-1, 0, 1]) + self.assert_array_equal(idx.get_indexer(target, 'backfill'), [0, 1, 2]) + self.assert_array_equal(idx.get_indexer(target, 'nearest'), [0, 1, 1]) + + def test_get_loc(self): + idx = Float64Index([0.0, 1.0, 2.0]) + for method in [None, 'pad', 'backfill', 'nearest']: + self.assertEqual(idx.get_loc(1, method), 1) + + for method, loc in [('pad', 1), ('backfill', 2), ('nearest', 1)]: + self.assertEqual(idx.get_loc(1.1, method), loc) + + self.assertRaises(KeyError, idx.get_loc, 'foo') + self.assertRaises(KeyError, idx.get_loc, 1.5) + def test_get_loc_na(self): idx = Float64Index([np.nan, 1, 2]) self.assertEqual(idx.get_loc(1), 1) @@ -1897,6 +1993,54 @@ def test_numeric_compat(self): lambda : pd.date_range('2000-01-01', periods=3) * np.timedelta64(1, 'D').astype('m8[ns]') ]: self.assertRaises(TypeError, f) + def test_get_loc(self): + idx = pd.date_range('2000-01-01', periods=3) + + for method in [None, 'pad', 'backfill', 'nearest']: + self.assertEqual(idx.get_loc(idx[1], method), 1) + self.assertEqual(idx.get_loc(idx[1].to_pydatetime(), method), 1) + self.assertEqual(idx.get_loc(str(idx[1]), method), 1) + + self.assertEqual(idx.get_loc('2000-01-01', method='nearest'), 0) + self.assertEqual(idx.get_loc('2000-01-01T12', method='nearest'), 1) + + self.assertEqual(idx.get_loc('2000', method='nearest'), slice(0, 3)) + self.assertEqual(idx.get_loc('2000-01', method='nearest'), slice(0, 3)) + + self.assertEqual(idx.get_loc('1999', method='nearest'), 
0) + self.assertEqual(idx.get_loc('2001', method='nearest'), 2) + + with tm.assertRaises(KeyError): + idx.get_loc('1999', method='pad') + with tm.assertRaises(KeyError): + idx.get_loc('2001', method='backfill') + + with tm.assertRaises(KeyError): + idx.get_loc('foobar') + with tm.assertRaises(TypeError): + idx.get_loc(slice(2)) + + idx = pd.to_datetime(['2000-01-01', '2000-01-04']) + self.assertEqual(idx.get_loc('2000-01-02', method='nearest'), 0) + self.assertEqual(idx.get_loc('2000-01-03', method='nearest'), 1) + self.assertEqual(idx.get_loc('2000-01', method='nearest'), slice(0, 2)) + + # time indexing + idx = pd.date_range('2000-01-01', periods=24, freq='H') + assert_array_equal(idx.get_loc(time(12)), [12]) + assert_array_equal(idx.get_loc(time(12, 30)), []) + with tm.assertRaises(NotImplementedError): + idx.get_loc(time(12, 30), method='pad') + + def test_get_indexer(self): + idx = pd.date_range('2000-01-01', periods=3) + self.assert_array_equal(idx.get_indexer(idx), [0, 1, 2]) + + target = idx[0] + pd.to_timedelta(['-1 hour', '12 hours', '1 day 1 hour']) + self.assert_array_equal(idx.get_indexer(target, 'pad'), [-1, 0, 1]) + self.assert_array_equal(idx.get_indexer(target, 'backfill'), [0, 1, 2]) + self.assert_array_equal(idx.get_indexer(target, 'nearest'), [0, 1, 1]) + def test_roundtrip_pickle_with_tz(self): # GH 8367 @@ -1959,6 +2103,30 @@ def create_index(self): def test_pickle_compat_construction(self): pass + def test_get_loc(self): + idx = pd.period_range('2000-01-01', periods=3) + + for method in [None, 'pad', 'backfill', 'nearest']: + self.assertEqual(idx.get_loc(idx[1], method), 1) + self.assertEqual(idx.get_loc(idx[1].asfreq('H', how='start'), method), 1) + self.assertEqual(idx.get_loc(idx[1].to_timestamp(), method), 1) + self.assertEqual(idx.get_loc(idx[1].to_timestamp().to_pydatetime(), method), 1) + self.assertEqual(idx.get_loc(str(idx[1]), method), 1) + + def test_get_indexer(self): + idx = pd.period_range('2000-01-01', periods=3).asfreq('H', 
how='start') + self.assert_array_equal(idx.get_indexer(idx), [0, 1, 2]) + + target = pd.PeriodIndex(['1999-12-31T23', '2000-01-01T12', + '2000-01-02T01'], freq='H') + self.assert_array_equal(idx.get_indexer(target, 'pad'), [-1, 0, 1]) + self.assert_array_equal(idx.get_indexer(target, 'backfill'), [0, 1, 2]) + self.assert_array_equal(idx.get_indexer(target, 'nearest'), [0, 1, 1]) + + with self.assertRaisesRegexp(ValueError, 'different freq'): + idx.asfreq('D').get_indexer(idx) + + class TestTimedeltaIndex(DatetimeLike, tm.TestCase): _holder = TimedeltaIndex _multiprocess_can_split_ = True @@ -1966,6 +2134,26 @@ class TestTimedeltaIndex(DatetimeLike, tm.TestCase): def create_index(self): return pd.to_timedelta(range(5),unit='d') + pd.offsets.Hour(1) + def test_get_loc(self): + idx = pd.to_timedelta(['0 days', '1 days', '2 days']) + + for method in [None, 'pad', 'backfill', 'nearest']: + self.assertEqual(idx.get_loc(idx[1], method), 1) + self.assertEqual(idx.get_loc(idx[1].to_pytimedelta(), method), 1) + self.assertEqual(idx.get_loc(str(idx[1]), method), 1) + + for method, loc in [('pad', 1), ('backfill', 2), ('nearest', 1)]: + self.assertEqual(idx.get_loc('1 day 1 hour', method), loc) + + def test_get_indexer(self): + idx = pd.to_timedelta(['0 days', '1 days', '2 days']) + self.assert_array_equal(idx.get_indexer(idx), [0, 1, 2]) + + target = pd.to_timedelta(['-1 hour', '12 hours', '1 day 1 hour']) + self.assert_array_equal(idx.get_indexer(target, 'pad'), [-1, 0, 1]) + self.assert_array_equal(idx.get_indexer(target, 'backfill'), [0, 1, 2]) + self.assert_array_equal(idx.get_indexer(target, 'nearest'), [0, 1, 1]) + def test_numeric_compat(self): idx = self._holder(np.arange(5,dtype='int64')) @@ -2733,6 +2921,9 @@ def test_get_loc(self): self.assertRaises(KeyError, self.index.get_loc, ('bar', 'two')) self.assertRaises(KeyError, self.index.get_loc, 'quux') + self.assertRaises(NotImplementedError, self.index.get_loc, 'foo', + method='nearest') + # 3 levels index = 
MultiIndex(levels=[Index(lrange(4)), Index(lrange(4)), @@ -2935,13 +3126,21 @@ def test_get_indexer(self): assert_almost_equal(r1, [1, 3, -1]) r1 = idx2.get_indexer(idx1, method='pad') - assert_almost_equal(r1, [-1, 0, 0, 1, 1]) + e1 = [-1, 0, 0, 1, 1] + assert_almost_equal(r1, e1) + + r2 = idx2.get_indexer(idx1[::-1], method='pad') + assert_almost_equal(r2, e1[::-1]) rffill1 = idx2.get_indexer(idx1, method='ffill') assert_almost_equal(r1, rffill1) r1 = idx2.get_indexer(idx1, method='backfill') - assert_almost_equal(r1, [0, 0, 1, 1, 2]) + e1 = [0, 0, 1, 1, 2] + assert_almost_equal(r1, e1) + + r2 = idx2.get_indexer(idx1[::-1], method='backfill') + assert_almost_equal(r2, e1[::-1]) rbfill1 = idx2.get_indexer(idx1, method='bfill') assert_almost_equal(r1, rbfill1) @@ -2961,6 +3160,11 @@ def test_get_indexer(self): " uniquely valued Index objects", idx1.get_indexer, idx2) + def test_get_indexer_nearest(self): + midx = MultiIndex.from_tuples([('a', 1), ('b', 2)]) + with tm.assertRaises(NotImplementedError): + midx.get_indexer(['a'], method='nearest') + def test_format(self): self.index.format() self.index[:0].format() diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 1a2fc5a8fc13c..04dad68703577 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -5862,8 +5862,9 @@ def test_reindex_pad(self): result = s.reindex(new_index).ffill(downcast='infer') assert_series_equal(result, expected) - # invalid because we can't forward fill on this type of index - self.assertRaises(ValueError, lambda : s.reindex(new_index, method='ffill')) + expected = Series([1, 5, 3, 5], index=new_index) + result = s.reindex(new_index, method='ffill') + assert_series_equal(result, expected) # inferrence of new dtype s = Series([True,False,False,True],index=list('abcd')) @@ -5878,6 +5879,16 @@ def test_reindex_pad(self): expected = Series(False,index=lrange(0,5)) assert_series_equal(result, expected) + def test_reindex_nearest(self): + s = 
Series(np.arange(10, dtype='int64')) + target = [0.1, 0.9, 1.5, 2.0] + actual = s.reindex(target, method='nearest') + expected = Series(np.around(target).astype('int64'), target) + assert_series_equal(expected, actual) + + actual = s.reindex_like(actual, method='nearest') + assert_series_equal(expected, actual) + def test_reindex_backfill(self): pass diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index 2205c6c4f4a64..3940bbcc949ba 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -1227,7 +1227,7 @@ def get_value_maybe_box(self, series, key): values = self._engine.get_value(_values_from_object(series), key) return _maybe_box(self, values, series, key) - def get_loc(self, key): + def get_loc(self, key, method=None): """ Get integer location for requested label @@ -1237,15 +1237,18 @@ def get_loc(self, key): """ if isinstance(key, datetime): # needed to localize naive datetimes - stamp = Timestamp(key, tz=self.tz) - return self._engine.get_loc(stamp) + key = Timestamp(key, tz=self.tz) + return Index.get_loc(self, key, method=method) if isinstance(key, time): + if method is not None: + raise NotImplementedError('cannot yet lookup inexact labels ' + 'when key is a time object') return self.indexer_at_time(key) try: - return Index.get_loc(self, key) - except (KeyError, ValueError): + return Index.get_loc(self, key, method=method) + except (KeyError, ValueError, TypeError): try: return self._get_string_slice(key) except (TypeError, KeyError, ValueError): @@ -1253,7 +1256,7 @@ def get_loc(self, key): try: stamp = Timestamp(key, tz=self.tz) - return self._engine.get_loc(stamp) + return Index.get_loc(self, stamp, method=method) except (KeyError, ValueError): raise KeyError(key) @@ -1637,9 +1640,6 @@ def indexer_at_time(self, time, asof=False): Parameters ---------- time : datetime.time or string - tz : string or pytz.timezone or dateutil.tz.tzfile - Time zone for time. 
Corresponding timestamps would be converted to - time zone of the TimeSeries Returns ------- diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 106e8535ce15a..074ed720991ce 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -516,7 +516,13 @@ def get_value(self, series, key): key = Period(key, self.freq).ordinal return _maybe_box(self, self._engine.get_value(s, key), series, key) - def get_loc(self, key): + def get_indexer(self, target, method=None, limit=None): + if hasattr(target, 'freq') and target.freq != self.freq: + raise ValueError('target and index have different freq: ' + '(%s, %s)' % (target.freq, self.freq)) + return Index.get_indexer(self, target, method, limit) + + def get_loc(self, key, method=None): """ Get integer location for requested label @@ -538,7 +544,7 @@ def get_loc(self, key): key = Period(key, self.freq) try: - return self._engine.get_loc(key.ordinal) + return Index.get_loc(self, key.ordinal, method=method) except KeyError: raise KeyError(key) diff --git a/pandas/tseries/tdi.py b/pandas/tseries/tdi.py index c365dced8d277..897a28e8f5ea9 100644 --- a/pandas/tseries/tdi.py +++ b/pandas/tseries/tdi.py @@ -649,7 +649,7 @@ def get_value_maybe_box(self, series, key): values = self._engine.get_value(_values_from_object(series), key) return _maybe_box(self, values, series, key) - def get_loc(self, key): + def get_loc(self, key, method=None): """ Get integer location for requested label @@ -659,11 +659,11 @@ def get_loc(self, key): """ if _is_convertible_to_td(key): key = Timedelta(key) - return self._engine.get_loc(key) + return Index.get_loc(self, key, method=method) try: - return Index.get_loc(self, key) - except (KeyError, ValueError): + return Index.get_loc(self, key, method=method) + except (KeyError, ValueError, TypeError): try: return self._get_string_slice(key) except (TypeError, KeyError, ValueError): @@ -671,7 +671,7 @@ def get_loc(self, key): try: stamp = Timedelta(key) - return 
self._engine.get_loc(stamp) + return Index.get_loc(self, stamp, method=method) except (KeyError, ValueError): raise KeyError(key) diff --git a/pandas/tseries/tests/test_period.py b/pandas/tseries/tests/test_period.py index cf82733c6629d..5f48861097b6d 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -1544,7 +1544,7 @@ def test_period_set_index_reindex(self): df = df.set_index(idx1) self.assertTrue(df.index.equals(idx1)) - df = df.reindex(idx2) + df = df.set_index(idx2) self.assertTrue(df.index.equals(idx2)) def test_nested_dict_frame_constructor(self): diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 54045dfd7c835..b65ecd14d3fff 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -660,7 +660,8 @@ def test_sparse_frame_fillna_limit(self): def test_pad_require_monotonicity(self): rng = date_range('1/1/2000', '3/1/2000', freq='B') - rng2 = rng[::2][::-1] + # neither monotonic increasing or decreasing + rng2 = rng[[1, 0, 2]] self.assertRaises(ValueError, rng2.get_indexer, rng, method='pad') diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index c1b9a3e2359d9..f5626618ea9f5 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -6,7 +6,7 @@ import pandas._period as period import datetime -from pandas.core.api import Timestamp, Series, Timedelta +from pandas.core.api import Timestamp, Series, Timedelta, Period from pandas.tslib import get_timezone from pandas._period import period_asfreq, period_ordinal from pandas.tseries.index import date_range @@ -138,6 +138,12 @@ def test_constructor_with_stringoffset(self): self.assertEqual(repr(result), expected_repr) self.assertEqual(result, eval(repr(result))) + def test_constructor_invalid(self): + with tm.assertRaisesRegexp(TypeError, 'Cannot convert input'): + Timestamp(slice(2)) + with 
tm.assertRaisesRegexp(ValueError, 'Cannot convert Period'): + Timestamp(Period('1000-01-01')) + def test_conversion(self): # GH 9255 ts = Timestamp('2000-01-01') diff --git a/pandas/tslib.pyx b/pandas/tslib.pyx index 85cb50b8f18ae..f4cf711951f5e 100644 --- a/pandas/tslib.pyx +++ b/pandas/tslib.pyx @@ -1154,8 +1154,10 @@ cdef convert_to_tsobject(object ts, object tz, object unit): # Keep the converter same as PyDateTime's ts = datetime.combine(ts, datetime_time()) return convert_to_tsobject(ts, tz, None) - else: + elif getattr(ts, '_typ', None) == 'period': raise ValueError("Cannot convert Period to Timestamp unambiguously. Use to_timestamp") + else: + raise TypeError('Cannot convert input to Timestamp') if obj.value != NPY_NAT: _check_dts_bounds(&obj.dts)