From 9788ad19db036ba8eded8b66b16b45ce392d24f7 Mon Sep 17 00:00:00 2001 From: immerrr Date: Sat, 22 Feb 2014 11:58:28 +0400 Subject: [PATCH] PERF: optimize index.__getitem__ for slice & boolean mask indexers --- doc/source/release.rst | 2 ++ doc/source/v0.14.0.txt | 15 +++++++++++ pandas/core/index.py | 51 +++++++++++++++++++------------------- pandas/core/internals.py | 2 +- pandas/tests/test_index.py | 27 ++++++++++++++++++++ pandas/tseries/index.py | 2 -- pandas/tseries/period.py | 2 -- vb_suite/index_object.py | 13 ++++++++++ 8 files changed, 83 insertions(+), 31 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 9b7f1b619f90f..7674cc9f35622 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -105,6 +105,8 @@ API Changes - ``NameResolutionError`` was removed because it isn't necessary anymore. - ``concat`` will now concatenate mixed Series and DataFrames using the Series name or numbering columns as needed (:issue:`2385`) +- Slicing and advanced/boolean indexing operations on ``Index`` classes will no + longer change type of the resulting index (:issue:`6440`). Experimental Features ~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index ada29dc674420..4432e9e891e7d 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -78,6 +78,21 @@ These are out-of-bounds selections - ``NameResolutionError`` was removed because it isn't necessary anymore. - ``concat`` will now concatenate mixed Series and DataFrames using the Series name or numbering columns as needed (:issue:`2385`). See :ref:`the docs <merging.mixed_ndims>` +- Slicing and advanced/boolean indexing operations on ``Index`` classes will no + longer change type of the resulting index (:issue:`6440`) + + .. ipython:: python + + i = pd.Index([1, 2, 3, 'a' , 'b', 'c']) + i[[0,1,2]] + + Previously, the above operation would return ``Int64Index``. If you'd like + to do this manually, use :meth:`Index.astype` + + .. 
ipython:: python + + i[[0,1,2]].astype(np.int_) + MultiIndexing Using Slicers ~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/pandas/core/index.py b/pandas/core/index.py index 4a4086c4eeb0c..c16e2eff06904 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -631,34 +631,35 @@ def __hash__(self): raise TypeError("unhashable type: %r" % type(self).__name__) def __getitem__(self, key): - """Override numpy.ndarray's __getitem__ method to work as desired""" - arr_idx = self.view(np.ndarray) + """ + Override numpy.ndarray's __getitem__ method to work as desired. + + This function adds lists and Series as valid boolean indexers + (ndarrays only supports ndarray with dtype=bool). + + If resulting ndim != 1, plain ndarray is returned instead of + corresponding `Index` subclass. + + """ + # There's no custom logic to be implemented in __getslice__, so it's + # not overloaded intentionally. + __getitem__ = super(Index, self).__getitem__ if np.isscalar(key): - return arr_idx[key] - else: - if com._is_bool_indexer(key): - key = np.asarray(key) + return __getitem__(key) - try: - result = arr_idx[key] - if result.ndim > 1: - return result - except (IndexError): - if not len(key): - result = [] - else: - raise + if isinstance(key, slice): + # This case is separated from the conditional above to avoid + # pessimization of basic indexing. 
+ return __getitem__(key) - return Index(result, name=self.name) + if com._is_bool_indexer(key): + return __getitem__(np.asarray(key)) - def _getitem_slice(self, key): - """ getitem for a bool/sliceable, fallback to standard getitem """ - try: - arr_idx = self.view(np.ndarray) - result = arr_idx[key] - return self.__class__(result, name=self.name, fastpath=True) - except: - return self.__getitem__(key) + result = __getitem__(key) + if result.ndim > 1: + return result.view(np.ndarray) + else: + return result def append(self, other): """ @@ -2800,8 +2801,6 @@ def __getitem__(self, key): return result - _getitem_slice = __getitem__ - def take(self, indexer, axis=None): """ Analogous to ndarray.take diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 10017f89e5204..74a8ce0118d88 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -3737,7 +3737,7 @@ def get_slice(self, slobj, raise_on_error=False): if raise_on_error: _check_slice_bounds(slobj, self.index) return self.__class__(self._block._slice(slobj), - self.index._getitem_slice(slobj), fastpath=True) + self.index[slobj], fastpath=True) def set_axis(self, axis, value, maybe_rename=True, check_axis=True): cur_axis, value = self._set_axis(axis, value, check_axis) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index e828bc100dfcf..3e578a5e36bb1 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -323,6 +323,25 @@ def test_fancy(self): for i in sl: self.assertEqual(i, sl[sl.get_loc(i)]) + def test_empty_fancy(self): + empty_farr = np.array([], dtype=np.float_) + empty_iarr = np.array([], dtype=np.int_) + empty_barr = np.array([], dtype=np.bool_) + + # pd.DatetimeIndex is excluded, because it overrides getitem and should + # be tested separately. 
+ for idx in [self.strIndex, self.intIndex, self.floatIndex]: + empty_idx = idx.__class__([]) + values = idx.values + + self.assert_(idx[[]].identical(empty_idx)) + self.assert_(idx[empty_iarr].identical(empty_idx)) + self.assert_(idx[empty_barr].identical(empty_idx)) + + # np.ndarray only accepts ndarray of int & bool dtypes, so should + # Index. + self.assertRaises(IndexError, idx.__getitem__, empty_farr) + def test_getitem(self): arr = np.array(self.dateIndex) exp = self.dateIndex[5] @@ -762,6 +781,14 @@ def test_join_self(self): joined = res.join(res, how=kind) self.assertIs(res, joined) + def test_indexing_doesnt_change_class(self): + idx = Index([1, 2, 3, 'a', 'b', 'c']) + + self.assert_(idx[1:3].identical( + pd.Index([2, 3], dtype=np.object_))) + self.assert_(idx[[0,1]].identical( + pd.Index([1, 2], dtype=np.object_))) + class TestFloat64Index(tm.TestCase): _multiprocess_can_split_ = True diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index f81634f45bdb2..c58447acec621 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -1406,8 +1406,6 @@ def __getitem__(self, key): return self._simple_new(result, self.name, new_offset, self.tz) - _getitem_slice = __getitem__ - # Try to run function on index first, and then on elements of index # Especially important for group-by functionality def map(self, f): diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 337533ad29f4f..5fca119c14e83 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -1056,8 +1056,6 @@ def __getitem__(self, key): return PeriodIndex(result, name=self.name, freq=self.freq) - _getitem_slice = __getitem__ - def _format_with_header(self, header, **kwargs): return header + self._format_native_types(**kwargs) diff --git a/vb_suite/index_object.py b/vb_suite/index_object.py index 8b348ddc6e6cc..2cfdffdc38541 100644 --- a/vb_suite/index_object.py +++ b/vb_suite/index_object.py @@ -46,3 +46,16 @@ index_int64_intersection = 
Benchmark('left.intersection(right)', setup, start_date=datetime(2011, 1, 1)) + +#---------------------------------------------------------------------- +# string index slicing +setup = common_setup + """ +idx = tm.makeStringIndex(1000000) + +mask = np.arange(1000000) % 3 == 0 +series_mask = Series(mask) +""" +index_str_slice_indexer_basic = Benchmark('idx[:-1]', setup) +index_str_slice_indexer_even = Benchmark('idx[::2]', setup) +index_str_boolean_indexer = Benchmark('idx[mask]', setup) +index_str_boolean_series_indexer = Benchmark('idx[series_mask]', setup)