Merge pull request #6443 from jreback/iloc_bounds2

jreback · jreback · commit a96b53d8a005 · 2014-02-22T10:13:36.000-05:00
BUG/TST: iloc will now raise IndexError on out-of-bounds list indexers (GH6296 / GH6299)
diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst
@@ -77,9 +77,9 @@ of multi-axis indexing.
   See more at :ref:`Selection by Label <indexing.label>`
 
 - ``.iloc`` is strictly integer position based (from ``0`` to ``length-1`` of
-  the axis), will raise ``IndexError`` if a single index is requested and it
-  is out-of-bounds, otherwise it will conform the bounds to size of the object.
-  Allowed inputs are:
+  the axis), and will raise ``IndexError`` if a requested indexer is
+  out-of-bounds, except for *slice* indexers, which allow out-of-bounds indexing
+  (this conforms with python/numpy *slice* semantics). Allowed inputs are:
 
   - An integer e.g. ``5``
   - A list or array of integers ``[4, 3, 0]``
@@ -421,19 +421,28 @@ python/numpy allow slicing past the end of an array without an associated error.
     x[4:10]
     x[8:10]
 
-- as of v0.14.0, ``iloc`` will now accept out-of-bounds indexers, e.g. a value that exceeds the length of the object being
+- as of v0.14.0, ``iloc`` will now accept out-of-bounds indexers for slices, e.g. a value that exceeds the length of the object being
   indexed. These will be excluded. This will make pandas conform more with pandas/numpy indexing of out-of-bounds
-  values. A single indexer that is out-of-bounds and drops the dimensions of the object will still raise
-  ``IndexError`` (:issue:`6296`). This could result in an empty axis (e.g. an empty DataFrame being returned)
+  values. A single indexer / list of indexers that is out-of-bounds will still raise
+  ``IndexError`` (:issue:`6296`, :issue:`6299`). An out-of-bounds slice could result in an empty axis (e.g. an empty DataFrame being returned)
 
 .. ipython:: python
 
    dfl = DataFrame(np.random.randn(5,2),columns=list('AB'))
    dfl
-   dfl.iloc[[4,5,6]]
-   dfl.iloc[4:6]
    dfl.iloc[:,2:3]
    dfl.iloc[:,1:3]
+   dfl.iloc[4:6]
+
+These are out-of-bounds selections
+
+.. code-block:: python
+
+   dfl.iloc[[4,5,6]]
+   IndexError: positional indexers are out-of-bounds
+
+   dfl.iloc[:,4]
+   IndexError: single positional indexer is out-of-bounds
 
 .. _indexing.basics.partial_setting:
 
@@ -911,9 +920,9 @@ You can combine this with other expressions for very succinct queries:
    **expression itself** is evaluated in vanilla Python. For example, in the
    expression
 
-       .. code-block:: python
+   .. code-block:: python
 
-          df.query('a in b + c + d')
+      df.query('a in b + c + d')
 
    ``(b + c + d)`` is evaluated by ``numexpr`` and *then* the ``in``
    operation is evaluated in plain Python. In general, any operations that can
diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt
@@ -15,19 +15,29 @@ Highlights include:
 API changes
 ~~~~~~~~~~~
 
-- ``iloc`` will now accept out-of-bounds indexers, e.g. a value that exceeds the length of the object being
+- ``iloc`` will now accept out-of-bounds indexers for slices, e.g. a value that exceeds the length of the object being
   indexed. These will be excluded. This will make pandas conform more with pandas/numpy indexing of out-of-bounds
-  values. A single indexer that is out-of-bounds and drops the dimensions of the object will still raise
-  ``IndexError`` (:issue:`6296`). This could result in an empty axis (e.g. an empty DataFrame being returned)
+  values. A single indexer / list of indexers that is out-of-bounds will still raise
+  ``IndexError`` (:issue:`6296`, :issue:`6299`). An out-of-bounds slice could result in an empty axis (e.g. an empty DataFrame being returned)
 
-  .. ipython:: python
+.. ipython:: python
+
+   dfl = DataFrame(np.random.randn(5,2),columns=list('AB'))
+   dfl
+   dfl.iloc[:,2:3]
+   dfl.iloc[:,1:3]
+   dfl.iloc[4:6]
+
+These are out-of-bounds selections
+
+.. code-block:: python
+
+   dfl.iloc[[4,5,6]]
+   IndexError: positional indexers are out-of-bounds
+
+   dfl.iloc[:,4]
+   IndexError: single positional indexer is out-of-bounds
 
-      df = DataFrame(np.random.randn(5,2),columns=list('AB'))
-      df
-      df.iloc[[4,5,6]]
-      df.iloc[4:6]
-      df.iloc[:,2:3]
-      df.iloc[:,1:3]
 
 - The ``DataFrame.interpolate()`` ``downcast`` keyword default has been changed from ``infer`` to
   ``None``. This is to preseve the original dtype unless explicitly requested otherwise (:issue:`6290`).
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
@@ -1376,7 +1376,7 @@ def _getitem_axis(self, key, axis=0, validate_iterable=False):
                 arr = np.array(key)
                 l = len(ax)
                 if len(arr) and (arr.max() >= l or arr.min() <= -l):
-                    key = arr[(arr>-l) & (arr<l)]
+                    raise IndexError("positional indexers are out-of-bounds")
 
                 # force an actual list
                 key = list(key)
@@ -1389,7 +1389,7 @@ def _getitem_axis(self, key, axis=0, validate_iterable=False):
                                     "non-integer key")
 
                 if key > len(ax):
-                    raise IndexError("single indexer is out-of-bounds")
+                    raise IndexError("single positional indexer is out-of-bounds")
 
             return self._get_loc(key, axis=axis)
 
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -961,6 +961,7 @@ def test_frame_groupby(self):
         assert_frame_equal(stragged, aggregated, check_names=False)
 
         # transform
+        grouped = self.tsframe.head(30).groupby(lambda x: x.weekday())
         transformed = grouped.transform(lambda x: x - x.mean())
         self.assertEqual(len(transformed), 30)
         self.assertEqual(len(transformed.columns), 4)
@@ -2203,7 +2204,7 @@ def test_panel_groupby(self):
         grouped = self.panel.groupby(lambda x: x.month, axis='major')
         agged = grouped.mean()
 
-        self.assert_numpy_array_equal(agged.major_axis, [1, 2])
+        self.assert_numpy_array_equal(agged.major_axis, sorted(list(set(self.panel.major_axis.month))))
 
         grouped = self.panel.groupby({'A': 0, 'B': 0, 'C': 1, 'D': 1},
                                      axis='minor')
diff --git a/pandas/tests/test_indexing.py b/pandas/tests/test_indexing.py
@@ -348,17 +348,24 @@ def test_iloc_exceeds_bounds(self):
         # iloc should allow indexers that exceed the bounds
         df = DataFrame(np.random.random_sample((20,5)), columns=list('ABCDE'))
         expected = df
-        result = df.iloc[:,[0,1,2,3,4,5]]
-        assert_frame_equal(result,expected)
 
-        result = df.iloc[[1,30]]
-        expected = df.iloc[[1]]
-        assert_frame_equal(result,expected)
+        # lists of positions should raise IndexError!
+        with tm.assertRaisesRegexp(IndexError, 'positional indexers are out-of-bounds'):
+            df.iloc[:,[0,1,2,3,4,5]]
+        self.assertRaises(IndexError, lambda : df.iloc[[1,30]])
+        self.assertRaises(IndexError, lambda : df.iloc[[1,-30]])
+        self.assertRaises(IndexError, lambda : df.iloc[[100]])
 
-        result = df.iloc[[1,-30]]
-        expected = df.iloc[[1]]
-        assert_frame_equal(result,expected)
+        s = df['A']
+        self.assertRaises(IndexError, lambda : s.iloc[[100]])
+        self.assertRaises(IndexError, lambda : s.iloc[[-100]])
 
+        # still raise on a single indexer
+        with tm.assertRaisesRegexp(IndexError, 'single positional indexer is out-of-bounds'):
+            df.iloc[30]
+        self.assertRaises(IndexError, lambda : df.iloc[-30])
+
+        # slices are ok
         result = df.iloc[:,4:10]
         expected = df.iloc[:,4:]
         assert_frame_equal(result,expected)
@@ -367,34 +374,15 @@ def test_iloc_exceeds_bounds(self):
         expected = df.iloc[:,-4:]
         assert_frame_equal(result,expected)
 
-        result = df.iloc[[100]]
-        expected = DataFrame(columns=df.columns)
-        assert_frame_equal(result,expected)
-
-        # still raise on a single indexer
-        def f():
-            df.iloc[30]
-        self.assertRaises(IndexError, f)
-
-        s = df['A']
-        result = s.iloc[[100]]
-        expected = Series()
-        assert_series_equal(result,expected)
-
-        result = s.iloc[[-100]]
-        expected = Series()
-        assert_series_equal(result,expected)
-
-        # slice
+        # slice bounds exceeding is ok
         result = s.iloc[18:30]
         expected = s.iloc[18:]
         assert_series_equal(result,expected)
 
         # doc example
         df = DataFrame(np.random.randn(5,2),columns=list('AB'))
-        result = df.iloc[[4,5,6]]
-        expected = df.iloc[[4]]
-        assert_frame_equal(result,expected)
+        self.assertRaises(IndexError, lambda : df.iloc[[4,5,6]])
+        self.assertRaises(IndexError, lambda : df.iloc[:,4])
 
         result = df.iloc[4:6]
         expected = df.iloc[[4]]
diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py
@@ -144,6 +144,7 @@ class DatetimeIndex(Int64Index):
 
     _engine_type = _index.DatetimeEngine
 
+    tz = None
     offset = None
     _comparables = ['name','freqstr','tz']
     _allow_datetime_index_ops = True