Support an iterable of integer positions as rows_sel for the iloc indexer

HyukjinKwon · HyukjinKwon · commit 8aa73f83854b · 2020-03-12T15:40:08.000+09:00
diff --git a/databricks/koalas/indexing.py b/databricks/koalas/indexing.py
@@ -17,14 +17,15 @@
 """
 A loc indexer for Koalas DataFrame/Series.
 """
-from collections import OrderedDict
+from collections import OrderedDict, Iterable
 from functools import reduce
 
 from pandas.api.types import is_list_like
 from pyspark import sql as spark
 from pyspark.sql import functions as F
 from pyspark.sql.types import BooleanType, LongType
 from pyspark.sql.utils import AnalysisException
+import numpy as np
 
 from databricks.koalas.internal import _InternalFrame, NATURAL_ORDER_COLUMN_NAME
 from databricks.koalas.exceptions import SparkPandasIndexingError, SparkPandasNotImplementedError
@@ -991,10 +992,47 @@ def verify_type(i):
         elif isinstance(rows_sel, int):
             sdf = self._internal.spark_frame
             return (sdf[self._sequence_col] == rows_sel), None, 0
+        elif isinstance(rows_sel, Iterable):
+            sdf = self._internal.spark_frame
+
+            if any(
+                isinstance(key, (int, np.int, np.int64, np.int32)) and key < 0 for key in rows_sel
+            ):
+                offset = sdf.count()
+            else:
+                offset = 0
+
+            new_rows_sel = []
+            for key in list(rows_sel):
+                if not isinstance(key, (int, np.int, np.int64, np.int32)):
+                    raise TypeError(
+                        "cannot do positional indexing with these indexers [{}] of {}".format(
+                            key, type(key)
+                        )
+                    )
+                if key < 0:
+                    key = key + offset
+                new_rows_sel.append(key)
+
+            if len(new_rows_sel) != len(set(new_rows_sel)):
+                raise NotImplementedError(
+                    "Duplicated row selection is not currently supported; "
+                    "however, normalised index was [%s]" % new_rows_sel
+                )
+
+            sequence_scol = sdf[self._sequence_col]
+            cond = []
+            for key in new_rows_sel:
+                cond.append(sequence_scol == F.lit(int(key)).cast(LongType()))
+
+            if len(cond) == 0:
+                cond = [F.lit(False)]
+            return reduce(lambda x, y: x | y, cond), None, None
         else:
             iLocIndexer._raiseNotImplemented(
-                ".iloc requires numeric slice or conditional "
-                "boolean Index, got {}".format(type(rows_sel))
+                ".iloc requires numeric slice, conditional "
+                "boolean Index or a sequence of positions as int, "
+                "got {}".format(type(rows_sel))
             )
 
     def _select_cols(self, cols_sel):
diff --git a/databricks/koalas/tests/test_indexing.py b/databricks/koalas/tests/test_indexing.py
@@ -695,6 +695,33 @@ def test_iloc_slice_rows_sel(self):
                 self.assert_eq(kdf.iloc[rows_sel].sort_index(), pdf.iloc[rows_sel].sort_index())
                 self.assert_eq(kdf.A.iloc[rows_sel].sort_index(), pdf.A.iloc[rows_sel].sort_index())
 
+    def test_iloc_iterable_rows_sel(self):  # iloc with list/ndarray row positions vs. pandas
+        pdf = pd.DataFrame({"A": [1, 2] * 5, "B": [3, 4] * 5, "C": [5, 6] * 5})
+        kdf = ks.from_pandas(pdf)
+
+        for rows_sel in [  # each result is sort_index()-ed, so row order is not checked
+            [],  # empty selection
+            np.array([0, 1]),  # positions given as an ndarray
+            [1, 2],
+            np.array([-3]),  # negative position inside an ndarray
+            [3],
+            np.array([-2]),
+            [8, 3, -5],  # unsorted, mixing positive and negative positions
+        ]:
+            with self.subTest(rows_sel=rows_sel):  # rows only, on the frame and on column A
+                self.assert_eq(kdf.iloc[rows_sel].sort_index(), pdf.iloc[rows_sel].sort_index())
+                self.assert_eq(kdf.A.iloc[rows_sel].sort_index(), pdf.A.iloc[rows_sel].sort_index())
+
+            with self.subTest(rows_sel=rows_sel):  # rows plus an explicit full column slice
+                self.assert_eq(
+                    kdf.iloc[rows_sel, :].sort_index(), pdf.iloc[rows_sel, :].sort_index()
+                )
+
+            with self.subTest(rows_sel=rows_sel):  # rows plus a slice keeping the first column
+                self.assert_eq(
+                    kdf.iloc[rows_sel, :1].sort_index(), pdf.iloc[rows_sel, :1].sort_index()
+                )
+
     def test_setitem(self):
         pdf = pd.DataFrame(
             [[1, 2], [4, 5], [7, 8]],
@@ -736,18 +763,6 @@ def test_iloc_raises(self):
         pdf = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]})
         kdf = ks.from_pandas(pdf)
 
-        with self.assertRaisesRegex(
-            SparkPandasNotImplementedError,
-            ".iloc requires numeric slice or conditional boolean Index",
-        ):
-            kdf.iloc[[0, 1], :]
-
-        with self.assertRaisesRegex(
-            SparkPandasNotImplementedError,
-            ".iloc requires numeric slice or conditional boolean Index",
-        ):
-            kdf.A.iloc[[0, 1]]
-
         with self.assertRaisesRegex(SparkPandasIndexingError, "Only accepts pairs of candidates"):
             kdf.iloc[[0, 1], [0, 1], [1, 2]]
 
@@ -768,3 +783,8 @@ def test_iloc_raises(self):
 
         with self.assertRaisesRegex(IndexError, "out of range"):
             kdf.iloc[:, [5, 6]]
+
+        with self.assertRaisesRegex(
+            NotImplementedError, "Duplicated row selection is not currently supported"
+        ):
+            kdf.iloc[[1, 1]]