Skip to content

Commit 8aa73f8

Browse files
committed
Support iterable as rows_sel for iloc indexer
1 parent e4e5a1c commit 8aa73f8

File tree

2 files changed

+73
-15
lines changed

2 files changed

+73
-15
lines changed

databricks/koalas/indexing.py

Lines changed: 41 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,15 @@
1717
"""
1818
A loc indexer for Koalas DataFrame/Series.
1919
"""
20-
from collections import OrderedDict
20+
from collections import OrderedDict, Iterable
2121
from functools import reduce
2222

2323
from pandas.api.types import is_list_like
2424
from pyspark import sql as spark
2525
from pyspark.sql import functions as F
2626
from pyspark.sql.types import BooleanType, LongType
2727
from pyspark.sql.utils import AnalysisException
28+
import numpy as np
2829

2930
from databricks.koalas.internal import _InternalFrame, NATURAL_ORDER_COLUMN_NAME
3031
from databricks.koalas.exceptions import SparkPandasIndexingError, SparkPandasNotImplementedError
@@ -991,10 +992,47 @@ def verify_type(i):
991992
elif isinstance(rows_sel, int):
992993
sdf = self._internal.spark_frame
993994
return (sdf[self._sequence_col] == rows_sel), None, 0
995+
elif isinstance(rows_sel, Iterable):
996+
sdf = self._internal.spark_frame
997+
998+
if any(
999+
isinstance(key, (int, np.int, np.int64, np.int32)) and key < 0 for key in rows_sel
1000+
):
1001+
offset = sdf.count()
1002+
else:
1003+
offset = 0
1004+
1005+
new_rows_sel = []
1006+
for key in list(rows_sel):
1007+
if not isinstance(key, (int, np.int, np.int64, np.int32)):
1008+
raise TypeError(
1009+
"cannot do positional indexing with these indexers [{}] of {}".format(
1010+
key, type(key)
1011+
)
1012+
)
1013+
if key < 0:
1014+
key = key + offset
1015+
new_rows_sel.append(key)
1016+
1017+
if len(new_rows_sel) != len(set(new_rows_sel)):
1018+
raise NotImplementedError(
1019+
"Duplicated row selection is not currently supported; "
1020+
"however, normalised index was [%s]" % new_rows_sel
1021+
)
1022+
1023+
sequence_scol = sdf[self._sequence_col]
1024+
cond = []
1025+
for key in new_rows_sel:
1026+
cond.append(sequence_scol == F.lit(int(key)).cast(LongType()))
1027+
1028+
if len(cond) == 0:
1029+
cond = [F.lit(False)]
1030+
return reduce(lambda x, y: x | y, cond), None, None
9941031
else:
9951032
iLocIndexer._raiseNotImplemented(
996-
".iloc requires numeric slice or conditional "
997-
"boolean Index, got {}".format(type(rows_sel))
1033+
".iloc requires numeric slice, conditional "
1034+
"boolean Index or a sequence of positions as int, "
1035+
"got {}".format(type(rows_sel))
9981036
)
9991037

10001038
def _select_cols(self, cols_sel):

databricks/koalas/tests/test_indexing.py

Lines changed: 32 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -695,6 +695,33 @@ def test_iloc_slice_rows_sel(self):
695695
self.assert_eq(kdf.iloc[rows_sel].sort_index(), pdf.iloc[rows_sel].sort_index())
696696
self.assert_eq(kdf.A.iloc[rows_sel].sort_index(), pdf.A.iloc[rows_sel].sort_index())
697697

698+
def test_iloc_iterable_rows_sel(self):
    """Compare ``iloc`` with a sequence of row positions against pandas.

    Each selector (plain lists and NumPy arrays; empty, positive, negative
    and unordered positions) is applied to both the pandas frame and the
    frame built from it with ``ks.from_pandas``. Results are sorted by
    index before being compared with ``assert_eq``.
    """
    pandas_df = pd.DataFrame({"A": [1, 2] * 5, "B": [3, 4] * 5, "C": [5, 6] * 5})
    koalas_df = ks.from_pandas(pandas_df)

    selections = [
        [],
        np.array([0, 1]),
        [1, 2],
        np.array([-3]),
        [3],
        np.array([-2]),
        [8, 3, -5],
    ]

    for sel in selections:
        # Row selector only: whole frame first, then the single column "A".
        with self.subTest(rows_sel=sel):
            actual_frame = koalas_df.iloc[sel].sort_index()
            expected_frame = pandas_df.iloc[sel].sort_index()
            self.assert_eq(actual_frame, expected_frame)

            actual_series = koalas_df.A.iloc[sel].sort_index()
            expected_series = pandas_df.A.iloc[sel].sort_index()
            self.assert_eq(actual_series, expected_series)

        # Row selector combined with a full column slice.
        with self.subTest(rows_sel=sel):
            actual_all_cols = koalas_df.iloc[sel, :].sort_index()
            expected_all_cols = pandas_df.iloc[sel, :].sort_index()
            self.assert_eq(actual_all_cols, expected_all_cols)

        # Row selector combined with a column slice keeping the first column.
        with self.subTest(rows_sel=sel):
            actual_first_col = koalas_df.iloc[sel, :1].sort_index()
            expected_first_col = pandas_df.iloc[sel, :1].sort_index()
            self.assert_eq(actual_first_col, expected_first_col)
724+
698725
def test_setitem(self):
699726
pdf = pd.DataFrame(
700727
[[1, 2], [4, 5], [7, 8]],
@@ -736,18 +763,6 @@ def test_iloc_raises(self):
736763
pdf = pd.DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]})
737764
kdf = ks.from_pandas(pdf)
738765

739-
with self.assertRaisesRegex(
740-
SparkPandasNotImplementedError,
741-
".iloc requires numeric slice or conditional boolean Index",
742-
):
743-
kdf.iloc[[0, 1], :]
744-
745-
with self.assertRaisesRegex(
746-
SparkPandasNotImplementedError,
747-
".iloc requires numeric slice or conditional boolean Index",
748-
):
749-
kdf.A.iloc[[0, 1]]
750-
751766
with self.assertRaisesRegex(SparkPandasIndexingError, "Only accepts pairs of candidates"):
752767
kdf.iloc[[0, 1], [0, 1], [1, 2]]
753768

@@ -768,3 +783,8 @@ def test_iloc_raises(self):
768783

769784
with self.assertRaisesRegex(IndexError, "out of range"):
770785
kdf.iloc[:, [5, 6]]
786+
787+
with self.assertRaisesRegex(
788+
NotImplementedError, "Duplicated row selection is not currently supported"
789+
):
790+
kdf.iloc[[1, 1]]

0 commit comments

Comments
 (0)