Skip to content

Commit 4a18a09

Browse files
authored
Implements multi-index support in DataFrame.filter (#1512)
Implementing multi-index support in `DataFrame.filter`
1 parent 9c879d3 commit 4a18a09

File tree

2 files changed

+55
-25
lines changed

2 files changed

+55
-25
lines changed

databricks/koalas/frame.py

Lines changed: 41 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -8945,34 +8945,58 @@ def filter(self, items=None, like=None, regex=None, axis=None):
89458945
else:
89468946
raise ValueError("items should be a list-like object.")
89478947
if axis == 0:
8948-
# TODO: support multi-index here
8949-
if len(index_scols) != 1:
8950-
raise ValueError("Single index must be specified.")
8951-
col = None
8952-
for item in items:
8953-
if col is None:
8954-
col = index_scols[0] == F.lit(item)
8955-
else:
8956-
col = col | (index_scols[0] == F.lit(item))
8948+
if len(index_scols) == 1:
8949+
col = None
8950+
for item in items:
8951+
if col is None:
8952+
col = index_scols[0] == F.lit(item)
8953+
else:
8954+
col = col | (index_scols[0] == F.lit(item))
8955+
elif len(index_scols) > 1:
8956+
# for multi-index
8957+
col = None
8958+
for item in items:
8959+
if not isinstance(item, (tuple)):
8960+
raise TypeError("Unsupported type {}".format(type(item)))
8961+
if not item:
8962+
raise ValueError("The item should not be empty.")
8963+
midx_col = None
8964+
for i, element in enumerate(item):
8965+
if midx_col is None:
8966+
midx_col = index_scols[i] == F.lit(element)
8967+
else:
8968+
midx_col = midx_col & (index_scols[i] == F.lit(element))
8969+
if col is None:
8970+
col = midx_col
8971+
else:
8972+
col = col | midx_col
8973+
else:
8974+
raise ValueError("Single or multi index must be specified.")
89578975
return DataFrame(self._internal.with_filter(col))
89588976
elif axis == 1:
89598977
return self[items]
89608978
elif like is not None:
89618979
if axis == 0:
8962-
# TODO: support multi-index here
8963-
if len(index_scols) != 1:
8964-
raise ValueError("Single index must be specified.")
8965-
return DataFrame(self._internal.with_filter(index_scols[0].contains(like)))
8980+
col = None
8981+
for index_scol in index_scols:
8982+
if col is None:
8983+
col = index_scol.contains(like)
8984+
else:
8985+
col = col | index_scol.contains(like)
8986+
return DataFrame(self._internal.with_filter(col))
89668987
elif axis == 1:
89678988
column_labels = self._internal.column_labels
89688989
output_labels = [label for label in column_labels if any(like in i for i in label)]
89698990
return self[output_labels]
89708991
elif regex is not None:
89718992
if axis == 0:
8972-
# TODO: support multi-index here
8973-
if len(index_scols) != 1:
8974-
raise ValueError("Single index must be specified.")
8975-
return DataFrame(self._internal.with_filter(index_scols[0].rlike(regex)))
8993+
col = None
8994+
for index_scol in index_scols:
8995+
if col is None:
8996+
col = index_scol.rlike(regex)
8997+
else:
8998+
col = col | index_scol.rlike(regex)
8999+
return DataFrame(self._internal.with_filter(col))
89769000
elif axis == 1:
89779001
column_labels = self._internal.column_labels
89789002
matcher = re.compile(regex)

databricks/koalas/tests/test_dataframe.py

Lines changed: 14 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -2685,17 +2685,23 @@ def test_filter(self):
26852685
pdf = pdf.set_index("ba", append=True)
26862686
kdf = ks.from_pandas(pdf)
26872687

2688-
with self.assertRaisesRegex(ValueError, "items should be a list-like object"):
2689-
kdf.filter(items="b")
2688+
self.assert_eq(
2689+
kdf.filter(items=[("aa", 1), ("bd", 2)], axis=0).sort_index(),
2690+
pdf.filter(items=[("aa", 1), ("bd", 2)], axis=0).sort_index(),
2691+
)
26902692

2691-
with self.assertRaisesRegex(ValueError, "Single index must be specified."):
2692-
kdf.filter(items=["b"], axis=0)
2693+
with self.assertRaisesRegex(TypeError, "Unsupported type <class 'list'>"):
2694+
kdf.filter(items=[["aa", 1], ("bd", 2)], axis=0)
26932695

2694-
with self.assertRaisesRegex(ValueError, "Single index must be specified."):
2695-
kdf.filter(like="b", axis="index")
2696+
with self.assertRaisesRegex(ValueError, "The item should not be empty."):
2697+
kdf.filter(items=[(), ("bd", 2)], axis=0)
26962698

2697-
with self.assertRaisesRegex(ValueError, "Single index must be specified."):
2698-
kdf.filter(regex="b.*", axis="index")
2699+
self.assert_eq(kdf.filter(like="b", axis=0), pdf.filter(like="b", axis=0))
2700+
2701+
self.assert_eq(kdf.filter(regex="b.*", axis=0), pdf.filter(regex="b.*", axis=0))
2702+
2703+
with self.assertRaisesRegex(ValueError, "items should be a list-like object"):
2704+
kdf.filter(items="b")
26992705

27002706
with self.assertRaisesRegex(ValueError, "No axis named"):
27012707
kdf.filter(regex="b.*", axis=123)

0 commit comments

Comments
 (0)