Skip to content

Commit 6f4c7fa

Browse files
charlesdong1991 authored and HyukjinKwon committed
Add has_duplicates property for Index and MultiIndex (#946)
1 parent aa44e93 commit 6f4c7fa

File tree

3 files changed

+46
-2
lines changed

3 files changed

+46
-2
lines changed

databricks/koalas/indexes.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,30 @@ def spark_type(self):
127127
""" Returns the data type as defined by Spark, as a Spark DataType object."""
128128
return self.to_series().spark_type
129129

130+
@property
def has_duplicates(self) -> bool:
    """
    If index has duplicates, return True, otherwise False.

    Missing values are compared like any other value, so an index holding
    the same missing entry more than once is reported as having duplicates.

    Returns
    -------
    bool
        True when at least one index entry occurs more than once.

    Examples
    --------
    >>> kdf = ks.DataFrame({'a': [1, 2, 3]}, index=list('aac'))
    >>> kdf.index.has_duplicates
    True

    >>> kdf = ks.DataFrame({'a': [1, 2, 3]}, index=[list('abc'), list('def')])
    >>> kdf.index.has_duplicates
    False

    >>> kdf = ks.DataFrame({'a': [1, 2, 3]}, index=[list('aac'), list('eef')])
    >>> kdf.index.has_duplicates
    True
    """
    # Project only the index column so row comparison looks at nothing else.
    sdf = self._kdf._sdf.select(self._scol)

    # Compare row counts rather than the count()/countDistinct() aggregates:
    # both aggregates skip nulls, so repeated missing index entries would go
    # unnoticed. Working on whole rows also avoids re-resolving the column by
    # its string name, which can mis-resolve names that contain dots.
    # NOTE: this triggers two Spark jobs (count and distinct-count).
    return sdf.count() != sdf.distinct().count()
153+
130154
@property
131155
def name(self) -> Union[str, Tuple[str, ...]]:
132156
"""Return name of the Index."""

databricks/koalas/missing/indexes.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@ class _MissingPandasLikeIndex(object):
3131

3232
# Properties
3333
T = unsupported_property('T')
34-
has_duplicates = unsupported_property('has_duplicates')
3534
nbytes = unsupported_property('nbytes')
3635
nlevels = unsupported_property('nlevels')
3736
shape = unsupported_property('shape')
@@ -129,7 +128,6 @@ class _MissingPandasLikeMultiIndex(object):
129128
# Properties
130129
T = unsupported_property('T')
131130
codes = unsupported_property('codes')
132-
has_duplicates = unsupported_property('has_duplicates')
133131
is_all_dates = unsupported_property('is_all_dates')
134132
levels = unsupported_property('levels')
135133
levshape = unsupported_property('levshape')

databricks/koalas/tests/test_indexes.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,28 @@ def test_missing(self):
239239
"property.*Index.*{}.*is deprecated".format(name)):
240240
getattr(kdf.set_index(['a', 'b']).index, name)
241241

242+
def test_index_has_duplicates(self):
    """Check ``has_duplicates`` on single-level indexes, named and unnamed."""
    # Each case is (index values, index name, expected has_duplicates).
    cases = [
        (("a", "b", "c"), None, False),
        (("a", "a", "c"), 'ks', True),
        ((1, 3, 3), 'ks', True),
        ((1, 2, 3), None, False),
    ]

    for values, index_name, want in cases:
        pandas_index = pd.Index(values, name=index_name)
        kdf = ks.from_pandas(pd.DataFrame({"a": [1, 2, 3]}, index=pandas_index))

        self.assertEqual(kdf.index.has_duplicates, want)
252+
253+
def test_multiindex_has_duplicates(self):
    """Check ``has_duplicates`` on two-level indexes."""
    # Each case is (list of level arrays, expected has_duplicates).
    # A repeat within a single level only (second case) is not a duplicate;
    # the whole (level0, level1) entry has to repeat.
    cases = [
        ([list("abc"), list("edf")], False),
        ([list("aac"), list("edf")], False),
        ([list("aac"), list("eef")], True),
        ([[1, 4, 4], [4, 6, 6]], True),
    ]

    for level_arrays, want in cases:
        kdf = ks.from_pandas(pd.DataFrame({"a": [1, 2, 3]}, index=level_arrays))

        self.assertEqual(kdf.index.has_duplicates, want)
263+
242264
def test_multi_index_not_supported(self):
243265
kdf = ks.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
244266

0 commit comments

Comments
 (0)