Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions databricks/koalas/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -945,11 +945,12 @@ def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, d
raise NotImplementedError("value_counts currently does not support bins")

if dropna:
sdf_dropna = self._internal._sdf.dropna()
sdf_dropna = self._internal._sdf.select(self._scol).dropna()
else:
sdf_dropna = self._internal._sdf
index_name = SPARK_INDEX_NAME_FORMAT(0)
sdf = sdf_dropna.groupby(self._scol.alias(index_name)).count()
column_name = self._internal.data_columns[0]
sdf = sdf_dropna.groupby(sdf_dropna[column_name].alias(index_name)).count()
if sort:
if ascending:
sdf = sdf.orderBy(F.col('count'))
Expand Down
26 changes: 26 additions & 0 deletions databricks/koalas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,32 @@ def test_value_counts(self):
kser.name = 'index'
self.assert_eq(kser.value_counts(), pser.value_counts(), almost=True)

# Series from DataFrame
pdf = pd.DataFrame({'a': [1, 2, 3], 'b': [None, 1, None]})
kdf = ks.from_pandas(pdf)

self.assert_eq(kdf.a.value_counts(normalize=True),
pdf.a.value_counts(normalize=True), almost=True)
self.assert_eq(kdf.a.value_counts(ascending=True),
pdf.a.value_counts(ascending=True), almost=True)
self.assert_eq(kdf.a.value_counts(normalize=True, dropna=False),
pdf.a.value_counts(normalize=True, dropna=False), almost=True)
self.assert_eq(kdf.a.value_counts(ascending=True, dropna=False),
pdf.a.value_counts(ascending=True, dropna=False), almost=True)

# Series with NaN index
pser = pd.Series([1, 2, 3], index=[2, None, 5])
kser = ks.from_pandas(pser)

self.assert_eq(kser.value_counts(normalize=True),
pser.value_counts(normalize=True), almost=True)
self.assert_eq(kser.value_counts(ascending=True),
pser.value_counts(ascending=True), almost=True)
self.assert_eq(kser.value_counts(normalize=True, dropna=False),
pser.value_counts(normalize=True, dropna=False), almost=True)
self.assert_eq(kser.value_counts(ascending=True, dropna=False),
pser.value_counts(ascending=True, dropna=False), almost=True)

def test_nsmallest(self):
sample_lst = [1, 2, 3, 4, np.nan, 6]
pser = pd.Series(sample_lst, name='x')
Expand Down