-
Notifications
You must be signed in to change notification settings - Fork 367
Fix value_counts() to work properly when dropna is True #1116
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
37206db
21c8431
f2f3df5
a895436
64d1fb6
0362cc2
d2271db
1299d0e
0210ebe
204976e
c04be15
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,6 +24,7 @@ | |
| import matplotlib | ||
| matplotlib.use('agg') | ||
| from matplotlib import pyplot as plt | ||
| import pyspark | ||
| import numpy as np | ||
| import pandas as pd | ||
|
|
||
|
|
@@ -33,6 +34,7 @@ | |
| from databricks.koalas.exceptions import PandasNotImplementedError | ||
| from databricks.koalas.missing.series import _MissingPandasLikeSeries | ||
| from databricks.koalas.config import set_option, reset_option | ||
| from databricks.koalas.utils import default_session | ||
|
|
||
|
|
||
| class SeriesTest(ReusedSQLTestCase, SQLTestUtils): | ||
|
|
@@ -243,7 +245,8 @@ def test_nunique(self): | |
| self.assertEqual(ks.Series(range(100)).nunique(approx=True), 103) | ||
| self.assertEqual(ks.Series(range(100)).nunique(approx=True, rsd=0.01), 100) | ||
|
|
||
| def test_value_counts(self): | ||
| def _test_value_counts(self): | ||
| # this is also containing test for Index & MultiIndex | ||
| pser = pd.Series([1, 2, 1, 3, 3, np.nan, 1, 4], name="x") | ||
| kser = ks.from_pandas(pser) | ||
|
|
||
|
|
@@ -261,6 +264,15 @@ def test_value_counts(self): | |
| self.assert_eq(kser.value_counts(ascending=True, dropna=False), | ||
| pser.value_counts(ascending=True, dropna=False), almost=True) | ||
|
|
||
| self.assert_eq(kser.index.value_counts(normalize=True), | ||
| pser.index.value_counts(normalize=True), almost=True) | ||
| self.assert_eq(kser.index.value_counts(ascending=True), | ||
| pser.index.value_counts(ascending=True), almost=True) | ||
| self.assert_eq(kser.index.value_counts(normalize=True, dropna=False), | ||
| pser.index.value_counts(normalize=True, dropna=False), almost=True) | ||
| self.assert_eq(kser.index.value_counts(ascending=True, dropna=False), | ||
| pser.index.value_counts(ascending=True, dropna=False), almost=True) | ||
|
|
||
| with self.assertRaisesRegex(NotImplementedError, | ||
| "value_counts currently does not support bins"): | ||
| kser.value_counts(bins=3) | ||
|
|
@@ -269,6 +281,132 @@ def test_value_counts(self): | |
| kser.name = 'index' | ||
| self.assert_eq(kser.value_counts(), pser.value_counts(), almost=True) | ||
|
|
||
| # Series from DataFrame | ||
| pdf = pd.DataFrame({'a': [1, 2, 3], 'b': [None, 1, None]}) | ||
| kdf = ks.from_pandas(pdf) | ||
|
|
||
| self.assert_eq(kdf.a.value_counts(normalize=True), | ||
| pdf.a.value_counts(normalize=True), almost=True) | ||
| self.assert_eq(kdf.a.value_counts(ascending=True), | ||
| pdf.a.value_counts(ascending=True), almost=True) | ||
| self.assert_eq(kdf.a.value_counts(normalize=True, dropna=False), | ||
| pdf.a.value_counts(normalize=True, dropna=False), almost=True) | ||
| self.assert_eq(kdf.a.value_counts(ascending=True, dropna=False), | ||
| pdf.a.value_counts(ascending=True, dropna=False), almost=True) | ||
|
|
||
| self.assert_eq(kser.index.value_counts(normalize=True), | ||
| pser.index.value_counts(normalize=True), almost=True) | ||
| self.assert_eq(kser.index.value_counts(ascending=True), | ||
| pser.index.value_counts(ascending=True), almost=True) | ||
| self.assert_eq(kser.index.value_counts(normalize=True, dropna=False), | ||
| pser.index.value_counts(normalize=True, dropna=False), almost=True) | ||
| self.assert_eq(kser.index.value_counts(ascending=True, dropna=False), | ||
| pser.index.value_counts(ascending=True, dropna=False), almost=True) | ||
|
|
||
| # Series with NaN index | ||
| pser = pd.Series([1, 2, 3], index=[2, None, 5]) | ||
itholic marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| kser = ks.from_pandas(pser) | ||
|
|
||
| self.assert_eq(kser.value_counts(normalize=True), | ||
| pser.value_counts(normalize=True), almost=True) | ||
| self.assert_eq(kser.value_counts(ascending=True), | ||
| pser.value_counts(ascending=True), almost=True) | ||
| self.assert_eq(kser.value_counts(normalize=True, dropna=False), | ||
| pser.value_counts(normalize=True, dropna=False), almost=True) | ||
| self.assert_eq(kser.value_counts(ascending=True, dropna=False), | ||
| pser.value_counts(ascending=True, dropna=False), almost=True) | ||
|
|
||
| self.assert_eq(kser.index.value_counts(normalize=True), | ||
| pser.index.value_counts(normalize=True), almost=True) | ||
| self.assert_eq(kser.index.value_counts(ascending=True), | ||
| pser.index.value_counts(ascending=True), almost=True) | ||
| self.assert_eq(kser.index.value_counts(normalize=True, dropna=False), | ||
| pser.index.value_counts(normalize=True, dropna=False), almost=True) | ||
| self.assert_eq(kser.index.value_counts(ascending=True, dropna=False), | ||
| pser.index.value_counts(ascending=True, dropna=False), almost=True) | ||
|
|
||
| # Series with MultiIndex | ||
| pser.index = pd.MultiIndex.from_tuples([('x', 'a'), ('x', 'b'), ('y', 'c')]) | ||
| kser = ks.from_pandas(pser) | ||
|
|
||
| self.assert_eq(kser.value_counts(normalize=True), | ||
| pser.value_counts(normalize=True), almost=True) | ||
| self.assert_eq(kser.value_counts(ascending=True), | ||
| pser.value_counts(ascending=True), almost=True) | ||
| self.assert_eq(kser.value_counts(normalize=True, dropna=False), | ||
| pser.value_counts(normalize=True, dropna=False), almost=True) | ||
| self.assert_eq(kser.value_counts(ascending=True, dropna=False), | ||
| pser.value_counts(ascending=True, dropna=False), almost=True) | ||
|
|
||
| self.assert_eq(kser.index.value_counts(normalize=True), | ||
| pser.index.value_counts(normalize=True), almost=True) | ||
| self.assert_eq(kser.index.value_counts(ascending=True), | ||
| pser.index.value_counts(ascending=True), almost=True) | ||
| self.assert_eq(kser.index.value_counts(normalize=True, dropna=False), | ||
| pser.index.value_counts(normalize=True, dropna=False), almost=True) | ||
| self.assert_eq(kser.index.value_counts(ascending=True, dropna=False), | ||
| pser.index.value_counts(ascending=True, dropna=False), almost=True) | ||
|
|
||
| # Series with MultiIndex some of index has NaN | ||
| pser.index = pd.MultiIndex.from_tuples([('x', 'a'), ('x', None), ('y', 'c')]) | ||
| kser = ks.from_pandas(pser) | ||
|
|
||
| self.assert_eq(kser.value_counts(normalize=True), | ||
| pser.value_counts(normalize=True), almost=True) | ||
| self.assert_eq(kser.value_counts(ascending=True), | ||
| pser.value_counts(ascending=True), almost=True) | ||
| self.assert_eq(kser.value_counts(normalize=True, dropna=False), | ||
| pser.value_counts(normalize=True, dropna=False), almost=True) | ||
| self.assert_eq(kser.value_counts(ascending=True, dropna=False), | ||
| pser.value_counts(ascending=True, dropna=False), almost=True) | ||
|
|
||
| self.assert_eq(kser.index.value_counts(normalize=True), | ||
| pser.index.value_counts(normalize=True), almost=True) | ||
| self.assert_eq(kser.index.value_counts(ascending=True), | ||
| pser.index.value_counts(ascending=True), almost=True) | ||
| self.assert_eq(kser.index.value_counts(normalize=True, dropna=False), | ||
| pser.index.value_counts(normalize=True, dropna=False), almost=True) | ||
| self.assert_eq(kser.index.value_counts(ascending=True, dropna=False), | ||
| pser.index.value_counts(ascending=True, dropna=False), almost=True) | ||
|
|
||
| # Series with MultiIndex some of index is NaN. | ||
| # This test only available for pandas >= 0.24. | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why are these only for
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. because >>> pd.__version__
'0.23.0'
>>> pidx = pd.MultiIndex.from_tuples([('x', 'a'), None, ('y', 'c')])
Traceback (most recent call last):
...
TypeError: object of type 'NoneType' has no len()so i think this test should be ran on |
||
| if LooseVersion(pd.__version__) >= LooseVersion("0.24"): | ||
| pser.index = pd.MultiIndex.from_tuples([('x', 'a'), None, ('y', 'c')]) | ||
| kser = ks.from_pandas(pser) | ||
|
|
||
| self.assert_eq(kser.value_counts(normalize=True), | ||
| pser.value_counts(normalize=True), almost=True) | ||
| self.assert_eq(kser.value_counts(ascending=True), | ||
| pser.value_counts(ascending=True), almost=True) | ||
| self.assert_eq(kser.value_counts(normalize=True, dropna=False), | ||
| pser.value_counts(normalize=True, dropna=False), almost=True) | ||
| self.assert_eq(kser.value_counts(ascending=True, dropna=False), | ||
| pser.value_counts(ascending=True, dropna=False), almost=True) | ||
|
|
||
| self.assert_eq(kser.index.value_counts(normalize=True), | ||
| pser.index.value_counts(normalize=True), almost=True) | ||
| self.assert_eq(kser.index.value_counts(ascending=True), | ||
| pser.index.value_counts(ascending=True), almost=True) | ||
| self.assert_eq(kser.index.value_counts(normalize=True, dropna=False), | ||
| pser.index.value_counts(normalize=True, dropna=False), almost=True) | ||
| self.assert_eq(kser.index.value_counts(ascending=True, dropna=False), | ||
| pser.index.value_counts(ascending=True, dropna=False), almost=True) | ||
|
|
||
| def test_value_counts(self): | ||
| if LooseVersion(pyspark.__version__) < LooseVersion("2.4") and \ | ||
| default_session().conf.get("spark.sql.execution.arrow.enabled") == "true": | ||
| default_session().conf.set("spark.sql.execution.arrow.enabled", "false") | ||
| try: | ||
| self._test_value_counts() | ||
| finally: | ||
| default_session().conf.set("spark.sql.execution.arrow.enabled", "true") | ||
| self.assertRaises( | ||
| RuntimeError, | ||
| lambda: ks.MultiIndex.from_tuples([('x', 'a'), ('x', 'b')]).value_counts()) | ||
| else: | ||
| self._test_value_counts() | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you disable arrow execution only when pyspark < 2.4 and for MultiIndex?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. okay i will 👍 |
||
|
|
||
| def test_nsmallest(self): | ||
| sample_lst = [1, 2, 3, 4, np.nan, 6] | ||
| pser = pd.Series(sample_lst, name='x') | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Btw, I think we should do this only in
MultiIndexby overriding.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ah, that seems better. thanks for the advice!