
Commit 7aae45e

deepyaman authored and HyukjinKwon committed
DataFrameGroupBy.describe (#1168)
Close #1166

Manual test:

```
>>> import databricks.koalas as ks
>>> df = ks.DataFrame({'a': [1, 1, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
20/01/04 18:16:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
>>> df.groupby('a').describe()
20/01/04 18:17:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
20/01/04 18:17:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
20/01/04 18:17:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
      b                                         c
  count mean       std min 25% 50% 75% max  count mean       std min 25% 50% 75% max
a
1     2  4.5  0.707107   4   4   4   5   5      2  7.5  0.707107   7   7   7   8   8
3     1  6.0       NaN   6   6   6   6   6      1  9.0       NaN   9   9   9   9   9
```

TODO:
- [x] Fix formatting (line length over 100, etc.)
- [ ] Add docstring
- [x] Add unit tests
- [x] Reorder percentiles
1 parent f94ef62 commit 7aae45e

File tree

3 files changed: +61 -1 lines changed

databricks/koalas/groupby.py

Lines changed: 38 additions & 0 deletions
```diff
@@ -22,6 +22,8 @@
 import inspect
 from collections import Callable, OrderedDict, namedtuple
 from functools import partial
+from itertools import product
+from operator import itemgetter
 from typing import Any, List, Tuple, Union
 
 import numpy as np
```
```diff
@@ -33,6 +35,7 @@
 from pyspark.sql.functions import PandasUDFType, pandas_udf, Column
 
 from databricks import koalas as ks  # For running doctests and reference resolution in PyCharm.
+from databricks.koalas.base import _column_op
 from databricks.koalas.typedef import _infer_return_type
 from databricks.koalas.frame import DataFrame
 from databricks.koalas.internal import (_InternalFrame, HIDDEN_COLUMNS, NATURAL_ORDER_COLUMN_NAME,
```
```diff
@@ -210,6 +213,13 @@ def _spark_groupby(kdf, func, groupkeys):
         if aggfunc == "nunique":
             reordered.append(
                 F.expr('count(DISTINCT `{0}`) as `{1}`'.format(name, data_col)))
+
+        # Implement "quartiles" aggregate function for ``describe``.
+        elif aggfunc == "quartiles":
+            reordered.append(
+                F.expr('percentile_approx(`{0}`, array(0.25, 0.5, 0.75)) as `{1}`'.format(
+                    name, data_col)))
+
         else:
             reordered.append(F.expr('{1}(`{0}`) as `{2}`'.format(name, aggfunc, data_col)))
     sdf = sdf.groupby(*groupkey_cols).agg(*reordered)
```
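For reference, a minimal standalone sketch of the Spark SQL expression the new `quartiles` branch generates (assumes a local PySpark session; the data and the `b_quartiles` alias are illustrative, not part of the commit):

```python
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(1, 4), (1, 5), (3, 6)], ["a", "b"])

# percentile_approx returns an array of approximate 25th/50th/75th percentiles;
# describe() later splits this array into the "25%"/"50%"/"75%" columns.
quartiles = df.groupby("a").agg(
    F.expr("percentile_approx(`b`, array(0.25, 0.5, 0.75)) as `b_quartiles`"))
quartiles.show()
```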
```diff
@@ -1936,6 +1946,34 @@ def _shift(self, periods, fill_value):
                                for c in applied])
         return DataFrame(internal)
 
+    def describe(self):
+        kdf = self.agg(["count", "mean", "std", "min", "quartiles", "max"]).reset_index()
+        formatted_percentiles = ["25%", "50%", "75%"]
+
+        # Split "quartiles" columns into first, second, and third quartiles.
+        for label, content in kdf.iteritems():
+            if label[1] == "quartiles":
+                exploded = ks.DataFrame(
+                    {
+                        (label[0], x): _column_op(itemgetter(i))(content).to_numpy()
+                        for i, x in enumerate(formatted_percentiles)
+                    }
+                )
+                kdf = kdf.drop(label).join(exploded)
+
+        # Reindex the DataFrame to reflect initial grouping and agg columns.
+        input_groupnames = [s.name for s in self._groupkeys]
+        kdf.set_index([(key, "") for key in input_groupnames], inplace=True)
+        kdf.index.names = input_groupnames
+
+        # Reorder columns lexicographically by agg column followed by stats.
+        agg_cols = (col.name for col in self._agg_columns)
+        stats = ["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
+        kdf = kdf[list(product(agg_cols, stats))]
+
+        # Cast columns to ``"float64"`` to match `pandas.DataFrame.groupby`.
+        return kdf.astype("float64")
+
 
 class SeriesGroupBy(GroupBy):
 
```
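For context, a pure-pandas sketch of what the body of `describe` does after the aggregation. The stand-in frame below is hypothetical, and plain `Series.map` replaces Koalas' `_column_op(itemgetter(i))` path, but the splitting and reordering steps mirror the code above:

```python
from itertools import product
from operator import itemgetter

import pandas as pd

# Hypothetical stand-in for self.agg([...]).reset_index(): each "quartiles"
# cell holds the [25%, 50%, 75%] array produced by percentile_approx.
kdf = pd.DataFrame({
    ("a", ""): [1, 3],
    ("b", "count"): [2.0, 1.0],
    ("b", "quartiles"): [[4, 4, 5], [6, 6, 6]],
})

formatted_percentiles = ["25%", "50%", "75%"]

# Split each "quartiles" column into separate 25%/50%/75% columns.
for label in [c for c in kdf.columns if c[1] == "quartiles"]:
    exploded = pd.DataFrame({
        (label[0], x): kdf[label].map(itemgetter(i))
        for i, x in enumerate(formatted_percentiles)
    })
    kdf = kdf.drop(columns=[label]).join(exploded)

# Restore the group key as the index and reorder columns per agg column.
kdf = kdf.set_index([("a", "")])
kdf.index.names = ["a"]
kdf = kdf[list(product(["b"], ["count"] + formatted_percentiles))]
print(kdf.astype("float64"))
```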
databricks/koalas/missing/groupby.py

Lines changed: 0 additions & 1 deletion
```diff
@@ -50,7 +50,6 @@ class _MissingPandasLikeDataFrameGroupBy(object):
     # Functions
     boxplot = unsupported_function('boxplot')
     cumcount = unsupported_function('cumcount')
-    describe = unsupported_function('describe')
     get_group = unsupported_function('get_group')
     median = unsupported_function('median')
     ngroup = unsupported_function('ngroup')
```

databricks/koalas/tests/test_groupby.py

Lines changed: 23 additions & 0 deletions
```diff
@@ -17,6 +17,7 @@
 import unittest
 import inspect
 from distutils.version import LooseVersion
+from itertools import product
 
 import numpy as np
 import pandas as pd
```
```diff
@@ -257,6 +258,28 @@ def test_aggregate_relabel(self):
         )
         self.assert_eq(agg_kdf, agg_pdf)
 
+    def test_describe(self):
+        pdf = pd.DataFrame({"a": [1, 1, 3], "b": [4, 5, 6], "c": [7, 8, 9]})
+        kdf = ks.from_pandas(pdf)
+
+        describe_pdf = pdf.groupby("a").describe().sort_index()
+        describe_kdf = kdf.groupby("a").describe().sort_index()
+
+        # Check that non-percentile columns are equal.
+        agg_cols = [col.name for col in kdf.groupby("a")._agg_columns]
+        formatted_percentiles = ["25%", "50%", "75%"]
+        self.assert_eq(describe_kdf.drop(list(product(agg_cols, formatted_percentiles))),
+                       describe_pdf.drop(columns=formatted_percentiles, level=1))
+
+        # Check that percentile columns are equal.
+        percentiles = [0.25, 0.5, 0.75]
+        # The interpolation argument is yet to be implemented in Koalas.
+        quantile_pdf = pdf.groupby("a").quantile(percentiles, interpolation="nearest")
+        quantile_pdf = quantile_pdf.unstack(level=1).astype(float)
+        non_percentile_stats = ["count", "mean", "std", "min", "max"]
+        self.assert_eq(describe_kdf.drop(list(product(agg_cols, non_percentile_stats))),
+                       quantile_pdf.rename(columns="{:.0%}".format, level=1))
+
     def test_all_any(self):
         pdf = pd.DataFrame({'A': [1, 1, 2, 2, 3, 3, 4, 4, 5, 5],
                             'B': [True, True, True, False, False, False, None, True, None, False]})
```
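The percentile comparison leans on `interpolation="nearest"` because Spark's `percentile_approx` returns actual data values rather than interpolating between them. A small standalone check of that step (using the same frame as the test; not part of the commit):

```python
import pandas as pd

pdf = pd.DataFrame({"a": [1, 1, 3], "b": [4, 5, 6], "c": [7, 8, 9]})

# With the default linear interpolation the 25th percentile of b for group
# a == 1 would be 4.25; "nearest" picks an actual data value (4) instead,
# which lines up with the percentile_approx-based output of describe().
quantile_pdf = pdf.groupby("a").quantile([0.25, 0.5, 0.75], interpolation="nearest")

# One row per group, with 0.25/0.5/0.75 relabelled as "25%"/"50%"/"75%".
print(quantile_pdf.unstack(level=1).rename(columns="{:.0%}".format, level=1))
```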
