
Commit 7aae45e

deepyaman authored and HyukjinKwon committed
DataFrameGroupBy.describe (#1168)
Close #1166

Manual test:

```
>>> import databricks.koalas as ks
>>> df = ks.DataFrame({'a': [1, 1, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
20/01/04 18:16:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
>>> df.groupby('a').describe()
20/01/04 18:17:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
20/01/04 18:17:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
20/01/04 18:17:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
      b                                         c
  count mean       std min 25% 50% 75% max  count mean       std min 25% 50% 75% max
a
1     2  4.5  0.707107   4   4   4   5   5      2  7.5  0.707107   7   7   7   8   8
3     1  6.0       NaN   6   6   6   6   6      1  9.0       NaN   9   9   9   9   9
```

TODO:
- [x] Fix formatting (line length over 100, etc.)
- [ ] Add docstring
- [x] Add unit tests
- [x] Reorder percentiles
1 parent f94ef62 commit 7aae45e

File tree

3 files changed: +61 -1 lines changed

databricks/koalas/groupby.py

Lines changed: 38 additions & 0 deletions
```diff
@@ -22,6 +22,8 @@
 import inspect
 from collections import Callable, OrderedDict, namedtuple
 from functools import partial
+from itertools import product
+from operator import itemgetter
 from typing import Any, List, Tuple, Union
 
 import numpy as np
```
```diff
@@ -33,6 +35,7 @@
 from pyspark.sql.functions import PandasUDFType, pandas_udf, Column
 
 from databricks import koalas as ks  # For running doctests and reference resolution in PyCharm.
+from databricks.koalas.base import _column_op
 from databricks.koalas.typedef import _infer_return_type
 from databricks.koalas.frame import DataFrame
 from databricks.koalas.internal import (_InternalFrame, HIDDEN_COLUMNS, NATURAL_ORDER_COLUMN_NAME,
```
```diff
@@ -210,6 +213,13 @@ def _spark_groupby(kdf, func, groupkeys):
         if aggfunc == "nunique":
             reordered.append(
                 F.expr('count(DISTINCT `{0}`) as `{1}`'.format(name, data_col)))
+
+        # Implement "quartiles" aggregate function for ``describe``.
+        elif aggfunc == "quartiles":
+            reordered.append(
+                F.expr('percentile_approx(`{0}`, array(0.25, 0.5, 0.75)) as `{1}`'.format(
+                    name, data_col)))
+
         else:
             reordered.append(F.expr('{1}(`{0}`) as `{2}`'.format(name, aggfunc, data_col)))
     sdf = sdf.groupby(*groupkey_cols).agg(*reordered)
```
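For reference, a minimal standalone sketch of the Spark SQL expression the new `quartiles` branch generates (assumes a local PySpark session; the data and the `b_quartiles` alias are illustrative, not part of the commit):

```python
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(1, 4), (1, 5), (3, 6)], ["a", "b"])

# percentile_approx returns an array of approximate 25th/50th/75th percentiles;
# describe() later splits this array into the "25%"/"50%"/"75%" columns.
quartiles = df.groupby("a").agg(
    F.expr("percentile_approx(`b`, array(0.25, 0.5, 0.75)) as `b_quartiles`"))
quartiles.show()
```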
```diff
@@ -1936,6 +1946,34 @@ def _shift(self, periods, fill_value):
                                for c in applied])
         return DataFrame(internal)
 
+    def describe(self):
+        kdf = self.agg(["count", "mean", "std", "min", "quartiles", "max"]).reset_index()
+        formatted_percentiles = ["25%", "50%", "75%"]
+
+        # Split "quartiles" columns into first, second, and third quartiles.
+        for label, content in kdf.iteritems():
+            if label[1] == "quartiles":
+                exploded = ks.DataFrame(
+                    {
+                        (label[0], x): _column_op(itemgetter(i))(content).to_numpy()
+                        for i, x in enumerate(formatted_percentiles)
+                    }
+                )
+                kdf = kdf.drop(label).join(exploded)
+
+        # Reindex the DataFrame to reflect initial grouping and agg columns.
+        input_groupnames = [s.name for s in self._groupkeys]
+        kdf.set_index([(key, "") for key in input_groupnames], inplace=True)
+        kdf.index.names = input_groupnames
+
+        # Reorder columns lexicographically by agg column followed by stats.
+        agg_cols = (col.name for col in self._agg_columns)
+        stats = ["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
+        kdf = kdf[list(product(agg_cols, stats))]
+
+        # Cast columns to ``"float64"`` to match `pandas.DataFrame.groupby`.
+        return kdf.astype("float64")
+
 
 class SeriesGroupBy(GroupBy):
 
```
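For context, a pure-pandas sketch of what the body of `describe` does after the aggregation. The stand-in frame below is hypothetical, and plain `Series.map` replaces Koalas' `_column_op(itemgetter(i))` path, but the splitting and reordering steps mirror the code above:

```python
from itertools import product
from operator import itemgetter

import pandas as pd

# Hypothetical stand-in for self.agg([...]).reset_index(): each "quartiles"
# cell holds the [25%, 50%, 75%] array produced by percentile_approx.
kdf = pd.DataFrame({
    ("a", ""): [1, 3],
    ("b", "count"): [2.0, 1.0],
    ("b", "quartiles"): [[4, 4, 5], [6, 6, 6]],
})

formatted_percentiles = ["25%", "50%", "75%"]

# Split each "quartiles" column into separate 25%/50%/75% columns.
for label in [c for c in kdf.columns if c[1] == "quartiles"]:
    exploded = pd.DataFrame({
        (label[0], x): kdf[label].map(itemgetter(i))
        for i, x in enumerate(formatted_percentiles)
    })
    kdf = kdf.drop(columns=[label]).join(exploded)

# Restore the group key as the index and reorder columns per agg column.
kdf = kdf.set_index([("a", "")])
kdf.index.names = ["a"]
kdf = kdf[list(product(["b"], ["count"] + formatted_percentiles))]
print(kdf.astype("float64"))
```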
databricks/koalas/missing/groupby.py

Lines changed: 0 additions & 1 deletion
```diff
@@ -50,7 +50,6 @@ class _MissingPandasLikeDataFrameGroupBy(object):
     # Functions
     boxplot = unsupported_function('boxplot')
     cumcount = unsupported_function('cumcount')
-    describe = unsupported_function('describe')
     get_group = unsupported_function('get_group')
     median = unsupported_function('median')
     ngroup = unsupported_function('ngroup')
```

databricks/koalas/tests/test_groupby.py

Lines changed: 23 additions & 0 deletions
```diff
@@ -17,6 +17,7 @@
 import unittest
 import inspect
 from distutils.version import LooseVersion
+from itertools import product
 
 import numpy as np
 import pandas as pd
```
```diff
@@ -257,6 +258,28 @@ def test_aggregate_relabel(self):
         )
         self.assert_eq(agg_kdf, agg_pdf)
 
+    def test_describe(self):
+        pdf = pd.DataFrame({"a": [1, 1, 3], "b": [4, 5, 6], "c": [7, 8, 9]})
+        kdf = ks.from_pandas(pdf)
+
+        describe_pdf = pdf.groupby("a").describe().sort_index()
+        describe_kdf = kdf.groupby("a").describe().sort_index()
+
+        # Check that non-percentile columns are equal.
+        agg_cols = [col.name for col in kdf.groupby("a")._agg_columns]
+        formatted_percentiles = ["25%", "50%", "75%"]
+        self.assert_eq(describe_kdf.drop(list(product(agg_cols, formatted_percentiles))),
+                       describe_pdf.drop(columns=formatted_percentiles, level=1))
+
+        # Check that percentile columns are equal.
+        percentiles = [0.25, 0.5, 0.75]
+        # The interpolation argument is yet to be implemented in Koalas.
+        quantile_pdf = pdf.groupby("a").quantile(percentiles, interpolation="nearest")
+        quantile_pdf = quantile_pdf.unstack(level=1).astype(float)
+        non_percentile_stats = ["count", "mean", "std", "min", "max"]
+        self.assert_eq(describe_kdf.drop(list(product(agg_cols, non_percentile_stats))),
+                       quantile_pdf.rename(columns="{:.0%}".format, level=1))
+
     def test_all_any(self):
         pdf = pd.DataFrame({'A': [1, 1, 2, 2, 3, 3, 4, 4, 5, 5],
                             'B': [True, True, True, False, False, False, None, True, None, False]})
```
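The percentile comparison leans on `interpolation="nearest"` because Spark's `percentile_approx` returns actual data values rather than interpolating between them. A small standalone check of that step (using the same frame as the test; not part of the commit):

```python
import pandas as pd

pdf = pd.DataFrame({"a": [1, 1, 3], "b": [4, 5, 6], "c": [7, 8, 9]})

# With the default linear interpolation the 25th percentile of b for group
# a == 1 would be 4.25; "nearest" picks an actual data value (4) instead,
# which lines up with the percentile_approx-based output of describe().
quantile_pdf = pdf.groupby("a").quantile([0.25, 0.5, 0.75], interpolation="nearest")

# One row per group, with 0.25/0.5/0.75 relabelled as "25%"/"50%"/"75%".
print(quantile_pdf.unstack(level=1).rename(columns="{:.0%}".format, level=1))
```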
