|
22 | 22 | import inspect |
23 | 23 | from collections import Callable, OrderedDict, namedtuple |
24 | 24 | from functools import partial |
| 25 | +from itertools import product |
| 26 | +from operator import itemgetter |
25 | 27 | from typing import Any, List, Tuple, Union |
26 | 28 |
|
27 | 29 | import numpy as np |
|
33 | 35 | from pyspark.sql.functions import PandasUDFType, pandas_udf, Column |
34 | 36 |
|
35 | 37 | from databricks import koalas as ks # For running doctests and reference resolution in PyCharm. |
| 38 | +from databricks.koalas.base import _column_op |
36 | 39 | from databricks.koalas.typedef import _infer_return_type |
37 | 40 | from databricks.koalas.frame import DataFrame |
38 | 41 | from databricks.koalas.internal import (_InternalFrame, HIDDEN_COLUMNS, NATURAL_ORDER_COLUMN_NAME, |
@@ -210,6 +213,13 @@ def _spark_groupby(kdf, func, groupkeys): |
210 | 213 | if aggfunc == "nunique": |
211 | 214 | reordered.append( |
212 | 215 | F.expr('count(DISTINCT `{0}`) as `{1}`'.format(name, data_col))) |
| 216 | + |
| 217 | + # Implement "quartiles" aggregate function for ``describe``. |
| 218 | + elif aggfunc == "quartiles": |
| 219 | + reordered.append( |
| 220 | + F.expr('percentile_approx(`{0}`, array(0.25, 0.5, 0.75)) as `{1}`'.format( |
| 221 | + name, data_col))) |
| 222 | + |
213 | 223 | else: |
214 | 224 | reordered.append(F.expr('{1}(`{0}`) as `{2}`'.format(name, aggfunc, data_col))) |
215 | 225 | sdf = sdf.groupby(*groupkey_cols).agg(*reordered) |
@@ -1936,6 +1946,34 @@ def _shift(self, periods, fill_value): |
1936 | 1946 | for c in applied]) |
1937 | 1947 | return DataFrame(internal) |
1938 | 1948 |
|
def describe(self):
    """
    Compute per-group summary statistics for every aggregated column.

    The statistics are obtained from a single ``agg`` call requesting
    ``count``, ``mean``, ``std``, ``min``, ``quartiles`` and ``max``. Each
    ``quartiles`` column (an array of three values) is then split into
    separate ``25%``, ``50%`` and ``75%`` columns.

    Returns
    -------
    The aggregated frame, indexed by the grouping keys, with columns
    ordered by aggregated column and then by statistic
    (``count, mean, std, min, 25%, 50%, 75%, max``), cast to ``"float64"``.
    """
    percentile_labels = ["25%", "50%", "75%"]
    summary = self.agg(["count", "mean", "std", "min", "quartiles", "max"]).reset_index()

    # Replace every (column, "quartiles") array column by three scalar columns,
    # one per percentile label. ``summary`` is rebound (not mutated) inside the
    # loop, so the iteration keeps walking the columns of the original frame.
    for label, content in summary.iteritems():
        if label[1] != "quartiles":
            continue
        pieces = {}
        for position, percentile in enumerate(percentile_labels):
            # NOTE(review): ``to_numpy()`` presumably materialises the values
            # locally before they are wrapped in a new frame -- confirm.
            extracted = _column_op(itemgetter(position))(content)
            pieces[(label[0], percentile)] = extracted.to_numpy()
        exploded = ks.DataFrame(pieces)
        summary = summary.drop(label).join(exploded)

    # Restore the grouping keys as the index; after ``reset_index`` they are
    # addressed as (key, "") column labels.
    input_groupnames = [key.name for key in self._groupkeys]
    index_labels = [(name, "") for name in input_groupnames]
    summary.set_index(index_labels, inplace=True)
    summary.index.names = input_groupnames

    # Order columns by aggregated column first, then by statistic.
    stats = ["count", "mean", "std", "min"] + percentile_labels + ["max"]
    ordered_labels = [(col.name, stat) for col in self._agg_columns for stat in stats]
    summary = summary[ordered_labels]

    # Cast columns to ``"float64"`` to match `pandas.DataFrame.groupby`.
    return summary.astype("float64")
| 1976 | + |
1939 | 1977 |
|
1940 | 1978 | class SeriesGroupBy(GroupBy): |
1941 | 1979 |
|
|
0 commit comments