2323from collections import Callable , OrderedDict , namedtuple
2424from functools import partial
2525from itertools import product
26+ from operator import itemgetter
2627from typing import Any , List , Tuple , Union
2728
2829import numpy as np
3435from pyspark .sql .functions import PandasUDFType , pandas_udf , Column
3536
3637from databricks import koalas as ks # For running doctests and reference resolution in PyCharm.
38+ from databricks .koalas .base import _column_op
3739from databricks .koalas .typedef import _infer_return_type
3840from databricks .koalas .frame import DataFrame
3941from databricks .koalas .internal import (_InternalFrame , HIDDEN_COLUMNS , NATURAL_ORDER_COLUMN_NAME ,
@@ -1946,12 +1948,17 @@ def _shift(self, periods, fill_value):
19461948
19471949 def describe (self ):
19481950 kdf = self .agg (["count" , "mean" , "std" , "min" , "quartiles" , "max" ]).reset_index ()
1951+ formatted_percentiles = ["25%" , "50%" , "75%" ]
19491952
19501953 # Split "quartiles" columns into first, second, and third quartiles.
19511954 for label , content in kdf .iteritems ():
19521955 if label [1 ] == "quartiles" :
1953- exploded = ks .DataFrame (content .tolist ())
1954- exploded .columns = [(label [0 ], "25%" ), (label [0 ], "50%" ), (label [0 ], "75%" )]
1956+ exploded = ks .DataFrame (
1957+ {
1958+ (label [0 ], x ): _column_op (itemgetter (i ))(content ).to_numpy ()
1959+ for i , x in enumerate (formatted_percentiles )
1960+ }
1961+ )
19551962 kdf = kdf .drop (label ).join (exploded )
19561963
19571964 # Reindex the DataFrame to reflect initial grouping and agg columns.
@@ -1961,7 +1968,7 @@ def describe(self):
19611968
19621969 # Reorder columns lexicographically by agg column followed by stats.
19631970 agg_cols = (col .name for col in self ._agg_columns )
1964- stats = ["count" , "mean" , "std" , "min" , "25%" , "50%" , "75%" , "max" ]
1971+ stats = ["count" , "mean" , "std" , "min" ] + formatted_percentiles + [ "max" ]
19651972 kdf = kdf [list (product (agg_cols , stats ))]
19661973
19671974 # Cast columns to ``"float64"`` to match `pandas.DataFrame.groupby`.
0 commit comments