Skip to content

Commit 707f1eb

Browse files
committed
Replace potentially-inefficient tolist operation
1 parent 086e391 commit 707f1eb

File tree

1 file changed

+10
-3
lines changed

1 file changed

+10
-3
lines changed

databricks/koalas/groupby.py

Lines changed: 10 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,7 @@
2323
from collections import Callable, OrderedDict, namedtuple
2424
from functools import partial
2525
from itertools import product
26+
from operator import itemgetter
2627
from typing import Any, List, Tuple, Union
2728

2829
import numpy as np
@@ -34,6 +35,7 @@
3435
from pyspark.sql.functions import PandasUDFType, pandas_udf, Column
3536

3637
from databricks import koalas as ks # For running doctests and reference resolution in PyCharm.
38+
from databricks.koalas.base import _column_op
3739
from databricks.koalas.typedef import _infer_return_type
3840
from databricks.koalas.frame import DataFrame
3941
from databricks.koalas.internal import (_InternalFrame, HIDDEN_COLUMNS, NATURAL_ORDER_COLUMN_NAME,
@@ -1946,12 +1948,17 @@ def _shift(self, periods, fill_value):
19461948

19471949
def describe(self):
19481950
kdf = self.agg(["count", "mean", "std", "min", "quartiles", "max"]).reset_index()
1951+
formatted_percentiles = ["25%", "50%", "75%"]
19491952

19501953
# Split "quartiles" columns into first, second, and third quartiles.
19511954
for label, content in kdf.iteritems():
19521955
if label[1] == "quartiles":
1953-
exploded = ks.DataFrame(content.tolist())
1954-
exploded.columns = [(label[0], "25%"), (label[0], "50%"), (label[0], "75%")]
1956+
exploded = ks.DataFrame(
1957+
{
1958+
(label[0], x): _column_op(itemgetter(i))(content).to_numpy()
1959+
for i, x in enumerate(formatted_percentiles)
1960+
}
1961+
)
19551962
kdf = kdf.drop(label).join(exploded)
19561963

19571964
# Reindex the DataFrame to reflect initial grouping and agg columns.
@@ -1961,7 +1968,7 @@ def describe(self):
19611968

19621969
# Reorder columns lexicographically by agg column followed by stats.
19631970
agg_cols = (col.name for col in self._agg_columns)
1964-
stats = ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]
1971+
stats = ["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
19651972
kdf = kdf[list(product(agg_cols, stats))]
19661973

19671974
# Cast columns to ``"float64"`` to match `pandas.DataFrame.groupby`.

0 commit comments

Comments
 (0)