1616
1717import typing
1818
19+ import pandas as pd
20+
1921from bigframes import dataframe , dtypes , series
20- from bigframes .core .reshape import api as rs
22+ from bigframes .core import agg_expressions , blocks
23+ from bigframes .operations import aggregations
24+
25+ _DEFAULT_DTYPES = (
26+ dtypes .NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE + dtypes .TEMPORAL_NUMERIC_BIGFRAMES_TYPES
27+ )
2128
2229
2330def describe (
@@ -30,100 +37,88 @@ def describe(
3037 elif not isinstance (input , dataframe .DataFrame ):
3138 raise TypeError (f"Unsupported type: { type (input )} " )
3239
40+ block = input ._block
41+
42+ describe_block = _describe (block , columns = block .value_columns , include = include )
43+ # we override default stack behavior, because we want very specific ordering
44+ stack_cols = pd .Index (
45+ [
46+ "count" ,
47+ "nunique" ,
48+ "top" ,
49+ "freq" ,
50+ "mean" ,
51+ "std" ,
52+ "min" ,
53+ "25%" ,
54+ "50%" ,
55+ "75%" ,
56+ "max" ,
57+ ]
58+ ).intersection (describe_block .column_labels .get_level_values (- 1 ))
59+ describe_block = describe_block .stack (override_labels = stack_cols )
60+
61+ return dataframe .DataFrame (describe_block ).droplevel (level = 0 )
62+
63+
def _describe(
    block: blocks.Block,
    columns: typing.Sequence[str],
    include: None | typing.Literal["all"] = None,
    *,
    as_index: bool = True,
    by_col_ids: typing.Sequence[str] = (),
    dropna: bool = False,
) -> blocks.Block:
    """Aggregate summary statistics for the given columns of ``block``.

    For each selected column, the aggregations returned by
    ``_get_aggs_for_dtype`` for its dtype are computed in a single
    ``block.aggregate`` call. Each result column is labelled with the
    source column's label extended by one extra level holding the
    aggregation's ``name``.

    Args:
        block: The block whose columns are summarized.
        columns: Ids of the candidate columns, in output order.
        include: ``None`` restricts the summary to columns whose dtype is
            in ``_DEFAULT_DTYPES``; if no candidate column has such a dtype
            it is treated as ``"all"``. ``"all"`` considers every
            candidate column.
        as_index: When false, ``reset_index(drop=False)`` is applied to the
            aggregated block before returning it.
        by_col_ids: Column ids forwarded to ``block.aggregate`` as
            ``by_column_ids``.
        dropna: Forwarded unchanged to ``block.aggregate``.

    Returns:
        The aggregated block.
    """
    # Look up each dtype once; a list (not a dict) keeps positional pairing
    # with ``columns`` even if an id were to appear twice.
    col_dtypes = [block.expr.get_column_type(col_id) for col_id in columns]

    # include=None behaves like include='all' if no numeric columns present
    if include is None and not any(
        dtype in _DEFAULT_DTYPES for dtype in col_dtypes
    ):
        include = "all"

    # Loop-invariant: single-level labels are scalars and must be wrapped
    # in a tuple before the aggregation name can be appended.
    single_level = block.column_labels.nlevels == 1

    stats: list[agg_expressions.Aggregation] = []
    column_labels: list[typing.Hashable] = []
    for col_id, dtype in zip(columns, col_dtypes):
        if include != "all" and dtype not in _DEFAULT_DTYPES:
            continue
        label = block.col_id_to_label[col_id]
        agg_ops = _get_aggs_for_dtype(dtype)
        stats.extend(op.as_expr(col_id) for op in agg_ops)
        label_tuple = (label,) if single_level else label
        column_labels.extend((*label_tuple, op.name) for op in agg_ops)  # type: ignore

    # NOTE(review): if no aggregation was selected, ``column_labels`` is empty
    # and pd.Index presumably builds a flat Index instead of a MultiIndex --
    # confirm callers tolerate that.
    agg_block, _ = block.aggregate(
        by_column_ids=by_col_ids,
        aggregations=stats,
        dropna=dropna,
        column_labels=pd.Index(column_labels, name=(*block.column_labels.names, None)),
    )
    return agg_block if as_index else agg_block.reset_index(drop=False)
100+
101+
def _get_aggs_for_dtype(dtype) -> list[aggregations.UnaryAggregateOp]:
    """Return the aggregation ops used to describe a column of ``dtype``.

    * dtypes in ``NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE``: count, mean, std,
      min, ``ApproxQuartilesOp`` for quartiles 1, 2 and 3, then max.
    * dtypes in ``TEMPORAL_NUMERIC_BIGFRAMES_TYPES``: count only.
    * string, bool, bytes and time dtypes: count and nunique.
    * anything else: an empty list.
    """
    if dtype in dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE:
        quartile_ops = [aggregations.ApproxQuartilesOp(q) for q in (1, 2, 3)]
        return [
            aggregations.count_op,
            aggregations.mean_op,
            aggregations.std_op,
            aggregations.min_op,
            *quartile_ops,
            aggregations.max_op,
        ]

    if dtype in dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES:
        return [aggregations.count_op]

    count_and_nunique_dtypes = [
        dtypes.STRING_DTYPE,
        dtypes.BOOL_DTYPE,
        dtypes.BYTES_DTYPE,
        dtypes.TIME_DTYPE,
    ]
    if dtype in count_and_nunique_dtypes:
        return [aggregations.count_op, aggregations.nunique_op]

    # No summary statistics are defined for any other dtype.
    return []
0 commit comments