2020
2121from bigframes import dataframe , dtypes , series
2222from bigframes .core import agg_expressions , blocks
23- from bigframes .core .reshape import api as rs
2423from bigframes .operations import aggregations
2524
2625_DEFAULT_DTYPES = (
@@ -38,34 +37,11 @@ def describe(
3837 elif not isinstance (input , dataframe .DataFrame ):
3938 raise TypeError (f"Unsupported type: { type (input )} " )
4039
41- if include is None :
42- numeric_df = _select_dtypes (
43- input ,
44- _DEFAULT_DTYPES ,
45- )
46- if len (numeric_df .columns ) == 0 :
47- # Describe eligible non-numeric columns
48- return _describe_non_numeric (input )
49-
50- # Otherwise, only describe numeric columns
51- return _describe_numeric (input )
52-
53- elif include == "all" :
54- numeric_result = _describe_numeric (input )
55- non_numeric_result = _describe_non_numeric (input )
56-
57- if len (numeric_result .columns ) == 0 :
58- return non_numeric_result
59- elif len (non_numeric_result .columns ) == 0 :
60- return numeric_result
61- else :
62- # Use reindex after join to preserve the original column order.
63- return rs .concat (
64- [non_numeric_result , numeric_result ], axis = 1
65- )._reindex_columns (input .columns )
40+ block = input ._block
6641
67- else :
68- raise ValueError (f"Unsupported include type: { include } " )
42+ describe_block = _describe (block , columns = block .value_columns , include = include )
43+
44+ return dataframe .DataFrame (describe_block ).stack ().droplevel (level = 0 )
6945
7046
7147def _describe (
@@ -80,6 +56,13 @@ def _describe(
8056 stats : list [agg_expressions .Aggregation ] = []
8157 column_labels : list [typing .Hashable ] = []
8258
59+ # include=None behaves like include='all' if no numeric columns present
60+ if include is None :
61+ if not any (
62+ block .expr .get_column_type (col ) in _DEFAULT_DTYPES for col in columns
63+ ):
64+ include = "all"
65+
8366 for col_id in columns :
8467 label = block .col_id_to_label [col_id ]
8568 dtype = block .expr .get_column_type (col_id )
@@ -94,7 +77,7 @@ def _describe(
9477 by_column_ids = by_col_ids ,
9578 aggregations = stats ,
9679 dropna = dropna ,
97- column_labels = pd .Index (column_labels , name = (* block .index .names , None )),
80+ column_labels = pd .Index (column_labels , name = (* block .column_labels .names , None )),
9881 )
9982 return agg_block if as_index else agg_block .reset_index (drop = False )
10083
@@ -122,70 +105,3 @@ def _get_aggs_for_dtype(dtype) -> list[aggregations.UnaryAggregateOp]:
122105 return [aggregations .count_op , aggregations .nunique_op ]
123106 else :
124107 return []
125-
126-
127- def _describe_numeric (df : dataframe .DataFrame ) -> dataframe .DataFrame :
128- number_df_result = typing .cast (
129- dataframe .DataFrame ,
130- _select_dtypes (df , dtypes .NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE ).agg (
131- [
132- "count" ,
133- "mean" ,
134- "std" ,
135- "min" ,
136- "25%" ,
137- "50%" ,
138- "75%" ,
139- "max" ,
140- ]
141- ),
142- )
143- temporal_df_result = typing .cast (
144- dataframe .DataFrame ,
145- _select_dtypes (df , dtypes .TEMPORAL_NUMERIC_BIGFRAMES_TYPES ).agg (["count" ]),
146- )
147-
148- if len (number_df_result .columns ) == 0 :
149- return temporal_df_result
150- elif len (temporal_df_result .columns ) == 0 :
151- return number_df_result
152- else :
153- import bigframes .core .reshape .api as rs
154-
155- original_columns = _select_dtypes (
156- df ,
157- _DEFAULT_DTYPES ,
158- ).columns
159-
160- # Use reindex after join to preserve the original column order.
161- return rs .concat (
162- [number_df_result , temporal_df_result ],
163- axis = 1 ,
164- )._reindex_columns (original_columns )
165-
166-
167- def _describe_non_numeric (df : dataframe .DataFrame ) -> dataframe .DataFrame :
168- return typing .cast (
169- dataframe .DataFrame ,
170- _select_dtypes (
171- df ,
172- [
173- dtypes .STRING_DTYPE ,
174- dtypes .BOOL_DTYPE ,
175- dtypes .BYTES_DTYPE ,
176- dtypes .TIME_DTYPE ,
177- ],
178- ).agg (["count" , "nunique" ]),
179- )
180-
181-
182- def _select_dtypes (
183- df : dataframe .DataFrame , dtypes : typing .Sequence [dtypes .Dtype ]
184- ) -> dataframe .DataFrame :
185- """Selects columns without considering inheritance relationships."""
186- columns = [
187- col_id
188- for col_id , dtype in zip (df ._block .value_columns , df ._block .dtypes )
189- if dtype in dtypes
190- ]
191- return dataframe .DataFrame (df ._block .select_columns (columns ))
0 commit comments