Skip to content
This repository was archived by the owner on Apr 1, 2026. It is now read-only.

Commit fc83189

Browse files
refactor describe to share code path
1 parent ccdea04 commit fc83189

File tree

1 file changed

+12
-96
lines changed

1 file changed

+12
-96
lines changed

bigframes/pandas/core/methods/describe.py

Lines changed: 12 additions & 96 deletions
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,6 @@
2020

2121
from bigframes import dataframe, dtypes, series
2222
from bigframes.core import agg_expressions, blocks
23-
from bigframes.core.reshape import api as rs
2423
from bigframes.operations import aggregations
2524

2625
_DEFAULT_DTYPES = (
@@ -38,34 +37,11 @@ def describe(
3837
elif not isinstance(input, dataframe.DataFrame):
3938
raise TypeError(f"Unsupported type: {type(input)}")
4039

41-
if include is None:
42-
numeric_df = _select_dtypes(
43-
input,
44-
_DEFAULT_DTYPES,
45-
)
46-
if len(numeric_df.columns) == 0:
47-
# Describe eligible non-numeric columns
48-
return _describe_non_numeric(input)
49-
50-
# Otherwise, only describe numeric columns
51-
return _describe_numeric(input)
52-
53-
elif include == "all":
54-
numeric_result = _describe_numeric(input)
55-
non_numeric_result = _describe_non_numeric(input)
56-
57-
if len(numeric_result.columns) == 0:
58-
return non_numeric_result
59-
elif len(non_numeric_result.columns) == 0:
60-
return numeric_result
61-
else:
62-
# Use reindex after join to preserve the original column order.
63-
return rs.concat(
64-
[non_numeric_result, numeric_result], axis=1
65-
)._reindex_columns(input.columns)
40+
block = input._block
6641

67-
else:
68-
raise ValueError(f"Unsupported include type: {include}")
42+
describe_block = _describe(block, columns=block.value_columns, include=include)
43+
44+
return dataframe.DataFrame(describe_block).stack().droplevel(level=0)
6945

7046

7147
def _describe(
@@ -80,6 +56,13 @@ def _describe(
8056
stats: list[agg_expressions.Aggregation] = []
8157
column_labels: list[typing.Hashable] = []
8258

59+
# include=None behaves like include='all' if no numeric columns present
60+
if include is None:
61+
if not any(
62+
block.expr.get_column_type(col) in _DEFAULT_DTYPES for col in columns
63+
):
64+
include = "all"
65+
8366
for col_id in columns:
8467
label = block.col_id_to_label[col_id]
8568
dtype = block.expr.get_column_type(col_id)
@@ -94,7 +77,7 @@ def _describe(
9477
by_column_ids=by_col_ids,
9578
aggregations=stats,
9679
dropna=dropna,
97-
column_labels=pd.Index(column_labels, name=(*block.index.names, None)),
80+
column_labels=pd.Index(column_labels, name=(*block.column_labels.names, None)),
9881
)
9982
return agg_block if as_index else agg_block.reset_index(drop=False)
10083

@@ -122,70 +105,3 @@ def _get_aggs_for_dtype(dtype) -> list[aggregations.UnaryAggregateOp]:
122105
return [aggregations.count_op, aggregations.nunique_op]
123106
else:
124107
return []
125-
126-
127-
def _describe_numeric(df: dataframe.DataFrame) -> dataframe.DataFrame:
128-
number_df_result = typing.cast(
129-
dataframe.DataFrame,
130-
_select_dtypes(df, dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE).agg(
131-
[
132-
"count",
133-
"mean",
134-
"std",
135-
"min",
136-
"25%",
137-
"50%",
138-
"75%",
139-
"max",
140-
]
141-
),
142-
)
143-
temporal_df_result = typing.cast(
144-
dataframe.DataFrame,
145-
_select_dtypes(df, dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES).agg(["count"]),
146-
)
147-
148-
if len(number_df_result.columns) == 0:
149-
return temporal_df_result
150-
elif len(temporal_df_result.columns) == 0:
151-
return number_df_result
152-
else:
153-
import bigframes.core.reshape.api as rs
154-
155-
original_columns = _select_dtypes(
156-
df,
157-
_DEFAULT_DTYPES,
158-
).columns
159-
160-
# Use reindex after join to preserve the original column order.
161-
return rs.concat(
162-
[number_df_result, temporal_df_result],
163-
axis=1,
164-
)._reindex_columns(original_columns)
165-
166-
167-
def _describe_non_numeric(df: dataframe.DataFrame) -> dataframe.DataFrame:
168-
return typing.cast(
169-
dataframe.DataFrame,
170-
_select_dtypes(
171-
df,
172-
[
173-
dtypes.STRING_DTYPE,
174-
dtypes.BOOL_DTYPE,
175-
dtypes.BYTES_DTYPE,
176-
dtypes.TIME_DTYPE,
177-
],
178-
).agg(["count", "nunique"]),
179-
)
180-
181-
182-
def _select_dtypes(
183-
df: dataframe.DataFrame, dtypes: typing.Sequence[dtypes.Dtype]
184-
) -> dataframe.DataFrame:
185-
"""Selects columns without considering inheritance relationships."""
186-
columns = [
187-
col_id
188-
for col_id, dtype in zip(df._block.value_columns, df._block.dtypes)
189-
if dtype in dtypes
190-
]
191-
return dataframe.DataFrame(df._block.select_columns(columns))

0 commit comments

Comments (0)