Skip to content

Commit acad2de

Browse files
fix(dataframe): handle FixedSizeBinary in describe
Skip min/max describe summaries for types that do not support them (Boolean and the binary types Binary, LargeBinary, BinaryView, and FixedSizeBinary) so that describe falls back to nulls instead of attempting an invalid Utf8 cast. Add a regression test for FixedSizeBinary, and rerun the dataframe describe integration tests for the problem reported in #20273.
1 parent 603bfb4 commit acad2de

File tree

2 files changed

+54
-6
lines changed

2 files changed

+54
-6
lines changed

datafusion/core/src/dataframe/mod.rs

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1015,6 +1015,16 @@ impl DataFrame {
10151015
//the functions now supported
10161016
let supported_describe_functions =
10171017
vec!["count", "null_count", "mean", "std", "min", "max", "median"];
1018+
let supports_describe_min_max = |data_type: &DataType| {
1019+
!matches!(
1020+
data_type,
1021+
DataType::Boolean
1022+
| DataType::Binary
1023+
| DataType::LargeBinary
1024+
| DataType::BinaryView
1025+
| DataType::FixedSizeBinary(_)
1026+
)
1027+
};
10181028

10191029
let original_schema_fields = self.schema().fields().iter();
10201030

@@ -1075,9 +1085,7 @@ impl DataFrame {
10751085
vec![],
10761086
original_schema_fields
10771087
.clone()
1078-
.filter(|f| {
1079-
!matches!(f.data_type(), DataType::Binary | DataType::Boolean)
1080-
})
1088+
.filter(|f| supports_describe_min_max(f.data_type()))
10811089
.map(|f| min(ident(f.name())).alias(f.name()))
10821090
.collect::<Vec<_>>(),
10831091
),
@@ -1086,9 +1094,7 @@ impl DataFrame {
10861094
vec![],
10871095
original_schema_fields
10881096
.clone()
1089-
.filter(|f| {
1090-
!matches!(f.data_type(), DataType::Binary | DataType::Boolean)
1091-
})
1097+
.filter(|f| supports_describe_min_max(f.data_type()))
10921098
.map(|f| max(ident(f.name())).alias(f.name()))
10931099
.collect::<Vec<_>>(),
10941100
),

datafusion/core/tests/dataframe/describe.rs

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,10 @@ use datafusion::prelude::{ParquetReadOptions, SessionContext};
1919
use datafusion_common::test_util::batches_to_string;
2020
use datafusion_common::{Result, test_util::parquet_test_data};
2121
use insta::assert_snapshot;
22+
use std::sync::Arc;
23+
24+
use arrow::array::{FixedSizeBinaryArray, RecordBatch};
25+
use arrow::datatypes::{DataType, Field, Schema};
2226

2327
#[tokio::test]
2428
async fn describe() -> Result<()> {
@@ -81,6 +85,44 @@ async fn describe_boolean_binary() -> Result<()> {
8185
Ok(())
8286
}
8387

88+
/// Regression test: `describe()` on a table whose only column is a nullable
/// `FixedSizeBinary(3)` must succeed and report `count`/`null_count`, with
/// every other summary row (`mean`, `std`, `min`, `max`, `median`) rendered
/// as `null`.
#[tokio::test]
89+
async fn describe_fixed_size_binary() -> Result<()> {
90+
let ctx = SessionContext::new();
91+
// One nullable column "fsb" holding two 3-byte values and one null.
let batch = RecordBatch::try_new(
92+
Arc::new(Schema::new(vec![Field::new(
93+
"fsb",
94+
DataType::FixedSizeBinary(3),
95+
true,
96+
)])),
97+
vec![Arc::new(FixedSizeBinaryArray::from(vec![
98+
Some(&[1_u8, 2, 3][..]),
99+
None,
100+
Some(&[4_u8, 5, 6][..]),
101+
]))],
102+
)?;
103+
ctx.register_batch("test", batch)?;
104+
105+
// Run describe over the registered in-memory table and collect the result batches.
let result = ctx.table("test").await?.describe().await?.collect().await?;
106+
107+
// Expected: 2 non-null values, 1 null, and null for all remaining statistics.
assert_snapshot!(
108+
batches_to_string(&result),
109+
@r"
110+
+------------+------+
111+
| describe | fsb |
112+
+------------+------+
113+
| count | 2 |
114+
| null_count | 1 |
115+
| mean | null |
116+
| std | null |
117+
| min | null |
118+
| max | null |
119+
| median | null |
120+
+------------+------+
121+
"
122+
);
123+
Ok(())
124+
}
125+
84126
#[tokio::test]
85127
async fn describe_null() -> Result<()> {
86128
let ctx = parquet_context().await;

0 commit comments

Comments
 (0)