Skip to content

Commit 89f4c84

Browse files
committed
Write a simplified test case to show the problem
1 parent 6fce943 commit 89f4c84

1 file changed

Lines changed: 82 additions & 0 deletions

File tree

kernel/tests/clustering_e2e.rs

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
use std::collections::HashMap;
88
use std::sync::Arc;
99

10+
use delta_kernel::arrow::array::{ArrayRef, Int32Array};
1011
use delta_kernel::committer::FileSystemCommitter;
1112
use delta_kernel::expressions::ColumnName;
1213
use delta_kernel::schema::{DataType, StructField, StructType};
@@ -138,3 +139,84 @@ async fn test_clustered_table_write_and_checkpoint(
138139

139140
Ok(())
140141
}
142+
143+
/// Regression test: writing a batch where a clustering column has ALL null values should succeed.
144+
///
145+
/// `collect_stats` (commit 76d480f0) omits `minValues`/`maxValues` fields for all-null columns,
146+
/// but `StatsVerifier` tries to extract `stats.minValues.<column>` before checking the
147+
/// `nullCount == numRecords` condition. The column extraction fails with:
148+
/// "Column stats.minValues.<column> not found in the data"
149+
/// because the field is missing from the stats StructArray entirely.
150+
///
151+
/// The verifier should tolerate missing min/max fields when the column is all-null.
///
/// Steps exercised by this test:
/// 1. Create and commit a table clustered on `category` and `region_id`.
/// 2. Write three rows whose `region_id` values are all null.
/// 3. Check the resulting snapshot version, the number of rows read back by a scan, and the
///    per-file stats (`numRecords`, `nullCount`, and the absence of `region_id` under
///    `minValues`/`maxValues`).
///
/// # Panics
///
/// Panics (failing the test) if any setup, commit, write, or read step returns an error, or
/// if any of the assertions does not hold.
152+
// Runs on a multi-threaded Tokio runtime with two worker threads.
#[tokio::test(flavor = "multi_thread", worker_threads = 2)]
153+
async fn test_clustered_table_write_all_null_clustering_column() {
154+
// `_temp_dir` stays bound for the whole test; presumably it owns the temporary directory
// behind `table_path` and removes it on drop (TODO confirm against `test_table_setup_mt`).
let (_temp_dir, table_path, engine) = test_table_setup_mt().unwrap();
155+
// Two-column schema. The trailing boolean is presumably the nullable flag of
// `StructField::new` (confirm against the API docs): `category` is created with `false`,
// `region_id` with `true`, and only `region_id` receives nulls below.
let schema = Arc::new(
156+
StructType::try_new(vec![
157+
StructField::new("category", DataType::STRING, false),
158+
StructField::new("region_id", DataType::INTEGER, true),
159+
])
160+
.unwrap(),
161+
);
162+
163+
// Create the table with a clustered data layout on both columns ("category" and
// "region_id"), then build and commit the create-table transaction.
164+
let create_result = create_table(&table_path, schema, "Test/1.0")
165+
.with_data_layout(DataLayout::Clustered {
166+
columns: vec![
167+
ColumnName::new(["category"]),
168+
ColumnName::new(["region_id"]),
169+
],
170+
})
171+
.build(engine.as_ref(), Box::new(FileSystemCommitter::new()))
172+
.unwrap()
173+
.commit(engine.as_ref())
174+
.unwrap();
175+
176+
// Take the post-commit snapshot from a successful commit; any other commit result aborts
// the test with a message that includes the unexpected value.
let snapshot = match create_result {
177+
CommitResult::CommittedTransaction(committed) => committed
178+
.post_commit_snapshot()
179+
.expect("post-commit snapshot should exist")
180+
.clone(),
181+
other => panic!("Expected CommittedTransaction, got: {other:?}"),
182+
};
183+
184+
// Write a batch of three rows where region_id is ALL nulls and category is "a", "b", "c".
185+
// This should succeed: all-null clustering columns are valid.
186+
let all_null_region: ArrayRef = Arc::new(Int32Array::from(vec![None, None, None]));
187+
let batch = generate_batch(vec![
188+
("category", vec!["a", "b", "c"].into_array()),
189+
("region_id", all_null_region),
190+
])
191+
.unwrap();
192+
193+
// BUG: This write currently fails with "Column stats.minValues.region_id not found in the data"
194+
// because collect_stats omits minValues/maxValues for all-null columns,
195+
// but StatsVerifier tries to extract the column before checking nullCount == numRecords.
196+
// The new binding shadows the post-create snapshot with the snapshot returned by the write.
let snapshot = write_batch_to_table(&snapshot, engine.as_ref(), batch, HashMap::new())
197+
.await
198+
.unwrap();
199+
// Presumably the create-table commit is version 0, so this write is expected at version 1.
assert_eq!(snapshot.version(), 1);
200+
201+
// Verify the data is readable: scan the post-write snapshot and expect the three rows
// written above, summed across all returned batches.
202+
let scan = snapshot.clone().scan_builder().build().unwrap();
203+
let batches = read_scan(&scan, engine.clone()).unwrap();
204+
let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum();
205+
assert_eq!(total_rows, 3);
206+
207+
// Verify stats: exactly one add entry is expected. region_id should have nullCount=3, but
208+
// minValues/maxValues should NOT contain region_id (since all values are null, there's
// nothing to aggregate).
209+
let add_infos = read_add_infos(&snapshot, engine.as_ref()).unwrap();
210+
assert_eq!(add_infos.len(), 1);
211+
let stats = add_infos[0].stats.as_ref().expect("should have stats");
212+
assert_eq!(stats["numRecords"], 3);
213+
assert_eq!(stats["nullCount"]["region_id"], 3);
214+
assert!(
215+
stats["minValues"].get("region_id").is_none(),
216+
"minValues should not contain region_id when all values are null"
217+
);
218+
assert!(
219+
stats["maxValues"].get("region_id").is_none(),
220+
"maxValues should not contain region_id when all values are null"
221+
);
222+
}

0 commit comments

Comments
 (0)