|
7 | 7 | use std::collections::HashMap; |
8 | 8 | use std::sync::Arc; |
9 | 9 |
|
| 10 | +use delta_kernel::arrow::array::{ArrayRef, Int32Array}; |
10 | 11 | use delta_kernel::committer::FileSystemCommitter; |
11 | 12 | use delta_kernel::expressions::ColumnName; |
12 | 13 | use delta_kernel::schema::{DataType, StructField, StructType}; |
@@ -138,3 +139,84 @@ async fn test_clustered_table_write_and_checkpoint( |
138 | 139 |
|
139 | 140 | Ok(()) |
140 | 141 | } |
| 142 | + |
| 143 | +/// Regression test: writing a batch where a clustering column has ALL null values should succeed. |
| 144 | +/// |
| 145 | +/// `collect_stats` (commit 76d480f0) omits `minValues`/`maxValues` fields for all-null columns, |
| 146 | +/// but `StatsVerifier` tries to extract `stats.minValues.<column>` before checking the |
| 147 | +/// `nullCount == numRecords` condition. The column extraction fails with: |
| 148 | +/// "Column stats.minValues.<column> not found in the data" |
| 149 | +/// because the field is missing from the stats StructArray entirely. |
| 150 | +/// |
| 151 | +/// The verifier should tolerate missing min/max fields when the column is all-null. |
| 152 | +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] |
| 153 | +async fn test_clustered_table_write_all_null_clustering_column() { |
| 154 | + let (_temp_dir, table_path, engine) = test_table_setup_mt().unwrap(); |
| 155 | + let schema = Arc::new( |
| 156 | + StructType::try_new(vec![ |
| 157 | + StructField::new("category", DataType::STRING, false), |
| 158 | + StructField::new("region_id", DataType::INTEGER, true), |
| 159 | + ]) |
| 160 | + .unwrap(), |
| 161 | + ); |
| 162 | + |
| 163 | + // Create table clustered on "category" and "region_id" |
| 164 | + let create_result = create_table(&table_path, schema, "Test/1.0") |
| 165 | + .with_data_layout(DataLayout::Clustered { |
| 166 | + columns: vec![ |
| 167 | + ColumnName::new(["category"]), |
| 168 | + ColumnName::new(["region_id"]), |
| 169 | + ], |
| 170 | + }) |
| 171 | + .build(engine.as_ref(), Box::new(FileSystemCommitter::new())) |
| 172 | + .unwrap() |
| 173 | + .commit(engine.as_ref()) |
| 174 | + .unwrap(); |
| 175 | + |
| 176 | + let snapshot = match create_result { |
| 177 | + CommitResult::CommittedTransaction(committed) => committed |
| 178 | + .post_commit_snapshot() |
| 179 | + .expect("post-commit snapshot should exist") |
| 180 | + .clone(), |
| 181 | + other => panic!("Expected CommittedTransaction, got: {other:?}"), |
| 182 | + }; |
| 183 | + |
| 184 | + // Write a batch where region_id is ALL nulls. |
| 185 | + // This should succeed — all-null clustering columns are valid. |
| 186 | + let all_null_region: ArrayRef = Arc::new(Int32Array::from(vec![None, None, None])); |
| 187 | + let batch = generate_batch(vec![ |
| 188 | + ("category", vec!["a", "b", "c"].into_array()), |
| 189 | + ("region_id", all_null_region), |
| 190 | + ]) |
| 191 | + .unwrap(); |
| 192 | + |
| 193 | + // BUG: This fails with "Column stats.minValues.region_id not found in the data" |
| 194 | + // because collect_stats omits minValues/maxValues for all-null columns, |
| 195 | + // but StatsVerifier tries to extract the column before checking nullCount == numRecords. |
| 196 | + let snapshot = write_batch_to_table(&snapshot, engine.as_ref(), batch, HashMap::new()) |
| 197 | + .await |
| 198 | + .unwrap(); |
| 199 | + assert_eq!(snapshot.version(), 1); |
| 200 | + |
| 201 | + // Verify data is readable |
| 202 | + let scan = snapshot.clone().scan_builder().build().unwrap(); |
| 203 | + let batches = read_scan(&scan, engine.clone()).unwrap(); |
| 204 | + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); |
| 205 | + assert_eq!(total_rows, 3); |
| 206 | + |
| 207 | + // Verify stats: region_id should have nullCount=3, but minValues/maxValues |
| 208 | + // should NOT contain region_id (since all values are null, there's nothing to aggregate) |
| 209 | + let add_infos = read_add_infos(&snapshot, engine.as_ref()).unwrap(); |
| 210 | + assert_eq!(add_infos.len(), 1); |
| 211 | + let stats = add_infos[0].stats.as_ref().expect("should have stats"); |
| 212 | + assert_eq!(stats["numRecords"], 3); |
| 213 | + assert_eq!(stats["nullCount"]["region_id"], 3); |
| 214 | + assert!( |
| 215 | + stats["minValues"].get("region_id").is_none(), |
| 216 | + "minValues should not contain region_id when all values are null" |
| 217 | + ); |
| 218 | + assert!( |
| 219 | + stats["maxValues"].get("region_id").is_none(), |
| 220 | + "maxValues should not contain region_id when all values are null" |
| 221 | + ); |
| 222 | +} |
0 commit comments