fix: use spill writer's schema instead of the first batch's schema for spill files

gruuya · gruuya · commit fec1dc6702a2 · 2026-03-31T16:33:53.000+02:00
diff --git a/datafusion/core/tests/union_nullable_spill.rs b/datafusion/core/tests/union_nullable_spill.rs
@@ -0,0 +1,127 @@
+// Regression test for: InProgressSpillFile::append_batch() used batch.schema()
+// to initialize the IPC writer, meaning the first spilled batch's schema
+// determined the IPC file schema. When UnionExec returns child streams
+// directly (without coercing batch schemas to the union's declared schema),
+// batches from different UNION branches can have different nullability.
+// If a non-nullable batch is the first to be spilled, the IPC file declares
+// the column as non-nullable. Subsequent batches with NULLs spilled to the
+// same file then get read back with a non-nullable schema but contain null
+// values — an invalid state that causes downstream RecordBatch::try_new to
+// fail with "Column is declared as non-nullable but contains null values".
+//
+// The fix: use the SpillManager's declared schema (which represents the
+// canonical schema from the operator) instead of the first batch's schema.
+
+use std::sync::Arc;
+
+use arrow::array::{Array, Decimal128Array, Int64Array, RecordBatch};
+use arrow::datatypes::{DataType, Field, Schema};
+use datafusion_execution::runtime_env::RuntimeEnvBuilder;
+use datafusion_physical_plan::metrics::{ExecutionPlanMetricsSet, SpillMetrics};
+use datafusion_physical_plan::spill::SpillManager;
+use futures::StreamExt;
+
+/// Verifies that batches read back from a spill file carry the schema the
+/// `SpillManager` was constructed with, rather than the schema of the first
+/// appended batch, even when the appended batches disagree on nullability.
+///
+/// Scenario:
+///   - The `SpillManager` is created with a schema in which `val` is nullable
+///   - The first appended batch declares `val` non-nullable (simulates a literal-0 UNION branch)
+///   - The second appended batch declares `val` nullable and holds one NULL (simulates a table UNION branch)
+///   - On readback, the stream and both batches must report `val` as nullable
+#[tokio::test]
+async fn test_spill_file_uses_spill_manager_schema() {
+    let nullable_schema = Arc::new(Schema::new(vec![
+        Field::new("key", DataType::Int64, false),
+        Field::new("val", DataType::Decimal128(15, 7), true),
+    ]));
+    let non_nullable_schema = Arc::new(Schema::new(vec![
+        Field::new("key", DataType::Int64, false),
+        Field::new("val", DataType::Decimal128(15, 7), false),
+    ]));
+
+    let runtime = Arc::new(RuntimeEnvBuilder::new().build().unwrap());
+    let metrics_set = ExecutionPlanMetricsSet::new();
+    let spill_metrics = SpillMetrics::new(&metrics_set, 0);
+    let spill_manager = Arc::new(SpillManager::new(
+        runtime,
+        spill_metrics,
+        Arc::clone(&nullable_schema),
+    ));
+
+    let mut in_progress = spill_manager.create_in_progress_file("test").unwrap();
+
+    // First batch: schema declares `val` non-nullable and every value is 0 (simulates a literal-0 UNION branch)
+    let non_nullable_batch = RecordBatch::try_new(
+        Arc::clone(&non_nullable_schema),
+        vec![
+            Arc::new(Int64Array::from(vec![1, 2, 3])),
+            Arc::new(
+                Decimal128Array::from(vec![0i128; 3])
+                    .with_precision_and_scale(15, 7)
+                    .unwrap(),
+            ),
+        ],
+    )
+    .unwrap();
+    in_progress.append_batch(&non_nullable_batch).unwrap();
+
+    // Second batch: schema declares `val` nullable and the middle value is NULL (simulates a table UNION branch)
+    let nullable_batch = RecordBatch::try_new(
+        Arc::clone(&nullable_schema),
+        vec![
+            Arc::new(Int64Array::from(vec![4, 5, 6])),
+            Arc::new(
+                Decimal128Array::from(vec![
+                    Some(10_000_000i128),
+                    None,
+                    Some(30_000_000i128),
+                ])
+                .with_precision_and_scale(15, 7)
+                .unwrap(),
+            ),
+        ],
+    )
+    .unwrap();
+    in_progress.append_batch(&nullable_batch).unwrap();
+
+    let spill_file = in_progress.finish().unwrap().unwrap();
+
+    // Read the finished spill file back as a stream of record batches
+    let mut stream = spill_manager
+        .read_spill_as_stream(spill_file, None)
+        .unwrap();
+
+    assert!(
+        stream.schema().field(1).is_nullable(),
+        "Stream schema should be nullable"
+    );
+
+    let mut batches = vec![];
+    while let Some(result) = stream.next().await {
+        batches.push(result.unwrap());
+    }
+    assert_eq!(batches.len(), 2);
+
+    // Every batch read back must report `val` (field index 1) as nullable, matching the SpillManager's schema
+    for (i, batch) in batches.iter().enumerate() {
+        assert!(
+            batch.schema().field(1).is_nullable(),
+            "Readback batch {i} should have nullable schema from SpillManager"
+        );
+    }
+
+    // The second batch must still contain its single NULL in `val`
+    let val_col = batches[1]
+        .column(1)
+        .as_any()
+        .downcast_ref::<Decimal128Array>()
+        .unwrap();
+    assert_eq!(val_col.null_count(), 1, "Second batch should have 1 null");
+
+    // Re-validating the batch against its own schema must succeed; it would fail
+    // if the schema declared `val` non-nullable while the column contains a NULL
+    RecordBatch::try_new(batches[1].schema(), batches[1].columns().to_vec())
+        .expect("Readback batch should be valid: schema should match data nullability");
+}
diff --git a/datafusion/physical-plan/src/spill/in_progress_spill_file.rs b/datafusion/physical-plan/src/spill/in_progress_spill_file.rs
@@ -62,7 +62,11 @@ impl InProgressSpillFile {
             ));
         }
         if self.writer.is_none() {
-            let schema = batch.schema();
+            // Use the SpillManager's declared schema rather than the batch's schema.
+            // Individual batches may have different schemas (e.g., different nullability)
+            // when they come from different branches of a UnionExec. The SpillManager's
+            // schema represents the canonical schema that all batches should conform to.
+            let schema = Arc::clone(self.spill_writer.schema());
             if let Some(in_progress_file) = &mut self.in_progress_file {
                 self.writer = Some(IPCStreamWriter::new(
                     in_progress_file.path(),