
Commit 18a7b66

westonpace and claude committed
feat: post-decode feedback loop for byte-sized batches
After each batch is decoded, measure the actual data bytes per row and feed it back so that the next `next_batch_task()` call uses the measured value instead of the schema-based estimate. This corrects for inaccurate initial estimates on variable-width data (strings, binary), where the schema default of 64 bytes may be far off.

The measurement uses `batch_data_size()`, a new helper that computes the actual data contribution of a batch by walking column types and reading offsets for variable-width arrays. This avoids the over-counting of `get_array_memory_size()`, which reports the full shared page-buffer capacity rather than per-batch data.

Part of #6387

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent c52b4cc commit 18a7b66

1 file changed: 158 additions & 6 deletions

rust/lance-encoding/src/decoder.rs
@@ -213,6 +213,7 @@
 //! relation to the way the data is stored.
 
 use std::collections::VecDeque;
+use std::sync::atomic::{AtomicU64, Ordering};
 use std::sync::{LazyLock, Once, OnceLock};
 use std::{ops::Range, sync::Arc};
 
@@ -1682,6 +1683,73 @@ impl<T: RootDecoderType> RecordBatchReader for BatchDecodeIterator<T> {
     }
 }
 
+/// Compute the actual data size (in bytes) of a record batch,
+/// accounting only for the portion of buffers that belongs to the
+/// batch's row range. Unlike `get_array_memory_size()`, this does
+/// not over-count when arrays share a larger underlying page buffer.
+fn batch_data_size(batch: &RecordBatch) -> u64 {
+    batch
+        .columns()
+        .iter()
+        .map(|c| array_data_size(c.as_ref()))
+        .sum()
+}
+
+fn array_data_size(array: &dyn arrow_array::Array) -> u64 {
+    let dt = array.data_type();
+    let n = array.len() as u64;
+    if let Some(w) = dt.primitive_width() {
+        return n * w as u64;
+    }
+    match dt {
+        DataType::Boolean => n.div_ceil(8),
+        DataType::Utf8 => {
+            let arr = array.as_string::<i32>();
+            let offsets = arr.value_offsets();
+            (offsets[n as usize] - offsets[0]) as u64
+        }
+        DataType::LargeUtf8 => {
+            let arr = array.as_string::<i64>();
+            let offsets = arr.value_offsets();
+            (offsets[n as usize] - offsets[0]) as u64
+        }
+        DataType::Binary => {
+            let arr = array.as_binary::<i32>();
+            let offsets = arr.value_offsets();
+            (offsets[n as usize] - offsets[0]) as u64
+        }
+        DataType::LargeBinary => {
+            let arr = array.as_binary::<i64>();
+            let offsets = arr.value_offsets();
+            (offsets[n as usize] - offsets[0]) as u64
+        }
+        DataType::Struct(fields) => {
+            let s = array.as_struct();
+            fields
+                .iter()
+                .enumerate()
+                .map(|(i, _)| array_data_size(s.column(i).as_ref()))
+                .sum()
+        }
+        DataType::List(_) => {
+            let list = array.as_list::<i32>();
+            array_data_size(list.values().as_ref())
+        }
+        DataType::LargeList(_) => {
+            let list = array.as_list::<i64>();
+            array_data_size(list.values().as_ref())
+        }
+        DataType::FixedSizeList(_, _) => {
+            let list = array
+                .as_any()
+                .downcast_ref::<arrow_array::FixedSizeListArray>()
+                .unwrap();
+            array_data_size(list.values().as_ref())
+        }
+        _ => n * 64, // fallback for uncommon types
+    }
+}
+
 /// Estimate the number of bytes per row for a given Arrow data type.
 ///
 /// For fixed-width types this is exact. For variable-width types (strings,
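
To see the over-counting that `batch_data_size()` avoids, consider a sliced string array. A minimal standalone sketch against the arrow-array crate (the names here are illustrative, not part of the patch):

use arrow_array::{Array, StringArray};

fn main() {
    // 1000 rows of 4-byte strings: ~4000 bytes of string data in one shared buffer.
    let parent = StringArray::from(vec!["aaaa"; 1000]);
    // A 10-row slice keeps a reference to the parent's full buffers.
    let slice = parent.slice(0, 10);

    // Reports the capacity of the shared buffers, not the 40 bytes the slice covers.
    println!("get_array_memory_size = {}", slice.get_array_memory_size());

    // Offsets-based measurement, as batch_data_size does for Utf8:
    let offsets = slice.value_offsets();
    let data_bytes = (offsets[slice.len()] - offsets[0]) as u64;
    assert_eq!(data_bytes, 40);
}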
@@ -1734,6 +1802,9 @@ pub struct StructuralBatchDecodeStream {
     /// Schema-based estimate of bytes per row, computed once at construction.
     /// Only meaningful when `batch_size_bytes` is `Some`.
     schema_bytes_per_row: f64,
+    /// Post-decode feedback: actual bytes-per-row measured from the most
+    /// recently decoded batch. Zero means no feedback yet (use schema estimate).
+    bytes_per_row_feedback: Arc<AtomicU64>,
 }
 
 impl StructuralBatchDecodeStream {
@@ -1771,6 +1842,7 @@ impl StructuralBatchDecodeStream {
             spawn_batch_decode_tasks,
             batch_size_bytes,
             schema_bytes_per_row,
+            bytes_per_row_feedback: Arc::new(AtomicU64::new(0)),
         }
     }
 
@@ -1814,7 +1886,13 @@
         }
 
         let mut to_take = if let Some(batch_size_bytes) = self.batch_size_bytes {
-            let rows = (batch_size_bytes as f64 / self.schema_bytes_per_row) as u64;
+            let feedback = self.bytes_per_row_feedback.load(Ordering::Relaxed);
+            let bpr = if feedback > 0 {
+                feedback as f64
+            } else {
+                self.schema_bytes_per_row
+            };
+            let rows = (batch_size_bytes as f64 / bpr) as u64;
             self.rows_remaining.min(rows.max(1))
         } else {
             self.rows_remaining.min(self.rows_per_batch as u64)
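
Worked through with the numbers the convergence test below uses (a standalone sketch with hypothetical values, mirroring the selection logic above):

fn main() {
    // A 5000-byte target over rows that actually hold 100-byte strings.
    let batch_size_bytes: u64 = 5000;
    let schema_bytes_per_row: f64 = 64.0; // default estimate for Utf8

    // First batch: no feedback yet, so the schema estimate is used.
    let rows = (batch_size_bytes as f64 / schema_bytes_per_row) as u64;
    assert_eq!(rows, 78); // overshoots the 50-row ideal

    // Later batches: the feedback cell holds the measured 100 bytes/row.
    let rows = (batch_size_bytes as f64 / 100.0) as u64;
    assert_eq!(rows, 50); // converges to the target
}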
@@ -1854,20 +1932,30 @@
         let next_task = next_task.transpose().map(|next_task| {
             let num_rows = next_task.as_ref().map(|t| t.num_rows).unwrap_or(0);
             let emitted_batch_size_warning = slf.emitted_batch_size_warning.clone();
+            let bytes_per_row_feedback = slf.bytes_per_row_feedback.clone();
             // Capture the per-stream policy once so every emitted batch task follows the
             // same throughput-vs-overhead choice made by the scheduler.
             let spawn_batch_decode_tasks = slf.spawn_batch_decode_tasks;
             let task = async move {
                 let next_task = next_task?;
-                if spawn_batch_decode_tasks {
+                let batch = if spawn_batch_decode_tasks {
                     tokio::spawn(
                         async move { next_task.into_batch(emitted_batch_size_warning) },
                     )
                     .await
                     .map_err(|err| Error::wrapped(err.into()))?
                 } else {
                     next_task.into_batch(emitted_batch_size_warning)
+                };
+                if let Ok(ref b) = batch {
+                    let num_rows = b.num_rows() as u64;
+                    if num_rows > 0 {
+                        let bpr = batch_data_size(b) / num_rows;
+                        bytes_per_row_feedback
+                            .store(bpr.max(1), Ordering::Relaxed);
+                    }
                 }
+                batch
             };
             (task, num_rows)
         });
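
The feedback cell is a plain `Arc<AtomicU64>` written and read with `Relaxed` ordering; a stale value only mis-sizes a single batch, so no stronger synchronization is needed. A minimal standalone sketch of the store/load pattern (not the patch code itself):

use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;

fn main() {
    // 0 is the sentinel for "no measurement yet".
    let feedback = Arc::new(AtomicU64::new(0));

    // Decode side: store the measured bytes-per-row, clamped to >= 1.
    let writer = Arc::clone(&feedback);
    std::thread::spawn(move || {
        let measured: u64 = 100;
        writer.store(measured.max(1), Ordering::Relaxed);
    })
    .join()
    .unwrap();

    // Scheduling side: prefer the measurement over the schema estimate.
    let bpr = match feedback.load(Ordering::Relaxed) {
        0 => 64.0, // fall back to the schema estimate
        measured => measured as f64,
    };
    assert_eq!(bpr, 100.0);
}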
@@ -1978,6 +2066,7 @@ fn check_scheduler_on_drop(
         .boxed()
 }
 
+#[allow(clippy::too_many_arguments)]
 pub fn create_decode_stream(
     schema: &Schema,
     num_rows: u64,
@@ -2909,11 +2998,11 @@ mod tests {
         use arrow_array::Int32Array;
 
         // 1000 rows x 4 Int32 columns = 16 bytes/row
-        let num_rows = 1000;
+        let num_rows: i32 = 1000;
         let arrays: Vec<Arc<dyn arrow_array::Array>> = (0..4)
             .map(|col| {
                 Arc::new(Int32Array::from_iter_values(
-                    (0..num_rows).map(|row| (row * 10 + col) as i32),
+                    (0..num_rows).map(move |row| row * 10 + col),
                 )) as _
             })
             .collect();
@@ -2963,11 +3052,11 @@ mod tests {
        use arrow_array::Int32Array;
 
         // Without batch_size_bytes, rows_per_batch controls batching
-        let num_rows = 1000;
+        let num_rows: i32 = 1000;
         let arrays: Vec<Arc<dyn arrow_array::Array>> = (0..2)
             .map(|col| {
                 Arc::new(Int32Array::from_iter_values(
-                    (0..num_rows).map(|row| (row * 10 + col) as i32),
+                    (0..num_rows).map(move |row| row * 10 + col),
                 )) as _
             })
             .collect();
@@ -2991,4 +3080,67 @@ mod tests {
             );
         }
     }
+
+    #[tokio::test]
+    async fn test_byte_sized_batches_feedback_convergence() {
+        use arrow_array::StringArray;
+
+        // Each row has a 100-byte string. Schema estimate = 64 bytes (default
+        // for Utf8), so the first batch will overshoot. The feedback loop
+        // should correct subsequent batches toward the target.
+        let num_rows = 500;
+        let value: String = "x".repeat(100);
+        let arrays: Vec<Arc<dyn arrow_array::Array>> = vec![Arc::new(StringArray::from(
+            (0..num_rows).map(|_| value.as_str()).collect::<Vec<_>>(),
+        ))];
+        let schema = Arc::new(ArrowSchema::new(vec![ArrowField::new(
+            "s",
+            DataType::Utf8,
+            false,
+        )]));
+        let input_batch = RecordBatch::try_new(schema, arrays).unwrap();
+
+        // Target 5000 bytes/batch. At 100 bytes/row the ideal is 50 rows/batch.
+        // Schema estimate is 64 bytes/row → first batch ~78 rows (overshoot).
+        // After feedback kicks in, batches should converge to ~50 rows.
+        let target_bytes: u64 = 5000;
+        let batches =
+            decode_batches_with_byte_limit(&input_batch, /*batch_size=*/ 1024, Some(target_bytes))
+                .await;
+
+        // Verify all data round-trips correctly
+        let all_batches: Vec<&RecordBatch> = batches.iter().collect();
+        let concatenated = arrow_select::concat::concat_batches(
+            &batches[0].schema(),
+            all_batches.iter().copied(),
+        )
+        .unwrap();
+        assert_eq!(concatenated.num_rows(), num_rows as usize);
+        assert_eq!(
+            concatenated.column(0).as_ref(),
+            input_batch.column(0).as_ref()
+        );
+
+        // After the first batch, subsequent batches should be closer to the
+        // target. The ideal is 50 rows/batch.
+        assert!(
+            batches.len() >= 2,
+            "need at least 2 batches to test convergence"
+        );
+        // The first batch uses the schema estimate (64 bytes/row) →
+        // ~78 rows. After feedback the rows should settle near 50.
+        if batches.len() >= 3 {
+            let second_batch_rows = batches[1].num_rows();
+            let third_batch_rows = batches[2].num_rows();
+            // Both should be within 20% of the ideal (50 rows)
+            assert!(
+                (40..=60).contains(&second_batch_rows),
+                "second batch should be near 50 rows, got {second_batch_rows}"
+            );
+            assert!(
+                (40..=60).contains(&third_batch_rows),
+                "third batch should be near 50 rows, got {third_batch_rows}"
+            );
+        }
+    }
 }
