Resolve native sumcheck GPU trace fill: populate hint space provider from records

darth-cy · darth-cy · commit f2cd33241ce4 · 2026-03-09T18:36:45.000-04:00
diff --git a/extensions/native/circuit/src/extension/cuda.rs b/extensions/native/circuit/src/extension/cuda.rs
@@ -82,12 +82,13 @@ impl VmProverExtension<GpuBabyBearPoseidon2Engine, DenseRecordArena, Native>
         // HintSpaceProvider must be registered BEFORE NativeSumcheck because chips are
         // dispatched in reverse order: sumcheck runs first and populates the provider.
         let hint_air: &HintSpaceProviderAir = inventory.next_air::<HintSpaceProviderAir>()?;
-        let cpu_chip = Arc::new(HintSpaceProviderChip::new(hint_air.hint_bus.clone()));
-        let provider_gpu = HintSpaceProviderChipGpu::new(cpu_chip);
+        let cpu_chip = Arc::new(HintSpaceProviderChip::new(hint_air.hint_bus));
+        let provider_gpu = HintSpaceProviderChipGpu::new(cpu_chip.clone());
         inventory.add_periphery_chip(provider_gpu);
 
         inventory.next_air::<NativeSumcheckAir>()?;
-        let sumcheck = NativeSumcheckChipGpu::new(range_checker.clone(), timestamp_max_bits);
+        let sumcheck =
+            NativeSumcheckChipGpu::new(range_checker.clone(), timestamp_max_bits, cpu_chip);
         inventory.add_executor_chip(sumcheck);
 
         Ok(())
diff --git a/extensions/native/circuit/src/sumcheck/cuda.rs b/extensions/native/circuit/src/sumcheck/cuda.rs
@@ -1,4 +1,4 @@
-use std::{mem::size_of, slice::from_raw_parts, sync::Arc};
+use std::{borrow::Borrow, mem::size_of, slice::from_raw_parts, sync::Arc};
 
 use derive_new::new;
 use openvm_circuit::{arch::DenseRecordArena, utils::next_power_of_two_or_zero};
@@ -7,15 +7,70 @@ use openvm_cuda_backend::{
     base::DeviceMatrix, chip::get_empty_air_proving_ctx, prover_backend::GpuBackend, types::F,
 };
 use openvm_cuda_common::copy::MemCopyH2D;
-use openvm_stark_backend::{prover::types::AirProvingContext, Chip};
+use openvm_stark_backend::{p3_field::PrimeField32, prover::types::AirProvingContext, Chip};
 
-use super::columns::NativeSumcheckCols;
-use crate::cuda_abi::sumcheck_cuda;
+use super::columns::{LogupSpecificCols, NativeSumcheckCols, ProdSpecificCols};
+use crate::{
+    cuda_abi::sumcheck_cuda,
+    hint_space_provider::SharedHintSpaceProviderChip,
+};
 
 #[derive(new)]
 pub struct NativeSumcheckChipGpu {
     pub range_checker: Arc<VariableRangeCheckerChipGPU>,
     pub timestamp_max_bits: usize,
+    pub hint_space_provider: SharedHintSpaceProviderChip<F>, // fed by `populate_hint_provider`
+}
+
+impl NativeSumcheckChipGpu {
+    /// Bridges CPU execution (which produces the records) and GPU trace generation:
+    /// reinterprets `records` as `NativeSumcheckCols<F>` rows and, for each row within the
+    /// round limit, issues `request(hint_id, data_ptr + j, value)` to the hint space provider
+    /// for every element of `p` (prod rows) or `pq` (logup rows). No-op on a partial row.
+    fn populate_hint_provider(&self, records: &[u8]) {
+        let width = NativeSumcheckCols::<F>::width();
+        let record_size = width * size_of::<F>(); // bytes per row
+        if records.len() % record_size != 0 {
+            return; // length is not a whole number of rows: populate nothing
+        }
+        let num_rows = records.len() / record_size;
+
+        let row_slice = unsafe { // NOTE(review): sound only if `records` is aligned for `F` — confirm
+            let ptr = records.as_ptr() as *const F;
+            from_raw_parts(ptr, num_rows * width) // length in `F` elements; spans all of `records`
+        };
+
+        for i in 0..num_rows {
+            let row_data = &row_slice[i * width..(i + 1) * width];
+            let cols: &NativeSumcheckCols<F> = row_data.borrow(); // typed view of the row
+
+            if cols.within_round_limit != F::ONE { // rows not flagged within the limit are skipped
+                continue;
+            }
+
+            if cols.prod_row == F::ONE { // prod row: `specific` prefix is read as `ProdSpecificCols`
+                let prod_specific: &ProdSpecificCols<F> =
+                    cols.specific[..ProdSpecificCols::<F>::width()].borrow();
+                for (j, &val) in prod_specific.p.iter().enumerate() {
+                    self.hint_space_provider.request(
+                        cols.prod_hint_id,
+                        prod_specific.data_ptr + F::from_canonical_usize(j), // `data_ptr` offset by j
+                        val,
+                    );
+                }
+            } else if cols.logup_row == F::ONE { // checked only when the row is not a prod row
+                let logup_specific: &LogupSpecificCols<F> =
+                    cols.specific[..LogupSpecificCols::<F>::width()].borrow();
+                for (j, &val) in logup_specific.pq.iter().enumerate() {
+                    self.hint_space_provider.request(
+                        cols.logup_hint_id,
+                        logup_specific.data_ptr + F::from_canonical_usize(j), // `data_ptr` offset by j
+                        val,
+                    );
+                }
+            }
+        }
+    }
 }
 
 impl Chip<DenseRecordArena, GpuBackend> for NativeSumcheckChipGpu {
@@ -25,6 +80,9 @@ impl Chip<DenseRecordArena, GpuBackend> for NativeSumcheckChipGpu {
             return get_empty_air_proving_ctx::<GpuBackend>();
         }
 
+        // Populate hint space provider from execution records before GPU upload.
+        self.populate_hint_provider(records);
+
         let width = NativeSumcheckCols::<F>::width();
         let record_size = width * size_of::<F>();
         assert_eq!(records.len() % record_size, 0);