
Commit 2a2ef8c

[WebGPU] Support continuous decoding (RewindTo) with graph capture (microsoft#2083)
This pull request improves attention-mask handling in the CUDA and WebGPU backends, focusing on more efficient and correct updates of mask buffers during decoding. The main changes are a CPU-side update for static attention masks in CUDA and a reusable staging buffer for efficient mask updates in WebGPU, with logic to avoid redundant work in single-beam cases.

**CUDA backend improvements:**

* Replaced the previous (commented-out and incorrect) CUDA memory-set logic in `DefaultPositionInputs::RewindMask` with a CPU-side update that correctly sets attended and non-attended positions in the attention mask for each batch/beam, followed by a copy back to the device. This ensures the mask holds 1s for attended tokens and 0s for future tokens, supporting both `int32_t` and `int64_t` types.

**WebGPU backend improvements:**

* Added reusable CPU staging buffers (`mask_staging_buffer_i32_` / `mask_staging_buffer_i64_`) to the `InterfaceImpl` struct for efficient attention-mask updates, avoiding repeated allocations and redundant writes.
* Implemented the `UpdateAttentionMask` method to efficiently update the mask in single-beam cases by filling only the new positions with 1s and copying the relevant portion to the device, falling back to CPU for multi-beam cases. It handles the static update path and supports both `int32_t` and `int64_t` mask types.
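For reference, the static-mask rewind described above reduces to a simple per-row fill. The following is a minimal standalone sketch of that pattern (illustrative only; `RewindStaticMask` is a hypothetical name, not the function added in this commit, which is `DefaultPositionInputs::RewindMask`):

#include <algorithm>
#include <cstdint>
#include <vector>

// Sketch: the mask layout is [batch_beam_size, max_length], row-major.
// After rewinding to `index`, each row holds 1s for attended positions
// [0, index) and 0s for future positions [index, max_length).
void RewindStaticMask(std::vector<int32_t>& mask, size_t batch_beam_size,
                      size_t max_length, size_t index) {
  for (size_t i = 0; i < batch_beam_size; ++i) {
    int32_t* row = mask.data() + i * max_length;
    std::fill_n(row, index, 1);
    std::fill_n(row + index, max_length - index, 0);
  }
}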
1 parent 09e69e4 commit 2a2ef8c

3 files changed

Lines changed: 137 additions & 14 deletions


src/models/position_inputs.cpp

Lines changed: 41 additions & 14 deletions
@@ -380,23 +380,50 @@ void DefaultPositionInputs::InitializeSequenceLengths(std::array<int64_t, 2> sha
 }
 
 void DefaultPositionInputs::RewindMask(size_t index) {
-  if (state_.params_->use_graph_capture) {
-    throw std::runtime_error("PositionInputs::RewindMask - Static buffer is not supported for continuous decoding.");
-#if 0  // TODO: Fix implementation, cudaMemsetAsync of 1 is setting bytes of 1 vs int32's of 1
-    int past_length = static_cast<int>(index);
-    int max_length = static_cast<int>(state_.params_->search.max_length);
-    cudaMemsetAsync(attention_mask_->GetTensorMutableRawData(),
-                    0,
-                    (type_ == Ort::TypeToTensorType<int32_t> ? sizeof(int32_t) : sizeof(int64_t)) * max_length,
-                    model_.cuda_stream_);
-    cudaMemsetAsync(attention_mask_->GetTensorMutableRawData(),
-                    1,
-                    (type_ == Ort::TypeToTensorType<int32_t> ? sizeof(int32_t) : sizeof(int64_t)) * past_length,
-                    model_.cuda_stream_);
-#endif
+  if (ShouldUseStaticMaskHandling()) {
+    // Static mask layout: [batch_beam_size, max_length]
+    // Rewind to index: write 1s for [0, index), 0s for [index, max_length)
+    size_t max_len = static_cast<size_t>(state_.params_->search.max_length);
+    if (index > max_len) {
+      throw std::runtime_error("RewindMask: index exceeds max_length");
+    }
+    size_t batch_beam_size = static_cast<size_t>(attention_mask_shape_[0]);
+    auto byte_span = attention_mask_->GetByteSpan();
+    auto cpu_data = byte_span.CpuSpan();
+    if (type_ == Ort::TypeToTensorType<int32_t>) {
+      auto* data = reinterpret_cast<int32_t*>(cpu_data.data());
+      for (size_t i = 0; i < batch_beam_size; i++) {
+        std::fill_n(data + i * max_len, index, static_cast<int32_t>(1));
+        std::fill_n(data + i * max_len + index, max_len - index, static_cast<int32_t>(0));
+      }
+    } else {
+      auto* data = reinterpret_cast<int64_t*>(cpu_data.data());
+      for (size_t i = 0; i < batch_beam_size; i++) {
+        std::fill_n(data + i * max_len, index, static_cast<int64_t>(1));
+        std::fill_n(data + i * max_len + index, max_len - index, static_cast<int64_t>(0));
+      }
+    }
+    byte_span.CopyCpuToDevice();
+    return;
   }
+
+  // Dynamic mask: adjust shape so the next Update() creates the correct-sized tensor.
+  // For batch_beam_size == 1 (the only case RewindTo supports), the CPU UpdateAttentionMask
+  // fills the entire next mask with 1s, so no data fixup is needed - just the shape.
+  attention_mask_shape_[1] = static_cast<int64_t>(index);
 }
 
+// Returns true when the attention mask is a fixed-size [batch_beam_size, max_length] buffer
+// that must be updated in-place (write 1s/0s) rather than re-created per step.
+// Currently triggered by:
+//   - DML (always uses graph capture, see IsGraphCaptureEnabled in config.cpp)
+//   - WebGPU with enableGraphCapture=1 in provider options
+//   - NvTensorRtRtx with past-present shared buffers
+// Not yet using this path:
+//   - CUDA: graph capture is currently disabled in GenAI due to bugs
+//     (IsGraphCaptureEnabled throws for CUDA). Once re-enabled, RewindMask's
+//     static path will work for CUDA as well since it uses device-agnostic
+//     CpuSpan/CopyCpuToDevice.
 bool DefaultPositionInputs::ShouldUseStaticMaskHandling() const {
   return state_.params_->use_graph_capture ||
          (state_.params_->IsPastPresentShareBufferEnabled(model_.config_->model.type) &&
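
The static branch above establishes a simple invariant: every [max_length]-sized row of the mask ends up with exactly `index` leading 1s and 0s elsewhere. A small illustrative check of that invariant against a host copy of a mask row (a sketch; `MaskRowIsRewoundTo` is a hypothetical helper, not part of this commit):

#include <cstddef>
#include <cstdint>

// Sketch: returns true if `row` (length max_length) looks like a mask
// rewound to `index`: 1s in [0, index), 0s in [index, max_length).
bool MaskRowIsRewoundTo(const int32_t* row, size_t max_length, size_t index) {
  for (size_t j = 0; j < max_length; ++j) {
    const int32_t expected = (j < index) ? 1 : 0;
    if (row[j] != expected) return false;
  }
  return true;
}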

src/webgpu/interface.cpp

Lines changed: 46 additions & 0 deletions
@@ -171,6 +171,11 @@ struct InterfaceImpl : DeviceInterface {
  private:
   Ort::Allocator* ort_allocator_{};
   const OrtMemoryInfo* ort_memory_info_{};
+  // Reusable CPU staging buffers for UpdateAttentionMask, pre-filled with 1s.
+  // Content is always all 1s so sharing across generators is safe; only upload_bytes
+  // worth of data is copied each call, regardless of buffer capacity.
+  std::vector<int32_t> mask_staging_buffer_i32_;
+  std::vector<int64_t> mask_staging_buffer_i64_;
 
  public:
   Ort::Allocator& GetAllocator() override {
@@ -190,6 +195,47 @@ struct InterfaceImpl : DeviceInterface {
 
   void Synchronize() override {}  // Nothing to do?
 
+  bool UpdateAttentionMask([[maybe_unused]] void* next_mask_data, void* mask_data, int batch_beam_size, [[maybe_unused]] int new_kv_length, int total_length, [[maybe_unused]] int max_length, bool update_only, ONNXTensorElementDataType type) override {
+    if (batch_beam_size != 1 || !update_only) {
+      return false;  // Fall back to CPU for multi-beam or non-static mask
+    }
+    if (type != Ort::TypeToTensorType<int32_t> && type != Ort::TypeToTensorType<int64_t>) {
+      return false;  // Unsupported mask type; fall back to CPU handling.
+    }
+    // For batch_beam_size == 1 with static mask (update_only=true, no padding),
+    // the mask is always all 1s for attended positions.
+    size_t num_elements = static_cast<size_t>(total_length);
+    size_t upload_bytes;
+    void* staging_data;
+
+    // Use the correctly typed staging buffer. Each grows monotonically and
+    // only newly extended positions need to be filled with 1.
+    if (type == Ort::TypeToTensorType<int32_t>) {
+      if (mask_staging_buffer_i32_.size() < num_elements) {
+        mask_staging_buffer_i32_.resize(num_elements, static_cast<int32_t>(1));
+      }
+      staging_data = mask_staging_buffer_i32_.data();
+      upload_bytes = num_elements * sizeof(int32_t);
+    } else {
+      if (mask_staging_buffer_i64_.size() < num_elements) {
+        mask_staging_buffer_i64_.resize(num_elements, static_cast<int64_t>(1));
+      }
+      staging_data = mask_staging_buffer_i64_.data();
+      upload_bytes = num_elements * sizeof(int64_t);
+    }
+
+    int64_t shape_val = static_cast<int64_t>(upload_bytes);
+    std::span<const int64_t> shape{&shape_val, 1};
+    static const auto cpu_mem_info = OrtMemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeDefault);
+    auto src_tensor = OrtValue::CreateTensor(*cpu_mem_info, staging_data, upload_bytes, shape, ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8);
+    auto dst_tensor = OrtValue::CreateTensor(*ort_memory_info_, mask_data, upload_bytes, shape, ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8);
+    const std::vector<const OrtValue*> src_ptrs = {src_tensor.get()};
+    const std::vector<OrtValue*> dst_ptrs = {dst_tensor.get()};
+    GetOrtEnv().CopyTensors(src_ptrs, dst_ptrs, nullptr);
+
+    return true;
+  }
+
   bool Cast(void* input, void* output, ONNXTensorElementDataType input_type, ONNXTensorElementDataType output_type, size_t element_count) override {
     if (!ort_allocator_) {
       throw std::runtime_error("WebGPU allocator not initialized");

test/c_api_tests.cpp

Lines changed: 50 additions & 0 deletions
@@ -1334,6 +1334,56 @@ TEST(CAPITests, RewindGptFp32CAPI) {
 }
 #endif
 
+// Test RewindTo with static mask handling via NvTensorRtRtx past-present share buffer.
+// Skipped when the phi3-fp16-nvtrt model is not available (CI-only model).
+TEST(CAPITests, RewindGraphCaptureNvTensorRtRtxCAPI) {
+  std::string nvtrt_path = MODEL_PATH "hf-internal-testing/phi3-fp16-nvtrt";
+  if (!std::filesystem::exists(nvtrt_path)) {
+    GTEST_SKIP() << "NvTensorRtRtx model not available at " << nvtrt_path;
+  }
+
+  auto config = OgaConfig::Create(nvtrt_path.c_str());
+  config->ClearProviders();
+  config->AppendProvider("NvTensorRtRtx");
+
+  int max_length = 20;
+
+  auto model = OgaModel::Create(*config);
+  auto params = OgaGeneratorParams::Create(*model);
+  params->SetSearchOption("max_length", max_length);
+
+  std::vector<int32_t> input_ids{1, 15043, 29892, 920};
+
+  auto generator = OgaGenerator::Create(*model, *params);
+  generator->AppendTokens(input_ids.data(), input_ids.size());
+  while (!generator->IsDone()) {
+    generator->GenerateNextToken();
+  }
+
+  auto seq_len = generator->GetSequenceCount(0);
+  std::vector<int32_t> first_output(seq_len);
+  std::memcpy(first_output.data(), generator->GetSequenceData(0), seq_len * sizeof(int32_t));
+
+  generator->RewindTo(0);
+  generator->AppendTokens(input_ids.data(), input_ids.size());
+  while (!generator->IsDone()) {
+    generator->GenerateNextToken();
+  }
+
+  auto seq_len2 = generator->GetSequenceCount(0);
+  ASSERT_EQ(seq_len2, seq_len);
+  EXPECT_TRUE(0 == std::memcmp(first_output.data(), generator->GetSequenceData(0), seq_len * sizeof(int32_t)));
+
+  generator->RewindTo(6);
+  while (!generator->IsDone()) {
+    generator->GenerateNextToken();
+  }
+
+  seq_len2 = generator->GetSequenceCount(0);
+  ASSERT_EQ(seq_len2, seq_len);
+  EXPECT_TRUE(0 == std::memcmp(first_output.data(), generator->GetSequenceData(0), seq_len * sizeof(int32_t)));
+}
+
 #ifndef STREAMING_ASR_PATH
 #define STREAMING_ASR_PATH MODEL_PATH "nemotron-speech-streaming"
 #endif
