Add clarifying comment for CUDA bounds guard per review

vraspar · Copilot · vraspar · commit 749f7a0e371a · 2026-04-13T20:37:34.000Z
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
diff --git a/onnxruntime/contrib_ops/cuda/bert/ngram_repeat_block_impl.cu b/onnxruntime/contrib_ops/cuda/bert/ngram_repeat_block_impl.cu
@@ -49,6 +49,8 @@ __global__ void banRepeatedTokens(const int64_t* __restrict__ tokens,
   if (is_banned == true) {
     auto token_to_be_banned = tokens_shm[col + no_repeat_ngram_size - 1];
     CUDA_KERNEL_ASSERT(token_to_be_banned >= 0 && token_to_be_banned < vocab_size);
+    // In release builds, silently skip OOB tokens rather than writing out of bounds.
+    // CUDA kernels cannot propagate Status errors to the host.
     if (token_to_be_banned >= 0 && token_to_be_banned < vocab_size) {
       lprobs[lprob_start + token_to_be_banned] = -std::numeric_limits<float>::infinity();
     }

Original file line number	Diff line number	Diff line change
`@@ -49,6 +49,8 @@ __global__ void banRepeatedTokens(const int64_t* __restrict__ tokens,`
`49`	`49`	`if (is_banned == true) {`
`50`	`50`	`auto token_to_be_banned = tokens_shm[col + no_repeat_ngram_size - 1];`
`51`	`51`	`CUDA_KERNEL_ASSERT(token_to_be_banned >= 0 && token_to_be_banned < vocab_size);`
	`52`	`+ // In release builds, silently skip OOB tokens rather than writing out of bounds.`
	`53`	`+ // CUDA kernels cannot propagate Status errors to the host.`
`52`	`54`	`if (token_to_be_banned >= 0 && token_to_be_banned < vocab_size) {`
`53`	`55`	`lprobs[lprob_start + token_to_be_banned] = -std::numeric_limits<float>::infinity();`
`54`	`56`	`}`