update test

NKNaN · NKNaN · commit ace86b8353cc · 2026-05-25T19:04:25.000+08:00
diff --git a/custom_ops/gpu_ops/speculate_decoding/ngram_match.cu b/custom_ops/gpu_ops/speculate_decoding/ngram_match.cu
@@ -215,13 +215,14 @@ __global__ void ngram_match_gather_kernel(
       }
     }
 
-    // === Pad seq_lens_this_time to K+1 for cudagraph stability ===
-    // Variable seq_lens_this_time (range [1, K+1]) clashes with cudagraph's
-    // fixed launch params captured at warm-up time; downstream kernels read
-    // past valid cu_seqlens / slot_mapping when replay sees a smaller slt,
-    // leading to OOB / CUDA 700. When pad_to_max=true (cudagraph enabled),
-    // pad missing positions with a placeholder so slt is fixed at K+1.
-    // pad_to_max=false skips the padding cost when cudagraph is off.
+    // === Pad seq_lens_this_time to num_speculative_tokens+1 for cudagraph ===
+    // Variable seq_lens_this_time (range [1, num_speculative_tokens+1]) clashes
+    // with cudagraph's fixed launch params captured at warm-up time; downstream
+    // kernels read past valid cu_seqlens / slot_mapping when replay sees a
+    // smaller slt, leading to OOB / CUDA 700. When pad_to_max=true (cudagraph
+    // enabled), pad missing positions with a placeholder so slt is fixed at
+    // num_speculative_tokens+1. pad_to_max=false skips the padding cost when
+    // cudagraph is off.
     if (pad_to_max) {
       int target_slt = max_draft_tokens_param + 1;
       if (actual < target_slt) {
diff --git a/tests/operators/test_ngram_match.py b/tests/operators/test_ngram_match.py
@@ -61,6 +61,7 @@ def test_basic_match(self):
             max_dec_len,
             3,
             4,
+            False,  # pad_to_max: match unchanged (no-pad) reference behavior
         )
 
         # Extract non-zero tokens and assert the results.
@@ -100,6 +101,7 @@ def test_no_match(self):
             max_dec_len,
             3,
             3,
+            False,  # pad_to_max: match unchanged (no-pad) reference behavior
         )
 
         # No match → should only keep 1 token
diff --git a/tests/spec_decode/test_benchmark_ngram_kernel.py b/tests/spec_decode/test_benchmark_ngram_kernel.py
@@ -155,6 +155,7 @@ def _run_gpu(ngram_match_fn, gpu_data):
         gpu_data["max_dec_len"],
         MAX_NGRAM_SIZE,
         MAX_DRAFT_TOKENS,
+        False,  # pad_to_max: benchmark unrelated to cudagraph, measure no-pad cost
     )
 
 

Original file line number	Diff line number	Diff line change
`@@ -61,6 +61,7 @@ def test_basic_match(self):`
`61`	`61`	`max_dec_len,`
`62`	`62`	`3,`
`63`	`63`	`4,`
	`64`	`+ False, # pad_to_max: match unchanged (no-pad) reference behavior`
`64`	`65`	`)`
`65`	`66`
`66`	`67`	`# Extract non-zero tokens and assert the results.`
`@@ -100,6 +101,7 @@ def test_no_match(self):`
`100`	`101`	`max_dec_len,`
`101`	`102`	`3,`
`102`	`103`	`3,`
	`104`	`+ False, # pad_to_max: match unchanged (no-pad) reference behavior`
`103`	`105`	`)`
`104`	`106`
`105`	`107`	`# No match → should only keep 1 token`
Original file line number	Diff line number	Diff line change
`@@ -155,6 +155,7 @@ def _run_gpu(ngram_match_fn, gpu_data):`
`155`	`155`	`gpu_data["max_dec_len"],`
`156`	`156`	`MAX_NGRAM_SIZE,`
`157`	`157`	`MAX_DRAFT_TOKENS,`
	`158`	`+ False, # pad_to_max: benchmark unrelated to cudagraph, measure no-pad cost`
`158`	`159`	`)`
`159`	`160`
`160`	`161`