@@ -400,13 +400,18 @@ def get_trtllm_inputs(
400400 return trtllm_inputs
401401
402402 def get_scenarios (self ) -> List [MultimodalScenario ]:
403- """Minimal scenario sweep for the initial coverage.
404-
405- Starts with one image scenario, no CUDA graph / chunked
406- prefill / kv-cache reuse — those add additional surface area
407- (mRoPE handling under graph capture, multimodal cumsum under
408- chunking, etc.) that's worth adding incrementally once the
409- baseline parity passes.
403+ """Modality-sanity sweep (image / multiple_image / video).
404+
405+ These three catch differences in placeholder counts and the
406+ multimodal-cumsum path between single-image, multi-image, and
407+ video inputs. Chunked prefill and KV-cache reuse stay disabled.
408+
409+ CUDA-graph capture is intentionally not exercised here. The
410+ standard `attn_metadata.create_cuda_graph_metadata` path only
411+ addresses attention metadata; the Mamba SSM state buffer of the
412+ hybrid (Mamba + attention) cache is not threaded through, so
413+ replayed logits diverge from the HF reference. Adding that path
414+ is dedicated harness work and tracked separately.
410415 """
411416 return [
412417 MultimodalScenario (
@@ -415,6 +420,18 @@ def get_scenarios(self) -> List[MultimodalScenario]:
415420 chunked_prefill = False ,
416421 kv_cache_reuse = False ,
417422 ),
423+ MultimodalScenario (
424+ modality = "multiple_image" ,
425+ use_cuda_graph = False ,
426+ chunked_prefill = False ,
427+ kv_cache_reuse = False ,
428+ ),
429+ MultimodalScenario (
430+ modality = "video" ,
431+ use_cuda_graph = False ,
432+ chunked_prefill = False ,
433+ kv_cache_reuse = False ,
434+ ),
418435 ]
419436
420437 def test_construction_and_weight_loading_smoke (self ):
0 commit comments