Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 12 additions & 8 deletions tensorrt_llm/_torch/visual_gen/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,16 +170,19 @@ def _cuda_profiler_stop(self):
logger.info("CUDA profiler stopped")

def _setup_cuda_graphs(self):
"""Wrap all transformer components with CUDA graph capture/replay."""
"""Wrap all transformer components with CUDA graph capture/replay.

Composes with torch.compile: the runner wraps the (outer) transformer
``forward`` while torch.compile compiles the inner transformer blocks
(see ``torch_compile``). Graph capture happens during warmup, by which
point the runner's own ``WARMUP_STEPS`` eager iterations have already
triggered torch.compile's lazy compilation, so the captured graph
contains the optimized compiled kernels. (The ``LTX2Pipeline`` override
relies on the same ordering.)
"""
if not self.pipeline_config.cuda_graph.enable:
return

if self.pipeline_config.torch_compile.enable:
logger.warning(
"CUDA graphs with torch.compile not yet supported. Using torch.compile only."
)
return

if len(self.transformer_components) > 1:
logger.info(
"CUDA graph runner: multiple transformer components, using shared graph pool"
Expand All @@ -188,6 +191,7 @@ def _setup_cuda_graphs(self):
else:
shared_pool = None

compile_note = " (with torch.compile)" if self.pipeline_config.torch_compile.enable else ""
for name in self.transformer_components:
model = getattr(self, name, None)
if model is None:
Expand All @@ -198,7 +202,7 @@ def _setup_cuda_graphs(self):
shared_pool,
)
model.register_cuda_graph_extra_key_fns(runner)
logger.info(f"CUDA graph runner: wrapping {name}.forward")
logger.info(f"CUDA graph runner: wrapping {name}.forward{compile_note}")
model.forward = runner.wrap(model.forward)
self._cuda_graph_runners[name] = runner

Expand Down
1 change: 1 addition & 0 deletions tests/integration/test_lists/waives.txt
Original file line number Diff line number Diff line change
Expand Up @@ -453,6 +453,7 @@ unittest/_torch/multi_gpu/test_user_buffers.py::test_user_buffers_pass[2-fp16-_t
unittest/_torch/ray_orchestrator/multi_gpu/test_llm_update_weights_multi_gpu.py::test_llm_partial_update_weights_nvfp4[auto-Qwen3/Qwen3-8B] SKIP (https://nvbugs/6372690)
unittest/_torch/ray_orchestrator/multi_gpu/test_llm_update_weights_multi_gpu.py::test_llm_partial_update_weights_nvfp4[fp8-Qwen3/Qwen3-30B-A3B] SKIP (https://nvbugs/6372690)
unittest/_torch/ray_orchestrator/multi_gpu/test_llm_update_weights_multi_gpu.py::test_llm_partial_update_weights_nvfp4[fp8-Qwen3/Qwen3-8B] SKIP (https://nvbugs/6372690)
unittest/_torch/sampler/test_beam_search_speculative_d2h.py::test_speculative_d2h_predictor_hit_is_sync_free SKIP (https://nvbugs/6378901)
unittest/_torch/thop/parallel/test_fp8_rowwise_linear.py::test_fp8_rowwise_linear[dtype1] SKIP (https://nvbugs/6301807)
unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_no_autotune[use_score_as_input-RoutingDSv3-swiglu-1024-1024-1] SKIP (https://nvbugs/5908070)
unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_no_autotune[use_score_as_input-RoutingRenormalize_qwen_next-swiglu-1024-1024-150] SKIP (https://nvbugs/5908070)
Expand Down
Loading