[None][fix] AutoDeploy: Fixed wrong dist_backend AUTO detection when using trtllm-llmapi-launch (NVIDIA#15423)

MrGeva · GitLab CI Bot · commit b72fb896cca3 · 2026-06-24T03:01:10.000Z
Signed-off-by: Eran Geva <19514940+MrGeva@users.noreply.github.com>
Signed-off-by: GitLab CI Bot <gitlab-ci@nvidia.com>
diff --git a/tensorrt_llm/_torch/auto_deploy/custom_ops/distributed/trtllm_dist.py b/tensorrt_llm/_torch/auto_deploy/custom_ops/distributed/trtllm_dist.py
@@ -19,6 +19,7 @@
 The torch fallback variants are defined separately to enable multi-pattern matching.
 """
 
+import os
 from typing import List, Optional
 
 import torch
@@ -295,5 +296,18 @@ def trtllm_fused_allreduce_residual_rmsnorm_out_quant_nvfp4_fake(
 
 
 def is_trtllm_op_available():
-    """Check if TRT-LLM ops are available and running with MPI."""
-    return is_ompi()
+    """Check if TRT-LLM ops are available for AutoDeploy collectives."""
+    if is_ompi():
+        return True
+
+    # trtllm-llmapi-launch intentionally removes OMPI/SLURM variables from
+    # the trtllm-serve child to avoid duplicate MPI initialization. It leaves
+    # these launcher-specific variables so the child can bind to pre-spawned
+    # LLMAPI worker ranks.
+    if os.getenv("TLLM_SPAWN_PROXY_PROCESS") == "1":
+        try:
+            return int(os.getenv("tllm_mpi_size") or "1") > 1  # unset or empty counts as 1 rank
+        except ValueError:  # tllm_mpi_size is set but not an integer
+            return False
+
+    return False  # neither is_ompi() nor the launcher proxy variables matched
diff --git a/tensorrt_llm/_torch/auto_deploy/transform/library/sharding.py b/tensorrt_llm/_torch/auto_deploy/transform/library/sharding.py
@@ -1482,6 +1482,20 @@ def validate_allreduce_strategy(v):
     return v  # Let Pydantic handle other types
 
 
+_LOGGED_DIST_BACKEND_CHOICES: set[tuple[str, str]] = set()  # (configured, resolved) pairs logged
+
+
+def _log_dist_backend_choice(configured_backend: str, resolved_backend: str):
+    key = (configured_backend, resolved_backend)
+    if key in _LOGGED_DIST_BACKEND_CHOICES:  # this pair was already reported; stay quiet
+        return
+    _LOGGED_DIST_BACKEND_CHOICES.add(key)  # record before logging so repeat calls are skipped
+    ad_logger.info(
+        f"AutoDeploy selected distributed backend: {resolved_backend} "
+        f"(configured: {configured_backend})"
+    )
+
+
 def _get_dist_ops(backend: str):
     """Get the (all_gather, all_reduce) op pair for *backend*.
 
@@ -1492,12 +1506,27 @@ def _get_dist_ops(backend: str):
     """
     if hasattr(backend, "value"):
         backend = backend.value
+    configured_backend = str(backend)
 
-    if backend == "trtllm" or is_trtllm_op_available():
+    if backend == "trtllm":
+        _log_dist_backend_choice(configured_backend, "trtllm")
+        return (
+            torch.ops.auto_deploy.trtllm_dist_all_gather.default,
+            torch.ops.auto_deploy.trtllm_dist_all_reduce.default,
+        )
+    if backend == "torch":
+        _log_dist_backend_choice(configured_backend, "torch")
+        return (
+            torch.ops.auto_deploy.torch_dist_all_gather.default,
+            torch.ops.auto_deploy.torch_dist_all_reduce.default,
+        )
+    if is_trtllm_op_available():
+        _log_dist_backend_choice(configured_backend, "trtllm")
         return (
             torch.ops.auto_deploy.trtllm_dist_all_gather.default,
             torch.ops.auto_deploy.trtllm_dist_all_reduce.default,
         )
+    _log_dist_backend_choice(configured_backend, "torch")
     return (
         torch.ops.auto_deploy.torch_dist_all_gather.default,
         torch.ops.auto_deploy.torch_dist_all_reduce.default,
diff --git a/tests/unittest/auto_deploy/multigpu/transformations/library/test_dist_backend.py b/tests/unittest/auto_deploy/multigpu/transformations/library/test_dist_backend.py
@@ -21,6 +21,9 @@
 import torch.nn as nn
 import torch.nn.functional as F
 
+from tensorrt_llm._torch.auto_deploy.custom_ops.distributed.trtllm_dist import (
+    is_trtllm_op_available,
+)
 from tensorrt_llm._torch.auto_deploy.export import torch_export_to_gm
 from tensorrt_llm._torch.auto_deploy.transform.optimizer import InferenceOptimizer
 from tensorrt_llm._torch.auto_deploy.utils.node_utils import is_op
@@ -155,6 +158,27 @@ def test_dist_backend_auto_and_default(dist_backend):
     _check_dist_ops(gm_transformed, expected_backend="any")
 
 
+def test_trtllm_ops_available_with_llmapi_launch_env(monkeypatch):
+    """LLMAPI launcher strips OMPI env but still provides TRT-LLM worker ranks."""
+    monkeypatch.delenv("OMPI_COMM_WORLD_SIZE", raising=False)  # presumably what is_ompi() checks
+    monkeypatch.setenv("TLLM_SPAWN_PROXY_PROCESS", "1")  # must equal "1" for the launcher path
+    monkeypatch.setenv("tllm_mpi_size", "2")  # must parse to an int > 1
+
+    assert is_trtllm_op_available()
+
+
+@pytest.mark.parametrize("dist_backend", ["auto", None])  # None: presumably the default path
+def test_dist_backend_auto_uses_trtllm_with_llmapi_launch_env(monkeypatch, dist_backend):
+    """AUTO should select TRT-LLM ops under trtllm-llmapi-launch."""
+    monkeypatch.delenv("OMPI_COMM_WORLD_SIZE", raising=False)  # presumably what is_ompi() checks
+    monkeypatch.setenv("TLLM_SPAWN_PROXY_PROCESS", "1")  # mimic trtllm-llmapi-launch child env
+    monkeypatch.setenv("tllm_mpi_size", "2")  # more than one rank, so TRT-LLM ops apply
+
+    model = SimpleMLP()
+    gm_transformed = _create_and_transform_model(model, dist_backend=dist_backend, world_size=2)
+    _check_dist_ops(gm_transformed, expected_backend="trtllm")  # explicit, not "any"
+
+
 @pytest.mark.parametrize("dist_backend", ["torch", "trtllm"])
 def test_dist_backend_all_gather(dist_backend):
     """Test dist_backend with all_gather operations (column sharding with single Linear)."""