nv-auto-deploy
diff --git a/examples/auto_deploy/model_registry/configs/openelm.yaml b/examples/auto_deploy/model_registry/configs/openelm.yaml
Lines changed: 6 additions & 2 deletions
diff --git a/tensorrt_llm/_torch/auto_deploy/models/custom/__init__.py b/tensorrt_llm/_torch/auto_deploy/models/custom/__init__.py
Lines changed: 2 additions & 0 deletions
@@ -1,3 +1,7 @@
-# Configuration for Apple OpenELM models
-# These models require Llama-2 tokenizer
+# Configuration for Apple OpenELM models (270M, 1.1B, 3B)
+# These models use the Llama-2 tokenizer (confirmed by Apple's CoreNet docs).
 tokenizer: meta-llama/Llama-2-7b-hf
+
+# Override dashboard_default's attn_backend=trtllm, which produces degenerate
+# output for OpenELM. Use flashinfer instead, which works with torch-cudagraph.
+attn_backend: flashinfer
@@ -22,6 +22,7 @@
 from .modeling_nemotron_flash import NemotronFlashForCausalLM, NemotronFlashPreTrainedTokenizerFast
 from .modeling_nemotron_h import NemotronHForCausalLM
 from .modeling_olmo3 import Olmo3ForCausalLM
+from .modeling_openelm import OpenELMForCausalLM
 from .modeling_qwen2 import Qwen2ForCausalLM
 from .modeling_qwen3_5_moe import Qwen3_5MoeForCausalLM, Qwen3_5MoeForConditionalGeneration
 from .modeling_qwen3_moe import Qwen3MoeForCausalLM
@@ -60,6 +61,7 @@
     "NemotronFlashPreTrainedTokenizerFast",
     "NemotronHForCausalLM",
     "Olmo3ForCausalLM",
+    "OpenELMForCausalLM",
     "Phi4ForCausalLM",
     "Phi4FlashForCausalLM",
     "Phi4MMForCausalLM",