examples/llm_autodeploy/api_server.py (11 changes: 3 additions, 8 deletions)

--- a/examples/llm_autodeploy/api_server.py
+++ b/examples/llm_autodeploy/api_server.py
@@ -20,8 +20,7 @@
 
 import uvicorn
 from fastapi import FastAPI, HTTPException
-from tensorrt_llm._torch.auto_deploy import LLM, AutoDeployConfig
-from tensorrt_llm.builder import BuildConfig
+from tensorrt_llm._torch.auto_deploy import LLM
 from tensorrt_llm.llmapi.llm import RequestOutput
 from tensorrt_llm.sampling_params import SamplingParams
 from tensorrt_llm.serve.openai_protocol import (
@@ -45,11 +44,8 @@ def build_runner_from_config(args) -> LLM:
     """Builds a model runner from our config."""
     mto.enable_huggingface_checkpointing()
     model_kwargs = {"max_position_embeddings": args.max_seq_len, "use_cache": False}
-    build_config = BuildConfig(max_seq_len=args.max_seq_len, max_batch_size=args.max_batch_size)
-    build_config.plugin_config.tokens_per_block = args.max_seq_len
 
-    # setup AD config
-    ad_config = AutoDeployConfig(
+    llm = LLM(
         model=args.ckpt_path,
         compile_backend=args.compile_backend,
         device=args.device,
@@ -58,9 +54,8 @@
         max_seq_len=args.max_seq_len,
         max_num_tokens=args.max_num_tokens,
         model_kwargs=model_kwargs,
-        attn_backend="triton",
+        attn_backend="flashinfer",
     )
-    llm = LLM(**ad_config.to_llm_kwargs())
 
     return llm
 
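In short, the change collapses the old two-step setup (build an AutoDeployConfig, then expand it into LLM kwargs via to_llm_kwargs()) into a single direct LLM(...) call, drops the now-unused BuildConfig plumbing, and switches the attention backend from "triton" to "flashinfer". Below is a minimal sketch of the resulting construction, using only the keyword arguments visible in the diff; the concrete values are hypothetical placeholders (the example itself takes them from argparse), and the example's mto.enable_huggingface_checkpointing() call is omitted for brevity.

# Minimal sketch, not from the PR: direct construction of the AutoDeploy LLM.
from tensorrt_llm._torch.auto_deploy import LLM

max_seq_len = 4096  # hypothetical; the example reads this from args.max_seq_len

llm = LLM(
    model="/path/to/checkpoint",      # hypothetical path (args.ckpt_path in the example)
    compile_backend="torch-compile",  # hypothetical choice (args.compile_backend)
    device="cuda",
    max_seq_len=max_seq_len,
    max_num_tokens=8192,              # hypothetical (args.max_num_tokens)
    model_kwargs={"max_position_embeddings": max_seq_len, "use_cache": False},
    attn_backend="flashinfer",        # the backend this PR switches to
)

One side effect worth noting: the deleted BuildConfig lines were the only consumers of args.max_batch_size within the shown hunks, so that argument no longer influences the construction here.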