Skip to content

Commit 8e14fcf

Browse files
committed
fix autoquant-autodeploy example
Signed-off-by: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com>
1 parent 5e43b2a commit 8e14fcf

1 file changed

Lines changed: 3 additions & 5 deletions

File tree

examples/llm_autodeploy/api_server.py

Lines changed: 3 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@
2020

2121
import uvicorn
2222
from fastapi import FastAPI, HTTPException
23-
from tensorrt_llm._torch.auto_deploy import LLM, AutoDeployConfig
23+
from tensorrt_llm._torch.auto_deploy import LLM
2424
from tensorrt_llm.builder import BuildConfig
2525
from tensorrt_llm.llmapi.llm import RequestOutput
2626
from tensorrt_llm.sampling_params import SamplingParams
@@ -48,8 +48,7 @@ def build_runner_from_config(args) -> LLM:
4848
build_config = BuildConfig(max_seq_len=args.max_seq_len, max_batch_size=args.max_batch_size)
4949
build_config.plugin_config.tokens_per_block = args.max_seq_len
5050

51-
# setup AD config
52-
ad_config = AutoDeployConfig(
51+
llm = LLM(
5352
model=args.ckpt_path,
5453
compile_backend=args.compile_backend,
5554
device=args.device,
@@ -58,9 +57,8 @@ def build_runner_from_config(args) -> LLM:
5857
max_seq_len=args.max_seq_len,
5958
max_num_tokens=args.max_num_tokens,
6059
model_kwargs=model_kwargs,
61-
attn_backend="triton",
60+
attn_backend="flashinfer",
6261
)
63-
llm = LLM(**ad_config.to_llm_kwargs())
6462

6563
return llm
6664

0 commit comments

Comments (0)