
Commit eb3e6ed

[fix][5875912] Fix autoquant-autodeploy example (#878)
## What does this PR do?

**Type of change:** Bug fix

**Overview:** See the linked bug ticket (5875912) for details.

## Usage

```python
# Add a code snippet demonstrating how to use this
```

## Testing

Tested with:

```
./scripts/run_auto_quant_and_deploy.sh --hf_ckpt ./models/Qwen/Qwen3-8B --save_quantized_ckpt ./qwen3_8B_autoquant --quant fp8 --effective_bits 10.0
```

## Before your PR is "*Ready for review*"

- **Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/Model-Optimizer/blob/main/CONTRIBUTING.md)** and your commits are signed.
- **Is this change backward compatible?**: Yes/No
- **Did you write any new necessary tests?**: Yes/No
- **Did you add or update any necessary documentation?**: Yes/No
- **Did you update [Changelog](https://github.com/NVIDIA/Model-Optimizer/blob/main/CHANGELOG.rst)?**: Yes/No

## Summary by CodeRabbit

* **Refactor**
  * Simplified LLM initialization by removing the intermediate configuration layer
  * Updated attention backend from triton to flashinfer

---

Signed-off-by: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com>
1 parent: 10efcb6 · commit: eb3e6ed

File tree

1 file changed (+3, −8 lines)

examples/llm_autodeploy/api_server.py

Lines changed: 3 additions & 8 deletions
```diff
@@ -20,8 +20,7 @@
 
 import uvicorn
 from fastapi import FastAPI, HTTPException
-from tensorrt_llm._torch.auto_deploy import LLM, AutoDeployConfig
-from tensorrt_llm.builder import BuildConfig
+from tensorrt_llm._torch.auto_deploy import LLM
 from tensorrt_llm.llmapi.llm import RequestOutput
 from tensorrt_llm.sampling_params import SamplingParams
 from tensorrt_llm.serve.openai_protocol import (
@@ -45,11 +44,8 @@ def build_runner_from_config(args) -> LLM:
     """Builds a model runner from our config."""
     mto.enable_huggingface_checkpointing()
     model_kwargs = {"max_position_embeddings": args.max_seq_len, "use_cache": False}
-    build_config = BuildConfig(max_seq_len=args.max_seq_len, max_batch_size=args.max_batch_size)
-    build_config.plugin_config.tokens_per_block = args.max_seq_len
 
-    # setup AD config
-    ad_config = AutoDeployConfig(
+    llm = LLM(
         model=args.ckpt_path,
         compile_backend=args.compile_backend,
         device=args.device,
@@ -58,9 +54,8 @@ def build_runner_from_config(args) -> LLM:
         max_seq_len=args.max_seq_len,
         max_num_tokens=args.max_num_tokens,
         model_kwargs=model_kwargs,
-        attn_backend="triton",
+        attn_backend="flashinfer",
     )
-    llm = LLM(**ad_config.to_llm_kwargs())
 
     return llm
```

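The effect of the change can be sketched without the `tensorrt_llm` dependency: after this fix, the options that previously went through `AutoDeployConfig`/`BuildConfig` are passed straight to `LLM(...)`. The sketch below assembles those keyword arguments as a plain dict for illustration; the helper name `build_llm_kwargs` and the sample argument values are hypothetical, while the keys mirror the diff above.

```python
# Illustration only: after the fix, no intermediate AutoDeployConfig or
# BuildConfig object is created; everything is passed directly to LLM().
# In the real example this dict's contents are keyword arguments to
# tensorrt_llm._torch.auto_deploy.LLM.
def build_llm_kwargs(ckpt_path: str, compile_backend: str, device: str,
                     max_seq_len: int, max_num_tokens: int) -> dict:
    # HuggingFace model overrides, as in the example
    model_kwargs = {"max_position_embeddings": max_seq_len, "use_cache": False}
    return {
        "model": ckpt_path,
        "compile_backend": compile_backend,
        "device": device,
        "max_seq_len": max_seq_len,
        "max_num_tokens": max_num_tokens,
        "model_kwargs": model_kwargs,
        "attn_backend": "flashinfer",  # was "triton" before this fix
    }

# Hypothetical sample values
kwargs = build_llm_kwargs("./models/Qwen/Qwen3-8B", "torch-compile", "cuda", 4096, 8192)
```

The real call is then simply `LLM(**kwargs)`, replacing the removed `LLM(**ad_config.to_llm_kwargs())` indirection.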