Skip to content

Commit 8e14fcf

Browse files
committed
fix autoquant-autodeploy example
Signed-off-by: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com>
1 parent 5e43b2a commit 8e14fcf

1 file changed

Lines changed: 3 additions & 5 deletions

File tree

examples/llm_autodeploy/api_server.py

Lines changed: 3 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,7 @@
2020

2121
import uvicorn
2222
from fastapi import FastAPI, HTTPException
23-
from tensorrt_llm._torch.auto_deploy import LLM, AutoDeployConfig
23+
from tensorrt_llm._torch.auto_deploy import LLM
2424
from tensorrt_llm.builder import BuildConfig
2525
from tensorrt_llm.llmapi.llm import RequestOutput
2626
from tensorrt_llm.sampling_params import SamplingParams
@@ -48,8 +48,7 @@ def build_runner_from_config(args) -> LLM:
4848
build_config = BuildConfig(max_seq_len=args.max_seq_len, max_batch_size=args.max_batch_size)
4949
build_config.plugin_config.tokens_per_block = args.max_seq_len
5050

51-
# setup AD config
52-
ad_config = AutoDeployConfig(
51+
llm = LLM(
5352
model=args.ckpt_path,
5453
compile_backend=args.compile_backend,
5554
device=args.device,
@@ -58,9 +57,8 @@ def build_runner_from_config(args) -> LLM:
5857
max_seq_len=args.max_seq_len,
5958
max_num_tokens=args.max_num_tokens,
6059
model_kwargs=model_kwargs,
61-
attn_backend="triton",
60+
attn_backend="flashinfer",
6261
)
63-
llm = LLM(**ad_config.to_llm_kwargs())
6462

6563
return llm
6664

0 commit comments

Comments (0)