diff --git a/examples/llm_autodeploy/api_server.py b/examples/llm_autodeploy/api_server.py
index 6e7f9d53c7..0498ed8739 100644
--- a/examples/llm_autodeploy/api_server.py
+++ b/examples/llm_autodeploy/api_server.py
@@ -20,8 +20,7 @@ import uvicorn
 from fastapi import FastAPI, HTTPException
 
-from tensorrt_llm._torch.auto_deploy import LLM, AutoDeployConfig
-from tensorrt_llm.builder import BuildConfig
+from tensorrt_llm._torch.auto_deploy import LLM
 from tensorrt_llm.llmapi.llm import RequestOutput
 from tensorrt_llm.sampling_params import SamplingParams
 from tensorrt_llm.serve.openai_protocol import (
@@ -45,11 +44,8 @@ def build_runner_from_config(args) -> LLM:
     """Builds a model runner from our config."""
     mto.enable_huggingface_checkpointing()
     model_kwargs = {"max_position_embeddings": args.max_seq_len, "use_cache": False}
-    build_config = BuildConfig(max_seq_len=args.max_seq_len, max_batch_size=args.max_batch_size)
-    build_config.plugin_config.tokens_per_block = args.max_seq_len
 
-    # setup AD config
-    ad_config = AutoDeployConfig(
+    llm = LLM(
         model=args.ckpt_path,
         compile_backend=args.compile_backend,
         device=args.device,
@@ -58,9 +54,8 @@ def build_runner_from_config(args) -> LLM:
         max_seq_len=args.max_seq_len,
         max_num_tokens=args.max_num_tokens,
         model_kwargs=model_kwargs,
-        attn_backend="triton",
+        attn_backend="flashinfer",
     )
-    llm = LLM(**ad_config.to_llm_kwargs())
 
     return llm
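
For context, a minimal sketch of the calling pattern this diff moves to: the server now constructs the AutoDeploy `LLM` directly instead of assembling an `AutoDeployConfig` (plus a `BuildConfig`) and forwarding it via `to_llm_kwargs()`. Only the keyword names below come from the diff; the concrete values (checkpoint, compile backend, size limits) are illustrative assumptions, not taken from the repository.

```python
# Hedged sketch of the direct-construction pattern; values are assumptions.
from tensorrt_llm._torch.auto_deploy import LLM

max_seq_len = 4096  # assumed sequence-length budget for this example

llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",  # assumed HF checkpoint (args.ckpt_path in the server)
    compile_backend="torch-compile",           # assumed backend name (args.compile_backend in the server)
    device="cuda",
    max_seq_len=max_seq_len,
    max_num_tokens=8192,                       # assumed token budget
    model_kwargs={"max_position_embeddings": max_seq_len, "use_cache": False},
    attn_backend="flashinfer",                 # the attention backend this diff switches to
)
```

Passing these options straight to the `LLM` constructor removes the intermediate config objects, so the example no longer needs the `BuildConfig`/`plugin_config.tokens_per_block` plumbing that the old code carried alongside `AutoDeployConfig`.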