diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py index 25ab939758a9..c0c0d79d23ea 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py @@ -673,10 +673,10 @@ def drafting_loop_wrapper(model): sm_version = get_sm_version() if kv_cache_config.enable_block_reuse and sm_version not in [ - 90, 100, 103, 120 + 90, 100, 103, 120, 121 ]: logger.warning( - f"KV cache reuse for MLA can only be enabled on SM90/SM100/SM103/SM120, " + f"KV cache reuse for MLA can only be enabled on SM90/SM100/SM103/SM120/SM121, " f"disable enable_block_reuse for SM{sm_version}") kv_cache_config.enable_block_reuse = False _set_model_engines_cache_reuse([model_engine, draft_model_engine], @@ -693,9 +693,9 @@ def drafting_loop_wrapper(model): kv_cache_config.enable_block_reuse = False _set_model_engines_cache_reuse([model_engine, draft_model_engine], False) - if enable_chunked_context and sm_version not in [90, 100, 103, 120]: + if enable_chunked_context and sm_version not in [90, 100, 103, 120, 121]: logger.warning( - "Chunked Prefill for MLA can only be enabled on SM90/SM100/SM103/SM120, " + "Chunked Prefill for MLA can only be enabled on SM90/SM100/SM103/SM120/SM121, " f"disable enable_chunked_context for SM{sm_version}") enable_chunked_context = False model_engine.attn_runtime_features.chunked_prefill = False