[None][feat] Add PyTorch reset_prefix_cache API (#14970)

milesial · web-flow · commit ae9226e2852d · 2026-06-11T17:58:08.000-07:00
Signed-off-by: milesial <milesial@users.noreply.github.com>
diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py
@@ -5156,6 +5156,9 @@ def _handle_speculative_decoding(
         return new_target_inputs, num_accepted_tokens_device
 
     def reset_prefix_cache(self):
+        if self.active_requests or self.waiting_queue:
+            raise RuntimeError(
+                "reset_prefix_cache() requires no active or queued requests.")
         self.kv_cache_manager.reset_reuse_state()
 
     def _handle_guided_decoder_errors(
diff --git a/tensorrt_llm/executor/base_worker.py b/tensorrt_llm/executor/base_worker.py
@@ -788,6 +788,16 @@ def wakeup(self, wakeup_tags: List[str]) -> None:
             materialize_with_tag(*tags)
             torch.cuda.synchronize()
 
+    def reset_prefix_cache(self) -> None:
+        """Invalidate local KV prefix-cache reuse state on PyTorch engines."""
+        engine = self.engine
+        if engine is None or not hasattr(engine, "reset_prefix_cache"):
+            raise NotImplementedError(
+                "reset_prefix_cache() is only supported by the PyTorch backend."
+            )
+        with engine.control_action():
+            engine.reset_prefix_cache()
+
     def shutdown(self):
         if self.doing_shutdown:
             return
diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py
@@ -1494,6 +1494,33 @@ def _collective_rpc(
                 f"Executor type {type(self._executor)} does not support collective RPC."
             )
 
+    @set_api_status("beta")
+    def reset_prefix_cache(self) -> None:
+        """Reset local KV prefix-cache reuse state.
+
+        This invalidates local prefix-cache metadata in the PyTorch backend. It
+        requires no active or queued requests, and it does not reset
+        connector-managed external or offloaded cache state. Callers should
+        quiesce traffic before invoking this method.
+        """
+        if self._encode_only:
+            raise RuntimeError("reset_prefix_cache() is not available when "
+                               "encode_only=True.")
+        if self._executor is None:
+            raise RuntimeError("reset_prefix_cache() requires an active "
+                               "executor.")
+
+        if hasattr(self._executor, "collective_rpc"):
+            self._collective_rpc("reset_prefix_cache")
+            return
+
+        reset_prefix_cache = getattr(self._executor, "reset_prefix_cache", None)
+        if reset_prefix_cache is None:
+            raise NotImplementedError(
+                "reset_prefix_cache() is only supported by the PyTorch backend."
+            )
+        reset_prefix_cache()
+
     def _build_model(self):
         super()._build_model()
         assert self._engine_dir is None
diff --git a/tensorrt_llm/serve/openai_server.py b/tensorrt_llm/serve/openai_server.py
@@ -666,6 +666,9 @@ def register_routes(self):
         self.app.add_api_route("/kv_cache_events",
                                self.get_kv_cache_events,
                                methods=["POST"])
+        self.app.add_api_route("/reset_prefix_cache",
+                               self.reset_prefix_cache,
+                               methods=["POST"])
         resource_governor_queue = self.generator._executor.resource_governor_queue
         if resource_governor_queue is not None:
             from .resource_governor import ResourceGovernor
@@ -1027,6 +1030,27 @@ async def get_kv_cache_events(self) -> JSONResponse:
             pass
         return JSONResponse(content=events)
 
+    async def reset_prefix_cache(self) -> Response:
+        reset_prefix_cache = getattr(self.generator, "reset_prefix_cache", None)
+        if reset_prefix_cache is None:
+            return self._create_not_supported_error(
+                "reset_prefix_cache() is only supported by the PyTorch backend."
+            )
+
+        try:
+            await asyncio.get_running_loop().run_in_executor(
+                None, reset_prefix_cache)
+        except NotImplementedError as e:
+            return self._create_not_supported_error(str(e))
+        except (RuntimeError, ValueError) as e:
+            return self.create_error_response(
+                message=str(e),
+                err_type="InvalidRequestError",
+                status_code=HTTPStatus.CONFLICT,
+            )
+
+        return Response(status_code=200)
+
     async def _extract_metrics(self, res: RequestOutput, raw_request: Request):
         if not res.finished:
             return
diff --git a/tests/unittest/_torch/executor/test_py_executor.py b/tests/unittest/_torch/executor/test_py_executor.py
@@ -133,6 +133,41 @@ def test_handle_special_queue_items(mock_executor):
     assert 2 in mock_executor.canceled_req_ids
 
 
+def test_reset_prefix_cache_resets_when_idle():
+    executor = object.__new__(PyExecutor)
+    executor.active_requests = []
+    executor.waiting_queue = []
+    executor.kv_cache_manager = Mock()
+
+    executor.reset_prefix_cache()
+
+    executor.kv_cache_manager.reset_reuse_state.assert_called_once_with()
+
+
+def test_reset_prefix_cache_rejects_active_requests():
+    executor = object.__new__(PyExecutor)
+    executor.active_requests = [Mock()]
+    executor.waiting_queue = []
+    executor.kv_cache_manager = Mock()
+
+    with pytest.raises(RuntimeError, match="no active or queued requests"):
+        executor.reset_prefix_cache()
+
+    executor.kv_cache_manager.reset_reuse_state.assert_not_called()
+
+
+def test_reset_prefix_cache_rejects_queued_requests():
+    executor = object.__new__(PyExecutor)
+    executor.active_requests = []
+    executor.waiting_queue = [Mock()]
+    executor.kv_cache_manager = Mock()
+
+    with pytest.raises(RuntimeError, match="no active or queued requests"):
+        executor.reset_prefix_cache()
+
+    executor.kv_cache_manager.reset_reuse_state.assert_not_called()
+
+
 def test_clear_canceled_req_ids(mock_executor):
     """Test clearing canceled request IDs."""
     mock_executor.canceled_req_ids = [1, 2, 3]
diff --git a/tests/unittest/api_stability/references/llm.yaml b/tests/unittest/api_stability/references/llm.yaml
@@ -387,6 +387,10 @@ methods:
         default: 2
     return_annotation: tensorrt_llm.executor.result.IterationResult
     status: beta
+  reset_prefix_cache:
+    parameters: {}
+    return_annotation: None
+    status: beta
   get_stats:
     parameters:
       timeout:
diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py
@@ -2708,6 +2708,114 @@ async def is_disconnected(self):
         return True
 
 
+class _FakeResetExecutor:
+
+    def __init__(self):
+        self.num_reset_calls = 0
+
+    def reset_prefix_cache(self):
+        self.num_reset_calls += 1
+
+    def shutdown(self):
+        pass
+
+
+class _FakeCollectiveResetExecutor:
+
+    def __init__(self):
+        self.calls = []
+
+    def collective_rpc(self, method, args, kwargs, non_block, unique_reply_rank,
+                       target_ranks):
+        self.calls.append(
+            (method, args, kwargs, non_block, unique_reply_rank, target_ranks))
+        return [None]
+
+    def shutdown(self):
+        pass
+
+
+class _FakeUnsupportedResetExecutor:
+
+    def shutdown(self):
+        pass
+
+
+class _FakeNotImplementedResetGenerator:
+
+    def reset_prefix_cache(self):
+        raise NotImplementedError("not supported")
+
+
+def test_llm_reset_prefix_cache_dispatches_to_executor() -> None:
+    llm = object.__new__(LLM_torch)
+    llm._encode_only = False
+    llm._executor = _FakeResetExecutor()
+
+    llm.reset_prefix_cache()
+
+    assert llm._executor.num_reset_calls == 1
+
+
+def test_llm_reset_prefix_cache_uses_collective_rpc() -> None:
+    llm = object.__new__(LLM_torch)
+    llm._encode_only = False
+    llm._executor = _FakeCollectiveResetExecutor()
+
+    llm.reset_prefix_cache()
+
+    assert llm._executor.calls == [("reset_prefix_cache", (), None, False, None,
+                                    None)]
+
+
+def test_llm_reset_prefix_cache_rejects_encode_only() -> None:
+    llm = object.__new__(LLM_torch)
+    llm._encode_only = True
+    llm._executor = _FakeResetExecutor()
+
+    with pytest.raises(RuntimeError, match="encode_only=True"):
+        llm.reset_prefix_cache()
+
+
+def test_llm_reset_prefix_cache_rejects_unsupported_executor() -> None:
+    llm = object.__new__(LLM_torch)
+    llm._encode_only = False
+    llm._executor = _FakeUnsupportedResetExecutor()
+
+    with pytest.raises(NotImplementedError,
+                       match="only supported by the PyTorch backend"):
+        llm.reset_prefix_cache()
+
+
+def test_openai_reset_prefix_cache_endpoint() -> None:
+    server = object.__new__(OpenAIServer)
+    server.generator = _FakeResetExecutor()
+
+    response = asyncio.run(server.reset_prefix_cache())
+
+    assert response.status_code == 200
+    assert server.generator.num_reset_calls == 1
+
+
+def test_openai_reset_prefix_cache_endpoint_rejects_unsupported_generator(
+) -> None:
+    server = object.__new__(OpenAIServer)
+    server.generator = object()
+
+    response = asyncio.run(server.reset_prefix_cache())
+
+    assert response.status_code == 501
+
+
+def test_openai_reset_prefix_cache_endpoint_maps_not_implemented() -> None:
+    server = object.__new__(OpenAIServer)
+    server.generator = _FakeNotImplementedResetGenerator()
+
+    response = asyncio.run(server.reset_prefix_cache())
+
+    assert response.status_code == 501
+
+
 def test_openai_completion_list_prompt_stream_reuses_stream_metadata() -> None:
 
     async def run_request():