@@ -2708,6 +2708,114 @@ async def is_disconnected(self):
27082708 return True
27092709
27102710
class _FakeResetExecutor:
    """Test double that counts calls to ``reset_prefix_cache``.

    The counter is exposed as ``num_reset_calls`` so tests can assert how
    many times the reset was requested.
    """

    def __init__(self):
        # Number of times reset_prefix_cache() has been invoked so far.
        self.num_reset_calls = 0

    def reset_prefix_cache(self):
        """Record one more reset request; returns nothing."""
        self.num_reset_calls = self.num_reset_calls + 1

    def shutdown(self):
        """No-op: the fake holds nothing that needs releasing."""
        return None
2721+
2722+
class _FakeCollectiveResetExecutor:
    """Test double that records every ``collective_rpc`` invocation.

    Each call is stored in ``calls`` as a tuple of the received arguments,
    in signature order, so tests can compare against the exact call shape.
    """

    def __init__(self):
        # One tuple per collective_rpc() call, oldest first.
        self.calls = []

    def collective_rpc(self, method, args, kwargs, non_block,
                       unique_reply_rank, target_ranks):
        """Store the arguments of this call and return ``[None]``."""
        recorded = (method, args, kwargs, non_block, unique_reply_rank,
                    target_ranks)
        self.calls.append(recorded)
        return [None]

    def shutdown(self):
        """No-op: the fake holds nothing that needs releasing."""
        return None
2736+
2737+
class _FakeUnsupportedResetExecutor:
    """Test double that defines ``shutdown`` and nothing else.

    It has neither ``reset_prefix_cache`` nor ``collective_rpc``.
    """

    def shutdown(self):
        """No-op: the fake holds nothing that needs releasing."""
        return None
2742+
2743+
class _FakeNotImplementedResetGenerator:
    """Test double whose ``reset_prefix_cache`` always fails."""

    def reset_prefix_cache(self):
        """Unconditionally raise ``NotImplementedError``.

        Raises:
            NotImplementedError: on every call, with message
                ``"not supported"``.
        """
        error = NotImplementedError("not supported")
        raise error
2748+
2749+
def test_llm_reset_prefix_cache_dispatches_to_executor() -> None:
    """``reset_prefix_cache`` calls the executor's own reset exactly once."""
    # Arrange: build the object without running __init__ and set only the
    # two attributes this test provides.
    llm = object.__new__(LLM_torch)
    llm._encode_only = False
    llm._executor = _FakeResetExecutor()

    # Act
    llm.reset_prefix_cache()

    # Assert
    observed_calls = llm._executor.num_reset_calls
    assert observed_calls == 1
2758+
2759+
def test_llm_reset_prefix_cache_uses_collective_rpc() -> None:
    """``reset_prefix_cache`` issues one ``collective_rpc`` call.

    The executor fake only offers ``collective_rpc``; the test pins the
    exact argument tuple that the fake is expected to record.
    """
    # Arguments in collective_rpc signature order: method, args, kwargs,
    # non_block, unique_reply_rank, target_ranks.
    expected_call = ("reset_prefix_cache", (), None, False, None, None)

    llm = object.__new__(LLM_torch)
    llm._encode_only = False
    llm._executor = _FakeCollectiveResetExecutor()

    llm.reset_prefix_cache()

    assert llm._executor.calls == [expected_call]
2769+
2770+
def test_llm_reset_prefix_cache_rejects_encode_only() -> None:
    """``reset_prefix_cache`` raises ``RuntimeError`` when encode-only."""
    expected_message = "encode_only=True"

    # Build the object without running __init__, flagged as encode-only.
    llm = object.__new__(LLM_torch)
    llm._encode_only = True
    llm._executor = _FakeResetExecutor()

    with pytest.raises(RuntimeError, match=expected_message):
        llm.reset_prefix_cache()
2778+
2779+
def test_llm_reset_prefix_cache_rejects_unsupported_executor() -> None:
    """An executor with no reset entry point yields ``NotImplementedError``."""
    expected_message = "only supported by the PyTorch backend"

    # The fake executor exposes shutdown() only.
    llm = object.__new__(LLM_torch)
    llm._encode_only = False
    llm._executor = _FakeUnsupportedResetExecutor()

    with pytest.raises(NotImplementedError, match=expected_message):
        llm.reset_prefix_cache()
2788+
2789+
def test_openai_reset_prefix_cache_endpoint() -> None:
    """The endpoint answers 200 and resets the generator exactly once."""
    # Build the server without running __init__; only the generator is set.
    server = object.__new__(OpenAIServer)
    server.generator = _FakeResetExecutor()

    # The handler is a coroutine function, so drive it to completion here.
    pending = server.reset_prefix_cache()
    response = asyncio.run(pending)

    assert response.status_code == 200
    assert server.generator.num_reset_calls == 1
2798+
2799+
def test_openai_reset_prefix_cache_endpoint_rejects_unsupported_generator(
) -> None:
    """A generator without ``reset_prefix_cache`` produces a 501 response."""
    # A bare object() has no reset_prefix_cache attribute at all.
    server = object.__new__(OpenAIServer)
    server.generator = object()

    pending = server.reset_prefix_cache()
    response = asyncio.run(pending)

    expected_status = 501
    assert response.status_code == expected_status
2808+
2809+
def test_openai_reset_prefix_cache_endpoint_maps_not_implemented() -> None:
    """``NotImplementedError`` from the generator becomes a 501 response."""
    # This generator's reset_prefix_cache() always raises NotImplementedError.
    server = object.__new__(OpenAIServer)
    server.generator = _FakeNotImplementedResetGenerator()

    pending = server.reset_prefix_cache()
    response = asyncio.run(pending)

    expected_status = 501
    assert response.status_code == expected_status
2817+
2818+
27112819def test_openai_completion_list_prompt_stream_reuses_stream_metadata () -> None :
27122820
27132821 async def run_request ():
0 commit comments