@@ -1739,18 +1739,22 @@ def test_llm_context_only_timed_out(transceiver_runtime):
17391739 disaggregated_params = disaggregated_params ):
17401740 print (output )
17411741
1742+ # Wait until the context-only request has allocated KV cache blocks
17421743 max_retries = 10
1744+ all_results = []
17431745 for _ in range (max_retries ):
17441746 results = llm .get_stats (2 )
1745- if len (results ) == 1 :
1747+ all_results .extend (results )
1748+ if all_results and all_results [- 1 ]["kvCacheStats" ]["usedNumBlocks" ] > 0 :
17461749 break
17471750 time .sleep (1 )
17481751 else :
17491752 pytest .fail (
1750- f"Failed to get stats with len==1 after { max_retries } retries" )
1753+ f"Context-only KV cache blocks not allocated after { max_retries } retries"
1754+ )
1755+ results = all_results
17511756
1752- assert len (results ) == 1
1753- context_only_used_num_blocks = results [0 ]["kvCacheStats" ]["usedNumBlocks" ]
1757+ context_only_used_num_blocks = results [- 1 ]["kvCacheStats" ]["usedNumBlocks" ]
17541758 print (f"Context only used num blocks: { context_only_used_num_blocks } " )
17551759
17561760 # Sleep 5 seconds to allow context only request to time out
@@ -1760,11 +1764,21 @@ def test_llm_context_only_timed_out(transceiver_runtime):
17601764 for output in llm .generate (prompts0 , sampling_params = sampling_params ):
17611765 print (output )
17621766
1763- # Get number of allocated blocks
1764- results = llm .get_stats (2 )
1765- assert len (results ) == 1
1766- final_used_num_blocks = results [0 ]["kvCacheStats" ]["usedNumBlocks" ]
1767+ # Wait until KV cache blocks are released (usedNumBlocks == 0)
1768+ max_retries = 10
1769+ all_results = []
1770+ for _ in range (max_retries ):
1771+ results = llm .get_stats (2 )
1772+ all_results .extend (results )
1773+ if all_results and all_results [- 1 ]["kvCacheStats" ]["usedNumBlocks" ] == 0 :
1774+ break
1775+ time .sleep (1 )
1776+ else :
1777+ pytest .fail (
1778+ f"KV cache blocks not released after { max_retries } retries" )
1779+ results = all_results
17671780
1781+ final_used_num_blocks = results [- 1 ]["kvCacheStats" ]["usedNumBlocks" ]
17681782 assert final_used_num_blocks == 0
17691783
17701784
0 commit comments