Skip to content

Commit 39ab72e

Browse files
committed
add thread safety to cache operations and implement LRU eviction
1 parent 79407c4 commit 39ab72e

File tree

2 files changed

+30
-9
lines changed

2 files changed

+30
-9
lines changed

ajet/tuner_lib/experimental/as_oai_model_server.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424

2525
from loguru import logger
2626
from pydantic import BaseModel
27+
from functools import lru_cache
2728
from fastapi import FastAPI, Header, HTTPException, Request
2829
from fastapi.responses import StreamingResponse
2930
from contextlib import asynccontextmanager
@@ -63,6 +64,9 @@ class HealthCheckRequest(BaseModel):
6364
context = zmq.Context()
6465
atexit.register(context.term)
6566

67+
@lru_cache(maxsize=128)
def ep_key(episode_uuid: str) -> str:
    """Build the shared-memory dict key under which an episode's state is stored."""
    return "episodes-" + episode_uuid
6670

6771
def get_app(max_fastapi_threads: int = 512, enable_swarm_mode=False, shared_mem_dict=None, shared_mem_dict_lock=None) -> Tuple[FastAPI, Optional[Coroutine]]:
6872

@@ -100,6 +104,14 @@ def _begin_handle_chat_completion(episode_address, int_req: InterchangeCompletio
100104

101105
result_str = ""
102106
for _ in range(50): # max 5 minutes wait
107+
108+
if enable_swarm_mode:
109+
assert shared_mem_dict is not None
110+
ep_stat = shared_mem_dict[ep_key(episode_uuid)]
111+
episode_status = ep_stat.episode_status
112+
if episode_status != "claimed":
113+
raise HTTPException(status_code=404, detail="The episode is not claimed, cannot accept new requests.")
114+
103115
try:
104116
if DEBUG: logger.info(f"[server] episode_uuid: {episode_uuid} | recv_string begin.")
105117

ajet/utils/tokenizer.py

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import copy
22
import json
3+
import threading
34
from typing import Dict, List
45

56

@@ -21,6 +22,7 @@ def cleanup_messages(messages: List[Dict]) -> List[Dict]:
2122

2223
# Cache storage
2324
_cache = {}
25+
_cache_lock = threading.Lock()
2426

2527

2628
def ajet_apply_chat_template(
@@ -41,11 +43,12 @@ def ajet_apply_chat_template(
4143
tokenize,
4244
)
4345

44-
# Check cache
45-
if cache_key in _cache:
46-
return _cache[cache_key]
46+
# Check cache with thread safety
47+
with _cache_lock:
48+
if cache_key in _cache:
49+
return _cache[cache_key]
4750

48-
# Compute result
51+
# Compute result (time consuming) - outside lock to avoid blocking other threads
4952
if tools:
5053
result = tokenizer.apply_chat_template(
5154
conversation,
@@ -60,10 +63,16 @@ def ajet_apply_chat_template(
6063
add_generation_prompt=add_generation_prompt,
6164
)
6265

63-
# Store in cache (implement LRU eviction if cache gets too large)
64-
if len(_cache) >= 1024:
65-
# Remove oldest item (first inserted)
66-
_cache.pop(next(iter(_cache)))
66+
# Store in cache with thread safety (implement LRU eviction if cache gets too large)
67+
with _cache_lock:
68+
if len(_cache) >= 1024:
69+
# Remove oldest item (first inserted)
70+
try:
71+
_cache.pop(next(iter(_cache)))
72+
except KeyError:
73+
# Cache was modified by another thread, which is fine
74+
pass
75+
76+
_cache[cache_key] = result
6777

68-
_cache[cache_key] = result
6978
return result

0 commit comments

Comments
 (0)