Skip to content

Commit e28dd0a

Browse files
authored
fix: prevent startup hang from blocking VLM call in redo recovery (#1226)
Move redo recovery to a background task so the server starts without waiting for potentially slow VLM calls. Add a 60s timeout to extract_long_term_memories to prevent indefinite hangs on individual redo tasks. Clean up the redo task on stop().

Fixes #1222

Co-authored-by: Matt Van Horn <455140+mvanhorn@users.noreply.github.com>
1 parent 31577da commit e28dd0a

1 file changed

Lines changed: 17 additions & 6 deletions

File tree

openviking/storage/transaction/lock_manager.py

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ def __init__(
3333
self._redo_log = RedoLog(agfs)
3434
self._handles: Dict[str, LockHandle] = {}
3535
self._cleanup_task: Optional[asyncio.Task] = None
36+
self._redo_task: Optional[asyncio.Task] = None
3637
self._running = False
3738

3839
@property
@@ -54,11 +55,18 @@ async def start(self) -> None:
5455
"""Start background cleanup and redo recovery."""
5556
self._running = True
5657
self._cleanup_task = asyncio.create_task(self._stale_cleanup_loop())
57-
await self._recover_pending_redo()
58+
self._redo_task = asyncio.create_task(self._recover_pending_redo())
5859

5960
async def stop(self) -> None:
6061
"""Stop cleanup and release all active locks."""
6162
self._running = False
63+
if self._redo_task:
64+
self._redo_task.cancel()
65+
try:
66+
await self._redo_task
67+
except asyncio.CancelledError:
68+
pass
69+
self._redo_task = None
6270
if self._cleanup_task:
6371
self._cleanup_task.cancel()
6472
try:
@@ -299,11 +307,14 @@ async def _redo_session_memory(self, info: Dict[str, Any]) -> None:
299307
from openviking.session import create_session_compressor
300308

301309
compressor = create_session_compressor(vikingdb=None)
302-
memories = await compressor.extract_long_term_memories(
303-
messages=messages,
304-
user=user,
305-
session_id=session_id,
306-
ctx=ctx,
310+
memories = await asyncio.wait_for(
311+
compressor.extract_long_term_memories(
312+
messages=messages,
313+
user=user,
314+
session_id=session_id,
315+
ctx=ctx,
316+
),
317+
timeout=60.0,
307318
)
308319
logger.info(f"Redo: extracted {len(memories)} memories from {archive_uri}")
309320
except Exception as e:

0 commit comments

Comments
 (0)