Skip to content

Commit b84f798

Browse files
authored
feat(transaction): add redo recovery toggle for session commits (#1934)
Allow crash-recovery redo for session commit phase 2 to be disabled via configuration while keeping the default behavior unchanged. This lets deployments skip pending redo marker writes and startup redo recovery when the mechanism is not wanted.
1 parent 3c31cc4 commit b84f798

7 files changed

Lines changed: 292 additions & 123 deletions

File tree

openviking/service/core.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@ def _init_storage(
147147
agfs=self._agfs_client,
148148
lock_timeout=tx_cfg.lock_timeout,
149149
lock_expire=tx_cfg.lock_expire,
150+
redo_recovery_enabled=tx_cfg.redo_recovery_enabled,
150151
)
151152

152153
@property

openviking/session/session.py

Lines changed: 212 additions & 121 deletions
Large diffs are not rendered by default.

openviking/storage/transaction/lock_manager.py

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,10 +26,12 @@ def __init__(
2626
agfs: AGFSClient,
2727
lock_timeout: float = 0.0,
2828
lock_expire: float = 300.0,
29+
redo_recovery_enabled: bool = True,
2930
):
3031
self._agfs = agfs
3132
self._path_lock = PathLock(agfs, lock_expire=lock_expire)
3233
self._lock_timeout = lock_timeout
34+
self._redo_recovery_enabled = redo_recovery_enabled
3335
self._redo_log = RedoLog(agfs)
3436
self._handles: Dict[str, LockHandle] = {}
3537
self._cleanup_task: Optional[asyncio.Task] = None
@@ -40,6 +42,10 @@ def __init__(
4042
def redo_log(self) -> RedoLog:
4143
return self._redo_log
4244

45+
@property
46+
def redo_recovery_enabled(self) -> bool:
47+
return self._redo_recovery_enabled
48+
4349
def _mark_handle_active(self, handle: LockHandle) -> None:
4450
handle.last_active_at = time.time()
4551

@@ -55,7 +61,10 @@ async def start(self) -> None:
5561
"""Start background cleanup and redo recovery."""
5662
self._running = True
5763
self._cleanup_task = asyncio.create_task(self._stale_cleanup_loop())
58-
self._redo_task = asyncio.create_task(self._recover_pending_redo())
64+
if self._redo_recovery_enabled:
65+
self._redo_task = asyncio.create_task(self._recover_pending_redo())
66+
else:
67+
logger.info("Redo recovery disabled by config; skipping pending redo recovery")
5968

6069
async def stop(self) -> None:
6170
"""Stop cleanup and release all active locks."""
@@ -367,9 +376,15 @@ def init_lock_manager(
367376
agfs: AGFSClient,
368377
lock_timeout: float = 0.0,
369378
lock_expire: float = 300.0,
379+
redo_recovery_enabled: bool = True,
370380
) -> LockManager:
371381
global _lock_manager
372-
_lock_manager = LockManager(agfs=agfs, lock_timeout=lock_timeout, lock_expire=lock_expire)
382+
_lock_manager = LockManager(
383+
agfs=agfs,
384+
lock_timeout=lock_timeout,
385+
lock_expire=lock_expire,
386+
redo_recovery_enabled=redo_recovery_enabled,
387+
)
373388
return _lock_manager
374389

375390

openviking_cli/utils/config/transaction_config.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,4 +29,12 @@ class TransactionConfig(BaseModel):
2929
),
3030
)
3131

32+
redo_recovery_enabled: bool = Field(
33+
default=True,
34+
description=(
35+
"Enable session commit phase-2 crash-recovery redo. "
36+
"When false, pending redo markers are not written and startup redo recovery is skipped."
37+
),
38+
)
39+
3240
model_config = {"extra": "forbid"}

tests/session/test_session_commit.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,15 @@
55

66
import asyncio
77
import json
8+
from unittest.mock import MagicMock
89

910
import pytest
1011

1112
from openviking import AsyncOpenViking
1213
from openviking.message import TextPart
1314
from openviking.service.task_tracker import get_task_tracker
1415
from openviking.session import Session
16+
from openviking.storage.transaction import get_lock_manager
1517
from openviking_cli.exceptions import FailedPreconditionError
1618

1719

@@ -236,3 +238,20 @@ async def failing_extract(*args, **kwargs):
236238
session.add_message("user", [TextPart("Second round message")])
237239
with pytest.raises(FailedPreconditionError, match="unresolved failed archive"):
238240
await session.commit_async()
241+
242+
async def test_commit_skips_redo_when_recovery_disabled(
243+
self, session_with_messages: Session, monkeypatch: pytest.MonkeyPatch
244+
):
245+
"""Phase 2 should not write or clear redo markers when redo recovery is disabled."""
246+
247+
redo_log = MagicMock()
248+
lock_manager = get_lock_manager()
249+
monkeypatch.setattr(lock_manager, "_redo_recovery_enabled", False)
250+
monkeypatch.setattr(lock_manager, "_redo_log", redo_log)
251+
252+
result = await session_with_messages.commit_async()
253+
task_result = await _wait_for_task(result["task_id"])
254+
255+
assert task_result["status"] == "completed"
256+
redo_log.write_pending.assert_not_called()
257+
redo_log.mark_done.assert_not_called()

tests/test_config_loader.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,24 @@ def test_openviking_config_retrieval_hotness_alpha_defaults_to_zero(monkeypatch)
198198

199199
assert config.retrieval.hotness_alpha == 0.0
200200
assert config.retrieval.score_propagation_alpha == 0.5
201+
assert config.storage.transaction.redo_recovery_enabled is True
202+
203+
OpenVikingConfigSingleton.reset_instance()
204+
205+
206+
def test_openviking_config_transaction_redo_recovery_enabled_can_be_disabled(monkeypatch):
207+
monkeypatch.setenv(OPENVIKING_CONFIG_ENV, "/tmp/codex-no-config.json")
208+
209+
from openviking_cli.utils.config.open_viking_config import (
210+
OpenVikingConfig,
211+
OpenVikingConfigSingleton,
212+
)
213+
214+
config = OpenVikingConfig.from_dict(
215+
{"storage": {"transaction": {"redo_recovery_enabled": False}}}
216+
)
217+
218+
assert config.storage.transaction.redo_recovery_enabled is False
201219

202220
OpenVikingConfigSingleton.reset_instance()
203221

tests/transaction/test_lock_manager.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,3 +99,20 @@ async def test_recover_pending_redo_preserves_cancelled_error(self, lm):
9999
await lm._recover_pending_redo()
100100

101101
lm._redo_log.mark_done.assert_not_called()
102+
103+
async def test_start_skips_redo_recovery_when_disabled(self, client):
104+
lm_disabled = LockManager(
105+
agfs=client._client.service._agfs_client,
106+
lock_timeout=1.0,
107+
lock_expire=1.0,
108+
redo_recovery_enabled=False,
109+
)
110+
lm_disabled._recover_pending_redo = AsyncMock()
111+
112+
await lm_disabled.start()
113+
await asyncio.sleep(0)
114+
115+
assert lm_disabled._redo_task is None
116+
lm_disabled._recover_pending_redo.assert_not_called()
117+
118+
await lm_disabled.stop()

0 commit comments

Comments
 (0)