Skip to content

Commit 461d353

Browse files
authored
fix(core): add busy_timeout + WAL to workspace SQLite connections (#648) (#686)
Enable WAL journaling (and an explicit 5s busy_timeout) on all core workspace SQLite connections via a centralized _open_db() helper, removing the rollback-journal reader/writer case that raises 'database is locked' immediately under parallel batch execution. Adds a public get_db_connection_by_path() accessor and routes budget.py / costs_v2.py through the shared setup. Deterministic held-lock concurrency test added. Also fixes the staging/production Deploy workflow, which had been failing the health check since #643: write AUTH_SECRET (existing repo secret) into the generated env file, fail fast on a blank/whitespace-only value, and document it. Closes #648.
1 parent bb393b7 commit 461d353

6 files changed

Lines changed: 172 additions & 11 deletions

File tree

.env.staging.example

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,12 @@
44
# Anthropic API Key (required for Lead Agent functionality)
55
ANTHROPIC_API_KEY=your-anthropic-api-key-here
66

7+
# Authentication secret (REQUIRED)
8+
# The backend hard-fails on startup if this is unset while auth is enabled
9+
# (auth is enabled by default) — see issue #643. Generate with:
10+
# openssl rand -hex 32
11+
AUTH_SECRET=your-secure-random-secret-here
12+
713
# Database Path (staging database location)
814
DATABASE_PATH=/home/frankbria/projects/codeframe/staging/.codeframe/state.db
915

.github/workflows/deploy.yml

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ jobs:
6363
REMOTE_PATH: ${{ secrets.PROJECT_PATH }}
6464
ENV_ANTHROPIC_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
6565
ENV_OPENAI_KEY: ${{ secrets.OPENAI_API_KEY }}
66+
ENV_AUTH_SECRET: ${{ secrets.AUTH_SECRET }}
6667
ENV_DATABASE_PATH: ${{ secrets.DATABASE_PATH }}
6768
ENV_API_HOST: ${{ secrets.API_HOST }}
6869
ENV_API_PORT: ${{ secrets.API_PORT }}
@@ -79,6 +80,16 @@ jobs:
7980
run: |
8081
echo "📝 Creating .env.staging file..."
8182
83+
# Fail fast if the AUTH_SECRET secret is missing/empty. The backend
84+
# hard-fails on the default secret when auth is enabled (issue #643),
85+
# which otherwise only surfaces as a health-check timeout much later.
86+
if [ -z "${ENV_AUTH_SECRET}" ] || ! printf '%s' "${ENV_AUTH_SECRET}" | grep -q '[^[:space:]]'; then
87+
echo "❌ AUTH_SECRET GitHub Actions secret is not set (or blank/whitespace-only)."
88+
echo " Set it to a secure random value, e.g.:"
89+
echo " gh secret set AUTH_SECRET --body \"\$(openssl rand -hex 32)\""
90+
exit 1
91+
fi
92+
8293
# Build env file content safely using printf (no shell interpretation)
8394
ENV_CONTENT=$(printf '%s\n' \
8495
"# CodeFRAME Environment Configuration" \
@@ -88,6 +99,11 @@ jobs:
8899
"ANTHROPIC_API_KEY=${ENV_ANTHROPIC_KEY}" \
89100
"OPENAI_API_KEY=${ENV_OPENAI_KEY}" \
90101
"" \
102+
"# Authentication" \
103+
"# Required: the backend hard-fails on the default secret when auth" \
104+
"# is enabled (the default) — see issue #643." \
105+
"AUTH_SECRET=${ENV_AUTH_SECRET}" \
106+
"" \
91107
"# Database Configuration" \
92108
"DATABASE_PATH=${ENV_DATABASE_PATH}" \
93109
"" \
@@ -277,6 +293,7 @@ jobs:
277293
REMOTE_PATH: ${{ secrets.PROJECT_PATH }}
278294
ENV_ANTHROPIC_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
279295
ENV_OPENAI_KEY: ${{ secrets.OPENAI_API_KEY }}
296+
ENV_AUTH_SECRET: ${{ secrets.AUTH_SECRET }}
280297
ENV_DATABASE_PATH: ${{ secrets.DATABASE_PATH }}
281298
ENV_API_HOST: ${{ secrets.API_HOST }}
282299
ENV_API_PORT: ${{ secrets.API_PORT }}
@@ -291,6 +308,16 @@ jobs:
291308
run: |
292309
echo "📝 Creating .env.production file..."
293310
311+
# Fail fast if the AUTH_SECRET secret is missing/empty. The backend
312+
# hard-fails on the default secret when auth is enabled (issue #643),
313+
# which otherwise only surfaces as a health-check timeout much later.
314+
if [ -z "${ENV_AUTH_SECRET}" ] || ! printf '%s' "${ENV_AUTH_SECRET}" | grep -q '[^[:space:]]'; then
315+
echo "❌ AUTH_SECRET GitHub Actions secret is not set (or blank/whitespace-only)."
316+
echo " Set it to a secure random value, e.g.:"
317+
echo " gh secret set AUTH_SECRET --body \"\$(openssl rand -hex 32)\""
318+
exit 1
319+
fi
320+
294321
# Build env file content safely using printf (no shell interpretation)
295322
ENV_CONTENT=$(printf '%s\n' \
296323
"# CodeFRAME Environment Configuration" \
@@ -300,6 +327,11 @@ jobs:
300327
"ANTHROPIC_API_KEY=${ENV_ANTHROPIC_KEY}" \
301328
"OPENAI_API_KEY=${ENV_OPENAI_KEY}" \
302329
"" \
330+
"# Authentication" \
331+
"# Required: the backend hard-fails on the default secret when auth" \
332+
"# is enabled (the default) — see issue #643." \
333+
"AUTH_SECRET=${ENV_AUTH_SECRET}" \
334+
"" \
303335
"# Database Configuration" \
304336
"DATABASE_PATH=${ENV_DATABASE_PATH}" \
305337
"" \

codeframe/adapters/e2b/budget.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
from datetime import datetime, timezone
1111
from typing import Any
1212

13+
from codeframe.core.workspace import get_db_connection
14+
1315

1416
def record_cloud_run(
1517
workspace: Any,
@@ -32,7 +34,7 @@ def record_cloud_run(
3234
scan_blocked: Number of files blocked by credential scanner.
3335
"""
3436
created_at = datetime.now(timezone.utc).isoformat()
35-
conn = sqlite3.connect(workspace.db_path)
37+
conn = get_db_connection(workspace)
3638
try:
3739
conn.execute(
3840
"""
@@ -60,7 +62,7 @@ def get_cloud_run(workspace: Any, run_id: str) -> dict | None:
6062
Returns:
6163
Dict with cloud run fields, or None if not found.
6264
"""
63-
conn = sqlite3.connect(workspace.db_path)
65+
conn = get_db_connection(workspace)
6466
conn.row_factory = sqlite3.Row
6567
try:
6668
row = conn.execute(

codeframe/core/workspace.py

Lines changed: 41 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,31 @@ def _get_state_dir(repo_path: Path) -> Path:
5959
return repo_path / CODEFRAME_DIR
6060

6161

62+
def _open_db(db_path: str | Path) -> sqlite3.Connection:
63+
"""Open a workspace SQLite connection with concurrency safeguards.
64+
65+
Mirrors ``codeframe/platform_store/database.py``. The substantive change is
66+
enabling **WAL journaling**: readers no longer block writers, which removes
67+
the rollback-journal case where a writer hits ``database is locked``
68+
immediately (the busy handler is skipped for that reader/writer deadlock).
69+
WAL is a persistent, database-level setting, so applying it on every
70+
connection is idempotent. ``busy_timeout`` is set to 5000ms to match
71+
platform_store and make the value explicit (Python's ``sqlite3.connect``
72+
already defaults to a 5s timeout). Matters under parallel batch execution
73+
where multiple processes and background agent threads write the same DB.
74+
75+
The caller is responsible for closing the connection.
76+
"""
77+
# NOTE: pass ``db_path`` through unchanged (sqlite3.connect accepts both str
78+
# and PathLike). Do NOT wrap in ``str()`` — that would coerce a non-path
79+
# (e.g. a test's MagicMock) into a literal filename and silently create a
80+
# junk DB file instead of raising, diverging from the prior connect call.
81+
conn = sqlite3.connect(db_path)
82+
conn.execute("PRAGMA journal_mode = WAL")
83+
conn.execute("PRAGMA busy_timeout = 5000")
84+
return conn
85+
86+
6287
def _init_database(db_path: Path) -> None:
6388
"""Initialize the workspace SQLite database with v2 schema.
6489
@@ -70,7 +95,7 @@ def _init_database(db_path: Path) -> None:
7095
- blockers: Human-in-the-loop blockers
7196
- checkpoints: State snapshots
7297
"""
73-
conn = sqlite3.connect(db_path)
98+
conn = _open_db(db_path)
7499
cursor = conn.cursor()
75100

76101
# Workspace metadata
@@ -408,7 +433,7 @@ def _ensure_schema_upgrades(db_path: Path) -> None:
408433
This function is idempotent and adds any new tables/columns
409434
that were added after the initial schema creation.
410435
"""
411-
conn = sqlite3.connect(db_path)
436+
conn = _open_db(db_path)
412437
cursor = conn.cursor()
413438

414439
# Check if batch_runs table exists, if not create it
@@ -767,7 +792,7 @@ def create_or_load_workspace(repo_path: Path, tech_stack: Optional[str] = None)
767792
workspace_id = str(uuid.uuid4())
768793
now = _utc_now().isoformat()
769794

770-
conn = sqlite3.connect(db_path)
795+
conn = _open_db(db_path)
771796
cursor = conn.cursor()
772797
cursor.execute(
773798
"INSERT INTO workspace (id, repo_path, tech_stack, created_at, updated_at) VALUES (?, ?, ?, ?, ?)",
@@ -807,7 +832,7 @@ def get_workspace(repo_path: Path) -> Workspace:
807832
# Ensure schema is up to date for existing workspaces
808833
_ensure_schema_upgrades(db_path)
809834

810-
conn = sqlite3.connect(db_path)
835+
conn = _open_db(db_path)
811836
cursor = conn.cursor()
812837
cursor.execute("SELECT id, repo_path, tech_stack, created_at FROM workspace LIMIT 1")
813838
row = cursor.fetchone()
@@ -836,7 +861,17 @@ def get_db_connection(workspace: Workspace) -> sqlite3.Connection:
836861
Returns:
837862
SQLite connection
838863
"""
839-
return sqlite3.connect(workspace.db_path)
864+
return _open_db(workspace.db_path)
865+
866+
867+
def get_db_connection_by_path(db_path: str | Path) -> sqlite3.Connection:
868+
"""Open a workspace DB connection from a raw path (WAL + busy_timeout).
869+
870+
Same connection setup as :func:`get_db_connection`, for callers that hold a
871+
path rather than a :class:`Workspace` (e.g. the costs router's helpers that
872+
tolerate fresh/locked DBs). The caller is responsible for closing it.
873+
"""
874+
return _open_db(db_path)
840875

841876

842877
def workspace_exists(repo_path: Path) -> bool:
@@ -875,7 +910,7 @@ def update_workspace_tech_stack(repo_path: Path, tech_stack: Optional[str]) -> W
875910

876911
now = _utc_now().isoformat()
877912

878-
conn = sqlite3.connect(db_path)
913+
conn = _open_db(db_path)
879914
cursor = conn.cursor()
880915
cursor.execute(
881916
"UPDATE workspace SET tech_stack = ?, updated_at = ?",

codeframe/ui/routers/costs_v2.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
from pydantic import BaseModel
2424

2525
from codeframe.core import tasks as tasks_module
26-
from codeframe.core.workspace import Workspace
26+
from codeframe.core.workspace import Workspace, get_db_connection_by_path
2727
from codeframe.lib.rate_limiter import rate_limit_standard
2828
from codeframe.platform_store.repositories.token_repository import TokenRepository
2929
from codeframe.ui.dependencies import get_v2_workspace
@@ -79,7 +79,7 @@ def _query_costs(db_path: str, days: int) -> Dict:
7979
Remove this workaround once the two schemas converge.
8080
"""
8181
try:
82-
conn = sqlite3.connect(db_path)
82+
conn = get_db_connection_by_path(db_path)
8383
conn.row_factory = sqlite3.Row
8484
except sqlite3.Error as e:
8585
logger.warning("costs: failed to open %s: %s", db_path, e)
@@ -184,7 +184,7 @@ def _open_workspace_conn(db_path: str) -> Optional[sqlite3.Connection]:
184184
fall back to an empty response rather than 500'ing the dashboard.
185185
"""
186186
try:
187-
conn = sqlite3.connect(db_path)
187+
conn = get_db_connection_by_path(db_path)
188188
conn.row_factory = sqlite3.Row
189189
return conn
190190
except sqlite3.Error as e:

tests/core/test_workspace.py

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@
1515
STATE_DB_NAME,
1616
)
1717

18+
# Per CLAUDE.md, new v2 tests carry the marker explicitly (tests/core/ is also
19+
# auto-marked v2 by conftest, but the convention makes the intent self-evident).
20+
pytestmark = pytest.mark.v2
21+
1822

1923
@pytest.fixture
2024
def temp_repo(tmp_path: Path) -> Path:
@@ -175,3 +179,85 @@ def test_returns_correct_path(self, initialized_workspace: Workspace, temp_repo:
175179

176180
def test_path_exists(self, initialized_workspace: Workspace):
177181
assert initialized_workspace.db_path.exists()
182+
183+
184+
class TestConcurrentWriters:
185+
"""Concurrency guards on workspace connections (issue #648)."""
186+
187+
def test_connection_sets_wal_and_busy_timeout(self, initialized_workspace: Workspace):
188+
"""get_db_connection should enable WAL journaling and a busy timeout."""
189+
conn = get_db_connection(initialized_workspace)
190+
try:
191+
journal_mode = conn.execute("PRAGMA journal_mode").fetchone()[0]
192+
busy_timeout = conn.execute("PRAGMA busy_timeout").fetchone()[0]
193+
finally:
194+
conn.close()
195+
196+
assert journal_mode.lower() == "wal"
197+
assert busy_timeout >= 5000
198+
199+
def test_writer_waits_for_held_lock_instead_of_failing(
200+
self, initialized_workspace: Workspace
201+
):
202+
"""A second writer should *wait* for a briefly-held write lock and then
203+
succeed — not fail immediately with ``database is locked``.
204+
205+
This is deterministic by design: one thread holds the write lock
206+
(``BEGIN IMMEDIATE``) for well under the busy timeout, and the second
207+
writer must block until the holder commits. A high-concurrency stress
208+
test was avoided on purpose — SQLite's busy handler is not fair, so
209+
N aggressively-contending writers can starve one past any fixed timeout,
210+
which would make the test flaky without telling us anything new.
211+
"""
212+
import threading
213+
import time
214+
215+
ws = initialized_workspace
216+
217+
# Dedicated probe table so the test is independent of the v2 schema.
218+
setup = get_db_connection(ws)
219+
try:
220+
setup.execute(
221+
"CREATE TABLE lock_probe (id INTEGER PRIMARY KEY, label TEXT)"
222+
)
223+
setup.commit()
224+
finally:
225+
setup.close()
226+
227+
lock_acquired = threading.Event()
228+
hold_seconds = 0.5 # comfortably under the 5s busy timeout
229+
230+
def hold_write_lock() -> None:
231+
conn = get_db_connection(ws)
232+
conn.isolation_level = None # take manual control of the transaction
233+
try:
234+
conn.execute("BEGIN IMMEDIATE")
235+
conn.execute("INSERT INTO lock_probe (id, label) VALUES (1, 'holder')")
236+
lock_acquired.set()
237+
time.sleep(hold_seconds)
238+
conn.execute("COMMIT")
239+
finally:
240+
conn.close()
241+
242+
holder = threading.Thread(target=hold_write_lock)
243+
holder.start()
244+
assert lock_acquired.wait(timeout=5), "holder never acquired the write lock"
245+
246+
# The lock is held now. Without the busy timeout this write would raise
247+
# ``database is locked`` immediately; with it, the write blocks until the
248+
# holder commits (~hold_seconds) and then succeeds.
249+
conn2 = get_db_connection(ws)
250+
try:
251+
conn2.execute("INSERT INTO lock_probe (id, label) VALUES (2, 'waiter')")
252+
conn2.commit()
253+
finally:
254+
conn2.close()
255+
holder.join(timeout=5)
256+
257+
# Both writes landed — the waiter was not dropped.
258+
verify = get_db_connection(ws)
259+
try:
260+
count = verify.execute("SELECT COUNT(*) FROM lock_probe").fetchone()[0]
261+
finally:
262+
verify.close()
263+
assert count == 2

0 commit comments

Comments
 (0)