Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions packages/razorback-plugin-dab/src/razorback_plugin_dab/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,9 +63,11 @@ def generate(
help="Dataset materialization mode: bind (default — bind-mount dumps from data_root) or copy (per-task workdir copy).",
),
postgres_volume_mode: str = typer.Option(
"reuse",
"fresh",
"--postgres-volume-mode",
help="postgres data volume strategy: reuse (default — dataset-keyed shared volume) or fresh (per-task unique volume).",
help="postgres data volume strategy: fresh (default — per-task unique volume, "
"concurrency-safe) or reuse (dataset-keyed shared volume; NOT safe under "
"concurrency.trials>1 — two same-dataset cells collide on one writable PGDATA).",
),
query_mode: str = typer.Option(
"per-query",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def prepare_dataset_tasks(
docker_image: str = DEFAULT_AGENT_IMAGE,
container_workdir: str = DEFAULT_CONTAINER_WORKDIR,
materialize_mode: str = "bind",
postgres_volume_mode: str = "reuse",
postgres_volume_mode: str = "fresh",
query_mode: str = "per-query",
) -> list[TaskManifestEntry]:
"""Materialize one harbor task dir per query under tasks_root/<dataset>-q<n>/.
Expand All @@ -80,10 +80,21 @@ def prepare_dataset_tasks(
the other dataset payload. Preserved for provenance-strict runs.

postgres_volume_mode:
"reuse" (default) — postgres data volume is keyed on (dataset,
schema_version) so init.d runs ONCE per dataset across variants
and trials.
"fresh" — per-task unique volume; init.d runs every trial.
"fresh" (default) — per-task unique postgres data volume (keyed on
task_id). CONCURRENCY-SAFE: under concurrency.trials>1, two cells
of the same dataset must NOT share one writable PGDATA dir —
postgres locks the data dir, so the second container can't start,
goes unhealthy, and the cell errors. Per-task volumes also avoid
stale cross-run data (the volume does not survive across runs as a
dataset-keyed one would). Cost: init.d (DB restore) runs per cell
instead of once per dataset; DAB postgres dumps are small
(≤8.5MB / ~90k lines for crmarenapro, the worst concurrent case;
Comment on lines +87 to +91
PATENTS is 129MB but only 3 queries) and restore well within the
healthcheck budget.
"reuse" — postgres data volume keyed on (dataset, schema_version) so
init.d runs ONCE per dataset. Restore-once optimization, but NOT
concurrency-safe (see above). Only safe for serial single-trial
runs.
"""
if materialize_mode not in ("bind", "copy"):
raise ValueError(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import yaml

from razorback_plugin_dab.generate.compose import generate_compose
from razorback_plugin_dab.generate.prepare import prepare_dataset_tasks


def _bookreview_cfg(data_root: Path) -> dict:
Expand Down Expand Up @@ -122,6 +123,67 @@ def test_reuse_mode_default_no_per_task_suffix(tmp_path: Path):
assert vol_names == ["dab-postgres-data-bookreview-v1"]


def _two_query_postgres_data_root(root: Path) -> Path:
"""Bookreview-shaped data root with TWO postgres queries of one dataset.

Used to assert the concurrency-safety invariant: under the operator
default, two cells of the SAME dataset get NON-colliding postgres data
volume names so a trials:2 run never mounts one writable PGDATA twice.
"""
data_root = root / "data"
qdir = data_root / "query_bookreview"
qdir.mkdir(parents=True)
(qdir / "db_config.yaml").write_text(yaml.safe_dump({
"db_clients": {
"books_database": {
"db_type": "postgres",
"db_name": "bookreview_db",
"sql_file": "query_dataset/books_info.sql",
},
}
}))
(qdir / "db_description.txt").write_text("Bookreview schema description.")
qd = qdir / "query_dataset"
qd.mkdir()
(qd / "books_info.sql").write_text("CREATE TABLE books (id INT);\n")
for qid in (1, 2):
q = qdir / f"query{qid}"
q.mkdir()
(q / "query.json").write_text('{"question": "how many?"}')
return data_root


def _pg_data_volume_name(task_dir: Path) -> str:
compose = yaml.safe_load(
(task_dir / "environment" / "docker-compose.yaml").read_text()
)
pg_volumes = compose["services"]["dab-postgres"]["volumes"]
pgdata = [v for v in pg_volumes if "/var/lib/postgresql/data" in v]
return pgdata[0].split(":", 1)[0]


def test_same_dataset_tasks_get_distinct_postgres_volumes(tmp_path: Path):
"""Concurrency-safety invariant (the dab0015 corruption bug): two cells of
the SAME dataset, materialized with the operator default, must NOT key on
the same writable postgres data volume. Under trials:2 they start
concurrently and two containers mounting one PGDATA dir collide
(postgres locks the dir -> second container unhealthy -> cell errors)."""
data_root = _two_query_postgres_data_root(tmp_path)
out = tmp_path / "tasks"
manifest = prepare_dataset_tasks(
data_root=data_root,
dataset="bookreview",
tasks_root=out,
workspace_variant="direct-minimal",
materialize_mode="copy",
)
assert len(manifest) == 2
vol_names = {_pg_data_volume_name(e["task_dir"]) for e in manifest}
assert len(vol_names) == 2, (
f"two queries of one dataset must get distinct PGDATA volumes; got {vol_names}"
)


def test_postgres_volume_name_lowercased_for_caps_datasets(tmp_path: Path):
"""Docker volume names should be case-stable; the catalog names like
PANCANCER_ATLAS must lowercase to a valid volume name."""
Expand Down