diff --git a/packages/razorback-plugin-dab/src/razorback_plugin_dab/cli.py b/packages/razorback-plugin-dab/src/razorback_plugin_dab/cli.py index 961fd76..d66c2cf 100644 --- a/packages/razorback-plugin-dab/src/razorback_plugin_dab/cli.py +++ b/packages/razorback-plugin-dab/src/razorback_plugin_dab/cli.py @@ -63,9 +63,11 @@ def generate( help="Dataset materialization mode: bind (default — bind-mount dumps from data_root) or copy (per-task workdir copy).", ), postgres_volume_mode: str = typer.Option( - "reuse", + "fresh", "--postgres-volume-mode", - help="postgres data volume strategy: reuse (default — dataset-keyed shared volume) or fresh (per-task unique volume).", + help="postgres data volume strategy: fresh (default — per-task unique volume, " + "concurrency-safe) or reuse (dataset-keyed shared volume; NOT safe under " + "concurrency.trials>1 — two same-dataset cells collide on one writable PGDATA).", ), query_mode: str = typer.Option( "per-query", diff --git a/packages/razorback-plugin-dab/src/razorback_plugin_dab/generate/prepare.py b/packages/razorback-plugin-dab/src/razorback_plugin_dab/generate/prepare.py index f71a174..611ff3c 100644 --- a/packages/razorback-plugin-dab/src/razorback_plugin_dab/generate/prepare.py +++ b/packages/razorback-plugin-dab/src/razorback_plugin_dab/generate/prepare.py @@ -65,7 +65,7 @@ def prepare_dataset_tasks( docker_image: str = DEFAULT_AGENT_IMAGE, container_workdir: str = DEFAULT_CONTAINER_WORKDIR, materialize_mode: str = "bind", - postgres_volume_mode: str = "reuse", + postgres_volume_mode: str = "fresh", query_mode: str = "per-query", ) -> list[TaskManifestEntry]: """Materialize one harbor task dir per query under tasks_root/-q/. @@ -80,10 +80,21 @@ def prepare_dataset_tasks( the other dataset payload. Preserved for provenance-strict runs. postgres_volume_mode: - "reuse" (default) — postgres data volume is keyed on (dataset, - schema_version) so init.d runs ONCE per dataset across variants - and trials. - "fresh" — per-task unique volume; init.d runs every trial. + "fresh" (default) — per-task unique postgres data volume (keyed on + task_id). CONCURRENCY-SAFE: under concurrency.trials>1, two cells + of the same dataset must NOT share one writable PGDATA dir — + postgres locks the data dir, so the second container can't start, + goes unhealthy, and the cell errors. Per-task volumes also avoid + stale cross-run data (the volume does not survive across runs as a + dataset-keyed one would). Cost: init.d (DB restore) runs per cell + instead of once per dataset; DAB postgres dumps are small + (≤8.5MB / ~90k lines for crmarenapro, the worst concurrent case; + PATENTS is 129MB but only 3 queries) and restore well within the + healthcheck budget. + "reuse" — postgres data volume keyed on (dataset, schema_version) so + init.d runs ONCE per dataset. Restore-once optimization, but NOT + concurrency-safe (see above). Only safe for serial single-trial + runs. """ if materialize_mode not in ("bind", "copy"): raise ValueError( diff --git a/packages/razorback-plugin-dab/tests/unit/test_compose_dataset_volume.py b/packages/razorback-plugin-dab/tests/unit/test_compose_dataset_volume.py index 9132146..82d12ab 100644 --- a/packages/razorback-plugin-dab/tests/unit/test_compose_dataset_volume.py +++ b/packages/razorback-plugin-dab/tests/unit/test_compose_dataset_volume.py @@ -7,6 +7,7 @@ import yaml from razorback_plugin_dab.generate.compose import generate_compose +from razorback_plugin_dab.generate.prepare import prepare_dataset_tasks def _bookreview_cfg(data_root: Path) -> dict: @@ -122,6 +123,67 @@ def test_reuse_mode_default_no_per_task_suffix(tmp_path: Path): assert vol_names == ["dab-postgres-data-bookreview-v1"] +def _two_query_postgres_data_root(root: Path) -> Path: + """Bookreview-shaped data root with TWO postgres queries of one dataset. + + Used to assert the concurrency-safety invariant: under the operator + default, two cells of the SAME dataset get NON-colliding postgres data + volume names so a trials:2 run never mounts one writable PGDATA twice. + """ + data_root = root / "data" + qdir = data_root / "query_bookreview" + qdir.mkdir(parents=True) + (qdir / "db_config.yaml").write_text(yaml.safe_dump({ + "db_clients": { + "books_database": { + "db_type": "postgres", + "db_name": "bookreview_db", + "sql_file": "query_dataset/books_info.sql", + }, + } + })) + (qdir / "db_description.txt").write_text("Bookreview schema description.") + qd = qdir / "query_dataset" + qd.mkdir() + (qd / "books_info.sql").write_text("CREATE TABLE books (id INT);\n") + for qid in (1, 2): + q = qdir / f"query{qid}" + q.mkdir() + (q / "query.json").write_text('{"question": "how many?"}') + return data_root + + +def _pg_data_volume_name(task_dir: Path) -> str: + compose = yaml.safe_load( + (task_dir / "environment" / "docker-compose.yaml").read_text() + ) + pg_volumes = compose["services"]["dab-postgres"]["volumes"] + pgdata = [v for v in pg_volumes if "/var/lib/postgresql/data" in v] + return pgdata[0].split(":", 1)[0] + + +def test_same_dataset_tasks_get_distinct_postgres_volumes(tmp_path: Path): + """Concurrency-safety invariant (the dab0015 corruption bug): two cells of + the SAME dataset, materialized with the operator default, must NOT key on + the same writable postgres data volume. Under trials:2 they start + concurrently and two containers mounting one PGDATA dir collide + (postgres locks the dir -> second container unhealthy -> cell errors).""" + data_root = _two_query_postgres_data_root(tmp_path) + out = tmp_path / "tasks" + manifest = prepare_dataset_tasks( + data_root=data_root, + dataset="bookreview", + tasks_root=out, + workspace_variant="direct-minimal", + materialize_mode="copy", + ) + assert len(manifest) == 2 + vol_names = {_pg_data_volume_name(e["task_dir"]) for e in manifest} + assert len(vol_names) == 2, ( + f"two queries of one dataset must get distinct PGDATA volumes; got {vol_names}" + ) + + def test_postgres_volume_name_lowercased_for_caps_datasets(tmp_path: Path): """Docker volume names should be case-stable; the catalog names like PANCANCER_ATLAS must lowercase to a valid volume name."""