Stage 7: peer-review-grade significance and bootstrap statistics

LMBooth · LMBooth · commit c3e1d06a935c · 2026-05-10T10:27:31.000+01:00
Adds analysis_pipeline/stage7_significance.py and wires it into the
run_pipeline orchestrator and both YAML profiles. Stage 7 derives every
inferential statistic referenced in the paper's Statistical Significance
Procedures section directly from the ml_results_<scenario>_classic_nn.json
files emitted by Stage 6:

  * Paired Wilcoxon signed-rank (3 s overlap vs 6 s baseline) over the
    60 matched scenario x dataset x protocol best-model cells.
  * 2000-sample percentile-method nonparametric bootstrap on outer-fold
    balanced accuracies for per-cell 95 percent CIs.
  * One-sided Wilcoxon signed-rank vs the per-scenario chance level
    (1/n_classes) for each cell.
  * Vectorised 10000-iteration label-shuffle permutation test on
    held-out predicted-label vectors reconstructed from per-fold
    confusion matrices, with Bonferroni correction over the full
    120-cell grid.
  * Optional paired Wilcoxon on baseline-vs-task heart rate from the
    Stage 1 QC summary (used in the manuscript's QC sanity check).

Outputs a machine-readable significance_summary.json plus a Markdown
report. Reproduces the manuscript exactly: W=1636, p=5.55e-8, paired
n=60, mean balanced accuracy 0.324 -> 0.380, 46/14/0 cells improved/
worsened/unchanged, 27 winner-identity changes, 111/120 cells with
one-sided Wilcoxon vs chance p<0.05; with N=10000 perms the 10
scenario winners survive Bonferroni (threshold 4.17e-4).
diff --git a/analysis_pipeline/config/pipeline_unified_classic_nn_baseline_overlap3s_50pct_preproc.yaml b/analysis_pipeline/config/pipeline_unified_classic_nn_baseline_overlap3s_50pct_preproc.yaml
@@ -21,6 +21,7 @@ stages:
   stage6: true
   stage6_confusions: true
   stage6_publication_report: true
+  stage7_significance: true
 
 stage_args:
   stage0:
@@ -163,3 +164,19 @@ stage6_publication_report:
     run_manifest_json: "{reports_dir}/run_manifest.json"
     out_md: "{reports_dir}/publication_full_report.md"
     out_json: "{reports_dir}/publication_full_report.json"
+
+stage7_significance:
+  # Same Stage 7 stanza as the 6 s baseline profile, but with the roles
+  # swapped: this profile's own reports are the "overlap" side, and they
+  # are paired against the 6 s baseline output written by the other
+  # profile. Both profiles therefore reproduce the same significance
+  # numbers regardless of which configuration triggered the run.
+  args:
+    baseline_reports: "analysis_pipeline/runs/pipeline_unified_classic_nn_baseline_preproc/reports"
+    overlap_reports: "{reports_dir}"
+    baseline_qc_summary: "analysis_pipeline/runs/pipeline_unified_classic_nn_baseline_preproc/reports/qc_dataset_summary.json"
+    out_json: "{reports_dir}/significance_summary.json"
+    out_md: "{reports_dir}/significance_summary.md"
+    n_bootstrap: 2000
+    n_permutations: 10000
+    random_seed: 42
diff --git a/analysis_pipeline/config/pipeline_unified_classic_nn_baseline_preproc.yaml b/analysis_pipeline/config/pipeline_unified_classic_nn_baseline_preproc.yaml
@@ -21,6 +21,7 @@ stages:
   stage6: true
   stage6_confusions: true
   stage6_publication_report: true
+  stage7_significance: true
 
 stage_args:
   stage0:
@@ -159,3 +160,22 @@ stage6_publication_report:
     run_manifest_json: "{reports_dir}/run_manifest.json"
     out_md: "{reports_dir}/publication_full_report.md"
     out_json: "{reports_dir}/publication_full_report.json"
+
+stage7_significance:
+  # Stage 7 derives every inferential statistic referenced in the
+  # manuscript's Statistical Significance Procedures section:
+  # paired Wilcoxon (3 s overlap vs 6 s baseline), 2000-sample
+  # bootstrap CIs over outer-fold balanced accuracies, one-sided
+  # Wilcoxon vs chance, and a 10 000-iteration label-shuffle
+  # permutation test with Bonferroni correction over the full
+  # cell grid. Runs after Stage 6 and consumes its
+  # ml_results_<scenario>_classic_nn.json files.
+  args:
+    baseline_reports: "{reports_dir}"
+    overlap_reports: "analysis_pipeline/runs/pipeline_unified_classic_nn_baseline_overlap3s_50pct_preproc/reports"
+    baseline_qc_summary: "{reports_dir}/qc_dataset_summary.json"
+    out_json: "{reports_dir}/significance_summary.json"
+    out_md: "{reports_dir}/significance_summary.md"
+    n_bootstrap: 2000
+    n_permutations: 10000
+    random_seed: 42
diff --git a/analysis_pipeline/run_pipeline.py b/analysis_pipeline/run_pipeline.py
@@ -48,6 +48,7 @@ class OutputLayout:
     "stage6",
     "stage6_confusions",
     "stage6_publication_report",
+    "stage7_significance",
 ]
 
 
@@ -869,6 +870,40 @@ def _plan_pipeline(
         outputs=[str(stage6_pub_out_md), str(stage6_pub_out_json)],
     )
 
+    # ------------------------------------------------------------------
+    # Stage 7 — significance and bootstrap statistics
+    # ------------------------------------------------------------------
+    stage7_cfg = config.get("stage7_significance", {})
+    stage7_args = dict(stage7_cfg.get("args", {}))
+    if "baseline_reports" not in stage7_args:
+        stage7_args["baseline_reports"] = str(output_layout.reports_dir)
+    stage7_out_json = _resolve_output_path_or_default(
+        stage7_args.get("out_json"),
+        output_layout.reports_dir / "significance_summary.json",
+        repo_root=repo_root,
+    )
+    stage7_out_md = _resolve_output_path_or_default(
+        stage7_args.get("out_md"),
+        output_layout.reports_dir / "significance_summary.md",
+        repo_root=repo_root,
+    )
+    _setdefault_arg(stage7_args, "out_json", stage7_out_json)
+    _setdefault_arg(stage7_args, "out_md", stage7_out_md)
+    cmd_stage7 = [
+        python_exe,
+        "-m",
+        "analysis_pipeline.stage7_significance",
+    ] + _to_cli_args(stage7_args)
+    _append_stage_if_enabled(
+        planned=planned,
+        config=config,
+        stage="stage7_significance",
+        only=only,
+        name="stage7_significance",
+        command=cmd_stage7,
+        outputs=[str(stage7_out_json), str(stage7_out_md)],
+    )
+
     return planned, manifest_out
 
 
diff --git a/analysis_pipeline/stage7_significance.py b/analysis_pipeline/stage7_significance.py