sourcegraph
diff --git a/‎README.md‎
Lines changed: 5 additions & 0 deletions b/‎README.md‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎docs/OFFICIAL_RESULTS_BROWSER.md‎
Lines changed: 4 additions & 0 deletions b/‎docs/OFFICIAL_RESULTS_BROWSER.md‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎docs/ops/SCRIPT_INDEX.md‎
Lines changed: 1 addition & 0 deletions b/‎docs/ops/SCRIPT_INDEX.md‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/reference/RESULT_DIRECTORY_SPEC.md‎
Lines changed: 18 additions & 6 deletions b/‎docs/reference/RESULT_DIRECTORY_SPEC.md‎
Lines changed: 18 additions & 6 deletions
diff --git a/‎scripts/generate_manifest.py‎
Lines changed: 4 additions & 2 deletions b/‎scripts/generate_manifest.py‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎scripts/official_runs.py‎
Lines changed: 19 additions & 3 deletions b/‎scripts/official_runs.py‎
Lines changed: 19 additions & 3 deletions
@@ -225,6 +225,11 @@ Each suite directory contains per-task subdirectories with `instruction.md`, `ta
 The `scripts/` directory contains a stdlib-only Python 3.10+ pipeline for extracting deterministic metrics from Harbor run output.
 Use `runs/analysis` for active analysis runs (and `runs/official` when producing publishable exports):
 
+Official runs layout note:
+- Raw source-of-truth run dirs now live under `runs/official/_raw/`.
+- Top-level `runs/official/` is kept clean for organized benchmark/model views (`csb_sdlc/`, `csb_org/`) plus `MANIFEST.json`.
+- Core scripts (manifest generation, promotion, organizer) resolve `_raw` automatically.
+
 ```bash
 # Generate evaluation report from analysis runs
 python3 scripts/generate_eval_report.py \
 
@@ -46,6 +46,10 @@ python3 scripts/export_official_results.py \
   --output-dir ./docs/official_results/
 ```
 
+Note: `runs/official/` uses a split layout (`_raw/` for raw runs, organized
+views at the top level). Export tooling handles this automatically; keep passing
+`--runs-dir ./runs/official/` unless you intentionally want a custom root.
+
 If you promote runs with:
 
 ```bash
 
@@ -212,6 +212,7 @@ Generated from `scripts/registry.json` by `scripts/generate_script_index.py`.
 - `scripts/plot_csb_mcp_blog_figures.py` - Utility script for plot csb mcp blog figures.
 - `scripts/prepare_analysis_runs.py` - Utility script for prepare analysis runs.
 - `scripts/promote_agent_oracles.py` - Utility script for promote agent oracles.
+- `scripts/promote_blocked.py` - Utility script for promote blocked.
 - `scripts/push_base_images_ghcr.sh` - Utility script for push base images ghcr.
 - `scripts/regenerate_artifact_dockerfiles.py` - Utility script for regenerate artifact dockerfiles.
 - `scripts/rehost_sweap_images.py` - Utility script for rehost sweap images.
 
@@ -14,13 +14,22 @@
 
 ## Directory Layouts
 
-`runs/official/` contains batches with **three different directory structures**.
+`runs/official/` now has a split layout:
+
+- Raw run data (source of truth): `runs/official/_raw/`
+- Organized symlink views: `runs/official/csb_sdlc/`, `runs/official/csb_org/`, etc.
+- Canonical manifest: `runs/official/MANIFEST.json`
+
+Any scanner that reads run artifacts MUST scan `runs/official/_raw/` (or use
+`scripts/official_runs.py:raw_runs_dir(...)`), not top-level `runs/official/`.
+
+Inside the raw root, batches use **three different directory structures**.
 Any scanner MUST handle all three or it will under-count results.
 
 ### Layout 1: Old Promoted Format (pre-2026-02-24)
 
 ```
-runs/official/{suite}_{model}_{date}/
+runs/official/_raw/{suite}_{model}_{date}/
   baseline/
     {suite}_{task_id}_{config_name}/       ← wrapper dir
       {trial_dirname}/                     ← e.g. sgonly_task-name__AbCdEfG
@@ -40,7 +49,7 @@ Example (historical, from pre-split `csb_sdlc_build` runs): `csb_sdlc_build_haik
 ### Layout 2: Harbor Nested Format (2026-02-24+)
 
 ```
-runs/official/{suite}_{model}_{timestamp}/
+runs/official/_raw/{suite}_{model}_{timestamp}/
   baseline-local-direct/
     {harbor_timestamp}/                    ← e.g. 2026-02-26__00-09-23
       {task_dirname}/                      ← e.g. task-name__AbCdEfG
@@ -57,7 +66,7 @@ runs/official/{suite}_{model}_{timestamp}/
 ### Layout 3: CodeScaleBench-Org / Artifact Format
 
 ```
-runs/official/{suite}_{model}_{timestamp}/
+runs/official/_raw/{suite}_{model}_{timestamp}/
   baseline-local-direct/                   (or baseline-local-artifact)
     {harbor_timestamp}/
       bl_{TASK_ID}_{hash}__hash/           ← bl_ prefix, uppercase task ID
@@ -191,15 +200,17 @@ def extract_task_id_from_result(data: dict, parent_dir: str, suites: set[str]) -
 from pathlib import Path
 
 # Use rglob to find ALL result.json at any depth
-for rj in Path('runs/official').rglob('result.json'):
+from official_runs import raw_runs_dir
+raw_root = raw_runs_dir(Path('runs/official'))
+for rj in raw_root.rglob('result.json'):
     data = json.loads(rj.read_text())
 
     # 1. Skip batch-level results
     if 'task_name' not in data:
         continue
 
     # 2. Determine config from PATH COMPONENTS (not from result content)
-    parts = rj.relative_to(official).parts
+    parts = rj.relative_to(raw_root).parts
     is_baseline = any(p in BL_NAMES for p in parts)
     is_mcp = any(p in MCP_NAMES for p in parts)
 
@@ -217,6 +228,7 @@ for rj in Path('runs/official').rglob('result.json'):
 
 | Mistake | Consequence |
 |---|---|
+| Scanning top-level `runs/official/` instead of `_raw` | Mixes in organized symlink views and non-run artifacts |
 | Only checking 2-3 levels deep | Misses Layout 1 (old promoted, 4 levels deep) |
 | Using `task_id` field without checking if it's a dict | Crash or empty string |
 | Not stripping `sgonly_` prefix from `task_name` | No match against selection file |
 
@@ -17,8 +17,10 @@
 PROJECT_ROOT = Path(__file__).resolve().parent.parent
 sys.path.insert(0, str(PROJECT_ROOT / "scripts"))
 from config_utils import discover_configs
+from official_runs import raw_runs_dir
 
-RUNS_DIR = PROJECT_ROOT / "runs" / "official"
+RUNS_DIR = raw_runs_dir(PROJECT_ROOT / "runs" / "official")
+MANIFEST_OUTPUT_PATH = PROJECT_ROOT / "runs" / "official" / "MANIFEST.json"
 
 # Directories to skip entirely.
 # __v1_hinted: old run dirs from before enterprise task de-hinting (US-001..US-003).
@@ -938,7 +940,7 @@ def main():
         "run_history": run_history_section,
     }
 
-    output_path = RUNS_DIR / "MANIFEST.json"
+    output_path = MANIFEST_OUTPUT_PATH
     with open(output_path, "w") as f:
         json.dump(manifest, f, indent=2)
 
 
@@ -10,6 +10,7 @@
 DEFAULT_PREFIX_MAP_PATH = Path("configs/run_dir_prefix_map.json")
 TRIAGE_FILENAME = "triage.json"
 TRIAGE_DECISIONS = {"include", "exclude", "pending"}
+RAW_DIRNAME = "_raw"
 
 
 def should_skip(dirname: str) -> bool:
@@ -33,13 +34,29 @@ def detect_suite(run_dir_name: str, prefix_map: dict[str, str]) -> str | None:
     return None
 
 
+def raw_runs_dir(runs_dir: Path) -> Path:
+    """Return the directory that contains raw official run dirs.
+
+    Returns runs_dir itself when it is already an existing `_raw` directory,
+    else runs_dir/_raw when that exists (new layout, preferred), else
+    runs_dir unchanged (legacy layout; the path is not required to exist).
+    """
+    if runs_dir.name == RAW_DIRNAME and runs_dir.is_dir():
+        return runs_dir
+    candidate = runs_dir / RAW_DIRNAME
+    if candidate.is_dir():
+        return candidate
+    return runs_dir
+
+
 def top_level_run_dirs(runs_dir: Path) -> list[Path]:
-    if not runs_dir.is_dir():
+    raw_dir = raw_runs_dir(runs_dir)
+    if not raw_dir.is_dir():
         return []
     return sorted(
         [
             p
-            for p in runs_dir.iterdir()
+            for p in raw_dir.iterdir()
             if p.is_dir() and not should_skip(p.name)
         ],
         key=lambda p: p.name,
@@ -90,4 +107,3 @@ def read_triage(run_dir: Path) -> tuple[dict | None, str | None]:
         if not triage.get(field):
             return triage, f"missing_{field}"
     return triage, None
-