Azure
diff --git a/‎benchmarks/2026-03-31-11-16-46.html‎
Lines changed: 0 additions & 484 deletions b/‎benchmarks/2026-03-31-11-16-46.html‎
Lines changed: 0 additions & 484 deletions
diff --git a/‎benchmarks/2026-04-08-14-40-57.html‎
Lines changed: 664 additions & 0 deletions b/‎benchmarks/2026-04-08-14-40-57.html‎
Lines changed: 664 additions & 0 deletions
diff --git a/‎benchmarks/2026-04-08_Benchmark_Report.pdf‎
1.09 MB b/‎benchmarks/2026-04-08_Benchmark_Report.pdf‎
1.09 MB
diff --git a/‎benchmarks/INSTRUCTIONS.md‎
Lines changed: 11 additions & 66 deletions b/‎benchmarks/INSTRUCTIONS.md‎
Lines changed: 11 additions & 66 deletions
diff --git a/‎benchmarks/extract.py‎
Lines changed: 77 additions & 18 deletions b/‎benchmarks/extract.py‎
Lines changed: 77 additions & 18 deletions
diff --git a/‎benchmarks/overall.html‎
Lines changed: 50 additions & 0 deletions b/‎benchmarks/overall.html‎
Lines changed: 50 additions & 0 deletions
@@ -42,74 +42,19 @@ The raw AI response is also available in the log (`"Stage N response"` → `cont
 
 Content boundaries: each multi-line value starts after `=` on the marker line and continues until the next line matching `^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} \|` (a timestamp-prefixed log entry).
 
-### Extraction Script Template
-
-```python
-#!/usr/bin/env python3
-"""Extract stage prompts and responses from debug log."""
-import re, os, sys
-
-LOG = sys.argv[1]  # Path to debug log
-OUT = sys.argv[2] if len(sys.argv) > 2 else "COMPARE"
-TIMESTAMP_RE = re.compile(r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} \|")
-
-with open(LOG, "r", encoding="utf-8", errors="replace") as f:
-    lines = f.readlines()
-
-def find_line(pattern, start=0):
-    for i in range(start, len(lines)):
-        if pattern in lines[i]:
-            return i
-    return -1
-
-def extract_content(start_line, prefix):
-    first_line = lines[start_line]
-    idx = first_line.find(prefix + "=")
-    if idx == -1:
-        return ""
-    parts = [first_line[idx + len(prefix) + 1:]]
-    for i in range(start_line + 1, len(lines)):
-        if TIMESTAMP_RE.match(lines[i]):
-            break
-        parts.append(lines[i])
-    return "".join(parts)
-
-os.makedirs(OUT, exist_ok=True)
-for stage_num in range(1, 50):
-    prompt_line = find_line(f"Stage {stage_num} task prompt")
-    if prompt_line == -1:
-        break
-    # Extract post-transform output (final quality after governance transforms)
-    transform_line = find_line(f"Stage {stage_num} post-transform", prompt_line)
-    task_full_line = next((i for i in range(prompt_line, min(prompt_line+10, len(lines)))
-                          if "task_full=" in lines[i]), -1)
-    transformed_full_line = -1
-    if transform_line != -1:
-        transformed_full_line = next((i for i in range(transform_line, min(transform_line+10, len(lines)))
-                                     if "transformed_full=" in lines[i]), -1)
-    # Fallback to raw response if no post-transform entry (e.g., no transforms applied)
-    if transformed_full_line == -1:
-        response_line = find_line(f"Stage {stage_num} response", prompt_line)
-        if response_line != -1:
-            transformed_full_line = next((i for i in range(response_line, min(response_line+10, len(lines)))
-                                         if "content_full=" in lines[i]), -1)
-            content_key = "content_full"
-        else:
-            continue
-    else:
-        content_key = "transformed_full"
-    if task_full_line == -1 or transformed_full_line == -1:
-        continue
-    prompt = extract_content(task_full_line, "task_full")
-    response = extract_content(transformed_full_line, content_key)
-    with open(os.path.join(OUT, f"INPUT_{stage_num}.md"), "w") as f:
-        f.write(prompt)
-    with open(os.path.join(OUT, f"CP_RESPONSE_{stage_num}.md"), "w") as f:
-        f.write(response)
-    print(f"Stage {stage_num}: INPUT={len(prompt)}B  CP_RESPONSE={len(response)}B (source: {content_key})")
+### Extraction Script
+
+Use `benchmarks/extract.py` to extract from the debug log:
+
+```bash
+python3 benchmarks/extract.py debug_20260408144057.log COMPARE
 ```
 
-Usage: `python3 extract.py debug_20260328024351.log COMPARE`
+The script handles **full stage retries**: when a stage has multiple `task prompt`
+entries (from the retry loop), it uses the **last** attempt's input and the final post-transform
+output, or the raw response if there is none. Retried stages are marked with `[RETRY]` in the console output.
+
+See `benchmarks/extract.py` for the full implementation.
 
 ---
 
 
@@ -1,6 +1,14 @@
 #!/usr/bin/env python3
-"""Extract stage prompts and responses from debug log."""
-import re, os, sys
+"""Extract stage prompts and responses from debug log.
+
+When a stage has a full retry (a second ``task prompt`` entry), the script uses
+the **last** task prompt and the final post-transform output for that stage.
+This ensures benchmarks measure the retry attempt's input — the one that
+includes prior QA findings — rather than the original attempt's input.
+"""
+import os
+import re
+import sys
 
 LOG = sys.argv[1]  # Path to debug log
 OUT = sys.argv[2] if len(sys.argv) > 2 else "COMPARE"
@@ -9,54 +17,105 @@
 with open(LOG, "r", encoding="utf-8", errors="replace") as f:
     lines = f.readlines()
 
-def find_line(pattern, start=0):
+
+def find_all_lines(pattern: str) -> list[int]:
+    """Return line indices of ALL occurrences of *pattern*."""
+    return [i for i, line in enumerate(lines) if pattern in line]
+
+
+def find_line(pattern: str, start: int = 0) -> int:
     for i in range(start, len(lines)):
         if pattern in lines[i]:
             return i
     return -1
 
-def extract_content(start_line, prefix):
+
+def extract_content(start_line: int, prefix: str) -> str:
     first_line = lines[start_line]
     idx = first_line.find(prefix + "=")
     if idx == -1:
         return ""
-    parts = [first_line[idx + len(prefix) + 1:]]
+    parts = [first_line[idx + len(prefix) + 1 :]]
     for i in range(start_line + 1, len(lines)):
         if TIMESTAMP_RE.match(lines[i]):
             break
         parts.append(lines[i])
     return "".join(parts)
 
+
 os.makedirs(OUT, exist_ok=True)
+
 for stage_num in range(1, 50):
-    prompt_line = find_line(f"Stage {stage_num} task prompt")
-    if prompt_line == -1:
+    # Find ALL task prompt entries for this stage — use the LAST one
+    # (if a full retry happened, the second prompt includes prior QA findings)
+    all_prompts = find_all_lines(f"Stage {stage_num} task prompt")
+    if not all_prompts:
         break
-    # Extract post-transform output (final quality after governance transforms)
+
+    prompt_line = all_prompts[-1]  # Use last (retry) attempt
+    retried = len(all_prompts) > 1
+
+    # Find task_full= within a few lines of the prompt marker
+    task_full_line = next(
+        (i for i in range(prompt_line, min(prompt_line + 15, len(lines))) if "task_full=" in lines[i]),
+        -1,
+    )
+
+    # Find the LAST post-transform output after the last task prompt
     transform_line = find_line(f"Stage {stage_num} post-transform", prompt_line)
-    task_full_line = next((i for i in range(prompt_line, min(prompt_line+10, len(lines)))
-                          if "task_full=" in lines[i]), -1)
+    # Walk forward to the last post-transform for this stage (QA remediation cycles may log several).
+    # Guard on -1: otherwise the search restarts at line 0 and picks up a pre-retry entry.
+    while transform_line != -1:
+        next_transform = find_line(f"Stage {stage_num} post-transform", transform_line + 1)
+        # Stop at end of file, or once the next match lies past the next stage's task prompt
+        next_stage_prompt = find_line(f"Stage {stage_num + 1} task prompt", transform_line + 1)
+        if next_transform == -1:
+            break
+        if next_stage_prompt != -1 and next_transform > next_stage_prompt:
+            break
+        transform_line = next_transform
+
     transformed_full_line = -1
+    content_key = "transformed_full"
+
     if transform_line != -1:
-        transformed_full_line = next((i for i in range(transform_line, min(transform_line+10, len(lines)))
-                                     if "transformed_full=" in lines[i]), -1)
-    # Fallback to raw response if no post-transform entry (e.g., no transforms applied)
+        transformed_full_line = next(
+            (
+                i
+                for i in range(transform_line, min(transform_line + 15, len(lines)))
+                if "transformed_full=" in lines[i]
+            ),
+            -1,
+        )
+
+    # Fallback to raw response if no post-transform entry
     if transformed_full_line == -1:
         response_line = find_line(f"Stage {stage_num} response", prompt_line)
         if response_line != -1:
-            transformed_full_line = next((i for i in range(response_line, min(response_line+10, len(lines)))
-                                         if "content_full=" in lines[i]), -1)
+            transformed_full_line = next(
+                (
+                    i
+                    for i in range(response_line, min(response_line + 15, len(lines)))
+                    if "content_full=" in lines[i]
+                ),
+                -1,
+            )
             content_key = "content_full"
         else:
             continue
-    else:
-        content_key = "transformed_full"
+
     if task_full_line == -1 or transformed_full_line == -1:
         continue
+
     prompt = extract_content(task_full_line, "task_full")
     response = extract_content(transformed_full_line, content_key)
+
+    retry_tag = " [RETRY]" if retried else ""
     with open(os.path.join(OUT, f"INPUT_{stage_num}.md"), "w") as f:
         f.write(prompt)
     with open(os.path.join(OUT, f"CP_RESPONSE_{stage_num}.md"), "w") as f:
         f.write(response)
-    print(f"Stage {stage_num}: INPUT={len(prompt)}B  CP_RESPONSE={len(response)}B (source: {content_key})")
+    print(
+        f"Stage {stage_num}{retry_tag}: INPUT={len(prompt)}B  "
+        f"CP_RESPONSE={len(response)}B (source: {content_key})"
+    )
@@ -163,6 +163,56 @@ <h2 class="text-lg font-bold mb-3 flex items-center gap-2"><span class="w-2 h-5
       {n:10,ghcp:82,comp:79},{n:11,ghcp:81,comp:77},{n:12,ghcp:85,comp:72},
       {n:13,ghcp:95,comp:79},{n:14,ghcp:20,comp:95}
     ]
+  },
+  {
+    date: "2026-04-08",
+    model: "Sonnet 4.6",
+    project: "KanFlow Azure POC",
+    ghcp_scores: {
+      "B-INST":80,"B-CNST":81,"B-TECH":79,"B-SEC":88,
+      "B-OPS":86,"B-DEP":82,"B-SCOPE":90,"B-QUAL":77,
+      "B-OUT":77,"B-CONS":72,"B-DOC":74,"B-REL":79,
+      "B-RBAC":89,"B-ANTI":87
+    },
+    comparison_scores: {
+      "B-INST":87,"B-CNST":86,"B-TECH":84,"B-SEC":89,
+      "B-OPS":80,"B-DEP":82,"B-SCOPE":89,"B-QUAL":85,
+      "B-OUT":83,"B-CONS":80,"B-DOC":78,"B-REL":90,
+      "B-RBAC":83,"B-ANTI":87
+    },
+    // Per-benchmark sub-factor scores for this run (2026-04-08)
+    ghcp_factors: {
+      "B-INST":{f1:24,f2:20,f3:16,f4:12,f5:8},
+      "B-CNST":{f1:28,f2:24,f3:12,f4:8,f5:9},
+      "B-TECH":{f1:20,f2:20,f3:16,f4:12,f5:11},
+      "B-SEC":{f1:22,f2:22,f3:18,f4:13,f5:13},
+      "B-OPS":{f1:22,f2:17,f3:17,f4:17,f5:13},
+      "B-DEP":{f1:25,f2:20,f3:16,f4:12,f5:9},
+      "B-SCOPE":{f1:32,f2:22,f3:18,f4:9,f5:9},
+      "B-QUAL":{f1:19,f2:15,f3:15,f4:12,f5:16},
+      "B-OUT":{f1:27,f2:15,f3:15,f4:12,f5:8},
+      "B-CONS":{f1:18,f2:14,f3:14,f4:14,f5:12},
+      "B-DOC":{f1:19,f2:18,f3:15,f4:11,f5:11},
+      "B-REL":{f1:24,f2:20,f3:20,f4:15},
+      "B-RBAC":{f1:27,f2:18,f3:18,f4:13,f5:13},
+      "B-ANTI":{f1:22,f2:17,f3:17,f4:17,f5:14}
+    },
+    comparison_factors: {
+      "B-INST":{f1:26,f2:22,f3:17,f4:13,f5:9},
+      "B-CNST":{f1:30,f2:26,f3:13,f4:9,f5:8},
+      "B-TECH":{f1:21,f2:21,f3:17,f4:13,f5:12},
+      "B-SEC":{f1:22,f2:22,f3:18,f4:14,f5:13},
+      "B-OPS":{f1:20,f2:16,f3:16,f4:16,f5:12},
+      "B-DEP":{f1:25,f2:20,f3:16,f4:12,f5:9},
+      "B-SCOPE":{f1:31,f2:22,f3:18,f4:9,f5:9},
+      "B-QUAL":{f1:21,f2:17,f3:17,f4:13,f5:17},
+      "B-OUT":{f1:29,f2:17,f3:17,f4:12,f5:8},
+      "B-CONS":{f1:20,f2:16,f3:16,f4:16,f5:12},
+      "B-DOC":{f1:20,f2:19,f3:16,f4:12,f5:11},
+      "B-REL":{f1:27,f2:22,f3:23,f4:18},
+      "B-RBAC":{f1:25,f2:17,f3:17,f4:12,f5:12},
+      "B-ANTI":{f1:22,f2:17,f3:17,f4:17,f5:14}
+    }
   }
 ];