Skip to content

Commit 8f1d7c4

Browse files
committed
Benchmark run 2026-04-08: 19 stages scored, reports generated
- Updated extract.py to handle full stage retry (uses last task prompt per stage)
- Simplified INSTRUCTIONS.md extraction section to reference extract.py directly
- Individual run report: benchmarks/2026-04-08-14-40-57.html (19 stages, 14 benchmarks)
- Updated overall.html trends dashboard with run #2 data
- Generated PDF report with 29 charts (overall + 14 factor + 14 trend)
- Updated generate_pdf.py data section with new scores and two-point trend history
- Removed stale test run 2026-03-31-11-16-46.html
1 parent 05dccb3 commit 8f1d7c4

7 files changed

Lines changed: 869 additions & 617 deletions

File tree

benchmarks/2026-03-31-11-16-46.html

Lines changed: 0 additions & 484 deletions
This file was deleted.

benchmarks/2026-04-08-14-40-57.html

Lines changed: 664 additions & 0 deletions
Large diffs are not rendered by default.
1.09 MB
Binary file not shown.

benchmarks/INSTRUCTIONS.md

Lines changed: 11 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -42,74 +42,19 @@ The raw AI response is also available in the log (`"Stage N response"` → `cont
4242

4343
Content boundaries: each multi-line value starts after `=` on the marker line and continues until the next line matching `^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} \|` (a timestamp-prefixed log entry).
4444

45-
### Extraction Script Template
46-
47-
```python
48-
#!/usr/bin/env python3
49-
"""Extract stage prompts and responses from debug log."""
50-
import re, os, sys
51-
52-
LOG = sys.argv[1] # Path to debug log
53-
OUT = sys.argv[2] if len(sys.argv) > 2 else "COMPARE"
54-
TIMESTAMP_RE = re.compile(r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} \|")
55-
56-
with open(LOG, "r", encoding="utf-8", errors="replace") as f:
57-
lines = f.readlines()
58-
59-
def find_line(pattern, start=0):
60-
for i in range(start, len(lines)):
61-
if pattern in lines[i]:
62-
return i
63-
return -1
64-
65-
def extract_content(start_line, prefix):
66-
first_line = lines[start_line]
67-
idx = first_line.find(prefix + "=")
68-
if idx == -1:
69-
return ""
70-
parts = [first_line[idx + len(prefix) + 1:]]
71-
for i in range(start_line + 1, len(lines)):
72-
if TIMESTAMP_RE.match(lines[i]):
73-
break
74-
parts.append(lines[i])
75-
return "".join(parts)
76-
77-
os.makedirs(OUT, exist_ok=True)
78-
for stage_num in range(1, 50):
79-
prompt_line = find_line(f"Stage {stage_num} task prompt")
80-
if prompt_line == -1:
81-
break
82-
# Extract post-transform output (final quality after governance transforms)
83-
transform_line = find_line(f"Stage {stage_num} post-transform", prompt_line)
84-
task_full_line = next((i for i in range(prompt_line, min(prompt_line+10, len(lines)))
85-
if "task_full=" in lines[i]), -1)
86-
transformed_full_line = -1
87-
if transform_line != -1:
88-
transformed_full_line = next((i for i in range(transform_line, min(transform_line+10, len(lines)))
89-
if "transformed_full=" in lines[i]), -1)
90-
# Fallback to raw response if no post-transform entry (e.g., no transforms applied)
91-
if transformed_full_line == -1:
92-
response_line = find_line(f"Stage {stage_num} response", prompt_line)
93-
if response_line != -1:
94-
transformed_full_line = next((i for i in range(response_line, min(response_line+10, len(lines)))
95-
if "content_full=" in lines[i]), -1)
96-
content_key = "content_full"
97-
else:
98-
continue
99-
else:
100-
content_key = "transformed_full"
101-
if task_full_line == -1 or transformed_full_line == -1:
102-
continue
103-
prompt = extract_content(task_full_line, "task_full")
104-
response = extract_content(transformed_full_line, content_key)
105-
with open(os.path.join(OUT, f"INPUT_{stage_num}.md"), "w") as f:
106-
f.write(prompt)
107-
with open(os.path.join(OUT, f"CP_RESPONSE_{stage_num}.md"), "w") as f:
108-
f.write(response)
109-
print(f"Stage {stage_num}: INPUT={len(prompt)}B CP_RESPONSE={len(response)}B (source: {content_key})")
45+
### Extraction Script
46+
47+
Use `benchmarks/extract.py` to extract from the debug log:
48+
49+
```bash
50+
python3 benchmarks/extract.py debug_20260408144057.log COMPARE
11051
```
11152

112-
Usage: `python3 extract.py debug_20260328024351.log COMPARE`
53+
The script handles **full stage retries**: when a stage has multiple `task prompt`
54+
entries (from the retry loop), it uses the **last** attempt's input and the final
55+
post-transform output. Retried stages are marked with `[RETRY]` in the console output.
56+
57+
See `benchmarks/extract.py` for the full implementation.
11358

11459
---
11560

benchmarks/extract.py

Lines changed: 77 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,14 @@
11
#!/usr/bin/env python3
2-
"""Extract stage prompts and responses from debug log."""
3-
import re, os, sys
2+
"""Extract stage prompts and responses from debug log.
3+
4+
When a stage has a full retry (second ``task prompt`` entry), uses the
5+
**last** task prompt and the final post-transform output for that stage.
6+
This ensures benchmarks measure the retry attempt's input — the one that
7+
includes prior QA findings — rather than the original attempt.
8+
"""
9+
import os
10+
import re
11+
import sys
412

513
LOG = sys.argv[1] # Path to debug log
614
OUT = sys.argv[2] if len(sys.argv) > 2 else "COMPARE"
@@ -9,54 +17,105 @@
917
with open(LOG, "r", encoding="utf-8", errors="replace") as f:
1018
lines = f.readlines()
1119

12-
def find_line(pattern, start=0):
20+
21+
def find_all_lines(pattern: str) -> list[int]:
22+
"""Return line indices of ALL occurrences of *pattern*."""
23+
return [i for i, line in enumerate(lines) if pattern in line]
24+
25+
26+
def find_line(pattern: str, start: int = 0) -> int:
1327
for i in range(start, len(lines)):
1428
if pattern in lines[i]:
1529
return i
1630
return -1
1731

18-
def extract_content(start_line, prefix):
32+
33+
def extract_content(start_line: int, prefix: str) -> str:
1934
first_line = lines[start_line]
2035
idx = first_line.find(prefix + "=")
2136
if idx == -1:
2237
return ""
23-
parts = [first_line[idx + len(prefix) + 1:]]
38+
parts = [first_line[idx + len(prefix) + 1 :]]
2439
for i in range(start_line + 1, len(lines)):
2540
if TIMESTAMP_RE.match(lines[i]):
2641
break
2742
parts.append(lines[i])
2843
return "".join(parts)
2944

45+
3046
os.makedirs(OUT, exist_ok=True)
47+
3148
for stage_num in range(1, 50):
32-
prompt_line = find_line(f"Stage {stage_num} task prompt")
33-
if prompt_line == -1:
49+
# Find ALL task prompt entries for this stage — use the LAST one
50+
# (if a full retry happened, the second prompt includes prior QA findings)
51+
all_prompts = find_all_lines(f"Stage {stage_num} task prompt")
52+
if not all_prompts:
3453
break
35-
# Extract post-transform output (final quality after governance transforms)
54+
55+
prompt_line = all_prompts[-1] # Use last (retry) attempt
56+
retried = len(all_prompts) > 1
57+
58+
# Find task_full= within a few lines of the prompt marker
59+
task_full_line = next(
60+
(i for i in range(prompt_line, min(prompt_line + 15, len(lines))) if "task_full=" in lines[i]),
61+
-1,
62+
)
63+
64+
# Find the LAST post-transform output after the last task prompt
3665
transform_line = find_line(f"Stage {stage_num} post-transform", prompt_line)
37-
task_full_line = next((i for i in range(prompt_line, min(prompt_line+10, len(lines)))
38-
if "task_full=" in lines[i]), -1)
66+
# Walk forward to find the very last post-transform for this stage
67+
# (there may be multiple from QA remediation cycles)
68+
while True:
69+
next_transform = find_line(f"Stage {stage_num} post-transform", transform_line + 1)
70+
# Stop when no further post-transform exists, or the next one lies past the next stage's (stage_num + 1) task prompt
71+
next_stage_prompt = find_line(f"Stage {stage_num + 1} task prompt", transform_line + 1)
72+
if next_transform == -1:
73+
break
74+
if next_stage_prompt != -1 and next_transform > next_stage_prompt:
75+
break
76+
transform_line = next_transform
77+
3978
transformed_full_line = -1
79+
content_key = "transformed_full"
80+
4081
if transform_line != -1:
41-
transformed_full_line = next((i for i in range(transform_line, min(transform_line+10, len(lines)))
42-
if "transformed_full=" in lines[i]), -1)
43-
# Fallback to raw response if no post-transform entry (e.g., no transforms applied)
82+
transformed_full_line = next(
83+
(
84+
i
85+
for i in range(transform_line, min(transform_line + 15, len(lines)))
86+
if "transformed_full=" in lines[i]
87+
),
88+
-1,
89+
)
90+
91+
# Fallback to raw response if no post-transform entry
4492
if transformed_full_line == -1:
4593
response_line = find_line(f"Stage {stage_num} response", prompt_line)
4694
if response_line != -1:
47-
transformed_full_line = next((i for i in range(response_line, min(response_line+10, len(lines)))
48-
if "content_full=" in lines[i]), -1)
95+
transformed_full_line = next(
96+
(
97+
i
98+
for i in range(response_line, min(response_line + 15, len(lines)))
99+
if "content_full=" in lines[i]
100+
),
101+
-1,
102+
)
49103
content_key = "content_full"
50104
else:
51105
continue
52-
else:
53-
content_key = "transformed_full"
106+
54107
if task_full_line == -1 or transformed_full_line == -1:
55108
continue
109+
56110
prompt = extract_content(task_full_line, "task_full")
57111
response = extract_content(transformed_full_line, content_key)
112+
113+
retry_tag = " [RETRY]" if retried else ""
58114
with open(os.path.join(OUT, f"INPUT_{stage_num}.md"), "w") as f:
59115
f.write(prompt)
60116
with open(os.path.join(OUT, f"CP_RESPONSE_{stage_num}.md"), "w") as f:
61117
f.write(response)
62-
print(f"Stage {stage_num}: INPUT={len(prompt)}B CP_RESPONSE={len(response)}B (source: {content_key})")
118+
print(
119+
f"Stage {stage_num}{retry_tag}: INPUT={len(prompt)}B "
120+
f"CP_RESPONSE={len(response)}B (source: {content_key})"
121+
)

benchmarks/overall.html

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,56 @@ <h2 class="text-lg font-bold mb-3 flex items-center gap-2"><span class="w-2 h-5
163163
{n:10,ghcp:82,comp:79},{n:11,ghcp:81,comp:77},{n:12,ghcp:85,comp:72},
164164
{n:13,ghcp:95,comp:79},{n:14,ghcp:20,comp:95}
165165
]
166+
},
167+
{
168+
date: "2026-04-08",
169+
model: "Sonnet 4.6",
170+
project: "KanFlow Azure POC",
171+
ghcp_scores: {
172+
"B-INST":80,"B-CNST":81,"B-TECH":79,"B-SEC":88,
173+
"B-OPS":86,"B-DEP":82,"B-SCOPE":90,"B-QUAL":77,
174+
"B-OUT":77,"B-CONS":72,"B-DOC":74,"B-REL":79,
175+
"B-RBAC":89,"B-ANTI":87
176+
},
177+
comparison_scores: {
178+
"B-INST":87,"B-CNST":86,"B-TECH":84,"B-SEC":89,
179+
"B-OPS":80,"B-DEP":82,"B-SCOPE":89,"B-QUAL":85,
180+
"B-OUT":83,"B-CONS":80,"B-DOC":78,"B-REL":90,
181+
"B-RBAC":83,"B-ANTI":87
182+
},
183+
// Per-benchmark sub-factor scores for the latest run
184+
ghcp_factors: {
185+
"B-INST":{f1:24,f2:20,f3:16,f4:12,f5:8},
186+
"B-CNST":{f1:28,f2:24,f3:12,f4:8,f5:9},
187+
"B-TECH":{f1:20,f2:20,f3:16,f4:12,f5:11},
188+
"B-SEC":{f1:22,f2:22,f3:18,f4:13,f5:13},
189+
"B-OPS":{f1:22,f2:17,f3:17,f4:17,f5:13},
190+
"B-DEP":{f1:25,f2:20,f3:16,f4:12,f5:9},
191+
"B-SCOPE":{f1:32,f2:22,f3:18,f4:9,f5:9},
192+
"B-QUAL":{f1:19,f2:15,f3:15,f4:12,f5:16},
193+
"B-OUT":{f1:27,f2:15,f3:15,f4:12,f5:8},
194+
"B-CONS":{f1:18,f2:14,f3:14,f4:14,f5:12},
195+
"B-DOC":{f1:19,f2:18,f3:15,f4:11,f5:11},
196+
"B-REL":{f1:24,f2:20,f3:20,f4:15},
197+
"B-RBAC":{f1:27,f2:18,f3:18,f4:13,f5:13},
198+
"B-ANTI":{f1:22,f2:17,f3:17,f4:17,f5:14}
199+
},
200+
comparison_factors: {
201+
"B-INST":{f1:26,f2:22,f3:17,f4:13,f5:9},
202+
"B-CNST":{f1:30,f2:26,f3:13,f4:9,f5:8},
203+
"B-TECH":{f1:21,f2:21,f3:17,f4:13,f5:12},
204+
"B-SEC":{f1:22,f2:22,f3:18,f4:14,f5:13},
205+
"B-OPS":{f1:20,f2:16,f3:16,f4:16,f5:12},
206+
"B-DEP":{f1:25,f2:20,f3:16,f4:12,f5:9},
207+
"B-SCOPE":{f1:31,f2:22,f3:18,f4:9,f5:9},
208+
"B-QUAL":{f1:21,f2:17,f3:17,f4:13,f5:17},
209+
"B-OUT":{f1:29,f2:17,f3:17,f4:12,f5:8},
210+
"B-CONS":{f1:20,f2:16,f3:16,f4:16,f5:12},
211+
"B-DOC":{f1:20,f2:19,f3:16,f4:12,f5:11},
212+
"B-REL":{f1:27,f2:22,f3:23,f4:18},
213+
"B-RBAC":{f1:25,f2:17,f3:17,f4:12,f5:12},
214+
"B-ANTI":{f1:22,f2:17,f3:17,f4:17,f5:14}
215+
}
166216
}
167217
];
168218

0 commit comments

Comments
 (0)