Skip to content

Commit 753434b

Browse files
bundolee and claude committed
fix: remove fallback 0 for missing thresholds in step summary
Problem: thresholds.get(key, 0) silently treated missing threshold keys as 0, causing the step summary to show a green check even when the threshold config was incomplete.

Solution: skip metrics with missing thresholds instead of defaulting to 0. Also apply regression_tolerance from thresholds.json and use a loop to reduce duplication across score metrics.

Verification: YAML valid. Companion PR opendataloader-bench#9 updates check_regression() with matching tolerance logic.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent f3ab80f commit 753434b

1 file changed

Lines changed: 17 additions & 27 deletions

File tree

.github/workflows/test-benchmark.yml

Lines changed: 17 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -120,32 +120,21 @@ jobs:
120120
table_detection = eval_data.get("table_detection", {})
121121
speed = eval_data.get("speed", {})
122122
triage = eval_data.get("triage", {})
123+
tol = thresholds.get("regression_tolerance", 0)
123124
124125
rows = []
125126
126-
nid = scores.get("nid_mean")
127-
if nid is not None:
128-
t = thresholds.get("nid", 0)
129-
status = "✅" if nid >= t else "❌"
130-
rows.append(f"| NID | {nid:.4f} | ≥ {t} | {status} |")
131-
132-
teds = scores.get("teds_mean")
133-
if teds is not None:
134-
t = thresholds.get("teds", 0)
135-
status = "✅" if teds >= t else "❌"
136-
rows.append(f"| TEDS | {teds:.4f} | ≥ {t} | {status} |")
137-
138-
mhs = scores.get("mhs_mean")
139-
if mhs is not None:
140-
t = thresholds.get("mhs", 0)
141-
status = "✅" if mhs >= t else "❌"
142-
rows.append(f"| MHS | {mhs:.4f} | ≥ {t} | {status} |")
143-
144-
td_f1 = table_detection.get("f1")
145-
if td_f1 is not None:
146-
t = thresholds.get("table_detection_f1", 0)
147-
status = "✅" if td_f1 >= t else "❌"
148-
rows.append(f"| Table Detection F1 | {td_f1:.4f} | ≥ {t} | {status} |")
127+
for key, label, src in [
128+
("nid", "NID", scores.get("nid_mean")),
129+
("teds", "TEDS", scores.get("teds_mean")),
130+
("mhs", "MHS", scores.get("mhs_mean")),
131+
("table_detection_f1", "Table Detection F1", table_detection.get("f1")),
132+
]:
133+
t = thresholds.get(key)
134+
if src is not None and t is not None:
135+
effective = t - tol
136+
status = "✅" if src >= effective else "❌"
137+
rows.append(f"| {label} | {src:.4f} | ≥ {effective:.2f} | {status} |")
149138
150139
elapsed = speed.get("elapsed_per_doc")
151140
elapsed_thresh = thresholds.get("elapsed_per_doc")
@@ -155,10 +144,11 @@ jobs:
155144
156145
if triage:
157146
tr_recall = triage.get("recall")
158-
if tr_recall is not None:
159-
t = thresholds.get("triage_recall", 0)
160-
status = "✅" if tr_recall >= t else "❌"
161-
rows.append(f"| Triage Recall | {tr_recall:.4f} | ≥ {t} | {status} |")
147+
tr_thresh = thresholds.get("triage_recall")
148+
if tr_recall is not None and tr_thresh is not None:
149+
effective = tr_thresh - tol
150+
status = "✅" if tr_recall >= effective else "❌"
151+
rows.append(f"| Triage Recall | {tr_recall:.4f} | ≥ {effective:.2f} | {status} |")
162152
163153
tr_fn = triage.get("fn_count")
164154
tr_fn_max = thresholds.get("triage_fn_max")

0 commit comments

Comments
 (0)