Skip to content

Commit 2624921

Browse files
sjarmak and claude
committed
feat: five-way oracle curation bias mitigations
curate_oracle.py: 1. LLM query expansion — Claude Haiku generates N semantically diverse queries from seed_prompt so no single keyword_search call recovers the full oracle; falls back to pattern variation when ANTHROPIC_API_KEY absent or --no-llm 2. Content validation via lineMatches — _classify_line_context / _infer_file_tier classify each hit as comment/string/code and annotate oracle files as 'required' (defines concept) or 'sufficient' (references/tests it) 3. Oracle quality gate — validate_oracle_quality warns on >15 files, >5 files/repo from a single-term pattern, zero required-tier files, and tier imbalance 4. Two-tier oracle — oracle files carry {"tier": "required"|"sufficient"} so weighted F1 in the evaluator concentrates score on definition-level files 5. Decouple search_pattern from curation — get_curation_queries checks params["curation_queries"] first so task authors can specify exact search queries without exposing them in the agent-visible search_pattern field New CLI flags: --no-llm, --anthropic-api-key oracle_checks.py + 215 task copies: - check_file_set_match: adds weighted_recall, weighted_f1, required_recall, required_total, required_matched when oracle has tier annotations; backward-compatible (untiered oracles unchanged) - _get_primary_score: prefers weighted_f1 over f1 for file_set_match when tier annotations are present so required files count 2x in composite score Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 161e510 commit 2624921

File tree

217 files changed

+13997
-1073
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

217 files changed

+13997
-1073
lines changed

benchmarks/ccb_mcp_compliance/ccx-compliance-051/tests/oracle_checks.py

Lines changed: 63 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,9 @@ def _path_key(item: Dict[str, str]) -> tuple:
132132
return all_matched_oracle, missing, extra
133133

134134

135+
_TIER_WEIGHTS: Dict[str, float] = {"required": 2.0, "sufficient": 1.0}
136+
137+
135138
def check_file_set_match(
136139
answer_files: List[Dict[str, str]],
137140
oracle_files: List[Dict[str, str]],
@@ -142,7 +145,14 @@ def check_file_set_match(
142145
Matching uses two-pass repo normalization: exact match first, then
143146
normalised-repo and path-only fallback for mirror/upstream name mismatches.
144147
145-
Returns raw scores without thresholds.
148+
Returns raw scores without thresholds. When oracle files carry a "tier"
149+
field ("required" or "sufficient"), also computes weighted scores:
150+
- weighted_recall: recall weighted by tier (required=2x, sufficient=1x)
151+
- weighted_f1: F1 using weighted_recall and unweighted precision
152+
- required_recall: recall restricted to "required"-tier files only
153+
154+
All added fields are backward-compatible — callers that ignore them are
155+
unaffected. _get_primary_score prefers weighted_f1 when available.
146156
147157
>>> result = check_file_set_match(
148158
... [{"repo": "a/b", "path": "x.go"}],
@@ -159,6 +169,16 @@ def check_file_set_match(
159169
... )
160170
>>> result["f1"]
161171
1.0
172+
173+
>>> result = check_file_set_match(
174+
... [{"repo": "a/b", "path": "x.go"}],
175+
... [{"repo": "a/b", "path": "x.go", "tier": "required"},
176+
... {"repo": "a/b", "path": "y.go", "tier": "sufficient"}],
177+
... )
178+
>>> result["required_recall"]
179+
1.0
180+
>>> result["weighted_recall"] # matched required(2) / total(3) = 0.6667
181+
0.6667
162182
"""
163183
matched, missing, extra = _match_items(answer_files, oracle_files, ["repo", "path"])
164184

@@ -169,7 +189,7 @@ def check_file_set_match(
169189
precision = len(matched) / n_answer if n_answer else 0.0
170190
f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0.0
171191

172-
return {
192+
result: Dict[str, Any] = {
173193
"recall": round(recall, 4),
174194
"precision": round(precision, 4),
175195
"f1": round(f1, 4),
@@ -178,6 +198,37 @@ def check_file_set_match(
178198
"extra": [{"repo": r, "path": p} for r, p in sorted(extra)],
179199
}
180200

201+
# Weighted scoring — only when oracle carries tier annotations
202+
has_tiers = any("tier" in f for f in oracle_files)
203+
if has_tiers:
204+
# Build weight map keyed by oracle's exact (repo, path) tuples.
205+
# _match_items returns oracle exact keys so the lookup is direct.
206+
weight_map: Dict[tuple, float] = {
207+
(f.get("repo", ""), f.get("path", "")): _TIER_WEIGHTS.get(f.get("tier", "sufficient"), 1.0)
208+
for f in oracle_files
209+
}
210+
total_weight = sum(weight_map.values()) or 1.0
211+
matched_weight = sum(weight_map.get(k, 1.0) for k in matched)
212+
213+
weighted_recall = matched_weight / total_weight
214+
weighted_f1 = (
215+
(2 * precision * weighted_recall / (precision + weighted_recall))
216+
if (precision + weighted_recall) > 0
217+
else 0.0
218+
)
219+
220+
required_keys = {k for k, w in weight_map.items() if w > 1.0}
221+
required_matched = required_keys & set(matched)
222+
required_recall = len(required_matched) / len(required_keys) if required_keys else None
223+
224+
result["weighted_recall"] = round(weighted_recall, 4)
225+
result["weighted_f1"] = round(weighted_f1, 4)
226+
result["required_recall"] = round(required_recall, 4) if required_recall is not None else None
227+
result["required_total"] = len(required_keys)
228+
result["required_matched"] = len(required_matched)
229+
230+
return result
231+
181232

182233
def check_symbol_resolution(
183234
answer_symbols: List[Dict[str, str]],
@@ -462,9 +513,17 @@ def check_test_ratio(
462513

463514

464515
def _get_primary_score(check_result: Dict[str, Any], check_type: str) -> float:
465-
"""Extract the primary score from a check result for composite scoring."""
516+
"""Extract the primary score from a check result for composite scoring.
517+
518+
For file_set_match, prefers weighted_f1 (available when oracle has tier
519+
annotations) over plain f1, so required-tier files count more heavily.
520+
"""
521+
if check_type == "file_set_match":
522+
# Use weighted_f1 when tiers are present, else fall back to f1
523+
value = check_result.get("weighted_f1", check_result.get("f1", 0))
524+
return float(value)
525+
466526
score_keys = {
467-
"file_set_match": "f1",
468527
"symbol_resolution": "recall",
469528
"dependency_chain": "chain_recall",
470529
"provenance": "provenance_score",

benchmarks/ccb_mcp_compliance/ccx-compliance-052/tests/oracle_checks.py

Lines changed: 63 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,9 @@ def _path_key(item: Dict[str, str]) -> tuple:
132132
return all_matched_oracle, missing, extra
133133

134134

135+
_TIER_WEIGHTS: Dict[str, float] = {"required": 2.0, "sufficient": 1.0}
136+
137+
135138
def check_file_set_match(
136139
answer_files: List[Dict[str, str]],
137140
oracle_files: List[Dict[str, str]],
@@ -142,7 +145,14 @@ def check_file_set_match(
142145
Matching uses two-pass repo normalization: exact match first, then
143146
normalised-repo and path-only fallback for mirror/upstream name mismatches.
144147
145-
Returns raw scores without thresholds.
148+
Returns raw scores without thresholds. When oracle files carry a "tier"
149+
field ("required" or "sufficient"), also computes weighted scores:
150+
- weighted_recall: recall weighted by tier (required=2x, sufficient=1x)
151+
- weighted_f1: F1 using weighted_recall and unweighted precision
152+
- required_recall: recall restricted to "required"-tier files only
153+
154+
All added fields are backward-compatible — callers that ignore them are
155+
unaffected. _get_primary_score prefers weighted_f1 when available.
146156
147157
>>> result = check_file_set_match(
148158
... [{"repo": "a/b", "path": "x.go"}],
@@ -159,6 +169,16 @@ def check_file_set_match(
159169
... )
160170
>>> result["f1"]
161171
1.0
172+
173+
>>> result = check_file_set_match(
174+
... [{"repo": "a/b", "path": "x.go"}],
175+
... [{"repo": "a/b", "path": "x.go", "tier": "required"},
176+
... {"repo": "a/b", "path": "y.go", "tier": "sufficient"}],
177+
... )
178+
>>> result["required_recall"]
179+
1.0
180+
>>> result["weighted_recall"] # matched required(2) / total(3) = 0.6667
181+
0.6667
162182
"""
163183
matched, missing, extra = _match_items(answer_files, oracle_files, ["repo", "path"])
164184

@@ -169,7 +189,7 @@ def check_file_set_match(
169189
precision = len(matched) / n_answer if n_answer else 0.0
170190
f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0.0
171191

172-
return {
192+
result: Dict[str, Any] = {
173193
"recall": round(recall, 4),
174194
"precision": round(precision, 4),
175195
"f1": round(f1, 4),
@@ -178,6 +198,37 @@ def check_file_set_match(
178198
"extra": [{"repo": r, "path": p} for r, p in sorted(extra)],
179199
}
180200

201+
# Weighted scoring — only when oracle carries tier annotations
202+
has_tiers = any("tier" in f for f in oracle_files)
203+
if has_tiers:
204+
# Build weight map keyed by oracle's exact (repo, path) tuples.
205+
# _match_items returns oracle exact keys so the lookup is direct.
206+
weight_map: Dict[tuple, float] = {
207+
(f.get("repo", ""), f.get("path", "")): _TIER_WEIGHTS.get(f.get("tier", "sufficient"), 1.0)
208+
for f in oracle_files
209+
}
210+
total_weight = sum(weight_map.values()) or 1.0
211+
matched_weight = sum(weight_map.get(k, 1.0) for k in matched)
212+
213+
weighted_recall = matched_weight / total_weight
214+
weighted_f1 = (
215+
(2 * precision * weighted_recall / (precision + weighted_recall))
216+
if (precision + weighted_recall) > 0
217+
else 0.0
218+
)
219+
220+
required_keys = {k for k, w in weight_map.items() if w > 1.0}
221+
required_matched = required_keys & set(matched)
222+
required_recall = len(required_matched) / len(required_keys) if required_keys else None
223+
224+
result["weighted_recall"] = round(weighted_recall, 4)
225+
result["weighted_f1"] = round(weighted_f1, 4)
226+
result["required_recall"] = round(required_recall, 4) if required_recall is not None else None
227+
result["required_total"] = len(required_keys)
228+
result["required_matched"] = len(required_matched)
229+
230+
return result
231+
181232

182233
def check_symbol_resolution(
183234
answer_symbols: List[Dict[str, str]],
@@ -462,9 +513,17 @@ def check_test_ratio(
462513

463514

464515
def _get_primary_score(check_result: Dict[str, Any], check_type: str) -> float:
465-
"""Extract the primary score from a check result for composite scoring."""
516+
"""Extract the primary score from a check result for composite scoring.
517+
518+
For file_set_match, prefers weighted_f1 (available when oracle has tier
519+
annotations) over plain f1, so required-tier files count more heavily.
520+
"""
521+
if check_type == "file_set_match":
522+
# Use weighted_f1 when tiers are present, else fall back to f1
523+
value = check_result.get("weighted_f1", check_result.get("f1", 0))
524+
return float(value)
525+
466526
score_keys = {
467-
"file_set_match": "f1",
468527
"symbol_resolution": "recall",
469528
"dependency_chain": "chain_recall",
470529
"provenance": "provenance_score",

benchmarks/ccb_mcp_compliance/ccx-compliance-053/tests/oracle_checks.py

Lines changed: 63 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,9 @@ def _path_key(item: Dict[str, str]) -> tuple:
132132
return all_matched_oracle, missing, extra
133133

134134

135+
_TIER_WEIGHTS: Dict[str, float] = {"required": 2.0, "sufficient": 1.0}
136+
137+
135138
def check_file_set_match(
136139
answer_files: List[Dict[str, str]],
137140
oracle_files: List[Dict[str, str]],
@@ -142,7 +145,14 @@ def check_file_set_match(
142145
Matching uses two-pass repo normalization: exact match first, then
143146
normalised-repo and path-only fallback for mirror/upstream name mismatches.
144147
145-
Returns raw scores without thresholds.
148+
Returns raw scores without thresholds. When oracle files carry a "tier"
149+
field ("required" or "sufficient"), also computes weighted scores:
150+
- weighted_recall: recall weighted by tier (required=2x, sufficient=1x)
151+
- weighted_f1: F1 using weighted_recall and unweighted precision
152+
- required_recall: recall restricted to "required"-tier files only
153+
154+
All added fields are backward-compatible — callers that ignore them are
155+
unaffected. _get_primary_score prefers weighted_f1 when available.
146156
147157
>>> result = check_file_set_match(
148158
... [{"repo": "a/b", "path": "x.go"}],
@@ -159,6 +169,16 @@ def check_file_set_match(
159169
... )
160170
>>> result["f1"]
161171
1.0
172+
173+
>>> result = check_file_set_match(
174+
... [{"repo": "a/b", "path": "x.go"}],
175+
... [{"repo": "a/b", "path": "x.go", "tier": "required"},
176+
... {"repo": "a/b", "path": "y.go", "tier": "sufficient"}],
177+
... )
178+
>>> result["required_recall"]
179+
1.0
180+
>>> result["weighted_recall"] # matched required(2) / total(3) = 0.6667
181+
0.6667
162182
"""
163183
matched, missing, extra = _match_items(answer_files, oracle_files, ["repo", "path"])
164184

@@ -169,7 +189,7 @@ def check_file_set_match(
169189
precision = len(matched) / n_answer if n_answer else 0.0
170190
f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0.0
171191

172-
return {
192+
result: Dict[str, Any] = {
173193
"recall": round(recall, 4),
174194
"precision": round(precision, 4),
175195
"f1": round(f1, 4),
@@ -178,6 +198,37 @@ def check_file_set_match(
178198
"extra": [{"repo": r, "path": p} for r, p in sorted(extra)],
179199
}
180200

201+
# Weighted scoring — only when oracle carries tier annotations
202+
has_tiers = any("tier" in f for f in oracle_files)
203+
if has_tiers:
204+
# Build weight map keyed by oracle's exact (repo, path) tuples.
205+
# _match_items returns oracle exact keys so the lookup is direct.
206+
weight_map: Dict[tuple, float] = {
207+
(f.get("repo", ""), f.get("path", "")): _TIER_WEIGHTS.get(f.get("tier", "sufficient"), 1.0)
208+
for f in oracle_files
209+
}
210+
total_weight = sum(weight_map.values()) or 1.0
211+
matched_weight = sum(weight_map.get(k, 1.0) for k in matched)
212+
213+
weighted_recall = matched_weight / total_weight
214+
weighted_f1 = (
215+
(2 * precision * weighted_recall / (precision + weighted_recall))
216+
if (precision + weighted_recall) > 0
217+
else 0.0
218+
)
219+
220+
required_keys = {k for k, w in weight_map.items() if w > 1.0}
221+
required_matched = required_keys & set(matched)
222+
required_recall = len(required_matched) / len(required_keys) if required_keys else None
223+
224+
result["weighted_recall"] = round(weighted_recall, 4)
225+
result["weighted_f1"] = round(weighted_f1, 4)
226+
result["required_recall"] = round(required_recall, 4) if required_recall is not None else None
227+
result["required_total"] = len(required_keys)
228+
result["required_matched"] = len(required_matched)
229+
230+
return result
231+
181232

182233
def check_symbol_resolution(
183234
answer_symbols: List[Dict[str, str]],
@@ -462,9 +513,17 @@ def check_test_ratio(
462513

463514

464515
def _get_primary_score(check_result: Dict[str, Any], check_type: str) -> float:
465-
"""Extract the primary score from a check result for composite scoring."""
516+
"""Extract the primary score from a check result for composite scoring.
517+
518+
For file_set_match, prefers weighted_f1 (available when oracle has tier
519+
annotations) over plain f1, so required-tier files count more heavily.
520+
"""
521+
if check_type == "file_set_match":
522+
# Use weighted_f1 when tiers are present, else fall back to f1
523+
value = check_result.get("weighted_f1", check_result.get("f1", 0))
524+
return float(value)
525+
466526
score_keys = {
467-
"file_set_match": "f1",
468527
"symbol_resolution": "recall",
469528
"dependency_chain": "chain_recall",
470529
"provenance": "provenance_score",

0 commit comments

Comments
 (0)