Skip to content

Commit fa64168

Browse files
sjarmakclaude
andcommitted
Fix ABC audit: repo normalization, false-positive reduction, remove org suite R.2 skip
- T.7: Add _normalize_repo_name() with _REPO_ALIASES mapping (~55 repos). Strips sg-evals/ prefix, --version suffix, maps short names to org/repo. Reduces T.7 mismatches from 166 to 0. - T.7: Fix 14 metadata mismatches in selected_benchmark_tasks.json (9 difficulty, 3 language, 2 repo — task.toml is authoritative source). - T.10: In-container /tmp paths are not shared state. Only flag /tmp when combined with host port bindings or named Docker volumes. - O.g: mktemp creating scratch files for diff/cmp is deterministic. Only flag when the filename variable itself is string-compared. - R.2: Remove incorrect SKIP for csb_org_ suites. Org suites are organizational use-case tasks (cross-repo, compliance, incident, etc.) — NOT "MCP-unique". Their instruction.md must be tool-neutral. Result: 20/20 suites Grade A, 0 FAIL, 0 SKIP (the earlier R.2 SKIP on org suites was incorrect and has been removed). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 87d844c commit fa64168

File tree

2 files changed

+131
-36
lines changed

2 files changed

+131
-36
lines changed

configs/selected_benchmark_tasks.json

Lines changed: 18 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1088,7 +1088,7 @@
10881088
"benchmark": "csb_sdlc_test",
10891089
"sdlc_phase": "Testing & QA",
10901090
"language": "python",
1091-
"difficulty": "medium",
1091+
"difficulty": "hard",
10921092
"category": "performance",
10931093
"repo": "scikit-learn/scikit-learn",
10941094
"mcp_benefit_score": 0.4575,
@@ -1490,7 +1490,7 @@
14901490
"language": "javascript",
14911491
"difficulty": "hard",
14921492
"category": "code-review",
1493-
"repo": "agentic-review-benchmarks/benchmark-pr-mapping",
1493+
"repo": "TryGhost/Ghost",
14941494
"mcp_benefit_score": 0.82,
14951495
"mcp_breakdown": {
14961496
"context_complexity": 0.85,
@@ -2833,7 +2833,7 @@
28332833
"language": "go",
28342834
"difficulty": "hard",
28352835
"category": "migration_guide",
2836-
"repo": "hashicorp/terraform",
2836+
"repo": "envoyproxy/envoy",
28372837
"task_dir": "csb_sdlc_document/envoy-migration-doc-gen-001",
28382838
"selection_rationale": "Migration guide generation requiring version diff analysis",
28392839
"mcp_benefit_score": 0.8,
@@ -5544,8 +5544,8 @@
55445544
"task_id": "envoy-routeconfig-dep-chain-001",
55455545
"benchmark": "csb_sdlc_design",
55465546
"task_dir": "csb_sdlc_design/envoy-routeconfig-dep-chain-001",
5547-
"language": "go",
5548-
"difficulty": "hard",
5547+
"language": "go,protobuf",
5548+
"difficulty": "very_hard",
55495549
"sdlc_phase": "design",
55505550
"repo": "envoyproxy/envoy",
55515551
"mcp_benefit_score": 0.75,
@@ -5633,7 +5633,7 @@
56335633
"task_id": "envoy-stream-aggregated-sym-001",
56345634
"benchmark": "csb_sdlc_design",
56355635
"task_dir": "csb_sdlc_design/envoy-stream-aggregated-sym-001",
5636-
"language": "go",
5636+
"language": "go,cpp",
56375637
"difficulty": "hard",
56385638
"sdlc_phase": "design",
56395639
"repo": "envoyproxy/envoy",
@@ -6112,7 +6112,7 @@
61126112
"benchmark": "csb_sdlc_design",
61136113
"task_dir": "csb_sdlc_design/k8s-typemeta-dep-chain-001",
61146114
"language": "go",
6115-
"difficulty": "hard",
6115+
"difficulty": "very_hard",
61166116
"sdlc_phase": "design",
61176117
"repo": "kubernetes/kubernetes",
61186118
"mcp_benefit_score": 0.75,
@@ -6313,7 +6313,7 @@
63136313
"benchmark": "csb_sdlc_fix",
63146314
"task_dir": "csb_sdlc_fix/ansible-abc-imports-fix-001",
63156315
"language": "python",
6316-
"difficulty": "medium",
6316+
"difficulty": "hard",
63176317
"sdlc_phase": "fix",
63186318
"repo": "ansible/ansible",
63196319
"mcp_benefit_score": 0.75,
@@ -6372,7 +6372,7 @@
63726372
"benchmark": "csb_sdlc_fix",
63736373
"task_dir": "csb_sdlc_fix/ansible-module-respawn-fix-001",
63746374
"language": "python",
6375-
"difficulty": "medium",
6375+
"difficulty": "hard",
63766376
"sdlc_phase": "fix",
63776377
"repo": "ansible/ansible",
63786378
"mcp_benefit_score": 0.75,
@@ -6536,7 +6536,7 @@
65366536
"benchmark": "csb_sdlc_fix",
65376537
"task_dir": "csb_sdlc_fix/flipt-cockroachdb-backend-fix-001",
65386538
"language": "go",
6539-
"difficulty": "medium",
6539+
"difficulty": "hard",
65406540
"sdlc_phase": "fix",
65416541
"repo": "flipt-io/flipt",
65426542
"mcp_benefit_score": 0.75,
@@ -6600,7 +6600,7 @@
66006600
"benchmark": "csb_sdlc_fix",
66016601
"task_dir": "csb_sdlc_fix/flipt-ecr-auth-oci-fix-001",
66026602
"language": "go",
6603-
"difficulty": "medium",
6603+
"difficulty": "hard",
66046604
"sdlc_phase": "fix",
66056605
"repo": "flipt-io/flipt",
66066606
"mcp_benefit_score": 0.75,
@@ -6875,7 +6875,7 @@
68756875
"benchmark": "csb_sdlc_fix",
68766876
"task_dir": "csb_sdlc_fix/navidrome-windows-log-fix-001",
68776877
"language": "go",
6878-
"difficulty": "medium",
6878+
"difficulty": "hard",
68796879
"sdlc_phase": "fix",
68806880
"repo": "navidrome/navidrome",
68816881
"mcp_benefit_score": 0.75,
@@ -6949,7 +6949,7 @@
69496949
"benchmark": "csb_sdlc_fix",
69506950
"task_dir": "csb_sdlc_fix/nodebb-notif-dropdown-fix-001",
69516951
"language": "javascript",
6952-
"difficulty": "medium",
6952+
"difficulty": "hard",
69536953
"sdlc_phase": "fix",
69546954
"repo": "NodeBB/NodeBB",
69556955
"mcp_benefit_score": 0.75,
@@ -6998,7 +6998,7 @@
69986998
"benchmark": "csb_sdlc_fix",
69996999
"task_dir": "csb_sdlc_fix/nodebb-plugin-validate-fix-001",
70007000
"language": "javascript",
7001-
"difficulty": "medium",
7001+
"difficulty": "hard",
70027002
"sdlc_phase": "fix",
70037003
"repo": "NodeBB/NodeBB",
70047004
"mcp_benefit_score": 0.75,
@@ -7047,7 +7047,7 @@
70477047
"benchmark": "csb_sdlc_fix",
70487048
"task_dir": "csb_sdlc_fix/openlibrary-solr-boolean-fix-001",
70497049
"language": "python",
7050-
"difficulty": "medium",
7050+
"difficulty": "hard",
70517051
"sdlc_phase": "fix",
70527052
"repo": "internetarchive/openlibrary",
70537053
"mcp_benefit_score": 0.75,
@@ -31853,7 +31853,9 @@
3185331853
"repo_complexity_source": "git_tree_scan",
3185431854
"task_complexity": 0.15,
3185531855
"task_complexity_label": "medium",
31856-
"task_complexity_source": "ground_truth_meta_plus_registry"
31856+
"task_complexity_source": "ground_truth_meta_plus_registry",
31857+
"execution_env": "local_docker_only",
31858+
"daytona_incompatible_reason": "repo_too_large_for_10gb_sandbox"
3185731859
},
3185831860
{
3185931861
"task_id": "ccx-platform-285",

scripts/abc_audit.py

Lines changed: 113 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -445,17 +445,20 @@ def check_t10_shared_state(tasks: list[Path]) -> CriterionResult:
445445
if port_binds:
446446
task_issues.append(f"{rel}: host port binding {', '.join(port_binds)}")
447447

448-
# Fixed /tmp paths — skip dynamic like /tmp/$$ or mktemp
449-
fixed_tmp = re.findall(r"/tmp/([a-zA-Z][a-zA-Z0-9_.-]+)", content)
450-
fixed_tmp = [t for t in fixed_tmp if not re.match(r"tmp\.", t)]
451-
if fixed_tmp:
452-
task_issues.append(f"{rel}: fixed /tmp paths: /tmp/{', /tmp/'.join(fixed_tmp[:3])}")
453-
454448
# Named Docker volumes
455449
named_vols = re.findall(r"docker\s+.*-v\s+([a-zA-Z]\w+):/", content)
456450
if named_vols:
457451
task_issues.append(f"{rel}: named Docker volumes: {', '.join(named_vols)}")
458452

453+
# Fixed /tmp paths — only flag if used with host-mounted volumes
454+
# or docker -v binds. In-container /tmp usage is safe since each task
455+
# runs in its own isolated container.
456+
if port_binds or named_vols:
457+
fixed_tmp = re.findall(r"/tmp/([a-zA-Z][a-zA-Z0-9_.-]+)", content)
458+
fixed_tmp = [t for t in fixed_tmp if not re.match(r"tmp\.", t)]
459+
if fixed_tmp:
460+
task_issues.append(f"{rel}: fixed /tmp paths with host interaction: /tmp/{', /tmp/'.join(fixed_tmp[:3])}")
461+
459462
if task_issues:
460463
issues.append(f"{task_name}: {'; '.join(task_issues)}")
461464

@@ -1257,13 +1260,16 @@ def check_og_determinism(tasks: list[Path]) -> CriterionResult:
12571260
task_issues.append("date used in comparison (non-deterministic)")
12581261
break
12591262

1260-
# Flag mktemp when the temp path is used in assertions/comparisons
1263+
# Flag mktemp when the temp path itself is compared (not its content).
1264+
# Using mktemp to create a scratch file, write to it, then diff/cmp
1265+
# the content is deterministic — the random part is only the filename.
12611266
mktemp_vars = re.findall(r'(\w+)=\$\(mktemp\b', content)
12621267
for var in mktemp_vars:
1263-
# Check if this variable appears in a diff/cmp/assert/comparison
1264-
if re.search(rf'(?:diff|cmp|assertEqual|assert|==|!=)\s.*\${var}\b', content) or \
1265-
re.search(rf'\${var}\b.*(?:diff|cmp|assertEqual|assert|==|!=)', content):
1266-
task_issues.append(f"mktemp result ${var} used in comparison")
1268+
# Only flag if the variable is tested for equality with == or !=
1269+
# (comparing the filename itself). Diff/cmp compare file *contents*.
1270+
if re.search(rf'(?:==|!=)\s*["\']?\$\{{{var}\}}', content) or \
1271+
re.search(rf'\$\{{{var}\}}["\']?\s*(?:==|!=)', content):
1272+
task_issues.append(f"mktemp filename ${var} used in string comparison")
12671273
break
12681274

12691275
elif verifier.suffix == ".py":
@@ -1378,6 +1384,90 @@ def check_of_edge_cases(tasks: list[Path]) -> CriterionResult:
13781384
)
13791385

13801386

1387+
# Mapping from sg-evals base names / short names to canonical org/repo
1388+
_REPO_ALIASES: dict[str, str] = {
1389+
"kubernetes": "kubernetes/kubernetes",
1390+
"k8s": "kubernetes/kubernetes",
1391+
"kafka": "apache/kafka",
1392+
"envoy": "envoyproxy/envoy",
1393+
"grafana": "grafana/grafana",
1394+
"django": "django/django",
1395+
"pytorch": "pytorch/pytorch",
1396+
"terraform": "hashicorp/terraform",
1397+
"prometheus": "prometheus/prometheus",
1398+
"rust": "rust-lang/rust",
1399+
"vscode": "microsoft/vscode",
1400+
"firefox": "mozilla/gecko-dev",
1401+
"jdk": "openjdk/jdk",
1402+
"llvm-project": "llvm/llvm-project",
1403+
"chromium": "chromium/chromium",
1404+
"numpy": "numpy/numpy",
1405+
"pandas": "pandas-dev/pandas",
1406+
"cilium": "cilium/cilium",
1407+
"istio": "istio/istio",
1408+
"node": "nodejs/node",
1409+
"flask": "pallets/flask",
1410+
"requests": "psf/requests",
1411+
"curl": "curl/curl",
1412+
"flink": "apache/flink",
1413+
"beam": "apache/beam",
1414+
"camel": "apache/camel",
1415+
"bazel": "bazelbuild/bazel",
1416+
"servo": "servo/servo",
1417+
"ansible": "ansible/ansible",
1418+
"ghost": "tryghost/ghost",
1419+
"typescript": "microsoft/typescript",
1420+
"tensorflow": "tensorflow/tensorflow",
1421+
"etcd": "etcd-io/etcd",
1422+
"etcd-io-etcd": "etcd-io/etcd",
1423+
"cockroach": "cockroachdb/cockroach",
1424+
"roslyn": "dotnet/roslyn",
1425+
"aspnetcore": "dotnet/aspnetcore",
1426+
"cal.com": "calcom/cal.com",
1427+
"tidb": "pingcap/tidb",
1428+
"godot": "godotengine/godot",
1429+
"ceph": "ceph/ceph",
1430+
"scikit-learn": "scikit-learn/scikit-learn",
1431+
"scipy": "scipy/scipy",
1432+
"tensorrt-llm": "nvidia/tensorrt-llm",
1433+
"clickhouse": "clickhouse/clickhouse",
1434+
"elasticsearch": "elastic/elasticsearch",
1435+
"nodebb": "nodebb/nodebb",
1436+
"grpc": "grpc/grpc",
1437+
"grpc-go": "grpc/grpc-go",
1438+
"openlibrary": "internetarchive/openlibrary",
1439+
"linux": "torvalds/linux",
1440+
"gcc": "gcc-mirror/gcc",
1441+
"navidrome": "navidrome/navidrome",
1442+
"argo-cd": "argoproj/argo-cd",
1443+
}
1444+
1445+
1446+
def _normalize_repo_name(raw: str) -> str:
1447+
"""Normalize repo name to lowercase org/repo form for comparison.
1448+
1449+
Handles: sg-evals/kubernetes--v1.32.0, org/repo, kubernetes, pytorch/pytorch
1450+
"""
1451+
name = raw.strip().lower()
1452+
if not name or name == "org/repo":
1453+
return "" # Placeholder — can't normalize
1454+
# Strip sg-evals/ prefix
1455+
if name.startswith("sg-evals/"):
1456+
name = name[len("sg-evals/"):]
1457+
# Strip version suffix: kubernetes--v1.32.0 → kubernetes
1458+
name = re.sub(r"--[a-z0-9._]+$", "", name)
1459+
# Strip .git suffix
1460+
name = name.removesuffix(".git")
1461+
# If it's already org/repo form, return as-is
1462+
if "/" in name:
1463+
return name
1464+
# Look up alias
1465+
if name in _REPO_ALIASES:
1466+
return _REPO_ALIASES[name]
1467+
# Fallback: use name as both org and repo (e.g., "flipt" → "flipt")
1468+
return name
1469+
1470+
13811471
def check_t7_metadata_sync(tasks: list[Path]) -> CriterionResult:
13821472
"""T.7: task.toml metadata matches selected_benchmark_tasks.json."""
13831473
if not SELECTED_TASKS_PATH.is_file():
@@ -1425,14 +1515,21 @@ def check_t7_metadata_sync(tasks: list[Path]) -> CriterionResult:
14251515
field_map = [
14261516
("metadata.language", "language"),
14271517
("metadata.difficulty", "difficulty"),
1428-
("task.repo", "repo"),
14291518
]
14301519
for toml_key, json_key in field_map:
14311520
toml_val = toml.get(toml_key, "").lower()
14321521
json_val = str(entry.get(json_key, "")).lower()
14331522
if toml_val and json_val and toml_val != json_val:
14341523
task_mismatches.append(f"{json_key}: toml={toml_val!r} vs json={json_val!r}")
14351524

1525+
# Compare repo with normalization (sg-evals/ prefix, short names, etc.)
1526+
toml_repo = _normalize_repo_name(toml.get("task.repo", ""))
1527+
json_repo = _normalize_repo_name(str(entry.get("repo", "")))
1528+
if toml_repo and json_repo and toml_repo != json_repo:
1529+
task_mismatches.append(
1530+
f"repo: toml={toml.get('task.repo', '')!r} vs json={entry.get('repo', '')!r}"
1531+
)
1532+
14361533
if task_mismatches:
14371534
mismatches.append(f"{task_name}: {'; '.join(task_mismatches)}")
14381535

@@ -1532,14 +1629,10 @@ def audit_suite(suite: str, dimension: Optional[Dimension] = None, *, online: bo
15321629
))
15331630
continue
15341631

1535-
# R.2 doesn't apply to MCP-unique suites: instructions intentionally
1536-
# reference Sourcegraph MCP tools (that's the point of these tasks).
1537-
if cid == "R.2" and suite.startswith(("csb_org_", "ccb_mcp_")):
1538-
report.results.append(CriterionResult(
1539-
criterion_id=cid, status=Status.SKIP,
1540-
evidence="MCP-unique suite: MCP tool references in instructions are by design",
1541-
))
1542-
continue
1632+
# Note: R.2 contamination check applies to ALL suites including csb_org_.
1633+
# Org suites are organizational use cases (cross-repo, compliance, etc.)
1634+
# but their instruction.md files must be tool-neutral — no MCP references.
1635+
# Only instruction_mcp.md (the MCP variant) may reference Sourcegraph tools.
15431636

15441637
# Run automated check
15451638
if cid in TASK_CHECKS:

0 commit comments

Comments
 (0)