|
| 1 | +{ |
| 2 | + "project": "CodeScaleBench ABC Check Implementation", |
| 3 | + "branchName": "ralph/abc-checks", |
| 4 | + "description": "Implement all 8 SKIP'd ABC framework criteria checks in abc_audit.py so that no criterion returns SKIP in the audit output.", |
| 5 | + "userStories": [ |
| 6 | + { |
| 7 | + "id": "US-001", |
| 8 | + "title": "Implement T.2 URL reachability check", |
| 9 | + "description": "As a benchmark maintainer, I want automated URL reachability checking in instruction.md files, so that T.2 stops being SKIP'd.", |
| 10 | + "acceptanceCriteria": [ |
| 11 | + "abc_audit.py contains a function check_t2_url_reachability(tasks) that extracts HTTP/HTTPS URLs from instruction.md files", |
| 12 | + "The function makes HEAD requests with 5s timeout to verify reachability, skipping localhost/10.x/172.16-31.x/192.168.x (private-range) addresses", |
| 13 | + "The function returns PASS if all URLs reachable, WARN for timeouts, FAIL for 404s", |
| 14 | + "T.2 is removed from SKIP_CHECKS set on line 932 of abc_audit.py", |
| 15 | + "T.2 is added to TASK_CHECKS dict on line 898 of abc_audit.py", |
| 16 | + "python3 scripts/abc_audit.py --suite csb_sdlc_fix --format table 2>&1 | grep T.2 shows PASS or WARN or FAIL (not SKIP)" |
| 17 | + ], |
| 18 | + "priority": 1, |
| 19 | + "passes": true, |
| 20 | + "notes": "URL check should be gated behind an --online flag to avoid flaky CI. When --online is not set, return PASS with evidence 'URL check skipped (use --online)'. Add --online flag to argparse." |
| 21 | + }, |
| 22 | + { |
| 23 | + "id": "US-002", |
| 24 | + "title": "Implement T.9 false-positive detection check", |
| 25 | + "description": "As a benchmark maintainer, I want a check that detects systematic verifier false positives, so that T.9 stops being SKIP'd.", |
| 26 | + "acceptanceCriteria": [ |
| 27 | + "abc_audit.py contains a function check_t9_false_positives(tasks) that analyzes verifier scripts for patterns that could produce false positives", |
| 28 | + "The check flags verifiers that unconditionally pass (reward=1.0) without meaningful assertions — cross-references with O.c logic", |
| 29 | + "The check also flags verifiers that only check file existence without validating content", |
| 30 | + "T.9 is removed from SKIP_CHECKS set on line 932", |
| 31 | + "T.9 is added to TASK_CHECKS dict", |
| 32 | + "python3 scripts/abc_audit.py --suite csb_sdlc_fix --format table 2>&1 | grep T.9 shows PASS or WARN (not SKIP)" |
| 33 | + ], |
| 34 | + "priority": 2, |
| 35 | + "passes": true, |
| 36 | + "notes": "T.9 is RECOMMENDED severity, so use WARN not FAIL for issues found." |
| 37 | + }, |
| 38 | + { |
| 39 | + "id": "US-003", |
| 40 | + "title": "Implement T.10 shared-state check", |
| 41 | + "description": "As a benchmark maintainer, I want verification that tasks don't share mutable state, so that T.10 stops being SKIP'd.", |
| 42 | + "acceptanceCriteria": [ |
| 43 | + "abc_audit.py contains a function check_t10_shared_state(tasks) that scans Dockerfiles and test.sh for hardcoded ports, shared /tmp paths with fixed names, or named Docker volumes", |
| 44 | + "The check flags tasks that EXPOSE or bind to host ports, or write to paths outside /workspace and /logs", |
| 45 | + "T.10 is removed from SKIP_CHECKS set on line 932", |
| 46 | + "T.10 is added to TASK_CHECKS dict", |
| 47 | + "python3 scripts/abc_audit.py --suite csb_sdlc_fix --format table 2>&1 | grep T.10 shows PASS or WARN (not SKIP)" |
| 48 | + ], |
| 49 | + "priority": 3, |
| 50 | + "passes": true, |
| 51 | + "notes": "T.10 is IMPORTANT severity. Many tasks use /tmp but with unique names — only flag fixed paths like /tmp/mytest or /tmp/results." |
| 52 | + }, |
| 53 | + { |
| 54 | + "id": "US-004", |
| 55 | + "title": "Implement O.a equivalent-solution check", |
| 56 | + "description": "As a benchmark maintainer, I want a check that verifiers accept functionally equivalent solutions, so that O.a stops being SKIP'd.", |
| 57 | + "acceptanceCriteria": [ |
| 58 | + "abc_audit.py contains a function check_oa_equivalent_solutions(tasks) that analyzes verifier scripts for overly-strict matching", |
| 59 | + "The check flags verifiers that use exact string comparison (grep -Fx, diff without tolerance) on agent output without case-insensitive or regex matching", |
| 60 | + "The check flags verifiers that compare against a single hardcoded answer string without alternatives", |
| 61 | + "O.a is removed from SKIP_CHECKS set on line 932", |
| 62 | + "O.a is added to TASK_CHECKS dict", |
| 63 | + "python3 scripts/abc_audit.py --suite csb_sdlc_fix --format table 2>&1 | grep O.a shows PASS or WARN (not SKIP)" |
| 64 | + ], |
| 65 | + "priority": 4, |
| 66 | + "passes": false, |
| 67 | + "notes": "O.a is CRITICAL severity but this is heuristic analysis. Return WARN for tasks that need manual review, PASS when verifiers clearly use flexible matching." |
| 68 | + }, |
| 69 | + { |
| 70 | + "id": "US-005", |
| 71 | + "title": "Implement O.b negated-solution check", |
| 72 | + "description": "As a benchmark maintainer, I want verification that verifiers reject negated or inverted solutions, so that O.b stops being SKIP'd.", |
| 73 | + "acceptanceCriteria": [ |
| 74 | + "abc_audit.py contains a function check_ob_negated_solutions(tasks) that checks verifier scripts for keyword-only matching that would accept contradictory answers", |
| 75 | + "The check flags verifiers that grep for a single keyword without context (e.g., grep 'yes' would match 'the answer is NOT yes')", |
| 76 | + "O.b is removed from SKIP_CHECKS set on line 932", |
| 77 | + "O.b is added to TASK_CHECKS dict", |
| 78 | + "python3 scripts/abc_audit.py --suite csb_sdlc_fix --format table 2>&1 | grep O.b shows PASS or WARN (not SKIP)" |
| 79 | + ], |
| 80 | + "priority": 5, |
| 81 | + "passes": false, |
| 82 | + "notes": "O.b is IMPORTANT severity. Most verifiers use checklist/JSON validation which inherently handles negation. Focus on simple grep-based verifiers." |
| 83 | + }, |
| 84 | + { |
| 85 | + "id": "US-006", |
| 86 | + "title": "Implement O.f edge-case check", |
| 87 | + "description": "As a benchmark maintainer, I want checks for edge-case handling in verifiers, so that O.f stops being SKIP'd.", |
| 88 | + "acceptanceCriteria": [ |
| 89 | + "abc_audit.py contains a function check_of_edge_cases(tasks) that checks verifiers handle edge cases", |
| 90 | + "The check verifies that verifiers check for file existence before reading (e.g., test -f, [ -f ), handle empty output gracefully, and handle malformed JSON when parsing JSON output", |
| 91 | + "O.f is removed from SKIP_CHECKS set on line 932", |
| 92 | + "O.f is added to TASK_CHECKS dict", |
| 93 | + "python3 scripts/abc_audit.py --suite csb_sdlc_fix --format table 2>&1 | grep O.f shows PASS or WARN (not SKIP)" |
| 94 | + ], |
| 95 | + "priority": 6, |
| 96 | + "passes": false, |
| 97 | + "notes": "O.f is RECOMMENDED severity, so use WARN for issues. Check that verifiers using jq/python json.loads have error handling around them." |
| 98 | + }, |
| 99 | + { |
| 100 | + "id": "US-007", |
| 101 | + "title": "Implement O.g determinism check", |
| 102 | + "description": "As a benchmark maintainer, I want checks for determinism in verifiers, so that O.g stops being SKIP'd.", |
| 103 | + "acceptanceCriteria": [ |
| 104 | + "abc_audit.py contains a function check_og_determinism(tasks) that scans verifier scripts for non-deterministic commands", |
| 105 | + "The check flags usage of $RANDOM, date (for comparison not logging), uuidgen, shuf, mktemp (in comparisons), or unseeded random in Python verifiers", |
| 106 | + "O.g is removed from SKIP_CHECKS set on line 932", |
| 107 | + "O.g is added to TASK_CHECKS dict", |
| 108 | + "python3 scripts/abc_audit.py --suite csb_sdlc_fix --format table 2>&1 | grep O.g shows PASS or WARN (not SKIP)" |
| 109 | + ], |
| 110 | + "priority": 7, |
| 111 | + "passes": false, |
| 112 | + "notes": "O.g is IMPORTANT severity. mktemp for temporary files is fine — only flag it when the temp path is used in assertions/comparisons." |
| 113 | + }, |
| 114 | + { |
| 115 | + "id": "US-008", |
| 116 | + "title": "Implement R.6 multi-config comparison check", |
| 117 | + "description": "As a benchmark maintainer, I want automated verification that multiple config results exist for comparison, so that R.6 stops being SKIP'd.", |
| 118 | + "acceptanceCriteria": [ |
| 119 | + "abc_audit.py contains a function check_r6_multi_config(suite) that checks runs/official/ for results from at least 2 different configs", |
| 120 | + "The check identifies configs by directory name patterns: 'baseline', 'sourcegraph_full', 'SG_base', 'SG_full'", |
| 121 | + "Returns PASS if >=2 configs found, WARN if only 1 config is found or if no runs exist (never SKIP, so the audit output stays SKIP-free)", |
| 122 | + "R.6 is removed from SKIP_CHECKS set on line 932", |
| 123 | + "R.6 is added to SUITE_CHECKS dict (takes suite: str)", |
| 124 | + "python3 scripts/abc_audit.py --suite csb_sdlc_fix --format table 2>&1 | grep R.6 shows PASS or WARN (not SKIP)" |
| 125 | + ], |
| 126 | + "priority": 8, |
| 127 | + "passes": false, |
| 128 | + "notes": "R.6 is IMPORTANT severity. This should check for run directory names containing config identifiers." |
| 129 | + }, |
| 130 | + { |
| 131 | + "id": "US-009", |
| 132 | + "title": "Implement T.7 metadata sync check", |
| 133 | + "description": "As a benchmark maintainer, I want automated verification that task.toml metadata matches selected_benchmark_tasks.json, so that T.7 stops being SKIP'd.", |
| 134 | + "acceptanceCriteria": [ |
| 135 | + "abc_audit.py contains a function check_t7_metadata_sync(tasks) that compares task.toml fields against selected_benchmark_tasks.json entries", |
| 136 | + "The check compares: task id/name, suite, repo, language, difficulty — reporting specific mismatches", |
| 137 | + "Returns PASS if all match, WARN if selected_benchmark_tasks.json has fewer entries than task.toml files, FAIL for value mismatches", |
| 138 | + "T.7 is not present in the SKIP_CHECKS set (note: T.7 is not currently in SKIP_CHECKS; it is only missing from TASK_CHECKS — verify where it is handled and remove any other code path that reports it as SKIP)", |
| 139 | + "T.7 is added to TASK_CHECKS dict", |
| 140 | + "python3 scripts/abc_audit.py --suite csb_sdlc_fix --format table 2>&1 | grep T.7 shows PASS or WARN or FAIL (not SKIP)" |
| 141 | + ], |
| 142 | + "priority": 9, |
| 143 | + "passes": false, |
| 144 | + "notes": "T.7 is IMPORTANT severity. Uses parse_task_toml_simple() already in the file for reading task.toml. selected_benchmark_tasks.json path is already defined as SELECTED_TASKS_PATH." |
| 145 | + }, |
| 146 | + { |
| 147 | + "id": "US-010", |
| 148 | + "title": "Verify all SKIP_CHECKS eliminated and audit passes", |
| 149 | + "description": "As a benchmark maintainer, I want to confirm that no criterion returns SKIP in the audit output after all checks are implemented.", |
| 150 | + "acceptanceCriteria": [ |
| 151 | + "SKIP_CHECKS set on line 932 of abc_audit.py is empty (or removed entirely)", |
| 152 | + "python3 scripts/abc_audit.py --all --format table 2>&1 | grep SKIP returns zero matches (excluding R.2 for org suites which SKIP by design)", |
| 153 | + "python3 scripts/abc_audit.py --all --format table 2>&1 runs without errors", |
| 154 | + "All 20 suites complete audit without Python exceptions" |
| 155 | + ], |
| 156 | + "priority": 10, |
| 157 | + "passes": false, |
| 158 | + "notes": "R.2 for csb_org_* suites will still show SKIP because MCP references in org suite instructions are by design (line 958). This is correct behavior. All other SKIPs should be eliminated." |
| 159 | + } |
| 160 | + ] |
| 161 | +} |
0 commit comments