@@ -947,6 +947,115 @@ def check_t10_shared_state(tasks: list[Path]) -> CriterionResult:
947947 )
948948
949949
def _oa_option_tokens(args: str) -> list[str]:
    """Return the whitespace-separated tokens of *args* that look like options."""
    return [tok for tok in args.split() if len(tok) > 1 and tok.startswith("-")]


def _oa_has_short_flag(options: list[str], letters: str) -> bool:
    """Report whether any short-option cluster (e.g. ``-Fx``) contains one of *letters*."""
    return any(
        re.fullmatch(r"-[A-Za-z0-9]+", opt) and any(ch in letters for ch in opt[1:])
        for opt in options
    )


def check_oa_equivalent_solutions(tasks: list[Path]) -> CriterionResult:
    """O.a: Verifiers accept functionally equivalent solutions (no overly-strict matching).

    Only shell (``.sh``) verifiers are inspected. For each one, three
    heuristics are applied to the script text:

    * a ``grep`` invocation that combines fixed-string and whole-line
      matching (``-Fx``, ``-F -x``, ``--fixed-strings --line-regexp``);
    * a ``[ "$var" = "literal" ]`` / ``==`` comparison against a hard-coded
      string;
    * a ``diff`` invocation with no whitespace/case tolerance option
      (``-w``, ``-b``, ``-B``, ``-i``, ``--ignore*``, ``--strip*``) and no
      process substitution.

    Args:
        tasks: Task directories whose primary verifier should be inspected.

    Returns:
        A PASS result when nothing was flagged, otherwise a WARN result whose
        evidence lists up to 10 offending tasks (up to 20 in ``details``).
    """
    issues = []
    for task_dir in tasks:
        verifier = _get_primary_verifier(task_dir)
        # Every heuristic below is shell-specific, so skip other verifiers
        # before reading them.
        if not verifier or verifier.suffix != ".sh":
            continue

        content = verifier.read_text(errors="replace")
        task_name = task_dir.name
        task_issues = []

        # Flag grep -Fx (exact fixed-string line match). Options are parsed
        # per command as real tokens so that a filename such as "my-Fix.txt"
        # is not mistaken for a flag, and "-F -x" given separately is caught.
        for call in re.finditer(r"\bgrep\s+([^\n|;&]*)", content):
            opts = _oa_option_tokens(call.group(1))
            fixed = "--fixed-strings" in opts or _oa_has_short_flag(opts, "F")
            whole_line = "--line-regexp" in opts or _oa_has_short_flag(opts, "x")
            if fixed and whole_line:
                task_issues.append("grep -Fx (exact fixed-string match)")
                break

        # Flag direct string equality tests: [ "$var" = "hardcoded" ] or == "hardcoded"
        strict_eq = re.findall(r'\[\s*"\$\w+"\s*==?\s*"([^"]+)"\s*\]', content)
        if strict_eq:
            task_issues.append(f"exact string comparison against: {', '.join(strict_eq[:3])}")

        # Flag diff without any tolerance flags (allow diff -w, diff -b, diff --ignore)
        for call in re.finditer(r"\bdiff\s+([^\n|;&]+)", content):
            args = call.group(1)
            # Process substitution means the inputs are pre-processed first.
            if "<(" in args:
                continue
            opts = _oa_option_tokens(args)
            # Only genuine option tokens count: a substring search would treat
            # "--unified", "--brief" or "out-bin.txt" as a tolerance flag.
            tolerant = _oa_has_short_flag(opts, "wbBi") or any(
                opt.startswith(("--ignore", "--strip")) for opt in opts
            )
            if tolerant:
                continue
            task_issues.append("diff without tolerance flags (-w/-b/--ignore)")
            break  # one report per verifier is enough

        if task_issues:
            issues.append(f"{task_name}: {'; '.join(task_issues)}")

    if not issues:
        return CriterionResult(
            criterion_id="O.a", status=Status.PASS,
            evidence=f"No overly-strict matching found across {len(tasks)} verifiers",
        )
    return CriterionResult(
        criterion_id="O.a", status=Status.WARN,
        evidence="\n".join(issues[:10]),
        remediation="Consider using flexible matching (regex, -i flag, tolerance) in verifiers",
        details={"issue_count": len(issues), "issues": issues[:20]},
    )
997+
998+
def check_ob_negated_solutions(tasks: list[Path]) -> CriterionResult:
    """O.b: Verifiers reject negated/inverted solutions (no keyword-only matching).

    Only shell (``.sh``) verifiers are inspected. A ``grep`` call is reported
    when it searches for a single short quoted keyword (at most 20
    characters, no spaces, no regex metacharacters), carries none of the
    flags treated as robust here (``-c -E -P -r -R -l -w -q -n``), and its
    target is neither a source/config file nor a log/reward/result path.

    Args:
        tasks: Task directories whose primary verifier should be inspected.

    Returns:
        A PASS result when nothing was flagged, otherwise a WARN result whose
        evidence lists up to 10 offending tasks (up to 20 in ``details``),
        with at most 3 findings shown per task.
    """
    issues = []
    for task_dir in tasks:
        verifier = _get_primary_verifier(task_dir)
        if not verifier or verifier.suffix != ".sh":
            continue

        content = verifier.read_text(errors="replace")
        task_name = task_dir.name
        task_issues = []

        # Find bare grep for a single short keyword without robust flags.
        # These could match "NOT keyword" or "the answer is definitely not keyword".
        # Exclude greps with flags: -E (regex), -P (perl), -w (word boundary),
        # -c (count), -r/-R (recursive code search), -l (file list), -q (boolean),
        # -n (line numbers).
        bare_greps = re.finditer(
            r"""grep\s+(?:-[A-Za-z]*\s+)*['"]([^'"]{1,20})['"]\s+(\S+)""",
            content,
        )
        for m in bare_greps:
            keyword = m.group(1).strip()
            # Drop quotes/parentheses/semicolons wrapping the target so that
            # '"$dir/app.py"' or 'file.py)' still hit the suffix test below.
            target = m.group(2).strip("'\"();")
            # Slice by offset: splitting on the keyword text breaks when the
            # keyword also occurs in "grep" or the flags, and raises
            # ValueError when the keyword is empty.
            prefix = content[m.start():m.start(1)]

            # A whitespace-only pattern is not a keyword match at all.
            if not keyword:
                continue

            # Skip multi-word or regex patterns (inherently more specific)
            if re.search(r"[.*+?^${}()|\\[\]]", keyword) or " " in keyword:
                continue

            # Skip if grep has flags that make matching more robust
            if re.search(r"-[A-Za-z]*[cEPrlRwqn]", prefix):
                continue

            # Skip if grepping source code files (not agent output)
            if re.search(r"\.(py|js|ts|go|java|rs|c|cpp|sh|rb|yaml|yml|toml|json|md)$", target):
                continue

            # Skip if target is log/reward/result paths (structured output)
            if re.search(r"/logs/|reward\.|result\.|\.log", target):
                continue

            task_issues.append(f"bare grep for '{keyword}' could match negated answer")

        if task_issues:
            issues.append(f"{task_name}: {'; '.join(task_issues[:3])}")

    if not issues:
        return CriterionResult(
            criterion_id="O.b", status=Status.PASS,
            evidence=f"No keyword-only matching vulnerable to negation across {len(tasks)} verifiers",
        )
    return CriterionResult(
        criterion_id="O.b", status=Status.WARN,
        evidence="\n".join(issues[:10]),
        remediation="Use multi-word patterns, regex with context, or structured JSON validation instead of bare keyword grep",
        details={"issue_count": len(issues), "issues": issues[:20]},
    )
1057+
1058+
9501059# ---------------------------------------------------------------------------
9511060# Main auditor
9521061# ---------------------------------------------------------------------------
@@ -959,6 +1068,8 @@ def check_t10_shared_state(tasks: list[Path]) -> CriterionResult:
9591068 "T.4" : check_t4_git_sha ,
9601069 "T.5" : check_t5_no_solution_leak ,
9611070 "T.8" : check_t8_oracle_exists ,
1071+ "O.a" : check_oa_equivalent_solutions ,
1072+ "O.b" : check_ob_negated_solutions ,
9621073 "O.c" : check_oc_empty_solution_rejected ,
9631074 "O.d" : check_od_error_handling ,
9641075 "O.e" : check_oe_multiple_assertions ,
@@ -988,7 +1099,7 @@ def check_t10_shared_state(tasks: list[Path]) -> CriterionResult:
9881099}
9891100
9901101# Semi-automated / manual checks (skip with note)
991- SKIP_CHECKS = {"T.2" , "T.9" , "O.a" , "O.b" , "O. f" , "O.g" , "R.6" }
1102+ SKIP_CHECKS = {"T.2" , "T.9" , "O.f" , "O.g" , "R.6" }
9921103
9931104
9941105def audit_suite (suite : str , dimension : Optional [Dimension ] = None ) -> AuditReport :
0 commit comments