@@ -445,17 +445,20 @@ def check_t10_shared_state(tasks: list[Path]) -> CriterionResult:
445445 if port_binds :
446446 task_issues .append (f"{ rel } : host port binding { ', ' .join (port_binds )} " )
447447
448- # Fixed /tmp paths — skip dynamic like /tmp/$$ or mktemp
449- fixed_tmp = re .findall (r"/tmp/([a-zA-Z][a-zA-Z0-9_.-]+)" , content )
450- fixed_tmp = [t for t in fixed_tmp if not re .match (r"tmp\." , t )]
451- if fixed_tmp :
452- task_issues .append (f"{ rel } : fixed /tmp paths: /tmp/{ ', /tmp/' .join (fixed_tmp [:3 ])} " )
453-
454448 # Named Docker volumes
455449 named_vols = re .findall (r"docker\s+.*-v\s+([a-zA-Z]\w+):/" , content )
456450 if named_vols :
457451 task_issues .append (f"{ rel } : named Docker volumes: { ', ' .join (named_vols )} " )
458452
453+ # Fixed /tmp paths — only flag if used with host-mounted volumes
454+ # or docker -v binds. In-container /tmp usage is safe since each task
455+ # runs in its own isolated container.
456+ if port_binds or named_vols :
457+ fixed_tmp = re .findall (r"/tmp/([a-zA-Z][a-zA-Z0-9_.-]+)" , content )
458+ fixed_tmp = [t for t in fixed_tmp if not re .match (r"tmp\." , t )]
459+ if fixed_tmp :
460+ task_issues .append (f"{ rel } : fixed /tmp paths with host interaction: /tmp/{ ', /tmp/' .join (fixed_tmp [:3 ])} " )
461+
459462 if task_issues :
460463 issues .append (f"{ task_name } : { '; ' .join (task_issues )} " )
461464
@@ -1257,13 +1260,16 @@ def check_og_determinism(tasks: list[Path]) -> CriterionResult:
12571260 task_issues .append ("date used in comparison (non-deterministic)" )
12581261 break
12591262
1260- # Flag mktemp when the temp path is used in assertions/comparisons
1263+ # Flag mktemp when the temp path itself is compared (not its content).
1264+ # Using mktemp to create a scratch file, write to it, then diff/cmp
1265+ # the content is deterministic — the random part is only the filename.
12611266 mktemp_vars = re .findall (r'(\w+)=\$\(mktemp\b' , content )
12621267 for var in mktemp_vars :
1263- # Check if this variable appears in a diff/cmp/assert/comparison
1264- if re .search (rf'(?:diff|cmp|assertEqual|assert|==|!=)\s.*\${ var } \b' , content ) or \
1265- re .search (rf'\${ var } \b.*(?:diff|cmp|assertEqual|assert|==|!=)' , content ):
1266- task_issues .append (f"mktemp result ${ var } used in comparison" )
1268+ # Only flag if the variable is tested for equality with == or !=
1269+ # (comparing the filename itself). Diff/cmp compare file *contents*.
1270+ if re .search (rf'(?:==|!=)\s*["\']?\$\{{{ var } \}}' , content ) or \
1271+ re .search (rf'\$\{{{ var } \}}["\']?\s*(?:==|!=)' , content ):
1272+ task_issues .append (f"mktemp filename ${ var } used in string comparison" )
12671273 break
12681274
12691275 elif verifier .suffix == ".py" :
@@ -1378,6 +1384,90 @@ def check_of_edge_cases(tasks: list[Path]) -> CriterionResult:
13781384 )
13791385
13801386
1387+ # Mapping from sg-evals base names / short names to canonical org/repo
1388+ _REPO_ALIASES : dict [str , str ] = {
1389+ "kubernetes" : "kubernetes/kubernetes" ,
1390+ "k8s" : "kubernetes/kubernetes" ,
1391+ "kafka" : "apache/kafka" ,
1392+ "envoy" : "envoyproxy/envoy" ,
1393+ "grafana" : "grafana/grafana" ,
1394+ "django" : "django/django" ,
1395+ "pytorch" : "pytorch/pytorch" ,
1396+ "terraform" : "hashicorp/terraform" ,
1397+ "prometheus" : "prometheus/prometheus" ,
1398+ "rust" : "rust-lang/rust" ,
1399+ "vscode" : "microsoft/vscode" ,
1400+ "firefox" : "mozilla/gecko-dev" ,
1401+ "jdk" : "openjdk/jdk" ,
1402+ "llvm-project" : "llvm/llvm-project" ,
1403+ "chromium" : "chromium/chromium" ,
1404+ "numpy" : "numpy/numpy" ,
1405+ "pandas" : "pandas-dev/pandas" ,
1406+ "cilium" : "cilium/cilium" ,
1407+ "istio" : "istio/istio" ,
1408+ "node" : "nodejs/node" ,
1409+ "flask" : "pallets/flask" ,
1410+ "requests" : "psf/requests" ,
1411+ "curl" : "curl/curl" ,
1412+ "flink" : "apache/flink" ,
1413+ "beam" : "apache/beam" ,
1414+ "camel" : "apache/camel" ,
1415+ "bazel" : "bazelbuild/bazel" ,
1416+ "servo" : "servo/servo" ,
1417+ "ansible" : "ansible/ansible" ,
1418+ "ghost" : "tryghost/ghost" ,
1419+ "typescript" : "microsoft/typescript" ,
1420+ "tensorflow" : "tensorflow/tensorflow" ,
1421+ "etcd" : "etcd-io/etcd" ,
1422+ "etcd-io-etcd" : "etcd-io/etcd" ,
1423+ "cockroach" : "cockroachdb/cockroach" ,
1424+ "roslyn" : "dotnet/roslyn" ,
1425+ "aspnetcore" : "dotnet/aspnetcore" ,
1426+ "cal.com" : "calcom/cal.com" ,
1427+ "tidb" : "pingcap/tidb" ,
1428+ "godot" : "godotengine/godot" ,
1429+ "ceph" : "ceph/ceph" ,
1430+ "scikit-learn" : "scikit-learn/scikit-learn" ,
1431+ "scipy" : "scipy/scipy" ,
1432+ "tensorrt-llm" : "nvidia/tensorrt-llm" ,
1433+ "clickhouse" : "clickhouse/clickhouse" ,
1434+ "elasticsearch" : "elastic/elasticsearch" ,
1435+ "nodebb" : "nodebb/nodebb" ,
1436+ "grpc" : "grpc/grpc" ,
1437+ "grpc-go" : "grpc/grpc-go" ,
1438+ "openlibrary" : "internetarchive/openlibrary" ,
1439+ "linux" : "torvalds/linux" ,
1440+ "gcc" : "gcc-mirror/gcc" ,
1441+ "navidrome" : "navidrome/navidrome" ,
1442+ "argo-cd" : "argoproj/argo-cd" ,
1443+ }
1444+
1445+
1446+ def _normalize_repo_name (raw : str ) -> str :
1447+ """Normalize repo name to lowercase org/repo form for comparison.
1448+
1449+ Handles: sg-evals/kubernetes--v1.32.0, org/repo, kubernetes, pytorch/pytorch
1450+ """
1451+ name = raw .strip ().lower ()
1452+ if not name or name == "org/repo" :
1453+ return "" # Placeholder — can't normalize
1454+ # Strip sg-evals/ prefix
1455+ if name .startswith ("sg-evals/" ):
1456+ name = name [len ("sg-evals/" ):]
1457+ # Strip version suffix: kubernetes--v1.32.0 → kubernetes
1458+ name = re .sub (r"--[a-z0-9._]+$" , "" , name )
1459+ # Strip .git suffix
1460+ name = name .removesuffix (".git" )
1461+ # If it's already org/repo form, return as-is
1462+ if "/" in name :
1463+ return name
1464+ # Look up alias
1465+ if name in _REPO_ALIASES :
1466+ return _REPO_ALIASES [name ]
1467+ # Fallback: use name as both org and repo (e.g., "flipt" → "flipt")
1468+ return name
1469+
1470+
13811471def check_t7_metadata_sync (tasks : list [Path ]) -> CriterionResult :
13821472 """T.7: task.toml metadata matches selected_benchmark_tasks.json."""
13831473 if not SELECTED_TASKS_PATH .is_file ():
@@ -1425,14 +1515,21 @@ def check_t7_metadata_sync(tasks: list[Path]) -> CriterionResult:
14251515 field_map = [
14261516 ("metadata.language" , "language" ),
14271517 ("metadata.difficulty" , "difficulty" ),
1428- ("task.repo" , "repo" ),
14291518 ]
14301519 for toml_key , json_key in field_map :
14311520 toml_val = toml .get (toml_key , "" ).lower ()
14321521 json_val = str (entry .get (json_key , "" )).lower ()
14331522 if toml_val and json_val and toml_val != json_val :
14341523 task_mismatches .append (f"{ json_key } : toml={ toml_val !r} vs json={ json_val !r} " )
14351524
1525+ # Compare repo with normalization (sg-evals/ prefix, short names, etc.)
1526+ toml_repo = _normalize_repo_name (toml .get ("task.repo" , "" ))
1527+ json_repo = _normalize_repo_name (str (entry .get ("repo" , "" )))
1528+ if toml_repo and json_repo and toml_repo != json_repo :
1529+ task_mismatches .append (
1530+ f"repo: toml={ toml .get ('task.repo' , '' )!r} vs json={ entry .get ('repo' , '' )!r} "
1531+ )
1532+
14361533 if task_mismatches :
14371534 mismatches .append (f"{ task_name } : { '; ' .join (task_mismatches )} " )
14381535
@@ -1532,14 +1629,10 @@ def audit_suite(suite: str, dimension: Optional[Dimension] = None, *, online: bo
15321629 ))
15331630 continue
15341631
1535- # R.2 doesn't apply to MCP-unique suites: instructions intentionally
1536- # reference Sourcegraph MCP tools (that's the point of these tasks).
1537- if cid == "R.2" and suite .startswith (("csb_org_" , "ccb_mcp_" )):
1538- report .results .append (CriterionResult (
1539- criterion_id = cid , status = Status .SKIP ,
1540- evidence = "MCP-unique suite: MCP tool references in instructions are by design" ,
1541- ))
1542- continue
1632+ # Note: R.2 contamination check applies to ALL suites including csb_org_.
1633+ # Org suites are organizational use cases (cross-repo, compliance, etc.)
1634+ # but their instruction.md files must be tool-neutral — no MCP references.
1635+ # Only instruction_mcp.md (the MCP variant) may reference Sourcegraph tools.
15431636
15441637 # Run automated check
15451638 if cid in TASK_CHECKS :
0 commit comments