toolshed: enforce cuda_core SPDX license policy

rwgk · rwgk · commit 47eaed9e4c22 · 2026-04-12T20:45:56.000-07:00
Refactor the SPDX checker so package-specific license rules can be enforced cleanly while preserving the existing autofix flow. Keep focused regression coverage alongside the toolshed script instead of in routine package test collection.

Made-with: Cursor
diff --git a/toolshed/check_spdx.py b/toolshed/check_spdx.py
@@ -9,12 +9,14 @@
 
 import pathspec
 
-# Intentionally puzzling together EXPECTED_SPDX_BYTES so that we don't overlook
-# if the identifiers are missing in this file.
-EXPECTED_SPDX_BYTES = (
-    b"-".join((b"SPDX", b"License", b"Identifier: ")),
-    b"-".join((b"SPDX", b"FileCopyrightText: ")),
-)
+# Intentionally piecing the SPDX prefixes together from fragments so that a
+# missing identifier in this file itself cannot go unnoticed.
+SPDX_LICENSE_IDENTIFIER_PREFIX = b"-".join((b"SPDX", b"License", b"Identifier: "))
+SPDX_FILE_COPYRIGHT_TEXT_PREFIX = b"-".join((b"SPDX", b"FileCopyrightText: "))
+
+LICENSE_IDENTIFIER_REGEX = re.compile(re.escape(SPDX_LICENSE_IDENTIFIER_PREFIX) + rb"(?P<license_identifier>[^\r\n]+)")
+
+EXPECTED_LICENSE_IDENTIFIERS = (("cuda_core/", "Apache-2.0"),)
 
 SPDX_IGNORE_FILENAME = ".spdx-ignore"
 
@@ -47,51 +49,114 @@ def is_staged(filepath):
     return process.stdout.strip() != ""
 
 
+def normalize_repo_path(filepath):
+    normalized_path = filepath.replace("\\", "/")
+    while normalized_path.startswith("./"):
+        normalized_path = normalized_path[2:]
+    return normalized_path
+
+
+def get_expected_license_identifier(filepath):
+    normalized_path = normalize_repo_path(filepath)
+    for prefix, license_identifier in EXPECTED_LICENSE_IDENTIFIERS:
+        if normalized_path.startswith(prefix):
+            return license_identifier
+    return None
+
+
+def validate_required_spdx_field(filepath, blob, expected_bytes):
+    if expected_bytes in blob:
+        return True
+    print(f"MISSING {expected_bytes.decode()}{filepath!r}")
+    return False
+
+
+def extract_license_identifier(blob):
+    match = LICENSE_IDENTIFIER_REGEX.search(blob)
+    if match is None:
+        return None
+    try:
+        return match.group("license_identifier").decode("ascii")
+    except UnicodeDecodeError:
+        return None
+
+
+def validate_license_identifier(filepath, blob):
+    license_identifier = extract_license_identifier(blob)
+    if license_identifier is None:
+        print(f"MISSING valid SPDX license identifier in {filepath!r}")
+        return False
+
+    expected_license_identifier = get_expected_license_identifier(filepath)
+    if expected_license_identifier is None:
+        return True
+
+    if license_identifier != expected_license_identifier:
+        print(
+            f"INVALID SPDX license identifier {license_identifier!r} "
+            f"(expected {expected_license_identifier!r}) in {filepath!r}"
+        )
+        return False
+
+    return True
+
+
+def validate_or_fix_copyright(filepath, blob, fix):
+    match = re.search(COPYRIGHT_REGEX, blob)
+    if match is None:
+        print(f"MISSING valid copyright line in {filepath!r}")
+        return False, blob
+
+    years = match.group("years").decode()
+    if "-" in years:
+        start_year, end_year = years.split("-", 1)
+        if int(start_year) > int(end_year):
+            print(f"INVALID copyright years {years!r} in {filepath!r}")
+            return False, blob
+    else:
+        start_year = end_year = years
+
+    if not is_staged(filepath) or int(end_year) >= int(CURRENT_YEAR):
+        return True, blob
+
+    print(f"OUTDATED copyright {years!r} (expected {CURRENT_YEAR!r}) in {filepath!r}")
+    if not fix:
+        return False, blob
+
+    new_years = f"{start_year}-{CURRENT_YEAR}"
+    return (
+        False,
+        re.sub(
+            COPYRIGHT_REGEX,
+            COPYRIGHT_SUB.format(new_years).encode("ascii"),
+            blob,
+        ),
+    )
+
+
 def find_or_fix_spdx(filepath, fix):
     with open(filepath, "rb") as f:
         blob = f.read()
     if len(blob.strip()) == 0:
         return True
 
     good = True
-    for expected_bytes in EXPECTED_SPDX_BYTES:
-        if expected_bytes not in blob:
-            print(f"MISSING {expected_bytes.decode()}{filepath!r}")
-            good = False
-            continue
-
-        match = re.search(COPYRIGHT_REGEX, blob)
-        if match is None:
-            print(f"MISSING valid copyright line in {filepath!r}")
-            good = False
-            continue
+    has_license_identifier = validate_required_spdx_field(filepath, blob, SPDX_LICENSE_IDENTIFIER_PREFIX)
+    has_copyright = validate_required_spdx_field(filepath, blob, SPDX_FILE_COPYRIGHT_TEXT_PREFIX)
 
-        years = match.group("years").decode()
-        if "-" in years:
-            start_year, end_year = years.split("-", 1)
-            if int(start_year) > int(end_year):
-                print(f"INVALID copyright years {years!r} in {filepath!r}")
-                good = False
-                continue
-        else:
-            start_year = end_year = years
+    if not has_license_identifier or not validate_license_identifier(filepath, blob):
+        good = False
 
-        staged = is_staged(filepath)
-
-        if staged and int(end_year) < int(CURRENT_YEAR):
-            print(f"OUTDATED copyright {years!r} (expected {CURRENT_YEAR!r}) in {filepath!r}")
+    if not has_copyright:
+        good = False
+    else:
+        copyright_ok, updated_blob = validate_or_fix_copyright(filepath, blob, fix)
+        if updated_blob != blob:
+            with open(filepath, "wb") as f:
+                f.write(updated_blob)
+        if not copyright_ok:
             good = False
 
-            if fix:
-                new_years = f"{start_year}-{CURRENT_YEAR}"
-                blob = re.sub(
-                    COPYRIGHT_REGEX,
-                    COPYRIGHT_SUB.format(new_years).encode("ascii"),
-                    blob,
-                )
-                with open(filepath, "wb") as f:
-                    f.write(blob)
-
     return good
 
 
diff --git a/toolshed/test_check_spdx.py b/toolshed/test_check_spdx.py
@@ -0,0 +1,91 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import importlib.util
+import sys
+from pathlib import Path
+from types import ModuleType
+
+
+def _install_pathspec_stub():
+    if "pathspec" in sys.modules:
+        return
+
+    class _StubSpec:
+        def match_file(self, _filepath):
+            return False
+
+    class _StubPathSpec:
+        @staticmethod
+        def from_lines(_pattern_type, _lines):
+            return _StubSpec()
+
+    module = ModuleType("pathspec")
+    module.PathSpec = _StubPathSpec
+    sys.modules["pathspec"] = module
+
+
+def _load_check_spdx():
+    check_spdx_path = Path(__file__).resolve().with_name("check_spdx.py")
+    spec = importlib.util.spec_from_file_location("check_spdx", check_spdx_path)
+    assert spec is not None
+    assert spec.loader is not None
+    _install_pathspec_stub()
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+check_spdx = _load_check_spdx()
+
+
+def _write_spdx_file(root, relative_path, license_identifier, *, years="2025-2026"):
+    path = root / relative_path
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(
+        (
+            f"# SPDX-FileCopyrightText: Copyright (c) {years} NVIDIA CORPORATION & AFFILIATES. "
+            "All rights reserved.\n"
+            f"# SPDX-License-Identifier: {license_identifier}\n"
+            "\n"
+            "print('hello')\n"
+        ),
+        encoding="ascii",
+    )
+    return path
+
+
+def test_get_expected_license_identifier_normalizes_windows_paths():
+    assert check_spdx.get_expected_license_identifier(r".\cuda_core\example.py") == "Apache-2.0"
+
+
+def test_find_or_fix_spdx_rejects_non_apache_license_under_cuda_core(tmp_path, monkeypatch, capsys):
+    monkeypatch.chdir(tmp_path)
+    monkeypatch.setattr(check_spdx, "is_staged", lambda _: False)
+    _write_spdx_file(tmp_path, "cuda_core/example.py", "LicenseRef-NVIDIA-SOFTWARE-LICENSE")
+
+    assert not check_spdx.find_or_fix_spdx("cuda_core/example.py", fix=False)
+
+    assert "expected 'Apache-2.0'" in capsys.readouterr().out
+
+
+def test_find_or_fix_spdx_allows_non_apache_license_outside_cuda_core(tmp_path, monkeypatch):
+    monkeypatch.chdir(tmp_path)
+    monkeypatch.setattr(check_spdx, "is_staged", lambda _: False)
+    _write_spdx_file(tmp_path, "cuda_bindings/example.py", "LicenseRef-NVIDIA-SOFTWARE-LICENSE")
+
+    assert check_spdx.find_or_fix_spdx("cuda_bindings/example.py", fix=False)
+
+
+def test_find_or_fix_spdx_updates_outdated_copyright_when_fix_requested(tmp_path, monkeypatch, capsys):
+    monkeypatch.chdir(tmp_path)
+    monkeypatch.setattr(check_spdx, "CURRENT_YEAR", "2026")
+    monkeypatch.setattr(check_spdx, "is_staged", lambda _: True)
+    path = _write_spdx_file(tmp_path, "cuda_core/example.py", "Apache-2.0", years="2024")
+
+    assert not check_spdx.find_or_fix_spdx("cuda_core/example.py", fix=True)
+
+    assert "OUTDATED copyright '2024' (expected '2026')" in capsys.readouterr().out
+    assert "Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved." in path.read_text(
+        encoding="ascii"
+    )