fixup! feat(core.utils): NVVM fingerprint, Windows replace retry, driver-linker validation

cpcloud · cpcloud · commit 55f4d47a8a0f · 2026-04-18T08:08:17.000-04:00
diff --git a/cuda_core/cuda/core/utils/_program_cache.py b/cuda_core/cuda/core/utils/_program_cache.py
@@ -219,6 +219,11 @@ def _linker_option_fingerprint(options: ProgramOptions) -> list[bytes]:
     return [f"{name}={getattr(options, name, None)!r}".encode() for name in _LINKER_RELEVANT_FIELDS]
 
 
+# LinkerOptions fields that the cuLink (driver) backend rejects outright
+# (_linker.pyx _prepare_driver_options). nvJitLink accepts all of them.
+_DRIVER_LINKER_UNSUPPORTED_FIELDS = ("time", "ptxas_options", "split_compile", "split_compile_extended")
+
+
 def _driver_version() -> int:
     return int(_handle_return(_driver.cuDriverGetVersion()))
 
@@ -247,6 +252,22 @@ def _linker_backend_and_version() -> tuple[str, str]:
     return ("nvJitLink", str(nvjitlink.version()))
 
 
+def _nvvm_fingerprint() -> str:
+    """Return a stable identifier for the loaded NVVM toolchain.
+
+    NVVM has no direct version query (nvbugs 5312315), so the IR version
+    numbers reported by ``ir_version()`` are used instead -- enough to keep
+    caches written before and after a toolchain upgrade apart. Together
+    with the driver and cuda-core versions already in the digest, this is a
+    practical stand-in for a true libNVVM version.
+    """
+    from cuda.core._program import _get_nvvm_module
+
+    # Four-way unpack: a result of any other length raises here.
+    ir_major, ir_minor, dbg_major, dbg_minor = _get_nvvm_module().ir_version()
+    return f"ir={ir_major}.{ir_minor}.{dbg_major}.{dbg_minor}"
+
+
 def _cuda_core_version() -> str:
     from cuda.core._version import __version__
 
@@ -389,6 +410,28 @@ def make_program_cache_key(
                 f"compile will read and pass it as extra_digest=..."
             )
 
+    # PTX compiles go through Linker. When the driver (cuLink) backend is
+    # selected (nvJitLink unavailable), Program.compile rejects a subset of
+    # options that nvJitLink would accept; reject them here too so we never
+    # store a key for a compilation that can't succeed in this environment.
+    # If the probe fails we can't tell which backend will run, so skip -- the
+    # failed-probe branch below already taints the key.
+    if backend == "linker":
+        try:
+            from cuda.core._linker import _decide_nvjitlink_or_driver
+
+            use_driver_linker = _decide_nvjitlink_or_driver()
+        except Exception:
+            use_driver_linker = None
+        if use_driver_linker is True:
+            unsupported = [name for name in _DRIVER_LINKER_UNSUPPORTED_FIELDS if _option_is_set(options, name)]
+            if unsupported:
+                raise ValueError(
+                    f"the cuLink driver linker does not support these options: "
+                    f"{', '.join(unsupported)}; Program.compile() would reject this "
+                    f"configuration before producing an ObjectCode."
+                )
+
     if isinstance(code, str):
         code_bytes = code.encode("utf-8")
     elif isinstance(code, (bytes, bytearray)):
@@ -431,17 +474,22 @@ def _update(label: str, payload: bytes) -> None:
         hasher.update(payload)
 
     def _probe(label: str, fn):
-        """Run an environment probe; on failure, hash the exception's class
-        and message under a ``*_probe_failed`` label. That label differs
-        from the success label (``driver``/``nvrtc``/...), so a broken env
-        never collides with a working one; and because the digest is
-        derived from the *stable* exception signature -- not a random
-        per-process marker -- two processes with the same failure produce
-        the same key and can reuse on-disk cache entries."""
+        """Run an environment probe; on failure, hash the exception's
+        CLASS NAME (not its message) under a ``*_probe_failed`` label.
+
+        Using only the class name keeps the digest stable across repeated
+        calls within one process (e.g. NVVM's loader reports different
+        messages on first vs. cached-failure attempts) AND across processes
+        that hit the same failure mode. The ``_probe_failed`` label differs
+        from the success labels (``driver``/``nvrtc``/...), so a broken env
+        never collides with a working one -- the cache "fails closed"
+        between broken and working environments while staying persistent
+        within either.
+        """
         try:
             return fn()
         except Exception as exc:
-            _update(f"{label}_probe_failed", f"{type(exc).__name__}:{exc}".encode())
+            _update(f"{label}_probe_failed", type(exc).__name__.encode())
             return None
 
     _update("schema", str(_KEY_SCHEMA_VERSION).encode("ascii"))
@@ -463,8 +511,9 @@ def _probe(label: str, fn):
             _update("linker_backend", lb_name.encode("ascii"))
             _update("linker_version", lb_version.encode("ascii"))
     else:
-        # NVVM lacks a direct version API; proxy via driver + cuda-core above.
-        _update("nvvm", b"proxied-by-driver-and-cuda-core-version")
+        nvvm_fp = _probe("nvvm", _nvvm_fingerprint)
+        if nvvm_fp is not None:
+            _update("nvvm", nvvm_fp.encode("ascii"))
     _update("code_type", code_type.encode("ascii"))
     _update("target_type", target_type.encode("ascii"))
     _update("code", code_bytes)
@@ -796,6 +845,34 @@ def _enforce_size_cap(self) -> None:
 _SCHEMA_FILE = "SCHEMA_VERSION"
 
 
+_SHARING_VIOLATION_WINERRORS = (32, 33)  # ERROR_SHARING_VIOLATION, ERROR_LOCK_VIOLATION
+_REPLACE_RETRY_DELAYS = (0.0, 0.005, 0.010, 0.020, 0.050, 0.100)  # ~185ms budget
+
+
+def _replace_with_sharing_retry(tmp_path: Path, target: Path) -> bool:
+    """Rename ``tmp_path`` onto ``target``, retrying Windows sharing violations.
+
+    Returns True as soon as ``os.replace`` succeeds, and False once every
+    delay in ``_REPLACE_RETRY_DELAYS`` has been used up by Windows
+    sharing/lock violations -- the caller then treats the cache write as
+    dropped. Every other ``PermissionError`` (a winerror outside
+    ``_SHARING_VIOLATION_WINERRORS``, or any failure off Windows) propagates.
+    """
+    for pause in _REPLACE_RETRY_DELAYS:
+        if pause:
+            time.sleep(pause)
+        try:
+            os.replace(tmp_path, target)
+        except PermissionError as err:
+            retryable = _IS_WINDOWS and getattr(err, "winerror", None) in _SHARING_VIOLATION_WINERRORS
+            if not retryable:
+                raise
+            # Sharing/lock violation: go round again with the next delay.
+        else:
+            return True
+    return False
+
+
 def _prune_if_stat_unchanged(path: Path, st_before: os.stat_result) -> None:
     """Unlink ``path`` iff its stat still matches ``st_before``.
 
@@ -938,24 +1015,15 @@ def __setitem__(self, key: object, value: object) -> None:
                 fh.write(record)
                 fh.flush()
                 os.fsync(fh.fileno())
-            # Narrow PermissionError suppression to os.replace only. Earlier
-            # failures (mkdir / mkstemp / fdopen / write / fsync) indicate a
-            # real configuration problem and must propagate.
-            try:
-                os.replace(tmp_path, target)
-            except PermissionError as exc:
+            # Retry os.replace under Windows sharing/lock violations; only
+            # give up (and drop the cache write) after a bounded backoff, so
+            # transient contention is not turned into a silent miss.
+            # Non-sharing PermissionErrors and all POSIX PermissionErrors
+            # propagate immediately (real config problem).
+            if not _replace_with_sharing_retry(tmp_path, target):
                 with contextlib.suppress(FileNotFoundError):
                     tmp_path.unlink()
-                # Windows raises PermissionError from os.replace specifically
-                # when the target is held open by another process (winerror
-                # 32 = ERROR_SHARING_VIOLATION, 33 = ERROR_LOCK_VIOLATION);
-                # swallow those as a cache miss. Any other winerror (ACL
-                # issues, read-only dir, etc.) is a real config problem and
-                # must propagate. POSIX has no such sharing-violation case
-                # and always propagates.
-                if _IS_WINDOWS and getattr(exc, "winerror", None) in (32, 33):
-                    return
-                raise
+                return
         except BaseException:
             with contextlib.suppress(FileNotFoundError):
                 tmp_path.unlink()
diff --git a/cuda_core/tests/test_program_cache.py b/cuda_core/tests/test_program_cache.py
@@ -340,6 +340,75 @@ def test_make_program_cache_key_ignores_name_expressions_for_non_nvrtc(code_type
     assert k_none == k_with
 
 
+def test_make_program_cache_key_nvvm_probe_changes_key(monkeypatch):
+    """Two different NVVM fingerprints must yield two different keys, so
+    entries cached before a libNVVM upgrade are not silently reused after it."""
+    from cuda.core.utils import _program_cache
+
+    keys = []
+    for fingerprint in ("ir=1.8.3.0", "ir=2.0.3.0"):
+        monkeypatch.setattr(_program_cache, "_nvvm_fingerprint", lambda fp=fingerprint: fp)
+        keys.append(_make_key(code="abc", code_type="nvvm", target_type="ptx"))
+    assert keys[0] != keys[1]
+
+
+@pytest.mark.parametrize(
+    "option_kw",
+    [
+        pytest.param({"time": True}, id="time"),
+        pytest.param({"ptxas_options": "-v"}, id="ptxas_options"),
+        pytest.param({"split_compile": 0}, id="split_compile"),
+    ],
+)
+def test_make_program_cache_key_ptx_rejects_driver_linker_unsupported(option_kw, monkeypatch):
+    """With the driver (cuLink) backend selected, building a key must raise
+    for each option listed here, so nothing is cached for a compile that would
+    fail. NOTE(review): ``split_compile_extended`` has no case -- confirm."""
+    from cuda.core import _linker
+
+    monkeypatch.setattr(_linker, "_decide_nvjitlink_or_driver", lambda: True)  # driver
+    with pytest.raises(ValueError, match="driver linker"):
+        _make_key(code=".version 7.0", code_type="ptx", options=_opts(**option_kw))
+
+
+def test_make_program_cache_key_ptx_accepts_driver_linker_unsupported_with_nvjitlink(monkeypatch):
+    """With nvJitLink selected, an option the driver linker rejects (only
+    ``time`` is exercised here) must not be rejected at key time."""
+    from cuda.core import _linker
+
+    monkeypatch.setattr(_linker, "_decide_nvjitlink_or_driver", lambda: False)  # nvJitLink
+    # Should not raise.
+    _make_key(code=".version 7.0", code_type="ptx", options=_opts(time=True))
+
+
+def test_filestream_cache_replace_retries_on_sharing_violation(tmp_path, monkeypatch):
+    """A sharing violation (winerror 32) that clears after two failed
+    ``os.replace`` attempts must still end in a stored, readable entry:
+    the write is retried rather than dropped."""
+    import os as _os
+
+    from cuda.core.utils import FileStreamProgramCache, _program_cache
+
+    monkeypatch.setattr(_program_cache, "_IS_WINDOWS", True)
+
+    original_replace = _os.replace
+    attempts = []
+
+    def _flaky_replace(src, dst):
+        attempts.append((src, dst))
+        if len(attempts) >= 3:
+            return original_replace(src, dst)
+        violation = PermissionError("sharing violation")
+        violation.winerror = 32
+        raise violation
+
+    with FileStreamProgramCache(tmp_path / "fc") as cache:
+        monkeypatch.setattr(_os, "replace", _flaky_replace)
+        cache[b"k"] = _fake_object_code(b"v")  # succeeds on third attempt
+        assert len(attempts) == 3
+        assert bytes(cache[b"k"].code) == b"v"
+
+
 @pytest.mark.parametrize(
     "option_kw",
     [