Skip to content

Commit 1935497

Browse files
committed
fix(core.utils): retry path.unlink on Windows sharing violations
CI on win-64 hit ``PermissionError: [WinError 5] Access is denied`` in ``test_concurrent_eviction_does_not_delete_replaced_file``: an evictor trying to unlink an entry that another process briefly held open for a read crashed the writer that triggered eviction. Python's ``open(p, "rb")`` does not pass ``FILE_SHARE_DELETE``, so a reader from another process briefly blocks our unlink. Mirror the existing ``_replace_with_sharing_retry`` and ``_stat_and_read_with_sharing_retry`` helpers with ``_unlink_with_sharing_retry``: same backoff budget, same winerror set (5/32/33 + EACCES), same non-Windows passthrough. Apply at every entry-file unlink site: * ``_prune_if_stat_unchanged`` -- best-effort, catches both FileNotFoundError and the post-retry PermissionError. * ``__init__`` schema-mismatch wipe -- same, with a comment that the next open will retry. * ``__delitem__`` -- retries; FileNotFoundError still maps to KeyError; exhausted-budget PermissionError propagates so explicit deletes fail loudly. * ``_enforce_size_cap`` -- retries; on exhausted budget the entry is skipped (a later eviction pass will retry) instead of crashing the writer that triggered eviction. Tests cover the retry-then-succeed path, the non-Windows passthrough, and the eviction skip-on-locked path.
1 parent 2bf4dad commit 1935497

2 files changed

Lines changed: 133 additions & 7 deletions

File tree

cuda_core/cuda/core/utils/_program_cache.py

Lines changed: 55 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1194,6 +1194,42 @@ def _touch_atime(path: Path, st_before: os.stat_result) -> None:
11941194
os.utime(path, ns=(new_atime_ns, st_before.st_mtime_ns))
11951195

11961196

1197+
def _unlink_with_sharing_retry(path: Path) -> None:
1198+
"""Unlink with Windows-specific retry on sharing/lock violations.
1199+
1200+
On Windows, ``Path.unlink`` raises ``PermissionError`` (winerror 5,
1201+
32, or 33; sometimes bare ``EACCES``) when another process holds
1202+
the file open without ``FILE_SHARE_DELETE``. Python's default
1203+
``open(p, "rb")`` does not pass that flag, so a reader from another
1204+
process briefly blocks our unlink while it reads. Retry with the
1205+
same backoff budget as :func:`_replace_with_sharing_retry` so
1206+
transient contention is not turned into a propagated error.
1207+
1208+
Raises ``FileNotFoundError`` if the file is absent; the last
1209+
``PermissionError`` if the Windows retry budget is exhausted; and
1210+
propagates any non-sharing ``PermissionError`` (or any non-Windows
1211+
``PermissionError``) immediately.
1212+
"""
1213+
last_exc: PermissionError | None = None
1214+
for delay in _REPLACE_RETRY_DELAYS:
1215+
if delay:
1216+
time.sleep(delay)
1217+
try:
1218+
path.unlink()
1219+
return
1220+
except FileNotFoundError:
1221+
raise
1222+
except PermissionError as exc:
1223+
if not _IS_WINDOWS:
1224+
raise
1225+
winerror = getattr(exc, "winerror", None)
1226+
if winerror not in _SHARING_VIOLATION_WINERRORS and exc.errno != errno.EACCES:
1227+
raise
1228+
last_exc = exc
1229+
if last_exc is not None:
1230+
raise last_exc
1231+
1232+
11971233
def _prune_if_stat_unchanged(path: Path, st_before: os.stat_result) -> None:
11981234
"""Unlink ``path`` iff its stat still matches ``st_before``.
11991235
@@ -1205,6 +1241,10 @@ def _prune_if_stat_unchanged(path: Path, st_before: os.stat_result) -> None:
12051241
delete their work. The residual TOCTOU window between stat and
12061242
unlink is narrow; worst case, a very-recently-written entry is
12071243
removed and the next read recompiles.
1244+
1245+
Best-effort: a Windows sharing violation that survives the retry
1246+
budget leaves the file in place. The caller is in an eviction or
1247+
cleanup pass, so re-trying on the next pass is the right outcome.
12081248
"""
12091249
try:
12101250
st_now = path.stat()
@@ -1214,8 +1254,8 @@ def _prune_if_stat_unchanged(path: Path, st_before: os.stat_result) -> None:
12141254
key_now = (st_now.st_ino, st_now.st_size, st_now.st_mtime_ns)
12151255
if key_before != key_now:
12161256
return
1217-
with contextlib.suppress(FileNotFoundError):
1218-
path.unlink()
1257+
with contextlib.suppress(FileNotFoundError, PermissionError):
1258+
_unlink_with_sharing_retry(path)
12191259

12201260

12211261
class FileStreamProgramCache(ProgramCacheResource):
@@ -1307,9 +1347,11 @@ def __init__(
13071347
if existing != expected:
13081348
# Schema mismatch: wipe incompatible entries. Losing cache
13091349
# contents is safe; returning bytes from an old format is not.
1350+
# Tolerate Windows sharing violations -- another process may
1351+
# be reading an old entry; the next open will retry the wipe.
13101352
for entry in list(self._iter_entry_paths()):
1311-
with contextlib.suppress(FileNotFoundError):
1312-
entry.unlink()
1353+
with contextlib.suppress(FileNotFoundError, PermissionError):
1354+
_unlink_with_sharing_retry(entry)
13131355
self._schema_path.write_text(expected)
13141356
# Opportunistic startup sweep of orphaned temp files left by any
13151357
# crashed writers. Age-based so concurrent in-flight writes from
@@ -1390,7 +1432,7 @@ def __setitem__(self, key: object, value: bytes | bytearray | memoryview | Objec
13901432
def __delitem__(self, key: object) -> None:
13911433
path = self._path_for_key(key)
13921434
try:
1393-
path.unlink()
1435+
_unlink_with_sharing_retry(path)
13941436
except FileNotFoundError:
13951437
raise KeyError(key) from None
13961438

@@ -1514,6 +1556,12 @@ def _enforce_size_cap(self) -> None:
15141556
# below could declare us done while still over the limit.
15151557
total += stat_now.st_size - size
15161558
continue
1517-
with contextlib.suppress(FileNotFoundError):
1518-
path.unlink()
1559+
# Tolerate Windows sharing violations during eviction: another
1560+
# process may briefly hold the file open for a read. Skip this
1561+
# entry; a later eviction pass will retry. Same outcome as if
1562+
# the stat-guard above had triggered.
1563+
try:
1564+
_unlink_with_sharing_retry(path)
15191565
total -= size
1566+
except (FileNotFoundError, PermissionError):
1567+
pass

cuda_core/tests/test_program_cache.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -736,6 +736,84 @@ def _flaky_replace(src, dst):
736736
assert cache[b"k"] == b"v"
737737

738738

739+
def test_filestream_cache_unlink_retries_on_sharing_violation(tmp_path, monkeypatch):
740+
"""``Path.unlink`` retries on Windows sharing violations (winerror 5/32/33
741+
or bare EACCES). Without this, an evictor unlinking an entry that another
742+
process briefly holds open for a read crashes the writer that triggered
743+
eviction -- which is what the win-64 multiprocess CI was hitting."""
744+
from pathlib import Path
745+
746+
from cuda.core.utils import FileStreamProgramCache, _program_cache
747+
748+
monkeypatch.setattr(_program_cache, "_IS_WINDOWS", True)
749+
750+
real_unlink = Path.unlink
751+
calls = {"n": 0}
752+
753+
def _flaky_unlink(self, *args, **kwargs):
754+
calls["n"] += 1
755+
if calls["n"] < 3:
756+
exc = PermissionError("sharing violation")
757+
exc.winerror = 32
758+
raise exc
759+
return real_unlink(self, *args, **kwargs)
760+
761+
with FileStreamProgramCache(tmp_path / "fc") as cache:
762+
cache[b"k"] = _fake_object_code(b"v")
763+
monkeypatch.setattr(Path, "unlink", _flaky_unlink)
764+
del cache[b"k"] # succeeds on third attempt
765+
assert calls["n"] == 3
766+
assert b"k" not in cache
767+
768+
769+
def test_filestream_cache_unlink_propagates_non_windows_permission_error(tmp_path, monkeypatch):
770+
"""On non-Windows, PermissionError on unlink is a real config error and
771+
must surface, never silently retried."""
772+
from pathlib import Path
773+
774+
from cuda.core.utils import FileStreamProgramCache, _program_cache
775+
776+
monkeypatch.setattr(_program_cache, "_IS_WINDOWS", False)
777+
778+
with FileStreamProgramCache(tmp_path / "fc") as cache:
779+
cache[b"k"] = _fake_object_code(b"v")
780+
781+
def _denied(self, *args, **kwargs):
782+
raise PermissionError("denied")
783+
784+
monkeypatch.setattr(Path, "unlink", _denied)
785+
with pytest.raises(PermissionError, match="denied"):
786+
del cache[b"k"]
787+
788+
789+
def test_filestream_cache_unlink_skips_locked_entries_in_eviction(tmp_path, monkeypatch):
790+
"""Eviction tolerates a Windows sharing violation: a locked entry is
791+
skipped (a later eviction pass will retry), and the eviction does not
792+
propagate the error to the writer that triggered it."""
793+
from pathlib import Path
794+
795+
from cuda.core.utils import FileStreamProgramCache, _program_cache
796+
797+
monkeypatch.setattr(_program_cache, "_IS_WINDOWS", True)
798+
799+
# Tiny cap so every write triggers eviction.
800+
with FileStreamProgramCache(tmp_path / "fc", max_size_bytes=200) as cache:
801+
cache[b"a"] = b"a" * 100
802+
cache[b"b"] = b"b" * 100 # both still fit
803+
804+
# Now make ALL unlink attempts fail with a sharing violation. The
805+
# next write triggers eviction; eviction must swallow the error
806+
# rather than crash the writer.
807+
def _always_locked(self, *args, **kwargs):
808+
exc = PermissionError("sharing violation")
809+
exc.winerror = 32
810+
raise exc
811+
812+
monkeypatch.setattr(Path, "unlink", _always_locked)
813+
# Should not raise.
814+
cache[b"c"] = b"c" * 100
815+
816+
739817
@pytest.mark.parametrize(
740818
"option_kw",
741819
[

0 commit comments

Comments
 (0)