Skip to content

Commit 7533b87

Browse files
committed
fix(core.utils): narrow PermissionError swallowing to sharing violations
Roborev review #1735 flagged two issues with the unlink-retry commit: 1. **HIGH**: schema-mismatch wipe in __init__ suppressed every PermissionError then advanced the schema marker anyway. If a Windows-locked entry survived the wipe, future opens would skip the wipe and lookups could return bytes from the incompatible old format. 2. **MEDIUM**: _enforce_size_cap and _prune_if_stat_unchanged suppressed every PermissionError, hiding real configuration failures (POSIX ACL issues, Windows non-sharing winerrors). Fix: add ``_is_windows_sharing_violation(exc)`` so best-effort callers can filter the exhausted-Windows-sharing case while letting other PermissionError instances propagate. Apply at: * ``_prune_if_stat_unchanged``: catch + filter; non-sharing PE re-raises. * Schema-mismatch wipe: track ``wipe_complete``; only advance the schema marker if every old entry was successfully removed. * ``_enforce_size_cap``: catch + filter. Tests cover the new propagation paths (POSIX PermissionError surfaces, not silently swallowed) and the schema-marker preservation when an old entry is locked.
1 parent 1935497 commit 7533b87

2 files changed

Lines changed: 111 additions & 7 deletions

File tree

cuda_core/cuda/core/utils/_program_cache.py

Lines changed: 51 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1194,6 +1194,23 @@ def _touch_atime(path: Path, st_before: os.stat_result) -> None:
11941194
os.utime(path, ns=(new_atime_ns, st_before.st_mtime_ns))
11951195

11961196

1197+
def _is_windows_sharing_violation(exc: BaseException) -> bool:
1198+
"""Return True if ``exc`` is a Windows sharing/lock violation that
1199+
:func:`_unlink_with_sharing_retry` would have retried.
1200+
1201+
Used by best-effort callers to filter out the exhausted-retry case
1202+
while letting other ``PermissionError`` instances (POSIX ACL
1203+
issues, Windows non-sharing winerrors) propagate -- those are real
1204+
configuration problems, not transient contention.
1205+
"""
1206+
if not _IS_WINDOWS:
1207+
return False
1208+
if not isinstance(exc, PermissionError):
1209+
return False
1210+
winerror = getattr(exc, "winerror", None)
1211+
return winerror in _SHARING_VIOLATION_WINERRORS or exc.errno == errno.EACCES
1212+
1213+
11971214
def _unlink_with_sharing_retry(path: Path) -> None:
11981215
"""Unlink with Windows-specific retry on sharing/lock violations.
11991216
@@ -1208,7 +1225,9 @@ def _unlink_with_sharing_retry(path: Path) -> None:
12081225
Raises ``FileNotFoundError`` if the file is absent; the last
12091226
``PermissionError`` if the Windows retry budget is exhausted; and
12101227
propagates any non-sharing ``PermissionError`` (or any non-Windows
1211-
``PermissionError``) immediately.
1228+
``PermissionError``) immediately. Best-effort callers should use
1229+
:func:`_is_windows_sharing_violation` to filter the exhausted-retry
1230+
case and re-raise any other ``PermissionError``.
12121231
"""
12131232
last_exc: PermissionError | None = None
12141233
for delay in _REPLACE_RETRY_DELAYS:
@@ -1254,8 +1273,16 @@ def _prune_if_stat_unchanged(path: Path, st_before: os.stat_result) -> None:
12541273
key_now = (st_now.st_ino, st_now.st_size, st_now.st_mtime_ns)
12551274
if key_before != key_now:
12561275
return
1257-
with contextlib.suppress(FileNotFoundError, PermissionError):
1276+
try:
12581277
_unlink_with_sharing_retry(path)
1278+
except FileNotFoundError:
1279+
pass
1280+
except PermissionError as exc:
1281+
# Swallow only the exhausted-Windows-sharing case. POSIX ACL
1282+
# errors and Windows non-sharing winerrors are real configuration
1283+
# problems and must surface, not be silently lost during a prune.
1284+
if not _is_windows_sharing_violation(exc):
1285+
raise
12591286

12601287

12611288
class FileStreamProgramCache(ProgramCacheResource):
@@ -1348,11 +1375,23 @@ def __init__(
13481375
# Schema mismatch: wipe incompatible entries. Losing cache
13491376
# contents is safe; returning bytes from an old format is not.
13501377
# Tolerate Windows sharing violations -- another process may
1351-
# be reading an old entry; the next open will retry the wipe.
1378+
# be reading an old entry; in that case leave the OLD schema
1379+
# marker so the next open re-runs the wipe. Advancing the
1380+
# marker before every old entry is gone would cause future
1381+
# opens to skip the wipe and let lookups return bytes from
1382+
# the incompatible format.
1383+
wipe_complete = True
13521384
for entry in list(self._iter_entry_paths()):
1353-
with contextlib.suppress(FileNotFoundError, PermissionError):
1385+
try:
13541386
_unlink_with_sharing_retry(entry)
1355-
self._schema_path.write_text(expected)
1387+
except FileNotFoundError:
1388+
pass
1389+
except PermissionError as exc:
1390+
if not _is_windows_sharing_violation(exc):
1391+
raise
1392+
wipe_complete = False
1393+
if wipe_complete:
1394+
self._schema_path.write_text(expected)
13561395
# Opportunistic startup sweep of orphaned temp files left by any
13571396
# crashed writers. Age-based so concurrent in-flight writes from
13581397
# other processes are preserved.
@@ -1559,9 +1598,14 @@ def _enforce_size_cap(self) -> None:
15591598
# Tolerate Windows sharing violations during eviction: another
15601599
# process may briefly hold the file open for a read. Skip this
15611600
# entry; a later eviction pass will retry. Same outcome as if
1562-
# the stat-guard above had triggered.
1601+
# the stat-guard above had triggered. Other PermissionErrors
1602+
# (POSIX ACL, Windows non-sharing winerrors) are real config
1603+
# problems -- surface them rather than silently exceed the cap.
15631604
try:
15641605
_unlink_with_sharing_retry(path)
15651606
total -= size
1566-
except (FileNotFoundError, PermissionError):
1607+
except FileNotFoundError:
15671608
pass
1609+
except PermissionError as exc:
1610+
if not _is_windows_sharing_violation(exc):
1611+
raise

cuda_core/tests/test_program_cache.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -814,6 +814,66 @@ def _always_locked(self, *args, **kwargs):
814814
cache[b"c"] = b"c" * 100
815815

816816

817+
def test_filestream_cache_eviction_propagates_non_sharing_permission_error(tmp_path, monkeypatch):
818+
"""Eviction must NOT silently swallow PermissionErrors that aren't
819+
Windows sharing violations -- those (POSIX ACL, Windows non-sharing
820+
winerrors) are real config issues and would otherwise let the cache
821+
sit over its cap with no signal to the caller."""
822+
from pathlib import Path
823+
824+
from cuda.core.utils import FileStreamProgramCache, _program_cache
825+
826+
monkeypatch.setattr(_program_cache, "_IS_WINDOWS", False)
827+
828+
with FileStreamProgramCache(tmp_path / "fc", max_size_bytes=200) as cache:
829+
cache[b"a"] = b"a" * 100
830+
cache[b"b"] = b"b" * 100
831+
832+
def _denied(self, *args, **kwargs):
833+
raise PermissionError("acl denied")
834+
835+
monkeypatch.setattr(Path, "unlink", _denied)
836+
with pytest.raises(PermissionError, match="acl denied"):
837+
cache[b"c"] = b"c" * 100
838+
839+
840+
def test_filestream_cache_schema_mismatch_keeps_old_marker_when_locked(tmp_path, monkeypatch):
841+
"""If a schema-mismatch wipe fails to remove an entry due to a Windows
842+
sharing violation, the old schema marker must NOT be advanced --
843+
otherwise a future open skips the wipe and a future lookup may return
844+
bytes from the incompatible old format."""
845+
from pathlib import Path
846+
847+
from cuda.core.utils import FileStreamProgramCache, _program_cache
848+
849+
root = tmp_path / "fc"
850+
# Seed a cache with one entry under the current schema.
851+
with FileStreamProgramCache(root) as cache:
852+
cache[b"k"] = b"v"
853+
854+
# Pretend the on-disk schema is from an old version.
855+
schema_path = root / "SCHEMA_VERSION"
856+
schema_path.write_text("0.0")
857+
expected_marker = schema_path.read_text()
858+
assert expected_marker == "0.0"
859+
860+
# Make every unlink raise a Windows sharing violation. Re-opening should
861+
# attempt the wipe, fail to remove the locked entry, and LEAVE the old
862+
# schema marker so a later open re-runs the wipe.
863+
monkeypatch.setattr(_program_cache, "_IS_WINDOWS", True)
864+
865+
def _always_locked(self, *args, **kwargs):
866+
exc = PermissionError("sharing violation")
867+
exc.winerror = 32
868+
raise exc
869+
870+
monkeypatch.setattr(Path, "unlink", _always_locked)
871+
872+
# The wipe is best-effort but must not silently advance the marker.
873+
FileStreamProgramCache(root)
874+
assert schema_path.read_text() == "0.0", "schema marker advanced despite locked entries surviving the wipe"
875+
876+
817877
@pytest.mark.parametrize(
818878
"option_kw",
819879
[

0 commit comments

Comments
 (0)