Skip to content

Commit c378eb7

Browse files
committed
fixup! feat(core.utils): retry read on Windows sharing PermissionError; ensure threads exit before cache close
1 parent 0625eb8 commit c378eb7

File tree

2 files changed

+41
-4
lines changed

2 files changed

+41
-4
lines changed

cuda_core/cuda/core/utils/_program_cache.py

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1100,6 +1100,34 @@ def _replace_with_sharing_retry(tmp_path: Path, target: Path) -> bool:
11001100
return False
11011101

11021102

1103+
def _stat_and_read_with_sharing_retry(path: Path) -> tuple[os.stat_result, bytes]:
    """Return ``(stat, contents)`` for ``path``, retrying Windows sharing violations.

    A reader can race a rewriter's ``os.replace``; on Windows the target may
    briefly refuse access while the rename completes, surfacing as a
    ``PermissionError`` whose ``winerror`` is listed in
    ``_SHARING_VIOLATION_WINERRORS``. One attempt is made per entry of
    ``_REPLACE_RETRY_DELAYS`` (the schedule shared with
    ``_replace_with_sharing_retry``), sleeping for each non-zero entry before
    the attempt, so short-lived contention is not reported as a read failure.

    Args:
        path: File to stat and then read in full.

    Returns:
        The ``os.stat_result`` captured immediately before the read, and the
        file's bytes.

    Raises:
        FileNotFoundError: If the file is missing, or if every attempt hit a
            Windows sharing violation (the last such error is chained as the
            cause).
        PermissionError: For any ``PermissionError`` raised off Windows, or
            one whose ``winerror`` is not a recognised sharing violation.
    """
    sharing_error: PermissionError | None = None
    for pause in _REPLACE_RETRY_DELAYS:
        if pause:
            time.sleep(pause)
        try:
            # Stat first: callers compare this snapshot later to detect a
            # concurrent replace that happened during the read.
            snapshot = path.stat()
            payload = path.read_bytes()
        except PermissionError as err:
            is_transient = (
                _IS_WINDOWS
                and getattr(err, "winerror", None) in _SHARING_VIOLATION_WINERRORS
            )
            if not is_transient:
                raise
            sharing_error = err
        else:
            return snapshot, payload
    # Retry budget exhausted: report the entry as absent, keeping the last
    # sharing violation attached as the cause.
    raise FileNotFoundError(path) from sharing_error
1129+
1130+
11031131
def _prune_if_stat_unchanged(path: Path, st_before: os.stat_result) -> None:
11041132
"""Unlink ``path`` iff its stat still matches ``st_before``.
11051133
@@ -1236,8 +1264,10 @@ def __getitem__(self, key: object) -> ObjectCode:
12361264
# Snapshot stat *before* read so we can detect a concurrent
12371265
# os.replace during the read/parse window; a stale stat means
12381266
# another writer wrote a fresh file that must not be pruned.
1239-
st_before = path.stat()
1240-
data = path.read_bytes()
1267+
# The helper retries on Windows transient sharing-violation
1268+
# PermissionErrors so a racing rewriter does not turn a hit
1269+
# into a spurious propagated error.
1270+
st_before, data = _stat_and_read_with_sharing_retry(path)
12411271
except FileNotFoundError:
12421272
raise KeyError(key) from None
12431273
k = _as_key_bytes(key)
@@ -1309,8 +1339,7 @@ def __len__(self) -> int:
13091339
count = 0
13101340
for path in list(self._iter_entry_paths()):
13111341
try:
1312-
st_before = path.stat()
1313-
data = path.read_bytes()
1342+
st_before, data = _stat_and_read_with_sharing_retry(path)
13141343
except FileNotFoundError:
13151344
continue
13161345
try:

cuda_core/tests/test_program_cache.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1310,6 +1310,14 @@ def reader(thread_id: int):
13101310
threads += [threading.Thread(target=reader, args=(i,)) for i in range(4)]
13111311
for t in threads:
13121312
t.start()
1313+
for t in threads:
1314+
t.join(timeout=30)
1315+
# On slow runners (Windows CI in particular) the loop above can return
1316+
# before every worker has finished its 200 iterations. Signal stop and
1317+
# join again so the ``with`` block does NOT tear down the cache while
1318+
# workers are still mid-operation -- that would surface as spurious
1319+
# "SQLiteProgramCache is closed" errors that mask the real assertion.
1320+
stop.set()
13131321
for t in threads:
13141322
t.join(timeout=30)
13151323
assert not any(t.is_alive() for t in threads)

0 commit comments

Comments
 (0)