Skip to content

Commit 75737ad

Browse files
committed
Thread-safe reads via reference-counted mmap cache
Multiple threads reading the same file now share a single read-only mmap instead of each opening their own. A module-level _MmapCache protected by a threading.Lock manages reference counts per file path. The mmap is closed when the last reader releases it. Read-only mmap slicing (which is what the strip/tile readers do) is thread-safe at the OS level -- no seek or file position involved. Tested with 16 concurrent threads reading different windows from the same deflate+tiled file, and a stress test of 400 reads across 8 threads. Zero errors, cache drains properly. For dask lazy reads, this means all chunk-read tasks for the same file share one mmap instead of opening/closing the file per chunk.
1 parent 9cf43ab commit 75737ad

File tree

1 file changed

+62
-12
lines changed

1 file changed

+62
-12
lines changed

xrspatial/geotiff/_reader.py

Lines changed: 62 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33

44
import math
55
import mmap
6+
import threading
67
import urllib.request
78

89
import numpy as np
@@ -23,18 +24,69 @@
2324
# Data source abstraction
2425
# ---------------------------------------------------------------------------
2526

27+
class _MmapCache:
28+
"""Thread-safe, reference-counted mmap cache.
29+
30+
Multiple threads reading the same file share a single read-only mmap.
31+
The mmap is closed when the last reference is released.
32+
mmap slicing on a read-only mapping is thread-safe (no seek involved).
33+
"""
34+
35+
def __init__(self):
36+
self._lock = threading.Lock()
37+
# path -> (fh, mm, refcount)
38+
self._entries: dict[str, tuple] = {}
39+
40+
def acquire(self, path: str):
41+
"""Get or create a read-only mmap for *path*. Returns (mm, size)."""
42+
import os
43+
real = os.path.realpath(path)
44+
with self._lock:
45+
if real in self._entries:
46+
fh, mm, size, rc = self._entries[real]
47+
self._entries[real] = (fh, mm, size, rc + 1)
48+
return mm, size
49+
50+
fh = open(real, 'rb')
51+
fh.seek(0, 2)
52+
size = fh.tell()
53+
fh.seek(0)
54+
if size > 0:
55+
mm = mmap.mmap(fh.fileno(), 0, access=mmap.ACCESS_READ)
56+
else:
57+
mm = None
58+
self._entries[real] = (fh, mm, size, 1)
59+
return mm, size
60+
61+
def release(self, path: str):
62+
"""Decrement the reference count; close the mmap when it hits zero."""
63+
import os
64+
real = os.path.realpath(path)
65+
with self._lock:
66+
entry = self._entries.get(real)
67+
if entry is None:
68+
return
69+
fh, mm, size, rc = entry
70+
rc -= 1
71+
if rc <= 0:
72+
del self._entries[real]
73+
if mm is not None:
74+
mm.close()
75+
fh.close()
76+
else:
77+
self._entries[real] = (fh, mm, size, rc)
78+
79+
80+
# Module-level cache shared across all reads
81+
_mmap_cache = _MmapCache()
82+
83+
2684
class _FileSource:
27-
"""Local file data source using mmap for zero-copy access."""
85+
"""Local file data source using a shared, thread-safe mmap cache."""
2886

2987
def __init__(self, path: str):
30-
self._fh = open(path, 'rb')
31-
self._fh.seek(0, 2)
32-
self._size = self._fh.tell()
33-
self._fh.seek(0)
34-
if self._size > 0:
35-
self._mm = mmap.mmap(self._fh.fileno(), 0, access=mmap.ACCESS_READ)
36-
else:
37-
self._mm = None
88+
self._path = path
89+
self._mm, self._size = _mmap_cache.acquire(path)
3890

3991
def read_range(self, start: int, length: int) -> bytes:
4092
if self._mm is not None:
@@ -52,9 +104,7 @@ def size(self) -> int:
52104
return self._size
53105

54106
def close(self):
55-
if self._mm is not None:
56-
self._mm.close()
57-
self._fh.close()
107+
_mmap_cache.release(self._path)
58108

59109

60110
class _HTTPSource:

0 commit comments

Comments
 (0)