Skip to content

Commit 8874812

Browse files
authored
feat: open_reader accepts optional size to skip HEAD request (#664)
* feat: open_reader accepts optional size to skip HEAD request
* feat(fsspec): BufferedFile forwards size to open_reader
* refactor(buffered): drop ObjectMeta from ReadableFile
1 parent 49f68bb commit 8874812

5 files changed

Lines changed: 114 additions & 83 deletions

File tree

obstore/python/obstore/_buffered.pyi

Lines changed: 5 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ from contextlib import AbstractAsyncContextManager, AbstractContextManager
33

44
from ._attributes import Attributes
55
from ._bytes import Bytes
6-
from ._list import ObjectMeta
76
from ._store import ObjectStore
87

98
if sys.version_info >= (3, 11):
@@ -16,16 +15,12 @@ if sys.version_info >= (3, 12):
1615
else:
1716
from typing_extensions import Buffer
1817

19-
if sys.version_info >= (3, 13):
20-
from warnings import deprecated
21-
else:
22-
from typing_extensions import deprecated
23-
2418
def open_reader(
2519
store: ObjectStore,
2620
path: str,
2721
*,
2822
buffer_size: int = 1024 * 1024,
23+
size: int | None = None,
2924
) -> ReadableFile:
3025
"""Open a readable file object from the specified location.
3126
@@ -35,6 +30,9 @@ def open_reader(
3530
3631
Keyword Args:
3732
buffer_size: The minimum number of bytes to read in a single request. Up to `buffer_size` bytes will be buffered in memory.
33+
size: Optional byte size of the object. When provided, skips the HEAD request used to fetch the file size. Useful for callers that already know the size from external metadata.
34+
35+
The caller is responsible for accuracy: a value larger than the actual object size surfaces as a range error at read time, while a smaller value silently truncates reads to the first `size` bytes. Defaults to `None`.
3836
3937
Returns:
4038
ReadableFile
@@ -46,6 +44,7 @@ async def open_reader_async(
4644
path: str,
4745
*,
4846
buffer_size: int = 1024 * 1024,
47+
size: int | None = None,
4948
) -> AsyncReadableFile:
5049
"""Call `open_reader` asynchronously, returning a readable file object with asynchronous operations.
5150
@@ -92,22 +91,6 @@ class ReadableFile:
9291
This is currently a no-op.
9392
"""
9493

95-
@property
96-
@deprecated(
97-
"`ReadableFile.meta` is deprecated and will be removed in a future release. "
98-
"Use the `head` or `head_async` methods directly if you need object metadata.",
99-
)
100-
def meta(self) -> ObjectMeta:
101-
"""Access the metadata of the underlying file.
102-
103-
!!! warning "Deprecated"
104-
105-
This attribute is deprecated and will be removed in a future
106-
release. Use the [`head`][obstore.head] or
107-
[`head_async`][obstore.head_async] methods directly if you need
108-
object metadata.
109-
"""
110-
11194
def read(self, size: int | None = None, /) -> Bytes:
11295
"""Read up to `size` bytes from the object and return them.
11396
@@ -186,22 +169,6 @@ class AsyncReadableFile:
186169
This is currently a no-op.
187170
"""
188171

189-
@property
190-
@deprecated(
191-
"`AsyncReadableFile.meta` is deprecated and will be removed in a future release. "
192-
"Use the `head` or `head_async` methods directly if you need object metadata.",
193-
)
194-
def meta(self) -> ObjectMeta:
195-
"""Access the metadata of the underlying file.
196-
197-
!!! warning "Deprecated"
198-
199-
This attribute is deprecated and will be removed in a future
200-
release. Use the [`head`][obstore.head] or
201-
[`head_async`][obstore.head_async] methods directly if you need
202-
object metadata.
203-
"""
204-
205172
async def read(self, size: int | None = None, /) -> Bytes:
206173
"""Read up to `size` bytes from the object and return them.
207174

obstore/python/obstore/fsspec.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -681,7 +681,12 @@ def __init__( # noqa: PLR0913
681681

682682
if self.mode == "rb":
683683
buffer_size = 1024 * 1024 if buffer_size is None else buffer_size
684-
self._reader = open_reader(store, path, buffer_size=buffer_size)
684+
self._reader = open_reader(
685+
store,
686+
path,
687+
buffer_size=buffer_size,
688+
size=self.size,
689+
)
685690
elif self.mode == "wb":
686691
buffer_size = 10 * 1024 * 1024 if buffer_size is None else buffer_size
687692
self._writer = open_writer(

obstore/src/buffered.rs

Lines changed: 31 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ use std::sync::Arc;
44
use bytes::Bytes;
55
use object_store::buffered::{BufReader, BufWriter};
66
use object_store::{ObjectMeta, ObjectStore, ObjectStoreExt};
7-
use pyo3::exceptions::{PyDeprecationWarning, PyIOError, PyStopAsyncIteration, PyStopIteration};
7+
use pyo3::exceptions::{PyIOError, PyStopAsyncIteration, PyStopIteration};
88
use pyo3::prelude::*;
99
use pyo3::types::PyString;
1010
use pyo3::{intern, IntoPyObjectExt};
@@ -15,62 +15,75 @@ use tokio::io::{AsyncBufReadExt, AsyncReadExt, AsyncSeekExt, AsyncWriteExt, Line
1515
use tokio::sync::Mutex;
1616

1717
use crate::attributes::PyAttributes;
18-
use crate::list::PyObjectMeta;
1918
use crate::tags::PyTagSet;
2019

2120
#[pyfunction]
22-
#[pyo3(signature = (store, path, *, buffer_size=1024 * 1024))]
21+
#[pyo3(signature = (store, path, *, buffer_size=1024 * 1024, size=None))]
2322
pub(crate) fn open_reader(
2423
py: Python,
2524
store: PyObjectStore,
2625
path: PyPath,
2726
buffer_size: usize,
27+
size: Option<u64>,
2828
) -> PyObjectStoreResult<PyReadableFile> {
2929
let store = store.into_inner();
3030
let runtime = get_runtime();
31-
let (reader, meta) = py.detach(|| runtime.block_on(create_reader(store, path, buffer_size)))?;
32-
Ok(PyReadableFile::new(reader, meta, false))
31+
let (reader, resolved_size) =
32+
py.detach(|| runtime.block_on(create_reader(store, path, buffer_size, size)))?;
33+
Ok(PyReadableFile::new(reader, resolved_size, false))
3334
}
3435

3536
#[pyfunction]
36-
#[pyo3(signature = (store, path, *, buffer_size=1024 * 1024))]
37+
#[pyo3(signature = (store, path, *, buffer_size=1024 * 1024, size=None))]
3738
pub(crate) fn open_reader_async(
3839
py: Python,
3940
store: PyObjectStore,
4041
path: PyPath,
4142
buffer_size: usize,
43+
size: Option<u64>,
4244
) -> PyResult<Bound<PyAny>> {
4345
let store = store.into_inner();
4446
future_into_py(py, async move {
45-
let (reader, meta) = create_reader(store, path, buffer_size).await?;
46-
Ok(PyReadableFile::new(reader, meta, true))
47+
let (reader, resolved_size) = create_reader(store, path, buffer_size, size).await?;
48+
Ok(PyReadableFile::new(reader, resolved_size, true))
4749
})
4850
}
4951

5052
async fn create_reader(
5153
store: Arc<dyn ObjectStore>,
5254
path: PyPath,
5355
capacity: usize,
54-
) -> PyObjectStoreResult<(BufReader, ObjectMeta)> {
55-
let meta = store
56-
.head(path.as_ref())
57-
.await
58-
.map_err(PyObjectStoreError::ObjectStoreError)?;
59-
Ok((BufReader::with_capacity(store, &meta, capacity), meta))
56+
size: Option<u64>,
57+
) -> PyObjectStoreResult<(BufReader, u64)> {
58+
let meta = match size {
59+
Some(size) => ObjectMeta {
60+
location: path.as_ref().clone(),
61+
last_modified: Default::default(),
62+
size,
63+
e_tag: None,
64+
version: None,
65+
},
66+
None => store
67+
.head(path.as_ref())
68+
.await
69+
.map_err(PyObjectStoreError::ObjectStoreError)?,
70+
};
71+
let size = meta.size;
72+
Ok((BufReader::with_capacity(store, &meta, capacity), size))
6073
}
6174

6275
#[pyclass(name = "ReadableFile", frozen)]
6376
pub(crate) struct PyReadableFile {
6477
reader: Arc<Mutex<BufReader>>,
65-
meta: ObjectMeta,
78+
size: u64,
6679
r#async: bool,
6780
}
6881

6982
impl PyReadableFile {
70-
fn new(reader: BufReader, meta: ObjectMeta, r#async: bool) -> Self {
83+
fn new(reader: BufReader, size: u64, r#async: bool) -> Self {
7184
Self {
7285
reader: Arc::new(Mutex::new(reader)),
73-
meta,
86+
size,
7487
r#async,
7588
}
7689
}
@@ -91,17 +104,6 @@ impl PyReadableFile {
91104
// `Option<Arc<Mutex<BufReader>>>`.
92105
fn close(&self) {}
93106

94-
#[getter]
95-
fn meta(&self, py: Python) -> PyResult<PyObjectMeta> {
96-
let warnings_mod = py.import(intern!(py, "warnings"))?;
97-
let warning = PyDeprecationWarning::new_err(
98-
"The `meta` attribute is deprecated and will be removed in a future release. \
99-
Use the `head` or `head_async` methods directly if you need object metadata.",
100-
);
101-
warnings_mod.call_method1(intern!(py, "warn"), (warning,))?;
102-
Ok(self.meta.clone().into())
103-
}
104-
105107
#[pyo3(signature = (size = None, /))]
106108
fn read<'py>(&'py self, py: Python<'py>, size: Option<usize>) -> PyResult<Bound<'py, PyAny>> {
107109
let reader = self.reader.clone();
@@ -179,7 +181,7 @@ impl PyReadableFile {
179181

180182
#[getter]
181183
fn size(&self) -> u64 {
182-
self.meta.size
184+
self.size
183185
}
184186

185187
fn tell<'py>(&'py self, py: Python<'py>) -> PyResult<Bound<'py, PyAny>> {

tests/test_buffered.py

Lines changed: 55 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -114,28 +114,68 @@ async def test_read_past_eof_async():
114114
assert memoryview(expected) == memoryview(buffer)
115115

116116

117-
def test_readable_file_meta_emits_deprecation_warning():
117+
def test_open_reader_size_hint_sync():
118118
store = MemoryStore()
119+
data = b"x" * 1000
119120
path = "sized.bin"
120-
obs.put(store, path, b"x" * 100)
121-
122-
file = obs.open_reader(store, path)
123-
with pytest.warns(DeprecationWarning, match="`meta` attribute is deprecated"):
124-
meta = file.meta
121+
obs.put(store, path, data)
125122

126-
assert meta["size"] == 100
127-
assert meta["path"] == path
123+
file = obs.open_reader(store, path, size=len(data))
124+
assert file.size == len(data)
125+
assert memoryview(data) == memoryview(file.read())
128126

129127

130128
@pytest.mark.asyncio
131-
async def test_async_readable_file_meta_emits_deprecation_warning():
129+
async def test_open_reader_size_hint_async():
132130
store = MemoryStore()
131+
data = b"x" * 1000
133132
path = "sized.bin"
134-
await obs.put_async(store, path, b"x" * 100)
133+
await obs.put_async(store, path, data)
135134

136-
file = await obs.open_reader_async(store, path)
137-
with pytest.warns(DeprecationWarning, match="`meta` attribute is deprecated"):
138-
meta = file.meta
135+
file = await obs.open_reader_async(store, path, size=len(data))
136+
assert file.size == len(data)
137+
assert memoryview(data) == memoryview(await file.read())
138+
139+
140+
def test_open_reader_size_hint_larger_than_actual_errors_on_read():
141+
store = MemoryStore()
142+
data = b"x" * 1000
143+
path = "sized.bin"
144+
obs.put(store, path, data)
145+
146+
file = obs.open_reader(store, path, size=5000)
147+
assert file.size == 5000
148+
with pytest.raises(OSError, match="range"):
149+
file.read()
150+
151+
152+
def test_open_reader_size_hint_smaller_than_actual_truncates():
153+
store = MemoryStore()
154+
data = b"x" * 1000
155+
path = "sized.bin"
156+
obs.put(store, path, data)
157+
158+
file = obs.open_reader(store, path, size=500)
159+
assert file.size == 500
160+
buffer = file.read()
161+
assert memoryview(data[:500]) == memoryview(buffer)
162+
163+
164+
def test_open_reader_size_hint_zero_byte_file():
165+
store = MemoryStore()
166+
path = "empty.bin"
167+
obs.put(store, path, b"")
139168

140-
assert meta["size"] == 100
141-
assert meta["path"] == path
169+
file = obs.open_reader(store, path, size=0)
170+
assert file.size == 0
171+
assert memoryview(b"") == memoryview(file.read())
172+
173+
174+
def test_open_reader_no_longer_exposes_meta():
175+
store = MemoryStore()
176+
data = b"x" * 1000
177+
path = "sized.bin"
178+
obs.put(store, path, data)
179+
180+
file = obs.open_reader(store, path)
181+
assert not hasattr(file, "meta")

tests/test_fsspec.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,23 @@ async def test_info_synthesizes_directory_for_trailing_slash_query():
155155
assert mock_construct.call_count == 0
156156

157157

158+
def test_buffered_file_forwards_size_to_open_reader():
159+
register("file")
160+
fs: FsspecStore = fsspec.filesystem("file", asynchronous=False)
161+
162+
with TemporaryDirectory() as tmp:
163+
path = Path(tmp) / "sized.bin"
164+
path.write_bytes(b"x" * 1000)
165+
166+
file = fs._open(str(path), mode="rb", size=500)
167+
168+
assert file.size == 500
169+
assert file._reader.size == 500
170+
171+
data = file.read()
172+
assert len(data) == 500
173+
174+
158175
def test_construct_store_cache_diff_bucket_name(
159176
minio_bucket: tuple[S3Config, ClientConfig],
160177
):

0 commit comments

Comments
 (0)