Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions docs/authentication.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,28 +94,26 @@ Refer to [`obstore.auth.google`](api/auth/google.md).
You can use the [`AzureCredentialProvider`][obstore.auth.azure.AzureCredentialProvider] to use [`azure.identity`][] to handle credentials.

```py
import obstore as obs
from obstore.auth.azure import AzureCredentialProvider
from obstore.store import AzureStore

credential_provider = AzureAsyncCredentialProvider(credential=...)
store = AzureStore("container", credential_provider=credential_provider)
print(obs.list(store).collect())
print(store.list().collect())
```

Alternatively, you can use [`AzureAsyncCredentialProvider`][obstore.auth.azure.AzureAsyncCredentialProvider] with the async API:

```py
import asyncio
import obstore as obs
from obstore.auth.azure import AzureCredentialProvider
from obstore.store import AzureStore

credential_provider = AzureAsyncCredentialProvider(credential=...)
store = AzureStore("container", credential_provider=credential_provider)

async def fetch_blobs():
blobs = await obs.list(store).collect_async()
blobs = await store.list().collect_async()
print(blobs)

asyncio.run(fetch_blobs())
Expand Down
66 changes: 21 additions & 45 deletions docs/cookbook.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@
Use the [`obstore.list`][] method.

```py
import obstore as obs

store = ... # store of your choice

# Recursively list all files below the 'data' path.
Expand All @@ -15,7 +13,7 @@ store = ... # store of your choice
prefix = "data"

# Get a stream of metadata objects:
list_stream = obs.list(store, prefix)
list_stream = store.list(prefix)

# Print info
for batch in list_stream:
Expand All @@ -32,12 +30,10 @@ Instead, you may consider passing `return_arrow=True` to [`obstore.list`][] to r
This Arrow integration requires the [`arro3-core` dependency](https://kylebarron.dev/arro3/latest/), a lightweight Arrow implementation. You can pass the emitted `RecordBatch` to [`pyarrow`](https://arrow.apache.org/docs/python/index.html) (zero-copy) by passing it to [`pyarrow.record_batch`][] or to [`polars`](https://pola.rs/) (also zero-copy) by passing it to `polars.DataFrame`.

```py
import obstore as obs

store = ... # store of your choice

# Get a stream of Arrow RecordBatches of metadata
list_stream = obs.list(store, prefix="data", return_arrow=True)
list_stream = store.list(prefix="data", return_arrow=True)
for record_batch in list_stream:
# Perform zero-copy conversion to your arrow-backed library of choice
#
Expand All @@ -58,13 +54,12 @@ for record_batch in list_stream:
Here's a working example with the [`sentinel-cogs` bucket](https://registry.opendata.aws/sentinel-2-l2a-cogs/) in AWS Open Data:

```py
import obstore as obs
import pandas as pd
import pyarrow as pa
from obstore.store import S3Store

store = S3Store("sentinel-cogs", region="us-west-2", skip_signature=True)
stream = obs.list(store, chunk_size=20, return_arrow=True)
stream = store.list(chunk_size=20, return_arrow=True)

for record_batch in stream:
# Convert to pyarrow (zero-copy), then to pandas for easy export to a
Expand All @@ -86,30 +81,28 @@ The Arrow record batch looks like the following:

## Fetch objects

Use the [`obstore.get`][] function to fetch data bytes from remote storage or files in the local filesystem.
Use the `get` method to fetch data bytes from remote storage or files in the local filesystem.

```py
import obstore as obs

store = ... # store of your choice

# Retrieve a specific file
path = "data/file01.parquet"

# Fetch just the file metadata
meta = obs.head(store, path)
meta = store.head(path)
print(meta)

# Fetch the object including metadata
result = obs.get(store, path)
result = store.get(path)
assert result.meta == meta

# Buffer the entire object in memory
buffer = result.bytes()
assert len(buffer) == meta.size

# Alternatively stream the bytes from object storage
stream = obs.get(store, path).stream()
stream = store.get(path).stream()

# We can now iterate over the stream
total_buffer_len = 0
Expand All @@ -125,9 +118,7 @@ Using the response as an iterator ensures that we don't buffer the entire file
into memory.

```py
import obstore as obs

resp = obs.get(store, path)
resp = store.get(path)

with open("output/file", "wb") as f:
for chunk in resp:
Expand All @@ -139,65 +130,56 @@ with open("output/file", "wb") as f:
Use the [`obstore.put`][] function to atomically write data. `obstore.put` will automatically use [multipart uploads](https://docs.aws.amazon.com/AmazonS3/latest/userguide/mpuoverview.html) for large input data.

```py
import obstore as obs

store = ... # store of your choice
path = "data/file1"
content = b"hello"
obs.put(store, path, content)
store.put(path, content)
```

You can also upload local files:

```py
from pathlib import Path
import obstore as obs

store = ... # store of your choice
path = "data/file1"
content = Path("path/to/local/file")
obs.put(store, path, content)
store.put(path, content)
```

Or file-like objects:

```py
import obstore as obs

store = ... # store of your choice
path = "data/file1"
with open("path/to/local/file", "rb") as content:
obs.put(store, path, content)
store.put(path, content)
```

Or iterables:

```py
import obstore as obs

def bytes_iter():
for i in range(5):
yield b"foo"

store = ... # store of your choice
path = "data/file1"
content = bytes_iter()
obs.put(store, path, content)
store.put(path, content)
```

Or async iterables:

```py
import obstore as obs

async def bytes_stream():
for i in range(5):
yield b"foo"

store = ... # store of your choice
path = "data/file1"
content = bytes_stream()
obs.put(store, path, content)
store.put(path, content)
```

## Copy objects from one store to another
Expand All @@ -209,16 +191,14 @@ Perhaps you have data in one store, say AWS S3, that you need to copy to another
Download the file, collect its bytes in memory, then upload it. Note that this will materialize the entire file in memory.

```py
import obstore as obs

store1 = ... # store of your choice
store2 = ... # store of your choice

path1 = "data/file1"
path2 = "data/file2"

buffer = obs.get(store1, path1).bytes()
obs.put(store2, path2, buffer)
buffer = store1.get(path1).bytes()
store2.put(path2, buffer)
```

### Local file
Expand All @@ -227,22 +207,21 @@ First download the file to disk, then upload it.

```py
from pathlib import Path
import obstore as obs

store1 = ... # store of your choice
store2 = ... # store of your choice

path1 = "data/file1"
path2 = "data/file2"

resp = obs.get(store1, path1)
resp = store1.get(path1)

with open("temporary_file", "wb") as f:
for chunk in resp:
f.write(chunk)

# Upload the path
obs.put(store2, path2, Path("temporary_file"))
store2.put(path2, Path("temporary_file"))
```

### Streaming
Expand All @@ -254,30 +233,27 @@ It's easy to **stream** a download from one store directly as the upload to anot
Using the async API is currently required to use streaming copies.

```py
import obstore as obs

store1 = ... # store of your choice
store2 = ... # store of your choice

path1 = "data/file1"
path2 = "data/file2"

# This only constructs the stream, it doesn't materialize the data in memory
resp = await obs.get_async(store1, path1)
resp = await store1.get_async(path1)
# A streaming upload is created to copy the file to path2
await obs.put_async(store2, path2, resp)
await store2.put_async(path2, resp)
```

Or, by customizing the chunk size and the upload concurrency you can control memory overhead.

```py
resp = await obs.get_async(store1, path1)
resp = await store1.get_async(path1)
chunk_size = 5 * 1024 * 1024 # 5MB
stream = resp.stream(min_chunk_size=chunk_size)

# A streaming upload is created to copy the file to path2
await obs.put_async(
store2,
await store2.put_async(
path2,
stream,
chunk_size=chunk_size,
Expand Down
5 changes: 2 additions & 3 deletions docs/examples/fastapi.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ First, import `fastapi` and `obstore` and create the FastAPI application.
from fastapi import FastAPI
from fastapi.responses import StreamingResponse

import obstore as obs
from obstore.store import HTTPStore, S3Store

app = FastAPI()
Expand All @@ -39,7 +38,7 @@ async def download_example() -> StreamingResponse:

# Make the request. This only begins the download; it does not wait for the
# download to finish.
resp = await obs.get_async(store, path)
resp = await store.get_async(path)
return StreamingResponse(resp)
```

Expand All @@ -56,7 +55,7 @@ async def large_example() -> StreamingResponse:

# Note: for large file downloads you may need to increase the timeout in
# the client configuration
resp = await obs.get_async(store, path)
resp = await store.get_async(path)

# Example: Ensure the stream returns at least 5MB of data in each chunk.
return StreamingResponse(resp.stream(min_chunk_size=5 * 1024 * 1024))
Expand Down
13 changes: 6 additions & 7 deletions docs/examples/minio.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ docker run -p 9000:9000 -p 9001:9001 \
Now we can create an `S3Store` to interact with minio:

```py
import obstore as obs
from obstore.store import S3Store

store = S3Store(
Expand All @@ -33,20 +32,20 @@ store = S3Store(
)

# Add files
obs.put(store, "a.txt", b"foo")
obs.put(store, "b.txt", b"bar")
obs.put(store, "c/d.txt", b"baz")
store.put("a.txt", b"foo")
store.put("b.txt", b"bar")
store.put("c/d.txt", b"baz")

# List files
files = obs.list(store).collect()
files = store.list().collect()
print(files)

# Download a file
resp = obs.get(store, "a.txt")
resp = store.get("a.txt")
print(resp.bytes())

# Delete a file
obs.delete(store, "a.txt")
store.delete("a.txt")
```

There's a [full example](https://github.com/developmentseed/obstore/tree/main/examples/minio) in the obstore repository.
28 changes: 14 additions & 14 deletions docs/getting-started.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,12 +76,12 @@ File-like object support is also provided:
### Example

```py
import obstore as obs
from obstore.store import MemoryStore

store = obs.store.MemoryStore()
store = MemoryStore()

obs.put(store, "file.txt", b"hello world!")
response = obs.get(store, "file.txt")
store.put("file.txt", b"hello world!")
response = store.get("file.txt")
response.meta
# {'path': 'file.txt',
# 'last_modified': datetime.datetime(2024, 10, 21, 16, 19, 45, 102620, tzinfo=datetime.timezone.utc),
Expand All @@ -90,22 +90,22 @@ response.meta
# 'version': None}
assert response.bytes() == b"hello world!"

byte_range = obs.get_range(store, "file.txt", start=0, end=5)
byte_range = store.get_range("file.txt", start=0, end=5)
assert byte_range == b"hello"

obs.copy(store, "file.txt", "other.txt")
assert obs.get(store, "other.txt").bytes() == b"hello world!"
store.copy("file.txt", "other.txt")
assert store.get("other.txt").bytes() == b"hello world!"
```

All of these methods also have `async` counterparts, suffixed with `_async`.

```py
import obstore as obs
from obstore.store import MemoryStore

store = obs.store.MemoryStore()
store = MemoryStore()

await obs.put_async(store, "file.txt", b"hello world!")
response = await obs.get_async(store, "file.txt")
await store.put_async("file.txt", b"hello world!")
response = await store.get_async("file.txt")
response.meta
# {'path': 'file.txt',
# 'last_modified': datetime.datetime(2024, 10, 21, 16, 20, 36, 477418, tzinfo=datetime.timezone.utc),
Expand All @@ -114,10 +114,10 @@ response.meta
# 'version': None}
assert await response.bytes_async() == b"hello world!"

byte_range = await obs.get_range_async(store, "file.txt", start=0, end=5)
byte_range = await store.get_range_async("file.txt", start=0, end=5)
assert byte_range == b"hello"

await obs.copy_async(store, "file.txt", "other.txt")
resp = await obs.get_async(store, "other.txt")
await store.copy_async("file.txt", "other.txt")
resp = await store.get_async("other.txt")
assert await resp.bytes_async() == b"hello world!"
```
Loading