Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/snippets/multimodal.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ export const PyMultimodalImports = "import lancedb\nimport pyarrow as pa\nimport

export const PyProcessResults = [
  // Python snippet text, one array entry per snippet line.
  // The trailing "" makes join() emit the final newline.
  "# Convert back to PIL Image",
  "for _, row in results.iterrows():",
  " image_bytes = row['image_blob']",
  " image = Image.open(io.BytesIO(image_bytes))",
  " print(f\"Retrieved image: {row['filename']}, Size: {image.size}\")",
  " # You can now use 'image' with other libraries or display it",
  "",
].join("\n");

export const PyQueryToPandasKwargs = [
  // Python snippet text, one array entry per snippet line; "" entries are
  // the blank lines and the trailing newline produced by join().
  // Declared exactly once: a second `export const` with the same name is a
  // SyntaxError in an ES module.
  "# Plain scan query: blob_mode is supported end to end",
  "df_lazy = (",
  " tbl.search()",
  " .where(\"id = 1\")",
  " .select([\"id\", \"video\"])",
  " .to_pandas(blob_mode=\"lazy\")",
  ")",
  "",
  "# Same call shape works on async query builders",
  "df_bytes = await (",
  " tbl_async.query()",
  " .where(\"id = 1\")",
  " .select([\"id\", \"video\"])",
  " .to_pandas(blob_mode=\"bytes\")",
  ")",
  "",
  // Vector queries project only non-blob columns here; no blob_mode is passed.
  "# Vector / FTS / hybrid queries can't materialize blob columns,",
  "# so omit them from the projection",
  "df_vec = (",
  " tbl.search(query_vector)",
  " .limit(10)",
  " .select([\"id\", \"vector\"])",
  " .to_pandas(split_blocks=True, self_destruct=True)",
  ")",
  "",
].join("\n");

export const PySearchData = [
  // Python snippet text, one array entry per snippet line.
  // The trailing "" makes join() emit the final newline.
  "# Search for similar images",
  "query_vector = np.random.rand(128).astype(np.float32)",
  "results = tbl.search(query_vector).limit(1).to_pandas()",
  "",
].join("\n");

Expand Down
2 changes: 1 addition & 1 deletion docs/tables/multimodal.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ Extra keyword arguments are forwarded to the underlying PyArrow / Lance pandas c
Query builders also accept `blob_mode` on their `to_pandas()` method:

- Plain scans support `"lazy"`, `"bytes"`, and `"descriptions"` with filters, projections, aliases, `limit`, and `offset`.
- Vector, FTS, hybrid, and ordered queries only support `blob_mode="descriptions"`; use that mode or omit blob columns from the projection.
- Vector, FTS, hybrid, and ordered queries can't materialize blob columns through `to_pandas()`; omit blob columns from the projection for those query shapes.
- This works on both sync and async query builders. Extra PyArrow kwargs like `split_blocks` and `self_destruct` are still forwarded.

<CodeGroup>
Expand Down
62 changes: 50 additions & 12 deletions tests/py/test_multimodal.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,29 +166,67 @@ def test_blob_api_to_pandas(db_path_factory):
assert len(df_typed) == 2


@pytest.mark.asyncio
async def test_query_to_pandas_kwargs(db_path_factory):
    """Check ``to_pandas()`` keyword handling on sync and async query builders.

    Builds the same 10-row table (``id``, a 128-float ``vector``, and a
    blob-encoded ``video`` column) in one sync and one async database, then
    verifies three query shapes:

    * a sync plain scan with ``blob_mode="lazy"`` returns the single matching row;
    * an async plain scan with ``blob_mode="bytes"`` returns the stored bytes;
    * a vector search that projects only ``id`` and ``vector`` returns all
      10 rows and no ``video`` column.

    Args:
        db_path_factory: fixture called with a database name; its return value
            is passed to ``lancedb.connect`` / ``lancedb.connect_async``.
    """
    schema = pa.schema([
        pa.field("id", pa.int64()),
        pa.field("vector", pa.list_(pa.float32(), 128)),
        pa.field(
            "video",
            pa.large_binary(),
            # Field metadata that marks this column as a Lance blob column.
            metadata={"lance-encoding:blob": "true"},
        ),
    ])
    # Each row's video payload embeds its id so the bytes can be checked below.
    data = [
        {
            "id": i,
            "vector": np.random.rand(128).astype(np.float32),
            "video": f"fake_video_bytes_{i}".encode(),
        }
        for i in range(10)
    ]

    db = lancedb.connect(db_path_factory("query_to_pandas_db"))
    tbl = db.create_table("search_demo", data=data, schema=schema, mode="overwrite")

    # The async connection gets its own database and the same rows and schema.
    async_db = await lancedb.connect_async(
        str(db_path_factory("query_to_pandas_async_db"))
    )
    tbl_async = await async_db.create_table(
        "search_demo", data=data, schema=schema, mode="overwrite"
    )

    query_vector = np.random.rand(128).astype(np.float32)

    # --8<-- [start:query_to_pandas_kwargs]
    # Plain scan query: blob_mode is supported end to end
    df_lazy = (
        tbl.search()
        .where("id = 1")
        .select(["id", "video"])
        .to_pandas(blob_mode="lazy")
    )

    # Same call shape works on async query builders
    df_bytes = await (
        tbl_async.query()
        .where("id = 1")
        .select(["id", "video"])
        .to_pandas(blob_mode="bytes")
    )

    # Vector / FTS / hybrid queries can't materialize blob columns,
    # so omit them from the projection
    df_vec = (
        tbl.search(query_vector)
        .limit(10)
        .select(["id", "vector"])
        .to_pandas(split_blocks=True, self_destruct=True)
    )
    # --8<-- [end:query_to_pandas_kwargs]

    assert len(df_lazy) == 1
    assert len(df_bytes) == 1
    assert df_bytes["video"].iloc[0] == b"fake_video_bytes_1"
    assert len(df_vec) == 10
    assert "video" not in df_vec.columns
Loading