Skip to content

Commit 0122126

Browse files
s-h-a-d-o-w authored and alwayslove2013 committed
Add lancedb
1 parent de9aa90 commit 0122126

11 files changed

Lines changed: 442 additions & 6 deletions

File tree

README.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -267,13 +267,13 @@ pip install -e '.[pinecone]'
267267
```
268268
### Run test server
269269
```
270-
$ python -m vectordb_bench
270+
python -m vectordb_bench
271271
```
272272
273273
OR:
274274
275275
```shell
276-
$ init_bench
276+
init_bench
277277
```
278278

279279
OR:
@@ -290,13 +290,13 @@ After reopen the repository in container, run `python -m vectordb_bench` in the
290290

291291
### Check coding styles
292292
```shell
293-
$ make lint
293+
make lint
294294
```
295295

296296
To fix the coding styles automatically
297297

298298
```shell
299-
$ make format
299+
make format
300300
```
301301

302302
## How does it work?

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ all = [
7272
"PyMySQL",
7373
"clickhouse-connect",
7474
"pyvespa",
75+
"lancedb",
7576
]
7677

7778
qdrant = [ "qdrant-client" ]
@@ -94,6 +95,7 @@ mariadb = [ "mariadb" ]
9495
tidb = [ "PyMySQL" ]
9596
clickhouse = [ "clickhouse-connect" ]
9697
vespa = [ "pyvespa" ]
98+
lancedb = [ "lancedb" ]
9799

98100
[project.urls]
99101
"repository" = "https://github.com/zilliztech/VectorDBBench"

vectordb_bench/backend/clients/__init__.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,10 @@ class DB(Enum):
4545
TiDB = "TiDB"
4646
Clickhouse = "Clickhouse"
4747
Vespa = "Vespa"
48+
LanceDB = "LanceDB"
4849

4950
@property
50-
def init_cls(self) -> type[VectorDB]: # noqa: PLR0911, PLR0912, C901
51+
def init_cls(self) -> type[VectorDB]: # noqa: PLR0911, PLR0912, C901, PLR0915
5152
"""Import while in use"""
5253
if self == DB.Milvus:
5354
from .milvus.milvus import Milvus
@@ -164,11 +165,16 @@ def init_cls(self) -> type[VectorDB]: # noqa: PLR0911, PLR0912, C901
164165

165166
return Vespa
166167

168+
if self == DB.LanceDB:
169+
from .lancedb.lancedb import LanceDB
170+
171+
return LanceDB
172+
167173
msg = f"Unknown DB: {self.name}"
168174
raise ValueError(msg)
169175

170176
@property
171-
def config_cls(self) -> type[DBConfig]: # noqa: PLR0911, PLR0912, C901
177+
def config_cls(self) -> type[DBConfig]: # noqa: PLR0911, PLR0912, C901, PLR0915
172178
"""Import while in use"""
173179
if self == DB.Milvus:
174180
from .milvus.config import MilvusConfig
@@ -285,6 +291,11 @@ def config_cls(self) -> type[DBConfig]: # noqa: PLR0911, PLR0912, C901
285291

286292
return VespaConfig
287293

294+
if self == DB.LanceDB:
295+
from .lancedb.config import LanceDBConfig
296+
297+
return LanceDBConfig
298+
288299
msg = f"Unknown DB: {self.name}"
289300
raise ValueError(msg)
290301

@@ -382,6 +393,11 @@ def case_config_cls( # noqa: C901, PLR0911, PLR0912
382393

383394
return VespaHNSWConfig
384395

396+
if self == DB.LanceDB:
397+
from .lancedb.config import _lancedb_case_config
398+
399+
return _lancedb_case_config.get(index_type)
400+
385401
# DB.Pinecone, DB.Chroma, DB.Redis
386402
return EmptyDBCaseConfig
387403

vectordb_bench/backend/clients/api.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ class IndexType(str, Enum):
3434
GPU_IVF_PQ = "GPU_IVF_PQ"
3535
GPU_CAGRA = "GPU_CAGRA"
3636
SCANN = "scann"
37+
NONE = "NONE"
3738

3839

3940
class SQType(str, Enum):
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
from typing import Annotated, Unpack
2+
3+
import click
4+
from pydantic import SecretStr
5+
6+
from ....cli.cli import (
7+
CommonTypedDict,
8+
cli,
9+
click_parameter_decorators_from_typed_dict,
10+
run,
11+
)
12+
from .. import DB
13+
from ..api import IndexType
14+
15+
16+
class LanceDBTypedDict(CommonTypedDict):
    # LanceDB-specific command-line options, declared on top of the options
    # inherited from CommonTypedDict. Each field pairs its Python type with the
    # click option that populates it.

    # Connection string for the database; must be supplied on the command line.
    uri: Annotated[str, click.option("--uri", type=str, help="URI connection string", required=True)]

    # Authentication token; may be omitted.
    token: Annotated[str | None, click.option("--token", type=str, help="Authentication token", required=False)]
25+
26+
27+
@cli.command()
@click_parameter_decorators_from_typed_dict(LanceDBTypedDict)
def LanceDB(**parameters: Unpack[LanceDBTypedDict]):
    # CLI command: run the benchmark against LanceDB with the "no index" case
    # configuration. (Kept as comments rather than a docstring so the command's
    # click help text is unchanged.)
    #
    # `parameters` holds the parsed command-line options; it is forwarded to
    # `run` unchanged in addition to the explicit db/db_config/db_case_config.

    # Imported lazily, matching the other commands in this module.
    from .config import LanceDBConfig, _lancedb_case_config

    # Wrap the token in SecretStr only when one was actually supplied.
    raw_token = parameters.get("token")
    connection = LanceDBConfig(
        db_label=parameters["db_label"],
        uri=parameters["uri"],
        token=SecretStr(raw_token) if raw_token else None,
    )

    # Look the class up by enum member, like the sibling commands do, instead of
    # the bare string "NONE" (which only matched because IndexType is a str enum).
    case_config_cls = _lancedb_case_config.get(IndexType.NONE)

    run(
        db=DB.LanceDB,
        db_config=connection,
        db_case_config=case_config_cls(),
        **parameters,
    )
42+
43+
44+
@cli.command()
@click_parameter_decorators_from_typed_dict(LanceDBTypedDict)
def LanceDBAutoIndex(**parameters: Unpack[LanceDBTypedDict]):
    # CLI command: run the benchmark against LanceDB with the AUTOINDEX case
    # configuration. `parameters` holds the parsed command-line options and is
    # also forwarded to `run` unchanged.

    # Imported lazily inside the command.
    from .config import LanceDBConfig, _lancedb_case_config

    # Wrap the token in SecretStr only when one was actually supplied.
    connection = LanceDBConfig(
        db_label=parameters["db_label"],
        uri=parameters["uri"],
        token=SecretStr(parameters["token"]) if parameters.get("token") else None,
    )
    case_config_cls = _lancedb_case_config.get(IndexType.AUTOINDEX)

    run(
        db=DB.LanceDB,
        db_config=connection,
        db_case_config=case_config_cls(),
        **parameters,
    )
59+
60+
61+
@cli.command()
@click_parameter_decorators_from_typed_dict(LanceDBTypedDict)
def LanceDBIVFPQ(**parameters: Unpack[LanceDBTypedDict]):
    # CLI command: run the benchmark against LanceDB with the IVFPQ case
    # configuration. `parameters` holds the parsed command-line options and is
    # also forwarded to `run` unchanged.

    # Imported lazily inside the command.
    from .config import LanceDBConfig, _lancedb_case_config

    # Wrap the token in SecretStr only when one was actually supplied.
    connection = LanceDBConfig(
        db_label=parameters["db_label"],
        uri=parameters["uri"],
        token=SecretStr(parameters["token"]) if parameters.get("token") else None,
    )
    case_config_cls = _lancedb_case_config.get(IndexType.IVFPQ)

    run(
        db=DB.LanceDB,
        db_config=connection,
        db_case_config=case_config_cls(),
        **parameters,
    )
76+
77+
78+
@cli.command()
@click_parameter_decorators_from_typed_dict(LanceDBTypedDict)
def LanceDBHNSW(**parameters: Unpack[LanceDBTypedDict]):
    # CLI command: run the benchmark against LanceDB with the HNSW case
    # configuration. `parameters` holds the parsed command-line options and is
    # also forwarded to `run` unchanged.

    # Imported lazily inside the command.
    from .config import LanceDBConfig, _lancedb_case_config

    # Wrap the token in SecretStr only when one was actually supplied.
    connection = LanceDBConfig(
        db_label=parameters["db_label"],
        uri=parameters["uri"],
        token=SecretStr(parameters["token"]) if parameters.get("token") else None,
    )
    case_config_cls = _lancedb_case_config.get(IndexType.HNSW)

    run(
        db=DB.LanceDB,
        db_config=connection,
        db_case_config=case_config_cls(),
        **parameters,
    )
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
from pydantic import BaseModel, SecretStr
2+
3+
from ..api import DBCaseConfig, DBConfig, IndexType, MetricType
4+
5+
6+
class LanceDBConfig(DBConfig):
    """LanceDB connection configuration."""

    db_label: str
    uri: str
    # Kept as a SecretStr on the model; None when no token was supplied.
    token: SecretStr | None = None

    def to_dict(self) -> dict:
        """Return the connection settings as a plain dict.

        The result has the keys ``uri`` and ``token``; the token is unwrapped to
        its plain string value, or is ``None`` when no (non-empty) token is set.
        """
        plain_token = None
        if self.token:
            plain_token = self.token.get_secret_value()
        return {"uri": self.uri, "token": plain_token}
18+
19+
20+
class LanceDBIndexConfig(BaseModel, DBCaseConfig):
    # Index-building configuration for LanceDB. The default `index` is IVFPQ;
    # subclasses override it for the other supported index types.
    index: IndexType = IndexType.IVFPQ
    metric_type: MetricType = MetricType.L2
    # 0 means "not set": the key is then left out of index_param().
    num_partitions: int = 0
    num_sub_vectors: int = 0
    nbits: int = 8  # Must be 4 or 8
    sample_rate: int = 256
    max_iterations: int = 50

    def index_param(self) -> dict:
        """Build the keyword arguments used to create the index.

        Returns:
            A dict with ``metric``, ``num_bits``, ``sample_rate`` and
            ``max_iterations``, plus ``num_partitions`` / ``num_sub_vectors``
            when those are greater than zero.

        Raises:
            ValueError: if ``index`` is not one of IVFPQ, HNSW, AUTOINDEX or
                NONE, or if the metric type is not supported.
        """
        supported = (
            IndexType.IVFPQ,
            IndexType.HNSW,
            IndexType.AUTOINDEX,
            IndexType.NONE,
        )
        if self.index not in supported:
            msg = f"Index type {self.index} is not supported for LanceDB!"
            raise ValueError(msg)

        # See https://lancedb.github.io/lancedb/python/python/#lancedb.table.Table.create_index
        params = {
            "metric": self.parse_metric(),
            "num_bits": self.nbits,
            "sample_rate": self.sample_rate,
            "max_iterations": self.max_iterations,
        }

        if self.num_partitions > 0:
            params["num_partitions"] = self.num_partitions
        if self.num_sub_vectors > 0:
            params["num_sub_vectors"] = self.num_sub_vectors

        return params

    def search_param(self) -> dict:
        """Return search-time parameters.

        No search parameters are defined for LanceDB, so the result is an empty
        dict (previously this fell through and returned ``None`` despite the
        ``dict`` annotation).
        """
        return {}

    def parse_metric(self) -> str:
        """Translate ``metric_type`` into the metric name passed to LanceDB.

        L2 and COSINE map to their lower-cased enum value; IP and DP map to
        ``"dot"``.

        Raises:
            ValueError: for any other metric type.
        """
        if self.metric_type in (MetricType.L2, MetricType.COSINE):
            return self.metric_type.value.lower()
        if self.metric_type in (MetricType.IP, MetricType.DP):
            return "dot"
        msg = f"Metric type {self.metric_type} is not supported for LanceDB!"
        raise ValueError(msg)
64+
65+
66+
class LanceDBNoIndexConfig(LanceDBIndexConfig):
    # Case configuration whose index type is NONE.
    index: IndexType = IndexType.NONE

    def index_param(self) -> dict:
        """Return an empty dict: this configuration carries no index parameters."""
        return {}
71+
72+
73+
class LanceDBAutoIndexConfig(LanceDBIndexConfig):
    # Case configuration whose index type is AUTOINDEX.
    index: IndexType = IndexType.AUTOINDEX

    def index_param(self) -> dict:
        """Return an empty dict: no explicit index parameters are supplied for AUTOINDEX."""
        return {}
78+
79+
80+
class LanceDBHNSWIndexConfig(LanceDBIndexConfig):
    # Case configuration for the HNSW index type.
    index: IndexType = IndexType.HNSW
    # 0 means "not set": the key is then left out of index_param().
    m: int = 0
    ef_construction: int = 0

    def index_param(self) -> dict:
        """Extend the base index parameters with the HNSW-specific ones.

        Always sets ``index_type`` to ``"IVF_HNSW_SQ"``; ``m`` and
        ``ef_construction`` are added only when greater than zero.
        """
        hnsw_params = super().index_param()

        # See https://lancedb.github.io/lancedb/python/python/#lancedb.index.HnswSq
        hnsw_params["index_type"] = "IVF_HNSW_SQ"
        tunables = {"m": self.m, "ef_construction": self.ef_construction}
        hnsw_params.update({key: value for key, value in tunables.items() if value > 0})

        return hnsw_params
96+
97+
98+
# Lookup table from index type to the case-config class used for it.
_lancedb_case_config: dict[IndexType, type[LanceDBIndexConfig]] = {
    IndexType.IVFPQ: LanceDBIndexConfig,
    IndexType.AUTOINDEX: LanceDBAutoIndexConfig,
    IndexType.HNSW: LanceDBHNSWIndexConfig,
    IndexType.NONE: LanceDBNoIndexConfig,
}
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
import logging
2+
from contextlib import contextmanager
3+
4+
import lancedb
5+
import pyarrow as pa
6+
from lancedb.pydantic import LanceModel
7+
8+
from ..api import IndexType, VectorDB
9+
from .config import LanceDBConfig, LanceDBIndexConfig
10+
11+
log = logging.getLogger(__name__)
12+
13+
14+
class VectorModel(LanceModel):
    # Row model with the two columns used by this client: an integer id and the
    # embedding as a list of floats.
    id: int
    vector: list[float]
17+
18+
19+
class LanceDB(VectorDB):
    """Benchmark client that stores and searches vectors in a LanceDB table.

    The constructor prepares the table; the working connection and table
    handles exist only inside the ``init()`` context manager.
    """

    def __init__(
        self,
        dim: int,
        db_config: dict,
        db_case_config: LanceDBIndexConfig,
        collection_name: str = "vector_bench_test",
        drop_old: bool = False,
        **kwargs,
    ):
        """Ensure the benchmark table exists.

        Args:
            dim: dimensionality of the stored vectors.
            db_config: connection settings as a mapping; ``uri`` is required and
                ``token`` is optional.
            db_case_config: index configuration used later by ``optimize``.
            collection_name: name of the LanceDB table.
            drop_old: when true, try to drop an existing table first.
            **kwargs: accepted and ignored.
        """
        self.name = "LanceDB"
        self.db_config = db_config
        self.case_config = db_case_config
        self.table_name = collection_name
        self.dim = dim
        self.uri = db_config["uri"]
        # The token used to be collected but never handed to lancedb.connect.
        self.token = db_config.get("token")

        db = self._connect()

        if drop_old:
            try:
                db.drop_table(self.table_name)
            except Exception as e:
                # Best effort: dropping may fail (e.g. nothing to drop); keep going.
                log.warning(f"Failed to drop table {self.table_name}: {e}")

        try:
            db.open_table(self.table_name)
        except Exception:
            # Opening failed, so (re)create the table with an explicit schema.
            # NOTE(review): vectors are stored as float64 here; float32 may be
            # sufficient and cheaper — confirm before changing.
            schema = pa.schema(
                [pa.field("id", pa.int64()), pa.field("vector", pa.list_(pa.float64(), list_size=self.dim))]
            )
            db.create_table(self.table_name, schema=schema, mode="overwrite")

    def _connect(self):
        """Open a connection to ``self.uri``, passing the token as ``api_key`` when one is set."""
        if self.token:
            return lancedb.connect(self.uri, api_key=self.token)
        return lancedb.connect(self.uri)

    @contextmanager
    def init(self):
        """Context manager that opens the connection and table for the duration of the block.

        The handles are reset to ``None`` on exit, including when the body raises.
        """
        self.db = self._connect()
        self.table = self.db.open_table(self.table_name)
        try:
            yield
        finally:
            self.db = None
            self.table = None

    def insert_embeddings(
        self,
        embeddings: list[list[float]],
        metadata: list[int],
    ) -> tuple[int, Exception | None]:
        """Insert vectors with their ids into the table.

        Returns:
            ``(inserted_count, None)`` on success, or ``(0, error)`` when the
            insert fails. The count is the number of rows actually built, so it
            stays accurate if the two input lists differ in length.
        """
        try:
            rows = [{"id": meta, "vector": emb} for meta, emb in zip(metadata, embeddings, strict=False)]
            self.table.add(rows)
        except Exception as e:
            log.warning(f"Failed to insert data into LanceDB table ({self.table_name}), error: {e}")
            return 0, e
        return len(rows), None

    def search_embedding(
        self,
        query: list[float],
        k: int = 100,
        filters: dict | None = None,
    ) -> list[int]:
        """Return the ids of up to ``k`` search results for ``query``.

        When ``filters`` is given, only rows with ``id >= filters['id']`` are
        searched; the condition is applied as a pre-filter.
        """
        search = self.table.search(query)
        if filters:
            search = search.where(f"id >= {filters['id']}", prefilter=True)
        results = search.limit(k).to_list()
        return [int(result["id"]) for result in results]

    def optimize(self, data_size: int | None = None):
        """Create the configured index, unless the index type is NONE.

        ``data_size`` is accepted but not used.
        """
        # NOTE(review): the truthiness check on the table handle is kept from
        # the original; it may depend on the table's row count — confirm.
        should_index = self.table and hasattr(self, "case_config") and self.case_config.index != IndexType.NONE
        if not should_index:
            return

        log.info(f"Creating index for LanceDB table ({self.table_name})")
        self.table.create_index(**self.case_config.index_param())
        # Better recall with IVF_PQ (though still bad) but breaks HNSW: https://github.com/lancedb/lancedb/issues/2369
        if self.case_config.index in (IndexType.IVFPQ, IndexType.AUTOINDEX):
            self.table.optimize()

0 commit comments

Comments
 (0)