Skip to content

Commit 184b3a4

Browse files
morning-colorkanichenKANIOYHmorningchen
authored
Add TencentES client (#623)
* add tencent es * support tencent es * Add support for TencentElasticsearch * Fix argument order in Elasticsearch config * Add Tencent ES installation command to README * Fix tencent_es installation command in README * Extends TencentElasticsearch client from ElasticCloud --------- Co-authored-by: kanichen <kanichen@tencent.com> Co-authored-by: KANI <yh_chan_kanio@163.com> Co-authored-by: morningchen <morningchen@tencent.com>
1 parent 368701f commit 184b3a4

10 files changed

Lines changed: 333 additions & 18 deletions

File tree

README.md

Lines changed: 18 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -42,23 +42,24 @@ All the database client supported
4242
| Optional database client | install command |
4343
|--------------------------|---------------------------------------------|
4444
| pymilvus, zilliz_cloud (*default*) | `pip install vectordb-bench` |
45-
| all (*clients requirements might be conflict with each other*) | `pip install 'vectordb-bench[all]'` |
46-
| qdrant | `pip install 'vectordb-bench[qdrant]'` |
47-
| pinecone | `pip install 'vectordb-bench[pinecone]'` |
48-
| weaviate | `pip install 'vectordb-bench[weaviate]'` |
49-
| elastic, aliyun_elasticsearch| `pip install 'vectordb-bench[elastic]'` |
50-
| pgvector, pgvectorscale, pgdiskann, alloydb | `pip install 'vectordb-bench[pgvector]'` |
51-
| pgvecto.rs | `pip install 'vectordb-bench[pgvecto_rs]'` |
52-
| redis | `pip install 'vectordb-bench[redis]'` |
53-
| memorydb | `pip install 'vectordb-bench[memorydb]'` |
54-
| chromadb | `pip install 'vectordb-bench[chromadb]'` |
55-
| awsopensearch | `pip install 'vectordb-bench[opensearch]'` |
56-
| aliyun_opensearch | `pip install 'vectordb-bench[aliyun_opensearch]'` |
57-
| mongodb | `pip install 'vectordb-bench[mongodb]'` |
58-
| tidb | `pip install 'vectordb-bench[tidb]'` |
59-
| vespa | `pip install 'vectordb-bench[vespa]'` |
60-
| oceanbase | `pip install 'vectordb-bench[oceanbase]'` |
61-
| hologres | `pip install 'vectordb-bench[hologres]'` |
45+
| all (*clients requirements might be conflict with each other*) | `pip install vectordb-bench[all]` |
46+
| qdrant | `pip install vectordb-bench[qdrant]` |
47+
| pinecone | `pip install vectordb-bench[pinecone]` |
48+
| weaviate | `pip install vectordb-bench[weaviate]` |
49+
| elastic, aliyun_elasticsearch| `pip install vectordb-bench[elastic]` |
50+
| pgvector, pgvectorscale, pgdiskann, alloydb | `pip install vectordb-bench[pgvector]` |
51+
| pgvecto.rs | `pip install vectordb-bench[pgvecto_rs]` |
52+
| redis | `pip install vectordb-bench[redis]` |
53+
| memorydb | `pip install vectordb-bench[memorydb]` |
54+
| chromadb | `pip install vectordb-bench[chromadb]` |
55+
| awsopensearch | `pip install vectordb-bench[opensearch]` |
56+
| aliyun_opensearch | `pip install vectordb-bench[aliyun_opensearch]` |
57+
| mongodb | `pip install vectordb-bench[mongodb]` |
58+
| tidb | `pip install vectordb-bench[tidb]` |
59+
| vespa | `pip install vectordb-bench[vespa]` |
60+
| oceanbase | `pip install vectordb-bench[oceanbase]` |
61+
| hologres | `pip install vectordb-bench[hologres]` |
62+
| tencent_es | `pip install vectordb-bench[tencent_es]` |
6263

6364
### Run
6465

install/requirements_py3.11.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ grpcio-tools==1.53.0
33
qdrant-client
44
pinecone-client
55
weaviate-client
6-
elasticsearch
6+
elasticsearch==8.16.0
77
pgvector
88
pgvecto_rs[psycopg3]>=0.2.1
99
sqlalchemy

vectordb_bench/backend/clients/__init__.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ class DB(Enum):
5151
OceanBase = "OceanBase"
5252
S3Vectors = "S3Vectors"
5353
Hologres = "Alibaba Cloud Hologres"
54+
TencentElasticsearch = "TencentElasticsearch"
5455

5556
@property
5657
def init_cls(self) -> type[VectorDB]: # noqa: PLR0911, PLR0912, C901, PLR0915
@@ -200,6 +201,11 @@ def init_cls(self) -> type[VectorDB]: # noqa: PLR0911, PLR0912, C901, PLR0915
200201

201202
return Hologres
202203

204+
if self == DB.TencentElasticsearch:
205+
from .tencent_elasticsearch.tencent_elasticsearch import TencentElasticsearch
206+
207+
return TencentElasticsearch
208+
203209
msg = f"Unknown DB: {self.name}"
204210
raise ValueError(msg)
205211

@@ -351,6 +357,11 @@ def config_cls(self) -> type[DBConfig]: # noqa: PLR0911, PLR0912, C901, PLR0915
351357

352358
return HologresConfig
353359

360+
if self == DB.TencentElasticsearch:
361+
from .tencent_elasticsearch.config import TencentElasticsearchConfig
362+
363+
return TencentElasticsearchConfig
364+
354365
msg = f"Unknown DB: {self.name}"
355366
raise ValueError(msg)
356367

@@ -477,6 +488,11 @@ def case_config_cls( # noqa: C901, PLR0911, PLR0912
477488

478489
return HologresIndexConfig
479490

491+
if self == DB.TencentElasticsearch:
492+
from .tencent_elasticsearch.config import TencentElasticsearchIndexConfig
493+
494+
return TencentElasticsearchIndexConfig
495+
480496
# DB.Pinecone, DB.Chroma, DB.Redis
481497
return EmptyDBCaseConfig
482498

vectordb_bench/backend/clients/api.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ class IndexType(str, Enum):
3434
ES_HNSW_INT8 = "int8_hnsw"
3535
ES_HNSW_INT4 = "int4_hnsw"
3636
ES_HNSW_BBQ = "bbq_hnsw"
37+
TES_VSEARCH = "vsearch"
3738
ES_IVFFlat = "ivfflat"
3839
GPU_IVF_FLAT = "GPU_IVF_FLAT"
3940
GPU_BRUTE_FORCE = "GPU_BRUTE_FORCE"

vectordb_bench/backend/clients/elastic_cloud/config.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ def __hash__(self) -> int:
5757
self.use_routing,
5858
self.efConstruction,
5959
self.M,
60+
2,
6061
)
6162
)
6263

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
import os
2+
from typing import Annotated, Unpack
3+
4+
import click
5+
from pydantic import SecretStr
6+
7+
from vectordb_bench.backend.clients import DB
8+
from vectordb_bench.cli.cli import (
9+
CommonTypedDict,
10+
cli,
11+
click_parameter_decorators_from_typed_dict,
12+
run,
13+
)
14+
15+
16+
class TencentElasticsearchTypedDict(CommonTypedDict):
17+
scheme: Annotated[
18+
str,
19+
click.option(
20+
"--scheme",
21+
type=str,
22+
help="Protocol in use to connect to the node",
23+
default="http",
24+
show_default=True,
25+
),
26+
]
27+
host: Annotated[
28+
str,
29+
click.option("--host", type=str, help="shot connection string", required=True),
30+
]
31+
port: Annotated[
32+
int,
33+
click.option("--port", type=int, help="Port to connect to", default=9200, show_default=True),
34+
]
35+
user: Annotated[
36+
str,
37+
click.option("--user", type=str, help="Db username", required=True),
38+
]
39+
password: Annotated[
40+
str,
41+
click.option(
42+
"--password",
43+
type=str,
44+
help="TencentElasticsearch password",
45+
default=lambda: os.environ.get("TES_PASSWORD", ""),
46+
show_default="$TES_PASSWORD",
47+
),
48+
]
49+
m: Annotated[
50+
int,
51+
click.option("--m", type=int, help="HNSW M parameter", default=16, show_default=True),
52+
]
53+
ef_construction: Annotated[
54+
int,
55+
click.option(
56+
"--ef_construction",
57+
type=int,
58+
help="HNSW efConstruction parameter",
59+
default=200,
60+
show_default=True,
61+
),
62+
]
63+
num_candidates: Annotated[
64+
int,
65+
click.option(
66+
"--num_candidates",
67+
type=int,
68+
help="Number of candidates to consider during searching",
69+
default=200,
70+
show_default=True,
71+
),
72+
]
73+
74+
75+
@cli.command()
76+
@click_parameter_decorators_from_typed_dict(TencentElasticsearchTypedDict)
77+
def TencentElasticsearch(**parameters: Unpack[TencentElasticsearchTypedDict]):
78+
from .config import TencentElasticsearchConfig, TencentElasticsearchIndexConfig
79+
80+
run(
81+
db=DB.TencentElasticsearch,
82+
db_config=TencentElasticsearchConfig(
83+
db_label=parameters["db_label"],
84+
scheme=parameters["scheme"],
85+
host=parameters["host"],
86+
port=parameters["port"],
87+
user=parameters["user"],
88+
password=SecretStr(parameters["password"]),
89+
),
90+
db_case_config=TencentElasticsearchIndexConfig(
91+
M=parameters["m"],
92+
efConstruction=parameters["ef_construction"],
93+
num_candidates=parameters["num_candidates"],
94+
),
95+
**parameters,
96+
)
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
from enum import Enum
2+
3+
from pydantic import BaseModel, SecretStr
4+
5+
from ..api import DBCaseConfig, DBConfig, IndexType, MetricType
6+
7+
8+
class TencentElasticsearchConfig(DBConfig, BaseModel):
9+
#: Protocol in use to connect to the node
10+
scheme: str = "http"
11+
host: str = ""
12+
port: int = 9200
13+
user: str = "elastic"
14+
password: SecretStr
15+
16+
def to_dict(self) -> dict:
17+
return {
18+
"hosts": [{"scheme": self.scheme, "host": self.host, "port": self.port}],
19+
"basic_auth": (self.user, self.password.get_secret_value()),
20+
}
21+
22+
23+
class ESElementType(str, Enum):
24+
float = "float" # 4 byte
25+
byte = "byte" # 1 byte, -128 to 127
26+
27+
28+
class TencentElasticsearchIndexConfig(BaseModel, DBCaseConfig):
29+
element_type: ESElementType = ESElementType.float
30+
index: IndexType = IndexType.TES_VSEARCH
31+
number_of_shards: int = 1
32+
number_of_replicas: int = 0
33+
refresh_interval: str = "3s"
34+
merge_max_thread_count: int = 8
35+
use_rescore: bool = False
36+
oversample_ratio: float = 2.0
37+
use_routing: bool = False
38+
use_force_merge: bool = True
39+
40+
metric_type: MetricType | None = None
41+
efConstruction: int | None = None
42+
M: int | None = None
43+
num_candidates: int | None = None
44+
45+
def __eq__(self, obj: any):
46+
return (
47+
self.index == obj.index
48+
and self.number_of_shards == obj.number_of_shards
49+
and self.number_of_replicas == obj.number_of_replicas
50+
and self.use_routing == obj.use_routing
51+
and self.efConstruction == obj.efConstruction
52+
and self.M == obj.M
53+
)
54+
55+
def __hash__(self) -> int:
56+
return hash(
57+
(
58+
self.index,
59+
self.number_of_shards,
60+
self.number_of_replicas,
61+
self.use_routing,
62+
self.efConstruction,
63+
self.M,
64+
2,
65+
)
66+
)
67+
68+
def parse_metric(self) -> str:
69+
if self.metric_type == MetricType.L2:
70+
return "l2_norm"
71+
if self.metric_type == MetricType.IP:
72+
return "dot_product"
73+
return "cosine"
74+
75+
def index_param(self) -> dict:
76+
return {
77+
"type": "dense_vector",
78+
"index": True,
79+
"element_type": self.element_type.value,
80+
"similarity": self.parse_metric(),
81+
"index_options": {
82+
"type": self.index.value,
83+
"index": "hnsw",
84+
"m": self.M,
85+
"ef_construction": self.efConstruction,
86+
},
87+
}
88+
89+
def search_param(self) -> dict:
90+
return {
91+
"num_candidates": self.num_candidates,
92+
}
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
import logging
2+
import time
3+
from contextlib import contextmanager
4+
5+
from vectordb_bench.backend.filter import Filter, FilterOp
6+
7+
from ..elastic_cloud.elastic_cloud import ElasticCloud
8+
from .config import TencentElasticsearchIndexConfig
9+
10+
for logger in ("elasticsearch", "elastic_transport"):
11+
logging.getLogger(logger).setLevel(logging.WARNING)
12+
13+
log = logging.getLogger(__name__)
14+
15+
16+
SECONDS_WAITING_FOR_FORCE_MERGE_API_CALL_SEC = 30
17+
18+
19+
class TencentElasticsearch(ElasticCloud):
20+
supported_filter_types: list[FilterOp] = [
21+
FilterOp.NonFilter,
22+
FilterOp.NumGE,
23+
FilterOp.StrEqual,
24+
]
25+
26+
@contextmanager
27+
def init(self) -> None:
28+
"""connect to elasticsearch"""
29+
from elasticsearch import Elasticsearch
30+
31+
self.client = Elasticsearch(**self.db_config, request_timeout=1800)
32+
33+
yield
34+
self.client = None
35+
del self.client
36+
37+
def optimize(self, data_size: int | None = None):
38+
"""optimize will be called between insertion and search in performance cases."""
39+
assert self.client is not None, "should self.init() first"
40+
self.client.indices.refresh(index=self.indice)
41+
time.sleep(SECONDS_WAITING_FOR_FORCE_MERGE_API_CALL_SEC)
42+
if self.case_config.use_force_merge:
43+
force_merge_task_id = self.client.indices.forcemerge(
44+
index=self.indice,
45+
max_num_segments=1,
46+
wait_for_completion=False,
47+
)["task"]
48+
log.info(f"Elasticsearch force merge task id: {force_merge_task_id}")
49+
while True:
50+
time.sleep(SECONDS_WAITING_FOR_FORCE_MERGE_API_CALL_SEC)
51+
task_status = self.client.tasks.get(task_id=force_merge_task_id)
52+
if task_status["completed"]:
53+
return

vectordb_bench/cli/vectordbbench.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from ..backend.clients.qdrant_local.cli import QdrantLocal
1717
from ..backend.clients.redis.cli import Redis
1818
from ..backend.clients.s3_vectors.cli import S3Vectors
19+
from ..backend.clients.tencent_elasticsearch.cli import TencentElasticsearch
1920
from ..backend.clients.test.cli import Test
2021
from ..backend.clients.tidb.cli import TiDB
2122
from ..backend.clients.vespa.cli import Vespa
@@ -50,6 +51,7 @@
5051
cli.add_command(QdrantLocal)
5152
cli.add_command(BatchCli)
5253
cli.add_command(S3Vectors)
54+
cli.add_command(TencentElasticsearch)
5355

5456

5557
if __name__ == "__main__":

0 commit comments

Comments
 (0)