From 5f0f1a3b735d444356409761586261d77367ad43 Mon Sep 17 00:00:00 2001 From: "zhuangye.yxw" <2510035537@qq.com> Date: Wed, 20 May 2026 07:15:23 +0000 Subject: [PATCH 1/5] feat: add MCI hybrid workflow support - add the MCI index, HGraph hybrid overlay, and external KNNG import path - add eval/export tooling plus benchmark configs for filtered MCI comparisons - document MCI in English and Chinese and add a runnable hybrid example Signed-off-by: zhuangye.yxw <2510035537@qq.com> Assisted-by: GitHub Copilot:GPT-5.4 --- docs/docs/en/src/SUMMARY.md | 1 + docs/docs/en/src/guide/create_index.md | 1 + docs/docs/en/src/indexes/README.md | 1 + docs/docs/en/src/indexes/mci.md | 179 ++ .../docs/en/src/resources/index_parameters.md | 43 + docs/docs/zh/src/SUMMARY.md | 1 + docs/docs/zh/src/guide/create_index.md | 1 + docs/docs/zh/src/indexes/README.md | 1 + docs/docs/zh/src/indexes/mci.md | 175 ++ .../docs/zh/src/resources/index_parameters.md | 42 + .../cpp/322_feature_mci_hybrid_filter.cpp | 327 +++ examples/cpp/CMakeLists.txt | 3 + include/vsag/constants.h | 1 + include/vsag/index.h | 2 +- src/algorithm/mci.cpp | 2013 +++++++++++++++++ src/algorithm/mci.h | 206 ++ src/algorithm/mci_parameter.cpp | 168 ++ src/algorithm/mci_parameter.h | 83 + src/algorithm/mci_parameter_test.cpp | 127 ++ src/algorithm/mci_test.cpp | 377 +++ src/constants.cpp | 1 + src/factory/factory_test.cpp | 18 + src/factory/index_creators.cpp | 8 + src/inner_string_params.h | 2 + tools/eval/CMakeLists.txt | 11 + tools/eval/case/search_eval_case.cpp | 61 +- tools/eval/case/search_eval_case.h | 7 + tools/eval/export_knng.cpp | 295 +++ tools/eval/monitor/recall_monitor.cpp | 36 +- 29 files changed, 4173 insertions(+), 18 deletions(-) create mode 100644 docs/docs/en/src/indexes/mci.md create mode 100644 docs/docs/zh/src/indexes/mci.md create mode 100644 examples/cpp/322_feature_mci_hybrid_filter.cpp create mode 100644 src/algorithm/mci.cpp create mode 100644 src/algorithm/mci.h create mode 100644 
src/algorithm/mci_parameter.cpp create mode 100644 src/algorithm/mci_parameter.h create mode 100644 src/algorithm/mci_parameter_test.cpp create mode 100644 src/algorithm/mci_test.cpp create mode 100644 tools/eval/export_knng.cpp diff --git a/docs/docs/en/src/SUMMARY.md b/docs/docs/en/src/SUMMARY.md index 6addb3f18e..fe46bb1984 100644 --- a/docs/docs/en/src/SUMMARY.md +++ b/docs/docs/en/src/SUMMARY.md @@ -14,6 +14,7 @@ - [Overview](indexes/README.md) - [Index Parameters](resources/index_parameters.md) - [HGraph](indexes/hgraph.md) +- [MCI](indexes/mci.md) - [IVF](indexes/ivf.md) - [SINDI](indexes/sindi.md) - [Pyramid](indexes/pyramid.md) diff --git a/docs/docs/en/src/guide/create_index.md b/docs/docs/en/src/guide/create_index.md index b46cb2b410..0c9f106496 100644 --- a/docs/docs/en/src/guide/create_index.md +++ b/docs/docs/en/src/guide/create_index.md @@ -9,6 +9,7 @@ index-specific options. | Name | Description | Page | Example | |------|-------------|------|---------| | `hgraph` | Improved graph index with richer quantization options | [HGraph](../indexes/hgraph.md) | `examples/cpp/103_index_hgraph.cpp` | +| `mci` | Clique-based dense index with optional HGraph hybrid filtering | [MCI](../indexes/mci.md) | `examples/cpp/322_feature_mci_hybrid_filter.cpp` | | `ivf` | Inverted file with quantization | [IVF](../indexes/ivf.md) | `examples/cpp/106_index_ivf.cpp` | | `sindi` | Sparse-vector index (e.g. 
BM25, SPLADE) | [SINDI](../indexes/sindi.md) | `examples/cpp/109_index_sindi.cpp` | | `pyramid` | Multi-tenant / tag-partitioned graph index | [Pyramid](../indexes/pyramid.md) | `examples/cpp/107_index_pyramid.cpp` | diff --git a/docs/docs/en/src/indexes/README.md b/docs/docs/en/src/indexes/README.md index 5f3b03ee89..eb2cacc5ad 100644 --- a/docs/docs/en/src/indexes/README.md +++ b/docs/docs/en/src/indexes/README.md @@ -10,6 +10,7 @@ The pages in this section cover the actively developed indexes: | Index | Page | Best for | |-------|------|----------| | `hgraph` | [HGraph](hgraph.md) | General-purpose, high-recall graph with rich quantization options | +| `mci` | [MCI](mci.md) | Dense vectors with clique-based candidate routing and optional HGraph hybrid filtering | | `ivf` | [IVF](ivf.md) | Partition-based search, high-throughput batch queries, large corpora | | `sindi` | [SINDI](sindi.md) | Sparse vectors (BM25 / learned sparse) on inner-product | | `pyramid` | [Pyramid](pyramid.md) | Multi-tenant or tag-partitioned corpora with hierarchical paths | diff --git a/docs/docs/en/src/indexes/mci.md b/docs/docs/en/src/indexes/mci.md new file mode 100644 index 0000000000..8ea6539d5a --- /dev/null +++ b/docs/docs/en/src/indexes/mci.md @@ -0,0 +1,179 @@ +# MCI + +MCI is a dense-vector index in VSAG that combines a k-nearest-neighbor graph with a +maximal-clique candidate structure. Compared with a pure graph walk, MCI spends more work at +build time to organise neighbors into clique-like candidate groups, then uses those groups to +reduce the number of vectors scored at query time. + +MCI also provides an optional **HGraph hybrid overlay** for filtered search. In that mode, MCI +remains the serialized primary index, while a separate HGraph index can be loaded through +`hgraph_index_path` and used only when the filter is broad enough. 
+ +- Source: `src/algorithm/mci.{h,cpp}` +- Example: [`examples/cpp/322_feature_mci_hybrid_filter.cpp`](https://github.com/antgroup/vsag/blob/main/examples/cpp/322_feature_mci_hybrid_filter.cpp) + +## How it works + +1. **Build or import a KNN graph.** MCI starts from a candidate graph whose degree is capped by + `mcs`. If `knng_path` is empty, MCI derives the graph internally with ODescent. If + `knng_path` is set, it reads a fixed-width binary KNNG file instead. +2. **Enumerate clique candidates.** The graph is reorganised into maximal-clique style groups, + bounded by `clique_max`, so each node can jump to a compact candidate set during search. +3. **Score within candidate sets.** At query time MCI seeds the search with `seed_count`, scans + clique candidates, and then optionally reorders the final heap if `use_reorder` is enabled. +4. **Route broad filters to HGraph when configured.** If `use_hgraph_hybrid` is enabled and the + filter's `ValidRatio()` is greater than or equal to `hgraph_valid_ratio_threshold`, MCI can + forward the request to the external HGraph index instead of using the clique path. 
+ +## Quick start + +### Build a plain MCI index + +```cpp +#include <vsag/vsag.h> + +std::string params = R"({ + "dtype": "float32", + "metric_type": "l2", + "dim": 128, + "index_param": { + "base_quantization_type": "sq8", + "base_codes_type": "flatten", + "max_degree": 32, + "mcs": 200, + "clique_max": 50 + } +})"; + +auto index = vsag::Factory::CreateIndex("mci", params).value(); + +auto base = vsag::Dataset::Make(); +base->NumElements(n)->Dim(128)->Ids(ids)->Float32Vectors(data)->Owner(false); +index->Build(base); + +auto query = vsag::Dataset::Make(); +query->NumElements(1)->Dim(128)->Float32Vectors(q)->Owner(false); +auto result = index->KnnSearch( + query, 10, R"({"mci": {"ef_search": 80, "seed_count": 32}})").value(); +``` + +### Enable the HGraph hybrid overlay + +```cpp +std::string hybrid_params = R"({ + "dtype": "float32", + "metric_type": "l2", + "dim": 128, + "index_param": { + "base_quantization_type": "sq8", + "base_codes_type": "flatten", + "max_degree": 32, + "mcs": 200, + "clique_max": 50, + "use_hgraph_hybrid": true, + "hgraph_valid_ratio_threshold": 0.2, + "hgraph_index_path": "/path/to/hgraph.index", + "hgraph_ef_search": 100, + "hgraph_index_param": { + "base_quantization_type": "fp32", + "graph_type": "odescent", + "max_degree": 32, + "alpha": 1.2, + "graph_iter_turn": 20, + "neighbor_sample_rate": 0.2 + } + } +})"; + +auto hybrid = vsag::Factory::CreateIndex("mci", hybrid_params).value(); +std::ifstream input("/path/to/mci.index", std::ios::binary); +hybrid->Deserialize(input); +``` + +Hybrid is **not** a separate on-disk index type. The on-disk primary index is still the MCI +serialization; `hgraph_index_path` points to the external HGraph companion index loaded by the +overlay. + +## Build parameters + +MCI uses the generic `index_param` object for build-time parameters. 
+ +| Parameter | Type | Typical value | Description | +|-----------|------|---------------|-------------| +| `base_quantization_type` | string | `fp32`, `sq8`, `rabitq` | Quantization used for the base storage | +| `base_codes_type` | string | `flatten` | Base code layout used by the flat data cell | +| `max_degree` | int | `16`-`48` | Maximum out-degree of the clique/search graph | +| `mcs` | int | `64`-`256` | Candidate budget used when building or importing the KNN graph | +| `clique_max` | int | `16`-`64` | Upper bound on the size of a clique candidate group | +| `alpha` | float | `1.2` | ODescent expansion factor when MCI builds its own KNN graph | +| `knng_path` | string | empty | Optional fixed-width binary KNNG file; if unset, MCI builds the graph internally | +| `clique_path` | string | empty | Optional precomputed clique index file | +| `use_reorder` | bool | `false` | Keep a higher-precision copy and rerank final candidates | + +### KNNG file format + +When `knng_path` is provided, MCI expects a binary file with these properties: + +- no header +- one fixed-width row per base vector +- each row stores neighbor ids as `uint32_t` / `InnerIdType` +- all rows have the same degree + +The example [`examples/cpp/322_feature_mci_hybrid_filter.cpp`](https://github.com/antgroup/vsag/blob/main/examples/cpp/322_feature_mci_hybrid_filter.cpp) +shows one way to derive such a file from an HGraph index. + +## Search parameters + +Search-time parameters live under the `mci` object. 
+ +| Parameter | Type | Description | +|-----------|------|-------------| +| `ef_search` | int | Number of retained candidates during MCI search | +| `seed_count` | int | Number of seed ids collected before clique expansion | +| `hops_limit` | int | Optional safety cap for graph expansion hops | +| `rabitq_one_bit_search` | bool | Enable RabitQ lower-bound search mode when the underlying codes support it | + +```cpp +auto result = index->KnnSearch( + query, 10, R"({"mci": {"ef_search": 120, "seed_count": 64}})").value(); +``` + +## HGraph hybrid overlay + +The hybrid overlay is meant for **filtered KNN** rather than plain unfiltered search. + +### Routing rule + +MCI routes a filtered request to HGraph only when all of the following are true: + +- `use_hgraph_hybrid` is `true` +- the HGraph companion index is loaded and has the same size as the MCI index +- the request uses a `Filter` object rather than a bitset-only blacklist +- `filter->ValidRatio()` is greater than or equal to `hgraph_valid_ratio_threshold` + +Otherwise the request stays on the normal MCI path. + +### Hybrid-specific build parameters + +| Parameter | Type | Description | +|-----------|------|-------------| +| `use_hgraph_hybrid` | bool | Enable HGraph-assisted filtered-search routing | +| `hgraph_valid_ratio_threshold` | float | Minimum valid ratio required before routing to HGraph | +| `hgraph_index_path` | string | Path to the serialized external HGraph index | +| `hgraph_ef_search` | int | Default HGraph `ef_search` when the request doesn't supply an `hgraph` search object | +| `hgraph_index_param` | object | Build parameters used to instantiate the companion HGraph index before loading it | + +When a query runs, the result statistics include `mci_hybrid_route`, +`mci_hybrid_valid_ratio`, and `mci_hybrid_threshold`, which are useful when checking whether a +filter actually took the HGraph route. 
+ +## When to use MCI + +- Dense-vector workloads where you want a compact candidate structure instead of a pure graph walk. +- Pipelines that already have an offline KNN graph and want to reuse it through `knng_path`. +- Filtered-search scenarios where narrow predicates stay on MCI, but broader predicates can reuse + an existing HGraph index through the hybrid overlay. + +If your workload is mostly unfiltered and graph-first, compare against [HGraph](hgraph.md). If +your main need is vector + structured attributes rather than id-based filter objects, also see +[Attribute Filter (Hybrid Search)](../advanced/attribute_filter.md). \ No newline at end of file diff --git a/docs/docs/en/src/resources/index_parameters.md b/docs/docs/en/src/resources/index_parameters.md index 9b32faa877..0377b9f7f9 100644 --- a/docs/docs/en/src/resources/index_parameters.md +++ b/docs/docs/en/src/resources/index_parameters.md @@ -77,6 +77,49 @@ At search time: {"hgraph": {"ef_search": 100}} ``` +## MCI + +MCI also uses the generic `index_param` key for build-time parameters, and the `mci` key for +search-time parameters. 
+ +```json +{ + "dim": 128, + "dtype": "float32", + "metric_type": "l2", + "index_param": { + "base_quantization_type": "sq8", + "base_codes_type": "flatten", + "max_degree": 32, + "mcs": 200, + "clique_max": 50, + "knng_path": "", + "use_hgraph_hybrid": false, + "hgraph_valid_ratio_threshold": 1.0, + "hgraph_index_path": "" + } +} +``` + +| Field | Typical | Description | +|-------|---------|-------------| +| `max_degree` | 16-48 | Maximum out-degree of the candidate graph | +| `mcs` | 64-256 | Candidate budget used when building or importing the KNN graph | +| `clique_max` | 16-64 | Maximum clique candidate size | +| `alpha` | 1.2 | ODescent expansion factor when building the graph internally | +| `knng_path` | path or empty | Optional fixed-width binary KNN graph file | +| `clique_path` | path or empty | Optional precomputed clique index | +| `use_hgraph_hybrid` | bool | Enable filtered-search routing to an external HGraph index | +| `hgraph_valid_ratio_threshold` | 0.0-1.0 | Minimum valid ratio required before routing to HGraph | +| `hgraph_index_path` | path | Serialized HGraph companion index | +| `hgraph_ef_search` | 32-200 | Default HGraph `ef_search` for hybrid-routed queries | + +At search time: + +```json +{"mci": {"ef_search": 80, "seed_count": 32}} +``` + ## DiskANN ```json diff --git a/docs/docs/zh/src/SUMMARY.md b/docs/docs/zh/src/SUMMARY.md index 65591e4234..d5a3c60399 100644 --- a/docs/docs/zh/src/SUMMARY.md +++ b/docs/docs/zh/src/SUMMARY.md @@ -14,6 +14,7 @@ - [总览](indexes/README.md) - [索引参数](resources/index_parameters.md) - [HGraph](indexes/hgraph.md) +- [MCI](indexes/mci.md) - [IVF](indexes/ivf.md) - [SINDI](indexes/sindi.md) - [Pyramid](indexes/pyramid.md) diff --git a/docs/docs/zh/src/guide/create_index.md b/docs/docs/zh/src/guide/create_index.md index b4515e19c8..bdc548a213 100644 --- a/docs/docs/zh/src/guide/create_index.md +++ b/docs/docs/zh/src/guide/create_index.md @@ -10,6 +10,7 @@ VSAG 中所有检索能力都围绕 `Index` 接口展开。要使用某种索引 | 名称 | `name` 字符串 | 
文档 | 适用场景 | | -------------- | --------------- | ------------------------------------- | --------------------------------------------------- | | HGraph | `hgraph` | [HGraph](../indexes/hgraph.md) | VSAG 自研图索引,支持多级量化和调优(详见 `examples/cpp/103_index_hgraph.cpp`) | +| MCI | `mci` | [MCI](../indexes/mci.md) | 基于 clique 候选组织的稠密向量索引,并支持可选 HGraph Hybrid 过滤 | | IVF | `ivf` | [IVF](../indexes/ivf.md) | 倒排索引,适合大 `k` 和批量查询 | | SINDI | `sindi` | [SINDI](../indexes/sindi.md) | 稀疏向量上的倒排索引 | | Pyramid | `pyramid` | [Pyramid](../indexes/pyramid.md) | 多层级 / 按路径分区的索引结构 | diff --git a/docs/docs/zh/src/indexes/README.md b/docs/docs/zh/src/indexes/README.md index 1bd5ced68b..824f3a6560 100644 --- a/docs/docs/zh/src/indexes/README.md +++ b/docs/docs/zh/src/indexes/README.md @@ -9,6 +9,7 @@ VSAG 提供了一系列索引实现,它们共享同一套构建式 API、同 | 索引 | 文档 | 适用场景 | |------|------|---------| | `hgraph` | [HGraph](hgraph.md) | 通用高召回图索引,量化选项丰富 | +| `mci` | [MCI](mci.md) | 使用 clique 候选组织的稠密向量索引,并支持可选 HGraph Hybrid 过滤 | | `ivf` | [IVF](ivf.md) | 基于分桶的检索,适合高吞吐批查询与超大规模语料 | | `sindi` | [SINDI](sindi.md) | 稀疏向量(BM25 / 学习稀疏)上的内积检索 | | `pyramid` | [Pyramid](pyramid.md) | 多租户 / 标签分区的层级索引 | diff --git a/docs/docs/zh/src/indexes/mci.md b/docs/docs/zh/src/indexes/mci.md new file mode 100644 index 0000000000..b79e4fdae1 --- /dev/null +++ b/docs/docs/zh/src/indexes/mci.md @@ -0,0 +1,175 @@ +# MCI + +MCI 是 VSAG 中面向稠密向量的索引,它把 K 近邻图和极大团候选结构组合在一起。与纯图游走相比, +MCI 会在构建阶段多做一层候选组织,把邻接关系整理成 clique 风格的候选组,以减少查询阶段真正 +需要打分的向量数。 + +MCI 还提供可选的 **HGraph Hybrid overlay**,用于过滤检索。在这种模式下,序列化落盘的主索引 +仍然是 MCI;HGraph 作为一个独立索引通过 `hgraph_index_path` 加载,只在过滤范围足够宽时参与检索。 + +- 源码:`src/algorithm/mci.{h,cpp}` +- 示例:[`examples/cpp/322_feature_mci_hybrid_filter.cpp`](https://github.com/antgroup/vsag/blob/main/examples/cpp/322_feature_mci_hybrid_filter.cpp) + +## 工作原理 + +1. **构建或导入 KNN 图。** MCI 从一个候选图开始,图的度数上限由 `mcs` 控制。若 `knng_path` + 为空,MCI 会通过 ODescent 在内部构图;若设置了 `knng_path`,则直接读取外部固定宽度的二进制 + KNNG 文件。 +2. 
**枚举 clique 候选。** 在候选图之上继续整理出极大团风格的候选组,组大小受 `clique_max` + 限制,便于查询时快速跳到紧凑的候选集合。 +3. **在候选组内打分。** 查询阶段,MCI 先按 `seed_count` 收集种子点,再扩展 clique 候选,若 + 启用了 `use_reorder`,则会对最终候选再做一次精排。 +4. **满足条件时切到 HGraph。** 若启用了 `use_hgraph_hybrid`,且过滤器的 `ValidRatio()` 大于等于 + `hgraph_valid_ratio_threshold`,MCI 就可以把这次请求转发给外部 HGraph,而不是继续走自身的 + clique 搜索路径。 + +## 快速开始 + +### 构建普通 MCI 索引 + +```cpp +#include <vsag/vsag.h> + +std::string params = R"({ + "dtype": "float32", + "metric_type": "l2", + "dim": 128, + "index_param": { + "base_quantization_type": "sq8", + "base_codes_type": "flatten", + "max_degree": 32, + "mcs": 200, + "clique_max": 50 + } +})"; + +auto index = vsag::Factory::CreateIndex("mci", params).value(); + +auto base = vsag::Dataset::Make(); +base->NumElements(n)->Dim(128)->Ids(ids)->Float32Vectors(data)->Owner(false); +index->Build(base); + +auto query = vsag::Dataset::Make(); +query->NumElements(1)->Dim(128)->Float32Vectors(q)->Owner(false); +auto result = index->KnnSearch( + query, 10, R"({"mci": {"ef_search": 80, "seed_count": 32}})").value(); +``` + +### 启用 HGraph Hybrid overlay + +```cpp +std::string hybrid_params = R"({ + "dtype": "float32", + "metric_type": "l2", + "dim": 128, + "index_param": { + "base_quantization_type": "sq8", + "base_codes_type": "flatten", + "max_degree": 32, + "mcs": 200, + "clique_max": 50, + "use_hgraph_hybrid": true, + "hgraph_valid_ratio_threshold": 0.2, + "hgraph_index_path": "/path/to/hgraph.index", + "hgraph_ef_search": 100, + "hgraph_index_param": { + "base_quantization_type": "fp32", + "graph_type": "odescent", + "max_degree": 32, + "alpha": 1.2, + "graph_iter_turn": 20, + "neighbor_sample_rate": 0.2 + } + } +})"; + +auto hybrid = vsag::Factory::CreateIndex("mci", hybrid_params).value(); +std::ifstream input("/path/to/mci.index", std::ios::binary); +hybrid->Deserialize(input); +``` + +Hybrid **不是** 一个独立的落盘索引类型。真正序列化到磁盘的仍是 MCI 本体; +`hgraph_index_path` 指向的是由 overlay 在加载时额外打开的 HGraph 配套索引。 + +## 构建参数 + +MCI 的构建参数放在通用的 `index_param` 对象下。 + +| 参数 | 类型 | 
典型值 | 说明 | +|------|------|--------|------| +| `base_quantization_type` | string | `fp32`、`sq8`、`rabitq` | 主存储使用的量化方式 | +| `base_codes_type` | string | `flatten` | flat data cell 使用的底层编码布局 | +| `max_degree` | int | `16`-`48` | clique / 搜索图的最大出度 | +| `mcs` | int | `64`-`256` | 构建或导入 KNN 图时使用的候选预算 | +| `clique_max` | int | `16`-`64` | 单个 clique 候选组的大小上限 | +| `alpha` | float | `1.2` | 当 MCI 自建 KNN 图时,ODescent 使用的扩张系数 | +| `knng_path` | string | 空 | 可选的固定宽度二进制 KNNG 文件;为空时由 MCI 内部构图 | +| `clique_path` | string | 空 | 可选的预计算 clique 索引文件 | +| `use_reorder` | bool | `false` | 是否保留更高精度副本并对最终候选精排 | + +### KNNG 文件格式 + +当设置 `knng_path` 时,MCI 期望的二进制文件满足: + +- 无文件头 +- 每个 base 向量对应一行固定宽度记录 +- 每行按 `uint32_t` / `InnerIdType` 存储邻居 id +- 所有行的度数一致 + +[`examples/cpp/322_feature_mci_hybrid_filter.cpp`](https://github.com/antgroup/vsag/blob/main/examples/cpp/322_feature_mci_hybrid_filter.cpp) +展示了如何从 HGraph 检索结果导出这样一份 `.knng` 文件。 + +## 搜索参数 + +查询参数放在 `mci` 对象下。 + +| 参数 | 类型 | 说明 | +|------|------|------| +| `ef_search` | int | MCI 查询阶段保留的候选数 | +| `seed_count` | int | clique 扩展前收集的种子 id 数 | +| `hops_limit` | int | 可选的搜索 hop 数上限 | +| `rabitq_one_bit_search` | bool | 当底层编码支持时启用 RabitQ lower-bound 搜索模式 | + +```cpp +auto result = index->KnnSearch( + query, 10, R"({"mci": {"ef_search": 120, "seed_count": 64}})").value(); +``` + +## HGraph Hybrid overlay + +Hybrid overlay 面向的是 **带过滤的 KNN 检索**,而不是普通无过滤检索。 + +### 路由规则 + +仅当以下条件同时满足时,MCI 才会把带过滤请求路由到 HGraph: + +- `use_hgraph_hybrid` 为 `true` +- HGraph 配套索引已经成功加载,且元素数量与 MCI 一致 +- 请求使用的是 `Filter` 对象,而不是单独的 bitset 黑名单过滤 +- `filter->ValidRatio()` 大于等于 `hgraph_valid_ratio_threshold` + +否则,请求仍走普通 MCI 路径。 + +### Hybrid 专用参数 + +| 参数 | 类型 | 说明 | +|------|------|------| +| `use_hgraph_hybrid` | bool | 开启 HGraph 辅助的过滤检索路由 | +| `hgraph_valid_ratio_threshold` | float | 只有当 valid ratio 达到该阈值时才切到 HGraph | +| `hgraph_index_path` | string | 外部 HGraph 序列化文件路径 | +| `hgraph_ef_search` | int | 当请求里没有显式 `hgraph` 搜索参数时使用的默认 `ef_search` | +| `hgraph_index_param` | object | 
在加载外部 HGraph 前,用于实例化该索引对象的构建参数 | + +查询完成后,结果统计信息会包含 `mci_hybrid_route`、`mci_hybrid_valid_ratio` 和 +`mci_hybrid_threshold`,便于确认这次过滤请求是否真的走了 HGraph 路线。 + +## 何时使用 MCI + +- 稠密向量场景,希望使用更紧凑的候选结构,而不是纯图游走。 +- 已经离线构建了 KNN 图,希望通过 `knng_path` 复用这份图数据。 +- 带过滤检索场景:窄过滤仍走 MCI 自身,宽过滤(broad filter)则可通过 Hybrid overlay 复用现有 + HGraph 索引。 + +如果你的工作负载大多是不带过滤的图检索,建议与 [HGraph](hgraph.md) 对比评估。如果你的主要需求 +是向量 + 结构化属性谓词,而不是基于 id 的 `Filter` 对象,也可以参考 +[属性过滤(混合搜索)](../advanced/attribute_filter.md)。 \ No newline at end of file diff --git a/docs/docs/zh/src/resources/index_parameters.md b/docs/docs/zh/src/resources/index_parameters.md index b8abd583f2..c926ead41e 100644 --- a/docs/docs/zh/src/resources/index_parameters.md +++ b/docs/docs/zh/src/resources/index_parameters.md @@ -75,6 +75,48 @@ HGraph 的构建参数使用通用的 `index_param` 键(参见 `examples/cpp/1 {"hgraph": {"ef_search": 100}} ``` +## MCI + +MCI 的构建参数同样放在通用的 `index_param` 下,而查询参数放在 `mci` 子对象里。 + +```json +{ + "dim": 128, + "dtype": "float32", + "metric_type": "l2", + "index_param": { + "base_quantization_type": "sq8", + "base_codes_type": "flatten", + "max_degree": 32, + "mcs": 200, + "clique_max": 50, + "knng_path": "", + "use_hgraph_hybrid": false, + "hgraph_valid_ratio_threshold": 1.0, + "hgraph_index_path": "" + } +} +``` + +| 字段 | 典型值 | 说明 | +|------|-------|------| +| `max_degree` | 16-48 | 候选图的最大出度 | +| `mcs` | 64-256 | 构建或导入 KNN 图时使用的候选预算 | +| `clique_max` | 16-64 | 单个 clique 候选组大小上限 | +| `alpha` | 1.2 | 内部自建图时 ODescent 的扩张系数 | +| `knng_path` | 路径或空 | 可选的固定宽度二进制 KNN 图文件 | +| `clique_path` | 路径或空 | 可选的预计算 clique 索引 | +| `use_hgraph_hybrid` | bool | 开启到外部 HGraph 的过滤检索路由 | +| `hgraph_valid_ratio_threshold` | 0.0-1.0 | 只有 valid ratio 达阈值时才会切到 HGraph | +| `hgraph_index_path` | 路径 | HGraph 配套索引的序列化文件 | +| `hgraph_ef_search` | 32-200 | Hybrid 路由到 HGraph 时使用的默认 `ef_search` | + +搜索时: + +```json +{"mci": {"ef_search": 80, "seed_count": 32}} +``` + ## DiskANN ```json diff --git a/examples/cpp/322_feature_mci_hybrid_filter.cpp 
b/examples/cpp/322_feature_mci_hybrid_filter.cpp new file mode 100644 index 0000000000..1a79106a2f --- /dev/null +++ b/examples/cpp/322_feature_mci_hybrid_filter.cpp @@ -0,0 +1,327 @@ +// Copyright 2024-present the vsag project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace { + +constexpr int64_t kDim = 16; +constexpr int64_t kCount = 128; +constexpr uint64_t kKnngDegree = 8; +constexpr int64_t kTopK = 5; + +const std::string kHGraphSearchParams = R"( +{ + "hgraph": { + "ef_search": 32 + } +} +)"; + +const std::string kMciSearchParams = R"( +{ + "mci": { + "ef_search": 16, + "seed_count": 8 + } +} +)"; + +void +check(bool condition, const std::string& message) { + if (not condition) { + throw std::runtime_error(message); + } +} + +template +void +check_expected(const tl::expected& result, const std::string& prefix) { + if (not result.has_value()) { + throw std::runtime_error(prefix + result.error().message); + } +} + +std::string +make_hgraph_params() { + std::ostringstream builder; + builder << R"({ + "dtype": "float32", + "metric_type": "l2", + "dim": )" + << kDim + << R"(, + "index_param": { + "base_quantization_type": "fp32", + "graph_type": "odescent", + "max_degree": 8, + "alpha": 1.2, + "graph_iter_turn": 10, + "neighbor_sample_rate": 0.2 + } +})"; + return builder.str(); +} + +std::string +make_mci_params(const std::string& 
knng_path) { + std::ostringstream builder; + builder << R"({ + "dtype": "float32", + "metric_type": "l2", + "dim": )" + << kDim + << R"(, + "index_param": { + "max_degree": 8, + "mcs": )" + << kKnngDegree + << R"(, + "clique_max": 8, + "build_thread_count": 2, + "knng_path": ")" + << knng_path + << R"(" + } +})"; + return builder.str(); +} + +std::string +make_mci_hybrid_params(const std::string& knng_path, const std::string& hgraph_index_path) { + std::ostringstream builder; + builder << R"({ + "dtype": "float32", + "metric_type": "l2", + "dim": )" + << kDim + << R"(, + "index_param": { + "max_degree": 8, + "mcs": )" + << kKnngDegree + << R"(, + "clique_max": 8, + "build_thread_count": 2, + "knng_path": ")" + << knng_path + << R"(", + "use_hgraph_hybrid": true, + "hgraph_valid_ratio_threshold": 0.4, + "hgraph_index_path": ")" + << hgraph_index_path + << R"(", + "hgraph_ef_search": 32, + "hgraph_index_param": { + "base_quantization_type": "fp32", + "graph_type": "odescent", + "max_degree": 8, + "alpha": 1.2, + "graph_iter_turn": 10, + "neighbor_sample_rate": 0.2 + } + } +})"; + return builder.str(); +} + +class MinIdFilter : public vsag::Filter { +public: + explicit MinIdFilter(int64_t min_id) : min_id_(min_id) { + valid_ids_.reserve(static_cast(kCount - min_id)); + for (int64_t id = min_id; id < kCount; ++id) { + valid_ids_.push_back(id); + } + valid_ratio_ = static_cast(valid_ids_.size()) / static_cast(kCount); + } + + [[nodiscard]] bool + CheckValid(int64_t id) const override { + return id >= min_id_; + } + + [[nodiscard]] float + ValidRatio() const override { + return valid_ratio_; + } + + void + GetValidIds(const int64_t** valid_ids, int64_t& count) const override { + *valid_ids = valid_ids_.data(); + count = static_cast(valid_ids_.size()); + } + +private: + int64_t min_id_; + float valid_ratio_{1.0F}; + std::vector valid_ids_{}; +}; + +void +serialize_index(const vsag::IndexPtr& index, const std::string& path) { + std::ofstream output(path, std::ios::binary); 
+ check(output.good(), "failed to open index file for writing: " + path); + auto serialize_result = index->Serialize(output); + check_expected(serialize_result, "failed to serialize index: "); +} + +void +export_knng_from_hgraph(const vsag::IndexPtr& hgraph, + const std::vector& vectors, + const std::string& knng_path) { + std::ofstream output(knng_path, std::ios::binary); + check(output.good(), "failed to open KNNG file for writing: " + knng_path); + + const auto query_k = static_cast(std::min(kCount, kKnngDegree + 1)); + for (int64_t row = 0; row < kCount; ++row) { + auto query = vsag::Dataset::Make(); + query->NumElements(1) + ->Dim(kDim) + ->Float32Vectors(const_cast(vectors.data() + row * kDim)) + ->Owner(false); + + auto search_result = hgraph->KnnSearch(query, query_k, kHGraphSearchParams); + check_expected(search_result, + "failed to export KNNG row " + std::to_string(row) + ": "); + + std::vector neighbors; + neighbors.reserve(kKnngDegree); + auto result = search_result.value(); + for (int64_t rank = 0; + rank < result->GetDim() && static_cast(neighbors.size()) < kKnngDegree; + ++rank) { + auto neighbor_id = result->GetIds()[rank]; + if (neighbor_id < 0 || neighbor_id >= kCount || neighbor_id == row) { + continue; + } + auto neighbor = static_cast(neighbor_id); + if (std::find(neighbors.begin(), neighbors.end(), neighbor) != neighbors.end()) { + continue; + } + neighbors.push_back(neighbor); + } + + check(not neighbors.empty(), + "HGraph search produced no usable neighbors for row " + std::to_string(row)); + while (static_cast(neighbors.size()) < kKnngDegree) { + neighbors.push_back(neighbors.back()); + } + + output.write(reinterpret_cast(neighbors.data()), + static_cast(kKnngDegree * sizeof(uint32_t))); + check(output.good(), "failed to write KNNG row " + std::to_string(row)); + } +} + +std::vector +generate_vectors() { + std::vector vectors(kCount * kDim, 0.0F); + std::mt19937 rng(47); + std::uniform_real_distribution noise(0.0F, 0.02F); + for (int64_t 
row = 0; row < kCount; ++row) { + for (int64_t col = 0; col < kDim; ++col) { + const auto block = static_cast(row / 16); + const auto lane = static_cast(row % 16) * 0.05F; + vectors[row * kDim + col] = block + lane + static_cast(col) * 0.01F + + noise(rng); + } + } + return vectors; +} + +} // namespace + +int +main() { + vsag::init(); + + const auto work_dir = std::filesystem::path("/tmp/vsag_mci_hybrid_example"); + std::filesystem::create_directories(work_dir); + const auto hgraph_path = (work_dir / "hgraph.index").string(); + const auto knng_path = (work_dir / "hgraph.knng").string(); + const auto mci_path = (work_dir / "mci.index").string(); + + std::vector ids(kCount); + for (int64_t id = 0; id < kCount; ++id) { + ids[id] = id; + } + auto vectors = generate_vectors(); + + auto base = vsag::Dataset::Make(); + base->NumElements(kCount) + ->Dim(kDim) + ->Ids(ids.data()) + ->Float32Vectors(vectors.data()) + ->Owner(false); + + auto hgraph = vsag::Factory::CreateIndex("hgraph", make_hgraph_params()); + check_expected(hgraph, "failed to create HGraph index: "); + auto hgraph_build = hgraph.value()->Build(base); + check_expected(hgraph_build, "failed to build HGraph index: "); + serialize_index(hgraph.value(), hgraph_path); + export_knng_from_hgraph(hgraph.value(), vectors, knng_path); + + auto mci = vsag::Factory::CreateIndex("mci", make_mci_params(knng_path)); + check_expected(mci, "failed to create MCI index: "); + auto mci_build = mci.value()->Build(base); + check_expected(mci_build, "failed to build MCI index: "); + serialize_index(mci.value(), mci_path); + + auto hybrid = + vsag::Factory::CreateIndex("mci", make_mci_hybrid_params(knng_path, hgraph_path)); + check_expected(hybrid, "failed to create Hybrid overlay: "); + std::ifstream input(mci_path, std::ios::binary); + check(input.good(), "failed to open MCI index for loading: " + mci_path); + auto deserialize_result = hybrid.value()->Deserialize(input); + check_expected(deserialize_result, "failed to load MCI 
index into Hybrid overlay: "); + + auto query = vsag::Dataset::Make(); + query->NumElements(1) + ->Dim(kDim) + ->Float32Vectors(vectors.data() + 96 * kDim) + ->Owner(false); + + auto filter = std::make_shared(64); + auto search_result = hybrid.value()->KnnSearch(query, kTopK, kMciSearchParams, filter); + check_expected(search_result, "failed to run Hybrid filtered search: "); + + auto result = search_result.value(); + auto stats = result->GetStatistics({"mci_hybrid_route", "mci_hybrid_valid_ratio"}); + check(stats.size() == 2, "Hybrid search did not return expected statistics"); + check(stats[0].find("hgraph") != std::string::npos, + "expected Hybrid query to route to HGraph, got: " + stats[0]); + + std::cout << "HGraph index: " << hgraph_path << std::endl; + std::cout << "Derived KNNG: " << knng_path << std::endl; + std::cout << "MCI index: " << mci_path << std::endl; + std::cout << "Hybrid route: " << stats[0] << std::endl; + std::cout << "Hybrid valid ratio: " << stats[1] << std::endl; + std::cout << "Filtered results:" << std::endl; + for (int64_t rank = 0; rank < result->GetDim(); ++rank) { + check(result->GetIds()[rank] >= 64, "filtered result contains an invalid id"); + std::cout << result->GetIds()[rank] << " " << result->GetDistances()[rank] << std::endl; + } + + return 0; +} \ No newline at end of file diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt index 8e6cacc9e7..14312c945b 100644 --- a/examples/cpp/CMakeLists.txt +++ b/examples/cpp/CMakeLists.txt @@ -124,3 +124,6 @@ target_link_libraries (501_quantization_transform vsag) add_executable (321_index_fp16_hgraph 321_index_fp16_hgraph.cpp) target_link_libraries (321_index_fp16_hgraph vsag) + +add_executable(322_feature_mci_hybrid_filter 322_feature_mci_hybrid_filter.cpp) +target_link_libraries(322_feature_mci_hybrid_filter vsag) diff --git a/include/vsag/constants.h b/include/vsag/constants.h index 9c133dd069..d06540429a 100644 --- a/include/vsag/constants.h +++ 
b/include/vsag/constants.h @@ -26,6 +26,7 @@ extern const char* const INDEX_SINDI; extern const char* const INDEX_BRUTE_FORCE; extern const char* const INDEX_IVF; extern const char* const INDEX_WARP; +extern const char* const INDEX_MCI; extern const char* const DIM; extern const char* const NUM_ELEMENTS; extern const char* const IDS; diff --git a/include/vsag/index.h b/include/vsag/index.h index d4f7ea9950..fd63ce5cc4 100644 --- a/include/vsag/index.h +++ b/include/vsag/index.h @@ -51,7 +51,7 @@ struct MergeUnit { IdMapFunction id_map_func = nullptr; }; -enum class IndexType { HNSW, DISKANN, HGRAPH, IVF, PYRAMID, BRUTEFORCE, SPARSE, SINDI, WARP }; +enum class IndexType { HNSW, DISKANN, HGRAPH, IVF, PYRAMID, BRUTEFORCE, SPARSE, SINDI, WARP, MCI }; #define DATA_FLAG_FLOAT32_VECTOR 0x01 #define DATA_FLAG_INT8_VECTOR 0x02 diff --git a/src/algorithm/mci.cpp b/src/algorithm/mci.cpp new file mode 100644 index 0000000000..c47c54706e --- /dev/null +++ b/src/algorithm/mci.cpp @@ -0,0 +1,2013 @@ +// Copyright 2024-present the vsag project +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "mci.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" +#include "datacell/flatten_datacell_parameter.h" +#include "datacell/graph_interface.h" +#include "datacell/sparse_graph_datacell_parameter.h" +#include "dataset_impl.h" +#include "hgraph.h" +#include "impl/filter/black_list_filter.h" +#include "impl/filter/combined_filter.h" +#include "impl/filter/inner_id_wrapper_filter.h" +#include "impl/heap/standard_heap.h" +#include "impl/odescent/odescent_graph_builder.h" +#include "impl/odescent/odescent_graph_parameter.h" +#include "impl/reorder/flatten_reorder.h" +#include "index_common_param.h" +#include "index_feature_list.h" +#include "inner_string_params.h" +#include "io/memory_io_parameter.h" +#include "quantization/scalar_quantization/scalar_quantizer_parameter.h" +#include "storage/serialization.h" +#include "utils/util_functions.h" +#include "vsag/constants.h" + +namespace vsag { +namespace { + +static const std::string MCI_PARAMS_TEMPLATE = R"( + { + "type": "mci", + "use_reorder": false, + "reorder_source": "precise", + "max_degree": 32, + "mcs": 200, + "clique_max": 50, + "alpha": 1.2, + "knng_path": "", + "clique_path": "", + "use_hgraph_hybrid": false, + "hgraph_valid_ratio_threshold": 1.0, + "hgraph_index_path": "", + "hgraph_ef_search": 100, + "base_codes": { + "io_params": { + "type": "memory_io", + "file_path": "./default_file_path" + }, + "codes_type": "flatten", + "quantization_params": { + "type": "fp32", + "sq4_uniform_trunc_rate": 0.05, + "pca_dim": 0, + "rabitq_version": "standard", + "rabitq_bits_per_dim_query": 32, + "rabitq_bits_per_dim_base": 1, + "rabitq_error_rate": 1.9, + "tq_chain": "", + "nbits": 8, + "pq_dim": 1, + "hold_molds": false + } + }, + "precise_codes": { + "io_params": { + "type": "block_memory_io", + "file_path": "./default_file_path" + }, + "codes_type": "flatten", + "quantization_params": { + "type": "fp32", + 
"sq4_uniform_trunc_rate": 0.05, + "pca_dim": 0, + "pq_dim": 1, + "hold_molds": false + } + }, + "build_thread_count": 1, + "use_attribute_filter": false, + "attr_params": { + "has_buckets": true + } + })"; + +Vector> +run_local_ccr_mce(const Vector& local_nodes, + const Vector& local_edges, + const std::vector>& num_cliques_per_node, + uint64_t threshold, + uint64_t max_saved_cliques, + Allocator* allocator) { + Vector> saved_cliques(allocator); + const auto local_count = local_nodes.size(); + if (local_count < threshold or max_saved_cliques == 0) { + return saved_cliques; + } + auto has_local_edge = [&](uint64_t lhs, uint64_t rhs) { + return local_edges[lhs * local_count + rhs] != 0; + }; + + Vector degree(local_count, 0, allocator); + for (uint64_t i = 0; i < local_count; ++i) { + for (uint64_t j = 0; j < local_count; ++j) { + if (has_local_edge(i, j)) { + ++degree[i]; + } + } + } + + Vector active_to_local(allocator); + Vector local_to_active(local_count, local_count, allocator); + active_to_local.reserve(local_count); + for (uint64_t local_id = 0; local_id < local_count; ++local_id) { + if (degree[local_id] == 0) { + continue; + } + local_to_active[local_id] = active_to_local.size(); + active_to_local.push_back(local_id); + } + if (active_to_local.size() < threshold) { + return saved_cliques; + } + + Vector active_degree(allocator); + active_degree.reserve(active_to_local.size()); + for (auto local_id : active_to_local) { + active_degree.push_back(degree[local_id]); + } + + Vector order_to_local(allocator); + Vector local_to_order(local_count, local_count, allocator); + Vector removed(active_to_local.size(), 0, allocator); + order_to_local.reserve(active_to_local.size()); + for (uint64_t order = 0; order < active_to_local.size(); ++order) { + uint64_t best = active_to_local.size(); + uint64_t best_degree = std::numeric_limits::max(); + for (uint64_t active_id = 0; active_id < active_to_local.size(); ++active_id) { + const auto local_id = 
active_to_local[active_id]; + if (removed[active_id] == 0 and + (active_degree[active_id] < best_degree or + (active_degree[active_id] == best_degree and + (best == active_to_local.size() or local_id < active_to_local[best])))) { + best = active_id; + best_degree = active_degree[active_id]; + } + } + if (best == active_to_local.size()) { + break; + } + const auto best_local = active_to_local[best]; + removed[best] = 1; + order_to_local.push_back(best_local); + local_to_order[best_local] = order; + for (uint64_t active_id = 0; active_id < active_to_local.size(); ++active_id) { + const auto neighbor = active_to_local[active_id]; + if (removed[active_id] == 0 and has_local_edge(best_local, neighbor) and + active_degree[active_id] > 0) { + --active_degree[active_id]; + } + } + } + if (order_to_local.size() < threshold) { + return saved_cliques; + } + + const auto core_count = order_to_local.size(); + Vector core_edges(core_count * core_count, 0, allocator); + auto has_core_edge = [&](uint64_t lhs, uint64_t rhs) { + return core_edges[lhs * core_count + rhs] != 0; + }; + auto set_core_edge = [&](uint64_t lhs, uint64_t rhs) { + core_edges[lhs * core_count + rhs] = 1; + }; + Vector must_contain(core_count, 0, allocator); + uint64_t remaining_must = 0; + for (uint64_t lhs = 0; lhs < local_count; ++lhs) { + const auto core_lhs = local_to_order[lhs]; + if (core_lhs >= core_count) { + continue; + } + if (num_cliques_per_node[local_nodes[lhs]].load(std::memory_order_relaxed) == 0) { + must_contain[core_lhs] = 1; + ++remaining_must; + } + for (uint64_t rhs = lhs + 1; rhs < local_count; ++rhs) { + if (not has_local_edge(lhs, rhs)) { + continue; + } + const auto core_rhs = local_to_order[rhs]; + if (core_rhs < core_count) { + set_core_edge(core_lhs, core_rhs); + set_core_edge(core_rhs, core_lhs); + } + } + } + if (remaining_must == 0) { + return saved_cliques; + } + + auto state_has_must = [&](const Vector& current, const Vector& candidates) { + for (auto node : current) { + if 
(must_contain[node] != 0) { + return true; + } + } + for (auto node : candidates) { + if (must_contain[node] != 0) { + return true; + } + } + return false; + }; + + auto save_clique = [&](const Vector& current) { + if (current.size() < threshold or saved_cliques.size() >= max_saved_cliques) { + return false; + } + bool has_must = false; + for (auto node : current) { + if (must_contain[node] != 0) { + has_must = true; + break; + } + } + if (not has_must) { + return false; + } + + saved_cliques.push_back(Vector(allocator)); + auto& saved = saved_cliques.back(); + saved.reserve(current.size()); + for (auto node : current) { + saved.push_back(order_to_local[node]); + if (must_contain[node] != 0) { + must_contain[node] = 0; + --remaining_must; + } + } + return remaining_must == 0 or saved_cliques.size() >= max_saved_cliques; + }; + + std::function&, Vector&, Vector&)> expand = + [&](Vector& current, Vector& candidates, Vector& excluded) { + if (remaining_must == 0 or saved_cliques.size() >= max_saved_cliques) { + return; + } + if (current.size() + candidates.size() < threshold) { + return; + } + if (not state_has_must(current, candidates)) { + return; + } + if (candidates.empty() and excluded.empty()) { + save_clique(current); + return; + } + + uint64_t pivot = core_count; + uint64_t pivot_degree = 0; + auto update_pivot = [&](uint64_t node) { + uint64_t node_degree = 0; + for (auto candidate : candidates) { + if (has_core_edge(node, candidate)) { + ++node_degree; + } + } + if (pivot == core_count or node_degree > pivot_degree) { + pivot = node; + pivot_degree = node_degree; + } + }; + for (auto node : candidates) { + update_pivot(node); + } + for (auto node : excluded) { + update_pivot(node); + } + + Vector branches(allocator); + branches.reserve(candidates.size()); + for (auto node : candidates) { + if (pivot == core_count or not has_core_edge(pivot, node)) { + branches.push_back(node); + } + } + + for (auto node : branches) { + auto iter = 
std::find(candidates.begin(), candidates.end(), node); + if (iter == candidates.end()) { + continue; + } + + current.push_back(node); + Vector next_candidates(allocator); + Vector next_excluded(allocator); + next_candidates.reserve(candidates.size()); + next_excluded.reserve(excluded.size()); + for (auto candidate : candidates) { + if (has_core_edge(node, candidate)) { + next_candidates.push_back(candidate); + } + } + for (auto excluded_node : excluded) { + if (has_core_edge(node, excluded_node)) { + next_excluded.push_back(excluded_node); + } + } + expand(current, next_candidates, next_excluded); + current.pop_back(); + + iter = std::find(candidates.begin(), candidates.end(), node); + if (iter != candidates.end()) { + candidates.erase(iter); + excluded.push_back(node); + } + if (remaining_must == 0 or saved_cliques.size() >= max_saved_cliques) { + return; + } + } + }; + + for (uint64_t root = 0; root < core_count; ++root) { + if (remaining_must == 0 or saved_cliques.size() >= max_saved_cliques) { + break; + } + Vector current(allocator); + Vector candidates(allocator); + Vector excluded(allocator); + current.push_back(root); + for (uint64_t node = 0; node < core_count; ++node) { + if (not has_core_edge(root, node)) { + continue; + } + if (node > root) { + candidates.push_back(node); + } else { + excluded.push_back(node); + } + } + if (current.size() + candidates.size() < threshold) { + continue; + } + if (not state_has_must(current, candidates)) { + continue; + } + expand(current, candidates, excluded); + } + return saved_cliques; +} + +FilterPtr +make_inner_id_filter(const FilterPtr& filter, const LabelTable& label_table) { + if (filter == nullptr) { + return nullptr; + } + auto combined_filter = std::make_shared(); + combined_filter->AppendFilter(std::make_shared(filter, label_table)); + if (combined_filter->IsEmpty()) { + return nullptr; + } + return combined_filter; +} + +FlattenInterfacePtr +make_temporary_sq8_flatten(MetricType metric, + DataTypes data_type, 
+ int64_t dim, + int64_t extra_info_size, + const std::shared_ptr& thread_pool, + Allocator* allocator) { + auto sq8_param = std::make_shared(); + sq8_param->quantizer_parameter = std::make_shared>(); + sq8_param->io_parameter = std::make_shared(); + + IndexCommonParam common_param; + common_param.metric_ = metric; + common_param.data_type_ = data_type; + common_param.dim_ = dim; + common_param.extra_info_size_ = extra_info_size; + common_param.thread_pool_ = thread_pool; + common_param.allocator_ = std::shared_ptr(allocator, [](Allocator*) {}); + return FlattenInterface::MakeInstance(sq8_param, common_param); +} + +bool +need_temporary_sq8_build_data(const FlattenInterfacePtr& base_codes, bool has_precise_reorder) { + return not has_precise_reorder and + base_codes->GetQuantizerName() == QUANTIZATION_TYPE_VALUE_RABITQ; +} + +bool +is_connected(const Vector>& graph, InnerIdType lhs, InnerIdType rhs) { + const auto& neighbors = graph[lhs]; + return std::binary_search(neighbors.begin(), neighbors.end(), rhs); +} + +Vector +collect_valid_inner_ids(const FilterPtr& filter, + const LabelTable& label_table, + uint64_t seed_count, + Allocator* allocator) { + Vector inner_ids(allocator); + if (filter == nullptr or seed_count == 0) { + return inner_ids; + } + + const int64_t* valid_labels = nullptr; + int64_t valid_count = 0; + filter->GetValidIds(&valid_labels, valid_count); + if (valid_labels == nullptr or valid_count <= 0) { + return inner_ids; + } + + const auto sampled_count = std::min(seed_count, static_cast(valid_count)); + inner_ids.reserve(sampled_count); + for (uint64_t i = 0; i < sampled_count; ++i) { + const auto offset = i * static_cast(valid_count) / sampled_count; + auto [found, inner_id] = label_table.TryGetIdByLabel(valid_labels[offset]); + if (found) { + inner_ids.push_back(inner_id); + } + } + std::sort(inner_ids.begin(), inner_ids.end()); + inner_ids.erase(std::unique(inner_ids.begin(), inner_ids.end()), inner_ids.end()); + return inner_ids; +} + +struct 
MCISearchCandidate { + float distance{0.0F}; + InnerIdType inner_id{0}; + bool expanded{false}; +}; + +} // namespace + +MCI::MCI(const MCIParameterPtr& param, const IndexCommonParam& common_param) + : InnerIndexInterface(param, common_param), + p_maxc_(common_param.allocator_.get()), + maxcs_(common_param.allocator_.get()), + p_node_to_cid_(common_param.allocator_.get()), + node_to_cids_(common_param.allocator_.get()), + max_degree_(param->max_degree), + mcs_(param->mcs), + clique_max_(param->clique_max), + alpha_(param->alpha), + knng_path_(param->knng_path), + clique_path_(param->clique_path), + reorder_by_base_(param->reorder_source == HGRAPH_REORDER_SOURCE_BASE), + use_hgraph_hybrid_(param->use_hgraph_hybrid), + hgraph_valid_ratio_threshold_(param->hgraph_valid_ratio_threshold), + hgraph_index_path_(param->hgraph_index_path), + hgraph_ef_search_(param->hgraph_ef_search) { + this->base_codes_ = FlattenInterface::MakeInstance(param->base_codes_param, common_param); + if (this->use_reorder_ and not this->reorder_by_base_) { + this->reorder_codes_ = + FlattenInterface::MakeInstance(param->precise_codes_param, common_param); + } + if (this->use_hgraph_hybrid_) { + CHECK_ARGUMENT(param->hgraph_param != nullptr, + "mci hgraph hybrid requires hgraph_index_param"); + this->hgraph_index_ = std::make_shared(param->hgraph_param, common_param); + } + this->p_maxc_.push_back(0); + this->p_node_to_cid_.push_back(0); +} + +std::vector +MCI::Build(const DatasetPtr& data) { + CHECK_ARGUMENT(GetNumElements() == 0, "index is not empty"); + if (this->hgraph_index_ != nullptr) { + if (not this->hgraph_index_path_.empty()) { + this->load_hgraph_index(this->hgraph_index_path_); + } else { + auto hgraph_failed_ids = this->hgraph_index_->Build(data); + CHECK_ARGUMENT(hgraph_failed_ids.empty(), + "mci hgraph hybrid sub-index failed to build all vectors"); + } + } + this->Train(data); + Vector> inserted_ids(this->allocator_); + auto failed_ids = this->add_dataset(data, false, 
&inserted_ids); + if (not inserted_ids.empty()) { + this->build_clique_index( + this->get_float_vectors(data), data->GetNumElements(), inserted_ids); + } + if (this->hgraph_index_ != nullptr) { + CHECK_ARGUMENT(this->hgraph_index_->GetNumElements() == this->GetNumElements(), + "mci hgraph hybrid sub-index size mismatch after build"); + } + this->cal_memory_usage(); + return failed_ids; +} + +void +MCI::Train(const DatasetPtr& data) { + auto total = data->GetNumElements(); + if (total == 0) { + return; + } + const auto* vectors = this->get_float_vectors(data); + this->base_codes_->Train(vectors, total); + if (this->reorder_codes_ != nullptr) { + this->reorder_codes_->Train(vectors, total); + } +} + +std::vector +MCI::Add(const DatasetPtr& data, AddMode mode) { + (void)mode; + if (this->hgraph_index_ != nullptr) { + auto hgraph_failed_ids = this->hgraph_index_->Add(data, mode); + CHECK_ARGUMENT(hgraph_failed_ids.empty(), + "mci hgraph hybrid sub-index failed to add all vectors"); + } + auto failed_ids = this->add_dataset(data, true, nullptr); + this->clear_clique_index(); + this->cal_memory_usage(); + return failed_ids; +} + +std::vector +MCI::add_dataset(const DatasetPtr& data, + bool train_if_empty, + Vector>* inserted_ids) { + std::vector failed_ids; + auto base_dim = data->GetDim(); + CHECK_ARGUMENT(base_dim == dim_, + fmt::format("base.dim({}) must be equal to index.dim({})", base_dim, dim_)); + const auto* vectors = this->get_float_vectors(data); + const auto* labels = data->GetIds(); + CHECK_ARGUMENT(labels != nullptr, "base.ids is nullptr"); + + std::unique_lock add_lock(this->add_mutex_); + if (train_if_empty and this->total_count_.load() == 0) { + this->Train(data); + } + + const auto total = data->GetNumElements(); + for (int64_t local_id = 0; local_id < total; ++local_id) { + const auto label = labels[local_id]; + { + std::lock_guard label_lock(this->label_lookup_mutex_); + if (this->label_table_->CheckLabel(label)) { + failed_ids.emplace_back(label); + 
continue; + } + } + + const auto inner_id = static_cast(this->total_count_.load()); + this->resize(inner_id + 1); + { + std::lock_guard label_lock(this->label_lookup_mutex_); + this->label_table_->Insert(inner_id, label); + } + const auto* vector = vectors + local_id * dim_; + this->base_codes_->InsertVector(vector, inner_id); + if (this->reorder_codes_ != nullptr) { + this->reorder_codes_->InsertVector(vector, inner_id); + } + if (inserted_ids != nullptr) { + inserted_ids->emplace_back(inner_id, local_id); + } + this->total_count_.fetch_add(1); + this->p_node_to_cid_.push_back(this->p_node_to_cid_.back()); + } + return failed_ids; +} + +void +MCI::clear_clique_index() { + std::unique_lock lock(this->global_mutex_); + this->p_maxc_.clear(); + this->maxcs_.clear(); + this->p_node_to_cid_.clear(); + this->node_to_cids_.clear(); + this->p_maxc_.push_back(0); + this->p_node_to_cid_.assign(this->total_count_.load() + 1, 0); + this->total_clique_count_ = 0; +} + +void +MCI::build_clique_index(const float* vectors, + uint64_t data_count, + const Vector>& inserted_ids) { + const auto total = this->total_count_.load(); + if (total == 0) { + this->clear_clique_index(); + return; + } + if (not this->clique_path_.empty()) { + this->load_clique_index(this->clique_path_, total); + return; + } + + auto has_precise_reorder = use_reorder_ and not reorder_by_base_ and this->reorder_codes_; + auto build_codes = has_precise_reorder ? 
this->reorder_codes_ : this->base_codes_; + FlattenInterfacePtr temporary_sq8_build_data = nullptr; + if (need_temporary_sq8_build_data(this->base_codes_, has_precise_reorder)) { + temporary_sq8_build_data = make_temporary_sq8_flatten(this->metric_, + this->data_type_, + this->dim_, + this->extra_info_size_, + this->thread_pool_, + this->allocator_); + temporary_sq8_build_data->Train(vectors, data_count); + for (const auto& [inner_id, local_idx] : inserted_ids) { + temporary_sq8_build_data->InsertVector(vectors + dim_ * local_idx, inner_id); + } + build_codes = temporary_sq8_build_data; + } + + auto graph = this->build_knn_graph(build_codes, total); + auto cliques = this->enumerate_maximal_cliques(graph, build_codes, total); + + Vector new_p_maxc(this->allocator_); + Vector new_maxcs(this->allocator_); + Vector> node_to_clique( + total, Vector(this->allocator_), this->allocator_); + new_p_maxc.push_back(0); + for (InnerIdType clique_id = 0; clique_id < cliques.size(); ++clique_id) { + for (auto inner_id : cliques[clique_id]) { + new_maxcs.push_back(inner_id); + node_to_clique[inner_id].push_back(clique_id); + } + new_p_maxc.push_back(static_cast(new_maxcs.size())); + } + + Vector new_p_node_to_cid(this->allocator_); + Vector new_node_to_cids(this->allocator_); + new_p_node_to_cid.push_back(0); + for (InnerIdType inner_id = 0; inner_id < total; ++inner_id) { + auto& ids = node_to_clique[inner_id]; + std::sort(ids.begin(), ids.end()); + ids.erase(std::unique(ids.begin(), ids.end()), ids.end()); + new_node_to_cids.insert(new_node_to_cids.end(), ids.begin(), ids.end()); + new_p_node_to_cid.push_back(static_cast(new_node_to_cids.size())); + } + + std::unique_lock lock(this->global_mutex_); + this->p_maxc_.swap(new_p_maxc); + this->maxcs_.swap(new_maxcs); + this->p_node_to_cid_.swap(new_p_node_to_cid); + this->node_to_cids_.swap(new_node_to_cids); + this->total_clique_count_ = cliques.size(); +} + +void +MCI::load_clique_index(const std::string& clique_path, uint64_t 
total) { + auto make_file_path = [](const std::string& dir, const std::string& name) { + if (dir.empty() or dir.back() == '/') { + return dir + name; + } + return dir + "/" + name; + }; + auto read_vector = [&](const std::string& name) { + const auto file_path = make_file_path(clique_path, name); + std::ifstream input(file_path, std::ios::binary | std::ios::ate); + CHECK_ARGUMENT(input.good(), fmt::format("failed to open mci clique file: {}", file_path)); + const auto file_size = static_cast(input.tellg()); + CHECK_ARGUMENT(file_size % sizeof(InnerIdType) == 0, + fmt::format("invalid mci clique file size: {}", file_path)); + Vector values(file_size / sizeof(InnerIdType), this->allocator_); + input.seekg(0, std::ios::beg); + if (not values.empty()) { + input.read(reinterpret_cast(values.data()), + static_cast(file_size)); + CHECK_ARGUMENT(input.good(), + fmt::format("failed to read mci clique file: {}", file_path)); + } + return values; + }; + + auto new_p_maxc = read_vector("pMaxC"); + auto new_maxcs = read_vector("maxCs"); + auto new_p_node_to_cid = read_vector("pNodeToCid"); + auto new_node_to_cids = read_vector("nodeToCids"); + + CHECK_ARGUMENT(not new_p_maxc.empty(), "mci pMaxC must not be empty"); + CHECK_ARGUMENT( + new_p_node_to_cid.size() == total + 1, + fmt::format( + "mci pNodeToCid size {} must be total + 1 ({})", new_p_node_to_cid.size(), total + 1)); + CHECK_ARGUMENT(new_p_maxc.front() == 0 and new_p_node_to_cid.front() == 0, + "mci imported CSR offsets must start from 0"); + CHECK_ARGUMENT( + new_p_maxc.back() == new_maxcs.size(), + fmt::format( + "mci pMaxC tail {} must equal maxCs size {}", new_p_maxc.back(), new_maxcs.size())); + CHECK_ARGUMENT(new_p_node_to_cid.back() == new_node_to_cids.size(), + fmt::format("mci pNodeToCid tail {} must equal nodeToCids size {}", + new_p_node_to_cid.back(), + new_node_to_cids.size())); + + auto offsets_non_decreasing = [](const Vector& offsets) { + return std::is_sorted(offsets.begin(), offsets.end()); + }; + 
CHECK_ARGUMENT(offsets_non_decreasing(new_p_maxc), "mci pMaxC offsets must be sorted"); + CHECK_ARGUMENT(offsets_non_decreasing(new_p_node_to_cid), + "mci pNodeToCid offsets must be sorted"); + if (not new_maxcs.empty()) { + const auto max_node = *std::max_element(new_maxcs.begin(), new_maxcs.end()); + CHECK_ARGUMENT(max_node < total, + fmt::format("mci maxCs node {} is out of range {}", max_node, total)); + } + if (not new_node_to_cids.empty()) { + const auto clique_count = static_cast(new_p_maxc.size() - 1); + const auto max_cid = *std::max_element(new_node_to_cids.begin(), new_node_to_cids.end()); + CHECK_ARGUMENT( + max_cid < clique_count, + fmt::format("mci nodeToCids id {} is out of range {}", max_cid, clique_count)); + } + + std::unique_lock lock(this->global_mutex_); + this->p_maxc_.swap(new_p_maxc); + this->maxcs_.swap(new_maxcs); + this->p_node_to_cid_.swap(new_p_node_to_cid); + this->node_to_cids_.swap(new_node_to_cids); + this->total_clique_count_ = this->p_maxc_.size() - 1; +} + +Vector> +MCI::build_knn_graph(const FlattenInterfacePtr& build_codes, uint64_t total) const { + Vector> graph( + total, Vector(this->allocator_), this->allocator_); + if (total <= 1) { + return graph; + } + + const auto candidate_limit = std::min(this->mcs_, total - 1); + if (not this->knng_path_.empty()) { + std::ifstream input(this->knng_path_, std::ios::binary | std::ios::ate); + CHECK_ARGUMENT(input.good(), fmt::format("failed to open knng file: {}", knng_path_)); + const auto file_size = static_cast(input.tellg()); + CHECK_ARGUMENT(file_size > 0, fmt::format("knng file is empty: {}", knng_path_)); + CHECK_ARGUMENT(file_size % sizeof(InnerIdType) == 0, + fmt::format("invalid knng file size: {}", knng_path_)); + const auto entry_count = file_size / sizeof(InnerIdType); + CHECK_ARGUMENT( + entry_count % total == 0, + fmt::format("knng entries are not divisible by total count: {}", knng_path_)); + const auto file_degree = entry_count / total; + CHECK_ARGUMENT(file_degree > 0, 
fmt::format("knng degree is zero: {}", knng_path_)); + + input.seekg(0, std::ios::beg); + Vector row(file_degree, this->allocator_); + Vector seen(total, 0, this->allocator_); + for (InnerIdType inner_id = 0; inner_id < total; ++inner_id) { + input.read(reinterpret_cast(row.data()), + static_cast(file_degree * sizeof(InnerIdType))); + CHECK_ARGUMENT(input.good(), fmt::format("failed to read knng row: {}", knng_path_)); + auto& neighbors = graph[inner_id]; + for (uint64_t rank = 0; rank < file_degree and neighbors.size() < candidate_limit; + ++rank) { + const auto neighbor = row[rank]; + CHECK_ARGUMENT(neighbor < total, + fmt::format("knng id {} is out of range {}", neighbor, total)); + if (neighbor == inner_id or seen[neighbor] != 0) { + continue; + } + seen[neighbor] = 1; + neighbors.push_back(neighbor); + } + for (auto neighbor : neighbors) { + seen[neighbor] = 0; + } + } + return graph; + } + + auto graph_param = std::make_shared(); + graph_param->max_degree_ = candidate_limit; + graph_param->support_delete_ = false; + IndexCommonParam graph_common_param; + graph_common_param.metric_ = this->metric_; + graph_common_param.data_type_ = this->data_type_; + graph_common_param.dim_ = this->dim_; + graph_common_param.extra_info_size_ = static_cast(this->extra_info_size_); + graph_common_param.thread_pool_ = this->thread_pool_; + graph_common_param.allocator_ = std::shared_ptr(this->allocator_, [](Allocator*) {}); + auto graph_storage = GraphInterface::MakeInstance(graph_param, graph_common_param); + + auto odescent_param = std::make_shared(); + odescent_param->max_degree = static_cast(candidate_limit); + odescent_param->alpha = this->alpha_; + odescent_param->sample_rate = 0.2F; + odescent_param->turn = 30; + odescent_param->min_in_degree = 1; + ODescent odescent_builder( + odescent_param, build_codes, this->allocator_, this->thread_pool_.get()); + odescent_builder.Build(); + odescent_builder.SaveGraph(graph_storage); + + for (InnerIdType inner_id = 0; inner_id < 
total; ++inner_id) { + graph_storage->GetNeighbors(inner_id, graph[inner_id]); + auto& neighbors = graph[inner_id]; + std::sort(neighbors.begin(), neighbors.end()); + neighbors.erase(std::unique(neighbors.begin(), neighbors.end()), neighbors.end()); + } + return graph; +} + +Vector> +MCI::enumerate_maximal_cliques(const Vector>& graph, + const FlattenInterfacePtr& build_codes, + uint64_t total) const { + Vector> cliques(this->allocator_); + std::vector> num_cliques_per_node(total); + for (auto& count : num_cliques_per_node) { + count.store(0, std::memory_order_relaxed); + } + + auto get_clique_count = [&](InnerIdType inner_id) { + return num_cliques_per_node[inner_id].load(std::memory_order_relaxed); + }; + + auto normalize_clique = [&](const Vector& clique) { + Vector normalized(this->allocator_); + if (clique.empty()) { + return normalized; + } + normalized.assign(clique.begin(), clique.end()); + std::sort(normalized.begin(), normalized.end()); + normalized.erase(std::unique(normalized.begin(), normalized.end()), normalized.end()); + return normalized; + }; + + auto append_selected_clique = [&](const Vector& clique) { + if (clique.empty()) { + return; + } + cliques.push_back(Vector(this->allocator_)); + cliques.back().assign(clique.begin(), clique.end()); + }; + + auto try_select_clique = [&](const Vector& clique, + Vector>& output) { + auto normalized = normalize_clique(clique); + if (normalized.empty()) { + return false; + } + bool has_uncovered = false; + for (auto inner_id : normalized) { + if (get_clique_count(inner_id) == 0) { + has_uncovered = true; + break; + } + } + if (not has_uncovered) { + return false; + } + for (auto inner_id : normalized) { + num_cliques_per_node[inner_id].fetch_add(1, std::memory_order_relaxed); + } + output.push_back(std::move(normalized)); + return true; + }; + + const auto candidate_limit = std::min(this->mcs_, total - 1); + const auto clique_min = + std::max(2, std::min({this->clique_max_, candidate_limit + 1, total})); + 
const auto node_clique_limit = std::max(3, static_cast(total / 100)); + const auto max_saved_per_seed = + std::min(candidate_limit, static_cast(this->max_degree_ + 2)); + const auto enable_build_stats = std::getenv("VSAG_MCI_BUILD_STATS") != nullptr; + /* Fills local_nodes with the seed (always index 0) followed by its graph neighbors whose clique count is below node_clique_limit, sorted by ascending distance to the seed with ties broken by smaller id; seed_distances is kept parallel to local_nodes. */ + auto collect_candidates = + [&](InnerIdType seed, Vector& local_nodes, Vector& seed_distances) { + local_nodes.clear(); + seed_distances.clear(); + local_nodes.push_back(seed); + seed_distances.push_back(0.0F); + for (auto neighbor : graph[seed]) { + if (neighbor >= total or neighbor == seed or + get_clique_count(neighbor) >= node_clique_limit) { + continue; + } + local_nodes.push_back(neighbor); + seed_distances.push_back(build_codes->ComputePairVectors(seed, neighbor)); + if (local_nodes.size() > candidate_limit) { + break; + } + } + Vector order(this->allocator_); + order.reserve(local_nodes.size() - 1); + for (uint64_t i = 1; i < local_nodes.size(); ++i) { + order.push_back(i); + } + std::sort(order.begin(), order.end(), [&](uint64_t lhs, uint64_t rhs) { + if (seed_distances[lhs] != seed_distances[rhs]) { + return seed_distances[lhs] < seed_distances[rhs]; + } + return local_nodes[lhs] < local_nodes[rhs]; + }); + Vector sorted_nodes(this->allocator_); + Vector sorted_distances(this->allocator_); + sorted_nodes.reserve(local_nodes.size()); + sorted_distances.reserve(seed_distances.size()); + sorted_nodes.push_back(seed); + sorted_distances.push_back(0.0F); + for (auto idx : order) { + sorted_nodes.push_back(local_nodes[idx]); + sorted_distances.push_back(seed_distances[idx]); + } + local_nodes.swap(sorted_nodes); + seed_distances.swap(sorted_distances); + }; + /* Rebuilds local_edges as a dense local_count x local_count 0/1 matrix: a pair is linked when its distance is at most seed_distances[1] * now_alpha, i.e. the nearest candidate's distance scaled by alpha. Returns the number of undirected edges set. */ + auto build_local_edges = [&](float now_alpha, + Vector& local_nodes, + Vector& seed_distances, + Vector& local_edges) { + const auto local_count = local_nodes.size(); + local_edges.assign(local_count * local_count, 0); + auto set_local_edge = [&](uint64_t lhs, uint64_t rhs) { + local_edges[lhs * local_count + rhs] = 1; + }; + uint64_t edge_count = 0; +
if (local_nodes.size() <= 1) { + return edge_count; + } + const auto distance_limit = seed_distances[1] * now_alpha; + for (uint64_t i = 1; i < local_nodes.size(); ++i) { + if (seed_distances[i] <= distance_limit) { + set_local_edge(0, i); + set_local_edge(i, 0); + ++edge_count; + } + } + for (uint64_t i = 1; i < local_nodes.size(); ++i) { + for (uint64_t j = i + 1; j < local_nodes.size(); ++j) { + if (build_codes->ComputePairVectors(local_nodes[i], local_nodes[j]) <= + distance_limit) { + set_local_edge(i, j); + set_local_edge(j, i); + ++edge_count; + } + } + } + return edge_count; + }; + /* Only active once now_alpha exceeds 100: groups the seed with the candidates that already belong to some clique and tries to select that group. */ + auto append_high_alpha_fallback = [&](float now_alpha, + const Vector& local_nodes, + Vector>& output) { + if (now_alpha <= 100.0F or local_nodes.empty()) { + return false; + } + Vector fallback(this->allocator_); + fallback.reserve(local_nodes.size()); + fallback.push_back(local_nodes.front()); + for (uint64_t i = 1; i < local_nodes.size(); ++i) { + if (get_clique_count(local_nodes[i]) > 0) { + fallback.push_back(local_nodes[i]); + } + } + return try_select_clique(fallback, output); + }; + /* Turns one local clique into the group that gets saved. Works inside the neighborhood of the clique's first vertex (root): prunes weakly connected neighbors, peels minimum-degree vertices until the rest is fully connected (core), rejects the group when a peeled or lower-indexed neighbor is adjacent to the whole core, and otherwise returns root, core and peeled vertices as ids taken from local_nodes. An empty result means nothing was produced. */ + auto build_ccr_saved_group = [&](const Vector& local_clique, + const Vector& local_nodes, + const Vector& local_edges) { + Vector group(this->allocator_); + const auto local_count = local_nodes.size(); + if (local_clique.empty() or local_count == 0) { + return group; + } + auto has_edge = [&](uint64_t lhs, uint64_t rhs) { + return local_edges[lhs * local_count + rhs] != 0; + }; + const auto root = local_clique.front(); + Vector forward_neighbors(this->allocator_); + forward_neighbors.reserve(local_count); + for (uint64_t local_id = 0; local_id < local_count; ++local_id) { + if (local_id != root and has_edge(root, local_id)) { + forward_neighbors.push_back(local_id); + } + } + if (forward_neighbors.size() + 1 < clique_min) { + return group; + } + /* Count, for every root neighbor, its edges to the other root neighbors. */ + Vector degree(local_count, 0, this->allocator_); + for (uint64_t i = 0; i < forward_neighbors.size(); ++i) { + const auto lhs =
forward_neighbors[i]; + for (uint64_t j = i + 1; j < forward_neighbors.size(); ++j) { + const auto rhs = forward_neighbors[j]; + if (has_edge(lhs, rhs)) { + ++degree[lhs]; + ++degree[rhs]; + } + } + } + /* Worklist pruning: repeatedly remove neighbors whose degree plus 2 is below clique_min, updating the degrees of their remaining neighbors. */ + Vector removed(local_count, 0, this->allocator_); + Vector queue(this->allocator_); + for (auto node : forward_neighbors) { + if (degree[node] + 2 < clique_min) { + queue.push_back(node); + } + } + for (uint64_t offset = 0; offset < queue.size(); ++offset) { + const auto node = queue[offset]; + if (removed[node] != 0) { + continue; + } + removed[node] = 1; + for (auto neighbor : forward_neighbors) { + if (removed[neighbor] != 0 or not has_edge(node, neighbor)) { + continue; + } + if (degree[neighbor] > 0) { + --degree[neighbor]; + } + if (degree[neighbor] + 2 < clique_min) { + queue.push_back(neighbor); + } + } + } + /* Neighbors that survived the pruning. */ + Vector active(this->allocator_); + active.reserve(forward_neighbors.size()); + for (auto node : forward_neighbors) { + if (removed[node] == 0) { + active.push_back(node); + } + } + if (active.size() + 1 < clique_min) { + return group; + } + /* Degrees recomputed among the survivors only. */ + Vector active_degree(local_count, 0, this->allocator_); + for (uint64_t i = 0; i < active.size(); ++i) { + const auto lhs = active[i]; + for (uint64_t j = i + 1; j < active.size(); ++j) { + const auto rhs = active[j]; + if (has_edge(lhs, rhs)) { + ++active_degree[lhs]; + ++active_degree[rhs]; + } + } + } + /* Peel the survivor with the smallest degree (ties: smaller local id) until the unpeeled remainder is fully connected. */ + Vector peeled(local_count, 0, this->allocator_); + Vector peel_order(this->allocator_); + Vector core_nodes(this->allocator_); + Vector p_nodes(this->allocator_); + peel_order.reserve(active.size()); + core_nodes.reserve(active.size()); + p_nodes.reserve(active.size()); + for (uint64_t step = 0; step < active.size(); ++step) { + uint64_t best = local_count; + uint64_t best_degree = std::numeric_limits::max(); + for (auto node : active) { + if (peeled[node] == 0 and (active_degree[node] < best_degree or + (active_degree[node] == best_degree and node < best))) { + best = node; + best_degree =
active_degree[node]; + } + } + if (best == local_count) { + break; + } + if (best_degree + step + 1 == active.size()) { + p_nodes.assign(peel_order.begin(), peel_order.end()); + core_nodes.push_back(best); + peeled[best] = 1; + for (auto node : active) { + if (peeled[node] == 0) { + core_nodes.push_back(node); + } + } + break; + } + peeled[best] = 1; + peel_order.push_back(best); + for (auto neighbor : active) { + if (peeled[neighbor] == 0 and has_edge(best, neighbor) and + active_degree[neighbor] > 0) { + --active_degree[neighbor]; + } + } + } + if (core_nodes.empty()) { + core_nodes.assign(active.begin(), active.end()); + } + if (core_nodes.size() + 1 < clique_min) { + return group; + } + /* Flag arrays for core and peeled membership, plus a helper counting a vertex's edges into the core. */ + Vector in_core(local_count, 0, this->allocator_); + Vector in_p(local_count, 0, this->allocator_); + for (auto node : core_nodes) { + in_core[node] = 1; + } + for (auto node : p_nodes) { + in_p[node] = 1; + } + auto count_core_neighbors = [&](uint64_t node) { + uint64_t count = 0; + for (auto core_node : core_nodes) { + if (has_edge(node, core_node)) { + ++count; + } + } + return count; + }; + /* The group is dropped when some peeled vertex, or some root neighbor with a smaller local id outside the core and peeled set, is adjacent to every core vertex. */ + bool should_save = true; + for (auto node : p_nodes) { + if (count_core_neighbors(node) == core_nodes.size()) { + should_save = false; + break; + } + } + if (should_save) { + for (auto node : forward_neighbors) { + if (node >= root or in_core[node] != 0 or in_p[node] != 0) { + continue; + } + if (count_core_neighbors(node) == core_nodes.size()) { + should_save = false; + break; + } + } + } + if (not should_save) { + return group; + } + /* Emit root, core and peeled vertices translated through local_nodes. */ + group.reserve(1 + core_nodes.size() + p_nodes.size()); + group.push_back(local_nodes[root]); + for (auto node : core_nodes) { + group.push_back(local_nodes[node]); + } + for (auto node : p_nodes) { + group.push_back(local_nodes[node]); + } + return group; + }; + /* Handles one seed: collects candidates, builds the local edge matrix, asks run_local_ccr_mce for local cliques and tries to select up to max_degree_ of the resulting groups; with too few candidates, edges or cliques it only attempts the high-alpha fallback. */ + auto solve_seed = [&](InnerIdType seed, + float now_alpha, + Vector>& output, + Vector& local_nodes, + Vector& seed_distances, + Vector& local_edges) { + collect_candidates(seed, local_nodes,
seed_distances); + if (local_nodes.size() < clique_min) { + append_high_alpha_fallback(now_alpha, local_nodes, output); + return; + } + const auto edge_count = + build_local_edges(now_alpha, local_nodes, seed_distances, local_edges); + if (edge_count < clique_min * (clique_min - 1) / 2) { + append_high_alpha_fallback(now_alpha, local_nodes, output); + return; + } + auto local_cliques = run_local_ccr_mce(local_nodes, + local_edges, + num_cliques_per_node, + clique_min, + max_saved_per_seed, + this->allocator_); + if (local_cliques.empty()) { + append_high_alpha_fallback(now_alpha, local_nodes, output); + return; + } + /* Prefer the expanded group; when it is empty, use the raw local clique mapped through local_nodes. */ + uint64_t selected = 0; + for (const auto& local_clique : local_cliques) { + auto clique = build_ccr_saved_group(local_clique, local_nodes, local_edges); + if (clique.empty()) { + clique.reserve(local_clique.size()); + for (auto local_id : local_clique) { + clique.push_back(local_nodes[local_id]); + } + } + bool has_uncovered = false; + for (auto node : clique) { + if (get_clique_count(node) == 0) { + has_uncovered = true; + break; + } + } + if (has_uncovered and try_select_clique(clique, output)) { + if (++selected >= this->max_degree_) { + break; + } + } + } + }; + /* Serial round: solves every seed that is still uncovered, in id order, reusing the scratch buffers. */ + auto solve_serial_round = [&](float now_alpha) { + Vector local_nodes(this->allocator_); + Vector seed_distances(this->allocator_); + Vector local_edges(this->allocator_); + Vector> seed_cliques(this->allocator_); + for (InnerIdType seed = 0; seed < total; ++seed) { + if (get_clique_count(seed) != 0) { + continue; + } + seed_cliques.clear(); + solve_seed(seed, now_alpha, seed_cliques, local_nodes, seed_distances, local_edges); + for (const auto& clique : seed_cliques) { + append_selected_clique(clique); + } + } + }; + /* Parallel round: uncovered seeds are processed in batches by pool workers; degrades to the serial round when there is no thread pool or build_thread_count_ <= 1. */ + auto solve_parallel_round = [&](float now_alpha) { + if (this->thread_pool_ == nullptr or this->build_thread_count_ <= 1) { + solve_serial_round(now_alpha); + return; + } + + const auto thread_count = this->build_thread_count_; + const auto batch_seed_limit =
std::max(thread_count, thread_count * 16); + Vector batch_seeds(this->allocator_); + batch_seeds.reserve(batch_seed_limit); + /* One clique buffer per worker thread id. */ + std::vector>> thread_cliques; + thread_cliques.reserve(thread_count); + for (uint64_t thread_id = 0; thread_id < thread_count; ++thread_id) { + thread_cliques.emplace_back(this->allocator_); + } + /* Solves seeds[begin, end), skipping seeds whose clique count is no longer zero. */ + auto worker = [&](uint64_t thread_id, + uint64_t begin, + uint64_t end, + const Vector& seeds) { + Vector local_nodes(this->allocator_); + Vector seed_distances(this->allocator_); + Vector local_edges(this->allocator_); + auto& output = thread_cliques[thread_id]; + for (uint64_t i = begin; i < end; ++i) { + if (get_clique_count(seeds[i]) != 0) { + continue; + } + solve_seed(seeds[i], now_alpha, output, local_nodes, seed_distances, local_edges); + } + }; + /* Walk all ids once, filling batches of at most batch_seed_limit uncovered seeds. */ + InnerIdType next_seed = 0; + while (next_seed < total) { + batch_seeds.clear(); + while (next_seed < total and batch_seeds.size() < batch_seed_limit) { + if (get_clique_count(next_seed) == 0) { + batch_seeds.push_back(next_seed); + } + ++next_seed; + } + if (batch_seeds.empty()) { + continue; + } + /* Split the batch evenly across workers, wait for all of them, then merge their buffers in thread-id order. */ + for (auto& one_thread_cliques : thread_cliques) { + one_thread_cliques.clear(); + } + const auto active_thread_count = std::min(thread_count, batch_seeds.size()); + const auto item_per_thread = + (batch_seeds.size() + active_thread_count - 1) / active_thread_count; + std::vector> futures; + futures.reserve(active_thread_count); + for (uint64_t thread_id = 0; thread_id < active_thread_count; ++thread_id) { + const auto begin = thread_id * item_per_thread; + const auto end = std::min(begin + item_per_thread, batch_seeds.size()); + futures.emplace_back(this->thread_pool_->GeneralEnqueue( + worker, thread_id, begin, end, std::cref(batch_seeds))); + } + for (auto& future : futures) { + future.get(); + } + for (const auto& one_thread_cliques : thread_cliques) { + for (const auto& clique : one_thread_cliques) { + append_selected_clique(clique); + } + } + } + }; + /* Round driver: at most 16 rounds, with alpha starting at the larger of 1.2 and alpha_. */ + float now_alpha = std::max(1.2F,
this->alpha_); + uint64_t previous_uncovered = total; + for (uint64_t round = 0; round < 16; ++round) { + const auto cliques_before_round = cliques.size(); + solve_parallel_round(now_alpha); + /* Count ids that still belong to no clique; alpha grows additively when that count dropped below nine tenths of the previous one and doubles otherwise. */ + uint64_t uncovered = 0; + for (InnerIdType inner_id = 0; inner_id < total; ++inner_id) { + if (get_clique_count(inner_id) == 0) { + ++uncovered; + } + } + if (enable_build_stats) { + std::cerr << "mci_build_round round=" << round + 1 << " alpha=" << now_alpha + << " uncovered=" << uncovered + << " round_cliques=" << cliques.size() - cliques_before_round + << " total_cliques=" << cliques.size() << std::endl; + } + if (uncovered == 0) { + break; + } + if (uncovered < previous_uncovered * 9 / 10) { + now_alpha += std::max(1.2F, this->alpha_); + } else { + now_alpha *= 2.0F; + } + previous_uncovered = uncovered; + } + /* Last resort for ids still uncovered: group the id with its leading graph neighbors, capped at max_degree_ members. */ + for (InnerIdType inner_id = 0; inner_id < total; ++inner_id) { + if (get_clique_count(inner_id) == 0) { + Vector singleton(this->allocator_); + singleton.push_back(inner_id); + for (auto neighbor : graph[inner_id]) { + if (singleton.size() >= this->max_degree_ or neighbor >= total) { + break; + } + singleton.push_back(neighbor); + } + Vector> fallback_cliques(this->allocator_); + if (try_select_clique(singleton, fallback_cliques)) { + append_selected_clique(fallback_cliques.front()); + } + } + } + if (enable_build_stats) { + uint64_t max_membership = 0; + uint64_t total_memberships = 0; + for (InnerIdType inner_id = 0; inner_id < total; ++inner_id) { + const auto membership = static_cast(get_clique_count(inner_id)); + total_memberships += membership; + max_membership = std::max(max_membership, membership); + } + std::cerr << "mci_build_summary total_cliques=" << cliques.size() + << " total_memberships=" << total_memberships << " avg_membership=" + << static_cast(total_memberships) / static_cast(total) + << " max_membership=" << max_membership << std::endl; + } + return cliques; +} + /* True when cliques exist and both offset arrays have the expected lengths (clique count plus one, and total plus one). */ +bool +MCI::has_clique_index(uint64_t total) const { + return
this->total_clique_count_ > 0 and + this->p_maxc_.size() == this->total_clique_count_ + 1 and + this->p_node_to_cid_.size() == total + 1; +} + /* Exhaustive scan: scores every inner id below total_count_ that passes inner_filter, counts it in dist_cmp and pushes it into a heap created for candidate_limit entries. With use_distance_lower_bound the lower bound of each id is also recorded when a record vector is supplied. */ +DistHeapPtr +MCI::scan_knn_candidates(const FlattenInterfacePtr& codes, + const ComputerInterfacePtr& computer, + const FilterPtr& inner_filter, + int64_t candidate_limit, + bool use_distance_lower_bound, + QueryContext& ctx, + DistanceRecordVector* rabitq_lower_bound_candidates, + uint32_t& dist_cmp) const { + auto heap = DistanceHeap::MakeInstanceBySize(this->allocator_, candidate_limit); + const auto total = static_cast(this->total_count_.load()); + for (InnerIdType inner_id = 0; inner_id < total; ++inner_id) { + if (inner_filter != nullptr and not inner_filter->CheckValid(inner_id)) { + continue; + } + float dist = 0.0F; + float lower_bound = std::numeric_limits::max(); + if (use_distance_lower_bound) { + codes->QueryWithDistanceLowerBound(&dist, &lower_bound, computer, &inner_id, 1, &ctx); + if (rabitq_lower_bound_candidates != nullptr) { + rabitq_lower_bound_candidates->emplace_back(lower_bound, inner_id); + } + } else { + codes->Query(&dist, computer, &inner_id, 1, &ctx); + } + ++dist_cmp; + heap->Push(dist, inner_id); + } + return heap; +} + /* Clique-guided search: seeds a sorted candidate list, then repeatedly expands the closest unexpanded candidate by visiting every member of each not-yet-visited clique it belongs to, until no candidate is left or the hop limit is reached. Returns the candidates as a heap. */ +DistHeapPtr +MCI::search_clique_candidates(const ComputerInterfacePtr& computer, + const FilterPtr& inner_filter, + const Vector* seed_inner_ids, + const MCISearchParameters& search_params, + int64_t candidate_limit, + QueryContext& ctx, + DistanceRecordVector* rabitq_lower_bound_candidates, + uint32_t& dist_cmp, + uint32_t& hops) const { + auto heap = DistanceHeap::MakeInstanceBySize(this->allocator_, candidate_limit); + const auto total = static_cast(this->total_count_.load()); + Vector visited_nodes(total, 0, this->allocator_); + Vector visited_cliques(this->total_clique_count_, 0, this->allocator_); + Vector candidates(this->allocator_); + candidates.reserve(static_cast(candidate_limit)); + /* Candidate order: ascending distance, then ascending inner id. */ + auto is_better = [](const MCISearchCandidate& lhs, const MCISearchCandidate& rhs)
{ + if (lhs.distance != rhs.distance) { + return lhs.distance < rhs.distance; + } + return lhs.inner_id < rhs.inner_id; + }; + /* A distance is worth inserting while the list is not full or it beats the current worst entry. */ + auto can_update = [&](float distance) { + return static_cast(candidates.size()) < candidate_limit or + distance < candidates.back().distance; + }; + /* Sorted insert that keeps at most candidate_limit entries. */ + auto insert_candidate = [&](float distance, InnerIdType inner_id) { + if (not can_update(distance)) { + return; + } + MCISearchCandidate candidate{distance, inner_id, false}; + auto iter = std::lower_bound(candidates.begin(), candidates.end(), candidate, is_better); + candidates.insert(iter, candidate); + if (static_cast(candidates.size()) > candidate_limit) { + candidates.pop_back(); + } + }; + /* Returns the first (closest) entry not yet expanded and marks it expanded; nullptr when none remains. */ + auto get_closest_unexpanded = [&]() -> MCISearchCandidate* { + for (auto& candidate : candidates) { + if (not candidate.expanded) { + candidate.expanded = true; + return &candidate; + } + } + return nullptr; + }; + /* Marks inner_id visited, applies the filter, computes its distance and offers it to the candidate list. Returns true only when a distance was computed. */ + auto try_visit = [&](InnerIdType inner_id) -> bool { + if (inner_id >= total or visited_nodes[inner_id] != 0) { + return false; + } + visited_nodes[inner_id] = 1; + if (inner_filter != nullptr and not inner_filter->CheckValid(inner_id)) { + return false; + } + float dist = 0.0F; + float lower_bound = std::numeric_limits::max(); + if (search_params.rabitq_one_bit_search) { + this->base_codes_->QueryWithDistanceLowerBound( + &dist, &lower_bound, computer, &inner_id, 1, &ctx); + if (rabitq_lower_bound_candidates != nullptr) { + rabitq_lower_bound_candidates->emplace_back(lower_bound, inner_id); + } + } else { + this->base_codes_->Query(&dist, computer, &inner_id, 1, &ctx); + } + ++dist_cmp; + insert_candidate(dist, inner_id); + return true; + }; + /* Seeding: evenly sample the supplied seed ids first, then top up with ids 0, 1, 2 and so on until seed_target ids were scored. */ + const auto seed_target = std::min(search_params.seed_count, total); + uint64_t seeds = 0; + if (seed_inner_ids != nullptr and not seed_inner_ids->empty()) { + const auto seed_count = seed_inner_ids->size(); + const auto sampled_seed_count = std::min(seed_target, seed_count); + for (uint64_t i = 0; i < sampled_seed_count; ++i) { + const auto
offset = i * seed_count / sampled_seed_count; + if (try_visit((*seed_inner_ids)[offset])) { + ++seeds; + } + } + } + for (InnerIdType seed = 0; seed < total and seeds < seed_target; ++seed) { + if (try_visit(seed)) { + ++seeds; + } + } + /* Expansion loop: each newly visited clique counts as one hop. */ + const auto hop_limit = std::min( + search_params.hops_limit, static_cast(std::numeric_limits::max())); + while (hops < hop_limit) { + auto* current = get_closest_unexpanded(); + if (current == nullptr) { + break; + } + const auto inner_id = current->inner_id; + const auto clique_begin = this->p_node_to_cid_[inner_id]; + const auto clique_end = this->p_node_to_cid_[inner_id + 1]; + for (auto offset = clique_begin; offset < clique_end; ++offset) { + const auto clique_id = this->node_to_cids_[offset]; + if (visited_cliques[clique_id] != 0) { + continue; + } + visited_cliques[clique_id] = 1; + ++hops; + const auto node_begin = this->p_maxc_[clique_id]; + const auto node_end = this->p_maxc_[clique_id + 1]; + for (auto node_offset = node_begin; node_offset < node_end; ++node_offset) { + try_visit(this->maxcs_[node_offset]); + } + if (hops >= hop_limit) { + break; + } + } + } + /* Hand the final candidate list over as a heap. */ + for (const auto& candidate : candidates) { + heap->Push(candidate.distance, candidate.inner_id); + } + return heap; +} + /* Packs the arguments into a SearchRequest (filter enabled only when non-null) and delegates to SearchWithRequest. */ +DatasetPtr +MCI::KnnSearch(const DatasetPtr& query, + int64_t k, + const std::string& parameters, + const FilterPtr& filter) const { + SearchRequest request; + request.query_ = query; + request.topk_ = k; + request.params_str_ = parameters; + if (filter != nullptr) { + request.enable_filter_ = true; + request.filter_ = filter; + } + return this->SearchWithRequest(request); +} + /* Range-query counterpart: sets RANGE_SEARCH mode, radius and limited_size on a SearchRequest and delegates to SearchWithRequest. */ +DatasetPtr +MCI::RangeSearch(const DatasetPtr& query, + float radius, + const std::string& parameters, + const FilterPtr& filter, + int64_t limited_size) const { + SearchRequest request; + request.query_ = query; + request.mode_ = SearchMode::RANGE_SEARCH; + request.radius_ = radius; + request.limited_size_ = limited_size; + request.params_str_ = parameters; + if
(filter != nullptr) { + request.enable_filter_ = true; + request.filter_ = filter; + } + return this->SearchWithRequest(request); +} + /* Routes to the HGraph overlay only when it is enabled and present, holds the same number of elements, no bitset filter is active, and valid_ratio reaches the configured threshold. */ +bool +MCI::should_use_hgraph_hybrid(const SearchRequest& request, float valid_ratio) const { + if (not this->use_hgraph_hybrid_ or this->hgraph_index_ == nullptr) { + return false; + } + if (this->hgraph_index_->GetNumElements() != this->GetNumElements()) { + return false; + } + if (request.enable_bitset_filter_ and request.bitset_filter_ != nullptr) { + return false; + } + return valid_ratio >= this->hgraph_valid_ratio_threshold_; +} + /* Passes the caller's parameters through when they contain an HGraph section; otherwise builds one from hgraph_ef_search_. */ +std::string +MCI::get_hgraph_search_params(const std::string& request_params) const { + if (not request_params.empty()) { + auto params = JsonType::Parse(request_params); + if (params.Contains(INDEX_TYPE_HGRAPH)) { + return request_params; + } + } + return fmt::format(R"({{"hgraph":{{"ef_search":{}}}}})", this->hgraph_ef_search_); +} + /* Runs the request on the HGraph index and adds the mci_hybrid_* routing fields to the returned statistics. */ +DatasetPtr +MCI::search_hgraph_hybrid(const SearchRequest& request, float valid_ratio) const { + SearchRequest hgraph_request = request; + hgraph_request.params_str_ = this->get_hgraph_search_params(request.params_str_); + auto result = this->hgraph_index_->SearchWithRequest(hgraph_request); + + JsonType stats; + auto stats_str = result->GetStatistics(); + if (not stats_str.empty()) { + stats = JsonType::Parse(stats_str); + } + stats["mci_hybrid_route"].SetString("hgraph"); + stats["mci_hybrid_valid_ratio"].SetFloat(valid_ratio); + stats["mci_hybrid_threshold"].SetFloat(this->hgraph_valid_ratio_threshold_); + result->Statistics(stats.Dump()); + return result; +} + /* Main search entry, executed under a shared lock: validates the request, optionally routes to HGraph, otherwise serves range queries by exhaustive scan and top-k queries through the clique index or an exhaustive scan, followed by optional reordering. */ +DatasetPtr +MCI::SearchWithRequest(const SearchRequest& request) const { + std::shared_lock read_lock(this->global_mutex_); + CHECK_ARGUMENT(request.query_ != nullptr, "query dataset is nullptr"); + CHECK_ARGUMENT( + request.query_->GetDim() == dim_, + fmt::format( + "query.dim({}) must be equal to index.dim({})", request.query_->GetDim(), dim_)); + CHECK_ARGUMENT(request.query_->GetFloat32Vectors() !=
nullptr, "query.float_vector is nullptr"); + CHECK_ARGUMENT(request.topk_ > 0, "mci topk must be positive"); + CHECK_ARGUMENT(request.limited_size_ != 0, "mci limited_size must not be equal to 0"); + /* Resolve search parameters, the final-precision computer and the user filter; the hybrid decision uses the filter's ValidRatio, or 1.0 without a filter. */ + auto search_params = MCISearchParameters::FromJson(request.params_str_); + const auto* query_data = request.query_->GetFloat32Vectors(); + auto final_codes = this->get_reorder_codes(); + auto final_computer = final_codes->FactoryComputer(query_data); + FilterPtr filter = nullptr; + if (request.enable_filter_ and request.filter_ != nullptr) { + filter = request.filter_; + } + const auto hybrid_valid_ratio = filter != nullptr ? filter->ValidRatio() : 1.0F; + if (this->should_use_hgraph_hybrid(request, hybrid_valid_ratio)) { + return this->search_hgraph_hybrid(request, hybrid_valid_ratio); + } + if (request.enable_bitset_filter_ and request.bitset_filter_ != nullptr) { + auto combined_filter = std::make_shared(); + if (filter != nullptr) { + combined_filter->AppendFilter(filter); + } + combined_filter->AppendFilter(std::make_shared(request.bitset_filter_)); + filter = combined_filter; + } + auto seed_inner_ids = collect_valid_inner_ids( + filter, *this->label_table_, search_params.seed_count, this->allocator_); + auto* seed_inner_ids_ptr = seed_inner_ids.empty() ? nullptr : &seed_inner_ids; + auto inner_filter = make_inner_id_filter(filter, *this->label_table_); + /* An empty index yields an empty dataset. */ + auto total = static_cast(this->total_count_.load()); + if (total == 0) { + return DatasetImpl::MakeEmptyDataset(); + } + /* Range mode: exhaustive scan with the final codes, keeping ids whose distance is within radius_. */ + uint32_t dist_cmp = 0; + if (request.mode_ == SearchMode::RANGE_SEARCH) { + int64_t limited_size = + request.limited_size_ < 0 ?
std::numeric_limits::max() : request.limited_size_; + auto heap = DistanceHeap::MakeInstanceBySize(this->allocator_, limited_size); + for (InnerIdType inner_id = 0; inner_id < total; ++inner_id) { + if (inner_filter != nullptr and not inner_filter->CheckValid(inner_id)) { + continue; + } + float dist = 0.0F; + final_codes->Query(&dist, final_computer, &inner_id, 1); + ++dist_cmp; + if (dist <= request.radius_) { + heap->Push(dist, inner_id); + } + } + auto [dataset_results, dists, ids] = + create_fast_dataset(static_cast(heap->Size()), allocator_); + for (auto i = static_cast(heap->Size() - 1); i >= 0; --i) { + dists[i] = heap->Top().first; + ids[i] = this->label_table_->GetLabelById(heap->Top().second); + heap->Pop(); + } + JsonType stats; + stats["dist_cmp"].SetInt(static_cast(dist_cmp)); + stats["ef_search"].SetInt(search_params.ef_search); + dataset_results->Statistics(stats.Dump()); + return dataset_results; + } + /* Top-k mode: candidate_limit is the larger of topk and ef_search; lower-bound records are collected only for one-bit search combined with reorder-by-base. */ + auto topk = std::max(request.topk_, 1); + auto candidate_limit = std::max(topk, search_params.ef_search); + QueryContext ctx; + ctx.alloc = this->allocator_; + uint32_t hops = 0; + DistanceRecordVector rabitq_lower_bound_candidates(this->allocator_); + auto* rabitq_lower_bound_candidates_ptr = + search_params.rabitq_one_bit_search and use_reorder_ and reorder_by_base_ + ?
&rabitq_lower_bound_candidates + : nullptr; + /* Use the clique search only when the clique arrays are valid and there are more ids than candidate slots; otherwise scan everything. */ + auto base_computer = this->base_codes_->FactoryComputer(query_data); + DistHeapPtr heap = nullptr; + if (this->has_clique_index(total) and static_cast(total) > candidate_limit) { + heap = this->search_clique_candidates(base_computer, + inner_filter, + seed_inner_ids_ptr, + search_params, + candidate_limit, + ctx, + rabitq_lower_bound_candidates_ptr, + dist_cmp, + hops); + } else { + heap = this->scan_knn_candidates(this->base_codes_, + base_computer, + inner_filter, + candidate_limit, + search_params.rabitq_one_bit_search, + ctx, + rabitq_lower_bound_candidates_ptr, + dist_cmp); + } + /* No candidate survived the filter. */ + if (heap->Empty()) { + return DatasetImpl::MakeEmptyDataset(); + } + /* Either reorder the candidates down to topk, or trim the heap to topk entries. */ + if (use_reorder_ or search_params.rabitq_one_bit_search) { + auto reorder_codes = use_reorder_ ? this->get_reorder_codes() : this->base_codes_; + FlattenReorder reorder(reorder_codes, this->allocator_); + const auto* lower_bound_candidates_for_reorder = + use_reorder_ and reorder_by_base_ and search_params.rabitq_one_bit_search + ?
rabitq_lower_bound_candidates_ptr + : nullptr; + heap = reorder.Reorder( + heap, query_data, topk, ctx, nullptr, lower_bound_candidates_for_reorder); + } else if (static_cast(heap->Size()) > topk) { + auto trimmed_heap = DistanceHeap::MakeInstanceBySize(this->allocator_, topk); + const auto* candidates = heap->GetData(); + for (uint64_t i = 0; i < heap->Size(); ++i) { + trimmed_heap->Push(candidates[i]); + } + heap = trimmed_heap; + } + /* Fill the result arrays from the last slot to the first while popping the heap, mapping inner ids to labels, then attach the search statistics. */ + auto [dataset_results, dists, ids] = + create_fast_dataset(static_cast(heap->Size()), allocator_); + for (auto i = static_cast(heap->Size() - 1); i >= 0; --i) { + dists[i] = heap->Top().first; + ids[i] = this->label_table_->GetLabelById(heap->Top().second); + heap->Pop(); + } + JsonType stats; + stats["dist_cmp"].SetInt(static_cast(dist_cmp)); + stats["ef_search"].SetInt(search_params.ef_search); + stats["seed_count"].SetInt(static_cast(search_params.seed_count)); + stats["rabitq_one_bit_search"].SetBool(search_params.rabitq_one_bit_search); + stats["hops"].SetInt(static_cast(hops)); + stats["total_clique_count"].SetInt(static_cast(this->total_clique_count_)); + stats["mci_hybrid_route"].SetString("mci"); + stats["mci_hybrid_valid_ratio"].SetFloat(hybrid_valid_ratio); + stats["mci_hybrid_threshold"].SetFloat(this->hgraph_valid_ratio_threshold_); + dataset_results->Statistics(stats.Dump()); + return dataset_results; +} + +void +MCI::Serialize(StreamWriter& writer) const { + this->base_codes_->Serialize(writer); + if (this->reorder_codes_ != nullptr) { + this->reorder_codes_->Serialize(writer); + } + this->label_table_->Serialize(writer); + StreamWriter::WriteVector(writer, this->p_maxc_); + StreamWriter::WriteVector(writer, this->maxcs_); + StreamWriter::WriteVector(writer, this->p_node_to_cid_); + StreamWriter::WriteVector(writer, this->node_to_cids_); + + uint64_t hgraph_serialized_size = 0; + + auto metadata = std::make_shared(); + JsonType basic_info; + basic_info["dim"].SetInt(dim_); +
basic_info["total_count"].SetInt(static_cast(this->total_count_.load())); + basic_info["max_capacity"].SetInt(static_cast(this->max_capacity_.load())); + basic_info["total_clique_count"].SetInt(static_cast(this->total_clique_count_)); + basic_info["hgraph_serialized_size"].SetInt(static_cast(hgraph_serialized_size)); + basic_info[INDEX_PARAM].SetString(this->create_param_ptr_->ToString()); + metadata->Set(BASIC_INFO, basic_info); + auto footer = std::make_shared