Skip to content

Commit 258b45f

Browse files
authored
Remove deduplication for relation node/edge vectors (#3643)
* Remove deduplication
* Reorder protos to avoid storing field_id
1 parent 7639675 commit 258b45f

15 files changed

Lines changed: 407 additions & 381 deletions

File tree

nidx/nidx_protos/noderesources.proto

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -175,8 +175,16 @@ message Resource {
175175
bool skip_paragraphs = 19;
176176
bool skip_json = 24;
177177

178-
map<string, utils.RelationNodeVectors> relation_node_vectors = 20;
179-
map<string, utils.RelationEdgeVectors> relation_edge_vectors = 21;
178+
map<string, IndexFieldNodeVectors> field_node_vectors = 20;
179+
map<string, IndexFieldEdgeVectors> field_edge_vectors = 21;
180+
}
181+
182+
message IndexFieldNodeVectors {
183+
map<string, utils.RelationNodeVectors> node_vectors = 1; // key: vectorset_id
184+
}
185+
186+
message IndexFieldEdgeVectors {
187+
map<string, utils.RelationEdgeVectors> edge_vectors = 1; // key: vectorset_id
180188
}
181189

182190
message ShardMetadata {

nidx/nidx_vector/src/config.rs

Lines changed: 4 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -176,12 +176,10 @@ impl VectorConfig {
176176
&& matches!(&self.vector_type, VectorType::DenseF32 { dimension } if dimension.is_multiple_of(64))
177177
}
178178

179-
/// Whether to deduplicate paragraphs based on their key, using `metadata` to store the list of field_keys
180-
pub fn deduplicate_keys(&self) -> bool {
181-
match self.entity {
182-
IndexEntity::Paragraph => false,
183-
IndexEntity::RelationNode | IndexEntity::RelationEdge => true,
184-
}
179+
/// Whether this index uses relation-style inverted indexes (field mapping from metadata)
180+
/// rather than paragraph-style inverted indexes (field mapping from key).
181+
pub fn uses_relation_inverted_index(&self) -> bool {
182+
matches!(self.entity, IndexEntity::RelationNode | IndexEntity::RelationEdge)
185183
}
186184

187185
pub fn from_paragraph_proto(proto: VectorIndexConfig) -> VectorR<Self> {

nidx/nidx_vector/src/data_store/v2.rs

Lines changed: 2 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -29,7 +29,7 @@ use crate::{
2929
use super::{DataStore, OpenReason, ParagraphAddr, VectorAddr};
3030
pub use paragraph_store::StoredParagraph;
3131
use paragraph_store::{ParagraphStore, ParagraphStoreWriter};
32-
use std::{collections::HashMap, path::Path};
32+
use std::path::Path;
3333
use vector_store::{VectorStore, VectorStoreWriter};
3434

3535
mod paragraph_store;
@@ -88,7 +88,6 @@ impl DataStoreV2 {
8888
path: &Path,
8989
producers: Vec<(impl Iterator<Item = ParagraphAddr>, &dyn DataStore)>,
9090
config: &VectorConfig,
91-
mut paragraph_deduplicator: Option<HashMap<String, Vec<u8>>>,
9291
) -> VectorR<()> {
9392
let mut paragraphs = ParagraphStoreWriter::new(path)?;
9493
let mut vectors = VectorStoreWriter::new(path, &config.vector_type)?;
@@ -105,17 +104,6 @@ impl DataStoreV2 {
105104
let paragraph = store.get_paragraph(paragraph_addr);
106105
let p_vectors = paragraph.vectors(&paragraph_addr).map(|v| store.get_vector(v).vector());
107106

108-
let metadata = if let Some(paragraph_deduplicator) = &mut paragraph_deduplicator {
109-
// Entry is removed so if it appears in other segments it is not copied again
110-
let metadata = paragraph_deduplicator.remove(paragraph.id());
111-
if metadata.is_none() {
112-
continue;
113-
};
114-
metadata
115-
} else {
116-
None
117-
};
118-
119107
// Write to new store
120108
let (first_vector, last_vector) = vectors.write(p_idx, p_vectors)?;
121109
if let Some(quantized) = &mut quantized {
@@ -132,12 +120,7 @@ impl DataStoreV2 {
132120
}
133121
}
134122

135-
paragraphs.write_paragraph_ref(
136-
paragraph,
137-
first_vector,
138-
last_vector - first_vector + 1,
139-
metadata.as_deref(),
140-
)?;
123+
paragraphs.write_paragraph_ref(paragraph, first_vector, last_vector - first_vector + 1)?;
141124

142125
p_idx += 1;
143126
}

nidx/nidx_vector/src/data_store/v2/paragraph_store.rs

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -152,13 +152,12 @@ impl ParagraphStoreWriter {
152152
paragraph: ParagraphRef,
153153
first_vector: u32,
154154
num_vectors: u32,
155-
override_metadata: Option<&[u8]>,
156155
) -> VectorR<u32> {
157156
let labels = paragraph.labels();
158157
let paragraph = StoredParagraph {
159158
key: paragraph.id(),
160159
labels: labels.iter().map(|x| x.as_str()).collect(),
161-
metadata: override_metadata.unwrap_or(paragraph.metadata()),
160+
metadata: paragraph.metadata(),
162161
first_vector,
163162
num_vectors,
164163
};

nidx/nidx_vector/src/field_list_metadata.rs

Lines changed: 0 additions & 48 deletions
This file was deleted.

nidx/nidx_vector/src/indexer.rs

Lines changed: 20 additions & 66 deletions
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,6 @@
1818
// along with this program. If not, see <http://www.gnu.org/licenses/>.
1919

2020
use crate::config::{VectorCardinality, VectorConfig};
21-
use crate::field_list_metadata::encode_field_list_metadata;
2221
use crate::multivector::extract_multi_vectors;
2322
use crate::segment::{self, Elem};
2423
use crate::utils::FieldKey;
@@ -152,13 +151,8 @@ pub fn index_resource(
152151
Ok(Some(segment.into_metadata()))
153152
}
154153

155-
fn encode_metadata_field(rid: &str, fields: &HashSet<&String>) -> Vec<u8> {
156-
let encoded_fields: Vec<_> = fields
157-
.iter()
158-
.map(|f| format!("{rid}/{f}"))
159-
.filter_map(|f| FieldKey::from_field_id(&f))
160-
.collect();
161-
encode_field_list_metadata(&encoded_fields)
154+
fn encode_metadata_field(rid: &str, field: &str) -> Option<Vec<u8>> {
155+
FieldKey::from_field_id(&format!("{rid}/{field}")).map(|k| k.bytes().to_vec())
162156
}
163157

164158
pub fn index_relation_nodes(
@@ -169,50 +163,25 @@ pub fn index_relation_nodes(
169163
) -> anyhow::Result<Option<VectorSegmentMetadata>> {
170164
debug!("Creating elements for the main index");
171165

172-
let mut entity_fields = HashMap::new();
173166
let Some(resource_id) = &resource.resource else {
174167
return Err(anyhow!("resource_id required"));
175168
};
176169
let rid = &resource_id.uuid;
177170

178-
for (field, relations) in &resource.field_relations {
179-
// Find all copies of each relation node
180-
for relation in &relations.relations {
181-
let Some(relation) = &relation.relation else {
182-
return Err(anyhow!("relation required"));
183-
};
184-
let Some(source) = &relation.source else {
185-
return Err(anyhow!("relation source node required"));
186-
};
187-
entity_fields
188-
.entry(source.value.clone())
189-
.or_insert_with(HashSet::new)
190-
.insert(field);
191-
192-
let Some(to) = &relation.to else {
193-
return Err(anyhow!("relation to node required"));
194-
};
195-
entity_fields
196-
.entry(to.value.clone())
197-
.or_insert_with(HashSet::new)
198-
.insert(field);
199-
}
200-
}
201-
202-
// Index each vector
203171
let mut elems = Vec::new();
204-
if let Some(vectorset) = &resource.relation_node_vectors.get(index_name) {
172+
for (field_id, field_data) in &resource.field_node_vectors {
173+
let Some(vectorset) = field_data.node_vectors.get(index_name) else {
174+
continue;
175+
};
176+
let Some(metadata) = encode_metadata_field(rid, field_id) else {
177+
continue;
178+
};
205179
for node_vector in &vectorset.vectors {
206-
let vector = node_vector.vector.clone();
207-
let fields = entity_fields.get(&node_vector.node_value);
208-
let Some(fields) = fields else {
209-
continue;
210-
};
211180
elems.push(Elem::new(
212181
node_vector.node_value.clone(),
213-
vector,
182+
node_vector.vector.clone(),
214183
vec![],
215-
Some(encode_metadata_field(rid, fields)),
184+
Some(metadata.clone()),
216185
));
217186
}
218187
}
@@ -235,40 +204,25 @@ pub fn index_relation_edges(
235204
) -> anyhow::Result<Option<VectorSegmentMetadata>> {
236205
debug!("Creating elements for the main index");
237206

238-
let mut entity_fields = HashMap::new();
239207
let Some(resource_id) = &resource.resource else {
240208
return Err(anyhow!("resource_id required"));
241209
};
242210
let rid = &resource_id.uuid;
243211

244-
for (field, relations) in &resource.field_relations {
245-
// Find all copies of each relation edge
246-
for relation in &relations.relations {
247-
let Some(relation) = &relation.relation else {
248-
return Err(anyhow!("relation required"));
249-
};
250-
251-
entity_fields
252-
.entry(relation.relation_label.clone())
253-
.or_insert_with(HashSet::new)
254-
.insert(field);
255-
}
256-
}
257-
258-
// Index each vector
259212
let mut elems = Vec::new();
260-
if let Some(vectorset) = &resource.relation_edge_vectors.get(index_name) {
213+
for (field_id, field_data) in &resource.field_edge_vectors {
214+
let Some(vectorset) = field_data.edge_vectors.get(index_name) else {
215+
continue;
216+
};
217+
let Some(metadata) = encode_metadata_field(rid, field_id) else {
218+
continue;
219+
};
261220
for rel_vector in &vectorset.vectors {
262-
let vector = rel_vector.vector.clone();
263-
let fields = entity_fields.get(&rel_vector.relation_label);
264-
let Some(fields) = fields else {
265-
continue;
266-
};
267221
elems.push(Elem::new(
268222
rel_vector.relation_label.clone(),
269-
vector,
223+
rel_vector.vector.clone(),
270224
vec![],
271-
Some(encode_metadata_field(rid, fields)),
225+
Some(metadata.clone()),
272226
));
273227
}
274228
}

nidx/nidx_vector/src/inverted_index.rs

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -66,7 +66,7 @@ impl IndexBuilder {
6666

6767
/// Build indexes from a DataStore.
6868
pub fn build_indexes(work_path: &Path, config: &VectorConfig, data_store: &impl DataStore) -> VectorR<()> {
69-
if config.deduplicate_keys() {
69+
if config.uses_relation_inverted_index() {
7070
RelationInvertedIndexes::build(work_path, data_store)
7171
} else {
7272
ParagraphInvertedIndexes::build(work_path, data_store)
@@ -84,7 +84,7 @@ pub enum InvertedIndexes {
8484

8585
impl InvertedIndexes {
8686
pub fn open(config: &VectorConfig, work_path: &Path, records: usize, options: OpenOptions) -> VectorR<Self> {
87-
if config.deduplicate_keys() {
87+
if config.uses_relation_inverted_index() {
8888
Ok(InvertedIndexes::Relation(RelationInvertedIndexes::open(
8989
work_path, options,
9090
)?))

nidx/nidx_vector/src/inverted_index/relation.rs

Lines changed: 2 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -27,7 +27,6 @@ use super::{
2727
use crate::{
2828
ParagraphAddr, VectorR,
2929
data_store::{DataStore, iter_paragraphs},
30-
field_list_metadata::decode_field_list_metadata,
3130
utils::FieldKey,
3231
};
3332

@@ -62,11 +61,8 @@ impl RelationInvertedIndexes {
6261

6362
for paragraph_addr in iter_paragraphs(data_store) {
6463
let paragraph = data_store.get_paragraph(paragraph_addr);
65-
let fields = decode_field_list_metadata(paragraph.metadata());
66-
67-
for field_key in fields {
68-
field_builder.insert(field_key.bytes().to_vec(), paragraph_addr);
69-
}
64+
let field_key = FieldKey::from_bytes(paragraph.metadata());
65+
field_builder.insert(field_key.bytes().to_vec(), paragraph_addr);
7066
}
7167

7268
let mut map = InvertedMapWriter::new(&work_path.join(file::INDEX_MAP))?;

nidx/nidx_vector/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -21,7 +21,7 @@
2121
pub mod config;
2222
mod data_store;
2323
mod data_types;
24-
mod field_list_metadata;
24+
2525
pub mod formula;
2626
mod hnsw;
2727
mod indexer;

0 commit comments

Comments
 (0)