Skip to content

Commit 7af6cdf

Browse files
committed
refactor: centralize cryptographic proof generation and fix Python SDK RAG paths
1 parent cfb08bb commit 7af6cdf

11 files changed

Lines changed: 811 additions & 226 deletions

File tree

all-functions.md

Lines changed: 559 additions & 149 deletions
Large diffs are not rendered by default.

ffi/src/lib.rs

Lines changed: 80 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ use valori_kernel::types::vector::FxpVector;
77
use valori_kernel::types::id::RecordId;
88
use valori_kernel::fxp::ops::from_f32;
99
use valori_kernel::event::KernelEvent;
10+
use valori_kernel::proof::generate_proof_bytes;
1011
use serde_json; // For metadata serialization
1112
use hex; // For hash encoding
1213

@@ -379,6 +380,85 @@ impl ValoriEngine {
379380
Err(pyo3::exceptions::PyRuntimeError::new_err("Event Log not initialized"))
380381
}
381382
}
383+
384+
/// Batch atomic insert with Merkle proofs.
385+
/// Returns a list of (record_id, proof_hex).
386+
#[pyo3(signature = (vectors, tags))]
387+
fn insert_batch_with_proof(&self, vectors: Vec<Vec<f32>>, tags: Vec<u64>) -> PyResult<Vec<(u32, String)>> {
388+
if vectors.len() != tags.len() {
389+
return Err(pyo3::exceptions::PyValueError::new_err("vectors and tags must have the same length"));
390+
}
391+
392+
let mut engine = self.inner.lock().unwrap();
393+
394+
let mut events = Vec::with_capacity(vectors.len());
395+
let mut results = Vec::with_capacity(vectors.len());
396+
let mut temp_used_ids = std::collections::HashSet::new();
397+
let mut next_candidate = 0;
398+
399+
for (vec, &tag) in vectors.iter().zip(tags.iter()) {
400+
if vec.len() != D {
401+
return Err(pyo3::exceptions::PyValueError::new_err(format!("Expected {} dims", D)));
402+
}
403+
404+
let mut fxp_vec = FxpVector::<D>::new_zeros();
405+
let mut fixed_values = Vec::with_capacity(D);
406+
for (i, &f) in vec.iter().enumerate() {
407+
if f < -32767.0 || f > 32767.0 {
408+
return Err(pyo3::exceptions::PyValueError::new_err(format!(
409+
"Float at index {} ({}) outside valid range [-32767.0, 32767.0]", i, f
410+
)));
411+
}
412+
let scalar = from_f32(f);
413+
fxp_vec.data[i] = scalar;
414+
fixed_values.push(scalar.0);
415+
}
416+
417+
let proof_bytes = generate_proof_bytes(&fixed_values);
418+
let proof_hex = hex::encode(&proof_bytes);
419+
420+
let mut found_id = None;
421+
for i in next_candidate..MAX_RECORDS {
422+
let rid = RecordId(i as u32);
423+
if engine.state.get_record(rid).is_none() && !temp_used_ids.contains(&i) {
424+
found_id = Some(rid);
425+
next_candidate = i + 1;
426+
break;
427+
}
428+
}
429+
430+
let rid = found_id.ok_or_else(|| {
431+
pyo3::exceptions::PyRuntimeError::new_err("Capacity Exceeded")
432+
})?;
433+
temp_used_ids.insert(rid.0 as usize);
434+
435+
events.push(KernelEvent::InsertRecord {
436+
id: rid,
437+
vector: fxp_vec,
438+
metadata: Some(proof_bytes),
439+
tag,
440+
});
441+
results.push((rid.0, proof_hex));
442+
}
443+
444+
if let Some(ref mut committer) = engine.event_committer {
445+
match committer.commit_batch(events.clone()) {
446+
Ok(_) => {
447+
for event in &events {
448+
engine.apply_committed_event(event).map_err(|e| {
449+
pyo3::exceptions::PyRuntimeError::new_err(format!("Apply failed: {:?}", e))
450+
})?;
451+
}
452+
Ok(results)
453+
}
454+
Err(e) => Err(pyo3::exceptions::PyRuntimeError::new_err(
455+
format!("Batch commit failed: {:?}", e)
456+
)),
457+
}
458+
} else {
459+
Err(pyo3::exceptions::PyRuntimeError::new_err("Event Log not initialized"))
460+
}
461+
}
382462
}
383463

384464
// ============================================================================
@@ -405,27 +485,6 @@ fn ingest_embedding(floats: Vec<f32>) -> PyResult<Vec<i32>> {
405485
Ok(fixed)
406486
}
407487

408-
/// Internal helper — generates Merkle proof as raw bytes.
409-
/// Single source of truth for Merkle logic.
410-
/// Used by both generate_proof() (hex output) and insert_with_proof() (Record.metadata).
411-
fn generate_proof_bytes(fixed_values: &[i32]) -> Vec<u8> {
412-
let leaves: Vec<[u8; 32]> = fixed_values
413-
.iter()
414-
.enumerate()
415-
.map(|(pos, &val)| {
416-
let mut buf = [0u8; 8];
417-
buf[..4].copy_from_slice(&(pos as u32).to_le_bytes());
418-
buf[4..].copy_from_slice(&val.to_le_bytes());
419-
420-
let mut hasher = blake3::Hasher::new();
421-
hasher.update(&buf);
422-
*hasher.finalize().as_bytes()
423-
})
424-
.collect();
425-
426-
merkle_root(&leaves).to_vec()
427-
}
428-
429488
/// Build a position-aware Merkle tree over Q16.16 integers.
430489
///
431490
/// Each leaf = BLAKE3(position_u32_le || value_i32_le).
@@ -441,25 +500,6 @@ fn generate_proof(fixed_values: Vec<i32>) -> PyResult<String> {
441500
Ok(hex::encode(generate_proof_bytes(&fixed_values)))
442501
}
443502

444-
/// Standard binary Merkle tree. Odd leaf: hashed with itself.
445-
fn merkle_root(leaves: &[[u8; 32]]) -> [u8; 32] {
446-
if leaves.len() == 1 {
447-
return leaves[0];
448-
}
449-
450-
let next_level: Vec<[u8; 32]> = leaves
451-
.chunks(2)
452-
.map(|pair| {
453-
let mut hasher = blake3::Hasher::new();
454-
hasher.update(&pair[0]);
455-
hasher.update(pair.get(1).unwrap_or(&pair[0]));
456-
*hasher.finalize().as_bytes()
457-
})
458-
.collect();
459-
460-
merkle_root(&next_level)
461-
}
462-
463503
/// Verify a float embedding against a claimed proof hash.
464504
///
465505
/// Full pipeline in Rust: f32 → Q16.16 → Merkle → compare.

node/src/engine.rs

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ use valori_kernel::types::enums::{NodeKind, EdgeKind};
99
use valori_kernel::snapshot::{encode::encode_state, decode::decode_state};
1010
// use valori_kernel::fxp::ops::from_f32; // Explicit rounding now preferred
1111
use valori_kernel::verify::{kernel_state_hash, snapshot_hash};
12-
use valori_kernel::proof::DeterministicProof;
12+
use valori_kernel::proof::{DeterministicProof, generate_proof_bytes};
1313

1414
use crate::config::{NodeConfig, IndexKind, QuantizationKind};
1515
use crate::errors::EngineError;
@@ -180,9 +180,11 @@ impl<const MAX_RECORDS: usize, const D: usize, const MAX_NODES: usize, const MAX
180180
// 1. Build FxpVector for Kernel
181181
// STRICT DETERMINISM: Explicit Rounding to Nearest
182182
let mut vector = FxpVector::<D>::new_zeros();
183+
let mut fixed_values = Vec::with_capacity(D);
183184
for (i, v) in values.iter().enumerate() {
184185
let fixed = (v * SCALE).round().clamp(i32::MIN as f32, i32::MAX as f32) as i32;
185186
vector.data[i] = FxpScalar(fixed);
187+
fixed_values.push(fixed);
186188
}
187189

188190
// 2. Determine ID (first free slot strategy)
@@ -199,8 +201,9 @@ impl<const MAX_RECORDS: usize, const D: usize, const MAX_NODES: usize, const MAX
199201
// Phase 23: Event-sourced path (preferred)
200202
if let Some(ref mut committer) = self.event_committer {
201203
let start = std::time::Instant::now();
204+
let proof_bytes = generate_proof_bytes(&fixed_values);
202205
// Generate event (no state change yet)
203-
let event = KernelEvent::InsertRecord { id, vector, metadata: None, tag: 0 };
206+
let event = KernelEvent::InsertRecord { id, vector, metadata: Some(proof_bytes), tag: 0 };
204207

205208
// Commit via event pipeline (shadow → persist → commit → live)
206209
// Clone event for local apply if needed
@@ -238,7 +241,8 @@ impl<const MAX_RECORDS: usize, const D: usize, const MAX_NODES: usize, const MAX
238241
Ok(id.0)
239242
} else {
240243
// Fallback: Legacy WAL path
241-
let cmd = Command::InsertRecord { id, vector, metadata: None, tag: 0 };
244+
let proof_bytes = generate_proof_bytes(&fixed_values);
245+
let cmd = Command::InsertRecord { id, vector, metadata: Some(proof_bytes), tag: 0 };
242246

243247
// Write to WAL FIRST
244248
if let Some(ref mut wal) = self.wal_writer {
@@ -322,12 +326,15 @@ impl<const MAX_RECORDS: usize, const D: usize, const MAX_NODES: usize, const MAX
322326

323327
// Create FxpVector
324328
let mut vector = FxpVector::<D>::new_zeros();
329+
let mut fixed_values = Vec::with_capacity(D);
325330
for (i, v) in values.iter().enumerate() {
326331
let fixed = (v * SCALE).round().clamp(i32::MIN as f32, i32::MAX as f32) as i32;
327332
vector.data[i] = FxpScalar(fixed);
333+
fixed_values.push(fixed);
328334
}
329335

330-
events.push(KernelEvent::InsertRecord { id, vector, metadata: None, tag: 0 });
336+
let proof_bytes = generate_proof_bytes(&fixed_values);
337+
events.push(KernelEvent::InsertRecord { id, vector, metadata: Some(proof_bytes), tag: 0 });
331338
}
332339

333340
let start = std::time::Instant::now();

python/test_insert_with_proof.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,3 +179,25 @@ def test_deterministic_proof_across_engines(self, db_dir):
179179

180180
if __name__ == "__main__":
181181
pytest.main([__file__, "-v"])
182+
183+
def test_insert_batch_with_proof(self, engine):
184+
"""insert_batch_with_proof returns list of tuples and stores metadata."""
185+
np.random.seed(42)
186+
batch = []
187+
for _ in range(3):
188+
emb = np.random.randn(384).astype(np.float32)
189+
emb = (emb / np.linalg.norm(emb)).tolist()
190+
batch.append(emb)
191+
192+
results = engine.insert_batch_with_proof(batch)
193+
194+
assert len(results) == 3
195+
for i, (rid, proof) in enumerate(results):
196+
assert isinstance(rid, int)
197+
assert isinstance(proof, str)
198+
assert len(proof) == 64
199+
200+
# Verify metadata was stored
201+
meta = engine.kernel.get_metadata(rid)
202+
assert meta is not None
203+
assert bytes(meta).hex() == proof

python/valori/adapter.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,41 @@ def insert(
6565
self._id_map[id] = record_id
6666
return proof_hash
6767

68+
def insert_batch(
69+
self,
70+
ids: list[str],
71+
embeddings: list[np.ndarray],
72+
metadata_list: list[dict] = None
73+
) -> list[str]:
74+
"""
75+
Insert a batch into existing DB and generate kernel-backed proofs.
76+
77+
Returns:
78+
List of proof hashes (hex strings).
79+
"""
80+
if metadata_list is None:
81+
metadata_list = [{}] * len(ids)
82+
83+
# 1. Store in existing DB
84+
if hasattr(self.db, "insert_batch"):
85+
self.db.insert_batch(ids, embeddings, metadata_list)
86+
else:
87+
# Fallback for DBs without batch method
88+
for i in range(len(ids)):
89+
self.db.insert(ids[i], embeddings[i], metadata_list[i])
90+
91+
# 2. Single atomic Rust call for the batch
92+
vectors = [emb.flatten().tolist() for emb in embeddings]
93+
results = self._valori.insert_batch_with_proof(vectors)
94+
95+
# 3. Track mapping and collect proofs
96+
proof_hashes = []
97+
for i, (record_id, proof_hash) in enumerate(results):
98+
self._id_map[ids[i]] = record_id
99+
proof_hashes.append(proof_hash)
100+
101+
return proof_hashes
102+
68103
def search(
69104
self,
70105
query_embedding: np.ndarray,

python/valori/adapters/utils.py

Lines changed: 13 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,18 @@
1-
# Copyright (c) 2025 Varshith Gudur. Licensed under AGPLv3.
2-
import numpy as np
1+
import valori
32
from typing import List, Union
43
from ..protocol import ValidationError
54

6-
FXP_MAX = 32767.0
7-
FXP_MIN = -32767.0
8-
SCALE = 1 << 16
9-
10-
def validate_float_range(vec: Union[List[float], np.ndarray]) -> List[float]:
5+
def validate_float_range(vec: Union[List[float], "np.ndarray"]) -> List[float]:
116
"""
12-
Validates and converts a float vector to Q16.16 compatible floats.
13-
Valori kernel expects floats but validates they are within safe range.
14-
Here we ensure they are finite and clamped/checked.
7+
Validates a float vector using the exact Rust kernel validation path.
8+
Guarantees that float validation is identical across Python and Rust.
159
"""
16-
if isinstance(vec, list):
17-
vec = np.array(vec, dtype=np.float64)
18-
19-
if not np.isfinite(vec).all():
20-
raise ValidationError("Embedding contains non-finite values (NaN/Inf)")
21-
22-
if vec.ndim != 1:
23-
raise ValidationError(f"Embedding must be 1D, got {vec.ndim}")
24-
25-
if (vec > FXP_MAX).any() or (vec < FXP_MIN).any():
26-
# Option: clamp or raise? Prompt says "Reject ... > +32767" -> Raise.
27-
raise ValidationError(f"Embedding components must be within [{FXP_MIN}, {FXP_MAX}]")
28-
29-
# Valori protocol currently takes List[float], not int raw.
30-
# The kernel does conversion. We just validate range here.
31-
# Prompt says "Reject if > 32767".
32-
# Return list of floats.
33-
return vec.tolist()
10+
if hasattr(vec, "tolist"):
11+
vec = vec.tolist()
12+
13+
try:
14+
# Call the Rust FFI single source of truth for validation
15+
valori.ingest_embedding(vec)
16+
return vec
17+
except ValueError as e:
18+
raise ValidationError(str(e))

python/valori/local.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@ def __init__(self, path: str = "./valori_db"):
2323
def insert(self, vector: List[float], tag: int = 0) -> int:
2424
return self.kernel.insert(vector, tag)
2525

26+
def insert_with_proof(self, vector: List[float], tag: int = 0) -> tuple:
27+
"""Insert a vector and return its ID and Merkle proof hash."""
28+
return self.kernel.insert_with_proof(vector, tag)
29+
2630
def search(self, query: List[float], k: int, filter_tag: Optional[int] = None) -> List[Dict[str, Any]]:
2731
# FFI returns [(id, score), ...]
2832
hits = self.kernel.search(query, k, filter_tag)
@@ -57,6 +61,20 @@ def insert_batch(self, vectors: List[List[float]]) -> List[int]:
5761
"""
5862
return self.kernel.insert_batch(vectors)
5963

64+
def insert_batch_with_proof(self, vectors: List[List[float]], tags: Optional[List[int]] = None) -> List[tuple]:
65+
"""Insert multiple vectors atomically and generate a proof for each.
66+
67+
Args:
68+
vectors: List of vectors to insert
69+
tags: Optional list of tags. If None, defaults to 0 for all vectors.
70+
71+
Returns:
72+
List of (record_id, proof_hash) tuples
73+
"""
74+
if tags is None:
75+
tags = [0] * len(vectors)
76+
return self.kernel.insert_batch_with_proof(vectors, tags)
77+
6078
def get_metadata(self, record_id: int) -> Optional[bytes]:
6179
"""Get metadata for a record.
6280

0 commit comments

Comments
 (0)