Merge branch 'master' into ae/arbitrary-models

donhardman · donhardman · commit 02730d2e4237 · 2026-05-20T15:13:12.000+03:00
diff --git a/.github/workflows/embedding_build_template.yml b/.github/workflows/embedding_build_template.yml
@@ -298,6 +298,9 @@ jobs:
         env:
           GIT_COMMIT_ID: ${{ steps.git_meta.outputs.commit }}
           GIT_TIMESTAMP_ID: ${{ steps.git_meta.outputs.timestamp }}
+          # Windows: opt for speed (opt-level=3) since size is less critical there;
+          # other targets keep Cargo.toml's opt-level=z for smaller binaries.
+          CARGO_PROFILE_RELEASE_OPT_LEVEL: ${{ inputs.distr == 'windows' && '3' || 'z' }}
 
       - run: | 
           mkdir build
diff --git a/embeddings/Cargo.toml b/embeddings/Cargo.toml
@@ -36,7 +36,7 @@ name = "manticore_knn_embeddings"
 crate-type = ["cdylib"]
 
 [profile.release]
-opt-level = 3
+opt-level = "z"
 codegen-units = 1
 lto = true
 strip = "debuginfo"
diff --git a/embeddings/src/model/local.rs b/embeddings/src/model/local.rs
@@ -401,9 +401,16 @@ pub fn load_tokenizer(path: &PathBuf) -> Result<Tokenizer, Box<dyn Error>> {
     Tokenizer::from_bytes(&bytes).map_err(|_| LibError::ModelTokenizerLoadFailed.into())
 }
 
-/// BERT-style local embedding model
+/// BERT-style local embedding model.
+///
+/// `model` is `Arc<Mutex<BertModel>>` to match T5/Causal/Quantized — candle's
+/// BertModel takes `&self` on forward, but concurrent forward calls produced
+/// flaky crashes in the daemon when multiple INSERTs / queries hit the same
+/// model in parallel. Serialising forward mirrors the other model types. The
+/// lock itself is free in practice (uncontended Mutex is sub-100ns, a BERT
+/// forward is far slower), but parallel callers now run one at a time.
 pub struct BertEmbeddingModel {
-    model: BertModel,
+    model: Arc<Mutex<BertModel>>,
     tokenizer: Tokenizer,
     max_input_len: usize,
     hidden_size: usize,
@@ -440,7 +447,7 @@ impl BertEmbeddingModel {
         let model = BertModel::load(vb, &config).map_err(|_| LibError::ModelLoadFailed)?;
 
         Ok(Self {
-            model,
+            model: Arc::new(Mutex::new(model)),
             tokenizer: tokenizer.clone(),
             max_input_len,
             hidden_size,
@@ -454,6 +461,27 @@ impl BertEmbeddingModel {
         let mut all_embeddings = Vec::with_capacity(chunks.len());
 
         for batch in chunks.chunks(batch_size()) {
+            // Fast path for batch-of-1 (daemon's SELECT KNN(text,...) hot path):
+            // no padding needed, so skip the attention_mask multiply and use a
+            // plain sum/scalar-div mean pool. Matches pre-975b294 behavior.
+            if batch.len() == 1 {
+                let chunk = &batch[0];
+                let token_ids = Tensor::new(chunk.as_slice(), &self.device)?.unsqueeze(0)?;
+                let token_type_ids = token_ids.zeros_like()?;
+                let emb = {
+                    let model = self.model.lock().unwrap();
+                    model.forward(&token_ids, &token_type_ids, None)?
+                };
+                let seq_len = token_ids.dims()[1];
+                let summed = emb.sum(1)?.to_dtype(DType::F32)?;
+                let divisor = Tensor::new(seq_len as f32, &self.device)?;
+                let mean_emb = summed.broadcast_div(&divisor)?;
+                let mut emb_vec: Vec<f32> = mean_emb.get(0)?.to_vec1::<f32>()?;
+                normalize(&mut emb_vec);
+                all_embeddings.push(emb_vec);
+                continue;
+            }
+
             let batch_size = batch.len();
             let max_len = batch.iter().map(|c| c.len()).max().unwrap_or(0);
 
@@ -473,9 +501,10 @@ impl BertEmbeddingModel {
                 Tensor::from_vec(flat_mask.clone(), (batch_size, max_len), &self.device)?;
             let token_type_ids = token_ids.zeros_like()?;
 
-            let emb = self
-                .model
-                .forward(&token_ids, &token_type_ids, Some(&attention_mask))?;
+            let emb = {
+                let model = self.model.lock().unwrap();
+                model.forward(&token_ids, &token_type_ids, Some(&attention_mask))?
+            };
             // emb: [batch_size, max_len, hidden_size]
 
             // Attention-mask-aware mean pooling: sum(emb * mask) / sum(mask)
@@ -1119,15 +1148,22 @@ impl LocalModel {
             .map(|t| pre_truncate_text(t, max_input_len))
             .collect();
 
-        // Enable parallel tokenization via rayon (once)
-        static INIT_PARALLEL: std::sync::Once = std::sync::Once::new();
-        INIT_PARALLEL.call_once(|| {
-            std::env::set_var("TOKENIZERS_PARALLELISM", "true");
-        });
-
-        let encodings = tokenizer
-            .encode_batch(texts, true)
-            .map_err(|_| LibError::ModelTokenizerEncodeFailed)?;
+        // Adaptive tokenization: encode_batch fans out via rayon, which is pure
+        // overhead for small batches. The daemon's SELECT KNN(text,...) hot path
+        // always sends batch=1 — go sequential there. Parallelise only when the
+        // batch is big enough to amortise the rayon dispatch. Threshold mirrors
+        // the ONNX path's "no threading overhead" cutoff.
+        let encodings = if texts.len() > batch_size() {
+            tokenizer
+                .encode_batch(texts, true)
+                .map_err(|_| LibError::ModelTokenizerEncodeFailed)?
+        } else {
+            texts
+                .iter()
+                .map(|t| tokenizer.encode(*t, true))
+                .collect::<Result<Vec<_>, _>>()
+                .map_err(|_| LibError::ModelTokenizerEncodeFailed)?
+        };
 
         let truncated: Vec<Vec<u32>> = encodings
             .iter()
@@ -1151,6 +1187,33 @@ impl TextModel for LocalModel {
         // BERT and ONNX: batched path (batch_size up to batch_size() per forward pass)
         match self {
             LocalModel::Bert(m) => {
+                // Dedicated single-text bypass: SELECT KNN(field, k, 'text') hits this
+                // path on every query. Skip all batching wrappers, intermediate Vecs,
+                // and the chunks.chunks() loop — go straight encode → forward → pool.
+                if texts.len() == 1 {
+                    let text = pre_truncate_text(texts[0], m.max_input_len);
+                    let enc = m
+                        .tokenizer
+                        .encode(text, true)
+                        .map_err(|_| LibError::ModelTokenizerEncodeFailed)?;
+                    let ids = enc.get_ids();
+                    let ids = &ids[..ids.len().min(m.max_input_len)];
+
+                    let token_ids = Tensor::new(ids, &m.device)?.unsqueeze(0)?;
+                    let token_type_ids = token_ids.zeros_like()?;
+                    let emb = {
+                        let model = m.model.lock().unwrap();
+                        model.forward(&token_ids, &token_type_ids, None)?
+                    };
+                    let seq_len = token_ids.dims()[1];
+                    let summed = emb.sum(1)?.to_dtype(DType::F32)?;
+                    let divisor = Tensor::new(seq_len as f32, &m.device)?;
+                    let mean_emb = summed.broadcast_div(&divisor)?;
+                    let mut emb_vec: Vec<f32> = mean_emb.get(0)?.to_vec1::<f32>()?;
+                    normalize(&mut emb_vec);
+                    return Ok(vec![emb_vec]);
+                }
+
                 return Self::predict_batched(&m.tokenizer, m.max_input_len, texts, |chunks| {
                     m.predict_chunks(chunks)
                 });
diff --git a/embeddings/src/model/text_model_wrapper.rs b/embeddings/src/model/text_model_wrapper.rs
@@ -2,6 +2,46 @@ use crate::model::{create_model, Model, ModelOptions, TextModel};
 use std::os::raw::c_char;
 use std::{ffi::c_void, ptr};
 
+/// Sentinel written at offset 0 of every live model handle. Lets FFI entry
+/// points reject most null, freed, or garbage pointers from the C++ caller
+/// with a clean error. Best-effort: the pointer must still be readable.
+const MODEL_MAGIC: u64 = 0xC0FF_EE5E_E7BE_EFDE;
+
+/// Sentinel written over MODEL_MAGIC in `Drop` before the inner fields are
+/// destroyed. Best-effort only: a later call on a freed handle usually sees
+/// DEAD and gets a clean error, but a free racing with a live call is still UB.
+const MODEL_DEAD: u64 = 0xDEAD_DEAD_DEAD_DEAD;
+
+/// Heap-allocated wrapper that the FFI hands to C++ as `*mut c_void`. The C++
+/// side stores the raw pointer and passes it back into every call; we use the
+/// `magic` field to validate that the pointer still references a live handle.
+///
+/// Layout note: `#[repr(C)]` and `magic` as the first field guarantee that the
+/// first 8 bytes of the allocation are the canary, regardless of what the inner
+/// `Model` enum's discriminant looks like.
+#[repr(C)]
+struct ModelHandle {
+    magic: u64,
+    inner: Model,
+}
+
+impl ModelHandle {
+    fn new(inner: Model) -> Self {
+        Self {
+            magic: MODEL_MAGIC,
+            inner,
+        }
+    }
+}
+
+impl Drop for ModelHandle {
+    fn drop(&mut self) {
+        // Tombstone before `inner` drops; volatile so the store to soon-freed
+        // memory is not elided. SAFETY: `&mut self.magic` is valid and aligned.
+        unsafe { std::ptr::write_volatile(&mut self.magic, MODEL_DEAD) };
+    }
+}
+
 /// cbindgen:field-names=[m_pModel, m_szError]
 #[repr(C)]
 pub struct TextModelResult {
@@ -94,7 +134,7 @@ impl TextModelWrapper {
 
         match create_model(options) {
             Ok(model) => TextModelResult {
-                model: Box::into_raw(Box::new(model)) as *mut c_void,
+                model: Box::into_raw(Box::new(ModelHandle::new(model))) as *mut c_void,
                 error: ptr::null_mut(),
             },
             Err(e) => {
@@ -110,7 +150,9 @@ impl TextModelWrapper {
     pub extern "C" fn free_model_result(res: TextModelResult) {
         unsafe {
             if !res.model.is_null() {
-                drop(Box::from_raw(res.model as *mut Model));
+                // Drop runs ModelHandle::drop first (tombstones magic to
+                // MODEL_DEAD), then destroys the inner Model.
+                drop(Box::from_raw(res.model as *mut ModelHandle));
             }
 
             if !res.error.is_null() {
@@ -119,15 +161,45 @@ impl TextModelWrapper {
         }
     }
 
-    fn as_model(&self) -> &Model {
-        unsafe { &*(self.0 as *const Model) }
+    /// Validate the handle pointer before dereferencing. Returns a static error
+    /// string the caller can surface to C++ instead of crashing on a bad ptr.
+    /// Catches null, double-free / freed (MODEL_DEAD), and garbage handles.
+    /// Cannot catch a free that happens mid-call — that requires shared
+    /// ownership on the C++ side and is out of scope here.
+    fn as_model(&self) -> Result<&Model, &'static str> {
+        if self.0.is_null() {
+            return Err("embeddings: model handle is null");
+        }
+        // Read the magic without forming a &ModelHandle reference first — that
+        // would already be UB if the pointer is invalid. This is a plain,
+        // non-atomic read: it does NOT make a concurrent free safe, and it
+        // assumes a non-null pointer is readable and 8-byte aligned.
+        let magic = unsafe { std::ptr::read(self.0 as *const u64) };
+        match magic {
+            MODEL_MAGIC => Ok(unsafe { &(*(self.0 as *const ModelHandle)).inner }),
+            MODEL_DEAD => Err("embeddings: model has been freed (use-after-free)"),
+            _ => Err("embeddings: model handle is corrupted (invalid magic)"),
+        }
     }
 
     pub extern "C" fn make_vect_embeddings(
         &self,
         texts: *const StringItem,
         count: usize,
     ) -> FloatVecResult {
+        let model = match self.as_model() {
+            Ok(m) => m,
+            Err(msg) => {
+                let c_error = std::ffi::CString::new(msg).unwrap();
+                return FloatVecResult {
+                    error: c_error.into_raw(),
+                    ptr: ptr::null(),
+                    len: 0,
+                    cap: 0,
+                };
+            }
+        };
+
         let string_slice = unsafe { std::slice::from_raw_parts(texts, count) };
 
         // Zero-copy: borrow C++ strings directly as &str.
@@ -141,7 +213,6 @@ impl TextModelWrapper {
             .collect();
 
         let mut float_vec_list: Vec<FloatVec> = Vec::new();
-        let model = self.as_model();
         let embeddings_list = model.predict(&string_refs);
         let c_error = match embeddings_list {
             Ok(embeddings_list) => {
@@ -198,18 +269,29 @@ impl TextModelWrapper {
     }
 
     pub extern "C" fn get_hidden_size(&self) -> usize {
-        self.as_model().get_hidden_size()
+        // No error channel here; return 0 on a bad handle so the C++ caller
+        // sees an obviously-wrong dimension instead of UB. `as_model` checks
+        // the handle before the model is touched, so a 0 here means the C++
+        // side handed us a null, freed, or corrupted pointer.
+        self.as_model().map(|m| m.get_hidden_size()).unwrap_or(0)
     }
 
     pub extern "C" fn get_max_input_len(&self) -> usize {
-        self.as_model().get_max_input_len()
+        self.as_model().map(|m| m.get_max_input_len()).unwrap_or(0)
     }
 
     /// Validates the API key by making a minimal test request to the API.
     /// Returns null on success, or an error message string on failure.
     /// The caller is responsible for freeing the error string using free_string().
     pub extern "C" fn validate_api_key(&self) -> *mut c_char {
-        let model = self.as_model();
+        let model = match self.as_model() {
+            Ok(m) => m,
+            Err(msg) => {
+                return std::ffi::CString::new(msg)
+                    .map(|c| c.into_raw())
+                    .unwrap_or(ptr::null_mut());
+            }
+        };
         match model.validate_api_key() {
             Ok(()) => ptr::null_mut(),
             Err(e) => {