From b0e407578def9a92a54574ba2df68f34cc29a13d Mon Sep 17 00:00:00 2001
From: Sergey Nikolaev <prostuda@academ.org>
Date: Fri, 17 Apr 2026 11:19:58 +0700
Subject: [PATCH 1/6] feat(embeddings): support passthrough remote model ids

1. Allow explicit provider-prefixed passthrough model ids for remote endpoints
- keep the existing slash-prefixed forms (openai/..., voyage/..., jina/...) working as before
- add explicit colon-prefixed forms (openai:..., voyage:..., jina:...)
- when the colon form is used, pass the model id through after stripping only the provider prefix
- this allows OpenAI-compatible custom endpoints to receive full upstream model ids unchanged, for example:
  - openai:openai/text-embedding-ada-002
  - openai:jinaai/jina-embeddings-v3
- preserve strict built-in validation for default provider endpoints while allowing passthrough mode for custom API_URL-based setups

2. Allow CMake to pass optional cargo features to the embeddings crate
- add EMBEDDINGS_CARGO_FEATURE_ARGS in cmake/build_embeddings.cmake
- if EMBEDDINGS_CARGO_FEATURES is set, convert it to a valid cargo CLI fragment:
  --features <value>
- this makes it possible to configure builds such as download-ort from the CMake side without hard-coding the flag in the build script

Additional remote-model adjustment:
- cache inferred embedding dimensionality in remote providers so passthrough/custom models can learn their vector dimension from a successful response instead of requiring a built-in static mapping
- apply that caching approach consistently across OpenAI, Voyage, and Jina
---
 cmake/build_embeddings.cmake              | 31 +++++----
 embeddings/src/model/create_model_test.rs | 77 +++++++++++++++++++++++
 embeddings/src/model/jina.rs              | 73 +++++++++++++++------
 embeddings/src/model/mod.rs               | 66 ++++++++++++++-----
 embeddings/src/model/openai.rs            | 54 +++++++++++++---
 embeddings/src/model/voyage.rs            | 65 ++++++++++++++-----
 embeddings/src/utils.rs                   | 11 +++-
 7 files changed, 305 insertions(+), 72 deletions(-)
 create mode 100644 embeddings/src/model/create_model_test.rs
diff --git a/cmake/build_embeddings.cmake b/cmake/build_embeddings.cmake
index dd035035..5010b098 100644
--- a/cmake/build_embeddings.cmake
+++ b/cmake/build_embeddings.cmake
@@ -50,20 +50,27 @@ function(build_embeddings_lib)
 	set(ENV{GIT_COMMIT_ID} "${GIT_COMMIT_ID}")
 	set(ENV{GIT_TIMESTAMP_ID} "${GIT_TIMESTAMP_ID}")
 
-	# Enable platform-specific BLAS acceleration for candle when available
-	set(EMBEDDINGS_CARGO_FEATURES "")
-	if(APPLE)
-		set(EMBEDDINGS_CARGO_FEATURES "--features" "accelerate")
-	elseif(UNIX)
-		# MKL provides multi-threaded BLAS on Linux; skip if not available
-		execute_process(COMMAND pkg-config --exists mkl-dynamic-lp64-seq RESULT_VARIABLE MKL_FOUND OUTPUT_QUIET ERROR_QUIET)
-		if(MKL_FOUND EQUAL 0)
-			set(EMBEDDINGS_CARGO_FEATURES "--features" "mkl")
-		endif()
-	endif()
+	# EMBEDDINGS_CARGO_FEATURES may be set externally (e.g., parent CMake) to inject
+	# extra cargo features. If unset, default to platform-specific BLAS acceleration
+	# for candle: accelerate on macOS, mkl on Linux when available.
+	if (NOT DEFINED EMBEDDINGS_CARGO_FEATURES OR "${EMBEDDINGS_CARGO_FEATURES}" STREQUAL "")
+		if (APPLE)
+			set(EMBEDDINGS_CARGO_FEATURES "accelerate")
+		elseif (UNIX)
+			execute_process(COMMAND pkg-config --exists mkl-dynamic-lp64-seq RESULT_VARIABLE MKL_FOUND OUTPUT_QUIET ERROR_QUIET)
+			if (MKL_FOUND EQUAL 0)
+				set(EMBEDDINGS_CARGO_FEATURES "mkl")
+			endif ()
+		endif ()
+	endif ()
+
+	set(EMBEDDINGS_CARGO_FEATURE_ARGS "")
+	if (NOT "${EMBEDDINGS_CARGO_FEATURES}" STREQUAL "")
+		set(EMBEDDINGS_CARGO_FEATURE_ARGS --features ${EMBEDDINGS_CARGO_FEATURES})
+	endif ()
 
 	execute_process (
-			COMMAND cargo build --manifest-path ${CMAKE_SOURCE_DIR}/embeddings/Cargo.toml --lib --release ${EMBEDDINGS_CARGO_FEATURES} --target-dir ${CMAKE_CURRENT_BINARY_DIR}/embeddings
+			COMMAND cargo build --manifest-path ${CMAKE_SOURCE_DIR}/embeddings/Cargo.toml --lib --release ${EMBEDDINGS_CARGO_FEATURE_ARGS} --target-dir ${CMAKE_CURRENT_BINARY_DIR}/embeddings
 			RESULT_VARIABLE CMD_RESULT
 	)
 
diff --git a/embeddings/src/model/create_model_test.rs b/embeddings/src/model/create_model_test.rs
new file mode 100644
index 00000000..4e2dd3be
--- /dev/null
+++ b/embeddings/src/model/create_model_test.rs
@@ -0,0 +1,77 @@
+use super::{create_model, Model, ModelOptions};
+
+#[test]
+fn test_create_model_allows_custom_openai_model_when_custom_api_url_is_set() {
+    let model = create_model(ModelOptions {
+        model_id: "openai/rubert-tiny-turbo".to_string(),
+        cache_path: None,
+        api_key: Some("test-key".to_string()),
+        api_url: Some("http://localhost:8080/v1/embeddings".to_string()),
+        api_timeout: None,
+        use_gpu: None,
+    });
+
+    assert!(model.is_ok());
+
+    match model.unwrap() {
+        Model::OpenAI(model) => assert_eq!(model.model, "rubert-tiny-turbo"),
+        _ => panic!("expected OpenAI model"),
+    }
+}
+
+#[test]
+fn test_create_model_with_custom_url_still_uses_prefixed_jina_as_remote_signal() {
+    let model = create_model(ModelOptions {
+        model_id: "jina/custom-model".to_string(),
+        cache_path: None,
+        api_key: Some("test-key".to_string()),
+        api_url: Some("http://localhost:8080/v1/embeddings".to_string()),
+        api_timeout: None,
+        use_gpu: None,
+    });
+
+    assert!(model.is_ok());
+
+    match model.unwrap() {
+        Model::Jina(model) => assert_eq!(model.model, "custom-model"),
+        _ => panic!("expected Jina model"),
+    }
+}
+
+#[test]
+fn test_create_model_supports_explicit_openai_colon_syntax() {
+    let model = create_model(ModelOptions {
+        model_id: "openai:openai/text-embedding-ada-002".to_string(),
+        cache_path: None,
+        api_key: Some("test-key".to_string()),
+        api_url: Some("http://localhost:8080/v1/embeddings".to_string()),
+        api_timeout: None,
+        use_gpu: None,
+    });
+
+    assert!(model.is_ok());
+
+    match model.unwrap() {
+        Model::OpenAI(model) => assert_eq!(model.model, "openai/text-embedding-ada-002"),
+        _ => panic!("expected OpenAI model"),
+    }
+}
+
+#[test]
+fn test_create_model_supports_explicit_openai_colon_syntax_with_simple_model() {
+    let model = create_model(ModelOptions {
+        model_id: "openai:text-embedding-ada-002".to_string(),
+        cache_path: None,
+        api_key: Some("test-key".to_string()),
+        api_url: Some("http://localhost:8080/v1/embeddings".to_string()),
+        api_timeout: None,
+        use_gpu: None,
+    });
+
+    assert!(model.is_ok());
+
+    match model.unwrap() {
+        Model::OpenAI(model) => assert_eq!(model.model, "text-embedding-ada-002"),
+        _ => panic!("expected OpenAI model"),
+    }
+}
diff --git a/embeddings/src/model/jina.rs b/embeddings/src/model/jina.rs
index fa029193..508fa9ac 100644
--- a/embeddings/src/model/jina.rs
+++ b/embeddings/src/model/jina.rs
@@ -1,6 +1,7 @@
-use super::TextModel;
+use super::{ModelValidationMode, TextModel};
 use crate::LibError;
 use reqwest::blocking::Client;
+use std::sync::Mutex;
 
 #[derive(Debug)]
 pub struct JinaModel {
@@ -8,6 +9,7 @@ pub struct JinaModel {
     pub model: String,
     pub api_key: String,
     pub api_url: Option<String>,
+    hidden_size_cache: Mutex<Option<usize>>,
 }
 
 pub fn validate_model(model: &str) -> Result<(), String> {
@@ -50,8 +52,32 @@ impl JinaModel {
         api_url: Option<&str>,
         api_timeout: Option<u64>,
     ) -> Result<Self, Box<dyn std::error::Error>> {
-        let model = model_id.trim_start_matches("jina/").to_string();
-        validate_model(&model).map_err(|_| LibError::RemoteUnsupportedModel { status: None })?;
+        let validation_mode = if api_url.is_some() {
+            ModelValidationMode::Passthrough
+        } else {
+            ModelValidationMode::StrictBuiltInList
+        };
+
+        Self::new_with_validation_mode(model_id, api_key, api_url, api_timeout, validation_mode)
+    }
+
+    pub fn new_with_validation_mode(
+        model_id: &str,
+        api_key: &str,
+        api_url: Option<&str>,
+        api_timeout: Option<u64>,
+        validation_mode: ModelValidationMode,
+    ) -> Result<Self, Box<dyn std::error::Error>> {
+        let model = if let Some(model) = model_id.strip_prefix("jina:") {
+            model.to_string()
+        } else {
+            model_id.trim_start_matches("jina/").to_string()
+        };
+
+        if validation_mode == ModelValidationMode::StrictBuiltInList {
+            validate_model(&model)
+                .map_err(|_| LibError::RemoteUnsupportedModel { status: None })?;
+        }
         // Only validate basic requirements (non-empty, no whitespace)
         // Real validation happens via actual API request in validate_api_key()
         validate_api_key_basic(api_key)
@@ -62,8 +88,26 @@ impl JinaModel {
             model,
             api_key: api_key.to_string(),
             api_url: api_url.map(|s| s.to_string()),
+            hidden_size_cache: Mutex::new(None),
         })
     }
+
+    fn known_hidden_size(&self) -> Option<usize> {
+        match self.model.as_str() {
+            "jina-embeddings-v4" => Some(2048), // 32K context, 2048 dimensions
+            "jina-clip-v2" => Some(1024),       // 8K context, 1024 dimensions, multimodal
+            "jina-embeddings-v3" => Some(1024), // 8K context, 1024 dimensions
+            "jina-colbert-v2" => Some(128),     // Multi-vector model, 8K context
+            "jina-clip-v1" => Some(768),        // 8K context, 768 dimensions, multimodal
+            "jina-colbert-v1-en" => Some(128),  // Multi-vector model, 8K context
+            "jina-embeddings-v2-base-es" => Some(768), // 8K context, 768 dimensions
+            "jina-embeddings-v2-base-code" => Some(768), // 8K context, 768 dimensions
+            "jina-embeddings-v2-base-de" => Some(768), // 8K context, 768 dimensions
+            "jina-embeddings-v2-base-zh" => Some(768), // 8K context, 768 dimensions
+            "jina-embeddings-v2-base-en" => Some(768), // 8K context, 768 dimensions
+            _ => None,
+        }
+    }
 }
 
 impl TextModel for JinaModel {
@@ -254,15 +298,17 @@ impl TextModel for JinaModel {
             }));
         }
 
+        let inferred_dim = embeddings[0].len();
+        *self.hidden_size_cache.lock().unwrap() = Some(inferred_dim);
+
         // Validate embedding dimensions and handle empty individual embeddings
-        let expected_dim = self.get_hidden_size();
         for embedding in embeddings.iter() {
             if embedding.is_empty() {
                 return Err(Box::new(LibError::RemoteHttpError {
                     status: status_code,
                 }));
             }
-            if embedding.len() != expected_dim {
+            if embedding.len() != inferred_dim {
                 // Some models might return different dimensions, but we should validate
                 // For now, we'll be lenient but could add stricter validation later
             }
@@ -272,20 +318,9 @@ impl TextModel for JinaModel {
     }
 
     fn get_hidden_size(&self) -> usize {
-        match self.model.as_str() {
-            "jina-embeddings-v4" => 2048,          // 32K context, 2048 dimensions
-            "jina-clip-v2" => 1024,                // 8K context, 1024 dimensions, multimodal
-            "jina-embeddings-v3" => 1024,          // 8K context, 1024 dimensions
-            "jina-colbert-v2" => 128,              // Multi-vector model, 8K context
-            "jina-clip-v1" => 768,                 // 8K context, 768 dimensions, multimodal
-            "jina-colbert-v1-en" => 128,           // Multi-vector model, 8K context
-            "jina-embeddings-v2-base-es" => 768,   // 8K context, 768 dimensions
-            "jina-embeddings-v2-base-code" => 768, // 8K context, 768 dimensions
-            "jina-embeddings-v2-base-de" => 768,   // 8K context, 768 dimensions
-            "jina-embeddings-v2-base-zh" => 768,   // 8K context, 768 dimensions
-            "jina-embeddings-v2-base-en" => 768,   // 8K context, 768 dimensions
-            _ => panic!("Unknown model"),
-        }
+        self.known_hidden_size()
+            .or_else(|| *self.hidden_size_cache.lock().unwrap())
+            .unwrap_or_else(|| panic!("Unknown model"))
     }
 
     fn get_max_input_len(&self) -> usize {
diff --git a/embeddings/src/model/mod.rs b/embeddings/src/model/mod.rs
index d675cfba..1727e7d4 100644
--- a/embeddings/src/model/mod.rs
+++ b/embeddings/src/model/mod.rs
@@ -19,6 +19,9 @@ mod local_test;
 #[cfg(test)]
 mod ffi_test;
 
+#[cfg(test)]
+mod create_model_test;
+
 use std::error::Error;
 use std::path::PathBuf;
 
@@ -41,6 +44,12 @@ pub struct ModelOptions {
     pub use_gpu: Option<bool>,
 }
 
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum ModelValidationMode {
+    StrictBuiltInList,
+    Passthrough,
+}
+
 /// Unified model enum
 ///
 /// Architecture:
@@ -96,34 +105,55 @@ impl TextModel for Model {
 
 pub fn create_model(options: ModelOptions) -> Result<Model, Box<dyn Error>> {
     let model_id = options.model_id.as_str();
+    let api_key = options.api_key.unwrap_or_default();
+    let api_url = options.api_url;
+    let api_timeout = options.api_timeout;
 
     // Remote providers (HTTP APIs)
-    if model_id.starts_with("openai/") {
-        let model = openai::OpenAIModel::new(
+    if model_id.starts_with("openai:") {
+        let model = openai::OpenAIModel::new_with_validation_mode(
             model_id,
-            options.api_key.unwrap_or_default().as_str(),
-            options.api_url.as_deref(),
-            options.api_timeout,
+            api_key.as_str(),
+            api_url.as_deref(),
+            api_timeout,
+            ModelValidationMode::Passthrough,
         )?;
 
         Ok(Model::OpenAI(Box::new(model)))
-    } else if model_id.starts_with("voyage/") {
-        let model = voyage::VoyageModel::new(
+    } else if model_id.starts_with("openai/") {
+        let model =
+            openai::OpenAIModel::new(model_id, api_key.as_str(), api_url.as_deref(), api_timeout)?;
+
+        Ok(Model::OpenAI(Box::new(model)))
+    } else if model_id.starts_with("voyage:") {
+        let model = voyage::VoyageModel::new_with_validation_mode(
             model_id,
-            options.api_key.unwrap_or_default().as_str(),
-            options.api_url.as_deref(),
-            options.api_timeout,
+            api_key.as_str(),
+            api_url.as_deref(),
+            api_timeout,
+            ModelValidationMode::Passthrough,
         )?;
 
         Ok(Model::Voyage(Box::new(model)))
-    } else if model_id.starts_with("jina/") {
-        let model = jina::JinaModel::new(
+    } else if model_id.starts_with("voyage/") {
+        let model =
+            voyage::VoyageModel::new(model_id, api_key.as_str(), api_url.as_deref(), api_timeout)?;
+
+        Ok(Model::Voyage(Box::new(model)))
+    } else if model_id.starts_with("jina:") {
+        let model = jina::JinaModel::new_with_validation_mode(
             model_id,
-            options.api_key.unwrap_or_default().as_str(),
-            options.api_url.as_deref(),
-            options.api_timeout,
+            api_key.as_str(),
+            api_url.as_deref(),
+            api_timeout,
+            ModelValidationMode::Passthrough,
         )?;
 
+        Ok(Model::Jina(Box::new(model)))
+    } else if model_id.starts_with("jina/") {
+        let model =
+            jina::JinaModel::new(model_id, api_key.as_str(), api_url.as_deref(), api_timeout)?;
+
         Ok(Model::Jina(Box::new(model)))
     } else {
         // Local models - auto-detect architecture from config
@@ -135,7 +165,11 @@ pub fn create_model(options: ModelOptions) -> Result<Model, Box<dyn Error>> {
                 .unwrap_or(String::from(".cache/manticore")),
         );
 
-        let hf_token = options.api_key.as_deref();
+        let hf_token = if api_key.is_empty() {
+            None
+        } else {
+            Some(api_key.as_str())
+        };
         let model = local::LocalModel::new(
             model_id,
             cache_path,
diff --git a/embeddings/src/model/openai.rs b/embeddings/src/model/openai.rs
index b880c116..323592f2 100644
--- a/embeddings/src/model/openai.rs
+++ b/embeddings/src/model/openai.rs
@@ -1,6 +1,7 @@
-use super::TextModel;
+use super::{ModelValidationMode, TextModel};
 use crate::LibError;
 use reqwest::blocking::Client;
+use std::sync::Mutex;
 
 #[derive(Debug)]
 pub struct OpenAIModel {
@@ -8,6 +9,7 @@ pub struct OpenAIModel {
     pub model: String,
     pub api_key: String,
     pub api_url: Option<String>,
+    hidden_size_cache: Mutex<Option<usize>>,
 }
 
 pub fn validate_model(model: &str) -> Result<(), String> {
@@ -40,8 +42,32 @@ impl OpenAIModel {
         api_url: Option<&str>,
         api_timeout: Option<u64>,
     ) -> Result<Self, Box<dyn std::error::Error>> {
-        let model = model_id.trim_start_matches("openai/").to_string();
-        validate_model(&model).map_err(|_| LibError::RemoteUnsupportedModel { status: None })?;
+        let validation_mode = if api_url.is_some() {
+            ModelValidationMode::Passthrough
+        } else {
+            ModelValidationMode::StrictBuiltInList
+        };
+
+        Self::new_with_validation_mode(model_id, api_key, api_url, api_timeout, validation_mode)
+    }
+
+    pub fn new_with_validation_mode(
+        model_id: &str,
+        api_key: &str,
+        api_url: Option<&str>,
+        api_timeout: Option<u64>,
+        validation_mode: ModelValidationMode,
+    ) -> Result<Self, Box<dyn std::error::Error>> {
+        let model = if let Some(model) = model_id.strip_prefix("openai:") {
+            model.to_string()
+        } else {
+            model_id.trim_start_matches("openai/").to_string()
+        };
+
+        if validation_mode == ModelValidationMode::StrictBuiltInList {
+            validate_model(&model)
+                .map_err(|_| LibError::RemoteUnsupportedModel { status: None })?;
+        }
         // Only validate basic requirements (non-empty, no whitespace)
         // Real validation happens via actual API request in validate_api_key()
         validate_api_key_basic(api_key)
@@ -52,8 +78,18 @@ impl OpenAIModel {
             model,
             api_key: api_key.to_string(),
             api_url: api_url.map(|s| s.to_string()),
+            hidden_size_cache: Mutex::new(None),
         })
     }
+
+    fn known_hidden_size(&self) -> Option<usize> {
+        match self.model.as_str() {
+            "text-embedding-ada-002" => Some(1536), // Fixed: was 768, should be 1536
+            "text-embedding-3-small" => Some(1536),
+            "text-embedding-3-large" => Some(3072),
+            _ => None,
+        }
+    }
 }
 
 impl TextModel for OpenAIModel {
@@ -163,16 +199,16 @@ impl TextModel for OpenAIModel {
             }));
         }
 
+        *self.hidden_size_cache.lock().unwrap() =
+            embeddings.first().map(|embedding| embedding.len());
+
         Ok(embeddings)
     }
 
     fn get_hidden_size(&self) -> usize {
-        match self.model.as_str() {
-            "text-embedding-ada-002" => 1536, // Fixed: was 768, should be 1536
-            "text-embedding-3-small" => 1536,
-            "text-embedding-3-large" => 3072,
-            _ => panic!("Unknown model"),
-        }
+        self.known_hidden_size()
+            .or_else(|| *self.hidden_size_cache.lock().unwrap())
+            .unwrap_or_else(|| panic!("Unknown model"))
     }
 
     fn get_max_input_len(&self) -> usize {
diff --git a/embeddings/src/model/voyage.rs b/embeddings/src/model/voyage.rs
index 1374dd4e..353bd69f 100644
--- a/embeddings/src/model/voyage.rs
+++ b/embeddings/src/model/voyage.rs
@@ -1,6 +1,7 @@
-use super::TextModel;
+use super::{ModelValidationMode, TextModel};
 use crate::LibError;
 use reqwest::blocking::Client;
+use std::sync::Mutex;
 
 #[derive(Debug)]
 pub struct VoyageModel {
@@ -8,6 +9,7 @@ pub struct VoyageModel {
     pub model: String,
     pub api_key: String,
     pub api_url: Option<String>,
+    hidden_size_cache: Mutex<Option<usize>>,
 }
 
 pub fn validate_model(model: &str) -> Result<(), String> {
@@ -41,8 +43,32 @@ impl VoyageModel {
         api_url: Option<&str>,
         api_timeout: Option<u64>,
     ) -> Result<Self, Box<dyn std::error::Error>> {
-        let model = model_id.trim_start_matches("voyage/").to_string();
-        validate_model(&model).map_err(|_| LibError::RemoteUnsupportedModel { status: None })?;
+        let validation_mode = if api_url.is_some() {
+            ModelValidationMode::Passthrough
+        } else {
+            ModelValidationMode::StrictBuiltInList
+        };
+
+        Self::new_with_validation_mode(model_id, api_key, api_url, api_timeout, validation_mode)
+    }
+
+    pub fn new_with_validation_mode(
+        model_id: &str,
+        api_key: &str,
+        api_url: Option<&str>,
+        api_timeout: Option<u64>,
+        validation_mode: ModelValidationMode,
+    ) -> Result<Self, Box<dyn std::error::Error>> {
+        let model = if let Some(model) = model_id.strip_prefix("voyage:") {
+            model.to_string()
+        } else {
+            model_id.trim_start_matches("voyage/").to_string()
+        };
+
+        if validation_mode == ModelValidationMode::StrictBuiltInList {
+            validate_model(&model)
+                .map_err(|_| LibError::RemoteUnsupportedModel { status: None })?;
+        }
         // Only validate basic requirements (non-empty, no whitespace)
         // Real validation happens via actual API request in validate_api_key()
         validate_api_key_basic(api_key)
@@ -53,8 +79,22 @@ impl VoyageModel {
             model,
             api_key: api_key.to_string(),
             api_url: api_url.map(|s| s.to_string()),
+            hidden_size_cache: Mutex::new(None),
         })
     }
+
+    fn known_hidden_size(&self) -> Option<usize> {
+        match self.model.as_str() {
+            "voyage-3-large" => Some(1024), // Default 1024, supports 256, 512, 2048
+            "voyage-3.5" => Some(1024),     // Default 1024, supports 256, 512, 2048
+            "voyage-3.5-lite" => Some(1024), // Default 1024, supports 256, 512, 2048
+            "voyage-code-3" => Some(1024),  // Default 1024, supports 256, 512, 2048
+            "voyage-finance-2" => Some(1024),
+            "voyage-law-2" => Some(1024),
+            "voyage-code-2" => Some(1536),
+            _ => None,
+        }
+    }
 }
 
 impl TextModel for VoyageModel {
@@ -175,15 +215,17 @@ impl TextModel for VoyageModel {
             }));
         }
 
+        let inferred_dim = embeddings[0].len();
+        *self.hidden_size_cache.lock().unwrap() = Some(inferred_dim);
+
         // Validate embedding dimensions and handle empty individual embeddings
-        let expected_dim = self.get_hidden_size();
         for embedding in embeddings.iter() {
             if embedding.is_empty() {
                 return Err(Box::new(LibError::RemoteHttpError {
                     status: status_code,
                 }));
             }
-            if embedding.len() != expected_dim {
+            if embedding.len() != inferred_dim {
                 // Some models might return different dimensions, but we should validate
                 // For now, we'll be lenient but could add stricter validation later
             }
@@ -193,16 +235,9 @@ impl TextModel for VoyageModel {
     }
 
     fn get_hidden_size(&self) -> usize {
-        match self.model.as_str() {
-            "voyage-3-large" => 1024,  // Default 1024, supports 256, 512, 2048
-            "voyage-3.5" => 1024,      // Default 1024, supports 256, 512, 2048
-            "voyage-3.5-lite" => 1024, // Default 1024, supports 256, 512, 2048
-            "voyage-code-3" => 1024,   // Default 1024, supports 256, 512, 2048
-            "voyage-finance-2" => 1024,
-            "voyage-law-2" => 1024,
-            "voyage-code-2" => 1536,
-            _ => panic!("Unknown model"),
-        }
+        self.known_hidden_size()
+            .or_else(|| *self.hidden_size_cache.lock().unwrap())
+            .unwrap_or_else(|| panic!("Unknown model"))
     }
 
     fn get_max_input_len(&self) -> usize {
diff --git a/embeddings/src/utils.rs b/embeddings/src/utils.rs
index e2423f88..294baa28 100644
--- a/embeddings/src/utils.rs
+++ b/embeddings/src/utils.rs
@@ -5,6 +5,15 @@ use serde_json::Value;
 /// Most tokenizers average 3–5 bytes/token; 8 covers worst-case (CJK, emoji).
 const BYTES_PER_TOKEN_UPPER_BOUND: usize = 8;
 
+#[inline]
+fn floor_char_boundary(text: &str, index: usize) -> usize {
+    let mut i = index.min(text.len());
+    while i > 0 && !text.is_char_boundary(i) {
+        i -= 1;
+    }
+    i
+}
+
 /// Pre-truncate text to avoid running BPE on excessively long input.
 /// Cuts at a valid UTF-8 char boundary with a safe byte margin.
 /// `truncate_tokens` remains the final guarantee on token count.
@@ -14,7 +23,7 @@ pub fn pre_truncate_text(text: &str, max_seq_len: usize) -> &str {
     if text.len() <= byte_limit {
         text
     } else {
-        &text[..text.floor_char_boundary(byte_limit)]
+        &text[..floor_char_boundary(text, byte_limit)]
     }
 }
 

From 1c0a3b6744dc77a412766d95ad335648af8fb9d6 Mon Sep 17 00:00:00 2001
From: Don Hardman <don.hardman@muvon.io>
Date: Sun, 24 May 2026 15:45:36 +0300
Subject: [PATCH 2/6] refactor(embeddings): use native floor_char_boundary

---
 embeddings/src/utils.rs | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/embeddings/src/utils.rs b/embeddings/src/utils.rs
index 294baa28..e2423f88 100644
--- a/embeddings/src/utils.rs
+++ b/embeddings/src/utils.rs
@@ -5,15 +5,6 @@ use serde_json::Value;
 /// Most tokenizers average 3–5 bytes/token; 8 covers worst-case (CJK, emoji).
 const BYTES_PER_TOKEN_UPPER_BOUND: usize = 8;
 
-#[inline]
-fn floor_char_boundary(text: &str, index: usize) -> usize {
-    let mut i = index.min(text.len());
-    while i > 0 && !text.is_char_boundary(i) {
-        i -= 1;
-    }
-    i
-}
-
 /// Pre-truncate text to avoid running BPE on excessively long input.
 /// Cuts at a valid UTF-8 char boundary with a safe byte margin.
 /// `truncate_tokens` remains the final guarantee on token count.
@@ -23,7 +14,7 @@ pub fn pre_truncate_text(text: &str, max_seq_len: usize) -> &str {
     if text.len() <= byte_limit {
         text
     } else {
-        &text[..floor_char_boundary(text, byte_limit)]
+        &text[..text.floor_char_boundary(byte_limit)]
     }
 }
 

From 1f689aed12e1799caf607c8d867e471396f84be4 Mon Sep 17 00:00:00 2001
From: Don Hardman <don.hardman@muvon.io>
Date: Sun, 24 May 2026 16:06:50 +0300
Subject: [PATCH 3/6] refactor(embeddings): optimize hidden size initialization

- Replace Mutex with OnceLock for thread-safe hidden size caching
- Probe remote models during construction to ensure dimensions are known
- Add catch_unwind to get_hidden_size FFI wrapper to prevent crashes
- Remove runtime panics and lock overhead in favor of early validation
---
 embeddings/src/model/jina.rs               | 30 +++++++++++++++------
 embeddings/src/model/openai.rs             | 31 +++++++++++++++-------
 embeddings/src/model/text_model_wrapper.rs | 14 ++++++----
 embeddings/src/model/voyage.rs             | 28 +++++++++++++------
 4 files changed, 73 insertions(+), 30 deletions(-)

diff --git a/embeddings/src/model/jina.rs b/embeddings/src/model/jina.rs
index 508fa9ac..92aad481 100644
--- a/embeddings/src/model/jina.rs
+++ b/embeddings/src/model/jina.rs
@@ -1,7 +1,7 @@
 use super::{ModelValidationMode, TextModel};
 use crate::LibError;
 use reqwest::blocking::Client;
-use std::sync::Mutex;
+use std::sync::OnceLock;
 
 #[derive(Debug)]
 pub struct JinaModel {
@@ -9,7 +9,7 @@ pub struct JinaModel {
     pub model: String,
     pub api_key: String,
     pub api_url: Option<String>,
-    hidden_size_cache: Mutex<Option<usize>>,
+    hidden_size_cache: OnceLock<usize>,
 }
 
 pub fn validate_model(model: &str) -> Result<(), String> {
@@ -83,13 +83,22 @@ impl JinaModel {
         validate_api_key_basic(api_key)
             .map_err(|_| LibError::RemoteInvalidAPIKey { status: None })?;
         let timeout_duration = api_timeout.map(std::time::Duration::from_secs);
-        Ok(Self {
+        let model = Self {
             client: Client::builder().timeout(timeout_duration).build()?,
             model,
             api_key: api_key.to_string(),
             api_url: api_url.map(|s| s.to_string()),
-            hidden_size_cache: Mutex::new(None),
-        })
+            hidden_size_cache: OnceLock::new(),
+        };
+        // Enforce the invariant: by the time the model is handed back, the
+        // hidden size is known. Built-in models have it hardcoded; passthrough
+        // models need one API round-trip to learn it. predict() populates the
+        // OnceLock on success. If probing fails the caller never gets a
+        // partially initialized model.
+        if model.known_hidden_size().is_none() {
+            model.predict(&["probe"])?;
+        }
+        Ok(model)
     }
 
     fn known_hidden_size(&self) -> Option<usize> {
@@ -299,7 +308,7 @@ impl TextModel for JinaModel {
         }
 
         let inferred_dim = embeddings[0].len();
-        *self.hidden_size_cache.lock().unwrap() = Some(inferred_dim);
+        let _ = self.hidden_size_cache.set(inferred_dim);
 
         // Validate embedding dimensions and handle empty individual embeddings
         for embedding in embeddings.iter() {
@@ -318,9 +327,14 @@ impl TextModel for JinaModel {
     }
 
     fn get_hidden_size(&self) -> usize {
+        // Invariant: cache is populated by new_with_validation_mode() — either
+        // implicitly via known_hidden_size() for built-ins or explicitly via a
+        // probe predict() for passthrough models. A miss here is a construction
+        // bug; the catch_unwind at the FFI boundary stops the panic from
+        // crossing into C++.
         self.known_hidden_size()
-            .or_else(|| *self.hidden_size_cache.lock().unwrap())
-            .unwrap_or_else(|| panic!("Unknown model"))
+            .or_else(|| self.hidden_size_cache.get().copied())
+            .expect("hidden size must be populated during model construction")
     }
 
     fn get_max_input_len(&self) -> usize {
diff --git a/embeddings/src/model/openai.rs b/embeddings/src/model/openai.rs
index 323592f2..3efe1e3b 100644
--- a/embeddings/src/model/openai.rs
+++ b/embeddings/src/model/openai.rs
@@ -1,7 +1,7 @@
 use super::{ModelValidationMode, TextModel};
 use crate::LibError;
 use reqwest::blocking::Client;
-use std::sync::Mutex;
+use std::sync::OnceLock;
 
 #[derive(Debug)]
 pub struct OpenAIModel {
@@ -9,7 +9,7 @@ pub struct OpenAIModel {
     pub model: String,
     pub api_key: String,
     pub api_url: Option<String>,
-    hidden_size_cache: Mutex<Option<usize>>,
+    hidden_size_cache: OnceLock<usize>,
 }
 
 pub fn validate_model(model: &str) -> Result<(), String> {
@@ -73,13 +73,22 @@ impl OpenAIModel {
         validate_api_key_basic(api_key)
             .map_err(|_| LibError::RemoteInvalidAPIKey { status: None })?;
         let timeout_duration = api_timeout.map(std::time::Duration::from_secs);
-        Ok(Self {
+        let model = Self {
             client: Client::builder().timeout(timeout_duration).build()?,
             model,
             api_key: api_key.to_string(),
             api_url: api_url.map(|s| s.to_string()),
-            hidden_size_cache: Mutex::new(None),
-        })
+            hidden_size_cache: OnceLock::new(),
+        };
+        // Enforce the invariant: by the time the model is handed back, the
+        // hidden size is known. Built-in models have it hardcoded; passthrough
+        // models need one API round-trip to learn it. predict() populates the
+        // OnceLock on success. If probing fails the caller never gets a
+        // partially initialized model.
+        if model.known_hidden_size().is_none() {
+            model.predict(&["probe"])?;
+        }
+        Ok(model)
     }
 
     fn known_hidden_size(&self) -> Option<usize> {
@@ -199,16 +208,20 @@ impl TextModel for OpenAIModel {
             }));
         }
 
-        *self.hidden_size_cache.lock().unwrap() =
-            embeddings.first().map(|embedding| embedding.len());
+        if let Some(dim) = embeddings.first().map(|e| e.len()) {
+            let _ = self.hidden_size_cache.set(dim);
+        }
 
         Ok(embeddings)
     }
 
     fn get_hidden_size(&self) -> usize {
+        // Invariant: cache populated during new_with_validation_mode().
+        // A miss here is a construction bug; catch_unwind at the FFI
+        // boundary stops the panic from crossing into C++.
         self.known_hidden_size()
-            .or_else(|| *self.hidden_size_cache.lock().unwrap())
-            .unwrap_or_else(|| panic!("Unknown model"))
+            .or_else(|| self.hidden_size_cache.get().copied())
+            .expect("hidden size must be populated during model construction")
     }
 
     fn get_max_input_len(&self) -> usize {
diff --git a/embeddings/src/model/text_model_wrapper.rs b/embeddings/src/model/text_model_wrapper.rs
index 9dd799c4..75252e10 100644
--- a/embeddings/src/model/text_model_wrapper.rs
+++ b/embeddings/src/model/text_model_wrapper.rs
@@ -318,11 +318,15 @@ impl TextModelWrapper {
     }
 
     pub extern "C" fn get_hidden_size(&self) -> usize {
-        // No error channel here; return 0 on a bad handle so the C++ caller
-        // sees an obviously-wrong dimension instead of UB. The handle is
-        // already validated before any real work, so a 0 here means the C++
-        // side handed us an invalid pointer.
-        self.as_model().map(|m| m.get_hidden_size()).unwrap_or(0)
+        // No error channel here; return 0 on a bad handle or unwind so the
+        // C++ caller sees an obviously-wrong dimension instead of UB. The
+        // remote model impls already return 0 instead of panicking when the
+        // dim is unknown; catch_unwind is a defense-in-depth guard so any
+        // future panic on this path can never unwind across FFI.
+        catch_unwind(AssertUnwindSafe(|| {
+            self.as_model().map(|m| m.get_hidden_size()).unwrap_or(0)
+        }))
+        .unwrap_or(0)
     }
 
     pub extern "C" fn get_max_input_len(&self) -> usize {
diff --git a/embeddings/src/model/voyage.rs b/embeddings/src/model/voyage.rs
index 353bd69f..3ed5fc03 100644
--- a/embeddings/src/model/voyage.rs
+++ b/embeddings/src/model/voyage.rs
@@ -1,7 +1,7 @@
 use super::{ModelValidationMode, TextModel};
 use crate::LibError;
 use reqwest::blocking::Client;
-use std::sync::Mutex;
+use std::sync::OnceLock;
 
 #[derive(Debug)]
 pub struct VoyageModel {
@@ -9,7 +9,7 @@ pub struct VoyageModel {
     pub model: String,
     pub api_key: String,
     pub api_url: Option<String>,
-    hidden_size_cache: Mutex<Option<usize>>,
+    hidden_size_cache: OnceLock<usize>,
 }
 
 pub fn validate_model(model: &str) -> Result<(), String> {
@@ -74,13 +74,22 @@ impl VoyageModel {
         validate_api_key_basic(api_key)
             .map_err(|_| LibError::RemoteInvalidAPIKey { status: None })?;
         let timeout_duration = api_timeout.map(std::time::Duration::from_secs);
-        Ok(Self {
+        let model = Self {
             client: Client::builder().timeout(timeout_duration).build()?,
             model,
             api_key: api_key.to_string(),
             api_url: api_url.map(|s| s.to_string()),
-            hidden_size_cache: Mutex::new(None),
-        })
+            hidden_size_cache: OnceLock::new(),
+        };
+        // Enforce the invariant: by the time the model is handed back, the
+        // hidden size is known. Built-in models have it hardcoded; passthrough
+        // models need one API round-trip to learn it. predict() populates the
+        // OnceLock on success. If probing fails the caller never gets a
+        // partially initialized model.
+        if model.known_hidden_size().is_none() {
+            model.predict(&["probe"])?;
+        }
+        Ok(model)
     }
 
     fn known_hidden_size(&self) -> Option<usize> {
@@ -216,7 +225,7 @@ impl TextModel for VoyageModel {
         }
 
         let inferred_dim = embeddings[0].len();
-        *self.hidden_size_cache.lock().unwrap() = Some(inferred_dim);
+        let _ = self.hidden_size_cache.set(inferred_dim);
 
         // Validate embedding dimensions and handle empty individual embeddings
         for embedding in embeddings.iter() {
@@ -235,9 +244,12 @@ impl TextModel for VoyageModel {
     }
 
     fn get_hidden_size(&self) -> usize {
+        // Invariant: cache populated during new_with_validation_mode().
+        // A miss here is a construction bug; catch_unwind at the FFI
+        // boundary stops the panic from crossing into C++.
         self.known_hidden_size()
-            .or_else(|| *self.hidden_size_cache.lock().unwrap())
-            .unwrap_or_else(|| panic!("Unknown model"))
+            .or_else(|| self.hidden_size_cache.get().copied())
+            .expect("hidden size must be populated during model construction")
     }
 
     fn get_max_input_len(&self) -> usize {

From 69d031b57f79567afa9086565321cf75608d4ae9 Mon Sep 17 00:00:00 2001
From: Don Hardman <don.hardman@muvon.io>
Date: Sun, 24 May 2026 16:25:24 +0300
Subject: [PATCH 4/6] refactor(embeddings): remove eager hidden size probing

- Remove automatic probe request during model initialization
- Update Jina, OpenAI, and Voyage model constructors
- Adjust corresponding test expectations for hidden size panics
---
 embeddings/src/model/jina.rs        | 13 ++-----------
 embeddings/src/model/jina_test.rs   |  2 +-
 embeddings/src/model/openai.rs      | 13 ++-----------
 embeddings/src/model/openai_test.rs |  2 +-
 embeddings/src/model/voyage.rs      | 13 ++-----------
 embeddings/src/model/voyage_test.rs |  2 +-
 6 files changed, 9 insertions(+), 36 deletions(-)

diff --git a/embeddings/src/model/jina.rs b/embeddings/src/model/jina.rs
index 92aad481..d09067d5 100644
--- a/embeddings/src/model/jina.rs
+++ b/embeddings/src/model/jina.rs
@@ -83,22 +83,13 @@ impl JinaModel {
         validate_api_key_basic(api_key)
             .map_err(|_| LibError::RemoteInvalidAPIKey { status: None })?;
         let timeout_duration = api_timeout.map(std::time::Duration::from_secs);
-        let model = Self {
+        Ok(Self {
             client: Client::builder().timeout(timeout_duration).build()?,
             model,
             api_key: api_key.to_string(),
             api_url: api_url.map(|s| s.to_string()),
             hidden_size_cache: OnceLock::new(),
-        };
-        // Enforce the invariant: by the time the model is handed back, the
-        // hidden size is known. Built-in models have it hardcoded; passthrough
-        // models need one API round-trip to learn it. predict() populates the
-        // OnceLock on success. If probing fails the caller never gets a
-        // partially initialized model.
-        if model.known_hidden_size().is_none() {
-            model.predict(&["probe"])?;
-        }
-        Ok(model)
+        })
     }
 
     fn known_hidden_size(&self) -> Option<usize> {
diff --git a/embeddings/src/model/jina_test.rs b/embeddings/src/model/jina_test.rs
index 3c6a2896..0efb6682 100644
--- a/embeddings/src/model/jina_test.rs
+++ b/embeddings/src/model/jina_test.rs
@@ -88,7 +88,7 @@ mod tests {
         }
     }
 
-    #[should_panic(expected = "Unknown model")]
+    #[should_panic(expected = "hidden size must be populated during model construction")]
     #[test]
     fn test_get_hidden_size_unknown_model() {
         // This test verifies the panic behavior for unknown models
diff --git a/embeddings/src/model/openai.rs b/embeddings/src/model/openai.rs
index 3efe1e3b..3a7922f3 100644
--- a/embeddings/src/model/openai.rs
+++ b/embeddings/src/model/openai.rs
@@ -73,22 +73,13 @@ impl OpenAIModel {
         validate_api_key_basic(api_key)
             .map_err(|_| LibError::RemoteInvalidAPIKey { status: None })?;
         let timeout_duration = api_timeout.map(std::time::Duration::from_secs);
-        let model = Self {
+        Ok(Self {
             client: Client::builder().timeout(timeout_duration).build()?,
             model,
             api_key: api_key.to_string(),
             api_url: api_url.map(|s| s.to_string()),
             hidden_size_cache: OnceLock::new(),
-        };
-        // Enforce the invariant: by the time the model is handed back, the
-        // hidden size is known. Built-in models have it hardcoded; passthrough
-        // models need one API round-trip to learn it. predict() populates the
-        // OnceLock on success. If probing fails the caller never gets a
-        // partially initialized model.
-        if model.known_hidden_size().is_none() {
-            model.predict(&["probe"])?;
-        }
-        Ok(model)
+        })
     }
 
     fn known_hidden_size(&self) -> Option<usize> {
diff --git a/embeddings/src/model/openai_test.rs b/embeddings/src/model/openai_test.rs
index 8a24f8e0..56e2ed10 100644
--- a/embeddings/src/model/openai_test.rs
+++ b/embeddings/src/model/openai_test.rs
@@ -158,7 +158,7 @@ mod tests {
     }
 
     #[test]
-    #[should_panic(expected = "Unknown model")]
+    #[should_panic(expected = "hidden size must be populated during model construction")]
     fn test_get_hidden_size_unknown_model() {
         // This test verifies the panic behavior for unknown models
         // In practice, this shouldn't happen due to validation in new()
diff --git a/embeddings/src/model/voyage.rs b/embeddings/src/model/voyage.rs
index 3ed5fc03..b56fd57e 100644
--- a/embeddings/src/model/voyage.rs
+++ b/embeddings/src/model/voyage.rs
@@ -74,22 +74,13 @@ impl VoyageModel {
         validate_api_key_basic(api_key)
             .map_err(|_| LibError::RemoteInvalidAPIKey { status: None })?;
         let timeout_duration = api_timeout.map(std::time::Duration::from_secs);
-        let model = Self {
+        Ok(Self {
             client: Client::builder().timeout(timeout_duration).build()?,
             model,
             api_key: api_key.to_string(),
             api_url: api_url.map(|s| s.to_string()),
             hidden_size_cache: OnceLock::new(),
-        };
-        // Enforce the invariant: by the time the model is handed back, the
-        // hidden size is known. Built-in models have it hardcoded; passthrough
-        // models need one API round-trip to learn it. predict() populates the
-        // OnceLock on success. If probing fails the caller never gets a
-        // partially initialized model.
-        if model.known_hidden_size().is_none() {
-            model.predict(&["probe"])?;
-        }
-        Ok(model)
+        })
     }
 
     fn known_hidden_size(&self) -> Option<usize> {
diff --git a/embeddings/src/model/voyage_test.rs b/embeddings/src/model/voyage_test.rs
index 1a9a59a2..cae5668b 100644
--- a/embeddings/src/model/voyage_test.rs
+++ b/embeddings/src/model/voyage_test.rs
@@ -177,7 +177,7 @@ mod tests {
     }
 
     #[test]
-    #[should_panic(expected = "Unknown model")]
+    #[should_panic(expected = "hidden size must be populated during model construction")]
     fn test_get_hidden_size_unknown_model() {
         // This test verifies the panic behavior for unknown models
         // In practice, this shouldn't happen due to validation in new()

From b0bc59ebe449f19cae194d0f54ab09da9784bb88 Mon Sep 17 00:00:00 2001
From: Don Hardman <don.hardman@muvon.io>
Date: Sun, 24 May 2026 17:07:09 +0300
Subject: [PATCH 5/6] build(embeddings): simplify BLAS feature selection

---
 cmake/build_embeddings.cmake | 31 ++++++++++++-------------------
 1 file changed, 12 insertions(+), 19 deletions(-)

diff --git a/cmake/build_embeddings.cmake b/cmake/build_embeddings.cmake
index 5010b098..dd035035 100644
--- a/cmake/build_embeddings.cmake
+++ b/cmake/build_embeddings.cmake
@@ -50,27 +50,20 @@ function(build_embeddings_lib)
 	set(ENV{GIT_COMMIT_ID} "${GIT_COMMIT_ID}")
 	set(ENV{GIT_TIMESTAMP_ID} "${GIT_TIMESTAMP_ID}")
 
-	# EMBEDDINGS_CARGO_FEATURES may be set externally (e.g., parent CMake) to inject
-	# extra cargo features. If unset, default to platform-specific BLAS acceleration
-	# for candle: accelerate on macOS, mkl on Linux when available.
-	if (NOT DEFINED EMBEDDINGS_CARGO_FEATURES OR "${EMBEDDINGS_CARGO_FEATURES}" STREQUAL "")
-		if (APPLE)
-			set(EMBEDDINGS_CARGO_FEATURES "accelerate")
-		elseif (UNIX)
-			execute_process(COMMAND pkg-config --exists mkl-dynamic-lp64-seq RESULT_VARIABLE MKL_FOUND OUTPUT_QUIET ERROR_QUIET)
-			if (MKL_FOUND EQUAL 0)
-				set(EMBEDDINGS_CARGO_FEATURES "mkl")
-			endif ()
-		endif ()
-	endif ()
-
-	set(EMBEDDINGS_CARGO_FEATURE_ARGS "")
-	if (NOT "${EMBEDDINGS_CARGO_FEATURES}" STREQUAL "")
-		set(EMBEDDINGS_CARGO_FEATURE_ARGS --features ${EMBEDDINGS_CARGO_FEATURES})
-	endif ()
+	# Enable platform-specific BLAS acceleration for candle when available
+	set(EMBEDDINGS_CARGO_FEATURES "")
+	if(APPLE)
+		set(EMBEDDINGS_CARGO_FEATURES "--features" "accelerate")
+	elseif(UNIX)
+		# MKL provides multi-threaded BLAS on Linux; skip if not available
+		execute_process(COMMAND pkg-config --exists mkl-dynamic-lp64-seq RESULT_VARIABLE MKL_FOUND OUTPUT_QUIET ERROR_QUIET)
+		if(MKL_FOUND EQUAL 0)
+			set(EMBEDDINGS_CARGO_FEATURES "--features" "mkl")
+		endif()
+	endif()
 
 	execute_process (
-			COMMAND cargo build --manifest-path ${CMAKE_SOURCE_DIR}/embeddings/Cargo.toml --lib --release ${EMBEDDINGS_CARGO_FEATURE_ARGS} --target-dir ${CMAKE_CURRENT_BINARY_DIR}/embeddings
+			COMMAND cargo build --manifest-path ${CMAKE_SOURCE_DIR}/embeddings/Cargo.toml --lib --release ${EMBEDDINGS_CARGO_FEATURES} --target-dir ${CMAKE_CURRENT_BINARY_DIR}/embeddings
 			RESULT_VARIABLE CMD_RESULT
 	)
 

From 83f1f61e2d2555248e057f21d052c57f4f481804 Mon Sep 17 00:00:00 2001
From: Don Hardman <don.hardman@muvon.io>
Date: Sun, 24 May 2026 23:38:07 +0300
Subject: [PATCH 6/6] fix(embeddings): ensure tensor contiguity for FFI safety

---
 embeddings/src/model/local.rs | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/embeddings/src/model/local.rs b/embeddings/src/model/local.rs
index 7631d8d2..b63710d2 100644
--- a/embeddings/src/model/local.rs
+++ b/embeddings/src/model/local.rs
@@ -482,7 +482,16 @@ impl BertEmbeddingModel {
                     let summed = emb.sum(1)?.to_dtype(DType::F32)?;
                     let divisor = Tensor::new(seq_len as f32, &self.device)?;
                     let mean_emb = summed.broadcast_div(&divisor)?;
-                    mean_emb.get(0)?.to_vec1::<f32>()?
+                    // .contiguous() forces candle's to_vec1 to take its
+                    // contiguous-offsets path (slice::to_vec, cap == len).
+                    // The strided path uses Iterator::collect, which can
+                    // produce Vec with cap > len from FromIterator growth
+                    // doubling — that would mean the (ptr, len, cap) we
+                    // hand across FFI doesn't match the canonical layout
+                    // glibc expects when Vec::from_raw_parts drops on the
+                    // C++ side via free_vec_result. Eliminate the path
+                    // dependency entirely.
+                    mean_emb.get(0)?.contiguous()?.to_vec1::<f32>()?
                 };
                 normalize(&mut emb_vec);
                 all_embeddings.push(emb_vec);
@@ -527,7 +536,10 @@ impl BertEmbeddingModel {
 
                 let mut out = Vec::with_capacity(batch_size);
                 for i in 0..batch_size {
-                    out.push(mean_emb.get(i)?.to_vec1::<f32>()?);
+                    // See contiguous() rationale on the batch-of-1 fast path
+                    // above — same FFI cap/len invariant requirement applies
+                    // to each row pulled out of the batched mean_emb.
+                    out.push(mean_emb.get(i)?.contiguous()?.to_vec1::<f32>()?);
                 }
                 out
             };
@@ -1236,7 +1248,10 @@ impl TextModel for LocalModel {
                         let summed = emb.sum(1)?.to_dtype(DType::F32)?;
                         let divisor = Tensor::new(seq_len as f32, &m.device)?;
                         let mean_emb = summed.broadcast_div(&divisor)?;
-                        mean_emb.get(0)?.to_vec1::<f32>()?
+                        // See contiguous() rationale on
+                        // BertEmbeddingModel::predict_chunks above. Same FFI
+                        // canonical-layout invariant required here.
+                        mean_emb.get(0)?.contiguous()?.to_vec1::<f32>()?
                     };
                     normalize(&mut emb_vec);
                     return Ok(vec![emb_vec]);
@@ -1361,7 +1376,12 @@ impl TextModel for LocalModel {
                 };
 
                 if let Ok(e_j) = embeddings.get(0) {
+                    // See contiguous() rationale on BertEmbeddingModel above.
+                    // Same FFI canonical-layout invariant for T5 / Causal /
+                    // Quantized sequential output.
                     let emb_vec: Vec<f32> = e_j
+                        .contiguous()
+                        .map_err(|e| -> Box<dyn Error> { Box::new(e) })?
                         .to_vec1::<f32>()
                         .map_err(|e| -> Box<dyn Error> { Box::new(e) })?;
                     let mut emb = emb_vec;