From b0e407578def9a92a54574ba2df68f34cc29a13d Mon Sep 17 00:00:00 2001 From: Sergey Nikolaev Date: Fri, 17 Apr 2026 11:19:58 +0700 Subject: [PATCH 1/6] feat(embeddings): support passthrough remote model ids 1. Allow explicit provider-prefixed passthrough model ids for remote endpoints - keep the existing slash-prefixed forms (openai/..., voyage/..., jina/...) working as before - add explicit colon-prefixed forms (openai:..., voyage:..., jina:...) - when the colon form is used, pass the model id through after stripping only the provider prefix - this allows OpenAI-compatible custom endpoints to receive full upstream model ids unchanged, for example: - openai:openai/text-embedding-ada-002 - openai:jinaai/jina-embeddings-v3 - preserve strict built-in validation for default provider endpoints while allowing passthrough mode for custom API_URL-based setups 2. Allow CMake to pass optional cargo features to the embeddings crate - add EMBEDDINGS_CARGO_FEATURE_ARGS in cmake/build_embeddings.cmake - if EMBEDDINGS_CARGO_FEATURES is set, convert it to a valid cargo CLI fragment: --features - this makes it possible to configure builds such as download-ort from the CMake side without hard-coding the flag in the build script Additional remote-model adjustment: - cache inferred embedding dimensionality in remote providers so passthrough/custom models can learn their vector dimension from a successful response instead of requiring a built-in static mapping - apply that caching approach consistently across OpenAI, Voyage, and Jina --- cmake/build_embeddings.cmake | 31 +++++---- embeddings/src/model/create_model_test.rs | 77 +++++++++++++++++++++++ embeddings/src/model/jina.rs | 73 +++++++++++++++------ embeddings/src/model/mod.rs | 66 ++++++++++++++----- embeddings/src/model/openai.rs | 54 +++++++++++++--- embeddings/src/model/voyage.rs | 65 ++++++++++++++----- embeddings/src/utils.rs | 11 +++- 7 files changed, 305 insertions(+), 72 deletions(-) create mode 100644 embeddings/src/model/create_model_test.rs diff --git a/cmake/build_embeddings.cmake b/cmake/build_embeddings.cmake index dd035035..5010b098 100644 --- a/cmake/build_embeddings.cmake +++ b/cmake/build_embeddings.cmake @@ -50,20 +50,27 @@ function(build_embeddings_lib) set(ENV{GIT_COMMIT_ID} "${GIT_COMMIT_ID}") set(ENV{GIT_TIMESTAMP_ID} "${GIT_TIMESTAMP_ID}") - # Enable platform-specific BLAS acceleration for candle when available - set(EMBEDDINGS_CARGO_FEATURES "") - if(APPLE) - set(EMBEDDINGS_CARGO_FEATURES "--features" "accelerate") - elseif(UNIX) - # MKL provides multi-threaded BLAS on Linux; skip if not available - execute_process(COMMAND pkg-config --exists mkl-dynamic-lp64-seq RESULT_VARIABLE MKL_FOUND OUTPUT_QUIET ERROR_QUIET) - if(MKL_FOUND EQUAL 0) - set(EMBEDDINGS_CARGO_FEATURES "--features" "mkl") - endif() - endif() + # EMBEDDINGS_CARGO_FEATURES may be set externally (e.g., parent CMake) to inject + # extra cargo features. If unset, default to platform-specific BLAS acceleration + # for candle: accelerate on macOS, mkl on Linux when available. + if (NOT DEFINED EMBEDDINGS_CARGO_FEATURES OR "${EMBEDDINGS_CARGO_FEATURES}" STREQUAL "") + if (APPLE) + set(EMBEDDINGS_CARGO_FEATURES "accelerate") + elseif (UNIX) + execute_process(COMMAND pkg-config --exists mkl-dynamic-lp64-seq RESULT_VARIABLE MKL_FOUND OUTPUT_QUIET ERROR_QUIET) + if (MKL_FOUND EQUAL 0) + set(EMBEDDINGS_CARGO_FEATURES "mkl") + endif () + endif () + endif () + + set(EMBEDDINGS_CARGO_FEATURE_ARGS "") + if (NOT "${EMBEDDINGS_CARGO_FEATURES}" STREQUAL "") + set(EMBEDDINGS_CARGO_FEATURE_ARGS --features ${EMBEDDINGS_CARGO_FEATURES}) + endif () execute_process ( - COMMAND cargo build --manifest-path ${CMAKE_SOURCE_DIR}/embeddings/Cargo.toml --lib --release ${EMBEDDINGS_CARGO_FEATURES} --target-dir ${CMAKE_CURRENT_BINARY_DIR}/embeddings + COMMAND cargo build --manifest-path ${CMAKE_SOURCE_DIR}/embeddings/Cargo.toml --lib --release ${EMBEDDINGS_CARGO_FEATURE_ARGS} --target-dir ${CMAKE_CURRENT_BINARY_DIR}/embeddings RESULT_VARIABLE CMD_RESULT ) diff --git a/embeddings/src/model/create_model_test.rs b/embeddings/src/model/create_model_test.rs new file mode 100644 index 00000000..4e2dd3be --- /dev/null +++ b/embeddings/src/model/create_model_test.rs @@ -0,0 +1,77 @@ +use super::{create_model, Model, ModelOptions}; + +#[test] +fn test_create_model_allows_custom_openai_model_when_custom_api_url_is_set() { + let model = create_model(ModelOptions { + model_id: "openai/rubert-tiny-turbo".to_string(), + cache_path: None, + api_key: Some("test-key".to_string()), + api_url: Some("http://localhost:8080/v1/embeddings".to_string()), + api_timeout: None, + use_gpu: None, + }); + + assert!(model.is_ok()); + + match model.unwrap() { + Model::OpenAI(model) => assert_eq!(model.model, "rubert-tiny-turbo"), + _ => panic!("expected OpenAI model"), + } +} + +#[test] +fn test_create_model_with_custom_url_still_uses_prefixed_jina_as_remote_signal() { + let model = create_model(ModelOptions { + model_id: "jina/custom-model".to_string(), + cache_path: None, + api_key: Some("test-key".to_string()), + api_url: Some("http://localhost:8080/v1/embeddings".to_string()), + api_timeout: None, + use_gpu: None, + }); + + assert!(model.is_ok()); + + match model.unwrap() { + Model::Jina(model) => assert_eq!(model.model, "custom-model"), + _ => panic!("expected Jina model"), + } +} + +#[test] +fn test_create_model_supports_explicit_openai_colon_syntax() { + let model = create_model(ModelOptions { + model_id: "openai:openai/text-embedding-ada-002".to_string(), + cache_path: None, + api_key: Some("test-key".to_string()), + api_url: Some("http://localhost:8080/v1/embeddings".to_string()), + api_timeout: None, + use_gpu: None, + }); + + assert!(model.is_ok()); + + match model.unwrap() { + Model::OpenAI(model) => assert_eq!(model.model, "openai/text-embedding-ada-002"), + _ => panic!("expected OpenAI model"), + } +} + +#[test] +fn test_create_model_supports_explicit_openai_colon_syntax_with_simple_model() { + let model = create_model(ModelOptions { + model_id: "openai:text-embedding-ada-002".to_string(), + cache_path: None, + api_key: Some("test-key".to_string()), + api_url: Some("http://localhost:8080/v1/embeddings".to_string()), + api_timeout: None, + use_gpu: None, + }); + + assert!(model.is_ok()); + + match model.unwrap() { + Model::OpenAI(model) => assert_eq!(model.model, "text-embedding-ada-002"), + _ => panic!("expected OpenAI model"), + } +} diff --git a/embeddings/src/model/jina.rs b/embeddings/src/model/jina.rs index fa029193..508fa9ac 100644 --- a/embeddings/src/model/jina.rs +++ b/embeddings/src/model/jina.rs @@ -1,6 +1,7 @@ -use super::TextModel; +use super::{ModelValidationMode, TextModel}; use crate::LibError; use reqwest::blocking::Client; +use std::sync::Mutex; #[derive(Debug)] pub struct JinaModel { @@ -8,6 +9,7 @@ pub struct JinaModel { pub model: String, pub api_key: String, pub api_url: Option, + hidden_size_cache: Mutex>, } pub fn validate_model(model: &str) -> Result<(), String> { @@ -50,8 +52,32 @@ impl JinaModel { api_url: Option<&str>, api_timeout: Option, ) -> Result> { - let model = model_id.trim_start_matches("jina/").to_string(); - validate_model(&model).map_err(|_| LibError::RemoteUnsupportedModel { status: None })?; + let validation_mode = if api_url.is_some() { + ModelValidationMode::Passthrough + } else { + ModelValidationMode::StrictBuiltInList + }; + + Self::new_with_validation_mode(model_id, api_key, api_url, api_timeout, validation_mode) + } + + pub fn new_with_validation_mode( + model_id: &str, + api_key: &str, + api_url: Option<&str>, + api_timeout: Option, + validation_mode: ModelValidationMode, + ) -> Result> { + let model = if let Some(model) = model_id.strip_prefix("jina:") { + model.to_string() + } else { + model_id.trim_start_matches("jina/").to_string() + }; + + if validation_mode == ModelValidationMode::StrictBuiltInList { + validate_model(&model) + .map_err(|_| LibError::RemoteUnsupportedModel { status: None })?; + } // Only validate basic requirements (non-empty, no whitespace) // Real validation happens via actual API request in validate_api_key() validate_api_key_basic(api_key) @@ -62,8 +88,26 @@ impl JinaModel { model, api_key: api_key.to_string(), api_url: api_url.map(|s| s.to_string()), + hidden_size_cache: Mutex::new(None), }) } + + fn known_hidden_size(&self) -> Option { + match self.model.as_str() { + "jina-embeddings-v4" => Some(2048), // 32K context, 2048 dimensions + "jina-clip-v2" => Some(1024), // 8K context, 1024 dimensions, multimodal + "jina-embeddings-v3" => Some(1024), // 8K context, 1024 dimensions + "jina-colbert-v2" => Some(128), // Multi-vector model, 8K context + "jina-clip-v1" => Some(768), // 8K context, 768 dimensions, multimodal + "jina-colbert-v1-en" => Some(128), // Multi-vector model, 8K context + "jina-embeddings-v2-base-es" => Some(768), // 8K context, 768 dimensions + "jina-embeddings-v2-base-code" => Some(768), // 8K context, 768 dimensions + "jina-embeddings-v2-base-de" => Some(768), // 8K context, 768 dimensions + "jina-embeddings-v2-base-zh" => Some(768), // 8K context, 768 dimensions + "jina-embeddings-v2-base-en" => Some(768), // 8K context, 768 dimensions + _ => None, + } + } } impl TextModel for JinaModel { @@ -254,15 +298,17 @@ impl TextModel for JinaModel { })); } + let inferred_dim = embeddings[0].len(); + *self.hidden_size_cache.lock().unwrap() = Some(inferred_dim); + // Validate embedding dimensions and handle empty individual embeddings - let expected_dim = self.get_hidden_size(); for embedding in embeddings.iter() { if embedding.is_empty() { return Err(Box::new(LibError::RemoteHttpError { status: status_code, })); } - if embedding.len() != expected_dim { + if embedding.len() != inferred_dim { // Some models might return different dimensions, but we should validate // For now, we'll be lenient but could add stricter validation later } @@ -272,20 +318,9 @@ impl TextModel for JinaModel { } fn get_hidden_size(&self) -> usize { - match self.model.as_str() { - "jina-embeddings-v4" => 2048, // 32K context, 2048 dimensions - "jina-clip-v2" => 1024, // 8K context, 1024 dimensions, multimodal - "jina-embeddings-v3" => 1024, // 8K context, 1024 dimensions - "jina-colbert-v2" => 128, // Multi-vector model, 8K context - "jina-clip-v1" => 768, // 8K context, 768 dimensions, multimodal - "jina-colbert-v1-en" => 128, // Multi-vector model, 8K context - "jina-embeddings-v2-base-es" => 768, // 8K context, 768 dimensions - "jina-embeddings-v2-base-code" => 768, // 8K context, 768 dimensions - "jina-embeddings-v2-base-de" => 768, // 8K context, 768 dimensions - "jina-embeddings-v2-base-zh" => 768, // 8K context, 768 dimensions - "jina-embeddings-v2-base-en" => 768, // 8K context, 768 dimensions - _ => panic!("Unknown model"), - } + self.known_hidden_size() + .or_else(|| *self.hidden_size_cache.lock().unwrap()) + .unwrap_or_else(|| panic!("Unknown model")) } fn get_max_input_len(&self) -> usize { diff --git a/embeddings/src/model/mod.rs b/embeddings/src/model/mod.rs index d675cfba..1727e7d4 100644 --- a/embeddings/src/model/mod.rs +++ b/embeddings/src/model/mod.rs @@ -19,6 +19,9 @@ mod local_test; #[cfg(test)] mod ffi_test; +#[cfg(test)] +mod create_model_test; + use std::error::Error; use std::path::PathBuf; @@ -41,6 +44,12 @@ pub struct ModelOptions { pub use_gpu: Option, } +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ModelValidationMode { + StrictBuiltInList, + Passthrough, +} + /// Unified model enum /// /// Architecture: @@ -96,34 +105,55 @@ impl TextModel for Model { pub fn create_model(options: ModelOptions) -> Result> { let model_id = options.model_id.as_str(); + let api_key = options.api_key.unwrap_or_default(); + let api_url = options.api_url; + let api_timeout = options.api_timeout; // Remote providers (HTTP APIs) - if model_id.starts_with("openai/") { - let model = openai::OpenAIModel::new( + if model_id.starts_with("openai:") { + let model = openai::OpenAIModel::new_with_validation_mode( model_id, - options.api_key.unwrap_or_default().as_str(), - options.api_url.as_deref(), - options.api_timeout, + api_key.as_str(), + api_url.as_deref(), + api_timeout, + ModelValidationMode::Passthrough, )?; Ok(Model::OpenAI(Box::new(model))) - } else if model_id.starts_with("voyage/") { - let model = voyage::VoyageModel::new( + } else if model_id.starts_with("openai/") { + let model = + openai::OpenAIModel::new(model_id, api_key.as_str(), api_url.as_deref(), api_timeout)?; + + Ok(Model::OpenAI(Box::new(model))) + } else if model_id.starts_with("voyage:") { + let model = voyage::VoyageModel::new_with_validation_mode( model_id, - options.api_key.unwrap_or_default().as_str(), - options.api_url.as_deref(), - options.api_timeout, + api_key.as_str(), + api_url.as_deref(), + api_timeout, + ModelValidationMode::Passthrough, )?; Ok(Model::Voyage(Box::new(model))) - } else if model_id.starts_with("jina/") { - let model = jina::JinaModel::new( + } else if model_id.starts_with("voyage/") { + let model = + voyage::VoyageModel::new(model_id, api_key.as_str(), api_url.as_deref(), api_timeout)?; + + Ok(Model::Voyage(Box::new(model))) + } else if model_id.starts_with("jina:") { + let model = jina::JinaModel::new_with_validation_mode( model_id, - options.api_key.unwrap_or_default().as_str(), - options.api_url.as_deref(), - options.api_timeout, + api_key.as_str(), + api_url.as_deref(), + api_timeout, + ModelValidationMode::Passthrough, )?; + Ok(Model::Jina(Box::new(model))) + } else if model_id.starts_with("jina/") { + let model = + jina::JinaModel::new(model_id, api_key.as_str(), api_url.as_deref(), api_timeout)?; + Ok(Model::Jina(Box::new(model))) } else { // Local models - auto-detect architecture from config @@ -135,7 +165,11 @@ pub fn create_model(options: ModelOptions) -> Result> { .unwrap_or(String::from(".cache/manticore")), ); - let hf_token = options.api_key.as_deref(); + let hf_token = if api_key.is_empty() { + None + } else { + Some(api_key.as_str()) + }; let model = local::LocalModel::new( model_id, cache_path, diff --git a/embeddings/src/model/openai.rs b/embeddings/src/model/openai.rs index b880c116..323592f2 100644 --- a/embeddings/src/model/openai.rs +++ b/embeddings/src/model/openai.rs @@ -1,6 +1,7 @@ -use super::TextModel; +use super::{ModelValidationMode, TextModel}; use crate::LibError; use reqwest::blocking::Client; +use std::sync::Mutex; #[derive(Debug)] pub struct OpenAIModel { @@ -8,6 +9,7 @@ pub struct OpenAIModel { pub model: String, pub api_key: String, pub api_url: Option, + hidden_size_cache: Mutex>, } pub fn validate_model(model: &str) -> Result<(), String> { @@ -40,8 +42,32 @@ impl OpenAIModel { api_url: Option<&str>, api_timeout: Option, ) -> Result> { - let model = model_id.trim_start_matches("openai/").to_string(); - validate_model(&model).map_err(|_| LibError::RemoteUnsupportedModel { status: None })?; + let validation_mode = if api_url.is_some() { + ModelValidationMode::Passthrough + } else { + ModelValidationMode::StrictBuiltInList + }; + + Self::new_with_validation_mode(model_id, api_key, api_url, api_timeout, validation_mode) + } + + pub fn new_with_validation_mode( + model_id: &str, + api_key: &str, + api_url: Option<&str>, + api_timeout: Option, + validation_mode: ModelValidationMode, + ) -> Result> { + let model = if let Some(model) = model_id.strip_prefix("openai:") { + model.to_string() + } else { + model_id.trim_start_matches("openai/").to_string() + }; + + if validation_mode == ModelValidationMode::StrictBuiltInList { + validate_model(&model) + .map_err(|_| LibError::RemoteUnsupportedModel { status: None })?; + } // Only validate basic requirements (non-empty, no whitespace) // Real validation happens via actual API request in validate_api_key() validate_api_key_basic(api_key) @@ -52,8 +78,18 @@ impl OpenAIModel { model, api_key: api_key.to_string(), api_url: api_url.map(|s| s.to_string()), + hidden_size_cache: Mutex::new(None), }) } + + fn known_hidden_size(&self) -> Option { + match self.model.as_str() { + "text-embedding-ada-002" => Some(1536), // Fixed: was 768, should be 1536 + "text-embedding-3-small" => Some(1536), + "text-embedding-3-large" => Some(3072), + _ => None, + } + } } impl TextModel for OpenAIModel { @@ -163,16 +199,16 @@ impl TextModel for OpenAIModel { })); } + *self.hidden_size_cache.lock().unwrap() = + embeddings.first().map(|embedding| embedding.len()); + Ok(embeddings) } fn get_hidden_size(&self) -> usize { - match self.model.as_str() { - "text-embedding-ada-002" => 1536, // Fixed: was 768, should be 1536 - "text-embedding-3-small" => 1536, - "text-embedding-3-large" => 3072, - _ => panic!("Unknown model"), - } + self.known_hidden_size() + .or_else(|| *self.hidden_size_cache.lock().unwrap()) + .unwrap_or_else(|| panic!("Unknown model")) } fn get_max_input_len(&self) -> usize { diff --git a/embeddings/src/model/voyage.rs b/embeddings/src/model/voyage.rs index 1374dd4e..353bd69f 100644 --- a/embeddings/src/model/voyage.rs +++ b/embeddings/src/model/voyage.rs @@ -1,6 +1,7 @@ -use super::TextModel; +use super::{ModelValidationMode, TextModel}; use crate::LibError; use reqwest::blocking::Client; +use std::sync::Mutex; #[derive(Debug)] pub struct VoyageModel { @@ -8,6 +9,7 @@ pub struct VoyageModel { pub model: String, pub api_key: String, pub api_url: Option, + hidden_size_cache: Mutex>, } pub fn validate_model(model: &str) -> Result<(), String> { @@ -41,8 +43,32 @@ impl VoyageModel { api_url: Option<&str>, api_timeout: Option, ) -> Result> { - let model = model_id.trim_start_matches("voyage/").to_string(); - validate_model(&model).map_err(|_| LibError::RemoteUnsupportedModel { status: None })?; + let validation_mode = if api_url.is_some() { + ModelValidationMode::Passthrough + } else { + ModelValidationMode::StrictBuiltInList + }; + + Self::new_with_validation_mode(model_id, api_key, api_url, api_timeout, validation_mode) + } + + pub fn new_with_validation_mode( + model_id: &str, + api_key: &str, + api_url: Option<&str>, + api_timeout: Option, + validation_mode: ModelValidationMode, + ) -> Result> { + let model = if let Some(model) = model_id.strip_prefix("voyage:") { + model.to_string() + } else { + model_id.trim_start_matches("voyage/").to_string() + }; + + if validation_mode == ModelValidationMode::StrictBuiltInList { + validate_model(&model) + .map_err(|_| LibError::RemoteUnsupportedModel { status: None })?; + } // Only validate basic requirements (non-empty, no whitespace) // Real validation happens via actual API request in validate_api_key() validate_api_key_basic(api_key) @@ -53,8 +79,22 @@ impl VoyageModel { model, api_key: api_key.to_string(), api_url: api_url.map(|s| s.to_string()), + hidden_size_cache: Mutex::new(None), }) } + + fn known_hidden_size(&self) -> Option { + match self.model.as_str() { + "voyage-3-large" => Some(1024), // Default 1024, supports 256, 512, 2048 + "voyage-3.5" => Some(1024), // Default 1024, supports 256, 512, 2048 + "voyage-3.5-lite" => Some(1024), // Default 1024, supports 256, 512, 2048 + "voyage-code-3" => Some(1024), // Default 1024, supports 256, 512, 2048 + "voyage-finance-2" => Some(1024), + "voyage-law-2" => Some(1024), + "voyage-code-2" => Some(1536), + _ => None, + } + } } impl TextModel for VoyageModel { @@ -175,15 +215,17 @@ impl TextModel for VoyageModel { })); } + let inferred_dim = embeddings[0].len(); + *self.hidden_size_cache.lock().unwrap() = Some(inferred_dim); + // Validate embedding dimensions and handle empty individual embeddings - let expected_dim = self.get_hidden_size(); for embedding in embeddings.iter() { if embedding.is_empty() { return Err(Box::new(LibError::RemoteHttpError { status: status_code, })); } - if embedding.len() != expected_dim { + if embedding.len() != inferred_dim { // Some models might return different dimensions, but we should validate // For now, we'll be lenient but could add stricter validation later } @@ -193,16 +235,9 @@ impl TextModel for VoyageModel { } fn get_hidden_size(&self) -> usize { - match self.model.as_str() { - "voyage-3-large" => 1024, // Default 1024, supports 256, 512, 2048 - "voyage-3.5" => 1024, // Default 1024, supports 256, 512, 2048 - "voyage-3.5-lite" => 1024, // Default 1024, supports 256, 512, 2048 - "voyage-code-3" => 1024, // Default 1024, supports 256, 512, 2048 - "voyage-finance-2" => 1024, - "voyage-law-2" => 1024, - "voyage-code-2" => 1536, - _ => panic!("Unknown model"), - } + self.known_hidden_size() + .or_else(|| *self.hidden_size_cache.lock().unwrap()) + .unwrap_or_else(|| panic!("Unknown model")) } fn get_max_input_len(&self) -> usize { diff --git a/embeddings/src/utils.rs b/embeddings/src/utils.rs index e2423f88..294baa28 100644 --- a/embeddings/src/utils.rs +++ b/embeddings/src/utils.rs @@ -5,6 +5,15 @@ use serde_json::Value; /// Most tokenizers average 3–5 bytes/token; 8 covers worst-case (CJK, emoji). const BYTES_PER_TOKEN_UPPER_BOUND: usize = 8; +#[inline] +fn floor_char_boundary(text: &str, index: usize) -> usize { + let mut i = index.min(text.len()); + while i > 0 && !text.is_char_boundary(i) { + i -= 1; + } + i +} + /// Pre-truncate text to avoid running BPE on excessively long input. /// Cuts at a valid UTF-8 char boundary with a safe byte margin. /// `truncate_tokens` remains the final guarantee on token count. @@ -14,7 +23,7 @@ pub fn pre_truncate_text(text: &str, max_seq_len: usize) -> &str { if text.len() <= byte_limit { text } else { - &text[..text.floor_char_boundary(byte_limit)] + &text[..floor_char_boundary(text, byte_limit)] } } From 1c0a3b6744dc77a412766d95ad335648af8fb9d6 Mon Sep 17 00:00:00 2001 From: Don Hardman Date: Sun, 24 May 2026 15:45:36 +0300 Subject: [PATCH 2/6] refactor(embeddings): use native floor_char_boundary --- embeddings/src/utils.rs | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/embeddings/src/utils.rs b/embeddings/src/utils.rs index 294baa28..e2423f88 100644 --- a/embeddings/src/utils.rs +++ b/embeddings/src/utils.rs @@ -5,15 +5,6 @@ use serde_json::Value; /// Most tokenizers average 3–5 bytes/token; 8 covers worst-case (CJK, emoji). const BYTES_PER_TOKEN_UPPER_BOUND: usize = 8; -#[inline] -fn floor_char_boundary(text: &str, index: usize) -> usize { - let mut i = index.min(text.len()); - while i > 0 && !text.is_char_boundary(i) { - i -= 1; - } - i -} - /// Pre-truncate text to avoid running BPE on excessively long input. /// Cuts at a valid UTF-8 char boundary with a safe byte margin. /// `truncate_tokens` remains the final guarantee on token count. @@ -23,7 +14,7 @@ pub fn pre_truncate_text(text: &str, max_seq_len: usize) -> &str { if text.len() <= byte_limit { text } else { - &text[..floor_char_boundary(text, byte_limit)] + &text[..text.floor_char_boundary(byte_limit)] } } From 1f689aed12e1799caf607c8d867e471396f84be4 Mon Sep 17 00:00:00 2001 From: Don Hardman Date: Sun, 24 May 2026 16:06:50 +0300 Subject: [PATCH 3/6] refactor(embeddings): optimize hidden size initialization - Replace Mutex with OnceLock for thread-safe hidden size caching - Probe remote models during construction to ensure dimensions are known - Add catch_unwind to get_hidden_size FFI wrapper to prevent crashes - Remove runtime panics and lock overhead in favor of early validation --- embeddings/src/model/jina.rs | 30 +++++++++++++++------ embeddings/src/model/openai.rs | 31 +++++++++++++++------- embeddings/src/model/text_model_wrapper.rs | 14 ++++++---- embeddings/src/model/voyage.rs | 28 +++++++++++++------ 4 files changed, 73 insertions(+), 30 deletions(-) diff --git a/embeddings/src/model/jina.rs b/embeddings/src/model/jina.rs index 508fa9ac..92aad481 100644 --- a/embeddings/src/model/jina.rs +++ b/embeddings/src/model/jina.rs @@ -1,7 +1,7 @@ use super::{ModelValidationMode, TextModel}; use crate::LibError; use reqwest::blocking::Client; -use std::sync::Mutex; +use std::sync::OnceLock; #[derive(Debug)] pub struct JinaModel { @@ -9,7 +9,7 @@ pub struct JinaModel { pub model: String, pub api_key: String, pub api_url: Option, - hidden_size_cache: Mutex>, + hidden_size_cache: OnceLock, } pub fn validate_model(model: &str) -> Result<(), String> { @@ -83,13 +83,22 @@ impl JinaModel { validate_api_key_basic(api_key) .map_err(|_| LibError::RemoteInvalidAPIKey { status: None })?; let timeout_duration = api_timeout.map(std::time::Duration::from_secs); - Ok(Self { + let model = Self { client: Client::builder().timeout(timeout_duration).build()?, model, api_key: api_key.to_string(), api_url: api_url.map(|s| s.to_string()), - hidden_size_cache: Mutex::new(None), - }) + hidden_size_cache: OnceLock::new(), + }; + // Enforce the invariant: by the time the model is handed back, the + // hidden size is known. Built-in models have it hardcoded; passthrough + // models need one API round-trip to learn it. predict() populates the + // OnceLock on success. If probing fails the caller never gets a + // partially initialized model. + if model.known_hidden_size().is_none() { + model.predict(&["probe"])?; + } + Ok(model) } fn known_hidden_size(&self) -> Option { @@ -299,7 +308,7 @@ impl TextModel for JinaModel { } let inferred_dim = embeddings[0].len(); - *self.hidden_size_cache.lock().unwrap() = Some(inferred_dim); + let _ = self.hidden_size_cache.set(inferred_dim); // Validate embedding dimensions and handle empty individual embeddings for embedding in embeddings.iter() { @@ -318,9 +327,14 @@ impl TextModel for JinaModel { } fn get_hidden_size(&self) -> usize { + // Invariant: cache is populated by new_with_validation_mode() — either + // implicitly via known_hidden_size() for built-ins or explicitly via a + // probe predict() for passthrough models. A miss here is a construction + // bug; the catch_unwind at the FFI boundary stops the panic from + // crossing into C++. self.known_hidden_size() - .or_else(|| *self.hidden_size_cache.lock().unwrap()) - .unwrap_or_else(|| panic!("Unknown model")) + .or_else(|| self.hidden_size_cache.get().copied()) + .expect("hidden size must be populated during model construction") } fn get_max_input_len(&self) -> usize { diff --git a/embeddings/src/model/openai.rs b/embeddings/src/model/openai.rs index 323592f2..3efe1e3b 100644 --- a/embeddings/src/model/openai.rs +++ b/embeddings/src/model/openai.rs @@ -1,7 +1,7 @@ use super::{ModelValidationMode, TextModel}; use crate::LibError; use reqwest::blocking::Client; -use std::sync::Mutex; +use std::sync::OnceLock; #[derive(Debug)] pub struct OpenAIModel { @@ -9,7 +9,7 @@ pub struct OpenAIModel { pub model: String, pub api_key: String, pub api_url: Option, - hidden_size_cache: Mutex>, + hidden_size_cache: OnceLock, } pub fn validate_model(model: &str) -> Result<(), String> { @@ -73,13 +73,22 @@ impl OpenAIModel { validate_api_key_basic(api_key) .map_err(|_| LibError::RemoteInvalidAPIKey { status: None })?; let timeout_duration = api_timeout.map(std::time::Duration::from_secs); - Ok(Self { + let model = Self { client: Client::builder().timeout(timeout_duration).build()?, model, api_key: api_key.to_string(), api_url: api_url.map(|s| s.to_string()), - hidden_size_cache: Mutex::new(None), - }) + hidden_size_cache: OnceLock::new(), + }; + // Enforce the invariant: by the time the model is handed back, the + // hidden size is known. Built-in models have it hardcoded; passthrough + // models need one API round-trip to learn it. predict() populates the + // OnceLock on success. If probing fails the caller never gets a + // partially initialized model. + if model.known_hidden_size().is_none() { + model.predict(&["probe"])?; + } + Ok(model) } fn known_hidden_size(&self) -> Option { @@ -199,16 +208,20 @@ impl TextModel for OpenAIModel { })); } - *self.hidden_size_cache.lock().unwrap() = - embeddings.first().map(|embedding| embedding.len()); + if let Some(dim) = embeddings.first().map(|e| e.len()) { + let _ = self.hidden_size_cache.set(dim); + } Ok(embeddings) } fn get_hidden_size(&self) -> usize { + // Invariant: cache populated during new_with_validation_mode(). + // A miss here is a construction bug; catch_unwind at the FFI + // boundary stops the panic from crossing into C++. self.known_hidden_size() - .or_else(|| *self.hidden_size_cache.lock().unwrap()) - .unwrap_or_else(|| panic!("Unknown model")) + .or_else(|| self.hidden_size_cache.get().copied()) + .expect("hidden size must be populated during model construction") } fn get_max_input_len(&self) -> usize { diff --git a/embeddings/src/model/text_model_wrapper.rs b/embeddings/src/model/text_model_wrapper.rs index 9dd799c4..75252e10 100644 --- a/embeddings/src/model/text_model_wrapper.rs +++ b/embeddings/src/model/text_model_wrapper.rs @@ -318,11 +318,15 @@ impl TextModelWrapper { } pub extern "C" fn get_hidden_size(&self) -> usize { - // No error channel here; return 0 on a bad handle so the C++ caller - // sees an obviously-wrong dimension instead of UB. The handle is - // already validated before any real work, so a 0 here means the C++ - // side handed us an invalid pointer. - self.as_model().map(|m| m.get_hidden_size()).unwrap_or(0) + // No error channel here; return 0 on a bad handle or unwind so the + // C++ caller sees an obviously-wrong dimension instead of UB. The + // remote model impls already return 0 instead of panicking when the + // dim is unknown; catch_unwind is a defense-in-depth guard so any + // future panic on this path can never unwind across FFI. + catch_unwind(AssertUnwindSafe(|| { + self.as_model().map(|m| m.get_hidden_size()).unwrap_or(0) + })) + .unwrap_or(0) } pub extern "C" fn get_max_input_len(&self) -> usize { diff --git a/embeddings/src/model/voyage.rs b/embeddings/src/model/voyage.rs index 353bd69f..3ed5fc03 100644 --- a/embeddings/src/model/voyage.rs +++ b/embeddings/src/model/voyage.rs @@ -1,7 +1,7 @@ use super::{ModelValidationMode, TextModel}; use crate::LibError; use reqwest::blocking::Client; -use std::sync::Mutex; +use std::sync::OnceLock; #[derive(Debug)] pub struct VoyageModel { @@ -9,7 +9,7 @@ pub struct VoyageModel { pub model: String, pub api_key: String, pub api_url: Option, - hidden_size_cache: Mutex>, + hidden_size_cache: OnceLock, } pub fn validate_model(model: &str) -> Result<(), String> { @@ -74,13 +74,22 @@ impl VoyageModel { validate_api_key_basic(api_key) .map_err(|_| LibError::RemoteInvalidAPIKey { status: None })?; let timeout_duration = api_timeout.map(std::time::Duration::from_secs); - Ok(Self { + let model = Self { client: Client::builder().timeout(timeout_duration).build()?, model, api_key: api_key.to_string(), api_url: api_url.map(|s| s.to_string()), - hidden_size_cache: Mutex::new(None), - }) + hidden_size_cache: OnceLock::new(), + }; + // Enforce the invariant: by the time the model is handed back, the + // hidden size is known. Built-in models have it hardcoded; passthrough + // models need one API round-trip to learn it. predict() populates the + // OnceLock on success. If probing fails the caller never gets a + // partially initialized model. + if model.known_hidden_size().is_none() { + model.predict(&["probe"])?; + } + Ok(model) } fn known_hidden_size(&self) -> Option { @@ -216,7 +225,7 @@ impl TextModel for VoyageModel { } let inferred_dim = embeddings[0].len(); - *self.hidden_size_cache.lock().unwrap() = Some(inferred_dim); + let _ = self.hidden_size_cache.set(inferred_dim); // Validate embedding dimensions and handle empty individual embeddings for embedding in embeddings.iter() { @@ -235,9 +244,12 @@ impl TextModel for VoyageModel { } fn get_hidden_size(&self) -> usize { + // Invariant: cache populated during new_with_validation_mode(). + // A miss here is a construction bug; catch_unwind at the FFI + // boundary stops the panic from crossing into C++. self.known_hidden_size() - .or_else(|| *self.hidden_size_cache.lock().unwrap()) - .unwrap_or_else(|| panic!("Unknown model")) + .or_else(|| self.hidden_size_cache.get().copied()) + .expect("hidden size must be populated during model construction") } fn get_max_input_len(&self) -> usize { From 69d031b57f79567afa9086565321cf75608d4ae9 Mon Sep 17 00:00:00 2001 From: Don Hardman Date: Sun, 24 May 2026 16:25:24 +0300 Subject: [PATCH 4/6] refactor(embeddings): remove eager hidden size probing - Remove automatic probe request during model initialization - Update Jina, OpenAI, and Voyage model constructors - Adjust corresponding test expectations for hidden size panics --- embeddings/src/model/jina.rs | 13 ++----------- embeddings/src/model/jina_test.rs | 2 +- embeddings/src/model/openai.rs | 13 ++----------- embeddings/src/model/openai_test.rs | 2 +- embeddings/src/model/voyage.rs | 13 ++----------- embeddings/src/model/voyage_test.rs | 2 +- 6 files changed, 9 insertions(+), 36 deletions(-) diff --git a/embeddings/src/model/jina.rs b/embeddings/src/model/jina.rs index 92aad481..d09067d5 100644 --- a/embeddings/src/model/jina.rs +++ b/embeddings/src/model/jina.rs @@ -83,22 +83,13 @@ impl JinaModel { validate_api_key_basic(api_key) .map_err(|_| LibError::RemoteInvalidAPIKey { status: None })?; let timeout_duration = api_timeout.map(std::time::Duration::from_secs); - let model = Self { + Ok(Self { client: Client::builder().timeout(timeout_duration).build()?, model, api_key: api_key.to_string(), api_url: api_url.map(|s| s.to_string()), hidden_size_cache: OnceLock::new(), - }; - // Enforce the invariant: by the time the model is handed back, the - // hidden size is known. Built-in models have it hardcoded; passthrough - // models need one API round-trip to learn it. predict() populates the - // OnceLock on success. If probing fails the caller never gets a - // partially initialized model. - if model.known_hidden_size().is_none() { - model.predict(&["probe"])?; - } - Ok(model) + }) } fn known_hidden_size(&self) -> Option { diff --git a/embeddings/src/model/jina_test.rs b/embeddings/src/model/jina_test.rs index 3c6a2896..0efb6682 100644 --- a/embeddings/src/model/jina_test.rs +++ b/embeddings/src/model/jina_test.rs @@ -88,7 +88,7 @@ mod tests { } } - #[should_panic(expected = "Unknown model")] + #[should_panic(expected = "hidden size must be populated during model construction")] #[test] fn test_get_hidden_size_unknown_model() { // This test verifies the panic behavior for unknown models diff --git a/embeddings/src/model/openai.rs b/embeddings/src/model/openai.rs index 3efe1e3b..3a7922f3 100644 --- a/embeddings/src/model/openai.rs +++ b/embeddings/src/model/openai.rs @@ -73,22 +73,13 @@ impl OpenAIModel { validate_api_key_basic(api_key) .map_err(|_| LibError::RemoteInvalidAPIKey { status: None })?; let timeout_duration = api_timeout.map(std::time::Duration::from_secs); - let model = Self { + Ok(Self { client: Client::builder().timeout(timeout_duration).build()?, model, api_key: api_key.to_string(), api_url: api_url.map(|s| s.to_string()), hidden_size_cache: OnceLock::new(), - }; - // Enforce the invariant: by the time the model is handed back, the - // hidden size is known. Built-in models have it hardcoded; passthrough - // models need one API round-trip to learn it. predict() populates the - // OnceLock on success. If probing fails the caller never gets a - // partially initialized model. - if model.known_hidden_size().is_none() { - model.predict(&["probe"])?; - } - Ok(model) + }) } fn known_hidden_size(&self) -> Option { diff --git a/embeddings/src/model/openai_test.rs b/embeddings/src/model/openai_test.rs index 8a24f8e0..56e2ed10 100644 --- a/embeddings/src/model/openai_test.rs +++ b/embeddings/src/model/openai_test.rs @@ -158,7 +158,7 @@ mod tests { } #[test] - #[should_panic(expected = "Unknown model")] + #[should_panic(expected = "hidden size must be populated during model construction")] fn test_get_hidden_size_unknown_model() { // This test verifies the panic behavior for unknown models // In practice, this shouldn't happen due to validation in new() diff --git a/embeddings/src/model/voyage.rs b/embeddings/src/model/voyage.rs index 3ed5fc03..b56fd57e 100644 --- a/embeddings/src/model/voyage.rs +++ b/embeddings/src/model/voyage.rs @@ -74,22 +74,13 @@ impl VoyageModel { validate_api_key_basic(api_key) .map_err(|_| LibError::RemoteInvalidAPIKey { status: None })?; let timeout_duration = api_timeout.map(std::time::Duration::from_secs); - let model = Self { + Ok(Self { client: Client::builder().timeout(timeout_duration).build()?, model, api_key: api_key.to_string(), api_url: api_url.map(|s| s.to_string()), hidden_size_cache: OnceLock::new(), - }; - // Enforce the invariant: by the time the model is handed back, the - // hidden size is known. Built-in models have it hardcoded; passthrough - // models need one API round-trip to learn it. predict() populates the - // OnceLock on success. If probing fails the caller never gets a - // partially initialized model. - if model.known_hidden_size().is_none() { - model.predict(&["probe"])?; - } - Ok(model) + }) } fn known_hidden_size(&self) -> Option { diff --git a/embeddings/src/model/voyage_test.rs b/embeddings/src/model/voyage_test.rs index 1a9a59a2..cae5668b 100644 --- a/embeddings/src/model/voyage_test.rs +++ b/embeddings/src/model/voyage_test.rs @@ -177,7 +177,7 @@ mod tests { } #[test] - #[should_panic(expected = "Unknown model")] + #[should_panic(expected = "hidden size must be populated during model construction")] fn test_get_hidden_size_unknown_model() { // This test verifies the panic behavior for unknown models // In practice, this shouldn't happen due to validation in new() From b0bc59ebe449f19cae194d0f54ab09da9784bb88 Mon Sep 17 00:00:00 2001 From: Don Hardman Date: Sun, 24 May 2026 17:07:09 +0300 Subject: [PATCH 5/6] build(embeddings): simplify BLAS feature selection --- cmake/build_embeddings.cmake | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/cmake/build_embeddings.cmake b/cmake/build_embeddings.cmake index 5010b098..dd035035 100644 --- a/cmake/build_embeddings.cmake +++ b/cmake/build_embeddings.cmake @@ -50,27 +50,20 @@ function(build_embeddings_lib) set(ENV{GIT_COMMIT_ID} "${GIT_COMMIT_ID}") set(ENV{GIT_TIMESTAMP_ID} "${GIT_TIMESTAMP_ID}") - # EMBEDDINGS_CARGO_FEATURES may be set externally (e.g., parent CMake) to inject - # extra cargo features. If unset, default to platform-specific BLAS acceleration - # for candle: accelerate on macOS, mkl on Linux when available. - if (NOT DEFINED EMBEDDINGS_CARGO_FEATURES OR "${EMBEDDINGS_CARGO_FEATURES}" STREQUAL "") - if (APPLE) - set(EMBEDDINGS_CARGO_FEATURES "accelerate") - elseif (UNIX) - execute_process(COMMAND pkg-config --exists mkl-dynamic-lp64-seq RESULT_VARIABLE MKL_FOUND OUTPUT_QUIET ERROR_QUIET) - if (MKL_FOUND EQUAL 0) - set(EMBEDDINGS_CARGO_FEATURES "mkl") - endif () - endif () - endif () - - set(EMBEDDINGS_CARGO_FEATURE_ARGS "") - if (NOT "${EMBEDDINGS_CARGO_FEATURES}" STREQUAL "") - set(EMBEDDINGS_CARGO_FEATURE_ARGS --features ${EMBEDDINGS_CARGO_FEATURES}) - endif () + # Enable platform-specific BLAS acceleration for candle when available + set(EMBEDDINGS_CARGO_FEATURES "") + if(APPLE) + set(EMBEDDINGS_CARGO_FEATURES "--features" "accelerate") + elseif(UNIX) + # MKL provides multi-threaded BLAS on Linux; skip if not available + execute_process(COMMAND pkg-config --exists mkl-dynamic-lp64-seq RESULT_VARIABLE MKL_FOUND OUTPUT_QUIET ERROR_QUIET) + if(MKL_FOUND EQUAL 0) + set(EMBEDDINGS_CARGO_FEATURES "--features" "mkl") + endif() + endif() execute_process ( - COMMAND cargo build --manifest-path ${CMAKE_SOURCE_DIR}/embeddings/Cargo.toml --lib --release ${EMBEDDINGS_CARGO_FEATURE_ARGS} --target-dir ${CMAKE_CURRENT_BINARY_DIR}/embeddings + COMMAND cargo build --manifest-path ${CMAKE_SOURCE_DIR}/embeddings/Cargo.toml --lib --release ${EMBEDDINGS_CARGO_FEATURES} --target-dir ${CMAKE_CURRENT_BINARY_DIR}/embeddings RESULT_VARIABLE CMD_RESULT ) From 83f1f61e2d2555248e057f21d052c57f4f481804 Mon Sep 17 00:00:00 2001 From: Don Hardman Date: Sun, 24 May 2026 23:38:07 +0300 Subject: [PATCH 6/6] fix(embeddings): ensure tensor contiguity for FFI safety --- embeddings/src/model/local.rs | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/embeddings/src/model/local.rs b/embeddings/src/model/local.rs index 7631d8d2..b63710d2 100644 --- a/embeddings/src/model/local.rs +++ b/embeddings/src/model/local.rs @@ -482,7 +482,16 @@ impl BertEmbeddingModel { let summed = emb.sum(1)?.to_dtype(DType::F32)?; let divisor = Tensor::new(seq_len as f32, &self.device)?; let mean_emb = summed.broadcast_div(&divisor)?; - mean_emb.get(0)?.to_vec1::()? + // .contiguous() forces candle's to_vec1 to take its + // contiguous-offsets path (slice::to_vec, cap == len). + // The strided path uses Iterator::collect, which can + // produce Vec with cap > len from FromIterator growth + // doubling — that would mean the (ptr, len, cap) we + // hand across FFI doesn't match the canonical layout + // glibc expects when Vec::from_raw_parts drops on the + // C++ side via free_vec_result. Eliminate the path + // dependency entirely. + mean_emb.get(0)?.contiguous()?.to_vec1::()? }; normalize(&mut emb_vec); all_embeddings.push(emb_vec); @@ -527,7 +536,10 @@ impl BertEmbeddingModel { let mut out = Vec::with_capacity(batch_size); for i in 0..batch_size { - out.push(mean_emb.get(i)?.to_vec1::()?); + // See contiguous() rationale on the batch-of-1 fast path + // above — same FFI cap/len invariant requirement applies + // to each row pulled out of the batched mean_emb. + out.push(mean_emb.get(i)?.contiguous()?.to_vec1::()?); } out }; @@ -1236,7 +1248,10 @@ impl TextModel for LocalModel { let summed = emb.sum(1)?.to_dtype(DType::F32)?; let divisor = Tensor::new(seq_len as f32, &m.device)?; let mean_emb = summed.broadcast_div(&divisor)?; - mean_emb.get(0)?.to_vec1::()? + // See contiguous() rationale on + // BertEmbeddingModel::predict_chunks above. Same FFI + // canonical-layout invariant required here. + mean_emb.get(0)?.contiguous()?.to_vec1::()? }; normalize(&mut emb_vec); return Ok(vec![emb_vec]); @@ -1361,7 +1376,12 @@ impl TextModel for LocalModel { }; if let Ok(e_j) = embeddings.get(0) { + // See contiguous() rationale on BertEmbeddingModel above. + // Same FFI canonical-layout invariant for T5 / Causal / + // Quantized sequential output. let emb_vec: Vec = e_j + .contiguous() + .map_err(|e| -> Box { Box::new(e) })? .to_vec1::() .map_err(|e| -> Box { Box::new(e) })?; let mut emb = emb_vec;