|
| 1 | +//! Auto-detect model architecture from config metadata. |
| 2 | +//! |
| 3 | +//! EmbedAnything reads config.json and routes to the right model. |
| 4 | +//! We read GGUF metadata or config.json and route to the right |
| 5 | +//! codebook builder + tokenizer + lens configuration. |
| 6 | +//! |
| 7 | +//! No hardcoded paths per model. One entry point, automatic routing. |
| 8 | +
|
| 9 | +use std::collections::HashMap; |
| 10 | + |
/// Model architecture families this module can recognise.
///
/// `Eq` and `Hash` are derived alongside `PartialEq` so that a value can be
/// used directly as a `HashMap`/`HashSet` key (for example in a per-family
/// dispatch table).
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum Architecture {
    /// XLM-RoBERTa (Jina v3, BGE-M3). SentencePiece tokenizer, 250K vocab.
    XlmRoberta,
    /// Qwen2/Qwen3 (Reranker, Qwopus, Jina v5). Qwen BPE, 151K vocab.
    Qwen,
    /// ModernBERT (OLMo tokenizer, code-friendly). 50K vocab.
    ModernBert,
    /// BERT base (sentence-transformers, MiniLM). WordPiece, 30K vocab.
    Bert,
    /// Architecture that matched none of the known families; carries the
    /// raw identifier string that was inspected.
    Unknown(String),
}
| 25 | + |
| 26 | +/// Detected model configuration. |
| 27 | +#[derive(Clone, Debug)] |
| 28 | +pub struct DetectedModel { |
| 29 | + pub architecture: Architecture, |
| 30 | + pub name: String, |
| 31 | + pub vocab_size: usize, |
| 32 | + pub hidden_dim: usize, |
| 33 | + pub num_layers: usize, |
| 34 | + pub num_heads: usize, |
| 35 | + /// Recommended lens for this model. |
| 36 | + pub recommended_lens: Option<super::builder::Lens>, |
| 37 | + /// Per-role gate policy. |
| 38 | + pub gate_modulated: bool, |
| 39 | + /// Whether this is an MoE model. |
| 40 | + pub is_moe: bool, |
| 41 | + pub num_experts: Option<usize>, |
| 42 | +} |
| 43 | + |
| 44 | +/// Detect architecture from a config.json content string. |
| 45 | +pub fn detect_from_config_json(json_str: &str) -> Result<DetectedModel, String> { |
| 46 | + let parsed: serde_json::Value = serde_json::from_str(json_str) |
| 47 | + .map_err(|e| format!("invalid config.json: {}", e))?; |
| 48 | + |
| 49 | + // Architecture detection: check "architectures" array or "model_type" |
| 50 | + let arch_str = parsed.get("architectures") |
| 51 | + .and_then(|a| a.as_array()) |
| 52 | + .and_then(|a| a.first()) |
| 53 | + .and_then(|v| v.as_str()) |
| 54 | + .or_else(|| parsed.get("model_type").and_then(|v| v.as_str())) |
| 55 | + .unwrap_or("unknown"); |
| 56 | + |
| 57 | + let architecture = match arch_str { |
| 58 | + s if s.contains("XLMRoberta") || s.contains("xlm-roberta") => Architecture::XlmRoberta, |
| 59 | + s if s.contains("Qwen") || s.contains("qwen") => Architecture::Qwen, |
| 60 | + s if s.contains("ModernBert") || s.contains("modernbert") => Architecture::ModernBert, |
| 61 | + s if s.contains("Bert") || s.contains("bert") => Architecture::Bert, |
| 62 | + s if s.contains("JinaBert") => Architecture::XlmRoberta, // Jina BERT = modified XLM-RoBERTa |
| 63 | + other => Architecture::Unknown(other.into()), |
| 64 | + }; |
| 65 | + |
| 66 | + let vocab_size = parsed.get("vocab_size") |
| 67 | + .and_then(|v| v.as_u64()) |
| 68 | + .unwrap_or(0) as usize; |
| 69 | + let hidden_dim = parsed.get("hidden_size") |
| 70 | + .and_then(|v| v.as_u64()) |
| 71 | + .unwrap_or(0) as usize; |
| 72 | + let num_layers = parsed.get("num_hidden_layers") |
| 73 | + .and_then(|v| v.as_u64()) |
| 74 | + .unwrap_or(0) as usize; |
| 75 | + let num_heads = parsed.get("num_attention_heads") |
| 76 | + .and_then(|v| v.as_u64()) |
| 77 | + .unwrap_or(0) as usize; |
| 78 | + let num_experts = parsed.get("num_experts") |
| 79 | + .and_then(|v| v.as_u64()) |
| 80 | + .map(|v| v as usize); |
| 81 | + |
| 82 | + let recommended_lens = match &architecture { |
| 83 | + Architecture::XlmRoberta if vocab_size >= 250_000 => { |
| 84 | + Some(super::builder::Lens::Jina) // or BgeM3, both XLM-RoBERTa |
| 85 | + } |
| 86 | + Architecture::Qwen if vocab_size >= 150_000 => { |
| 87 | + Some(super::builder::Lens::Reranker) // Qwen tokenizer = reranker |
| 88 | + } |
| 89 | + _ => None, |
| 90 | + }; |
| 91 | + |
| 92 | + let name = parsed.get("_name_or_path") |
| 93 | + .or_else(|| parsed.get("model_type")) |
| 94 | + .and_then(|v| v.as_str()) |
| 95 | + .unwrap_or("unknown") |
| 96 | + .to_string(); |
| 97 | + |
| 98 | + Ok(DetectedModel { |
| 99 | + architecture, |
| 100 | + name, |
| 101 | + vocab_size, |
| 102 | + hidden_dim, |
| 103 | + num_layers, |
| 104 | + num_heads, |
| 105 | + recommended_lens, |
| 106 | + gate_modulated: num_experts.is_some() || num_layers > 24, |
| 107 | + is_moe: num_experts.is_some(), |
| 108 | + num_experts, |
| 109 | + }) |
| 110 | +} |
| 111 | + |
| 112 | +/// Detect architecture from GGUF metadata key-value pairs. |
| 113 | +/// GGUF stores metadata as typed KV pairs in the header. |
| 114 | +pub fn detect_from_gguf_metadata(metadata: &HashMap<String, String>) -> DetectedModel { |
| 115 | + let arch = metadata.get("general.architecture") |
| 116 | + .or_else(|| metadata.get("general.name")) |
| 117 | + .cloned() |
| 118 | + .unwrap_or_default(); |
| 119 | + |
| 120 | + let architecture = if arch.contains("bert") || arch.contains("roberta") { |
| 121 | + Architecture::XlmRoberta |
| 122 | + } else if arch.contains("qwen") { |
| 123 | + Architecture::Qwen |
| 124 | + } else if arch.contains("modernbert") { |
| 125 | + Architecture::ModernBert |
| 126 | + } else { |
| 127 | + Architecture::Unknown(arch.clone()) |
| 128 | + }; |
| 129 | + |
| 130 | + let vocab_size = metadata.get("tokenizer.ggml.tokens") |
| 131 | + .and_then(|v| v.parse().ok()) |
| 132 | + .or_else(|| metadata.get("bert.token_count").and_then(|v| v.parse().ok())) |
| 133 | + .unwrap_or(0); |
| 134 | + |
| 135 | + let hidden_dim = metadata.get("bert.embedding_length") |
| 136 | + .or_else(|| metadata.get("qwen2.embedding_length")) |
| 137 | + .and_then(|v| v.parse().ok()) |
| 138 | + .unwrap_or(0); |
| 139 | + |
| 140 | + let num_layers = metadata.get("bert.block_count") |
| 141 | + .or_else(|| metadata.get("qwen2.block_count")) |
| 142 | + .and_then(|v| v.parse().ok()) |
| 143 | + .unwrap_or(0); |
| 144 | + |
| 145 | + let num_heads = metadata.get("bert.attention.head_count") |
| 146 | + .or_else(|| metadata.get("qwen2.attention.head_count")) |
| 147 | + .and_then(|v| v.parse().ok()) |
| 148 | + .unwrap_or(0); |
| 149 | + |
| 150 | + let num_experts = metadata.get("qwen2.expert_count") |
| 151 | + .and_then(|v| v.parse().ok()); |
| 152 | + |
| 153 | + let name = metadata.get("general.name").cloned().unwrap_or_default(); |
| 154 | + |
| 155 | + let recommended_lens = match &architecture { |
| 156 | + Architecture::XlmRoberta => Some(super::builder::Lens::Jina), |
| 157 | + Architecture::Qwen => Some(super::builder::Lens::Reranker), |
| 158 | + _ => None, |
| 159 | + }; |
| 160 | + |
| 161 | + DetectedModel { |
| 162 | + architecture, |
| 163 | + name, |
| 164 | + vocab_size, |
| 165 | + hidden_dim, |
| 166 | + num_layers, |
| 167 | + num_heads, |
| 168 | + recommended_lens, |
| 169 | + gate_modulated: num_experts.is_some(), |
| 170 | + is_moe: num_experts.is_some(), |
| 171 | + num_experts, |
| 172 | + } |
| 173 | +} |
| 174 | + |
#[cfg(test)]
mod tests {
    use super::*;

    /// An XLM-RoBERTa config with the 250K vocabulary is recognised and
    /// receives a lens recommendation.
    #[test]
    fn detect_jina_v3() {
        let raw = r#"{
            "architectures": ["XLMRobertaForMaskedLM"],
            "vocab_size": 250002,
            "hidden_size": 1024,
            "num_hidden_layers": 24,
            "num_attention_heads": 16,
            "_name_or_path": "jinaai/jina-embeddings-v3"
        }"#;
        let detected = detect_from_config_json(raw).unwrap();
        assert_eq!(detected.architecture, Architecture::XlmRoberta);
        assert_eq!(detected.vocab_size, 250_002);
        assert_eq!(detected.hidden_dim, 1024);
        assert!(detected.recommended_lens.is_some());
    }

    /// A Qwen3 causal-LM config maps to the Qwen family.
    #[test]
    fn detect_qwen3() {
        let raw = r#"{
            "architectures": ["Qwen3ForCausalLM"],
            "vocab_size": 151936,
            "hidden_size": 1024,
            "num_hidden_layers": 28,
            "num_attention_heads": 16,
            "model_type": "qwen3"
        }"#;
        let detected = detect_from_config_json(raw).unwrap();
        assert_eq!(detected.architecture, Architecture::Qwen);
        assert_eq!(detected.vocab_size, 151_936);
    }

    /// ModernBERT must not be mistaken for plain BERT.
    #[test]
    fn detect_modernbert() {
        let raw = r#"{
            "architectures": ["ModernBertModel"],
            "vocab_size": 50368,
            "hidden_size": 2624,
            "num_hidden_layers": 28,
            "num_attention_heads": 16
        }"#;
        let detected = detect_from_config_json(raw).unwrap();
        assert_eq!(detected.architecture, Architecture::ModernBert);
        assert_eq!(detected.vocab_size, 50_368);
        assert_eq!(detected.hidden_dim, 2624);
    }

    /// A declared expert count marks the model as MoE and gate-modulated.
    #[test]
    fn detect_moe() {
        let raw = r#"{
            "architectures": ["Qwen3ForCausalLM"],
            "vocab_size": 202048,
            "hidden_size": 5120,
            "num_hidden_layers": 48,
            "num_attention_heads": 40,
            "num_experts": 128
        }"#;
        let detected = detect_from_config_json(raw).unwrap();
        assert!(detected.is_moe);
        assert_eq!(detected.num_experts, Some(128));
        assert!(detected.gate_modulated);
    }

    /// GGUF metadata with a `bert` architecture is routed to XLM-RoBERTa.
    #[test]
    fn detect_from_gguf() {
        let mut header = HashMap::new();
        header.insert("general.architecture".into(), "bert".into());
        header.insert("general.name".into(), "jina-embeddings-v3".into());
        header.insert("bert.embedding_length".into(), "1024".into());
        header.insert("bert.block_count".into(), "24".into());

        let detected = detect_from_gguf_metadata(&header);
        assert_eq!(detected.architecture, Architecture::XlmRoberta);
        assert_eq!(detected.hidden_dim, 1024);
    }

    /// Unrecognised architectures are reported, not rejected.
    #[test]
    fn detect_unknown_graceful() {
        let raw = r#"{"architectures": ["SomeNewModel"], "vocab_size": 100000}"#;
        let detected = detect_from_config_json(raw).unwrap();
        assert!(matches!(detected.architecture, Architecture::Unknown(_)));
    }
}
0 commit comments