@@ -81,20 +81,16 @@ std::string inferModelTypeFromTensorKeys(const std::vector<std::string>& tensor_
8181 return " vae" ; // Default to VAE if we can't determine
8282 }
8383
84- // Check for LLM model indicators
85- for (const std::string& name : tensor_keys) {
86- if (name.find (" blk.35.attn_k.weight" ) != std::string::npos ||
87- name.find (" model.layers.35.post_attention_layernorm.weight" ) != std::string::npos) {
88- LOG_DEBUG (" Detected LLM model" );
89- return " llm" ;
90- }
91- }
92-
9384 bool has_text_model = false ;
9485 bool has_text_projection = false ;
9586 bool has_position_ids = false ;
9687 bool has_self_attention = false ;
9788 bool has_dense_relu_dense = false ;
89+ bool has_llm_token_embedding = false ;
90+ bool has_llm_attention = false ;
91+ bool has_llm_mlp = false ;
92+ bool has_llm_output_norm = false ;
93+ bool has_llm_qk_norm = false ;
9894
9995 // Count transformer layers to distinguish CLIP-L (12 layers) from CLIP-G (32 layers)
10096 int max_layer_number = -1 ;
@@ -124,6 +120,44 @@ std::string inferModelTypeFromTensorKeys(const std::vector<std::string>& tensor_
124120 has_dense_relu_dense = true ;
125121 }
126122
123+ // LLM model indicators. Support both raw GGUF naming and converted safetensors naming.
124+ if (name_lower.find (" token_embd.weight" ) != std::string::npos ||
125+ name_lower.find (" embed_tokens.weight" ) != std::string::npos) {
126+ has_llm_token_embedding = true ;
127+ }
128+ if ((name_lower.find (" blk." ) != std::string::npos &&
129+ (name_lower.find (" attn_q.weight" ) != std::string::npos ||
130+ name_lower.find (" attn_k.weight" ) != std::string::npos ||
131+ name_lower.find (" attn_v.weight" ) != std::string::npos ||
132+ name_lower.find (" attn_output.weight" ) != std::string::npos)) ||
133+ (name_lower.find (" model.layers." ) != std::string::npos &&
134+ (name_lower.find (" self_attn.q_proj.weight" ) != std::string::npos ||
135+ name_lower.find (" self_attn.k_proj.weight" ) != std::string::npos ||
136+ name_lower.find (" self_attn.v_proj.weight" ) != std::string::npos ||
137+ name_lower.find (" self_attn.o_proj.weight" ) != std::string::npos))) {
138+ has_llm_attention = true ;
139+ }
140+ if ((name_lower.find (" blk." ) != std::string::npos &&
141+ (name_lower.find (" ffn_gate.weight" ) != std::string::npos ||
142+ name_lower.find (" ffn_up.weight" ) != std::string::npos ||
143+ name_lower.find (" ffn_down.weight" ) != std::string::npos)) ||
144+ (name_lower.find (" model.layers." ) != std::string::npos &&
145+ (name_lower.find (" mlp.gate_proj.weight" ) != std::string::npos ||
146+ name_lower.find (" mlp.up_proj.weight" ) != std::string::npos ||
147+ name_lower.find (" mlp.down_proj.weight" ) != std::string::npos))) {
148+ has_llm_mlp = true ;
149+ }
150+ if (name_lower.find (" output_norm.weight" ) != std::string::npos ||
151+ name_lower.find (" model.norm.weight" ) != std::string::npos) {
152+ has_llm_output_norm = true ;
153+ }
154+ if (name_lower.find (" attn_q_norm.weight" ) != std::string::npos ||
155+ name_lower.find (" attn_k_norm.weight" ) != std::string::npos ||
156+ name_lower.find (" self_attn.q_norm.weight" ) != std::string::npos ||
157+ name_lower.find (" self_attn.k_norm.weight" ) != std::string::npos) {
158+ has_llm_qk_norm = true ;
159+ }
160+
127161 // Extract layer numbers from tensor names
128162 // Look for patterns like "layers.11", "layer.31", "blocks.5", etc.
129163 if (name_lower.find (" layer" ) != std::string::npos || name_lower.find (" block" ) != std::string::npos) {
@@ -152,6 +186,15 @@ std::string inferModelTypeFromTensorKeys(const std::vector<std::string>& tensor_
152186 return " t5xxl" ;
153187 }
154188
189+ // Qwen3 and similar LLMs expose a transformer block structure with token embeddings,
190+ // attention projections, MLP projections, and a final output norm.
191+ if ((has_llm_token_embedding && has_llm_attention && has_llm_mlp) ||
192+ (has_llm_attention && has_llm_mlp && has_llm_output_norm) ||
193+ (has_llm_attention && has_llm_mlp && has_llm_qk_norm)) {
194+ LOG_DEBUG (" Detected LLM model" );
195+ return " llm" ;
196+ }
197+
155198 // If it's a CLIP model (has text model indicators)
156199 if (has_text_model || has_text_projection || has_position_ids) {
157200 // Distinguish between CLIP-L and CLIP-G based on layer count
0 commit comments