
Commit c1f8458

hmellor and Copilot authored and committed
Fix configuration reading and error handling for kernels (#45610)
* Fix missing conversion of experts
* Fix eager config attribute reading
* Add proper error when kernels isn't installed
* remove unnecessary mapping
* review comments
* remove double newline

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Copilot <copilot@github.com>
1 parent 45add8a · commit c1f8458

Showing 2 changed files with 18 additions and 10 deletions.


src/transformers/conversion_mapping.py

Lines changed: 2 additions & 1 deletion
@@ -75,7 +75,6 @@
     "qwen2_5_vl": "qwen2_vl",
     "sam3_tracker_video": "sam3_tracker",
     "pp_chart2table": "llava",
-    "qwen3_5_moe_text": "qwen3_5_text",
     "altclip_vision_model": "clip_vision_model",
     "chinese_clip_vision_model": "clip_vision_model",
     "clipseg_vision_model": "clip_vision_model",
@@ -600,6 +599,8 @@ def _build_checkpoint_conversion_mapping():
         WeightRenaming(source_patterns=r"mlp\.expert_bias", target_patterns="mlp.e_score_correction_bias"),
         WeightRenaming(source_patterns=r"mlp\.shared_mlp\.", target_patterns="mlp.shared_experts."),
     ]
+    mapping["qwen3_5_moe_text"] = mapping["qwen3_5_text"].copy()
+    mapping["qwen3_5_moe_text"] += mapping["qwen2_moe"].copy()
 
     for model_type, base_pattern in _MODEL_TO_CONVERSION_PATTERN.items():
         if model_type in mapping:
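
The new lines replace the static alias removed above: pointing qwen3_5_moe_text at the dense qwen3_5_text mapping reused only the dense renamings, so the expert weights were never converted. Building the entry at runtime copies the dense rules and appends the qwen2_moe ones. A minimal sketch of this composition pattern, with plain tuples standing in for the real WeightRenaming objects:

# Sketch only: tuples stand in for the real WeightRenaming entries.
mapping = {
    "qwen3_5_text": [("model.embed", "embed_tokens")],
    "qwen2_moe": [(r"mlp\.shared_mlp\.", "mlp.shared_experts.")],
}

# Copy the dense-model rules, then extend with the MoE expert rules;
# copying first keeps both source lists unmutated by the +=.
mapping["qwen3_5_moe_text"] = mapping["qwen3_5_text"].copy()
mapping["qwen3_5_moe_text"] += mapping["qwen2_moe"].copy()

assert mapping["qwen3_5_moe_text"] == mapping["qwen3_5_text"] + mapping["qwen2_moe"]
assert len(mapping["qwen3_5_text"]) == 1  # the dense mapping is unchanged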

src/transformers/integrations/finegrained_fp8.py

Lines changed: 16 additions & 9 deletions
@@ -53,6 +53,13 @@
 _deepgemm_available = None
 
 
+def _first_attr(obj, *names):
+    for name in names:
+        if hasattr(obj, name):
+            return getattr(obj, name)
+    raise AttributeError(f"{type(obj).__name__} has none of: {names}")
+
+
 def _load_triton_kernel():
     """Lazily load the finegrained-fp8 Triton kernel and extract functions.
 
@@ -74,10 +81,10 @@ def _load_triton_kernel():
     _triton_available = False  # mark attempted before any early exit
 
     kernel = lazy_load_kernel("finegrained-fp8")
-    triton_fp8_matmul = getattr(kernel, "w8a8_fp8_matmul")
-    triton_fp8_act_quant = getattr(kernel, "fp8_act_quant")
-    triton_batched_fp8_matmul = getattr(kernel, "w8a8_fp8_matmul_batched")
-    triton_grouped_fp8_matmul = getattr(kernel, "w8a8_fp8_matmul_grouped")
+    triton_fp8_matmul = getattr(kernel, "w8a8_fp8_matmul", None)
+    triton_fp8_act_quant = getattr(kernel, "fp8_act_quant", None)
+    triton_batched_fp8_matmul = getattr(kernel, "w8a8_fp8_matmul_batched", None)
+    triton_grouped_fp8_matmul = getattr(kernel, "w8a8_fp8_matmul_grouped", None)
 
     missing = [
         name
@@ -137,8 +144,8 @@ def _load_deepgemm_kernel():
     )
 
     kernel = lazy_load_kernel("deep-gemm")
-    deepgemm_fp8_matmul = getattr(kernel, "fp8_gemm_nt")
-    deepgemm_grouped_fp8_matmul = getattr(kernel, "m_grouped_fp8_gemm_nt_contiguous")
+    deepgemm_fp8_matmul = getattr(kernel, "fp8_gemm_nt", None)
+    deepgemm_grouped_fp8_matmul = getattr(kernel, "m_grouped_fp8_gemm_nt_contiguous", None)
     deepgemm_per_token_cast_to_fp8 = resolve_internal_import(kernel, chained_path="utils.per_token_cast_to_fp8")
 
     missing = [
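
Passing None as the getattr default means a partially installed or outdated kernel no longer aborts on the first missing symbol with a bare AttributeError; instead every expected function is probed, and the existing `missing` check can report all gaps at once. A minimal sketch of this collect-then-raise pattern, with a hypothetical kernel object and error message (not the exact transformers code):

class _FakeKernel:
    # Pretend only one of the expected kernel functions is exported.
    def w8a8_fp8_matmul(self, a, b):
        return None

kernel = _FakeKernel()
expected = ["w8a8_fp8_matmul", "fp8_act_quant", "w8a8_fp8_matmul_batched"]

# getattr with a None default never raises, so every name gets probed
# and all missing ones surface in a single, descriptive error.
resolved = {name: getattr(kernel, name, None) for name in expected}
missing = [name for name, fn in resolved.items() if fn is None]
if missing:
    raise ImportError(
        f"finegrained-fp8 kernel is missing required functions: {missing}. "
        "Install or upgrade the `kernels` package to use the fp8 path."
    )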
@@ -596,9 +603,9 @@ def __init__(
         self.block_size = block_size
         self.hidden_dim = config.hidden_size
         self.activation_scheme = activation_scheme
-        self.num_experts = getattr(config, "num_local_experts", config.num_experts)
-        self.intermediate_dim = getattr(config, "moe_intermediate_size", config.intermediate_size)
-        self.act_fn = ACT2FN[getattr(config, "hidden_activation", config.hidden_act)]
+        self.num_experts = _first_attr(config, "num_local_experts", "num_experts")
+        self.intermediate_dim = _first_attr(config, "moe_intermediate_size", "intermediate_size")
+        self.act_fn = ACT2FN[_first_attr(config, "hidden_activation", "hidden_act")]
 
         if self.has_gate:
             gu_proj_out, gu_proj_in = 2 * self.intermediate_dim, self.hidden_dim
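
The removed lines had an eager-default bug: in getattr(config, "num_local_experts", config.num_experts), the fallback config.num_experts is evaluated as an argument before getattr runs, so the expression raises AttributeError on any config that defines num_local_experts but not num_experts, which is exactly the case the fallback was meant to cover. The new _first_attr helper probes each name lazily instead. A minimal reproduction, using a hypothetical config object:

from types import SimpleNamespace

# Hypothetical config: has the preferred attribute, lacks the fallback.
config = SimpleNamespace(num_local_experts=8)

# Eager default: config.num_experts is evaluated before getattr runs,
# so this raises even though num_local_experts exists.
try:
    getattr(config, "num_local_experts", config.num_experts)
except AttributeError as exc:
    print(f"eager fallback fails: {exc}")

def _first_attr(obj, *names):
    # Lazily return the first attribute that actually exists.
    for name in names:
        if hasattr(obj, name):
            return getattr(obj, name)
    raise AttributeError(f"{type(obj).__name__} has none of: {names}")

print(_first_attr(config, "num_local_experts", "num_experts"))  # -> 8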
