
Commit c1f8458

hmellor and Copilot authored and committed
Fix configuration reading and error handling for kernels (#45610)
* Fix missing conversion of experts
* Fix eager config attribute reading
* Add proper error when kernels isn't installed
* remove unnecessary mapping
* review comments
* remove double newline

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Copilot <copilot@github.com>
1 parent 45add8a · commit c1f8458

Showing 2 changed files with 18 additions and 10 deletions.


src/transformers/conversion_mapping.py

Lines changed: 2 additions & 1 deletion
@@ -75,7 +75,6 @@
     "qwen2_5_vl": "qwen2_vl",
     "sam3_tracker_video": "sam3_tracker",
     "pp_chart2table": "llava",
-    "qwen3_5_moe_text": "qwen3_5_text",
     "altclip_vision_model": "clip_vision_model",
     "chinese_clip_vision_model": "clip_vision_model",
     "clipseg_vision_model": "clip_vision_model",
@@ -600,6 +599,8 @@ def _build_checkpoint_conversion_mapping():
         WeightRenaming(source_patterns=r"mlp\.expert_bias", target_patterns="mlp.e_score_correction_bias"),
         WeightRenaming(source_patterns=r"mlp\.shared_mlp\.", target_patterns="mlp.shared_experts."),
     ]
+    mapping["qwen3_5_moe_text"] = mapping["qwen3_5_text"].copy()
+    mapping["qwen3_5_moe_text"] += mapping["qwen2_moe"].copy()
 
     for model_type, base_pattern in _MODEL_TO_CONVERSION_PATTERN.items():
         if model_type in mapping:
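
The new lines replace the static alias removed above: pointing qwen3_5_moe_text at the dense qwen3_5_text mapping reused only the dense renamings, so the expert weights were never converted. Building the entry at runtime copies the dense rules and appends the qwen2_moe ones. A minimal sketch of this composition pattern, with plain tuples standing in for the real WeightRenaming objects:

# Sketch only: tuples stand in for the real WeightRenaming entries.
mapping = {
    "qwen3_5_text": [("model.embed", "embed_tokens")],
    "qwen2_moe": [(r"mlp\.shared_mlp\.", "mlp.shared_experts.")],
}

# Copy the dense-model rules, then extend with the MoE expert rules;
# copying first keeps both source lists unmutated by the +=.
mapping["qwen3_5_moe_text"] = mapping["qwen3_5_text"].copy()
mapping["qwen3_5_moe_text"] += mapping["qwen2_moe"].copy()

assert mapping["qwen3_5_moe_text"] == mapping["qwen3_5_text"] + mapping["qwen2_moe"]
assert len(mapping["qwen3_5_text"]) == 1  # the dense mapping is unchanged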

src/transformers/integrations/finegrained_fp8.py

Lines changed: 16 additions & 9 deletions
@@ -53,6 +53,13 @@
 _deepgemm_available = None
 
 
+def _first_attr(obj, *names):
+    for name in names:
+        if hasattr(obj, name):
+            return getattr(obj, name)
+    raise AttributeError(f"{type(obj).__name__} has none of: {names}")
+
+
 def _load_triton_kernel():
     """Lazily load the finegrained-fp8 Triton kernel and extract functions.
 
@@ -74,10 +81,10 @@ def _load_triton_kernel():
     _triton_available = False  # mark attempted before any early exit
 
     kernel = lazy_load_kernel("finegrained-fp8")
-    triton_fp8_matmul = getattr(kernel, "w8a8_fp8_matmul")
-    triton_fp8_act_quant = getattr(kernel, "fp8_act_quant")
-    triton_batched_fp8_matmul = getattr(kernel, "w8a8_fp8_matmul_batched")
-    triton_grouped_fp8_matmul = getattr(kernel, "w8a8_fp8_matmul_grouped")
+    triton_fp8_matmul = getattr(kernel, "w8a8_fp8_matmul", None)
+    triton_fp8_act_quant = getattr(kernel, "fp8_act_quant", None)
+    triton_batched_fp8_matmul = getattr(kernel, "w8a8_fp8_matmul_batched", None)
+    triton_grouped_fp8_matmul = getattr(kernel, "w8a8_fp8_matmul_grouped", None)
 
     missing = [
         name
@@ -137,8 +144,8 @@ def _load_deepgemm_kernel():
     )
 
     kernel = lazy_load_kernel("deep-gemm")
-    deepgemm_fp8_matmul = getattr(kernel, "fp8_gemm_nt")
-    deepgemm_grouped_fp8_matmul = getattr(kernel, "m_grouped_fp8_gemm_nt_contiguous")
+    deepgemm_fp8_matmul = getattr(kernel, "fp8_gemm_nt", None)
+    deepgemm_grouped_fp8_matmul = getattr(kernel, "m_grouped_fp8_gemm_nt_contiguous", None)
     deepgemm_per_token_cast_to_fp8 = resolve_internal_import(kernel, chained_path="utils.per_token_cast_to_fp8")
 
     missing = [
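
Passing None as the getattr default means a partially installed or outdated kernel no longer aborts on the first missing symbol with a bare AttributeError; instead every expected function is probed, and the existing `missing` check can report all gaps at once. A minimal sketch of this collect-then-raise pattern, with a hypothetical kernel object and error message (not the exact transformers code):

class _FakeKernel:
    # Pretend only one of the expected kernel functions is exported.
    def w8a8_fp8_matmul(self, a, b):
        return None

kernel = _FakeKernel()
expected = ["w8a8_fp8_matmul", "fp8_act_quant", "w8a8_fp8_matmul_batched"]

# getattr with a None default never raises, so every name gets probed
# and all missing ones surface in a single, descriptive error.
resolved = {name: getattr(kernel, name, None) for name in expected}
missing = [name for name, fn in resolved.items() if fn is None]
if missing:
    raise ImportError(
        f"finegrained-fp8 kernel is missing required functions: {missing}. "
        "Install or upgrade the `kernels` package to use the fp8 path."
    )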
@@ -596,9 +603,9 @@ def __init__(
         self.block_size = block_size
         self.hidden_dim = config.hidden_size
         self.activation_scheme = activation_scheme
-        self.num_experts = getattr(config, "num_local_experts", config.num_experts)
-        self.intermediate_dim = getattr(config, "moe_intermediate_size", config.intermediate_size)
-        self.act_fn = ACT2FN[getattr(config, "hidden_activation", config.hidden_act)]
+        self.num_experts = _first_attr(config, "num_local_experts", "num_experts")
+        self.intermediate_dim = _first_attr(config, "moe_intermediate_size", "intermediate_size")
+        self.act_fn = ACT2FN[_first_attr(config, "hidden_activation", "hidden_act")]
 
         if self.has_gate:
             gu_proj_out, gu_proj_in = 2 * self.intermediate_dim, self.hidden_dim
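
The removed lines had an eager-default bug: in getattr(config, "num_local_experts", config.num_experts), the fallback config.num_experts is evaluated as an argument before getattr runs, so the expression raises AttributeError on any config that defines num_local_experts but not num_experts, which is exactly the case the fallback was meant to cover. The new _first_attr helper probes each name lazily instead. A minimal reproduction, using a hypothetical config object:

from types import SimpleNamespace

# Hypothetical config: has the preferred attribute, lacks the fallback.
config = SimpleNamespace(num_local_experts=8)

# Eager default: config.num_experts is evaluated before getattr runs,
# so this raises even though num_local_experts exists.
try:
    getattr(config, "num_local_experts", config.num_experts)
except AttributeError as exc:
    print(f"eager fallback fails: {exc}")

def _first_attr(obj, *names):
    # Lazily return the first attribute that actually exists.
    for name in names:
        if hasattr(obj, name):
            return getattr(obj, name)
    raise AttributeError(f"{type(obj).__name__} has none of: {names}")

print(_first_attr(config, "num_local_experts", "num_experts"))  # -> 8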
