
Commit 15589f3

make moe output dtype consistent on non-cuda backends
1 parent: 3a62fac

1 file changed: 1 addition & 8 deletions

examples/models/qwen3_5_moe/model.py
@@ -21,7 +21,6 @@
 
 import torch
 import torch.nn as nn
-
 from executorch.examples.models.qwen3_5_moe.sampler import sample
 from torch.nn import functional as F
 
@@ -186,7 +185,6 @@ def _apply_rotary(x, cos, sin):
 
 
 class KVCache(nn.Module):
-
     def __init__(self, n_kv_heads, head_dim, max_seq_len):
         super().__init__()
         self.register_buffer(
@@ -207,7 +205,6 @@ def update(self, input_pos, k_val, v_val):
 
 
 class FullAttention(nn.Module):
-
     def __init__(self, config):
         super().__init__()
         self.n_heads = config.num_attention_heads
@@ -318,7 +315,6 @@ def forward(self, x, input_pos):
 
 
 class GatedDeltaNet(nn.Module):
-
     def __init__(self, config):
         super().__init__()
         self.num_k_heads = config.linear_num_key_heads
@@ -540,7 +536,6 @@ def forward(self, x):
 
 
 class SparseMoE(nn.Module):
-
     def __init__(self, config):
         super().__init__()
         self.top_k = config.num_experts_per_tok
@@ -574,7 +569,6 @@ def forward(self, x):
 
 
 class Block(nn.Module):
-
     def __init__(self, config, layer_idx):
         super().__init__()
         self.layer_type = config.layer_types[layer_idx]
@@ -599,7 +593,6 @@ def forward(self, x, input_pos):
 
 
 class Qwen35MoE(nn.Module):
-
     def __init__(self, config):
         super().__init__()
         self.config = config
@@ -625,7 +618,7 @@ def forward(
         # position. Otherwise apply the prefill optimization and only
         # materialize ``[B, V]`` for the last token.
         if temperature is None:
-            return self.lm_head(x).float()  # [B, T, V] float32
+            return self.lm_head(x)  # [B, T, V] in model dtype
         logits = self.lm_head(x[:, -1, :]).float()  # [B, V] float32
         # GPU-side Gumbel-max sampling: argmax(logits/T + gumbel_noise) is
         # equivalent to drawing from softmax(logits/T) but stays entirely
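
For context, the logits path the final hunk touches can be sketched outside the repository. The `TinyLM` class, its dimensions, and the inline Gumbel sampling below are illustrative stand-ins, not the repository's actual `Qwen35MoE` or `sample` implementation; only the dtype behavior of `lm_head` and the Gumbel-max identity mirror the diff.

    # Minimal sketch (assumed names, not the repo's API) of why dropping
    # `.float()` keeps the logits-only output in the model's dtype.
    import torch
    import torch.nn as nn

    class TinyLM(nn.Module):
        def __init__(self, hidden=16, vocab=32, dtype=torch.bfloat16):
            super().__init__()
            self.lm_head = nn.Linear(hidden, vocab, dtype=dtype)

        def forward(self, x, temperature=None):
            if temperature is None:
                # Logits-only path: return in the model dtype. The removed
                # `.float()` forced float32 here regardless of backend.
                return self.lm_head(x)                  # [B, T, V], model dtype
            logits = self.lm_head(x[:, -1, :]).float()  # [B, V] float32
            # Gumbel-max trick from the diff comments: argmax(logits/T + g),
            # with g ~ Gumbel(0, 1), is a draw from softmax(logits/T) and
            # stays entirely on-device (no softmax/multinomial needed).
            u = torch.rand_like(logits).clamp(min=1e-20)
            gumbel = -torch.log(-torch.log(u))
            return torch.argmax(logits / temperature + gumbel, dim=-1)  # [B]

    model = TinyLM()
    x = torch.randn(2, 5, 16, dtype=torch.bfloat16)
    print(model(x).dtype)             # torch.bfloat16, matches the model dtype
    print(model(x, temperature=0.8))  # sampled token ids, shape [2]

Presumably the unconditional float32 upcast gave the MoE model a different output dtype than the surrounding model dtype on non-CUDA backends; returning `lm_head(x)` unchanged makes the output dtype consistent everywhere, per the commit title. The sampling path still upcasts to float32, but only for the single last-token slice it materializes.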
