make moe output dtype consistent on non-cuda backends

Gasoonjia · Gasoonjia · commit 08777af210ca · 2026-04-27T13:41:19.000-07:00
diff --git a/examples/models/qwen3_5_moe/model.py b/examples/models/qwen3_5_moe/model.py
@@ -613,10 +613,6 @@ def forward(
         for layer in self.layers:
             x = layer(x, input_pos)
         x = self.norm(x)
-        # When no sampling is requested, return the full ``[B, T, V]``
-        # logits so callers (eval, custom samplers) can inspect every
-        # position. Otherwise apply the prefill optimization and only
-        # materialize ``[B, V]`` for the last token.
         if temperature is None:
             return self.lm_head(x)  # [B, T, V] in model dtype
         logits = self.lm_head(x[:, -1, :]).float()  # [B, V] float32