Commit be3e950

add model temperature input (CUDA backend only)

1 parent 08777af · commit be3e950

1 file changed: examples/models/qwen3_5_moe/main.cpp (40 additions, 6 deletions)
@@ -25,6 +25,8 @@
 
 #ifdef EXECUTORCH_BUILD_CUDA
 #include <cuda_runtime.h>
+#else
+#include <executorch/extension/llm/sampler/util.h>
 #endif
 
 DEFINE_string(model_path, "", "Model .pte file path.");
@@ -37,7 +39,7 @@ DEFINE_string(
     "Path to file containing prompt text (overrides --prompt).");
 DEFINE_double(temperature, 0.8, "Sampling temperature (0 = greedy).");
 DEFINE_int32(max_new_tokens, 128, "Maximum tokens to generate.");
-DEFINE_bool(cuda_graph, false, "Enable CUDA graph for decode method.");
+DEFINE_bool(cuda_graph, false, "Enable CUDA graph for decode method. CUDA only.");
 
 namespace llm = ::executorch::extension::llm;
 using ::executorch::extension::from_blob;
@@ -48,10 +50,18 @@ using ::executorch::runtime::EValue;
 
 using SizesType = executorch::aten::SizesType;
 
-// Read a sampled token from the model output tensor [B, 1].
-// The model performs Gumbel-max sampling on-device and returns a single
-// float token ID. This function copies it from GPU and casts to uint64.
+// Convert a model output tensor to the next sampled token id.
+//
+// On the CUDA build, the model fuses the sampler in (see sampler.py /
+// Qwen35MoE.forward) and returns a single sampled token id as a [B, 1]
+// float tensor; we just copy that scalar back from device.
+//
+// On non-CUDA builds (Metal / MLX / CPU), the model returns raw logits
+// of shape [B, T, V] in the model dtype (typically bf16). We sample on
+// CPU via the shared `llm::logits_to_token` helper, which accepts a
+// temperature (0 = greedy / argmax).
 static uint64_t read_token(const executorch::aten::Tensor& output) {
+#ifdef EXECUTORCH_BUILD_CUDA
   const void* ptr = output.const_data_ptr();
 
   cudaPointerAttributes attrs;
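The hunk above elides the middle of read_token's CUDA path, but its shape is clear from the surrounding context: check where the output pointer lives, then copy one float back. A minimal sketch of that pattern (not the commit's exact code; error handling simplified):

#include <cuda_runtime.h>

#include <cstdint>
#include <cstring>

// Sketch: read one sampled float token id back from a [B, 1] output whose
// data may live on the GPU. Mirrors the pattern read_token uses above.
static uint64_t copy_scalar_token(const void* ptr) {
  float val = 0.0f;
  cudaPointerAttributes attrs;
  if (cudaPointerGetAttributes(&attrs, ptr) == cudaSuccess &&
      attrs.type == cudaMemoryTypeDevice) {
    // Device-resident output: synchronous scalar copy back to the host.
    cudaMemcpy(&val, ptr, sizeof(float), cudaMemcpyDeviceToHost);
  } else {
    // Host-accessible memory (host, pinned, or managed): plain memcpy.
    memcpy(&val, ptr, sizeof(float));
  }
  return static_cast<uint64_t>(val);
}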
@@ -73,6 +83,14 @@ static uint64_t read_token(const executorch::aten::Tensor& output) {
     memcpy(&val, ptr, sizeof(float));
   }
   return static_cast<uint64_t>(val);
+#else
+  // logits_to_token handles 2D / 3D logits and Float / Half / BFloat16 /
+  // UInt16 dtypes. Negative temperatures are clamped to 0 (greedy).
+  const float temp = FLAGS_temperature <= 0.0
+      ? 0.0f
+      : static_cast<float>(FLAGS_temperature);
+  return static_cast<uint64_t>(llm::logits_to_token(output, temp));
+#endif
 }
 
 int main(int argc, char** argv) {
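For intuition about the non-CUDA branch: at temperature 0, llm::logits_to_token reduces to an argmax over the vocabulary. A simplified stand-in (not the ExecuTorch helper, which per the comment above also handles [B, T, V] shapes and Half / BFloat16 data) would be:

#include <cstddef>
#include <cstdint>

// Simplified stand-in: greedy (temperature 0) token selection over one
// row of float logits of length vocab_size.
static uint64_t greedy_token(const float* logits, size_t vocab_size) {
  size_t best = 0;
  for (size_t i = 1; i < vocab_size; ++i) {
    if (logits[i] > logits[best]) {
      best = i;  // highest logit wins; no noise, no division by T
    }
  }
  return static_cast<uint64_t>(best);
}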
@@ -133,16 +151,23 @@ int main(int argc, char** argv) {
   }
   auto metadata = metadata_result.get();
 
+#ifdef EXECUTORCH_BUILD_CUDA
   // Set CUDA graph option if requested (must be before load_method)
   if (FLAGS_cuda_graph) {
     executorch::runtime::BackendOptions<2> cuda_opts;
     cuda_opts.set_option("enable_cuda_graph_for_method", "decode");
     executorch::runtime::set_option("CudaBackend", cuda_opts.view());
     printf("CUDA graph enabled for decode method\n");
   }
+#else
+  if (FLAGS_cuda_graph) {
+    ET_LOG(Info, "--cuda_graph ignored on non-CUDA build");
+  }
+#endif
 
   printf("Loading methods...\n");
 
+#ifdef EXECUTORCH_BUILD_CUDA
   // Enable cross-method per-FQN weight sharing in the CUDA backend so that
   // prefill and decode (which share KV cache and other mutable buffers /
   // weights) avoid duplicate GPU allocations. This is critical for fitting
@@ -170,6 +195,7 @@ int main(int argc, char** argv) {
       return 1;
     }
   }
+#endif
 
   auto err = module->load_method("prefill");
   if (err != Error::Ok) {
@@ -224,12 +250,16 @@ int main(int argc, char** argv) {
   // ---------------------------------------------------------------
   auto S = [](int64_t v) -> SizesType { return static_cast<SizesType>(v); };
 
-  // Use a very small temperature for greedy to avoid division by zero
-  // while keeping the Gumbel noise negligible relative to logit differences.
+#ifdef EXECUTORCH_BUILD_CUDA
+  // CUDA build: model fuses the sampler in. Pass a temperature tensor as
+  // a third input. Use a very small temperature for greedy to avoid
+  // division by zero while keeping the Gumbel noise negligible relative
+  // to logit differences.
   float temp_val =
       FLAGS_temperature <= 0.0 ? 1e-6f : static_cast<float>(FLAGS_temperature);
   auto temp_tensor =
       from_blob(&temp_val, {1}, executorch::aten::ScalarType::Float);
+#endif
 
   // ---------------------------------------------------------------
   // Prefill
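The "Gumbel noise" comment refers to the Gumbel-max trick: drawing from softmax(logits / T) is equivalent to computing argmax_i(logits_i / T + g_i) with i.i.d. noise g_i = -log(-log(u_i)), u_i ~ Uniform(0, 1). The noise is O(1) while logits / T grows without bound as T -> 0, so a tiny T such as 1e-6 makes the argmax effectively greedy without dividing by zero. An illustrative CPU version (the fused on-device sampler is assumed to do the equivalent):

#include <cmath>
#include <cstddef>
#include <cstdint>
#include <random>
#include <vector>

// Illustrative Gumbel-max sampler: argmax(logits / T + g) with
// g ~ Gumbel(0, 1) is distributed as softmax(logits / T).
static uint64_t gumbel_max_sample(const std::vector<float>& logits,
                                  float temperature,
                                  std::mt19937& rng) {
  std::uniform_real_distribution<float> uniform(1e-20f, 1.0f);
  size_t best = 0;
  float best_score = -INFINITY;
  for (size_t i = 0; i < logits.size(); ++i) {
    const float g = -std::log(-std::log(uniform(rng)));  // Gumbel(0, 1)
    const float score = logits[i] / temperature + g;
    if (score > best_score) {
      best_score = score;
      best = i;
    }
  }
  return static_cast<uint64_t>(best);
}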
@@ -260,7 +290,9 @@ int main(int argc, char** argv) {
   std::vector<EValue> prefill_inputs;
   prefill_inputs.push_back(tokens_tensor);
   prefill_inputs.push_back(pos_tensor);
+#ifdef EXECUTORCH_BUILD_CUDA
   prefill_inputs.push_back(temp_tensor);
+#endif
 
   auto prefill_result = module->execute(run_method, prefill_inputs);
   if (prefill_result.error() != Error::Ok) {
@@ -308,7 +340,9 @@ int main(int argc, char** argv) {
   std::vector<EValue> decode_inputs;
   decode_inputs.push_back(EValue(decode_tokens));
   decode_inputs.push_back(EValue(decode_pos));
+#ifdef EXECUTORCH_BUILD_CUDA
   decode_inputs.push_back(EValue(temp_tensor));
+#endif
 
   auto decode_result = module->execute("decode", decode_inputs);
   if (decode_result.error() != Error::Ok) {
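Taken together with read_token above, the generation loop these hunks imply is roughly the following sketch. Identifiers follow the diff; the loop bookkeeping and the result-unpacking calls are assumptions, not the commit's exact code:

// Sketch of the decode loop: one execute() per generated token, with the
// temperature tensor appended only on the CUDA build.
for (int step = 0; step < FLAGS_max_new_tokens; ++step) {
  std::vector<EValue> decode_inputs;
  decode_inputs.push_back(EValue(decode_tokens));  // current token, [1, 1]
  decode_inputs.push_back(EValue(decode_pos));     // current position
#ifdef EXECUTORCH_BUILD_CUDA
  decode_inputs.push_back(EValue(temp_tensor));    // fused-sampler input
#endif
  auto decode_result = module->execute("decode", decode_inputs);
  if (decode_result.error() != Error::Ok) {
    break;
  }
  // read_token handles both builds: scalar copy-back on CUDA,
  // CPU sampling over raw logits otherwise.
  uint64_t token = read_token(decode_result.get()[0].toTensor());
  // ... feed `token` back into decode_tokens and advance decode_pos ...
}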
