Skip to content

Commit fc5018e

Browse files
committed
Revert model.py, export.py, main.cpp to main branch
Only chunk_gated_delta_rule.py needs modification — dispatch logic is internal to the triton_op, no model/export/runner changes needed.
1 parent a6ebe8a commit fc5018e

File tree

3 files changed

+32
-44
lines changed

3 files changed

+32
-44
lines changed

examples/models/qwen3_5_moe/export.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -374,19 +374,17 @@ def export_and_lower(model, config, args):
374374
# -O0 compiles ~8x faster than -O1 with no measurable runtime impact.
375375
inductor_config.aot_inductor.compile_wrapper_opt_level = "O0"
376376

377-
# --- Single method: dynamic T ---
378-
# Runtime dispatch between recurrent (T=1) and chunked (T>1) happens
379-
# inside the chunk_gated_delta_rule triton_op, not at model level.
380-
tokens = torch.tensor([[0, 1]], dtype=torch.long)
381-
input_pos = torch.tensor([0, 1], dtype=torch.long)
377+
# Dynamic shapes
378+
example_tokens = torch.tensor([[0, 1]], dtype=torch.long)
379+
example_input_pos = torch.tensor([0, 1], dtype=torch.long)
382380
seq_dim = Dim("seq_len", min=1, max=config.max_seq_len - 1)
383381
dynamic_shapes = ({1: seq_dim}, {0: seq_dim})
384382

385-
print("Exporting model (single method, dynamic T)...")
383+
print("Exporting with torch.export...")
386384
with torch.no_grad():
387-
prog = export(
385+
exported = export(
388386
model,
389-
(tokens, input_pos),
387+
(example_tokens, example_input_pos),
390388
dynamic_shapes=dynamic_shapes,
391389
strict=True,
392390
)
@@ -404,7 +402,7 @@ def export_and_lower(model, config, args):
404402
"enable_dynamic_shape": True,
405403
}
406404
et_prog = to_edge_transform_and_lower(
407-
prog,
405+
exported,
408406
partitioner=[CudaPartitioner(compile_specs)],
409407
compile_config=EdgeCompileConfig(
410408
_check_ir_validity=False,

examples/models/qwen3_5_moe/main.cpp

Lines changed: 17 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,12 @@
99
#include <gflags/gflags.h>
1010

1111
#include <executorch/extension/llm/runner/text_llm_runner.h>
12+
#include <executorch/extension/module/module.h>
1213
#include <executorch/runtime/platform/log.h>
1314
#include <pytorch/tokenizers/hf_tokenizer.h>
1415

15-
#include <optional>
1616
#include <string>
17+
#include <vector>
1718

1819
DEFINE_string(model_path, "", "Model .pte file path.");
1920
DEFINE_string(data_path, "", "Data file (.ptd) for CUDA backend.");
@@ -23,7 +24,6 @@ DEFINE_double(temperature, 0.8, "Sampling temperature (0 = greedy).");
2324
DEFINE_int32(max_new_tokens, 128, "Maximum tokens to generate.");
2425

2526
namespace llm = ::executorch::extension::llm;
26-
using ::executorch::runtime::Error;
2727

2828
int main(int argc, char** argv) {
2929
gflags::ParseCommandLineFlags(&argc, &argv, true);
@@ -37,6 +37,11 @@ int main(int argc, char** argv) {
3737
return 1;
3838
}
3939

40+
std::vector<std::string> data_files;
41+
if (!FLAGS_data_path.empty()) {
42+
data_files.push_back(FLAGS_data_path);
43+
}
44+
4045
// Load tokenizer
4146
auto tokenizer = std::make_unique<tokenizers::HFTokenizer>();
4247
auto tok_status = tokenizer->load(FLAGS_tokenizer_path);
@@ -48,37 +53,23 @@ int main(int argc, char** argv) {
4853
return 1;
4954
}
5055

51-
// Single-method runner: "forward" handles both prefill (T>1) and decode (T=1)
52-
// via torch.cond dispatch inside the model.
53-
fprintf(stderr, "Loading model from %s...\n", FLAGS_model_path.c_str());
54-
std::optional<const std::string> data_path =
55-
FLAGS_data_path.empty() ? std::nullopt
56-
: std::optional<const std::string>(FLAGS_data_path);
56+
// Create LLM runner
5757
auto runner = llm::create_text_llm_runner(
58-
FLAGS_model_path,
59-
std::move(tokenizer),
60-
data_path,
61-
FLAGS_temperature);
62-
fprintf(stderr, "Runner created successfully\n");
58+
FLAGS_model_path, std::move(tokenizer), data_files, FLAGS_temperature);
59+
60+
if (runner == nullptr) {
61+
ET_LOG(Error, "Failed to create runner");
62+
return 1;
63+
}
6364

6465
// Generate
6566
llm::GenerationConfig config;
6667
config.temperature = FLAGS_temperature;
6768
config.max_new_tokens = FLAGS_max_new_tokens;
6869

69-
fprintf(stderr, "Starting generation with prompt: %s\n", FLAGS_prompt.c_str());
70-
try {
71-
auto error = runner->generate(FLAGS_prompt.c_str(), config);
72-
if (error != Error::Ok) {
73-
fprintf(stderr, "Generation failed with error code: %d\n", static_cast<int>(error));
74-
return 1;
75-
}
76-
fprintf(stderr, "Generation completed successfully\n");
77-
} catch (const std::exception& e) {
78-
fprintf(stderr, "Exception during generation: %s\n", e.what());
79-
return 1;
80-
} catch (...) {
81-
fprintf(stderr, "Unknown exception during generation\n");
70+
auto error = runner->generate(FLAGS_prompt.c_str(), config);
71+
if (error != executorch::runtime::Error::Ok) {
72+
ET_LOG(Error, "Generation failed");
8273
return 1;
8374
}
8475

examples/models/qwen3_5_moe/model.py

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -114,9 +114,9 @@ def __init__(self, dim, eps=1e-6):
114114
self.eps = eps
115115

116116
def forward(self, x):
117-
x_fp32 = x.float()
118-
rms = torch.rsqrt(x_fp32.pow(2).mean(-1, keepdim=True) + self.eps)
119-
return (x_fp32 * rms * (1.0 + self.weight.float())).to(x.dtype)
117+
x_float = x.float()
118+
normed = x_float * torch.rsqrt(x_float.pow(2).mean(-1, keepdim=True) + self.eps)
119+
return (normed * (1.0 + self.weight.float())).type_as(x)
120120

121121

122122
class RMSNormGated(nn.Module):
@@ -128,10 +128,10 @@ def __init__(self, dim, eps=1e-6):
128128
self.eps = eps
129129

130130
def forward(self, x, z):
131-
x_fp32 = x.float()
132-
rms = torch.rsqrt(x_fp32.pow(2).mean(-1, keepdim=True) + self.eps)
133-
normed = x_fp32 * rms
134-
return (self.weight.float() * normed * torch.nn.functional.silu(z.float())).to(x.dtype)
131+
x_float = x.float()
132+
normed = x_float * torch.rsqrt(x_float.pow(2).mean(-1, keepdim=True) + self.eps)
133+
normed = self.weight * normed.type_as(x)
134+
return (normed * F.silu(z.float())).type_as(x)
135135

136136

137137
# ---------------------------------------------------------------------------
@@ -390,8 +390,7 @@ def forward(self, x, input_pos):
390390
beta = b.sigmoid()
391391
g = -self.A_log.float().exp() * F.softplus(a.float() + self.dt_bias)
392392

393-
# Gated delta rule: dispatch happens inside the triton_op
394-
# (recurrent kernel for T=1 decode, chunked FLA for T>1 prefill).
393+
# FLA Triton kernel (returns final_state separately, does not mutate initial_state)
395394
output, state = torch.ops.triton.chunk_gated_delta_rule(
396395
q, k, v, g, beta, self.recurrent_state[:B]
397396
)

0 commit comments

Comments (0)