fengmk2
diff --git a/crates/mlx-core/src/array/banded_attention.rs b/crates/mlx-core/src/array/banded_attention.rs
Lines changed: 8 additions & 10 deletions
diff --git a/crates/mlx-core/src/convert.rs b/crates/mlx-core/src/convert.rs
Lines changed: 11 additions & 13 deletions
diff --git a/crates/mlx-core/src/decode_profiler.rs b/crates/mlx-core/src/decode_profiler.rs
Lines changed: 1 addition & 1 deletion
diff --git a/crates/mlx-core/src/models/lfm2/compiled_parity_test.rs b/crates/mlx-core/src/models/lfm2/compiled_parity_test.rs
Lines changed: 13 additions & 14 deletions
diff --git a/crates/mlx-core/src/models/lfm2/config.rs b/crates/mlx-core/src/models/lfm2/config.rs
Lines changed: 80 additions & 0 deletions
@@ -1,16 +1,14 @@
 //! Bidirectional banded attention with per-head sinks — reference implementation.
 //!
-//! This is the **correctness oracle** for the future Metal kernel (Task B2). It
-//! composes existing MLX ops (matmul, softmax, mask construction, concat,
-//! slice) instead of running a single fused kernel, so it is intentionally
-//! slow but provably correct.
+//! This is the **correctness oracle** for the fused Metal kernel. It composes
+//! existing MLX ops (matmul, softmax, mask construction, concat, slice) instead
+//! of running a single fused kernel, so it is intentionally slow but provably
+//! correct.
 //!
-//! Architectural note: the eventual kernel-backed primitive lives in
-//! `mlx-paged-attn`, but adding `mlx-core` as a dependency there would create a
-//! cycle (`mlx-core → mlx-paged-attn` already exists). We therefore keep this
-//! reference alongside `attention.rs` in `mlx-core`. The B2 kernel will be able
-//! to depend on this crate via `dev-dependencies` for its tests, or duplicate
-//! a minimal Rust-side scaffold for input prep.
+//! Architectural note: the kernel-backed primitive lives in `mlx-paged-attn`,
+//! but adding `mlx-core` as a dependency there would create a cycle
+//! (`mlx-core → mlx-paged-attn` already exists). We therefore keep this
+//! reference alongside `attention.rs` in `mlx-core`.
 //!
 //! ## Math
 //!
 
@@ -345,17 +345,15 @@ async fn convert_model_inner(options: ConversionOptions) -> Result<ConversionRes
         )));
     }
 
-    // LFM2 mxfp/nvfp now SUPPORTED for non-MoE linears (fast-follow #1a): the
-    // lfm2 loader's attention / conv / dense-MLP projections are mode-aware
+    // LFM2 mxfp/nvfp is supported for non-MoE linears: the lfm2 loader's
+    // attention / conv / dense-MLP projections are mode-aware
     // `LinearProj`/`MLPVariant` backed by `QuantizedLinear`, which threads the
     // resolved mode (affine / mxfp4 / mxfp8 / nvfp4) into `mlx_quantized_matmul`
-    // at forward time. The MoE experts/gate already supported all four modes.
-    // The EMBEDDING and lm_head remain excluded from quantization (vocab-dim
-    // tensors): `should_quantize` skips `embed_tokens`/`lm_head`, so an
-    // mxfp8/mxfp4/nvfp4 lfm2 checkpoint ships quantized experts + attn/conv/
-    // dense-MLP and a plain bf16 embedding — which the #1a loader can load. A
-    // quant-capable embedding lands in #1b; the prior affine-only gate is thus
-    // removed.
+    // at forward time. The MoE experts/gate support all four modes. The
+    // embedding and lm_head are excluded from quantization (vocab-dim tensors):
+    // `should_quantize` skips `embed_tokens`/`lm_head`, so an mxfp8/mxfp4/nvfp4
+    // lfm2 checkpoint ships quantized experts + attn/conv/dense-MLP and a plain
+    // bf16 embedding.
 
     // Validate recipe
     if let Some(ref recipe) = quant_recipe {
@@ -563,9 +561,9 @@ async fn convert_model_inner(options: ConversionOptions) -> Result<ConversionRes
     let is_privacy_filter = matches!(model_type.as_deref(), Some("privacy-filter"));
 
     // Refuse `--quantize` against pre-quantized MTP sources for Qwen3.5/3.6.
-    // The W6.15 convert path retains `mtp.*` tensors untouched (MTPLX
-    // "final form" convention), which means existing `mtp.*.scales` /
-    // `.biases` flow through to the output. Re-quantizing the
+    // The convert path retains `mtp.*` tensors untouched (MTPLX "final form"
+    // convention), which means existing `mtp.*.scales` / `.biases` flow
+    // through to the output. Re-quantizing the
     // language-model body simultaneously rewrites the global `quantization`
     // block in `config.json` to whatever bits/group_size/mode the user
     // asked for, and the load path (`mtp.rs::apply_weights`) resolves
@@ -761,7 +759,7 @@ async fn convert_model_inner(options: ConversionOptions) -> Result<ConversionRes
             // projections (q/k/v/o) and MoE experts (gate_up_proj, down_proj);
             // quantize routers at 8-bit affine when --q-mode affine; leave
             // embeddings, classifier head, norms, biases, and attention sinks
-            // at bf16. Inference path is bf16-only until Phase C lands.
+            // at bf16. Inference path is currently bf16-only.
             let preserved_extra = if quant_mode == "affine" {
                 "8-bit-affine routers"
             } else {
 
@@ -85,7 +85,7 @@ pub struct DecodeProfiler {
     first_token_marked: bool,
     memory_before: Option<MemorySnapshot>,
     memory_after: Option<MemorySnapshot>,
-    /// MTP speculative-decode acceptance counters (W6.33). Updated by
+    /// MTP speculative-decode acceptance counters. Updated by
     /// `record_mtp_cycle` once per draft+verify cycle. `mtp_cycles == 0`
     /// means no MTP cycle ran (a plain autoregressive decode).
     mtp_cycles: u64,
 
@@ -1,9 +1,8 @@
-//! Phase-1 component-parity gate for the lfm2 compiled C++ forward path.
+//! Component-parity gate for the lfm2 compiled C++ forward path.
 //!
-//! lfm2's compiled forward is not end-to-end runnable until the full backbone
-//! lands (Phase 2+), so we validate the parity-critical novel C++ — the
-//! attention pure-fn, the dense SwiGLU MLP, and the ShortConv operator — in
-//! ISOLATION here, against the Rust-native single-layer forward. The C++ probes
+//! Validates the parity-critical novel C++ — the attention pure-fn, the dense
+//! SwiGLU MLP, and the ShortConv operator — in ISOLATION here, against the
+//! Rust-native single-layer forward. The C++ probes
 //! (`mlx_lfm2_probe_attn_seq`, `mlx_lfm2_probe_dense_mlp`,
 //! `mlx_lfm2_probe_conv_seq`) register one layer's weights into the shared
 //! `g_weights()` map, run the compiled pure-fn, and return the output.
@@ -437,7 +436,7 @@ fn compiled_conv_seq_matches_native_with_bias() {
     run_conv_parity(true);
 }
 
-/// 2b-1 end-to-end-SHAPED gate: the full `lfm2_decode_fn` assembly (driven via
+/// End-to-end-SHAPED gate: the full `lfm2_decode_fn` assembly (driven via
 /// the synthetic-model probe) must match a hand-assembled native `[conv, attn,
 /// conv]` dense stack over the same `T`-step decode. Exercises the per-layer
 /// conv/attn dispatch (from `is_attn[]`), the operator_norm→op→+res→ffn_norm→
@@ -716,13 +715,13 @@ fn run_decode_seq_parity(conv_bias: bool) {
     );
 }
 
-/// 2b-1 full-decode parity WITHOUT conv biases (LFM2.5 production default).
+/// Full-decode parity WITHOUT conv biases (LFM2.5 production default).
 #[test]
 fn compiled_decode_seq_matches_native() {
     run_decode_seq_parity(false);
 }
 
-/// Phase 4 Piece 1: the SAME full synthetic decode-sequence parity, but with the
+/// The SAME full synthetic decode-sequence parity, but with the
 /// ShortConv biases (`conv.in_proj.bias`, `conv.conv.bias`, `conv.out_proj.bias`)
 /// seeded into the registry and applied on BOTH sides — the compiled
 /// `lfm2_decode_fn` via `cfg.conv_bias` (threaded through the probe's `conv_bias`
@@ -734,7 +733,7 @@ fn compiled_decode_seq_matches_native_with_conv_bias() {
     run_decode_seq_parity(true);
 }
 
-/// Phase-3a end-to-end-SHAPED MoE gate: the full `lfm2_decode_fn` assembly with
+/// End-to-end-SHAPED MoE gate: the full `lfm2_decode_fn` assembly with
 /// the sparse-MoE FFN branch (driven via `mlx_lfm2_probe_moe_decode_seq`) must
 /// match a hand-assembled native `[conv(dense), attn(MoE), conv(MoE)]` stack over
 /// the same `T`-step decode. The dense layer (idx 0 < num_dense_layers) routes
@@ -1094,7 +1093,7 @@ fn compiled_moe_decode_seq_matches_native() {
     );
 }
 
-/// DECISIVE H1/H2 experiment: COMPILED-vs-EAGER synthetic MoE.
+/// COMPILED-vs-EAGER synthetic MoE comparison.
 ///
 /// Drives the process-global `compiled_lfm2_decode()` (NOT eager
 /// `lfm2_decode_fn`) with a FIXED 3-layer synthetic MoE stack and compares the
@@ -1109,7 +1108,7 @@ fn compiled_moe_decode_seq_matches_native() {
 /// (2) NEAR-TIE router (`expert_bias` gaps of 1e-4, E=32/k=4 fan-out matching
 ///     the real 8B model -> selection decided by softmax(routing) near-ties,
 ///     FP-fusion sensitive). Diagnostic: a nonzero diff here positively
-///     confirms the near-tie selection-flip mechanism (H2).
+///     confirms the near-tie selection-flip mechanism.
 ///
 /// Runs in its OWN test so its fixed synthetic topology bakes into the compiled
 /// static cleanly. `WS_TOL`, `NT_MIN_DIVERGENCE`, `ASSERT_NT_GT_WS` env vars
@@ -1205,13 +1204,13 @@ fn compiled_moe_ab_model_swap_recompiles() {
     // DIFFERENT constants than MODEL A. If `warm_seed == seed_a`, the stale closure
     // would replay constants byte-identical to MODEL A's and (wrongly) still
     // produce A's correct logits, making the MODEL-A epoch-bump non-load-bearing
-    // and the F3 gold-standard vacuous. With a different seed, removing the MODEL-A
+    // and this gold-standard test vacuous. With a different seed, removing the MODEL-A
     // bump makes A's compiled run replay `warm_seed`'s weights and
     // `a_comp_vs_a_eager` blows past PARITY_TOL — which is the regression the
     // MODEL-A bump must defeat.
     let warm_seed = 0x7777_8888_9999_AAAAu64;
 
-    // F3 soundness: PRE-SEED a compiled closure at the current epoch BEFORE the
+    // Soundness: PRE-SEED a compiled closure at the current epoch BEFORE the
     // measured probe so the A-side stale-closure hazard manifests
     // DETERMINISTICALLY. The dedicated `warm_compiled_no_bump` probe registers its
     // own synthetic `warm_seed` weights, then — crucially — performs a SAME-EPOCH
@@ -1223,7 +1222,7 @@ fn compiled_moe_ab_model_swap_recompiles() {
     // well-separated compiled-vs-eager probe). It then clears the weights and, by
     // design, does NOT bump the compile epoch. So the measured A->B probe below
     // re-enters with `warm_seed`'s stale closure cached at this epoch: WITHOUT the
-    // MODEL-A `build_model` epoch bump (the F3 production-style fix), MODEL A's
+    // MODEL-A `build_model` epoch bump (the production-style fix), MODEL A's
     // compiled run reuses that stale closure, replays `warm_seed`'s frozen
     // constants, and `a_comp_vs_a_eager` blows past PARITY_TOL — i.e. removing the
     // MODEL-A bump makes THIS test fail. WITH the bump, MODEL A is epoch-fresh and
 
@@ -192,6 +192,34 @@ impl Lfm2Config {
         self.num_experts.is_some()
     }
 
+    /// Resolve the load-time default for `use_block_paged_cache`.
+    ///
+    /// Pure policy function (no I/O), isolated here so it can be unit tested.
+    /// `explicit` is the value read from config.json, if any:
+    /// * `Some(_)` always wins and is returned unchanged (the user/converter
+    ///   pinned the storage backend — never override it).
+    /// * `None` with `is_quantized == true` resolves to `Some(false)` (flat
+    ///   eager decode). Quantized checkpoints can never register with the
+    ///   compiled-PAGED C++ path (`should_register_compiled` short-circuits on
+    ///   `is_quantized`), so the paged route degenerates to the slow
+    ///   eager-PAGED loop (~12 `synchronize_mlx()`/token, blocking `y.eval()`,
+    ///   no async double-buffering). The flat path uses an in-graph `KVCache`
+    ///   and `async_eval_arrays` (zero per-layer sync) and is ~1.84× faster on
+    ///   the measured mxfp8 LFM2.5-8B-A1B workload.
+    /// * `None` with `is_quantized == false` (e.g. bf16) stays `None`, so
+    ///   `Lfm2Inner::new`'s `unwrap_or(true)` continues to yield PAGED, which
+    ///   is what bf16 wants (compiled-PAGED ~1.5×).
+    pub fn resolve_use_block_paged_default(
+        explicit: Option<bool>,
+        is_quantized: bool,
+    ) -> Option<bool> {
+        match explicit {
+            Some(_) => explicit,
+            None if is_quantized => Some(false),
+            None => None,
+        }
+    }
+
     /// Whether the layer at `idx` uses a sparse MoE feed-forward block.
     ///
     /// MoE layers are those at or after `num_dense_layers` in a MoE
@@ -431,6 +459,58 @@ mod tests {
         assert!(cfg.is_moe_layer(3));
     }
 
+    /// Full policy matrix for `resolve_use_block_paged_default` (broader than
+    /// the test name suggests): quantized + unset -> flat; bf16 + unset -> None
+    /// (paged via `Lfm2Inner::new`'s unwrap_or(true)); explicit Some(_) wins.
+    #[test]
+    fn test_resolve_use_block_paged_default_quantized_none_goes_flat() {
+        // quantized + unset -> flat eager decode.
+        assert_eq!(
+            Lfm2Config::resolve_use_block_paged_default(None, true),
+            Some(false),
+            "quantized checkpoint with use_block_paged_cache unset must default to flat"
+        );
+        // bf16 + unset -> left None so the downstream unwrap_or(true) keeps paged.
+        assert_eq!(
+            Lfm2Config::resolve_use_block_paged_default(None, false),
+            None,
+            "bf16 checkpoint with use_block_paged_cache unset must stay None (paged via unwrap_or)"
+        );
+        // Explicit paged honored even on a quantized checkpoint.
+        assert_eq!(
+            Lfm2Config::resolve_use_block_paged_default(Some(true), true),
+            Some(true),
+            "explicit use_block_paged_cache:true must win on quantized"
+        );
+        assert_eq!(
+            Lfm2Config::resolve_use_block_paged_default(Some(true), false),
+            Some(true),
+            "explicit use_block_paged_cache:true must win on bf16"
+        );
+        // Explicit flat honored on both quantized and bf16 checkpoints.
+        assert_eq!(
+            Lfm2Config::resolve_use_block_paged_default(Some(false), true),
+            Some(false),
+            "explicit use_block_paged_cache:false must win on quantized"
+        );
+        assert_eq!(
+            Lfm2Config::resolve_use_block_paged_default(Some(false), false),
+            Some(false),
+            "explicit use_block_paged_cache:false must win on bf16"
+        );
+
+        // Mirror the downstream `unwrap_or(true)` in Lfm2Inner::new end to end:
+        // bf16/None -> true (paged); quantized/None -> Some(false) -> false (flat).
+        assert!(
+            Lfm2Config::resolve_use_block_paged_default(None, false).unwrap_or(true),
+            "bf16 None must resolve to paged (true) at the unwrap_or(true) site"
+        );
+        assert!(
+            !Lfm2Config::resolve_use_block_paged_default(None, true).unwrap_or(true),
+            "quantized None must resolve to flat (false) at the unwrap_or(true) site"
+        );
+    }
+
     /// `norm_topk_prob` / `use_expert_bias` round-trip false through serde.
     #[test]
     fn test_moe_bool_flags_round_trip_false() {