fix(lfm2_5_vl): fp16 vision encoder for --quantize/--dtype fp16, fix KV cache dtype mismatch

NorbertKlockiewicz · claude · NorbertKlockiewicz · commit 3fef2085baf0 · 2026-03-04T09:54:28.000+01:00
- Cast only vision_tower + multi_modal_projector to fp16 (not text_model), so the
  KV cache buffers keep the text decoder's dtype; this prevents the update_cache
  dtype assertion when --quantize is used with an fp32 text decoder
- Replace deprecated HintBasedSymShapeEvalPass with ConstraintBasedSymShapeEvalPass

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
diff --git a/examples/models/lfm2_5_vl/export_lfm2_5_vl.py b/examples/models/lfm2_5_vl/export_lfm2_5_vl.py
@@ -9,7 +9,7 @@
 generic MultimodalRunner (C++ llava_main).
 
 Methods:
-  vision_encoder  : [1, 3, 512, 512] f32 NCHW pixels [0,255] -> [1, 256, 2048] f32
+  vision_encoder  : [1, 3, 512, 512] f32 NCHW pixels [0,255] -> [1, 256, 2048] f32/f16
   token_embedding : [1, seq_len] i64                          -> [1, seq_len, 2048] f32
   text_decoder    : ([1, seq_len, 2048] f32, [seq_len] i64)   -> [1, 65536] f32
 
@@ -53,10 +53,7 @@
 )
 from executorch.exir.passes import MemoryPlanningPass
 from executorch.exir.passes.quant_fusion_pass import QuantFusionPass
-from executorch.exir.passes.sym_shape_eval_pass import (
-    ConstraintBasedSymShapeEvalPass,
-    HintBasedSymShapeEvalPass,
-)
+from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass
 from executorch.extension.llm.export.builder import DType, LLMEdgeManager
 from executorch.extension.llm.export.config.llm_config import LlmConfig
 from torch.export import Dim
@@ -87,14 +84,18 @@ def export(self) -> "Lfm2p5VlEdgeManager":
         return self
 
 
-def export_image_encoder(lfm2) -> torch.export.ExportedProgram:
+def export_image_encoder(
+    lfm2, dtype: DType = DType.fp32
+) -> torch.export.ExportedProgram:
     """Export vision encoder as 'vision_encoder' method.
 
     Input:  [1, 3, 512, 512] float32 NCHW pixels in [0, 255]
-    Output: [1, 256, 2048]   float32 image embeddings
+    Output: [1, 256, 2048]   f32/f16 image embeddings
 
     Normalize + patch extraction are baked in so the C++ runner only
     needs to resize to 512x512 and pass the raw pixel buffer.
+    Weights are cast to dtype (fp16 halves the ~1.6 GB vision encoder).
+    The input pixel tensor always stays fp32.
     """
 
     class ImageEncoder(torch.nn.Module):
@@ -106,11 +107,16 @@ def forward(self, images: torch.Tensor) -> torch.Tensor:
             return self.lfm2.image_embedding(images)
 
     encoder = ImageEncoder(lfm2)
+    if dtype != DType.fp32:
+        # Cast only the vision parts of the HF model, not text_model (KV cache buffers
+        # must stay in the text decoder's dtype, not the vision encoder's dtype).
+        lfm2.model_.model.vision_tower.to(dtype.to_torch_dtype())
+        lfm2.model_.model.multi_modal_projector.to(dtype.to_torch_dtype())
     example_pixels = torch.randint(
         0, 256, (1, 3, IMAGE_SIZE, IMAGE_SIZE), dtype=torch.float32
     )
 
-    logging.info("Exporting vision encoder...")
+    logging.info(f"Exporting vision encoder ({dtype.name})...")
     with torch.no_grad():
         ep = torch.export.export(encoder, (example_pixels,), strict=False)
     return ep
@@ -251,8 +257,10 @@ def export_all(
     if dtype != DType.fp32:
         lfm2 = lfm2.to(dtype.to_torch_dtype())
 
+    # Vision encoder: use fp16 when quantizing (halves ~1.6 GB SigLIP2) or when dtype=fp16
+    vision_dtype = DType.fp16 if (quantize or dtype == DType.fp16) else DType.fp32
     logging.info("[1/3] Exporting vision encoder...")
-    vision_ep = export_image_encoder(lfm2)
+    vision_ep = export_image_encoder(lfm2, vision_dtype)
 
     # Text decoder MUST come before token embedding (see export_token_embedding docstring)
     logging.info("[2/3] Exporting text decoder...")
@@ -304,7 +312,7 @@ def export_all(
             memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False),
             sym_shape_eval_pass={
                 "vision_encoder": ConstraintBasedSymShapeEvalPass(),
-                "token_embedding": HintBasedSymShapeEvalPass(),
+                "token_embedding": ConstraintBasedSymShapeEvalPass(),
                 "text_decoder": ConstraintBasedSymShapeEvalPass(),
             },
         )