Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion examples/models/lfm2/short_conv.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,14 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
with torch.no_grad():
self.conv_state.copy_(new_conv_state)

conv_out = self.conv(Bx)[..., : x.size(-1)] # (batch_size, dim, seq_len)
# Manual depthwise conv: Triton has no template for nn.Conv1d with
# groups=dim and dynamic seq_len. kernel_size is always 3.
w = self.conv.weight[:, 0, :] # (dim, 3)
conv_out = (
Bx[..., :-2] * w[:, 0:1]
+ Bx[..., 1:-1] * w[:, 1:2]
+ Bx[..., 2:] * w[:, 2:3]
) # (batch_size, dim, seq_len)
y = C * conv_out # (batch_size, dim, seq_len)

y = y.transpose(-1, -2) # (batch_size, seq_len, dim)
Expand Down
13 changes: 13 additions & 0 deletions examples/models/lfm2_5_vl/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from executorch.examples.models.lfm2_5_vl.convert_weights import convert_weights
from executorch.examples.models.lfm2_5_vl.model import Lfm2p5VlModel

# Public API of the lfm2_5_vl example package: the HF->ET weight converter
# and the model wrapper re-exported above.
__all__ = [
    "convert_weights",
    "Lfm2p5VlModel",
]
33 changes: 33 additions & 0 deletions examples/models/lfm2_5_vl/config/lfm2_5_vl_1_6b_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
{
"dim": 2048,
"ffn_dim_multiplier": 1,
"hidden_dim": 8192,
"n_heads": 32,
"n_kv_heads": 8,
"n_layers": 16,
"norm_eps": 1e-5,
"rope_theta": 1000000.0,
"use_scaled_rope": false,
"vocab_size": 65536,
"use_hf_rope": true,
"use_qk_norm": true,
"qk_norm_before_rope": true,
"layer_types": [
"conv",
"conv",
"full_attention",
"conv",
"conv",
"full_attention",
"conv",
"conv",
"full_attention",
"conv",
"full_attention",
"conv",
"full_attention",
"conv",
"full_attention",
"conv"
]
}
33 changes: 33 additions & 0 deletions examples/models/lfm2_5_vl/config/lfm2_5_vl_450m_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
{
"dim": 1024,
"ffn_dim_multiplier": 1,
"hidden_dim": 4608,
"n_heads": 16,
"n_kv_heads": 8,
"n_layers": 16,
"norm_eps": 1e-5,
"rope_theta": 1000000.0,
"use_scaled_rope": false,
"vocab_size": 65536,
"use_hf_rope": true,
"use_qk_norm": true,
"qk_norm_before_rope": true,
"layer_types": [
"conv",
"conv",
"full_attention",
"conv",
"conv",
"full_attention",
"conv",
"conv",
"full_attention",
"conv",
"full_attention",
"conv",
"full_attention",
"conv",
"full_attention",
"conv"
]
}
81 changes: 81 additions & 0 deletions examples/models/lfm2_5_vl/convert_weights.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""Convert LFM2.5-VL text decoder weights from HuggingFace to ET format."""

from __future__ import annotations

import argparse
from pathlib import Path

import torch
from executorch.examples.models.checkpoint import get_mapped_key
from safetensors.torch import load_file

# Mapping from HuggingFace LFM2.5-VL parameter names to Meta/ET-style names.
# "{}" is a layer-index placeholder that get_mapped_key substitutes when
# matching keys. Keys not covered here (e.g. the fused conv in_proj weight)
# fall through to the removeprefix() fallback in lfm2_5_vl_to_meta.
_LFM2_5_VL_TO_META: dict[str, str] = {
    "model.language_model.embed_tokens.weight": "tok_embeddings.weight",
    "model.language_model.embedding_norm.weight": "norm.weight",
    "model.language_model.layers.{}.self_attn.q_proj.weight": "layers.{}.attention.wq.weight",
    "model.language_model.layers.{}.self_attn.k_proj.weight": "layers.{}.attention.wk.weight",
    "model.language_model.layers.{}.self_attn.v_proj.weight": "layers.{}.attention.wv.weight",
    "model.language_model.layers.{}.self_attn.out_proj.weight": "layers.{}.attention.wo.weight",
    "model.language_model.layers.{}.self_attn.q_layernorm.weight": "layers.{}.attention.q_norm_fn.weight",
    "model.language_model.layers.{}.self_attn.k_layernorm.weight": "layers.{}.attention.k_norm_fn.weight",
    "model.language_model.layers.{}.operator_norm.weight": "layers.{}.attention_norm.weight",
    "model.language_model.layers.{}.ffn_norm.weight": "layers.{}.ffn_norm.weight",
    "model.language_model.layers.{}.feed_forward.w1.weight": "layers.{}.feed_forward.w1.weight",
    "model.language_model.layers.{}.feed_forward.w2.weight": "layers.{}.feed_forward.w2.weight",
    "model.language_model.layers.{}.feed_forward.w3.weight": "layers.{}.feed_forward.w3.weight",
    "model.language_model.layers.{}.conv.conv.weight": "layers.{}.conv.conv.weight",
    "model.language_model.layers.{}.conv.out_proj.weight": "layers.{}.conv.out_proj.weight",
    "model.language_model.lm_head.weight": "output.weight",
}

# The HF checkpoint stores the short-conv input projection as one fused
# in_proj matrix; the converter splits it into three equal row-chunks and
# renames them in this order (must stay in sync with the row layout of the
# fused weight — assumed B, C, x; TODO confirm against the HF model code).
_IN_PROJ_SPLITS = ("B_proj", "C_proj", "x_proj")


def lfm2_5_vl_to_meta(state_dict: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
    """Extract and remap language model weights from a full VL state dict.

    Vision-tower tensors (anything outside ``model.language_model.``) are
    dropped. Keys with a known mapping are renamed via ``get_mapped_key``;
    unmapped keys keep their name minus the prefix. The fused short-conv
    ``in_proj`` weight is split into three row-chunks (B/C/x projections).
    If the checkpoint has no ``lm_head``, the output projection is tied to
    the token embedding.
    """
    prefix = "model.language_model."
    remapped: dict[str, torch.Tensor] = {}

    for hf_key, tensor in state_dict.items():
        if not hf_key.startswith(prefix):
            continue  # skip vision encoder / projector weights

        # Prefer the explicit mapping table; fall back to stripping the
        # prefix for keys it does not cover (e.g. the conv in_proj weight).
        try:
            meta_key = get_mapped_key(hf_key, _LFM2_5_VL_TO_META)
        except Exception:
            meta_key = hf_key.removeprefix(prefix)

        if not meta_key.endswith(".conv.in_proj.weight"):
            remapped[meta_key] = tensor
            continue

        # Fused in_proj: split rows into the three per-projection weights.
        chunks = torch.chunk(tensor, 3, dim=0)
        for split_name, chunk in zip(_IN_PROJ_SPLITS, chunks):
            remapped[meta_key.replace("in_proj", split_name)] = chunk

    # Tie output projection to the embedding when lm_head was absent.
    if "output.weight" not in remapped:
        remapped["output.weight"] = remapped["tok_embeddings.weight"]

    return remapped


def convert_weights(input_dir: str, output_file: str) -> None:
    """Convert an LFM2.5-VL HuggingFace checkpoint to an ET ``.pt`` file.

    Loads every ``*.safetensors`` shard found in *input_dir* and merges them
    into one state dict before remapping. HuggingFace splits large
    checkpoints into multiple shards (``model-00001-of-0000N.safetensors``),
    so loading only ``model.safetensors`` would fail for sharded releases;
    globbing handles both the single-file and the sharded layout.

    Args:
        input_dir: Directory containing the ``.safetensors`` file(s).
        output_file: Destination path for the ``torch.save`` checkpoint.

    Raises:
        FileNotFoundError: If *input_dir* contains no ``.safetensors`` files.
    """
    shard_paths = sorted(Path(input_dir).glob("*.safetensors"))
    if not shard_paths:
        raise FileNotFoundError(f"No .safetensors files found in {input_dir}")

    sd: dict[str, torch.Tensor] = {}
    for shard in shard_paths:
        sd.update(load_file(str(shard)))

    sd = lfm2_5_vl_to_meta(sd)
    torch.save(sd, output_file)
    print(f"Saved {len(sd)} tensors to {output_file}")


def main() -> None:
    """Command-line entry point: parse args and run the conversion."""
    arg_parser = argparse.ArgumentParser(
        description="Convert LFM2.5-VL weights to ET format."
    )
    arg_parser.add_argument("input_dir", help="Directory containing model.safetensors.")
    arg_parser.add_argument("output", help="Output .pt checkpoint path.")
    parsed = arg_parser.parse_args()
    convert_weights(parsed.input_dir, parsed.output)


if __name__ == "__main__":
main()
Loading
Loading