Integrate LLM metadata NamedData into export/runner pipeline

kirklandsign · facebook-github-bot · commit 417218c95c34 · 2026-05-08T15:38:16.000-07:00
Summary:
This integrates the metadata storage POC (D104322796) into the real LLM
export and runner pipeline. Metadata values (max_seq_len, bos_id, eos_ids,
etc.) are now dual-written: both as constant_methods (for backward
compatibility) and as NamedData entries (for efficient C++ runtime access).

On the export side:
- Added metadata.py with helpers to encode/decode metadata values
  into the PTE file's NamedData section.
- Modified builder.py to call add_metadata() after edge creation
  in both export_to_edge() and to_edge_transform_and_lower().

On the runner side:
- Added metadata.h with typed accessors (get_int, get_string,
  get_int_list) for reading metadata from NamedDataMap.
- Modified llm_runner_helper.cpp so get_llm_metadata() and
  get_eos_ids() try NamedData first, falling back to constant_methods
  for old PTE files.

Key design decisions:
- Backward compatible: constant_methods are NOT removed
- Dual-write on export, prefer-NamedData on read with fallback
- Failure to write NamedData is non-fatal (logged warning)
- NamedData keys use a dotted namespace, e.g. metadata.context.max_seq_len

Differential Revision: D104471143
diff --git a/extension/llm/export/BUCK b/extension/llm/export/BUCK
@@ -55,6 +55,7 @@ fbcode_target(_kind = runtime.python_library,
         "//executorch/extension/export_util:export_util",
         "//executorch/extension/llm/custom_ops:custom_ops_aot_lib",
         "//executorch/extension/llm/custom_ops:custom_ops_aot_py",
+        "//executorch/extension/llm/export:metadata",
         "//pytorch/tokenizers/pytorch_tokenizers:tokenizers",
     ],
 )
@@ -108,3 +109,15 @@ fbcode_target(_kind = runtime.python_test,
         ":export_lib",
     ],
 )
+
+fbcode_target(_kind = runtime.python_library,
+    name = "metadata",  # NamedData metadata helpers (metadata.py)
+    srcs = [
+        "metadata.py",
+    ],
+    base_module = "executorch.extension.llm.export",
+    visibility = ["PUBLIC"],
+    deps = [
+        "//executorch/exir:lib",
+    ],
+)
diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py
@@ -34,6 +34,7 @@
 from executorch.extension.export_util.utils import export_to_edge, save_pte_program
 
 from executorch.extension.llm.export.export_passes import RemoveRedundantTransposes
+from executorch.extension.llm.export.metadata import add_metadata
 from pytorch_tokenizers import get_tokenizer
 from torch.export import export, ExportedProgram
 from torch.nn.attention import SDPBackend
@@ -71,6 +72,18 @@ def from_torch_dtype(dtype: torch.dtype):
         return mapping[dtype]
 
 
+_CONSTANT_METHOD_TO_NAMED_DATA = {  # constant_method name -> NamedData key
+    "get_bos_id": "tokenizer.bos_id",
+    "get_eos_ids": "tokenizer.eos_ids",
+    "get_max_seq_len": "context.max_seq_len",
+    "get_max_context_len": "context.max_context_len",
+    "get_vocab_size": "model.vocab_size",
+    "use_kv_cache": "model.use_kv_cache",
+    "use_sdpa_with_kv_cache": "model.use_sdpa_with_kv_cache",
+    "enable_dynamic_shape": "model.enable_dynamic_shape",
+}
+
+
 class LLMEdgeManager:
     """
     Host a torch.nn.Module for LLM model and facilitates exporting to ExecuTorch.
@@ -393,6 +406,28 @@ def pt2e_quantize(self, quantizers: Optional[List[Quantizer]]) -> "LLMEdgeManage
             logging.info("No quantizer provided, passing...")
             return self
 
+    def _write_metadata_to_named_data(self):
+        """Mirror self.metadata into the edge manager's NamedData store.
+
+        Keys are renamed via _CONSTANT_METHOD_TO_NAMED_DATA (unmapped keys are
+        kept). Best-effort: failures are logged and the export continues.
+        """
+        # Skip None/empty metadata too, so .items() cannot raise outside the try.
+        if self.edge_manager is None or not self.metadata:
+            return
+        named_data = {
+            _CONSTANT_METHOD_TO_NAMED_DATA.get(key, key): value
+            for key, value in self.metadata.items()
+        }
+        try:
+            add_metadata(self.edge_manager, named_data)
+        except Exception:
+            logging.warning(
+                "Failed to write metadata to NamedData, "
+                "falling back to constant_methods only",
+                exc_info=True,  # keep the cause visible; the export continues
+            )
+
     def export_to_edge(self) -> "LLMEdgeManager":
         """
         Export the model to Edge dialect and retrieve a LLMEdgeManager.
@@ -418,6 +453,7 @@ def export_to_edge(self) -> "LLMEdgeManager":
                     edge_compile_config=edge_config,
                     verbose=self.verbose,
                 )
+        self._write_metadata_to_named_data()
         return self
 
     def to_backend(self, partitioners: Optional[List[Partitioner]]) -> "LLMEdgeManager":
@@ -470,6 +506,7 @@ def to_edge_transform_and_lower(
             constant_methods=self.metadata,
             generate_etrecord=self.generate_etrecord,
         )
+        self._write_metadata_to_named_data()
         if self.verbose:
             logging.info(f"Exported graph:\n{self.edge_manager.exported_program()}")
         return self
diff --git a/extension/llm/export/metadata.py b/extension/llm/export/metadata.py
@@ -0,0 +1,100 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Model metadata storage for PTE files.
+
+Embeds model metadata (tokenizer config, chat templates, architecture info)
+directly in PTE files via the NamedData mechanism. A lighter-weight
+alternative to constant_methods (which create a full ExecutionPlan entry
+per constant value); the LLM export pipeline currently writes both.
+
+Keys use a dotted namespace.field convention:
+    tokenizer.bos_id, tokenizer.eos_ids, context.max_seq_len, etc.
+"""
+
+import struct
+from typing import Dict, List, Sequence, Union
+
+METADATA_PREFIX = "metadata."
+
+MetadataValue = Union[str, int, float, bytes, Sequence[int]]
+
+
+def _encode_value(key: str, value: MetadataValue) -> bytes:
+    """Pack one value: str as UTF-8, int as "<q", float as "<d", bytes as-is,
+    list/tuple as a "<I" count followed by one "<q" per item."""
+    if isinstance(value, str):
+        return value.encode("utf-8")
+    if isinstance(value, (list, tuple)):
+        return struct.pack(f"<I{len(value)}q", len(value), *value)
+    if isinstance(value, (int, float)):
+        return struct.pack("<q" if isinstance(value, int) else "<d", value)
+    if isinstance(value, bytes):
+        return value
+    raise TypeError(f"Unsupported metadata type {type(value)} for key \'{key}\'")
+
+
+def add_metadata(
+    edge_manager,  # EdgeProgramManager
+    metadata: Dict[str, MetadataValue],
+) -> None:
+    """Add metadata KV pairs to a PTE file during export.
+
+    Call BEFORE edge_manager.to_executorch(). Every value is encoded before any
+    is stored, so a value that fails to encode leaves the store untouched.
+
+    Args:
+        edge_manager: The EdgeProgramManager from to_edge() or
+            to_edge_transform_and_lower().
+        metadata: Dict mapping string keys to values (str, int, float, bytes,
+            or a list/tuple of ints). Keys are automatically prefixed with
+            "metadata." to avoid collision with backend named data.
+    """
+    encoded = {key: _encode_value(key, value) for key, value in metadata.items()}
+    for key, data in encoded.items():
+        full_key = f"{METADATA_PREFIX}{key}"
+        edge_manager._named_data_store.add_named_data(key=full_key, data=data)
+
+
+def read_metadata(pte_path: str) -> Dict[str, bytes]:
+    """Read all metadata entries from a PTE file.
+
+    Returns raw bytes for each key (without the "metadata." prefix); decode
+    them with get_string/get_int/get_float/get_int_list.
+    """
+    from executorch.exir._serialize._program import deserialize_pte_binary
+
+    with open(pte_path, "rb") as pte_stream:
+        raw = pte_stream.read()
+
+    named_data = deserialize_pte_binary(raw).named_data
+    if named_data is None:
+        return {}
+
+    # Keep only this module's namespace; the prefix is stripped from each key.
+    return {
+        full_key[len(METADATA_PREFIX) :]: named_data.buffers[entry.buffer_index]
+        for full_key, entry in named_data.pte_data.items()
+        if full_key.startswith(METADATA_PREFIX)
+    }
+
+
+def get_string(metadata: Dict[str, bytes], key: str) -> str:  # UTF-8 text value
+    return metadata[key].decode("utf-8")
+
+
+def get_int(metadata: Dict[str, bytes], key: str) -> int:  # little-endian int64
+    return struct.unpack("<q", metadata[key])[0]
+
+
+def get_float(metadata: Dict[str, bytes], key: str) -> float:  # little-endian double
+    return struct.unpack("<d", metadata[key])[0]
+
+
+def get_int_list(metadata: Dict[str, bytes], key: str) -> List[int]:
+    """Decode a "<I" element count followed by that many "<q" integers."""
+    (length,) = struct.unpack_from("<I", metadata[key])
+    return list(struct.unpack_from(f"<{length}q", metadata[key], 4))
diff --git a/extension/llm/runner/llm_runner_helper.cpp b/extension/llm/runner/llm_runner_helper.cpp
@@ -10,6 +10,7 @@
 
 #include <executorch/extension/llm/runner/image_prefiller.h>
 #include <executorch/extension/llm/runner/llm_runner_helper.h>
+#include <executorch/extension/llm/runner/metadata.h>
 #include <executorch/extension/llm/runner/multimodal_decoder_runner.h>
 #include <executorch/extension/llm/runner/multimodal_prefiller.h>
 #include <executorch/extension/llm/runner/multimodal_runner.h>
@@ -99,7 +100,84 @@ get_llm_metadata(tokenizers::Tokenizer* tokenizer, Module* module) {
       {llm::kUseSDPAWithKVCache, false},
   });
 
-  // Read metadata from the model
+  // Try reading from NamedDataMap first (new format)
+  auto program = module->program();
+  if (program) {
+    auto ndm_result = program->get_named_data_map();
+    if (ndm_result.ok() && ndm_result.get() != nullptr) {
+      const auto* named_data_map = ndm_result.get();
+
+      // Map from runtime keys to NamedData keys
+      struct KeyMapping {
+        const char* runtime_key;
+        const char* named_data_key;
+      };
+      static const KeyMapping mappings[] = {
+          {llm::kMaxSeqLen, metadata::kMaxSeqLen},
+          {llm::kMaxContextLen, metadata::kMaxContextLen},
+          {llm::kUseKVCache, metadata::kUseKVCache},
+          {llm::kEnableDynamicShape, metadata::kEnableDynamicShape},
+          {llm::kUseSDPAWithKVCache, metadata::kUseSDPAWithKVCache},
+      };
+
+      // Check if kMaxSeqLen exists in NamedData (required key)
+      auto max_seq_result =
+          metadata::get_int(*named_data_map, metadata::kMaxSeqLen);
+      if (max_seq_result.ok()) {
+        ET_LOG(Info, "Reading metadata from NamedData");
+
+        for (const auto& mapping : mappings) {
+          auto val =
+              metadata::get_int(*named_data_map, mapping.named_data_key);
+          if (val.ok()) {
+            metadata[mapping.runtime_key] = val.get();
+            ET_LOG(
+                Info,
+                "NamedData: %s = %" PRId64,
+                mapping.runtime_key,
+                val.get());
+          }
+        }
+
+        // Read bos_id from NamedData
+        auto bos_result =
+            metadata::get_int(*named_data_map, metadata::kBosId);
+        if (bos_result.ok()) {
+          metadata[llm::kBosId] = bos_result.get();
+        } else {
+          metadata[llm::kBosId] = tokenizer->bos_tok();
+        }
+
+        // Read vocab_size from NamedData
+        auto vocab_result =
+            metadata::get_int(*named_data_map, metadata::kVocabSize);
+        if (vocab_result.ok()) {
+          metadata[llm::kVocabSize] = vocab_result.get();
+        } else {
+          metadata[llm::kVocabSize] = tokenizer->vocab_size();
+        }
+
+        // Handle kMaxContextLen default: if not explicitly set,
+        // default to kMaxSeqLen
+        if (metadata.find(llm::kMaxContextLen) == metadata.end() ||
+            metadata[llm::kMaxContextLen] == 128) {
+          auto ctx_result =
+              metadata::get_int(*named_data_map, metadata::kMaxContextLen);
+          if (!ctx_result.ok()) {
+            metadata[llm::kMaxContextLen] = metadata[llm::kMaxSeqLen];
+          }
+        }
+
+        for (auto& pair : metadata) {
+          ET_LOG(
+              Info, "Metadata: %s = %" PRId64, pair.first.c_str(), pair.second);
+        }
+        return metadata;
+      }
+    }
+  }
+
+  // Fallback: Read metadata from constant_methods (legacy format)
   auto method_names_result = module->method_names();
   if (method_names_result.error() != Error::Ok) {
     ET_LOG(Error, "Failed reading method names");
@@ -158,7 +236,26 @@ std::unordered_set<uint64_t> get_eos_ids(
     tokenizers::Tokenizer* tokenizer,
     Module* module) {
   std::unordered_set<uint64_t> eos_ids = {tokenizer->eos_tok()};
-  // Get EOS IDs if available
+
+  // Try NamedData first (new format)
+  auto program = module->program();
+  if (program) {
+    auto ndm_result = program->get_named_data_map();
+    if (ndm_result.ok() && ndm_result.get() != nullptr) {
+      auto eos_result =
+          metadata::get_int_list(*ndm_result.get(), metadata::kEosIds);
+      if (eos_result.ok()) {
+        eos_ids.clear();
+        for (auto id : eos_result.get()) {
+          eos_ids.emplace(static_cast<uint64_t>(id));
+          ET_LOG(Info, "NamedData eos_id = %" PRId64, id);
+        }
+        return eos_ids;
+      }
+    }
+  }
+
+  // Fallback: Get EOS IDs from constant_methods (legacy format)
   auto method_names_result = module->method_names();
   if (method_names_result.error() != Error::Ok) {
     ET_LOG(Error, "Failed reading method names");
diff --git a/extension/llm/runner/metadata.h b/extension/llm/runner/metadata.h
diff --git a/extension/llm/runner/targets.bzl b/extension/llm/runner/targets.bzl