LLM support: improve VGF export and calibration pipeline (pytorch#19157)

xingguo01 · zingo · web-flow · commit f6be9851aa90 · 2026-05-29T10:01:03.000+01:00
This is stacked on top of pytorch#19029.

- Make non-KV-cache example inputs match the static export window
- Fix PT2E calibration flow for padded prefixes and optional LM-Eval tasks
- Update SmolLM2 export settings used by the VGF PT2E workflow
- Fix rope_theta in 135M_config.json to align with the Hugging Face model config

cc @digantdesai @freddan80 @per @zingo @oscarandersson8218 @mansnils @Sebastian-Larsson @robell @rascani

Signed-off-by: Xingguo Li <xingguo.li@arm.com>
Co-authored-by: Zingo Andersen <zingo.andersen@arm.com>
diff --git a/examples/models/llama/eval_llama_lib.py b/examples/models/llama/eval_llama_lib.py
@@ -1,5 +1,6 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
+# Copyright 2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -46,9 +47,13 @@ def __init__(
         use_kv_cache: bool = False,
         generate_full_logits: bool = False,
         enable_dynamic_shape: bool = True,
+        device: Optional[str] = None,
     ):
         super().__init__(
-            model=model, tokenizer=tokenizer, max_seq_length=max_seq_length
+            model=model,
+            tokenizer=tokenizer,
+            max_seq_length=max_seq_length,
+            device=device,
         )
         self._model = model.to(self.device)
         self._use_kv_cache = use_kv_cache
@@ -57,30 +62,70 @@ def __init__(
 
     def _model_call(self, inps):
         if self._use_kv_cache:
-            if not self._enable_dynamic_shape:
-                # graph module exported without dynamic shape won't work with a different shape.
-                # And we have to do single token prefill here.
-                result_logits = []
-                for pos in range(inps.shape[-1]):
-                    pos_tensor = torch.tensor([pos], dtype=torch.int64)
-                    logits = self._model(
-                        inps[:, pos : pos + 1], {"input_pos": pos_tensor}
-                    )
-                    result_logits.append(logits)
-                if self._generate_full_logits:
-                    return torch.cat(result_logits, dim=1)
-                else:
-                    return torch.stack(result_logits, dim=1)
-            else:
-                pos_tensor = torch.tensor([0], dtype=torch.int64, device=self.device)
-                # Batch process the whole sequence.
-                logits = self._model(
-                    inps[:, : self._max_seq_length], {"input_pos": pos_tensor}
-                )
-                return logits
+            return self._model_call_kv_cache(inps)
+        return self._model_call_no_kv_cache(inps)
 
-        else:
-            return self._model(inps)
+    def _model_call_kv_cache(self, inps):
+        """Return logits for `inps` from a model exported with a KV cache."""
+        if self._enable_dynamic_shape:
+            pos_tensor = torch.tensor([0], dtype=torch.int64, device=self.device)
+            window = inps[:, : self._max_seq_length]
+            return self._model(window, {"input_pos": pos_tensor})
+
+        # A graph exported without dynamic shape only accepts the input shape
+        # it was exported with, so prefill one token at a time instead.
+        result_logits = []
+        for pos in range(inps.shape[-1]):
+            pos_tensor = torch.tensor([pos], dtype=torch.int64, device=self.device)
+            logits = self._model(inps[:, pos : pos + 1], {"input_pos": pos_tensor})
+            result_logits.append(logits)
+        if self._generate_full_logits:
+            return torch.cat(result_logits, dim=1)
+        return torch.stack(result_logits, dim=1)
+
+    def _model_call_no_kv_cache(self, inps):
+        """Return [batch, seq, vocab] logits from a model without a KV cache."""
+        if self._generate_full_logits:
+            # The model output is returned as-is for the (padded) input.
+            return self._model(self._pad_to_max_len(inps))
+        if not self._enable_dynamic_shape:
+            raise ValueError(
+                "Static non-KV lm-eval requires generate_full_logits=True "
+                "so logits can be read from the last non-pad token."
+            )
+
+        # lm-eval expects logits shaped [batch, seq, vocab], but with
+        # generate_full_logits=False some exported graphs return only the
+        # last-position logits [batch, vocab]. Rebuild the per-position logits
+        # by calling the model once per prefix.
+        per_position = []
+        num_prefixes = min(inps.shape[-1], self._max_seq_length)
+        for end in range(1, num_prefixes + 1):
+            step_logits = self._model(self._pad_to_max_len(inps[:, :end]))
+            if step_logits.dim() == 3:
+                step_logits = step_logits[:, -1, :]
+            per_position.append(step_logits)
+
+        return torch.stack(per_position, dim=1)
+
+    def _pad_to_max_len(self, tokens: torch.Tensor) -> torch.Tensor:
+        """Truncate or right-pad `tokens` to max_seq_length for static graphs."""
+        if self._enable_dynamic_shape:
+            return tokens
+        pad_len = self._max_seq_length - tokens.shape[-1]
+        if pad_len <= 0:
+            return tokens[:, : self._max_seq_length]
+        # A missing or negative pad_id is not a valid embedding index; use eos_id.
+        pad_token = getattr(self._tokenizer, "pad_id", None)
+        if pad_token is None or pad_token < 0:
+            pad_token = self._tokenizer.eos_id
+        pad = torch.full(
+            (tokens.shape[0], pad_len),
+            pad_token,
+            dtype=tokens.dtype,
+            device=tokens.device,
+        )
+        return torch.cat((tokens, pad), dim=-1)
 
     def _model_generate(self, context, max_length, eos_token_id):
         raise Exception("unimplemented")
@@ -219,6 +264,7 @@ def gen_eval_wrapper(
             tokenizer=tokenizer,
             max_seq_length=llm_config.export.max_seq_length,
             use_kv_cache=llm_config.model.use_kv_cache,
+            generate_full_logits=llm_config.debug.generate_full_logits,
             enable_dynamic_shape=llm_config.model.enable_dynamic_shape,
         )
     else:
diff --git a/examples/models/llama/evaluate/eager_eval.py b/examples/models/llama/evaluate/eager_eval.py
@@ -1,5 +1,6 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
+# Copyright 2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -28,12 +29,13 @@ def __init__(
         tokenizer: Union[SentencePieceTokenizer, Tiktoken, HuggingFaceTokenizer],
         max_seq_length: Optional[int] = None,
         use_kv_cache: bool = False,
+        device: Optional[str] = None,
     ):
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-        super().__init__(device=device, pretrained="gpt2")
+        resolved_device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+        super().__init__(device=resolved_device, pretrained="gpt2")
         self._model = model
         self._tokenizer = tokenizer
-        self._device = torch.device(device)
+        self._device = torch.device(resolved_device)
         self._max_seq_length = 2048 if max_seq_length is None else max_seq_length
         self._use_kv_cache = use_kv_cache
 
diff --git a/examples/models/llama/model.py b/examples/models/llama/model.py
@@ -1,5 +1,6 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
+# Copyright 2026 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -285,11 +286,25 @@ def get_example_inputs(self):
         if self.use_kv_cache:
             return self.get_example_inputs_kvcache_sdpa()
         else:
-            return (
-                torch.tensor(
-                    [[1, 2, 3]], dtype=torch.long
-                ),  # tokens, with kv cache our input token length is always just 1 token.
+            max_seq_len = getattr(self.llm_config.export, "max_seq_length", 3)
+            # Preserve the historical three-token example input as the minimum.
+            max_seq_len = max(3, int(max_seq_len))
+            max_len = max_seq_len - 1 if self.enable_dynamic_shape else max_seq_len
+            backend = self.llm_config.backend
+            token_dtype = (
+                torch.int32
+                if (
+                    backend.ethosu.enabled
+                    or backend.tosa.enabled
+                    or backend.vgf.enabled
+                )
+                else torch.long
             )
+            example_tokens = torch.arange(max_len, dtype=token_dtype).unsqueeze(0)
+            vocab_size = int(getattr(self.model_.params, "vocab_size", 0))
+            if vocab_size > 1:
+                example_tokens = example_tokens % (vocab_size - 1) + 1
+            return (example_tokens,)
 
     # assumption is the custom op doesnt support dynamic shape right now. It might but its untested so lets first get static shape working
     def get_example_inputs_kvcache_sdpa(self):
diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py
@@ -256,6 +256,35 @@ def run_canonical_optimizations(self):
             assert res.graph_module is not None, "Pass returned None"
             self.pre_autograd_graph_module = res.graph_module
 
+    def _check_calibration_prefix_options(self) -> None:
+        """Reject static non-KV calibration without generate_full_logits."""
+        # Padded prefixes put the last real token somewhere other than the
+        # final position, so last-position-only logits cannot be used.
+        needs_full_logits = not (self.use_kv_cache or self.enable_dynamic_shape)
+        if needs_full_logits and not self.generate_full_logits:
+            raise ValueError(
+                "Static non-KV calibration with padded prefixes requires "
+                "generate_full_logits so calibration can sample the last "
+                "non-pad token position."
+            )
+
+    def _prepare_calibration_prefix(
+        self, token_list: List[int], pos: int, max_len: int, pad_token: int
+    ) -> Tuple[torch.Tensor, int]:
+        """Build the [1, n] calibration input for token_list[: pos + 1]."""
+        window = list(token_list[: pos + 1])
+        last_real_index = min(len(window), max_len) - 1
+
+        # Cap at max_len; a static-shape graph is also right-padded up to it.
+        missing = max_len - len(window)
+        if missing > 0 and not self.enable_dynamic_shape:
+            window.extend([pad_token] * missing)
+        else:
+            window = window[:max_len]
+
+        token_dtype = self.example_inputs[0].dtype
+        return torch.tensor(window, dtype=token_dtype).unsqueeze(0), last_real_index
+
     def pt2e_calibrate(
         self,
         prepared_module,
@@ -266,39 +295,41 @@ def pt2e_calibrate(
         tokenizer_path,
     ):
         logging.info("Run calibration...")
-        try:
-            from executorch.examples.models.llama.eval_llama_lib import (
-                GraphModuleEvalWrapper,
-            )
-            from lm_eval.evaluator import simple_evaluate
-        except ImportError:
-            raise ImportError(
-                "Please install the llm eval dependency via examples/models/llama/install_requirements.sh"
-            )
-
+        self._check_calibration_prefix_options()
         tokenizer = get_tokenizer(tokenizer_path)
 
         def calibrate_template(
             module: torch.fx.GraphModule, tokenizer, prompts: str, max_len: int
         ):
             # TODO: change criteria & support batch inputs if necessary
-            pos = torch.tensor(0, dtype=torch.int64)
+            pos = 0
             token_list = tokenizer.encode(prompts, bos=True, eos=False)
 
+            pad_token = getattr(tokenizer, "pad_id", tokenizer.eos_id)
+
             with torch.no_grad():
                 while token_list[-1] != tokenizer.eos_id and pos < max_len:
-                    logits = module(
-                        torch.full((1, 1), token_list[pos]),
-                        {"input_pos": torch.tensor((pos,))},
-                    )
+                    logits_token_pos = -1
+                    if self.use_kv_cache:
+                        logits = module(
+                            torch.full((1, 1), token_list[pos]),
+                            {"input_pos": torch.tensor((pos,))},
+                        )
+                    else:
+                        prefix, logits_token_pos = self._prepare_calibration_prefix(
+                            token_list, pos, max_len, pad_token
+                        )
+                        logits = module(prefix)
+
                     pos += 1
                     if pos >= len(token_list):
                         if self.generate_full_logits:
-                            token_list.append(
-                                torch.argmax(logits[:, -1], dim=-1).item()
-                            )
+                            next_token = torch.argmax(
+                                logits[:, logits_token_pos], dim=-1
+                            ).item()
                         else:
-                            token_list.append(torch.argmax(logits[:], dim=-1).item())
+                            next_token = torch.argmax(logits[:], dim=-1).item()
+                        token_list.append(next_token)
 
         calibrate_template(
             module=prepared_module,
@@ -307,26 +338,41 @@ def calibrate_template(
             max_len=calibration_seq_length,
         )
 
-        eval_wrapper = GraphModuleEvalWrapper(
-            model=prepared_module,
-            tokenizer=tokenizer,
-            max_seq_length=calibration_seq_length,
-            use_kv_cache=self.use_kv_cache,
-            generate_full_logits=self.generate_full_logits,
-            enable_dynamic_shape=self.enable_dynamic_shape,
-        )
+        if calibration_tasks:
+            try:
+                from executorch.examples.models.llama.eval_llama_lib import (
+                    GraphModuleEvalWrapper,
+                )
+                from lm_eval.evaluator import simple_evaluate
+            except ImportError:
+                raise ImportError(
+                    "Please install the llm eval dependency via examples/models/llama/install_requirements.sh"
+                )
 
-        # Evaluate the model
-        with torch.no_grad():
-            eval_results = simple_evaluate(
-                model=eval_wrapper,
-                tasks=calibration_tasks,
-                limit=calibration_limit,
+            eval_wrapper = GraphModuleEvalWrapper(
+                model=prepared_module,
+                tokenizer=tokenizer,
+                max_seq_length=calibration_seq_length,
+                use_kv_cache=self.use_kv_cache,
+                generate_full_logits=self.generate_full_logits,
+                enable_dynamic_shape=self.enable_dynamic_shape,
+                # The exported graph can contain ops like aten.full.default
+                # without explicit device, which default to CPU and can
+                # trigger device-mismatch errors when lm_eval runs on CUDA.
+                # Calibrate on CPU for stability.
+                device="cpu",
             )
 
-        for task, res in eval_results["results"].items():
-            print(f"{task}: {res}")
-        logging.info("Calibration finish...")
+            with torch.no_grad():
+                eval_results = simple_evaluate(
+                    model=eval_wrapper,
+                    tasks=calibration_tasks,
+                    limit=calibration_limit,
+                )
+
+            for task, res in eval_results["results"].items():
+                print(f"{task}: {res}")
+            logging.info("Calibration finish...")
 
     def pt2e_quantize(self, quantizers: Optional[List[Quantizer]]) -> "LLMEdgeManager":
         """
@@ -351,18 +397,19 @@ def pt2e_quantize(self, quantizers: Optional[List[Quantizer]]) -> "LLMEdgeManage
                 assert (
                     self.pre_autograd_graph_module is not None
                 ), "Please run export() first"
+                if self.calibration_tasks and self.calibration_limit is None:
+                    logging.warning(
+                        "calibration_tasks provided without calibration_limit; "
+                        "lm-eval will run the full task dataset during "
+                        "calibration."
+                    )
                 m = prepare_pt2e(
                     self.pre_autograd_graph_module,  # pyre-ignore[6]
                     composed_quantizer,
                 )
-                logging.info(
-                    f"Calibrating with tasks: {self.calibration_tasks}, limit: {self.calibration_limit}, calibration_data: {self.calibration_data}, tokenizer_path: {self.tokenizer_path}, seq_length: {self.calibration_seq_length}"
-                )
                 # Calibrate
                 if (
-                    self.calibration_tasks is not None
-                    and self.calibration_limit is not None
-                    and self.calibration_seq_length is not None
+                    self.calibration_seq_length is not None
                     and self.calibration_data is not None
                     and self.tokenizer_path is not None
                 ):