Arm backend: Enable and support KV cache on Llama (pytorch#20026)

Christoffer-JL · web-flow · commit 44a91bff8d2c · 2026-06-04T17:13:33.000+02:00
- Run Llama with the use_kv_cache option.
- Add LlamaPositionalAdapter to handle the input_pos mismatch.
- Extract USER_OUTPUT in the Arm test pipeline so that irrelevant cache data is not accidentally compared against the reference model.

cc @digantdesai @freddan80 @per @zingo @oscarandersson8218 @mansnils @Sebastian-Larsson @robell @rascani

Signed-off-by: Christoffer J.L <christoffer.johanssonlundqvist@arm.com>
diff --git a/backends/arm/test/models/test_llama.py b/backends/arm/test/models/test_llama.py
@@ -34,7 +34,7 @@
 from transformers import GenerationConfig, LlamaConfig, LlamaForCausalLM
 from transformers.integrations.executorch import TorchExportableModuleForDecoderOnlyLM
 
-input_t = Tuple[torch.Tensor]
+input_t = Tuple[torch.Tensor, ...]
 input_th = Tuple[torch.Tensor, torch.Tensor]
 
 # Add project dir to sys path to workaround importlib.import_module() conditions in model_factory.py
@@ -61,6 +61,15 @@ def forward(self, input_ids, cache_position):
         return self.inner(input_ids=input_ids, cache_position=cp)
 
 
+class LlamaPositionalAdapter(torch.nn.Module):  # exposes forward(tokens, input_pos) on top of the wrapped model's call form
+    def __init__(self, model):
+        super().__init__()
+        self.model = model  # wrapped module; invoked in forward as model(tokens, {"input_pos": ...})
+
+    def forward(self, tokens, input_pos):
+        return self.model(tokens, {"input_pos": input_pos})  # repack input_pos into the dict passed as the second argument
+
+
 class TestLlama:
     """Test class of Llama models.
 
@@ -154,6 +163,7 @@ def prepare_model(self):
             params_file,
             "--model",
             model_name,
+            "--use_kv_cache",
         ]
 
         parser = build_args_parser()
@@ -162,6 +172,11 @@ def prepare_model(self):
 
         llama_model, llama_inputs, llama_meta = get_llama_model(llm_config)
 
+        if llm_config.model.use_kv_cache:
+            tokens, attn_options = llama_inputs
+            llama_model = LlamaPositionalAdapter(llama_model).eval()
+            llama_inputs = (tokens, attn_options["input_pos"])
+
         return llama_model, llama_inputs, llama_meta
 
 
diff --git a/backends/arm/test/tester/arm_tester.py b/backends/arm/test/tester/arm_tester.py
@@ -641,6 +641,18 @@ def run_method_and_compare_outputs(
                     test_stage.run_artifact(test_input)
                 )
 
+            # With KV cache enabled, the model also returns cache data in its results; strip it by keeping only the USER_OUTPUT entries.
+            if hasattr(test_stage.artifact, "exported_program"):
+                output_specs = (
+                    test_stage.artifact.exported_program().graph_signature.output_specs
+                )
+                user_outputs = [
+                    output
+                    for output, spec in zip(test_outputs, output_specs)
+                    if spec.kind == OutputKind.USER_OUTPUT
+                ]
+                test_outputs = user_outputs
+
             logger.info(f"\n      Input: {original_input}")
             logger.info(f"\n Ref output: {reference_outputs}")
             logger.info(f"\nTest output: {test_outputs}")