Add golden artifact generation to nightly backend test suite (#17663)

kirklandsign · web-flow · commit 7e1e6b40342f · 2026-02-24T16:44:30.000-08:00
After successful model correctness verification (torch.allclose against
eager), dump the input tensors, eager reference output, and serialized
.pte as golden files. A follow-up workflow job collects these files,
grouped by flow, into a combined golden_artifacts_yymmddhh.zip (UTC
timestamp), which is uploaded as a workflow artifact and to S3 via the
existing test-infra artifact pipeline.

Controlled by the GOLDEN_ARTIFACTS_DIR environment variable — when
unset, behavior is unchanged. The test_backend.sh script sets this
automatically.
diff --git a/.ci/scripts/test_backend.sh b/.ci/scripts/test_backend.sh
@@ -85,7 +85,10 @@ else
 fi
 CMAKE_ARGS="$EXTRA_BUILD_ARGS" ${CONDA_RUN_CMD} $SETUP_SCRIPT --build-tool cmake --build-mode Release --editable true
 
+GOLDEN_DIR="${ARTIFACT_DIR}/golden-artifacts"
+export GOLDEN_ARTIFACTS_DIR="${GOLDEN_DIR}"
+
 EXIT_CODE=0
-${CONDA_RUN_CMD} pytest -c /dev/nul -n auto backends/test/suite/$SUITE/ -m flow_$FLOW --json-report --json-report-file="$REPORT_FILE" || EXIT_CODE=$?
+${CONDA_RUN_CMD} pytest -c /dev/null -n auto backends/test/suite/$SUITE/ -m flow_$FLOW --json-report --json-report-file="$REPORT_FILE" || EXIT_CODE=$?
 # Generate markdown summary.
 ${CONDA_RUN_CMD} python -m executorch.backends.test.suite.generate_markdown_summary_json "$REPORT_FILE" > ${GITHUB_STEP_SUMMARY:-"step_summary.md"} --exit-code $EXIT_CODE
diff --git a/.github/workflows/_test_backend.yml b/.github/workflows/_test_backend.yml
@@ -59,6 +59,61 @@ jobs:
 
         source .ci/scripts/test_backend.sh "${{ matrix.suite }}" "${{ matrix.flow }}" "${RUNNER_ARTIFACT_DIR}"
 
+  package-golden-artifacts:  # Bundles golden files from the downloaded test artifacts into one zip.
+    if: ${{ inputs.run-linux }}
+    needs: test-backend-linux  # starts only after the Linux test job
+    runs-on: linux.2xlarge
+    steps:
+      - name: Download model test artifacts
+        uses: actions/download-artifact@v4
+        with:
+          pattern: test-report-*-models  # only artifacts whose name matches this glob
+          path: downloaded/
+
+      - name: Package golden artifacts
+        run: |
+          set -eux
+          TIMESTAMP=$(date -u +%y%m%d%H)  # UTC: two-digit year, month, day, hour
+          mkdir -p golden_combined
+
+          # Merge golden files from every downloaded artifact into golden_combined/{flow}/,
+          # reading the raw files from downloaded/*/golden-artifacts/{flow}/.
+          for flow_dir in downloaded/*/golden-artifacts/*/; do
+            [ -d "$flow_dir" ] || continue
+            flow_name=$(basename "$flow_dir")
+            if ls "$flow_dir"/*.pte 1>/dev/null 2>&1; then  # skip flow dirs without any .pte
+              mkdir -p "golden_combined/${flow_name}"
+              cp "$flow_dir"/*.pte "$flow_dir"/*_input*.bin "$flow_dir"/*_expected_output*.bin \
+                "golden_combined/${flow_name}/" 2>/dev/null || true  # best-effort: cp errors are ignored
+            fi
+          done
+
+          if find golden_combined -name '*.pte' | grep -q .; then  # zip only if a .pte was collected
+            (cd golden_combined && zip -r "../golden_artifacts_${TIMESTAMP}.zip" .)
+            echo "Created golden_artifacts_${TIMESTAMP}.zip"
+            find golden_combined -type f | head -20  # log up to 20 of the packaged files
+          else
+            echo "No golden artifacts found."
+          fi
+
+      - name: Upload combined golden artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: golden-artifacts-${{ inputs.backend }}
+          path: golden_artifacts_*.zip
+          if-no-files-found: ignore  # a run that produced no zip is not an error
+
+      - name: Upload golden artifacts to S3
+        uses: seemethere/upload-artifact-s3@v5
+        if: ${{ hashFiles('golden_artifacts_*.zip') != '' }}  # run only when a zip was created
+        with:
+          s3-bucket: gha-artifacts
+          s3-prefix: |
+            ${{ github.repository }}/test-backend-artifacts/golden-artifacts-${{ inputs.backend }}
+          retention-days: 90
+          if-no-files-found: ignore
+          path: golden_artifacts_*.zip
+
   test-backend-macos:
     if: ${{ inputs.run-macos }}
     strategy:
diff --git a/.github/workflows/test-backend-xnnpack.yml b/.github/workflows/test-backend-xnnpack.yml
@@ -12,6 +12,9 @@ on:
     paths:
       - .github/workflows/test-backend-xnnpack.yml
       - .github/workflows/_test_backend.yml
+      - .ci/scripts/test_backend.sh
+      - backends/test/harness/**
+      - backends/test/suite/**
   workflow_dispatch:
 
 concurrency:
diff --git a/backends/test/harness/tester.py b/backends/test/harness/tester.py
@@ -3,6 +3,8 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import logging
+import os
 import random
 from collections import Counter, OrderedDict
 from typing import Any, Callable, Dict, List, Optional, Tuple
@@ -317,11 +319,14 @@ def run_method_and_compare_outputs(
         rtol=1e-03,
         qtol=0,
         statistics_callback: Callable[[ErrorStatistics], None] | None = None,
+        artifact_dir: Optional[str] = None,
+        artifact_name: Optional[str] = None,
     ):
         number_of_runs = 1 if inputs is not None else num_runs
         reference_stage = self.stages[StageType.EXPORT]
 
         stage = stage or self.cur
+        artifacts_saved = False
 
         for _ in range(number_of_runs):
             inputs_to_run = inputs if inputs else next(self.generate_random_inputs())
@@ -346,8 +351,54 @@ def run_method_and_compare_outputs(
                 statistics_callback,
             )
 
+            if artifact_dir and artifact_name and not artifacts_saved:
+                try:
+                    self._dump_golden_artifacts(
+                        artifact_dir,
+                        artifact_name,
+                        inputs_to_run,
+                        reference_output,
+                    )
+                except Exception:
+                    logging.getLogger(__name__).warning(
+                        f"Failed to dump golden artifacts for {artifact_name}",
+                        exc_info=True,
+                    )
+                artifacts_saved = True
+
         return self
 
+    @staticmethod
+    def _dump_golden_artifacts(
+        artifact_dir: str,
+        artifact_name: str,
+        inputs: Tuple[torch.Tensor],
+        reference_output,
+    ):
+        logger = logging.getLogger(__name__)
+        os.makedirs(artifact_dir, exist_ok=True)
+
+        for i, inp in enumerate(inputs):
+            if isinstance(inp, torch.Tensor):
+                suffix = "" if len(inputs) == 1 else f"_{i}"
+                path = os.path.join(artifact_dir, f"{artifact_name}_input{suffix}.bin")
+                inp.contiguous().numpy().tofile(path)
+                logger.info(f"Saved golden input to {path}")
+
+        if isinstance(reference_output, torch.Tensor):
+            reference_output = (reference_output,)
+        elif isinstance(reference_output, OrderedDict):
+            reference_output = tuple(reference_output.values())
+
+        for i, out in enumerate(reference_output):
+            if isinstance(out, torch.Tensor):
+                suffix = "" if len(reference_output) == 1 else f"_{i}"
+                path = os.path.join(
+                    artifact_dir, f"{artifact_name}_expected_output{suffix}.bin"
+                )
+                out.contiguous().numpy().tofile(path)
+                logger.info(f"Saved golden output to {path}")
+
     @staticmethod
     def _assert_outputs_equal(
         model_output,
diff --git a/backends/test/suite/conftest.py b/backends/test/suite/conftest.py
@@ -1,3 +1,4 @@
+import os
 from typing import Any
 
 import pytest
@@ -32,6 +33,13 @@ def __init__(self, flow, test_name, test_base_name):
         self._test_base_name = test_base_name
         self._subtest = 0
         self._results = []
+        self._artifact_dir = self._resolve_artifact_dir()
+
+    def _resolve_artifact_dir(self) -> str | None:
+        base = os.environ.get("GOLDEN_ARTIFACTS_DIR")
+        if not base:
+            return None
+        return os.path.join(base, self._flow.name)
 
     def lower_and_run_model(
         self,
@@ -50,6 +58,7 @@ def lower_and_run_model(
             None,
             generate_random_test_inputs=generate_random_test_inputs,
             dynamic_shapes=dynamic_shapes,
+            artifact_dir=self._artifact_dir,
         )
 
         self._subtest += 1
diff --git a/backends/test/suite/runner.py b/backends/test/suite/runner.py
@@ -1,6 +1,8 @@
 import argparse
 import hashlib
 import importlib
+import logging
+import os
 import random
 import re
 import time
@@ -92,6 +94,7 @@ def run_test(  # noqa: C901
     params: dict | None,
     dynamic_shapes: Any | None = None,
     generate_random_test_inputs: bool = True,
+    artifact_dir: str | None = None,
 ) -> TestCaseSummary:
     """
     Top-level test run function for a model, input set, and tester. Handles test execution
@@ -201,6 +204,11 @@ def build_result(
             # We can do this if we ever see to_executorch() or serialize() fail due a backend issue.
             return build_result(TestResult.UNKNOWN_FAIL, e)
 
+        artifact_name = None
+        if artifact_dir:
+            base = test_base_name.removeprefix("test_")
+            artifact_name = f"{base}_{subtest_index}" if subtest_index > 0 else base
+
         # TODO We should consider refactoring the tester slightly to return more signal on
         # the cause of a failure in run_method_and_compare_outputs. We can look for
         # AssertionErrors to catch output mismatches, but this might catch more than that.
@@ -210,11 +218,25 @@ def build_result(
                 statistics_callback=lambda stats: error_statistics.append(stats),
                 atol=1e-1,
                 rtol=4e-2,
+                artifact_dir=artifact_dir,
+                artifact_name=artifact_name,
             )
         except AssertionError as e:
             return build_result(TestResult.OUTPUT_MISMATCH_FAIL, e)
         except Exception as e:
             return build_result(TestResult.PTE_RUN_FAIL, e)
+
+        # Dump .pte after successful comparison.
+        if artifact_dir and artifact_name and flow.supports_serialize:
+            logger = logging.getLogger(__name__)
+            try:
+                pte_path = os.path.join(artifact_dir, f"{artifact_name}.pte")
+                tester.stages[StageType.SERIALIZE].dump_artifact(pte_path)
+                logger.info(f"Saved golden .pte to {pte_path}")
+            except Exception:
+                logger.warning(
+                    f"Failed to save .pte for {artifact_name}", exc_info=True
+                )
     else:
         # Skip the test if nothing is delegated
         return build_result(TestResult.SUCCESS_UNDELEGATED)