Skip to content

Commit 7e1e6b4

Browse files
authored
Add golden artifact generation to nightly backend test suite (#17663)
After successful model correctness verification (torch.allclose against eager), dump the input tensors, eager reference output, and serialized .pte as golden files. These artifacts are packaged into per-model zips and a combined golden_artifacts_yymmddhh.zip, then uploaded to S3 via the existing test-infra artifact pipeline. Controlled by the GOLDEN_ARTIFACTS_DIR environment variable — when unset, behavior is unchanged. The test_backend.sh script sets this automatically.
1 parent 9cb6a09 commit 7e1e6b4

6 files changed

Lines changed: 144 additions & 1 deletion

File tree

.ci/scripts/test_backend.sh

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,10 @@ else
8585
fi
8686
CMAKE_ARGS="$EXTRA_BUILD_ARGS" ${CONDA_RUN_CMD} $SETUP_SCRIPT --build-tool cmake --build-mode Release --editable true
8787

88+
GOLDEN_DIR="${ARTIFACT_DIR}/golden-artifacts"
89+
export GOLDEN_ARTIFACTS_DIR="${GOLDEN_DIR}"
90+
8891
EXIT_CODE=0
89-
${CONDA_RUN_CMD} pytest -c /dev/nul -n auto backends/test/suite/$SUITE/ -m flow_$FLOW --json-report --json-report-file="$REPORT_FILE" || EXIT_CODE=$?
92+
${CONDA_RUN_CMD} pytest -c /dev/null -n auto backends/test/suite/$SUITE/ -m flow_$FLOW --json-report --json-report-file="$REPORT_FILE" || EXIT_CODE=$?
9093
# Generate markdown summary.
9194
${CONDA_RUN_CMD} python -m executorch.backends.test.suite.generate_markdown_summary_json "$REPORT_FILE" > ${GITHUB_STEP_SUMMARY:-"step_summary.md"} --exit-code $EXIT_CODE

.github/workflows/_test_backend.yml

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,61 @@ jobs:
5959
6060
source .ci/scripts/test_backend.sh "${{ matrix.suite }}" "${{ matrix.flow }}" "${RUNNER_ARTIFACT_DIR}"
6161
62+
package-golden-artifacts:
63+
if: ${{ inputs.run-linux }}
64+
needs: test-backend-linux
65+
runs-on: linux.2xlarge
66+
steps:
67+
- name: Download model test artifacts
68+
uses: actions/download-artifact@v4
69+
with:
70+
pattern: test-report-*-models
71+
path: downloaded/
72+
73+
- name: Package golden artifacts
74+
run: |
75+
set -eux
76+
TIMESTAMP=$(date -u +%y%m%d%H)
77+
mkdir -p golden_combined
78+
79+
# Collect golden artifacts preserving flow directory structure.
80+
# Raw files live under downloaded/*/golden-artifacts/{flow}/.
81+
for flow_dir in downloaded/*/golden-artifacts/*/; do
82+
[ -d "$flow_dir" ] || continue
83+
flow_name=$(basename "$flow_dir")
84+
if ls "$flow_dir"/*.pte 1>/dev/null 2>&1; then
85+
mkdir -p "golden_combined/${flow_name}"
86+
cp "$flow_dir"/*.pte "$flow_dir"/*_input*.bin "$flow_dir"/*_expected_output*.bin \
87+
"golden_combined/${flow_name}/" 2>/dev/null || true
88+
fi
89+
done
90+
91+
if find golden_combined -name '*.pte' | grep -q .; then
92+
(cd golden_combined && zip -r "../golden_artifacts_${TIMESTAMP}.zip" .)
93+
echo "Created golden_artifacts_${TIMESTAMP}.zip"
94+
find golden_combined -type f | head -20
95+
else
96+
echo "No golden artifacts found."
97+
fi
98+
99+
- name: Upload combined golden artifacts
100+
uses: actions/upload-artifact@v4
101+
with:
102+
name: golden-artifacts-${{ inputs.backend }}
103+
path: golden_artifacts_*.zip
104+
if-no-files-found: ignore
105+
106+
- name: Upload golden artifacts to S3
107+
uses: seemethere/upload-artifact-s3@v5
108+
if: ${{ hashFiles('golden_artifacts_*.zip') != '' }}
109+
with:
110+
s3-bucket: gha-artifacts
111+
s3-prefix: |
112+
${{ github.repository }}/test-backend-artifacts/golden-artifacts-${{ inputs.backend }}
113+
retention-days: 90
114+
if-no-files-found: ignore
115+
path: golden_artifacts_*.zip
116+
62117
test-backend-macos:
63118
if: ${{ inputs.run-macos }}
64119
strategy:

.github/workflows/test-backend-xnnpack.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@ on:
1212
paths:
1313
- .github/workflows/test-backend-xnnpack.yml
1414
- .github/workflows/_test_backend.yml
15+
- .ci/scripts/test_backend.sh
16+
- backends/test/harness/**
17+
- backends/test/suite/**
1518
workflow_dispatch:
1619

1720
concurrency:

backends/test/harness/tester.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
# This source code is licensed under the BSD-style license found in the
44
# LICENSE file in the root directory of this source tree.
55

6+
import logging
7+
import os
68
import random
79
from collections import Counter, OrderedDict
810
from typing import Any, Callable, Dict, List, Optional, Tuple
@@ -317,11 +319,14 @@ def run_method_and_compare_outputs(
317319
rtol=1e-03,
318320
qtol=0,
319321
statistics_callback: Callable[[ErrorStatistics], None] | None = None,
322+
artifact_dir: Optional[str] = None,
323+
artifact_name: Optional[str] = None,
320324
):
321325
number_of_runs = 1 if inputs is not None else num_runs
322326
reference_stage = self.stages[StageType.EXPORT]
323327

324328
stage = stage or self.cur
329+
artifacts_saved = False
325330

326331
for _ in range(number_of_runs):
327332
inputs_to_run = inputs if inputs else next(self.generate_random_inputs())
@@ -346,8 +351,54 @@ def run_method_and_compare_outputs(
346351
statistics_callback,
347352
)
348353

354+
if artifact_dir and artifact_name and not artifacts_saved:
355+
try:
356+
self._dump_golden_artifacts(
357+
artifact_dir,
358+
artifact_name,
359+
inputs_to_run,
360+
reference_output,
361+
)
362+
except Exception:
363+
logging.getLogger(__name__).warning(
364+
f"Failed to dump golden artifacts for {artifact_name}",
365+
exc_info=True,
366+
)
367+
artifacts_saved = True
368+
349369
return self
350370

371+
@staticmethod
def _dump_golden_artifacts(
    artifact_dir: str,
    artifact_name: str,
    inputs: Tuple[torch.Tensor, ...],
    reference_output,
):
    """Write golden input and reference-output tensors as raw binary files.

    Files are created inside ``artifact_dir`` (created if missing) and named
    ``{artifact_name}_input[_{i}].bin`` and
    ``{artifact_name}_expected_output[_{i}].bin``. The ``_{i}`` index suffix
    is omitted when the corresponding collection has exactly one element.
    Non-tensor elements are skipped but still consume an index.

    Args:
        artifact_dir: Destination directory for the golden files.
        artifact_name: File-name prefix shared by all files of one test.
        inputs: Positional inputs that were fed to the model.
        reference_output: A single tensor, a dict of tensors (values are
            dumped in insertion order), or a sequence of tensors.

    Raises:
        Whatever the filesystem or tensor conversion raises; the caller is
        expected to treat dumping as best-effort.
    """
    logger = logging.getLogger(__name__)
    os.makedirs(artifact_dir, exist_ok=True)

    def _write_all(tensors, kind, label):
        # Decide on the suffix scheme from the full collection length so that
        # indices stay stable even when some elements are not tensors.
        indexed = len(tensors) != 1
        for i, tensor in enumerate(tensors):
            if not isinstance(tensor, torch.Tensor):
                continue
            suffix = f"_{i}" if indexed else ""
            path = os.path.join(
                artifact_dir, f"{artifact_name}_{kind}{suffix}.bin"
            )
            # detach()/cpu() are required before numpy(): tensors that track
            # gradients or live on an accelerator cannot be converted directly.
            tensor.detach().cpu().contiguous().numpy().tofile(path)
            logger.info("Saved golden %s to %s", label, path)

    # Normalize the reference output to a tuple of candidate tensors.
    if isinstance(reference_output, torch.Tensor):
        reference_output = (reference_output,)
    elif isinstance(reference_output, dict):
        # Covers OrderedDict as well as plain dict outputs.
        reference_output = tuple(reference_output.values())

    _write_all(inputs, "input", "input")
    _write_all(reference_output, "expected_output", "output")
401+
351402
@staticmethod
352403
def _assert_outputs_equal(
353404
model_output,

backends/test/suite/conftest.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import os
12
from typing import Any
23

34
import pytest
@@ -32,6 +33,13 @@ def __init__(self, flow, test_name, test_base_name):
3233
self._test_base_name = test_base_name
3334
self._subtest = 0
3435
self._results = []
36+
self._artifact_dir = self._resolve_artifact_dir()
37+
38+
def _resolve_artifact_dir(self) -> str | None:
39+
base = os.environ.get("GOLDEN_ARTIFACTS_DIR")
40+
if not base:
41+
return None
42+
return os.path.join(base, self._flow.name)
3543

3644
def lower_and_run_model(
3745
self,
@@ -50,6 +58,7 @@ def lower_and_run_model(
5058
None,
5159
generate_random_test_inputs=generate_random_test_inputs,
5260
dynamic_shapes=dynamic_shapes,
61+
artifact_dir=self._artifact_dir,
5362
)
5463

5564
self._subtest += 1

backends/test/suite/runner.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import argparse
22
import hashlib
33
import importlib
4+
import logging
5+
import os
46
import random
57
import re
68
import time
@@ -92,6 +94,7 @@ def run_test( # noqa: C901
9294
params: dict | None,
9395
dynamic_shapes: Any | None = None,
9496
generate_random_test_inputs: bool = True,
97+
artifact_dir: str | None = None,
9598
) -> TestCaseSummary:
9699
"""
97100
Top-level test run function for a model, input set, and tester. Handles test execution
@@ -201,6 +204,11 @@ def build_result(
201204
# We can do this if we ever see to_executorch() or serialize() fail due to a backend issue.
202205
return build_result(TestResult.UNKNOWN_FAIL, e)
203206

207+
artifact_name = None
208+
if artifact_dir:
209+
base = test_base_name.removeprefix("test_")
210+
artifact_name = f"{base}_{subtest_index}" if subtest_index > 0 else base
211+
204212
# TODO We should consider refactoring the tester slightly to return more signal on
205213
# the cause of a failure in run_method_and_compare_outputs. We can look for
206214
# AssertionErrors to catch output mismatches, but this might catch more than that.
@@ -210,11 +218,25 @@ def build_result(
210218
statistics_callback=lambda stats: error_statistics.append(stats),
211219
atol=1e-1,
212220
rtol=4e-2,
221+
artifact_dir=artifact_dir,
222+
artifact_name=artifact_name,
213223
)
214224
except AssertionError as e:
215225
return build_result(TestResult.OUTPUT_MISMATCH_FAIL, e)
216226
except Exception as e:
217227
return build_result(TestResult.PTE_RUN_FAIL, e)
228+
229+
# Dump .pte after successful comparison.
230+
if artifact_dir and artifact_name and flow.supports_serialize:
231+
logger = logging.getLogger(__name__)
232+
try:
233+
pte_path = os.path.join(artifact_dir, f"{artifact_name}.pte")
234+
tester.stages[StageType.SERIALIZE].dump_artifact(pte_path)
235+
logger.info(f"Saved golden .pte to {pte_path}")
236+
except Exception:
237+
logger.warning(
238+
f"Failed to save .pte for {artifact_name}", exc_info=True
239+
)
218240
else:
219241
# Skip the test if nothing is delegated
220242
return build_result(TestResult.SUCCESS_UNDELEGATED)

0 commit comments

Comments
 (0)