Fix golden artifact collisions, error handling, and pytest config path

kirklandsign · kirklandsign · commit ff2dbda6de62 · 2026-02-24T13:26:18.000-08:00
- Prefix per-model golden zips with flow name to avoid cross-flow
  filename collisions (e.g. xnnpack_mobilenet_v3_small_golden.zip)
- Collect pre-packaged golden zips in workflow instead of flattening
  raw .pte/.bin files that would overwrite across flows
- Wrap _dump_golden_artifacts in try/except so any error raised while
  dumping (e.g. a filesystem error) is logged as a warning and doesn't
  fail otherwise-passing correctness tests
- Append subtest_index to artifact name for parameterized test variants
  (only when subtest_index > 0; index 0 keeps the unsuffixed base name)
- Fix /dev/nul typo to /dev/null in pytest config override
diff --git a/.ci/scripts/test_backend.sh b/.ci/scripts/test_backend.sh
@@ -89,18 +89,17 @@ GOLDEN_DIR="${ARTIFACT_DIR}/golden-artifacts"
 export GOLDEN_ARTIFACTS_DIR="${GOLDEN_DIR}"
 
 EXIT_CODE=0
-${CONDA_RUN_CMD} pytest -c /dev/nul -n auto backends/test/suite/$SUITE/ -m flow_$FLOW --json-report --json-report-file="$REPORT_FILE" || EXIT_CODE=$?
+${CONDA_RUN_CMD} pytest -c /dev/null -n auto backends/test/suite/$SUITE/ -m flow_$FLOW --json-report --json-report-file="$REPORT_FILE" || EXIT_CODE=$?
 # Generate markdown summary.
 ${CONDA_RUN_CMD} python -m executorch.backends.test.suite.generate_markdown_summary_json "$REPORT_FILE" > ${GITHUB_STEP_SUMMARY:-"step_summary.md"} --exit-code $EXIT_CODE
 
 # Package golden artifacts into per-model zips for downstream consumers.
 if [[ -d "${GOLDEN_DIR}/${FLOW}" ]]; then
     pushd "${GOLDEN_DIR}/${FLOW}"
-    # Group files by model name prefix and zip each model's artifacts.
     for pte in *.pte; do
         [[ -f "$pte" ]] || continue
         model_name="${pte%.pte}"
-        zip -j "${GOLDEN_DIR}/${model_name}_golden.zip" \
+        zip -j "${GOLDEN_DIR}/${FLOW}_${model_name}_golden.zip" \
             "${model_name}.pte" \
             ${model_name}_input*.bin \
             ${model_name}_expected_output*.bin \
diff --git a/.github/workflows/_test_backend.yml b/.github/workflows/_test_backend.yml
@@ -76,12 +76,12 @@ jobs:
           TIMESTAMP=$(date -u +%y%m%d%H)
           mkdir -p golden_combined
 
-          find downloaded/ \( -name '*.pte' -o -name '*_input*.bin' -o -name '*_expected_output*.bin' \) \
-            -exec cp {} golden_combined/ \;
+          # Collect per-flow golden zips (already namespaced by flow in filename).
+          find downloaded/ -name '*_golden.zip' -exec cp {} golden_combined/ \;
 
-          if ls golden_combined/*.pte 1>/dev/null 2>&1; then
+          if ls golden_combined/*_golden.zip 1>/dev/null 2>&1; then
             (cd golden_combined && zip -r "../golden_artifacts_${TIMESTAMP}.zip" .)
-            echo "Created golden_artifacts_${TIMESTAMP}.zip with $(ls golden_combined/*.pte | wc -l) models."
+            echo "Created golden_artifacts_${TIMESTAMP}.zip with $(ls golden_combined/ | wc -l) golden zips."
           else
             echo "No golden artifacts found."
           fi
diff --git a/backends/test/harness/tester.py b/backends/test/harness/tester.py
@@ -352,9 +352,18 @@ def run_method_and_compare_outputs(
             )
 
             if artifact_dir and artifact_name and not artifacts_saved:
-                self._dump_golden_artifacts(
-                    artifact_dir, artifact_name, inputs_to_run, reference_output
-                )
+                try:
+                    self._dump_golden_artifacts(
+                        artifact_dir,
+                        artifact_name,
+                        inputs_to_run,
+                        reference_output,
+                    )
+                except Exception:
+                    logging.getLogger(__name__).warning(
+                        f"Failed to dump golden artifacts for {artifact_name}",
+                        exc_info=True,
+                    )
                 artifacts_saved = True
 
         return self
diff --git a/backends/test/suite/runner.py b/backends/test/suite/runner.py
@@ -204,10 +204,12 @@ def build_result(
             # We can do this if we ever see to_executorch() or serialize() fail due a backend issue.
             return build_result(TestResult.UNKNOWN_FAIL, e)
 
-        # Derive a clean model name for golden artifacts (e.g. "test_mobilenet_v3_small" -> "mobilenet_v3_small").
         artifact_name = None
         if artifact_dir:
-            artifact_name = test_base_name.removeprefix("test_")
+            base = test_base_name.removeprefix("test_")
+            artifact_name = (
+                f"{base}_{subtest_index}" if subtest_index > 0 else base
+            )
 
         # TODO We should consider refactoring the tester slightly to return more signal on
         # the cause of a failure in run_method_and_compare_outputs. We can look for