[executorch][PR] add cuda backend to backend test infra

Gasoonjia · Gasoonjia · commit 658d85eee60f · 2026-03-06T11:53:41.000-08:00
Pull Request resolved: #17873. Integrate the CUDA backend into the backend test infra; the unsupported tests are skipped for now. ghstack-source-id: 348683618 @exported-using-ghexport Differential Revision: [D93019490](https://our.internmc.facebook.com/intern/diff/D93019490/)
diff --git a/.ci/scripts/test_backend.sh b/.ci/scripts/test_backend.sh
@@ -56,6 +56,25 @@ if [[ "$FLOW" == *vulkan* ]]; then
     EXTRA_BUILD_ARGS+=" -DEXECUTORCH_BUILD_VULKAN=ON"
 fi
 
+if [[ "$FLOW" == *cuda* ]]; then
+    # Fix libstdc++ GLIBCXX version for CUDA backend.
+    # The embedded .so files in the CUDA blob require GLIBCXX_3.4.30
+    # which the default conda libstdc++ doesn't have.
+    echo "Installing newer libstdc++ for CUDA backend..."
+    conda install -y -c conda-forge 'libstdcxx-ng>=12'
+    export LD_LIBRARY_PATH="${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH:-}"
+
+    # CUDA backend uses the generic PyTorch test-infra Docker image (not the
+    # custom executorch image), so PyTorch is NOT pre-installed from a pinned
+    # commit.  Install executorch with nightly PyTorch (which auto-detects
+    # CUDA via nvcc) and then build the runner manually.
+    echo "Installing ExecuTorch with nightly PyTorch (CUDA-enabled)..."
+    ./install_executorch.sh --editable
+    CMAKE_ARGS="$EXTRA_BUILD_ARGS" source .ci/scripts/utils.sh  # NOTE(review): in bash's default (non-POSIX) mode this prefix assignment lasts only while utils.sh is being sourced; confirm the build on the next line does not need CMAKE_ARGS
+    build_executorch_runner cmake Release
+    CUDA_SETUP_DONE=1  # checked later in this script to skip the generic setup-script step
+fi
+
 if [[ "$FLOW" == *arm* ]]; then
 
     # Setup ARM deps.
@@ -78,12 +97,14 @@ if [[ "$FLOW" == *arm* ]]; then
     fi
 fi
 
-if [[ $IS_MACOS -eq 1 ]]; then
-    SETUP_SCRIPT=.ci/scripts/setup-macos.sh
-else
-    SETUP_SCRIPT=.ci/scripts/setup-linux.sh
+if [[ "${CUDA_SETUP_DONE:-0}" != "1" ]]; then
+    if [[ $IS_MACOS -eq 1 ]]; then
+        SETUP_SCRIPT=.ci/scripts/setup-macos.sh
+    else
+        SETUP_SCRIPT=.ci/scripts/setup-linux.sh
+    fi
+    CMAKE_ARGS="$EXTRA_BUILD_ARGS" ${CONDA_RUN_CMD} $SETUP_SCRIPT --build-tool cmake --build-mode Release --editable true
 fi
-CMAKE_ARGS="$EXTRA_BUILD_ARGS" ${CONDA_RUN_CMD} $SETUP_SCRIPT --build-tool cmake --build-mode Release --editable true
 
 GOLDEN_DIR="${ARTIFACT_DIR}/golden-artifacts"
 export GOLDEN_ARTIFACTS_DIR="${GOLDEN_DIR}"
diff --git a/.github/workflows/_test_backend.yml b/.github/workflows/_test_backend.yml
@@ -36,6 +36,26 @@ on:
         required: false
         type: string
         default: linux.4xlarge.memory
+      docker-image:
+        description: 'Docker image for Linux jobs'
+        required: false
+        type: string
+        default: ci-image:executorch-ubuntu-22.04-clang12
+      use-custom-docker-registry:
+        description: 'Whether to use a custom Docker registry (set false for CUDA to use default PyTorch test-infra images)'
+        required: false
+        type: boolean
+        default: true
+      gpu-arch-type:
+        description: 'GPU architecture type (e.g. cuda)'
+        required: false
+        type: string
+        default: ''
+      gpu-arch-version:
+        description: 'GPU architecture version (e.g. 12.6)'
+        required: false
+        type: string
+        default: ''
 
 jobs:
   test-backend-linux:
@@ -50,7 +70,10 @@ jobs:
     with:
       ref: ${{ inputs.ref }}
       runner: ${{ inputs.runner-linux }}
-      docker-image: ci-image:executorch-ubuntu-22.04-clang12
+      docker-image: ${{ inputs.docker-image }}
+      use-custom-docker-registry: ${{ inputs.use-custom-docker-registry }}
+      gpu-arch-type: ${{ inputs.gpu-arch-type }}
+      gpu-arch-version: ${{ inputs.gpu-arch-version }}
       submodules: recursive
       timeout: ${{ inputs.timeout }}
       upload-artifact: test-report-${{ matrix.flow }}-${{ matrix.suite }}
diff --git a/.github/workflows/test-backend-cuda.yml b/.github/workflows/test-backend-cuda.yml
@@ -0,0 +1,33 @@
+name: Test CUDA Backend
+
+on:
+  schedule:
+    - cron: 0 2 * * *  # every day at 02:00 (GitHub Actions evaluates cron schedules in UTC)
+  push:
+    branches:
+      - release/*
+    tags:
+      - ciflow/nightly/*
+  pull_request:
+    paths:  # PR runs are limited to changes in the workflow files listed below
+      - .github/workflows/test-backend-cuda.yml
+      - .github/workflows/_test_backend.yml
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}--${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true  # a newer run in the same group cancels the one still in flight
+
+jobs:
+  test-cuda:
+    uses: ./.github/workflows/_test_backend.yml
+    with:
+      backend: cuda
+      flows: '["cuda"]'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 120
+      run-linux: true
+      runner-linux: linux.g5.4xlarge.nvidia.gpu
+      use-custom-docker-registry: false  # use the default PyTorch test-infra images instead of the custom registry
+      gpu-arch-type: cuda
+      gpu-arch-version: '12.6'
diff --git a/backends/cuda/test/tester.py b/backends/cuda/test/tester.py
@@ -0,0 +1,71 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Any, List, Optional, Tuple
+
+import executorch
+import executorch.backends.test.harness.stages as BaseStages
+import torch
+from executorch.backends.cuda.cuda_backend import CudaBackend
+from executorch.backends.cuda.cuda_partitioner import CudaPartitioner
+from executorch.backends.test.harness import Tester as TesterBase
+from executorch.backends.test.harness.stages import StageType
+from executorch.exir import EdgeCompileConfig
+from executorch.exir.backend.partitioner import Partitioner
+
+
+def _create_default_partitioner() -> CudaPartitioner:
+    """Return a CudaPartitioner whose only compile spec names the "forward" method."""
+    forward_spec = CudaBackend.generate_method_name_compile_spec("forward")
+    return CudaPartitioner([forward_spec])
+
+
+class ToEdgeTransformAndLower(BaseStages.ToEdgeTransformAndLower):
+    """Lowering stage whose defaults target the CUDA backend."""
+
+    def __init__(
+        self,
+        partitioners: Optional[List[Partitioner]] = None,
+        edge_compile_config: Optional[EdgeCompileConfig] = None,
+    ):
+        if partitioners is None:
+            partitioners = [_create_default_partitioner()]
+        # A falsy config is replaced by one with the IR validity check turned off.
+        config = edge_compile_config or EdgeCompileConfig(_check_ir_validity=False)
+        super().__init__(
+            default_partitioner_cls=_create_default_partitioner,
+            partitioners=partitioners,
+            edge_compile_config=config,
+        )
+
+
+class CudaTester(TesterBase):
+    """
+    Tester subclass for the CUDA backend.
+
+    Defines the recipe for lowering models to the CUDA backend using
+    AOTInductor compilation: the harness's default stage classes with the
+    CUDA ToEdgeTransformAndLower stage swapped in.
+    """
+
+    def __init__(
+        self,
+        module: torch.nn.Module,
+        example_inputs: Tuple[torch.Tensor, ...],
+        dynamic_shapes: Optional[Tuple[Any, ...]] = None,
+    ):
+        # Harness defaults, with only the lowering stage overridden.
+        stage_classes = (
+            executorch.backends.test.harness.Tester.default_stage_classes()
+            | {StageType.TO_EDGE_TRANSFORM_AND_LOWER: ToEdgeTransformAndLower}
+        )
+
+        super().__init__(
+            module=module,
+            stage_classes=stage_classes,
+            example_inputs=example_inputs,
+            dynamic_shapes=dynamic_shapes,
+        )
diff --git a/backends/test/harness/stages/serialize.py b/backends/test/harness/stages/serialize.py
@@ -1,11 +1,9 @@
 import copy
 import logging
-
-from typing import Optional
+from typing import Dict, Optional
 
 from executorch.backends.test.harness.stages.stage import Stage, StageType
 from executorch.exir import ExecutorchProgramManager
-
 from torch.utils._pytree import tree_flatten
 
 logger = logging.getLogger(__name__)
@@ -23,12 +21,15 @@
 class Serialize(Stage):
     def __init__(self):
         self.buffer = None
+        self.data_files: Dict[str, bytes] = {}
 
     def stage_type(self) -> StageType:
         return StageType.SERIALIZE
 
     def run(self, artifact: ExecutorchProgramManager, inputs=None) -> None:
         self.buffer = artifact.buffer
+        # Capture external data files (e.g., .ptd files for CUDA backend)
+        self.data_files = artifact.data_files
 
     @property
     def artifact(self) -> bytes:
@@ -40,8 +41,29 @@ def graph_module(self) -> None:
 
     def run_artifact(self, inputs):
         inputs_flattened, _ = tree_flatten(inputs)
+
+        # Select the external data file to pass to the runtime as data_map_buffer.
+        # Most backends have at most one. Nothing is concatenated: when there are
+        # several, only the first is used and a warning is logged (see below).
+        data_map_buffer = None
+        if self.data_files:
+            # If there's exactly one data file, use it directly
+            # Otherwise, log a warning - multiple external files may need special handling
+            if len(self.data_files) == 1:
+                data_map_buffer = list(self.data_files.values())[0]
+            else:
+                # For multiple files, we use the first one and warn
+                # This is a limitation - proper handling would need runtime support
+                logger.warning(
+                    f"Multiple external data files found ({list(self.data_files.keys())}). "
+                    f"Using the first one. This may not work correctly for all backends."
+                )
+                data_map_buffer = list(self.data_files.values())[0]
+
         executorch_module = _load_for_executorch_from_buffer(
-            self.buffer, program_verification=Verification.Minimal
+            self.buffer,
+            data_map_buffer=data_map_buffer,
+            program_verification=Verification.Minimal,
         )
         executorch_output = copy.deepcopy(
             executorch_module.run_method("forward", tuple(inputs_flattened))
diff --git a/backends/test/suite/conftest.py b/backends/test/suite/conftest.py
@@ -3,7 +3,6 @@
 
 import pytest
 import torch
-
 from executorch.backends.test.suite.flow import all_flows
 from executorch.backends.test.suite.reporting import _sum_op_counts
 from executorch.backends.test.suite.runner import run_test
@@ -88,7 +87,14 @@ def lower_and_run_model(
     ids=str,
 )
 def test_runner(request):
-    return TestRunner(request.param, request.node.name, request.node.originalname)
+    flow = request.param
+    test_name = request.node.name
+
+    # Check if this test should be skipped based on the flow's skip_patterns
+    if flow.should_skip_test(test_name):
+        pytest.skip(f"Test '{test_name}' matches skip pattern for flow '{flow.name}'")
+
+    return TestRunner(flow, test_name, request.node.originalname)
 
 
 @pytest.hookimpl(optionalhook=True)
diff --git a/backends/test/suite/flow.py b/backends/test/suite/flow.py
diff --git a/backends/test/suite/flows/cuda.py b/backends/test/suite/flows/cuda.py
diff --git a/exir/program/_program.py b/exir/program/_program.py