diff --git a/.github/workflows/generate_benchmark_matrix.yml b/.github/workflows/generate_benchmark_matrix.yml index 0092905a4ac1c..12bf7d46e04e2 100644 --- a/.github/workflows/generate_benchmark_matrix.yml +++ b/.github/workflows/generate_benchmark_matrix.yml @@ -44,7 +44,7 @@ jobs: generate: name: Generate Matrix (${{ inputs.workflow_type }}) runs-on: linux-mi250-4 - container: rocm/tensorflow-build@sha256:7fcfbd36b7ac8f6b0805b37c4248e929e31cf5ee3af766c8409dd70d5ab65faa + container: ${{ vars.DOCKER_IMAGE }} outputs: matrix_json_output: ${{ steps.run_generator.outputs.matrix_json }} defaults: diff --git a/.github/workflows/postsubmit_benchmark.yml b/.github/workflows/postsubmit_benchmark.yml index cf71b802e07ed..e23ecc90f0295 100644 --- a/.github/workflows/postsubmit_benchmark.yml +++ b/.github/workflows/postsubmit_benchmark.yml @@ -147,6 +147,15 @@ jobs: ref: ${{ env.CHECKOUT_REF }} persist-credentials: false + - name: Get RBE cluster keys + env: + RBE_CI_CERT: ${{ secrets.RBE_CI_CERT }} + RBE_CI_KEY: ${{ secrets.RBE_CI_KEY }} + run: | + mkdir -p /tf/certificates + echo "$RBE_CI_CERT" > /tf/certificates/ci-cert.crt + echo "$RBE_CI_KEY" > /tf/certificates/ci-cert.key + - name: Build Binaries id: build_binaries run: | diff --git a/build_tools/ci/build.py b/build_tools/ci/build.py index 6995adf188ec6..a0c130a8be870 100755 --- a/build_tools/ci/build.py +++ b/build_tools/ci/build.py @@ -701,18 +701,24 @@ def nvidia_gpu_build_with_compute_capability( Build( type_=BuildType.XLA_LINUX_X86_GPU_ROCM_BENCHMARK_PRESUBMIT_GITHUB_ACTIONS, repo="openxla/xla", - configs=("rocm_ci",), + configs=("rocm_ci", "rocm_rbe"), target_patterns=_XLA_GPU_PRESUBMIT_BENCHMARKS_DEFAULT_TARGET_PATTERNS, test_tag_filters=rocm_tag_filters, build_tag_filters=rocm_tag_filters, options={ "run_under": "//build_tools/ci:parallel_gpu_execute", "//xla/tsl:ci_build": True, + "remote_download_toplevel": True, # Override remote_download_minimal from rocm_rbe + "spawn_strategy": "local", **_DEFAULT_BAZEL_OPTIONS, }, repo_env={ "TF_ROCM_AMDGPU_TARGETS": "gfx90a", + "TF_ROCM_RBE_DOCKER_IMAGE": "rocm/" # rocm/tensorflow-build:latest-jammy-pythonall-rocm7.2.1-ci_official + "tensorflow-build@sha256:" + "66eb4c1e39db76fae2eb0a1029490acbe7bfce0e00d6ab435e170f743921f4c4" }, + startup_options={"bazelrc": "build_tools/rocm/rocm_xla.bazelrc"}, subcommand="build", ) diff --git a/xla/tools/benchmarks/registries/default_registry.yml b/xla/tools/benchmarks/registries/default_registry.yml index c2ec56d097027..1188bbe084ff1 100644 --- a/xla/tools/benchmarks/registries/default_registry.yml +++ b/xla/tools/benchmarks/registries/default_registry.yml @@ -30,7 +30,7 @@ benchmarks: [ topology: { num_hosts: 1, num_devices_per_host: 1, multi_host: false, multi_device: false } target_metrics: [GPU_DEVICE_TIME, GPU_DEVICE_MEMCPY_TIME] workflow_type: [POSTSUBMIT] - runtime_flags: ["--num_repeats=5", "--hlo_argument_mode=uninitialized"] + runtime_flags: ["--num_repeats=5"] }] update_frequency_policy: QUARTERLY }, @@ -48,8 +48,7 @@ benchmarks: [ topology: { num_hosts: 1, num_devices_per_host: 1, multi_host: false, multi_device: false } target_metrics: [GPU_DEVICE_TIME, GPU_DEVICE_MEMCPY_TIME] workflow_type: [POSTSUBMIT] - runtime_flags: ["--num_repeats=5", "--hlo_argument_mode=uninitialized"] - xla_compilation_flags: ["--xla_gpu_enable_command_buffer="] + runtime_flags: ["--num_repeats=5"] }] update_frequency_policy: QUARTERLY # TODO(juliagmt): remove this label once the benchmark is stable. diff --git a/xla/tools/benchmarks/utils/generate_benchmark_matrices.cc b/xla/tools/benchmarks/utils/generate_benchmark_matrices.cc index 3762d99138fa0..2ba224e247bb2 100644 --- a/xla/tools/benchmarks/utils/generate_benchmark_matrices.cc +++ b/xla/tools/benchmarks/utils/generate_benchmark_matrices.cc @@ -130,7 +130,7 @@ GetHardwareToContainerImage() { {"GPU_MI250", "rocm/" "tensorflow-build@sha256:" - "7fcfbd36b7ac8f6b0805b37c4248e929e31cf5ee3af766c8409dd70d5ab65faa"}, + "66eb4c1e39db76fae2eb0a1029490acbe7bfce0e00d6ab435e170f743921f4c4"}, }; return *kHardwareToContainerImage; }