From ae3c1b1a2e6e62238a34bd408b6c197c3778e220 Mon Sep 17 00:00:00 2001 From: Milica Makevic Date: Tue, 31 Mar 2026 07:43:00 +0000 Subject: [PATCH 1/8] Use rocm_rbe config --- .github/workflows/postsubmit_benchmark.yml | 9 +++++++++ build_tools/ci/build.py | 5 ++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/.github/workflows/postsubmit_benchmark.yml b/.github/workflows/postsubmit_benchmark.yml index cf71b802e07ed..e23ecc90f0295 100644 --- a/.github/workflows/postsubmit_benchmark.yml +++ b/.github/workflows/postsubmit_benchmark.yml @@ -147,6 +147,15 @@ jobs: ref: ${{ env.CHECKOUT_REF }} persist-credentials: false + - name: Get RBE cluster keys + env: + RBE_CI_CERT: ${{ secrets.RBE_CI_CERT }} + RBE_CI_KEY: ${{ secrets.RBE_CI_KEY }} + run: | + mkdir -p /tf/certificates + echo "$RBE_CI_CERT" > /tf/certificates/ci-cert.crt + echo "$RBE_CI_KEY" > /tf/certificates/ci-cert.key + - name: Build Binaries id: build_binaries run: | diff --git a/build_tools/ci/build.py b/build_tools/ci/build.py index 6995adf188ec6..cc8603730f94c 100755 --- a/build_tools/ci/build.py +++ b/build_tools/ci/build.py @@ -701,7 +701,7 @@ def nvidia_gpu_build_with_compute_capability( Build( type_=BuildType.XLA_LINUX_X86_GPU_ROCM_BENCHMARK_PRESUBMIT_GITHUB_ACTIONS, repo="openxla/xla", - configs=("rocm_ci",), + configs=("rocm_ci", "rocm_rbe"), target_patterns=_XLA_GPU_PRESUBMIT_BENCHMARKS_DEFAULT_TARGET_PATTERNS, test_tag_filters=rocm_tag_filters, build_tag_filters=rocm_tag_filters, @@ -712,6 +712,9 @@ def nvidia_gpu_build_with_compute_capability( }, repo_env={ "TF_ROCM_AMDGPU_TARGETS": "gfx90a", + "TF_ROCM_RBE_DOCKER_IMAGE": "rocm/" + "tensorflow-build@sha256:" + "7fcfbd36b7ac8f6b0805b37c4248e929e31cf5ee3af766c8409dd70d5ab65faa" }, subcommand="build", ) From 435242927cd77ae67a49b76001896d05b56653e8 Mon Sep 17 00:00:00 2001 From: Milica Makevic Date: Tue, 31 Mar 2026 08:45:10 +0000 Subject: [PATCH 2/8] Unset hlo_argument_mode --- xla/tools/benchmarks/registries/default_registry.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xla/tools/benchmarks/registries/default_registry.yml b/xla/tools/benchmarks/registries/default_registry.yml index c2ec56d097027..fc6cd33e15efd 100644 --- a/xla/tools/benchmarks/registries/default_registry.yml +++ b/xla/tools/benchmarks/registries/default_registry.yml @@ -30,7 +30,7 @@ benchmarks: [ topology: { num_hosts: 1, num_devices_per_host: 1, multi_host: false, multi_device: false } target_metrics: [GPU_DEVICE_TIME, GPU_DEVICE_MEMCPY_TIME] workflow_type: [POSTSUBMIT] - runtime_flags: ["--num_repeats=5", "--hlo_argument_mode=uninitialized"] + runtime_flags: ["--num_repeats=5"] }] update_frequency_policy: QUARTERLY }, @@ -48,7 +48,7 @@ benchmarks: [ topology: { num_hosts: 1, num_devices_per_host: 1, multi_host: false, multi_device: false } target_metrics: [GPU_DEVICE_TIME, GPU_DEVICE_MEMCPY_TIME] workflow_type: [POSTSUBMIT] - runtime_flags: ["--num_repeats=5", "--hlo_argument_mode=uninitialized"] + runtime_flags: ["--num_repeats=5"] xla_compilation_flags: ["--xla_gpu_enable_command_buffer="] }] update_frequency_policy: QUARTERLY From 3751bdfb2f3a7f63b66aea24b4938d773147f9ff Mon Sep 17 00:00:00 2001 From: Milica Makevic Date: Tue, 31 Mar 2026 13:24:48 +0000 Subject: [PATCH 3/8] Use newer Docker image --- .github/workflows/generate_benchmark_matrix.yml | 2 +- build_tools/ci/build.py | 2 +- xla/tools/benchmarks/utils/generate_benchmark_matrices.cc | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/generate_benchmark_matrix.yml b/.github/workflows/generate_benchmark_matrix.yml index 0092905a4ac1c..12bf7d46e04e2 100644 --- a/.github/workflows/generate_benchmark_matrix.yml +++ b/.github/workflows/generate_benchmark_matrix.yml @@ -44,7 +44,7 @@ jobs: generate: name: Generate Matrix (${{ inputs.workflow_type }}) runs-on: linux-mi250-4 - container: rocm/tensorflow-build@sha256:7fcfbd36b7ac8f6b0805b37c4248e929e31cf5ee3af766c8409dd70d5ab65faa + container: ${{ vars.DOCKER_IMAGE }} outputs: matrix_json_output: ${{ steps.run_generator.outputs.matrix_json }} defaults: diff --git a/build_tools/ci/build.py b/build_tools/ci/build.py index cc8603730f94c..c6be14ca252c9 100755 --- a/build_tools/ci/build.py +++ b/build_tools/ci/build.py @@ -714,7 +714,7 @@ def nvidia_gpu_build_with_compute_capability( "TF_ROCM_AMDGPU_TARGETS": "gfx90a", "TF_ROCM_RBE_DOCKER_IMAGE": "rocm/" "tensorflow-build@sha256:" - "7fcfbd36b7ac8f6b0805b37c4248e929e31cf5ee3af766c8409dd70d5ab65faa" + "66eb4c1e39db76fae2eb0a1029490acbe7bfce0e00d6ab435e170f743921f4c4" }, subcommand="build", ) diff --git a/xla/tools/benchmarks/utils/generate_benchmark_matrices.cc b/xla/tools/benchmarks/utils/generate_benchmark_matrices.cc index 3762d99138fa0..2ba224e247bb2 100644 --- a/xla/tools/benchmarks/utils/generate_benchmark_matrices.cc +++ b/xla/tools/benchmarks/utils/generate_benchmark_matrices.cc @@ -130,7 +130,7 @@ GetHardwareToContainerImage() { {"GPU_MI250", "rocm/" "tensorflow-build@sha256:" - "7fcfbd36b7ac8f6b0805b37c4248e929e31cf5ee3af766c8409dd70d5ab65faa"}, + "66eb4c1e39db76fae2eb0a1029490acbe7bfce0e00d6ab435e170f743921f4c4"}, }; return *kHardwareToContainerImage; } From ba190d51eb0de72c814028119e39a4fbfdeaf97e Mon Sep 17 00:00:00 2001 From: Milica Makevic Date: Tue, 31 Mar 2026 13:26:28 +0000 Subject: [PATCH 4/8] Remove unnecessary xla compilation flag from gemma2 --- xla/tools/benchmarks/registries/default_registry.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/xla/tools/benchmarks/registries/default_registry.yml b/xla/tools/benchmarks/registries/default_registry.yml index fc6cd33e15efd..1188bbe084ff1 100644 --- a/xla/tools/benchmarks/registries/default_registry.yml +++ b/xla/tools/benchmarks/registries/default_registry.yml @@ -49,7 +49,6 @@ benchmarks: [ target_metrics: [GPU_DEVICE_TIME, GPU_DEVICE_MEMCPY_TIME] workflow_type: [POSTSUBMIT] runtime_flags: ["--num_repeats=5"] - xla_compilation_flags: ["--xla_gpu_enable_command_buffer="] }] update_frequency_policy: QUARTERLY # TODO(juliagmt): remove this label once the benchmark is stable. From 5b3f526d7699041dd328874318fece7e1f328ed7 Mon Sep 17 00:00:00 2001 From: Milica Makevic Date: Wed, 1 Apr 2026 11:45:50 +0000 Subject: [PATCH 5/8] Specify rocm_xla.bazelrc in Bazel command in order to use rocm_rbe --- build_tools/ci/build.py | 1 + 1 file changed, 1 insertion(+) diff --git a/build_tools/ci/build.py b/build_tools/ci/build.py index c6be14ca252c9..24c88b09685e7 100755 --- a/build_tools/ci/build.py +++ b/build_tools/ci/build.py @@ -716,6 +716,7 @@ def nvidia_gpu_build_with_compute_capability( "tensorflow-build@sha256:" "66eb4c1e39db76fae2eb0a1029490acbe7bfce0e00d6ab435e170f743921f4c4" }, + startup_options={"bazelrc": "build_tools/rocm/rocm_xla.bazelrc"}, subcommand="build", ) From 9323d5e200929cbb426babc6e817e5e778a867f4 Mon Sep 17 00:00:00 2001 From: Milica Makevic Date: Mon, 6 Apr 2026 14:46:18 +0000 Subject: [PATCH 6/8] Override remote_download_minimal from rocm_rbe --- build_tools/ci/build.py | 1 + 1 file changed, 1 insertion(+) diff --git a/build_tools/ci/build.py b/build_tools/ci/build.py index 24c88b09685e7..e91f64e3c196e 100755 --- a/build_tools/ci/build.py +++ b/build_tools/ci/build.py @@ -708,6 +708,7 @@ def nvidia_gpu_build_with_compute_capability( options={ "run_under": "//build_tools/ci:parallel_gpu_execute", "//xla/tsl:ci_build": True, + "remote_download_toplevel": True, # Override remote_download_minimal from rocm_rbe **_DEFAULT_BAZEL_OPTIONS, }, repo_env={ From 13f4b952f3648a5726d2cadbb4ee8212feb16c6b Mon Sep 17 00:00:00 2001 From: Milica Makevic Date: Mon, 20 Apr 2026 14:48:33 +0000 Subject: [PATCH 7/8] Set spawn strategy to local --- build_tools/ci/build.py | 1 + 1 file changed, 1 insertion(+) diff --git a/build_tools/ci/build.py b/build_tools/ci/build.py index e91f64e3c196e..b30a36b852451 100755 --- a/build_tools/ci/build.py +++ b/build_tools/ci/build.py @@ -709,6 +709,7 @@ def nvidia_gpu_build_with_compute_capability( "run_under": "//build_tools/ci:parallel_gpu_execute", "//xla/tsl:ci_build": True, "remote_download_toplevel": True, # Override remote_download_minimal from rocm_rbe + "spawn_strategy": "local", **_DEFAULT_BAZEL_OPTIONS, }, repo_env={ From 1bb53918708a05ad741e5af5e2224d50a4372cb3 Mon Sep 17 00:00:00 2001 From: Milica Makevic Date: Wed, 22 Apr 2026 12:07:42 +0000 Subject: [PATCH 8/8] Add rbe image name in comment --- build_tools/ci/build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/ci/build.py b/build_tools/ci/build.py index b30a36b852451..a0c130a8be870 100755 --- a/build_tools/ci/build.py +++ b/build_tools/ci/build.py @@ -714,7 +714,7 @@ def nvidia_gpu_build_with_compute_capability( }, repo_env={ "TF_ROCM_AMDGPU_TARGETS": "gfx90a", - "TF_ROCM_RBE_DOCKER_IMAGE": "rocm/" + "TF_ROCM_RBE_DOCKER_IMAGE": "rocm/" # rocm/tensorflow-build:latest-jammy-pythonall-rocm7.2.1-ci_official "tensorflow-build@sha256:" "66eb4c1e39db76fae2eb0a1029490acbe7bfce0e00d6ab435e170f743921f4c4" },