From af4da8b678c1cc0c2887abf52e2cf034e020c153 Mon Sep 17 00:00:00 2001 From: Dmitri Latushko Date: Mon, 18 May 2026 15:10:30 -0700 Subject: [PATCH 1/2] Set default hidden visibility for the hwloc build. Fixes #112572 PiperOrigin-RevId: 917448969 --- third_party/hwloc/hwloc.BUILD | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/third_party/hwloc/hwloc.BUILD b/third_party/hwloc/hwloc.BUILD index b4c59905f5021..1d83c4b1fecda 100644 --- a/third_party/hwloc/hwloc.BUILD +++ b/third_party/hwloc/hwloc.BUILD @@ -302,7 +302,9 @@ cc_library( "include/hwloc/memattrs.h", "include/hwloc/rename.h", ], - copts = COMMON_INCLUDE_COPTS + DISABLE_WARNINGS_COPTS + VAR_SETTINGS_COPTS, + copts = COMMON_INCLUDE_COPTS + DISABLE_WARNINGS_COPTS + VAR_SETTINGS_COPTS + [ + "-fvisibility=hidden", + ], features = [ "-parse_headers", "-layering_check", From dc4ae9bc243a5d9cfcb698a387958f6e8673a24e Mon Sep 17 00:00:00 2001 From: Aleksei Nurmukhametov Date: Thu, 26 Mar 2026 05:22:21 -0500 Subject: [PATCH 2/2] [ROCm] Add Triton autotuning configs for MI300 and MI350 Extend the Triton GEMM autotuner with dedicated config sets for AMD MI300 (gfx942, 33 configs) and MI350 (gfx950, 58 configs), expanding beyond the generic 6-config ROCm default. --- xla/backends/gpu/autotuner/BUILD | 2 +- xla/backends/gpu/autotuner/triton.cc | 8 +++ xla/backends/gpu/autotuner/triton.h | 1 - .../triton/default_configs/mi300.txtpb | 47 ++++++++++++ .../triton/default_configs/mi350.txtpb | 72 +++++++++++++++++++ .../gpu/autotuner/triton/triton_configs.cc | 6 +- .../gpu/autotuner/triton/triton_configs.h | 2 + .../autotuner/triton/triton_configs_test.cc | 4 ++ 8 files changed, 139 insertions(+), 3 deletions(-) create mode 100644 xla/backends/gpu/autotuner/triton/default_configs/mi300.txtpb create mode 100644 xla/backends/gpu/autotuner/triton/default_configs/mi350.txtpb diff --git a/xla/backends/gpu/autotuner/BUILD b/xla/backends/gpu/autotuner/BUILD index d1489acac7dbd..5cd3875effdc0 100644 --- a/xla/backends/gpu/autotuner/BUILD +++ b/xla/backends/gpu/autotuner/BUILD @@ -405,7 +405,6 @@ cc_library( "//xla/service/gpu:matmul_utils", "//xla/service/gpu/model:triton_emitter_constraints", "//xla/stream_executor:device_description", - "//xla/stream_executor:stream_executor_h", "//xla/stream_executor/cuda:cuda_compute_capability", "//xla/tsl/platform:env", "//xla/tsl/platform:errors", @@ -416,6 +415,7 @@ cc_library( "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", + "@com_google_protobuf//:any_cc_proto", "@com_google_protobuf//:protobuf", "@llvm-project//mlir:IR", ], diff --git a/xla/backends/gpu/autotuner/triton.cc b/xla/backends/gpu/autotuner/triton.cc index b2ade1fb5e767..d6aebe1966c03 100644 --- a/xla/backends/gpu/autotuner/triton.cc +++ b/xla/backends/gpu/autotuner/triton.cc @@ -28,6 +28,7 @@ limitations under the License. #include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" #include "xla/tsl/platform/status_macros.h" +#include "google/protobuf/any.pb.h" #include "google/protobuf/text_format.h" #include "xla/autotuning.pb.h" #include "xla/backends/autotuner/codegen_backend.h" @@ -70,6 +71,13 @@ namespace { std::vector GetDefaultTritonConfigs( se::GpuComputeCapability compute_capability) { if (compute_capability.IsRocm()) { + const auto* rocm_cc = compute_capability.rocm_compute_capability(); + if (rocm_cc->gfx9_mi300()) { + return GetTritonConfigsForPlatform(TritonConfigsPlatform::kMI300); + } + if (rocm_cc->gfx9_mi350()) { + return GetTritonConfigsForPlatform(TritonConfigsPlatform::kMI350); + } return GetTritonConfigsForPlatform(TritonConfigsPlatform::kDefaultRocm); } diff --git a/xla/backends/gpu/autotuner/triton.h b/xla/backends/gpu/autotuner/triton.h index d0e1ff8a82fd8..e46cefa579c91 100644 --- a/xla/backends/gpu/autotuner/triton.h +++ b/xla/backends/gpu/autotuner/triton.h @@ -29,7 +29,6 @@ limitations under the License. #include "xla/hlo/ir/hlo_instruction.h" #include "xla/hlo/ir/hlo_module.h" #include "xla/service/compiler.h" -#include "xla/stream_executor/stream_executor.h" #include "xla/xla.pb.h" namespace xla { diff --git a/xla/backends/gpu/autotuner/triton/default_configs/mi300.txtpb b/xla/backends/gpu/autotuner/triton/default_configs/mi300.txtpb new file mode 100644 index 0000000000000..f2784e204a307 --- /dev/null +++ b/xla/backends/gpu/autotuner/triton/default_configs/mi300.txtpb @@ -0,0 +1,47 @@ +# Copyright 2026 The OpenXLA Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +config { block_m: 32 block_n: 32 block_k: 256 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 64 block_n: 32 block_k: 32 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 32 block_n: 64 block_k: 64 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 128 block_k: 64 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 16 block_n: 16 block_k: 256 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 16 block_n: 128 block_k: 32 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 256 block_n: 256 block_k: 32 num_stages: 2 num_warps: 8 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 256 block_k: 64 num_stages: 2 num_warps: 8 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 256 block_k: 32 num_stages: 2 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 256 block_n: 128 block_k: 64 num_stages: 2 num_warps: 8 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 128 block_k: 64 num_stages: 2 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 32 block_n: 8 block_k: 16 num_stages: 2 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 64 block_n: 32 block_k: 16 num_stages: 1 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 32 block_k: 16 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 64 block_k: 128 num_stages: 2 num_warps: 8 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 128 block_k: 32 num_stages: 3 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 256 block_n: 128 block_k: 32 num_stages: 2 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 256 block_n: 256 block_k: 32 num_stages: 1 num_warps: 8 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 32 block_k: 32 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 64 block_n: 32 block_k: 32 num_stages: 3 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 32 block_k: 32 num_stages: 2 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 32 block_n: 32 block_k: 32 num_stages: 2 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 64 block_n: 32 block_k: 128 num_stages: 2 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 256 block_n: 8 block_k: 32 num_stages: 1 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 16 block_k: 128 num_stages: 2 num_warps: 8 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 32 block_n: 16 block_k: 128 num_stages: 5 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 32 block_n: 16 block_k: 128 num_stages: 2 num_warps: 4 num_ctas: 1 waves_per_eu: 4 } +config { block_m: 64 block_n: 8 block_k: 128 num_stages: 1 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 64 block_n: 8 block_k: 128 num_stages: 1 num_warps: 2 num_ctas: 1 waves_per_eu: 4 } +config { block_m: 32 block_n: 16 block_k: 256 num_stages: 2 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 32 block_n: 16 block_k: 256 num_stages: 2 num_warps: 2 num_ctas: 1 waves_per_eu: 2 } +config { block_m: 256 block_n: 8 block_k: 16 num_stages: 2 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 8 block_k: 16 num_stages: 1 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } diff --git a/xla/backends/gpu/autotuner/triton/default_configs/mi350.txtpb b/xla/backends/gpu/autotuner/triton/default_configs/mi350.txtpb new file mode 100644 index 0000000000000..43d84ab88336e --- /dev/null +++ b/xla/backends/gpu/autotuner/triton/default_configs/mi350.txtpb @@ -0,0 +1,72 @@ +# Copyright 2026 The OpenXLA Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +config { block_m: 32 block_n: 32 block_k: 256 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 64 block_n: 32 block_k: 32 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 32 block_n: 64 block_k: 64 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 128 block_k: 64 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 16 block_n: 16 block_k: 256 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 16 block_n: 128 block_k: 32 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 256 block_n: 256 block_k: 32 num_stages: 2 num_warps: 8 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 256 block_k: 64 num_stages: 2 num_warps: 8 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 256 block_k: 32 num_stages: 2 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 256 block_n: 128 block_k: 64 num_stages: 2 num_warps: 8 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 128 block_k: 64 num_stages: 2 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 32 block_n: 8 block_k: 16 num_stages: 2 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 32 block_k: 16 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 4 } +config { block_m: 32 block_n: 8 block_k: 32 num_stages: 1 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 32 block_k: 16 num_stages: 1 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 256 block_n: 256 block_k: 16 num_stages: 4 num_warps: 8 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 256 block_n: 128 block_k: 64 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 256 block_n: 128 block_k: 64 num_stages: 2 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 256 block_n: 128 block_k: 16 num_stages: 2 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 16 block_n: 16 block_k: 128 num_stages: 3 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 32 block_n: 16 block_k: 128 num_stages: 2 num_warps: 4 num_ctas: 1 waves_per_eu: 4 } +config { block_m: 64 block_n: 8 block_k: 128 num_stages: 4 num_warps: 2 num_ctas: 1 waves_per_eu: 2 } +config { block_m: 64 block_n: 16 block_k: 64 num_stages: 2 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 32 block_k: 16 num_stages: 3 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 64 block_n: 32 block_k: 16 num_stages: 1 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 32 block_n: 16 block_k: 32 num_stages: 2 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 64 block_n: 8 block_k: 16 num_stages: 3 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 16 block_n: 8 block_k: 256 num_stages: 1 num_warps: 2 num_ctas: 1 waves_per_eu: 1 } +config { block_m: 16 block_n: 16 block_k: 128 num_stages: 2 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 16 block_n: 16 block_k: 128 num_stages: 1 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 16 block_n: 64 block_k: 128 num_stages: 2 num_warps: 2 num_ctas: 1 waves_per_eu: 4 } +config { block_m: 32 block_n: 16 block_k: 64 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 64 block_n: 8 block_k: 16 num_stages: 3 num_warps: 2 num_ctas: 1 waves_per_eu: 4 } +config { block_m: 64 block_n: 8 block_k: 64 num_stages: 2 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 64 block_n: 8 block_k: 256 num_stages: 2 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 64 block_n: 16 block_k: 256 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 8 block_k: 32 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 32 block_k: 64 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 64 block_k: 16 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 32 block_n: 32 block_k: 128 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 32 block_n: 32 block_k: 128 num_stages: 1 num_warps: 8 num_ctas: 1 waves_per_eu: 4 } +config { block_m: 32 block_n: 64 block_k: 128 num_stages: 4 num_warps: 2 num_ctas: 1 waves_per_eu: 1 } +config { block_m: 64 block_n: 32 block_k: 32 num_stages: 3 num_warps: 8 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 64 block_n: 64 block_k: 32 num_stages: 4 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 64 block_n: 64 block_k: 128 num_stages: 4 num_warps: 4 num_ctas: 1 waves_per_eu: 2 } +config { block_m: 64 block_n: 64 block_k: 128 num_stages: 3 num_warps: 4 num_ctas: 1 waves_per_eu: 1 } +config { block_m: 64 block_n: 64 block_k: 128 num_stages: 3 num_warps: 4 num_ctas: 1 waves_per_eu: 4 } +config { block_m: 64 block_n: 64 block_k: 128 num_stages: 4 num_warps: 4 num_ctas: 1 waves_per_eu: 4 } +config { block_m: 64 block_n: 128 block_k: 128 num_stages: 1 num_warps: 2 num_ctas: 1 waves_per_eu: 1 } +config { block_m: 128 block_n: 64 block_k: 64 num_stages: 2 num_warps: 4 num_ctas: 1 waves_per_eu: 2 } +config { block_m: 128 block_n: 64 block_k: 128 num_stages: 2 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 128 block_k: 32 num_stages: 2 num_warps: 8 num_ctas: 1 waves_per_eu: 1 } +config { block_m: 128 block_n: 128 block_k: 128 num_stages: 3 num_warps: 8 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 128 block_k: 128 num_stages: 3 num_warps: 8 num_ctas: 1 waves_per_eu: 2 } +config { block_m: 128 block_n: 128 block_k: 128 num_stages: 1 num_warps: 8 num_ctas: 1 waves_per_eu: 4 } +config { block_m: 128 block_n: 256 block_k: 64 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 2 } +config { block_m: 256 block_n: 256 block_k: 64 num_stages: 3 num_warps: 8 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 256 block_n: 256 block_k: 64 num_stages: 3 num_warps: 8 num_ctas: 1 waves_per_eu: 1 } diff --git a/xla/backends/gpu/autotuner/triton/triton_configs.cc b/xla/backends/gpu/autotuner/triton/triton_configs.cc index b37b00f3529b1..36712718cacd9 100644 --- a/xla/backends/gpu/autotuner/triton/triton_configs.cc +++ b/xla/backends/gpu/autotuner/triton/triton_configs.cc @@ -73,7 +73,11 @@ const std::vector& GetTritonConfigsForPlatform( {TritonConfigsPlatform::kDefaultRocm, ParseConfig(GetDefaultConfigStr("rocm.txtpb"))}, {TritonConfigsPlatform::kHopper, - ParseConfig(GetDefaultConfigStr("h100.txtpb"))}}); + ParseConfig(GetDefaultConfigStr("h100.txtpb"))}, + {TritonConfigsPlatform::kMI300, + ParseConfig(GetDefaultConfigStr("mi300.txtpb"))}, + {TritonConfigsPlatform::kMI350, + ParseConfig(GetDefaultConfigStr("mi350.txtpb"))}}); return kConfigs->at(platform); } diff --git a/xla/backends/gpu/autotuner/triton/triton_configs.h b/xla/backends/gpu/autotuner/triton/triton_configs.h index 5f72fa2190e37..f8a3658876c0f 100644 --- a/xla/backends/gpu/autotuner/triton/triton_configs.h +++ b/xla/backends/gpu/autotuner/triton/triton_configs.h @@ -29,6 +29,8 @@ enum class TritonConfigsPlatform { kDefaultCuda, kDefaultRocm, kHopper, + kMI300, + kMI350, }; const std::vector& GetTritonConfigsForPlatform( diff --git a/xla/backends/gpu/autotuner/triton/triton_configs_test.cc b/xla/backends/gpu/autotuner/triton/triton_configs_test.cc index 8d8f8c7b30895..eb60b261978d1 100644 --- a/xla/backends/gpu/autotuner/triton/triton_configs_test.cc +++ b/xla/backends/gpu/autotuner/triton/triton_configs_test.cc @@ -34,6 +34,10 @@ TEST(TritonConfigsTest, PlatformsReturnNonEmptyConfig) { SizeIs(2)); EXPECT_THAT(GetTritonConfigsForPlatform(TritonConfigsPlatform::kHopper), SizeIs(25)); + EXPECT_THAT(GetTritonConfigsForPlatform(TritonConfigsPlatform::kMI300), + SizeIs(33)); + EXPECT_THAT(GetTritonConfigsForPlatform(TritonConfigsPlatform::kMI350), + SizeIs(58)); } } // namespace