diff --git a/third_party/hwloc/hwloc.BUILD b/third_party/hwloc/hwloc.BUILD index b4c59905f5021..1d83c4b1fecda 100644 --- a/third_party/hwloc/hwloc.BUILD +++ b/third_party/hwloc/hwloc.BUILD @@ -302,7 +302,9 @@ cc_library( "include/hwloc/memattrs.h", "include/hwloc/rename.h", ], - copts = COMMON_INCLUDE_COPTS + DISABLE_WARNINGS_COPTS + VAR_SETTINGS_COPTS, + copts = COMMON_INCLUDE_COPTS + DISABLE_WARNINGS_COPTS + VAR_SETTINGS_COPTS + [ + "-fvisibility=hidden", + ], features = [ "-parse_headers", "-layering_check", diff --git a/xla/backends/gpu/autotuner/BUILD b/xla/backends/gpu/autotuner/BUILD index d1489acac7dbd..5cd3875effdc0 100644 --- a/xla/backends/gpu/autotuner/BUILD +++ b/xla/backends/gpu/autotuner/BUILD @@ -405,7 +405,6 @@ cc_library( "//xla/service/gpu:matmul_utils", "//xla/service/gpu/model:triton_emitter_constraints", "//xla/stream_executor:device_description", - "//xla/stream_executor:stream_executor_h", "//xla/stream_executor/cuda:cuda_compute_capability", "//xla/tsl/platform:env", "//xla/tsl/platform:errors", @@ -416,6 +415,7 @@ cc_library( "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", + "@com_google_protobuf//:any_cc_proto", "@com_google_protobuf//:protobuf", "@llvm-project//mlir:IR", ], diff --git a/xla/backends/gpu/autotuner/triton.cc b/xla/backends/gpu/autotuner/triton.cc index b2ade1fb5e767..d6aebe1966c03 100644 --- a/xla/backends/gpu/autotuner/triton.cc +++ b/xla/backends/gpu/autotuner/triton.cc @@ -28,6 +28,7 @@ limitations under the License. #include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" #include "xla/tsl/platform/status_macros.h" +#include "google/protobuf/any.pb.h" #include "google/protobuf/text_format.h" #include "xla/autotuning.pb.h" #include "xla/backends/autotuner/codegen_backend.h" @@ -70,6 +71,13 @@ namespace { std::vector GetDefaultTritonConfigs( se::GpuComputeCapability compute_capability) { if (compute_capability.IsRocm()) { + const auto* rocm_cc = compute_capability.rocm_compute_capability(); + if (rocm_cc->gfx9_mi300()) { + return GetTritonConfigsForPlatform(TritonConfigsPlatform::kMI300); + } + if (rocm_cc->gfx9_mi350()) { + return GetTritonConfigsForPlatform(TritonConfigsPlatform::kMI350); + } return GetTritonConfigsForPlatform(TritonConfigsPlatform::kDefaultRocm); } diff --git a/xla/backends/gpu/autotuner/triton.h b/xla/backends/gpu/autotuner/triton.h index d0e1ff8a82fd8..e46cefa579c91 100644 --- a/xla/backends/gpu/autotuner/triton.h +++ b/xla/backends/gpu/autotuner/triton.h @@ -29,7 +29,6 @@ limitations under the License. #include "xla/hlo/ir/hlo_instruction.h" #include "xla/hlo/ir/hlo_module.h" #include "xla/service/compiler.h" -#include "xla/stream_executor/stream_executor.h" #include "xla/xla.pb.h" namespace xla { diff --git a/xla/backends/gpu/autotuner/triton/default_configs/mi300.txtpb b/xla/backends/gpu/autotuner/triton/default_configs/mi300.txtpb new file mode 100644 index 0000000000000..f2784e204a307 --- /dev/null +++ b/xla/backends/gpu/autotuner/triton/default_configs/mi300.txtpb @@ -0,0 +1,47 @@ +# Copyright 2026 The OpenXLA Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +config { block_m: 32 block_n: 32 block_k: 256 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 64 block_n: 32 block_k: 32 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 32 block_n: 64 block_k: 64 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 128 block_k: 64 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 16 block_n: 16 block_k: 256 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 16 block_n: 128 block_k: 32 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 256 block_n: 256 block_k: 32 num_stages: 2 num_warps: 8 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 256 block_k: 64 num_stages: 2 num_warps: 8 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 256 block_k: 32 num_stages: 2 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 256 block_n: 128 block_k: 64 num_stages: 2 num_warps: 8 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 128 block_k: 64 num_stages: 2 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 32 block_n: 8 block_k: 16 num_stages: 2 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 64 block_n: 32 block_k: 16 num_stages: 1 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 32 block_k: 16 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 64 block_k: 128 num_stages: 2 num_warps: 8 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 128 block_k: 32 num_stages: 3 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 256 block_n: 128 block_k: 32 num_stages: 2 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 256 block_n: 256 block_k: 32 num_stages: 1 num_warps: 8 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 32 block_k: 32 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 64 block_n: 32 block_k: 32 num_stages: 3 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 32 block_k: 32 num_stages: 2 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 32 block_n: 32 block_k: 32 num_stages: 2 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 64 block_n: 32 block_k: 128 num_stages: 2 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 256 block_n: 8 block_k: 32 num_stages: 1 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 16 block_k: 128 num_stages: 2 num_warps: 8 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 32 block_n: 16 block_k: 128 num_stages: 5 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 32 block_n: 16 block_k: 128 num_stages: 2 num_warps: 4 num_ctas: 1 waves_per_eu: 4 } +config { block_m: 64 block_n: 8 block_k: 128 num_stages: 1 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 64 block_n: 8 block_k: 128 num_stages: 1 num_warps: 2 num_ctas: 1 waves_per_eu: 4 } +config { block_m: 32 block_n: 16 block_k: 256 num_stages: 2 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 32 block_n: 16 block_k: 256 num_stages: 2 num_warps: 2 num_ctas: 1 waves_per_eu: 2 } +config { block_m: 256 block_n: 8 block_k: 16 num_stages: 2 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 8 block_k: 16 num_stages: 1 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } diff --git a/xla/backends/gpu/autotuner/triton/default_configs/mi350.txtpb b/xla/backends/gpu/autotuner/triton/default_configs/mi350.txtpb new file mode 100644 index 0000000000000..43d84ab88336e --- /dev/null +++ b/xla/backends/gpu/autotuner/triton/default_configs/mi350.txtpb @@ -0,0 +1,72 @@ +# Copyright 2026 The OpenXLA Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +config { block_m: 32 block_n: 32 block_k: 256 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 64 block_n: 32 block_k: 32 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 32 block_n: 64 block_k: 64 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 128 block_k: 64 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 16 block_n: 16 block_k: 256 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 16 block_n: 128 block_k: 32 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 256 block_n: 256 block_k: 32 num_stages: 2 num_warps: 8 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 256 block_k: 64 num_stages: 2 num_warps: 8 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 256 block_k: 32 num_stages: 2 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 256 block_n: 128 block_k: 64 num_stages: 2 num_warps: 8 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 128 block_k: 64 num_stages: 2 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 32 block_n: 8 block_k: 16 num_stages: 2 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 32 block_k: 16 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 4 } +config { block_m: 32 block_n: 8 block_k: 32 num_stages: 1 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 32 block_k: 16 num_stages: 1 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 256 block_n: 256 block_k: 16 num_stages: 4 num_warps: 8 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 256 block_n: 128 block_k: 64 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 256 block_n: 128 block_k: 64 num_stages: 2 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 256 block_n: 128 block_k: 16 num_stages: 2 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 16 block_n: 16 block_k: 128 num_stages: 3 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 32 block_n: 16 block_k: 128 num_stages: 2 num_warps: 4 num_ctas: 1 waves_per_eu: 4 } +config { block_m: 64 block_n: 8 block_k: 128 num_stages: 4 num_warps: 2 num_ctas: 1 waves_per_eu: 2 } +config { block_m: 64 block_n: 16 block_k: 64 num_stages: 2 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 32 block_k: 16 num_stages: 3 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 64 block_n: 32 block_k: 16 num_stages: 1 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 32 block_n: 16 block_k: 32 num_stages: 2 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 64 block_n: 8 block_k: 16 num_stages: 3 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 16 block_n: 8 block_k: 256 num_stages: 1 num_warps: 2 num_ctas: 1 waves_per_eu: 1 } +config { block_m: 16 block_n: 16 block_k: 128 num_stages: 2 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 16 block_n: 16 block_k: 128 num_stages: 1 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 16 block_n: 64 block_k: 128 num_stages: 2 num_warps: 2 num_ctas: 1 waves_per_eu: 4 } +config { block_m: 32 block_n: 16 block_k: 64 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 64 block_n: 8 block_k: 16 num_stages: 3 num_warps: 2 num_ctas: 1 waves_per_eu: 4 } +config { block_m: 64 block_n: 8 block_k: 64 num_stages: 2 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 64 block_n: 8 block_k: 256 num_stages: 2 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 64 block_n: 16 block_k: 256 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 8 block_k: 32 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 32 block_k: 64 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 64 block_k: 16 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 32 block_n: 32 block_k: 128 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 32 block_n: 32 block_k: 128 num_stages: 1 num_warps: 8 num_ctas: 1 waves_per_eu: 4 } +config { block_m: 32 block_n: 64 block_k: 128 num_stages: 4 num_warps: 2 num_ctas: 1 waves_per_eu: 1 } +config { block_m: 64 block_n: 32 block_k: 32 num_stages: 3 num_warps: 8 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 64 block_n: 64 block_k: 32 num_stages: 4 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 64 block_n: 64 block_k: 128 num_stages: 4 num_warps: 4 num_ctas: 1 waves_per_eu: 2 } +config { block_m: 64 block_n: 64 block_k: 128 num_stages: 3 num_warps: 4 num_ctas: 1 waves_per_eu: 1 } +config { block_m: 64 block_n: 64 block_k: 128 num_stages: 3 num_warps: 4 num_ctas: 1 waves_per_eu: 4 } +config { block_m: 64 block_n: 64 block_k: 128 num_stages: 4 num_warps: 4 num_ctas: 1 waves_per_eu: 4 } +config { block_m: 64 block_n: 128 block_k: 128 num_stages: 1 num_warps: 2 num_ctas: 1 waves_per_eu: 1 } +config { block_m: 128 block_n: 64 block_k: 64 num_stages: 2 num_warps: 4 num_ctas: 1 waves_per_eu: 2 } +config { block_m: 128 block_n: 64 block_k: 128 num_stages: 2 num_warps: 2 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 128 block_k: 32 num_stages: 2 num_warps: 8 num_ctas: 1 waves_per_eu: 1 } +config { block_m: 128 block_n: 128 block_k: 128 num_stages: 3 num_warps: 8 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 128 block_n: 128 block_k: 128 num_stages: 3 num_warps: 8 num_ctas: 1 waves_per_eu: 2 } +config { block_m: 128 block_n: 128 block_k: 128 num_stages: 1 num_warps: 8 num_ctas: 1 waves_per_eu: 4 } +config { block_m: 128 block_n: 256 block_k: 64 num_stages: 1 num_warps: 4 num_ctas: 1 waves_per_eu: 2 } +config { block_m: 256 block_n: 256 block_k: 64 num_stages: 3 num_warps: 8 num_ctas: 1 waves_per_eu: 0 } +config { block_m: 256 block_n: 256 block_k: 64 num_stages: 3 num_warps: 8 num_ctas: 1 waves_per_eu: 1 } diff --git a/xla/backends/gpu/autotuner/triton/triton_configs.cc b/xla/backends/gpu/autotuner/triton/triton_configs.cc index b37b00f3529b1..36712718cacd9 100644 --- a/xla/backends/gpu/autotuner/triton/triton_configs.cc +++ b/xla/backends/gpu/autotuner/triton/triton_configs.cc @@ -73,7 +73,11 @@ const std::vector& GetTritonConfigsForPlatform( {TritonConfigsPlatform::kDefaultRocm, ParseConfig(GetDefaultConfigStr("rocm.txtpb"))}, {TritonConfigsPlatform::kHopper, - ParseConfig(GetDefaultConfigStr("h100.txtpb"))}}); + ParseConfig(GetDefaultConfigStr("h100.txtpb"))}, + {TritonConfigsPlatform::kMI300, + ParseConfig(GetDefaultConfigStr("mi300.txtpb"))}, + {TritonConfigsPlatform::kMI350, + ParseConfig(GetDefaultConfigStr("mi350.txtpb"))}}); return kConfigs->at(platform); } diff --git a/xla/backends/gpu/autotuner/triton/triton_configs.h b/xla/backends/gpu/autotuner/triton/triton_configs.h index 5f72fa2190e37..f8a3658876c0f 100644 --- a/xla/backends/gpu/autotuner/triton/triton_configs.h +++ b/xla/backends/gpu/autotuner/triton/triton_configs.h @@ -29,6 +29,8 @@ enum class TritonConfigsPlatform { kDefaultCuda, kDefaultRocm, kHopper, + kMI300, + kMI350, }; const std::vector& GetTritonConfigsForPlatform( diff --git a/xla/backends/gpu/autotuner/triton/triton_configs_test.cc b/xla/backends/gpu/autotuner/triton/triton_configs_test.cc index 8d8f8c7b30895..eb60b261978d1 100644 --- a/xla/backends/gpu/autotuner/triton/triton_configs_test.cc +++ b/xla/backends/gpu/autotuner/triton/triton_configs_test.cc @@ -34,6 +34,10 @@ TEST(TritonConfigsTest, PlatformsReturnNonEmptyConfig) { SizeIs(2)); EXPECT_THAT(GetTritonConfigsForPlatform(TritonConfigsPlatform::kHopper), SizeIs(25)); + EXPECT_THAT(GetTritonConfigsForPlatform(TritonConfigsPlatform::kMI300), + SizeIs(33)); + EXPECT_THAT(GetTritonConfigsForPlatform(TritonConfigsPlatform::kMI350), + SizeIs(58)); } } // namespace