diff --git a/mlir/include/mlir/Dialect/Rock/Tuning/GridwiseGemmParams.h b/mlir/include/mlir/Dialect/Rock/Tuning/GridwiseGemmParams.h index 6f0c8ffbeaf5..7d1e52fa6f27 100644 --- a/mlir/include/mlir/Dialect/Rock/Tuning/GridwiseGemmParams.h +++ b/mlir/include/mlir/Dialect/Rock/Tuning/GridwiseGemmParams.h @@ -56,6 +56,17 @@ int64_t obtainBlockSize(int64_t waveSize, int64_t mPerBlock, int64_t nPerBlock, int64_t obtainBlockSize(int64_t waveSize, RockAccelTuningParamAttrInterface params); +/// Raw convolution dimensions for classifier features. +struct ConvMeta { + int64_t batchN = 1; + int64_t cChannels = 1, kChannels = 1; + int64_t inH = 1, inW = 1; + int64_t filterH = 1, filterW = 1; + int64_t padH = 0, padW = 0; + int64_t strideH = 1, strideW = 1; + int64_t dilH = 1, dilW = 1; +}; + /// Store information useful for populating perf configurations struct PopulateParamsInfo { GemmSize gemmSize; @@ -67,6 +78,7 @@ struct PopulateParamsInfo { int64_t batchSize; uint32_t numCu; bool hasFusedReduction; + std::optional convMeta; PopulateParamsInfo(GemmSize gemmSize, StringRef arch, GemmFeatures gemmFeatures, Type gemmAType, Type gemmBType, diff --git a/mlir/include/mlir/Dialect/Rock/Tuning/QuickTuningClassifier.h b/mlir/include/mlir/Dialect/Rock/Tuning/QuickTuningClassifier.h new file mode 100644 index 000000000000..fcc538d16dfa --- /dev/null +++ b/mlir/include/mlir/Dialect/Rock/Tuning/QuickTuningClassifier.h @@ -0,0 +1,54 @@ +//===- QuickTuningClassifier.h - XGBoost-based perfconfig ranking ---------===// +// +// Part of the rocMLIR Project, under the Apache License v2.0 with LLVM +// Exceptions. See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (c) 2025 Advanced Micro Devices Inc. 
+//===----------------------------------------------------------------------===// +// +// This file declares the QuickTuningClassifier, which uses XGBoost models to +// rank quick-tune perfconfigs and select the top-N most likely performant ones +// for a given problem. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_ROCK_TUNING_QUICK_TUNING_CLASSIFIER_H +#define MLIR_DIALECT_ROCK_TUNING_QUICK_TUNING_CLASSIFIER_H + +#include "mlir/Dialect/Rock/IR/Rock.h" +#include "mlir/Dialect/Rock/IR/RockGemmGemmWrapperInterface.h" +#include "mlir/Dialect/Rock/Tuning/GridwiseGemmGemmParams.h" +#include "mlir/Dialect/Rock/Tuning/GridwiseGemmParams.h" +#include "llvm/ADT/ArrayRef.h" +#include + +namespace mlir { +namespace rock { + +class QuickTuningClassifier { +public: + /// Read ROCMLIR_QUICK_TUNE_TOP_N env var. Default 30, 0 disables classifier. + static unsigned getTopN(); + + /// Filter XDL/WMMA candidates down to the top-N using the classifier. + /// Returns the full list if no model is found or top-N is 0. + static std::vector + filterTopN(const PopulateParamsInfo &info, + llvm::ArrayRef candidates); + + /// Filter non-accel candidates down to the top-N. + static std::vector + filterTopN(const PopulateParamsInfo &info, + llvm::ArrayRef candidates); + + /// Filter gemm-gemm (attention) candidates down to the top-N. 
+ static std::vector + filterTopN(RockGemmGemmWrapperInterface op, + llvm::ArrayRef candidates); +}; + +} // namespace rock +} // namespace mlir + +#endif // MLIR_DIALECT_ROCK_TUNING_QUICK_TUNING_CLASSIFIER_H diff --git a/mlir/include/mlir/Dialect/Rock/Tuning/QuickTuningPerfconfigs.inc b/mlir/include/mlir/Dialect/Rock/Tuning/QuickTuningPerfconfigs.inc index 9024879a1b68..18a48591ccb8 100644 --- a/mlir/include/mlir/Dialect/Rock/Tuning/QuickTuningPerfconfigs.inc +++ b/mlir/include/mlir/Dialect/Rock/Tuning/QuickTuningPerfconfigs.inc @@ -23,24 +23,6 @@ const StringRef PopulateParams::initParametersF32GemmGfx1000[] = { }; // END_GEMM_NonAccel_f32_gfx1000_DEFS -// BEGIN_GEMM_NonAccel_f32_gfx1100_DEFS -const StringRef PopulateParams::initParametersF32GemmGfx1100[] = { - "v3:64,64,32,16,2,2,1,1,2", - "v3:64,64,64,4,2,2,1,1,2", - "v3:128,128,128,16,2,2,1,1,2", - "v3:64,32,128,16,2,2,1,1,2", - "v3:128,128,32,16,4,2,1,1,2", - "v3:64,32,64,8,4,4,1,1,2", - "v3:64,64,128,8,4,2,1,1,2", - "v3:64,32,32,4,2,2,1,1,2", - "v3:128,64,64,4,2,2,1,1,2", - "v3:64,32,32,16,2,4,1,1,2", - "v3:128,32,64,16,2,4,1,1,2", - "v3:256,64,32,16,4,2,1,1,2", - "v3:256,64,128,8,2,2,1,1,2" -}; -// END_GEMM_NonAccel_f32_gfx1100_DEFS - // BEGIN_CONV_NonAccel_f32_gfx1000_DEFS const StringRef PopulateParams::initParametersF32ConvGfx1000[] = { "v3:128,128,128,4,2,4,1,1,2", @@ -70,82 +52,51 @@ const StringRef PopulateParams::initParametersF32ConvGfx1000[] = { }; // END_CONV_NonAccel_f32_gfx1000_DEFS -// BEGIN_CONV_NonAccel_f32_gfx1100_DEFS -const StringRef PopulateParams::initParametersF32ConvGfx1100[] = { - "v3:64,32,32,8,4,4,1,1,2", - "v3:64,32,32,8,2,2,1,1,2", - "v3:128,32,32,4,2,2,1,1,2", - "v3:128,32,64,4,2,2,1,1,2", - "v3:64,32,32,16,2,4,1,1,2", - "v3:64,32,32,4,2,4,1,1,2", - "v3:64,64,32,16,2,2,1,1,2", - "v3:64,64,128,4,2,2,1,1,2", - "v3:128,32,32,16,2,2,1,1,2", - "v3:64,32,64,8,4,2,1,1,2", - "v3:128,32,32,16,2,4,1,1,2", - "v3:64,64,64,16,2,2,1,1,2", - "v3:64,64,64,4,4,2,1,1,2", - 
"v3:128,32,32,8,2,4,1,1,2", - "v3:128,128,128,8,2,2,1,1,2", - "v3:256,32,32,16,2,2,1,1,2", - "v3:64,128,64,8,2,4,1,1,2", - "v3:64,64,64,16,2,4,1,1,2", - "v3:256,32,32,8,2,2,1,1,2", - "v3:64,64,128,16,4,4,1,1,2", - "v3:128,64,128,16,2,2,1,1,2", - "v3:128,128,32,16,2,4,1,1,2", - "v3:64,32,32,4,2,2,1,1,2", - "v3:64,64,64,4,2,2,1,1,2", - "v3:128,32,32,4,2,4,1,1,2" -}; -// END_CONV_NonAccel_f32_gfx1100_DEFS - // BEGIN_GEMM_NonAccel_f32_gfx1201_DEFS const StringRef PopulateParams::initParametersF32GemmGfx1201[] = { - "v3:128,128,128,16,2,2,1,1,2", + "v3:64,64,128,4,2,4,1,1,2", + "v3:128,128,128,8,2,4,1,1,2", "v3:64,128,64,16,2,2,1,1,2", - "v3:64,64,32,16,2,2,1,1,2", - "v3:128,128,128,4,4,2,1,1,2", - "v3:64,64,64,16,4,4,1,1,2", - "v3:64,64,128,16,4,4,1,1,2", - "v3:64,64,64,8,2,4,1,1,2", - "v3:128,128,64,8,2,4,1,1,2", "v3:128,32,32,16,2,2,1,1,2", - "v3:64,32,32,8,2,2,1,1,2" + "v3:64,64,128,16,2,2,1,1,2", + "v3:64,128,64,16,4,4,1,1,2", + "v3:64,128,64,4,2,4,1,1,2", + "v3:64,32,64,8,2,4,1,1,2", + "v3:64,32,128,8,2,2,1,1,2", + "v3:64,64,64,8,2,4,1,1,2", + "v3:128,64,64,8,2,2,1,1,2", + "v3:256,64,64,8,4,2,1,1,2" }; // END_GEMM_NonAccel_f32_gfx1201_DEFS // BEGIN_CONV_NonAccel_f32_gfx1201_DEFS const StringRef PopulateParams::initParametersF32ConvGfx1201[] = { - "v3:64,64,128,4,2,4,1,1,2", - "v3:64,64,128,8,2,4,1,1,2", + "v3:64,64,128,4,2,2,1,1,2", + "v3:128,64,64,8,2,2,1,1,2", + "v3:64,64,128,8,2,2,1,1,2", + "v3:64,64,64,16,2,2,1,1,2", + "v3:64,64,64,16,2,4,1,1,2", "v3:64,64,64,4,4,4,1,1,2", - "v3:64,128,64,16,2,2,1,1,2", - "v3:64,64,128,8,4,4,1,1,2", + "v3:64,32,128,8,2,2,1,1,2", "v3:64,64,128,16,2,2,1,1,2", - "v3:64,64,64,4,2,2,1,1,2", - "v3:64,64,128,16,2,4,1,1,2", - "v3:64,64,64,16,2,2,1,1,2", - "v3:64,32,128,4,2,2,1,1,2", - "v3:128,128,128,16,2,2,1,1,2", - "v3:128,128,128,16,4,2,1,1,2", - "v3:128,128,128,8,2,2,1,1,2", - "v3:64,64,32,8,2,2,1,1,2", - "v3:64,64,32,16,2,4,1,1,2", - "v3:128,128,64,8,4,4,1,1,2", - "v3:64,32,64,8,2,4,1,1,2", - "v3:64,64,64,16,4,4,1,1,2", - 
"v3:128,64,64,8,2,4,1,1,2", + "v3:64,32,32,8,2,2,1,1,2", + "v3:64,32,64,8,4,4,1,1,2", + "v3:128,128,128,16,2,4,1,1,2", "v3:128,128,128,4,2,2,1,1,2", - "v3:128,128,32,16,2,4,1,1,2", + "v3:64,64,128,16,2,4,1,1,2", "v3:128,32,32,16,2,4,1,1,2", - "v3:64,128,64,8,4,2,1,1,2", - "v3:64,32,32,8,2,2,1,1,2", + "v3:64,32,64,4,4,2,1,1,2", "v3:256,128,128,8,4,2,1,1,2", - "v3:128,64,32,16,2,2,1,1,2", - "v3:128,128,64,16,2,4,1,1,2", - "v3:256,128,128,16,4,4,1,1,2", - "v3:256,32,32,16,2,2,1,1,2" + "v3:128,64,128,8,4,4,1,1,2", + "v3:64,32,128,4,4,2,1,1,2", + "v3:128,128,128,8,2,4,1,1,2", + "v3:64,128,64,16,4,4,1,1,2", + "v3:64,32,32,16,2,2,1,1,2", + "v3:128,128,64,8,2,4,1,1,2", + "v3:64,64,32,8,2,2,1,1,2", + "v3:256,128,128,16,2,4,1,1,2", + "v3:128,128,64,8,4,4,1,1,2", + "v3:128,128,32,16,2,2,1,1,2" }; // END_CONV_NonAccel_f32_gfx1201_DEFS @@ -209,69 +160,139 @@ const StringRef PopulateParams::initParametersF32ConvGfx1151[] = { // BEGIN_GEMM_NonAccel_f32_gfx1150_DEFS const StringRef PopulateParams::initParametersF32GemmGfx1150[] = { + "v3:64,64,128,16,2,4,1,1,2", + "v3:64,128,32,16,2,2,1,1,2", "v3:64,64,128,4,2,2,1,1,2", - "v3:128,128,128,16,4,2,1,1,2", - "v3:64,64,128,16,2,2,1,1,2", - "v3:64,128,64,16,2,2,1,1,2", - "v3:64,64,64,8,2,2,1,1,2", - "v3:64,128,32,16,2,4,1,1,2", - "v3:256,128,128,8,4,4,1,1,2", "v3:128,32,32,16,2,4,1,1,2", - "v3:64,32,64,8,4,4,1,1,2", - "v3:128,64,128,4,2,2,1,1,2", - "v3:256,64,128,16,4,4,1,1,2", - "v3:128,64,64,4,4,2,1,1,2", - "v3:64,32,32,16,4,2,1,1,2" + "v3:64,64,64,16,2,2,1,1,2", + "v3:128,128,128,8,2,4,1,1,2", + "v3:64,64,128,4,2,4,1,1,2", + "v3:64,64,128,4,4,4,1,1,2", + "v3:64,64,64,4,2,2,1,1,2", + "v3:64,128,64,8,2,2,1,1,2", + "v3:256,32,64,16,2,4,1,1,2", + "v3:64,128,64,4,2,2,1,1,2", + "v3:64,64,32,16,2,2,1,1,2", + "v3:256,128,128,8,4,4,1,1,2" }; // END_GEMM_NonAccel_f32_gfx1150_DEFS // BEGIN_CONV_NonAccel_f32_gfx1150_DEFS const StringRef PopulateParams::initParametersF32ConvGfx1150[] = { - "v3:64,64,128,16,2,2,1,1,2", - 
"v3:64,64,128,8,2,2,1,1,2", "v3:64,64,128,4,2,2,1,1,2", - "v3:64,64,128,4,2,4,1,1,2", - "v3:64,64,128,16,4,2,1,1,2", - "v3:64,64,128,8,2,4,1,1,2", - "v3:64,128,64,16,2,2,1,1,2", + "v3:64,64,128,8,2,2,1,1,2", "v3:64,64,128,16,4,4,1,1,2", - "v3:64,64,128,4,4,2,1,1,2", - "v3:128,128,128,16,2,4,1,1,2", - "v3:128,128,128,16,4,2,1,1,2", - "v3:64,64,64,4,2,4,1,1,2", - "v3:64,32,128,4,2,2,1,1,2", - "v3:64,64,64,8,4,2,1,1,2", - "v3:64,64,64,16,2,2,1,1,2", + "v3:64,64,128,16,2,2,1,1,2", "v3:128,128,128,4,2,2,1,1,2", - "v3:64,128,64,16,4,2,1,1,2", - "v3:128,128,128,4,2,4,1,1,2", - "v3:64,32,64,4,2,2,1,1,2", - "v3:64,64,32,16,2,2,1,1,2", + "v3:64,64,128,8,2,4,1,1,2", + "v3:64,128,64,16,2,4,1,1,2", + "v3:64,64,128,8,4,2,1,1,2", + "v3:128,64,128,4,2,2,1,1,2", + "v3:64,128,64,16,4,4,1,1,2", + "v3:64,64,64,16,2,2,1,1,2", + "v3:64,128,64,8,2,2,1,1,2", + "v3:128,64,128,8,2,2,1,1,2", + "v3:64,64,64,8,2,4,1,1,2", + "v3:128,128,128,4,4,2,1,1,2", + "v3:64,64,128,4,4,4,1,1,2", + "v3:128,64,128,8,2,4,1,1,2", + "v3:128,64,128,4,2,4,1,1,2", + "v3:64,32,128,8,2,2,1,1,2", + "v3:64,64,64,4,2,2,1,1,2", + "v3:64,64,64,8,4,2,1,1,2", + "v3:64,64,64,16,4,2,1,1,2", + "v3:128,128,128,16,2,2,1,1,2", + "v3:128,64,128,4,4,2,1,1,2", + "v3:64,64,64,8,4,4,1,1,2", + "v3:64,32,128,4,2,2,1,1,2", + "v3:128,64,128,4,4,4,1,1,2", + "v3:128,128,128,16,4,2,1,1,2", + "v3:128,64,128,8,4,4,1,1,2", + "v3:256,128,128,8,2,2,1,1,2", + "v3:256,64,128,8,2,2,1,1,2", + "v3:64,32,128,16,2,2,1,1,2", + "v3:64,128,64,4,4,4,1,1,2", + "v3:256,64,128,8,4,2,1,1,2", + "v3:256,64,64,16,2,4,1,1,2", + "v3:128,64,64,16,2,2,1,1,2", "v3:64,32,64,16,2,2,1,1,2", + "v3:128,32,32,16,2,2,1,1,2", + "v3:256,32,64,16,2,2,1,1,2", + "v3:128,128,128,8,4,4,1,1,2", + "v3:64,128,32,16,2,4,1,1,2", + "v3:128,128,64,16,2,2,1,1,2", + "v3:256,64,128,16,4,2,1,1,2", + "v3:128,32,64,16,2,4,1,1,2", + "v3:64,128,32,16,4,4,1,1,2", + "v3:128,64,128,16,4,4,1,1,2", "v3:128,128,64,8,4,2,1,1,2", - "v3:64,128,64,4,4,4,1,1,2", - "v3:256,128,128,8,2,2,1,1,2", - 
"v3:64,128,32,16,4,2,1,1,2", - "v3:64,64,32,8,2,4,1,1,2", - "v3:128,32,32,16,2,4,1,1,2", - "v3:128,64,64,16,2,4,1,1,2", - "v3:128,64,32,16,2,2,1,1,2", - "v3:64,32,32,8,2,2,1,1,2", - "v3:64,128,32,8,4,2,1,1,2", - "v3:256,128,128,4,2,4,1,1,2", + "v3:128,32,128,16,2,2,1,1,2", + "v3:128,64,32,16,4,2,1,1,2", + "v3:256,32,32,16,2,2,1,1,2", + "v3:128,128,32,16,4,2,1,1,2", + "v3:64,64,32,4,4,2,1,1,2", "v3:128,128,32,16,2,4,1,1,2", - "v3:256,128,128,16,4,2,1,1,2", - "v3:64,128,32,8,2,4,1,1,2", - "v3:128,64,64,16,4,2,1,1,2", - "v3:256,64,64,16,2,4,1,1,2", - "v3:128,64,64,8,4,4,1,1,2", - "v3:256,32,128,8,2,2,1,1,2", - "v3:256,32,64,8,2,2,1,1,2", - "v3:256,64,32,16,2,2,1,1,2", - "v3:128,64,32,4,2,2,1,1,2" + "v3:128,128,64,4,4,4,1,1,2", + "v3:64,32,64,4,2,2,1,1,2", + "v3:64,32,64,4,4,4,1,1,2", + "v3:64,32,32,8,4,4,1,1,2" }; // END_CONV_NonAccel_f32_gfx1150_DEFS +// BEGIN_GEMM_NonAccel_f32_gfx1101_DEFS +const StringRef PopulateParams::initParametersF32GemmGfx1101[] = { + "v3:128,128,128,8,2,4,1,1,2", + "v3:128,32,32,16,2,2,1,1,2", + "v3:64,128,64,16,2,4,1,1,2", + "v3:64,64,128,4,2,2,1,1,2", + "v3:64,64,128,8,2,4,1,1,2", + "v3:64,64,64,16,4,2,1,1,2", + "v3:64,64,128,16,4,2,1,1,2", + "v3:64,128,64,4,2,2,1,1,2", + "v3:64,32,32,8,2,4,1,1,2", + "v3:128,128,128,8,2,2,1,1,2", + "v3:64,32,64,8,4,4,1,1,2", + "v3:64,32,128,16,4,2,1,1,2", + "v3:128,128,64,8,2,2,1,1,2" +}; +// END_GEMM_NonAccel_f32_gfx1101_DEFS + +// BEGIN_CONV_NonAccel_f32_gfx1101_DEFS +const StringRef PopulateParams::initParametersF32ConvGfx1101[] = { + "v3:64,64,128,4,2,2,1,1,2", + "v3:64,64,64,4,2,2,1,1,2", + "v3:64,64,128,4,2,4,1,1,2", + "v3:64,64,128,8,2,4,1,1,2", + "v3:128,64,64,8,2,4,1,1,2", + "v3:64,64,64,4,4,2,1,1,2", + "v3:64,32,32,8,2,2,1,1,2", + "v3:64,64,128,8,2,2,1,1,2", + "v3:64,64,128,8,4,4,1,1,2", + "v3:128,64,64,8,2,2,1,1,2", + "v3:64,32,64,8,4,2,1,1,2", + "v3:128,128,128,4,2,2,1,1,2", + "v3:64,32,128,8,2,2,1,1,2", + "v3:64,32,128,4,2,2,1,1,2", + "v3:128,32,32,16,2,2,1,1,2", + "v3:64,128,64,8,2,4,1,1,2", 
+ "v3:64,64,64,16,2,4,1,1,2", + "v3:64,128,64,4,4,2,1,1,2", + "v3:64,64,128,16,2,4,1,1,2", + "v3:128,128,128,16,4,4,1,1,2", + "v3:256,128,128,8,4,4,1,1,2", + "v3:64,64,64,8,4,4,1,1,2", + "v3:256,32,32,16,2,2,1,1,2", + "v3:64,64,32,8,2,4,1,1,2", + "v3:64,32,32,16,2,2,1,1,2", + "v3:128,128,128,8,4,2,1,1,2", + "v3:128,128,64,8,2,2,1,1,2", + "v3:128,64,32,16,2,4,1,1,2", + "v3:256,64,128,16,4,2,1,1,2", + "v3:128,128,64,16,2,4,1,1,2", + "v3:256,128,64,4,2,2,1,1,2" +}; +// END_CONV_NonAccel_f32_gfx1101_DEFS + // BEGIN_GEMM_NonAccel_f32_gfx1152_DEFS const StringRef PopulateParams::initParametersF32GemmGfx1152[] = { "v3:128,128,128,8,2,4,1,1,2", @@ -457,28 +478,18 @@ static constexpr size_t nInitParametersF32GemmGfx1000 = 14; static const StringRef initParametersF32GemmGfx1000[nInitParametersF32GemmGfx1000]; // END_GEMM_NonAccel_f32_gfx1000_DECS -// BEGIN_GEMM_NonAccel_f32_gfx1100_DECS -static constexpr size_t nInitParametersF32GemmGfx1100 = 13; -static const StringRef initParametersF32GemmGfx1100[nInitParametersF32GemmGfx1100]; -// END_GEMM_NonAccel_f32_gfx1100_DECS - // BEGIN_CONV_NonAccel_f32_gfx1000_DECS static constexpr size_t nInitParametersF32ConvGfx1000 = 24; static const StringRef initParametersF32ConvGfx1000[nInitParametersF32ConvGfx1000]; // END_CONV_NonAccel_f32_gfx1000_DECS -// BEGIN_CONV_NonAccel_f32_gfx1100_DECS -static constexpr size_t nInitParametersF32ConvGfx1100 = 25; -static const StringRef initParametersF32ConvGfx1100[nInitParametersF32ConvGfx1100]; -// END_CONV_NonAccel_f32_gfx1100_DECS - // BEGIN_GEMM_NonAccel_f32_gfx1201_DECS -static constexpr size_t nInitParametersF32GemmGfx1201 = 10; +static constexpr size_t nInitParametersF32GemmGfx1201 = 12; static const StringRef initParametersF32GemmGfx1201[nInitParametersF32GemmGfx1201]; // END_GEMM_NonAccel_f32_gfx1201_DECS // BEGIN_CONV_NonAccel_f32_gfx1201_DECS -static constexpr size_t nInitParametersF32ConvGfx1201 = 29; +static constexpr size_t nInitParametersF32ConvGfx1201 = 26; static const StringRef 
initParametersF32ConvGfx1201[nInitParametersF32ConvGfx1201]; // END_CONV_NonAccel_f32_gfx1201_DECS @@ -493,15 +504,25 @@ static const StringRef initParametersF32ConvGfx1151[nInitParametersF32ConvGfx115 // END_CONV_NonAccel_f32_gfx1151_DECS // BEGIN_GEMM_NonAccel_f32_gfx1150_DECS -static constexpr size_t nInitParametersF32GemmGfx1150 = 13; +static constexpr size_t nInitParametersF32GemmGfx1150 = 14; static const StringRef initParametersF32GemmGfx1150[nInitParametersF32GemmGfx1150]; // END_GEMM_NonAccel_f32_gfx1150_DECS // BEGIN_CONV_NonAccel_f32_gfx1150_DECS -static constexpr size_t nInitParametersF32ConvGfx1150 = 42; +static constexpr size_t nInitParametersF32ConvGfx1150 = 57; static const StringRef initParametersF32ConvGfx1150[nInitParametersF32ConvGfx1150]; // END_CONV_NonAccel_f32_gfx1150_DECS +// BEGIN_GEMM_NonAccel_f32_gfx1101_DECS +static constexpr size_t nInitParametersF32GemmGfx1101 = 13; +static const StringRef initParametersF32GemmGfx1101[nInitParametersF32GemmGfx1101]; +// END_GEMM_NonAccel_f32_gfx1101_DECS + +// BEGIN_CONV_NonAccel_f32_gfx1101_DECS +static constexpr size_t nInitParametersF32ConvGfx1101 = 31; +static const StringRef initParametersF32ConvGfx1101[nInitParametersF32ConvGfx1101]; +// END_CONV_NonAccel_f32_gfx1101_DECS + // BEGIN_GEMM_NonAccel_f32_gfx1152_DECS static constexpr size_t nInitParametersF32GemmGfx1152 = 17; static const StringRef initParametersF32GemmGfx1152[nInitParametersF32GemmGfx1152]; @@ -528,347 +549,739 @@ static const StringRef initParametersF32GemmGfx1103[nInitParametersF32GemmGfx110 // BEGIN_GEMM_XDL_f32_gfx908_DEFS const StringRef PopulateParamsXDL::initParametersF32GemmGfx908[] = { - "v4:64,64,8,32,32,32,4,1,2,2,0,0,1,1", - "v4:64,64,4,32,32,32,4,1,2,2,0,0,1,1", - "v4:128,64,4,128,16,16,4,1,1,2,0,0,1,1", - "v4:32,32,4,16,16,16,8,1,2,2,0,0,1,1", - "v4:64,32,8,32,16,16,4,1,1,2,0,0,1,1", - "v4:32,16,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:64,16,8,16,16,16,8,1,2,2,0,0,1,1", - "v4:16,16,8,16,16,16,8,1,2,2,0,0,1,1" + 
"v4:16,32,8,16,16,16,8,1,2,2,0,0,1,1", + "v4:32,64,8,16,16,16,8,1,2,2,0,0,1,1", + "v4:64,128,8,16,64,16,4,1,1,2,0,0,1,1", + "v4:64,128,4,32,32,32,4,1,2,2,0,0,1,1", + "v4:64,64,4,32,32,32,8,1,2,2,0,0,1,1", + "v4:128,64,4,32,32,32,4,1,2,2,0,0,1,1", + "v4:32,16,8,16,16,16,16,1,2,2,0,0,1,1", + "v4:256,128,4,64,32,32,4,1,2,2,0,0,1,1", + "v4:64,128,8,32,32,32,1,1,2,2,0,0,1,1", + "v4:32,64,8,16,64,16,4,1,2,2,0,0,1,1", + "v4:64,64,4,32,16,16,8,1,2,2,0,0,1,1", + "v4:16,96,8,16,48,16,8,1,2,2,0,0,1,1", + "v4:128,96,8,16,48,16,4,1,2,2,0,0,1,1", + "v4:96,128,4,96,32,32,4,1,2,1,1,64,1,1", + "v4:96,64,8,96,16,16,4,1,1,0,1,32,1,1" }; // END_GEMM_XDL_f32_gfx908_DEFS // BEGIN_GEMM_XDL_f32_gfx90a_DEFS const StringRef PopulateParamsXDL::initParametersF32GemmGfx90a[] = { - "v4:32,64,8,16,32,16,4,1,1,2,0,0,1,1", - "v4:32,32,8,16,16,16,4,1,2,2,0,0,1,1", + "v4:32,64,8,32,16,16,4,1,1,2,0,0,1,1", + "v4:16,32,8,16,16,16,8,1,2,2,0,0,1,1", "v4:64,64,4,32,32,32,8,1,1,2,0,0,1,1", - "v4:32,16,8,16,16,16,4,1,2,2,0,0,1,1", - "v4:64,32,4,16,32,16,8,1,2,2,0,0,1,1", - "v4:128,64,4,128,16,16,4,1,1,2,0,0,1,1", - "v4:16,32,8,16,16,16,8,1,2,2,0,0,1,1" + "v4:32,16,8,16,16,16,8,1,2,2,0,0,1,1", + "v4:128,128,4,128,32,16,4,1,2,2,0,0,1,1", + "v4:64,128,2,32,32,32,8,1,1,2,0,0,1,1", + "v4:32,64,8,16,16,16,8,1,1,2,0,0,1,1", + "v4:64,64,8,32,32,16,1,1,2,2,0,0,1,1", + "v4:64,64,8,32,32,32,4,1,2,2,0,0,1,1", + "v4:64,64,8,32,16,16,8,1,2,2,0,0,1,1", + "v4:32,64,8,32,16,16,8,1,2,2,0,0,1,1", + "v4:32,64,8,32,32,32,4,1,2,2,0,0,1,1", + "v4:128,32,4,16,16,16,8,1,2,2,0,0,1,1" }; // END_GEMM_XDL_f32_gfx90a_DEFS // BEGIN_GEMM_XDL_f32_gfx942_DEFS const StringRef PopulateParamsXDL::initParametersF32GemmGfx942[] = { - "v4:32,32,8,16,16,16,4,1,2,2,0,0,1,1", + "v4:16,32,8,16,16,16,8,1,2,2,0,0,1,1", + "v4:32,16,8,16,16,16,8,1,2,2,0,0,1,1", + "v4:16,32,4,16,16,16,8,1,2,2,0,0,1,1", "v4:32,32,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:64,16,8,16,16,16,8,1,2,2,0,0,1,1", - "v4:64,64,4,64,16,16,4,1,2,2,0,0,1,1", - 
"v4:32,64,8,16,32,16,4,1,2,2,0,0,1,1", - "v4:32,64,8,32,16,16,4,1,1,2,0,0,1,1", - "v4:16,16,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:64,64,8,64,16,16,1,1,2,2,0,0,1,1", - "v4:256,128,4,128,64,16,4,1,2,2,0,0,1,1" + "v4:64,64,8,16,64,16,4,1,1,2,0,0,1,1", + "v4:128,128,4,64,64,16,4,1,2,2,0,0,1,1", + "v4:32,128,8,16,64,16,4,1,2,2,0,0,1,1", + "v4:64,64,8,32,64,16,4,1,2,2,0,0,1,1", + "v4:64,64,8,32,32,16,1,1,2,2,0,0,1,1", + "v4:128,64,4,128,16,16,8,1,1,2,0,0,1,1", + "v4:16,32,4,16,32,16,8,1,2,2,0,0,1,1", + "v4:256,256,4,64,128,16,4,1,2,2,0,0,1,1", + "v4:64,64,8,64,64,16,1,1,2,2,0,0,1,1", + "v4:96,128,8,96,32,32,4,1,2,2,0,0,1,1", + "v4:256,256,8,32,256,32,1,1,4,2,0,0,1,1", + "v4:32,256,8,32,16,16,4,1,1,2,0,0,1,1", + "v4:128,192,4,32,96,16,4,1,1,0,2,0,1,1" }; // END_GEMM_XDL_f32_gfx942_DEFS // BEGIN_GEMM_XDL_f32_gfx950_DEFS const StringRef PopulateParamsXDL::initParametersF32GemmGfx950[] = { - "v4:32,64,8,32,16,16,8,1,1,2,0,0,1,1", - "v4:32,16,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:64,128,8,16,128,16,4,1,2,2,0,0,1,1", - "v4:16,32,8,16,16,16,4,1,2,2,0,0,1,1", - "v4:64,16,8,16,16,16,8,1,2,2,0,0,1,1", - "v4:64,16,8,16,16,16,4,1,2,2,0,0,1,1" + "v4:16,16,8,16,16,16,8,1,4,2,0,0,1,1", + "v4:64,32,4,32,16,16,8,1,4,2,0,0,1,1", + "v4:64,64,4,64,16,16,4,1,4,2,0,0,1,1", + "v4:32,32,4,16,16,16,8,1,4,2,0,0,1,1", + "v4:16,16,4,16,16,16,32,1,4,2,0,0,1,1", + "v4:128,256,4,128,64,16,4,1,2,2,0,0,1,1", + "v4:32,16,8,32,16,16,8,1,2,2,0,0,1,1", + "v4:32,64,8,32,16,16,8,1,4,2,0,0,1,1", + "v4:64,16,8,64,16,16,1,1,1,2,0,0,1,1", + "v4:16,128,4,16,16,16,16,1,4,2,0,0,1,1" }; // END_GEMM_XDL_f32_gfx950_DEFS // BEGIN_CONV_XDL_f32_gfx908_DEFS const StringRef PopulateParamsXDL::initParametersF32ConvGfx908[] = { - "v4:32,32,8,16,16,16,4,1,1,2,0,0,1,1", - "v4:32,32,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:32,16,8,16,16,16,4,1,2,2,0,0,1,1", - "v4:32,16,8,16,16,16,8,1,2,2,0,0,1,1", - "v4:64,64,2,32,32,32,8,1,2,2,0,0,1,1", - "v4:64,64,8,32,32,32,4,1,2,2,0,0,1,1", - "v4:16,16,8,16,16,16,8,1,2,2,0,0,1,1", - 
"v4:64,32,8,16,32,16,4,1,2,2,0,0,1,1", + "v4:64,64,4,32,32,32,4,1,2,2,0,0,1,1", + "v4:64,128,4,32,32,32,4,1,2,2,0,0,1,1", + "v4:32,32,4,16,16,16,8,1,2,2,0,0,1,1", + "v4:64,64,2,32,32,32,8,1,1,2,0,0,1,1", + "v4:64,128,4,64,32,32,4,1,1,2,0,0,1,1", + "v4:64,64,4,32,32,32,8,1,1,2,0,0,1,1", + "v4:128,128,2,128,32,32,4,1,2,2,0,0,1,1", "v4:64,16,8,16,16,16,8,1,2,2,0,0,1,1", - "v4:64,64,4,64,16,16,4,1,1,2,0,0,1,1", - "v4:16,32,4,16,16,16,4,1,2,2,0,0,1,1", + "v4:32,64,8,16,32,16,4,1,2,2,0,0,1,1", + "v4:16,64,4,16,16,16,8,1,2,2,0,0,1,1", + "v4:64,256,4,32,32,32,4,1,2,2,0,0,1,1", + "v4:16,32,8,16,16,16,4,1,2,2,0,0,1,1", + "v4:16,64,4,16,16,16,4,1,2,2,0,0,1,1", + "v4:64,64,8,16,32,16,4,1,2,2,0,0,1,1", + "v4:64,32,8,16,32,16,4,1,2,2,0,0,1,1", + "v4:32,128,2,32,32,32,8,1,2,2,0,0,1,1", "v4:64,128,4,64,32,16,4,1,1,2,0,0,1,1", - "v4:128,64,4,64,32,32,4,1,2,2,0,0,1,1", - "v4:128,128,2,128,32,32,1,1,2,2,0,0,1,1", - "v4:64,32,2,64,32,32,1,1,1,2,0,0,1,1", - "v4:32,128,4,16,64,16,4,1,2,2,0,0,1,1", - "v4:64,64,2,64,32,32,1,1,2,2,0,0,1,1", - "v4:64,64,4,64,16,16,4,1,2,2,0,0,1,1" + "v4:16,128,4,16,32,16,4,1,2,2,0,0,1,1", + "v4:16,64,4,16,32,16,4,1,2,2,0,0,1,1", + "v4:64,32,8,16,16,16,8,1,2,2,0,0,1,1", + "v4:16,64,4,16,64,16,4,1,1,2,0,0,1,1", + "v4:64,32,8,32,32,32,4,1,2,2,0,0,1,1", + "v4:16,32,8,16,32,16,4,1,2,2,0,0,1,1", + "v4:128,128,8,32,32,32,4,1,2,2,0,0,1,1", + "v4:32,128,8,32,32,16,4,1,2,2,0,0,1,1", + "v4:32,32,4,32,32,32,4,1,2,2,0,0,1,1", + "v4:64,128,4,32,32,32,8,1,2,2,0,0,1,1", + "v4:64,64,4,16,16,16,8,1,2,2,0,0,1,1", + "v4:128,64,8,32,32,32,4,1,2,2,0,0,1,1", + "v4:32,256,4,32,32,32,4,1,2,2,0,0,1,1", + "v4:64,256,8,64,64,16,1,1,2,2,0,0,1,1", + "v4:128,256,2,64,32,32,4,1,1,2,0,0,1,1", + "v4:16,256,4,16,64,16,4,1,1,2,0,0,1,1", + "v4:256,128,2,256,32,32,1,1,2,2,0,0,1,1", + "v4:32,128,4,32,64,32,4,1,1,2,0,0,1,1", + "v4:32,256,4,32,64,32,4,1,1,2,0,0,1,1", + "v4:256,128,2,128,64,32,4,1,2,2,0,0,1,1", + "v4:32,128,4,32,16,16,4,1,2,1,4,0,1,1", + "v4:128,96,2,32,96,32,4,1,2,0,2,16,1,1" }; 
// END_CONV_XDL_f32_gfx908_DEFS // BEGIN_CONV_XDL_f32_gfx90a_DEFS const StringRef PopulateParamsXDL::initParametersF32ConvGfx90a[] = { - "v4:32,32,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:32,32,4,16,16,16,8,1,1,2,0,0,1,1", - "v4:32,16,8,16,16,16,4,1,2,2,0,0,1,1", "v4:64,64,4,32,32,32,4,1,2,2,0,0,1,1", - "v4:32,16,8,16,16,16,8,1,2,2,0,0,1,1", - "v4:64,128,2,64,32,32,4,1,2,2,0,0,1,1", - "v4:16,16,8,16,16,16,8,1,2,2,0,0,1,1", + "v4:64,128,4,32,32,32,4,1,2,2,0,0,1,1", + "v4:64,128,4,64,32,16,4,1,1,2,0,0,1,1", + "v4:96,64,4,48,16,16,4,1,2,2,0,0,1,1", + "v4:32,32,8,16,16,16,4,1,2,2,0,0,1,1", + "v4:64,64,8,32,32,32,4,1,1,2,0,0,1,1", + "v4:32,64,8,32,16,16,4,1,1,2,0,0,1,1", + "v4:64,256,2,32,64,32,4,1,2,2,0,0,1,1", + "v4:64,64,4,32,32,16,4,1,2,2,0,0,1,1", "v4:64,16,8,16,16,16,8,1,2,2,0,0,1,1", - "v4:64,64,8,64,16,16,4,1,1,2,0,0,1,1", - "v4:64,32,8,16,32,16,4,1,2,2,0,0,1,1", - "v4:16,16,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:16,64,8,16,16,16,4,1,1,2,0,0,1,1", - "v4:32,64,4,32,16,16,4,1,1,2,0,0,1,1", - "v4:64,64,2,64,32,32,1,1,2,2,0,0,1,1", - "v4:32,16,4,32,16,16,1,1,1,2,0,0,1,1", - "v4:128,256,8,128,64,32,1,1,1,2,0,0,1,1", - "v4:32,64,2,32,32,32,4,1,1,2,0,0,1,1", - "v4:32,32,4,32,16,16,1,1,2,2,0,0,1,1" + "v4:96,128,2,96,32,32,4,1,2,2,0,0,1,1", + "v4:32,16,8,16,16,16,8,1,2,2,0,0,1,1", + "v4:64,64,8,16,32,16,4,1,2,2,0,0,1,1", + "v4:16,32,4,16,16,16,8,1,2,2,0,0,1,1", + "v4:32,32,8,16,16,16,8,1,2,2,0,0,1,1", + "v4:32,64,8,16,32,16,4,1,2,2,0,0,1,1", + "v4:128,128,4,32,32,32,4,1,2,2,0,0,1,1", + "v4:64,32,8,16,16,16,4,1,2,2,0,0,1,1", + "v4:64,32,4,32,16,16,8,1,1,2,0,0,1,1", + "v4:32,64,4,16,32,16,4,1,2,2,0,0,1,1", + "v4:16,16,8,16,16,16,4,1,2,2,0,0,1,1", + "v4:128,256,2,128,64,32,4,1,1,2,0,0,1,1", + "v4:128,32,8,16,16,16,4,1,2,2,0,0,1,1", + "v4:64,256,4,64,64,16,4,1,1,2,0,0,1,1", + "v4:16,64,8,16,16,16,8,1,1,2,0,0,1,1", + "v4:16,32,4,16,32,16,4,1,1,2,0,0,1,1", + "v4:16,64,4,16,64,16,4,1,1,2,0,0,1,1", + "v4:32,128,4,16,32,16,4,1,2,2,0,0,1,1", + "v4:256,256,2,256,32,32,1,1,2,2,0,0,1,1", + 
"v4:128,128,4,32,32,16,4,1,2,2,0,0,1,1", + "v4:64,32,8,16,16,16,8,1,2,2,0,0,1,1", + "v4:32,256,2,32,64,32,4,1,1,2,0,0,1,1", + "v4:32,64,2,32,64,32,4,1,1,2,0,0,1,1", + "v4:256,128,4,128,32,16,1,1,2,2,0,0,1,1", + "v4:32,256,4,32,128,16,1,1,1,2,0,0,1,1", + "v4:64,128,2,64,64,32,4,1,2,2,0,0,1,1", + "v4:256,256,8,256,64,16,1,1,1,2,0,0,1,1", + "v4:16,128,8,16,16,16,8,1,1,2,0,0,1,1", + "v4:64,256,4,32,64,32,4,1,2,2,0,0,1,1", + "v4:192,32,4,48,16,16,8,1,2,2,0,0,1,1", + "v4:32,128,4,16,16,16,8,1,1,2,0,0,1,1", + "v4:256,128,4,128,64,16,4,1,1,2,0,0,1,1", + "v4:32,64,4,32,16,16,16,1,2,2,0,0,1,1", + "v4:16,16,4,16,16,16,8,1,2,0,8,32,1,1", + "v4:16,64,4,16,16,16,4,1,2,1,2,32,1,1", + "v4:96,128,4,96,32,16,4,1,1,0,1,64,1,1", + "v4:192,128,4,96,32,32,4,1,2,1,0,8,1,1" }; // END_CONV_XDL_f32_gfx90a_DEFS // BEGIN_CONV_XDL_f32_gfx942_DEFS const StringRef PopulateParamsXDL::initParametersF32ConvGfx942[] = { - "v4:16,16,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:16,16,8,16,16,16,8,1,2,2,0,0,1,1", - "v4:16,64,8,16,16,16,8,1,1,2,0,0,1,1", + "v4:32,16,8,16,16,16,8,1,2,2,0,0,1,1", "v4:32,32,8,16,16,16,4,1,2,2,0,0,1,1", - "v4:16,32,8,16,16,16,8,1,2,2,0,0,1,1", - "v4:16,16,4,16,16,16,8,1,1,2,0,0,1,1", - "v4:32,16,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:64,16,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:16,16,8,16,16,16,4,1,1,2,0,0,1,1", - "v4:64,32,8,16,32,16,4,1,1,2,0,0,1,1", - "v4:16,32,4,16,16,16,4,1,2,2,0,0,1,1", - "v4:32,64,4,16,32,16,4,1,2,2,0,0,1,1", + "v4:16,32,4,16,16,16,8,1,2,2,0,0,1,1", + "v4:16,16,4,16,16,16,8,1,2,2,0,0,1,1", + "v4:32,32,8,16,16,16,8,1,1,2,0,0,1,1", "v4:64,16,8,16,16,16,8,1,2,2,0,0,1,1", - "v4:64,64,8,32,32,16,4,1,1,2,0,0,1,1", - "v4:64,16,4,16,16,16,8,1,2,2,0,0,1,1", - "v4:16,32,4,16,16,16,4,1,1,2,0,0,1,1", - "v4:32,32,4,32,16,16,4,1,1,2,0,0,1,1", - "v4:32,64,8,32,16,16,1,1,1,2,0,0,1,1", - "v4:64,64,4,64,16,16,1,1,1,2,0,0,1,1", - "v4:32,64,8,32,16,16,1,1,2,2,0,0,1,1", - "v4:32,32,4,32,32,32,1,1,1,2,0,0,1,1", - "v4:32,64,8,16,32,16,8,1,1,2,0,0,1,1", - 
"v4:128,128,4,128,32,16,1,1,1,2,0,0,1,1" + "v4:64,32,8,32,16,16,4,1,1,2,0,0,1,1", + "v4:16,32,4,16,16,16,8,1,1,2,0,0,1,1", + "v4:64,16,8,16,16,16,4,1,4,2,0,0,1,1", + "v4:16,16,4,16,16,16,16,1,4,2,0,0,1,1", + "v4:16,16,8,16,16,16,16,1,2,2,0,0,1,1", + "v4:32,16,4,16,16,16,4,1,2,2,0,0,1,1", + "v4:64,128,4,64,32,16,4,1,1,2,0,0,1,1", + "v4:32,64,8,16,32,16,4,1,1,2,0,0,1,1", + "v4:32,32,4,32,16,16,4,1,2,2,0,0,1,1", + "v4:16,64,4,16,32,16,4,1,2,2,0,0,1,1", + "v4:32,64,4,16,32,16,4,1,2,2,0,0,1,1", + "v4:64,64,4,32,32,16,4,1,2,2,0,0,1,1", + "v4:128,128,8,128,32,16,1,1,4,2,0,0,1,1", + "v4:128,128,4,128,16,16,4,1,1,2,0,0,1,1", + "v4:64,32,4,32,32,16,8,1,2,2,0,0,1,1", + "v4:128,64,4,64,32,16,4,1,4,2,0,0,1,1", + "v4:128,16,4,16,16,16,8,1,2,2,0,0,1,1", + "v4:64,48,8,16,48,16,4,1,1,2,0,0,1,1", + "v4:256,64,4,128,16,16,4,1,2,2,0,0,1,1", + "v4:48,16,4,48,16,16,8,1,1,2,0,0,1,1", + "v4:64,64,8,64,32,16,1,1,4,2,0,0,1,1", + "v4:64,64,4,64,64,16,4,1,1,2,0,0,1,1", + "v4:32,128,4,32,128,16,1,1,4,2,0,0,1,1", + "v4:128,64,8,64,16,16,4,1,2,2,0,0,1,1", + "v4:192,64,4,48,32,16,4,1,2,2,0,0,1,1", + "v4:64,256,8,64,32,16,1,1,2,2,0,0,1,1", + "v4:16,32,4,16,32,16,16,1,2,2,0,0,1,1", + "v4:32,64,8,32,64,16,1,1,4,2,0,0,1,1", + "v4:64,16,8,16,16,16,8,1,3,2,0,0,1,1", + "v4:96,64,4,48,16,16,4,1,2,2,0,0,1,1", + "v4:128,128,4,128,16,16,8,1,2,2,0,0,1,1", + "v4:128,256,4,128,32,16,4,1,1,2,0,0,1,1", + "v4:256,128,4,64,64,16,4,1,2,2,0,0,1,1", + "v4:256,256,4,128,64,16,1,1,4,2,0,0,1,1", + "v4:32,256,4,32,256,16,1,1,4,2,0,0,1,1", + "v4:64,32,8,16,32,16,4,1,3,2,0,0,1,1", + "v4:128,96,4,32,48,16,4,1,1,2,0,0,1,1", + "v4:16,128,4,16,128,16,4,1,1,2,0,0,1,1", + "v4:256,128,8,64,32,16,1,1,2,2,0,0,1,1", + "v4:256,256,8,128,128,16,1,1,1,2,0,0,1,1", + "v4:64,256,4,64,16,16,4,1,2,2,0,0,1,1", + "v4:64,64,2,32,64,32,16,1,2,2,0,0,1,1", + "v4:96,64,8,48,32,16,4,1,1,2,0,0,1,1", + "v4:48,32,4,48,16,16,16,1,3,2,0,0,1,1", + "v4:64,256,4,16,64,16,8,1,4,2,0,0,1,1", + "v4:96,192,8,48,192,16,1,1,4,2,0,0,1,1", + 
"v4:96,64,4,48,16,16,8,1,2,0,2,64,1,1", + "v4:160,64,8,80,32,16,4,1,1,2,0,0,1,1", + "v4:192,64,4,192,16,16,4,1,2,2,0,0,1,1", + "v4:96,96,8,48,48,16,1,1,4,1,0,0,1,1", + "v4:192,256,4,96,32,16,8,1,1,1,8,8,1,1", + "v4:64,192,8,32,48,16,1,1,4,0,8,0,1,1", + "v4:64,96,4,32,48,16,4,1,1,0,8,64,1,1", + "v4:96,16,4,96,16,16,8,1,1,2,0,0,1,1" }; // END_CONV_XDL_f32_gfx942_DEFS // BEGIN_CONV_XDL_f32_gfx950_DEFS const StringRef PopulateParamsXDL::initParametersF32ConvGfx950[] = { - "v4:128,128,4,128,32,16,1,1,1,2,0,0,1,1", - "v4:32,64,8,16,32,16,8,1,1,2,0,0,1,1", - "v4:64,64,8,16,64,16,8,1,1,2,0,0,1,1", - "v4:64,32,8,16,32,16,4,1,1,2,0,0,1,1", + "v4:64,32,8,32,16,16,4,1,4,2,0,0,1,1", + "v4:32,32,4,16,16,16,8,1,4,2,0,0,1,1", + "v4:16,16,4,16,16,16,8,1,4,2,0,0,1,1", + "v4:32,16,8,16,16,16,16,1,4,2,0,0,1,1", + "v4:64,16,4,16,16,16,16,1,4,2,0,0,1,1", + "v4:16,16,8,16,16,16,4,1,2,2,0,0,1,1", + "v4:128,16,8,32,16,16,4,1,4,2,0,0,1,1", + "v4:64,32,4,32,16,16,8,1,2,2,0,0,1,1", + "v4:16,32,4,16,16,16,8,1,2,2,0,0,1,1", + "v4:32,16,8,16,16,16,4,1,2,2,0,0,1,1", "v4:32,32,8,16,16,16,4,1,2,2,0,0,1,1", "v4:64,16,8,16,16,16,8,1,2,2,0,0,1,1", - "v4:128,32,4,64,16,16,8,1,1,2,0,0,1,1", - "v4:16,32,4,16,16,16,4,1,2,2,0,0,1,1", - "v4:256,32,4,128,16,16,4,1,1,2,0,0,1,1", - "v4:128,32,8,32,32,16,4,1,1,2,0,0,1,1", - "v4:16,16,8,16,16,16,1,1,1,2,0,0,1,1", - "v4:16,32,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:64,16,8,16,16,16,4,1,2,2,0,0,1,1", - "v4:64,64,4,64,16,16,4,1,2,2,0,0,1,1", - "v4:32,16,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:64,16,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:64,16,4,64,16,16,1,1,1,2,0,0,1,1", - "v4:128,64,4,128,16,16,4,1,1,2,0,0,1,1", - "v4:64,64,4,32,32,16,8,1,1,2,0,0,1,1", - "v4:64,32,4,16,32,16,4,1,2,2,0,0,1,1", - "v4:16,32,8,16,16,16,8,1,2,2,0,0,1,1", - "v4:16,16,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:128,128,4,128,32,16,1,1,2,2,0,0,1,1", - "v4:32,32,4,16,16,16,8,1,1,2,0,0,1,1", - "v4:32,32,2,32,32,32,1,1,2,2,0,0,1,1", - "v4:32,32,8,16,16,16,8,1,2,2,0,0,1,1", - "v4:128,32,4,32,32,32,8,1,2,2,0,0,1,1", 
- "v4:32,64,8,32,16,16,8,1,1,2,0,0,1,1", - "v4:128,256,2,128,64,32,4,1,2,2,0,0,1,1", + "v4:16,32,8,16,32,16,4,1,4,2,0,0,1,1", + "v4:32,16,8,16,16,16,16,1,3,2,0,0,1,1", + "v4:64,64,8,16,32,16,4,1,2,2,0,0,1,1", + "v4:64,64,4,32,32,16,4,1,1,2,0,0,1,1", "v4:64,128,4,64,32,16,4,1,1,2,0,0,1,1", - "v4:32,32,8,16,16,16,4,1,1,2,0,0,1,1", - "v4:64,64,2,64,32,32,1,1,2,2,0,0,1,1", - "v4:64,32,8,32,16,16,4,1,2,2,0,0,1,1" + "v4:64,16,4,64,16,16,8,1,4,2,0,0,1,1", + "v4:128,32,4,32,32,16,8,1,4,2,0,0,1,1", + "v4:128,128,4,128,128,16,1,1,4,2,0,0,1,1", + "v4:32,64,4,16,32,16,4,1,2,2,0,0,1,1", + "v4:256,64,4,256,16,16,4,1,4,2,0,0,1,1", + "v4:128,64,8,128,64,16,1,1,4,2,0,0,1,1", + "v4:64,128,4,64,16,16,4,1,2,2,0,0,1,1", + "v4:128,64,4,64,32,16,8,1,1,2,0,0,1,1", + "v4:128,128,4,128,32,16,4,1,2,2,0,0,1,1", + "v4:32,64,4,32,64,16,4,1,1,2,0,0,1,1", + "v4:64,32,4,64,16,16,4,1,2,2,0,0,1,1", + "v4:64,64,8,32,32,16,4,1,4,2,0,0,1,1", + "v4:64,32,4,64,16,16,4,1,4,2,0,0,1,1", + "v4:128,64,4,128,64,16,4,1,4,2,0,0,1,1", + "v4:32,128,4,32,16,16,4,1,2,2,0,0,1,1", + "v4:32,128,2,32,32,32,4,1,2,2,0,0,1,1", + "v4:32,32,4,32,32,32,8,1,2,2,0,0,1,1", + "v4:64,256,4,64,64,32,1,1,4,2,0,0,1,1", + "v4:128,256,4,128,128,16,1,1,4,2,0,0,1,1", + "v4:32,256,4,32,64,16,4,1,3,2,0,0,1,1", + "v4:64,32,4,64,32,16,8,1,4,2,0,0,1,1", + "v4:192,64,4,96,32,16,8,1,4,2,0,0,1,1", + "v4:48,16,4,48,16,16,8,1,1,2,0,0,1,1", + "v4:256,256,8,256,16,16,1,1,2,2,0,0,1,1", + "v4:256,32,4,64,32,16,4,1,1,2,0,0,1,1", + "v4:64,32,4,32,32,32,4,1,2,2,0,0,1,1", + "v4:64,256,4,64,32,32,4,1,3,2,0,0,1,1", + "v4:128,16,8,16,16,16,16,1,2,2,0,0,1,1", + "v4:128,16,8,64,16,16,4,1,1,2,0,0,1,1", + "v4:128,256,4,128,16,16,8,1,1,2,0,0,1,1", + "v4:160,80,8,80,80,16,4,1,4,2,0,0,1,1", + "v4:192,64,8,48,32,16,4,1,1,2,0,0,1,1", + "v4:128,160,8,32,160,32,4,1,1,2,0,0,1,1", + "v4:16,16,4,16,16,16,4,1,2,0,1,4,1,1", + "v4:128,48,8,16,48,16,4,1,2,2,0,0,1,1", + "v4:96,16,8,96,16,16,4,1,4,2,0,0,1,1", + "v4:96,64,4,48,16,16,4,1,4,1,0,8,1,1", + 
"v4:16,64,4,16,32,16,4,1,2,0,1,64,1,1", + "v4:192,32,8,48,32,16,4,1,4,1,4,64,1,1", + "v4:192,32,8,96,32,16,4,1,4,1,1,32,1,1", + "v4:48,16,4,48,16,16,4,1,4,0,8,4,1,1", + "v4:64,16,4,16,16,16,16,1,4,1,8,0,1,1", + "v4:80,16,8,80,16,16,4,1,4,0,4,32,1,1" }; // END_CONV_XDL_f32_gfx950_DEFS // BEGIN_GEMM_XDL_f16_gfx908_DEFS const StringRef PopulateParamsXDL::initParametersF16GemmGfx908[] = { - "v4:32,32,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:64,16,8,16,16,16,8,1,1,2,0,0,1,1", + "v4:32,32,8,16,16,16,16,1,2,2,0,0,1,1", + "v4:32,64,8,16,16,16,16,1,2,2,0,0,1,1", "v4:64,64,8,32,32,32,8,1,2,2,0,0,1,1", - "v4:32,64,8,32,16,16,8,1,1,2,0,0,1,1", - "v4:128,128,8,32,128,16,4,1,2,2,0,0,1,1", - "v4:64,128,4,32,64,32,8,1,2,2,0,0,1,1", - "v4:16,64,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:16,64,8,16,16,16,8,1,2,2,0,0,1,1", - "v4:32,128,8,32,32,32,8,1,1,2,0,0,1,1", - "v4:128,64,4,64,32,32,8,1,1,2,0,0,1,1", - "v4:128,128,8,32,128,32,8,1,2,2,0,0,1,1", - "v4:128,128,8,32,128,32,4,1,1,2,0,0,1,1", - "v4:128,256,8,128,64,16,4,1,2,2,0,0,1,1", - "v4:128,256,8,64,128,32,4,1,1,2,0,0,1,1", - "v4:128,64,8,32,64,16,8,1,2,2,0,0,1,1", - "v4:64,128,4,64,32,32,8,1,1,2,0,0,1,1", - "v4:256,128,4,128,64,16,8,1,1,2,0,0,1,1" + "v4:64,128,8,32,32,32,8,1,1,2,0,0,1,1", + "v4:128,128,8,64,32,32,8,1,2,2,0,0,1,1", + "v4:64,128,8,32,32,32,8,1,2,2,0,0,1,1", + "v4:128,128,8,64,64,32,4,1,1,2,0,0,1,1", + "v4:128,256,8,32,64,32,8,1,1,2,0,0,1,1", + "v4:32,128,8,32,16,16,16,1,1,2,0,0,1,1", + "v4:128,128,8,128,16,16,8,1,1,2,0,0,1,1", + "v4:128,128,8,128,32,32,8,1,1,2,0,0,1,1", + "v4:256,256,8,256,16,16,8,1,1,2,0,0,1,1", + "v4:256,256,4,32,128,32,16,1,1,2,0,0,1,1", + "v4:64,32,8,32,32,16,16,1,2,2,0,0,1,1", + "v4:256,192,8,32,96,32,8,1,1,2,0,0,1,1", + "v4:128,128,4,64,64,32,8,1,1,0,1,64,1,1", + "v4:128,64,8,32,64,32,8,1,2,0,8,16,1,1", + "v4:160,32,4,80,16,16,16,1,1,1,1,0,1,1", + "v4:256,256,4,128,32,32,8,1,1,0,2,0,1,1", + "v4:32,16,4,16,16,16,16,1,2,0,0,16,1,1", + "v4:80,128,8,80,16,16,8,1,2,1,1,32,1,1" }; // 
END_GEMM_XDL_f16_gfx908_DEFS // BEGIN_GEMM_XDL_f16_gfx90a_DEFS const StringRef PopulateParamsXDL::initParametersF16GemmGfx90a[] = { + "v4:16,64,8,16,16,16,16,1,2,2,0,0,1,1", + "v4:32,64,8,16,16,16,16,1,2,2,0,0,1,1", + "v4:16,16,8,16,16,16,16,1,2,2,0,0,1,1", "v4:64,64,8,32,32,32,8,1,1,2,0,0,1,1", - "v4:32,64,8,32,16,16,8,1,2,2,0,0,1,1", + "v4:64,128,8,64,32,32,4,1,1,2,0,0,1,1", "v4:64,64,8,32,32,32,8,1,2,2,0,0,1,1", - "v4:32,32,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:64,16,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:128,128,4,64,64,32,8,1,1,2,0,0,1,1", - "v4:64,128,8,64,32,16,4,1,1,2,0,0,1,1", - "v4:16,32,8,16,16,16,8,1,2,2,0,0,1,1", - "v4:16,16,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:16,16,8,16,16,16,4,1,1,2,0,0,1,1", - "v4:32,32,8,16,16,16,4,1,1,2,0,0,1,1", - "v4:32,128,4,32,32,16,8,1,1,2,0,0,1,1", - "v4:16,64,4,16,16,16,8,1,2,2,0,0,1,1", - "v4:128,128,8,64,64,32,8,1,2,2,0,0,1,1", - "v4:128,256,8,128,64,16,4,1,2,2,0,0,1,1", - "v4:64,256,4,64,64,16,4,1,1,2,0,0,1,1", - "v4:256,128,4,128,64,16,8,1,1,2,0,0,1,1" + "v4:128,128,8,128,32,16,4,1,1,2,0,0,1,1", + "v4:32,64,8,16,32,16,16,1,1,2,0,0,1,1", + "v4:32,32,8,32,16,16,16,1,2,2,0,0,1,1", + "v4:128,128,8,64,32,32,8,1,1,2,0,0,1,1", + "v4:128,16,8,16,16,16,8,1,2,2,0,0,1,1", + "v4:64,128,8,16,64,16,8,1,2,2,0,0,1,1", + "v4:32,64,8,32,32,32,8,1,2,2,0,0,1,1", + "v4:128,128,8,32,32,32,16,1,1,2,0,0,1,1", + "v4:128,32,4,16,16,16,16,1,2,2,0,0,1,1", + "v4:256,256,8,64,64,16,8,1,1,2,0,0,1,1", + "v4:128,256,8,128,16,16,8,1,1,2,0,0,1,1", + "v4:128,64,8,64,64,16,8,1,1,2,0,0,1,1", + "v4:64,128,8,64,32,32,4,1,1,1,8,16,1,1", + "v4:80,128,8,80,16,16,8,1,2,1,0,64,1,1", + "v4:256,256,4,256,64,16,4,1,1,0,0,0,1,1", + "v4:32,256,8,32,32,16,4,1,1,0,0,0,1,1", + "v4:64,256,8,64,32,32,4,1,1,0,2,16,1,1", + "v4:96,256,8,96,32,16,4,1,2,2,0,0,1,1" }; // END_GEMM_XDL_f16_gfx90a_DEFS // BEGIN_GEMM_XDL_f16_gfx942_DEFS const StringRef PopulateParamsXDL::initParametersF16GemmGfx942[] = { - "v4:64,16,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:32,16,8,16,16,16,8,1,1,2,0,0,1,1", - 
"v4:32,64,8,16,32,16,8,1,1,2,0,0,1,1", - "v4:32,32,8,16,16,16,8,1,2,2,0,0,1,1", - "v4:16,32,8,16,16,16,8,1,2,2,0,0,1,1", - "v4:64,64,8,16,64,16,8,1,1,2,0,0,1,1", - "v4:32,32,8,32,16,16,8,1,2,2,0,0,1,1", - "v4:64,128,4,32,64,32,8,1,1,2,0,0,1,1", - "v4:64,32,8,32,32,32,8,1,1,2,0,0,1,1", - "v4:128,128,8,64,64,16,8,1,2,2,0,0,1,1", - "v4:256,128,4,128,64,16,8,1,1,2,0,0,1,1", - "v4:64,128,4,64,32,32,8,1,1,2,0,0,1,1", - "v4:128,128,2,64,64,32,8,1,1,2,0,0,1,1", - "v4:128,256,8,64,128,16,8,1,1,2,0,0,1,1", - "v4:64,16,8,32,16,16,8,1,2,2,0,0,1,1", - "v4:64,16,4,16,16,16,4,1,2,2,0,0,1,1", - "v4:256,32,2,128,32,32,4,1,1,2,0,0,1,1", - "v4:16,32,4,16,16,16,4,1,1,2,0,0,1,1", - "v4:64,32,4,64,16,16,8,1,2,2,0,0,1,1", - "v4:128,256,8,64,128,32,4,1,2,2,0,0,1,1", - "v4:16,32,8,16,16,16,4,1,2,2,0,0,1,1" + "v4:32,16,8,16,16,16,16,1,2,2,0,0,1,1", + "v4:16,32,8,16,16,16,16,1,1,2,0,0,1,1", + "v4:32,64,8,16,32,16,8,1,2,2,0,0,1,1", + "v4:64,64,8,32,32,16,8,1,2,2,0,0,1,1", + "v4:16,64,8,16,16,16,16,1,2,2,0,0,1,1", + "v4:64,32,8,32,32,32,8,1,2,2,0,0,1,1", + "v4:32,64,8,32,32,32,16,1,2,2,0,0,1,1", + "v4:16,16,8,16,16,16,8,1,4,2,0,0,1,1", + "v4:128,64,8,64,32,32,8,1,2,2,0,0,1,1", + "v4:64,64,4,32,32,16,16,1,2,2,0,0,1,1", + "v4:128,128,8,32,64,16,8,1,2,2,0,0,1,1", + "v4:128,128,4,64,64,16,8,1,1,2,0,0,1,1", + "v4:64,64,8,64,64,16,8,1,1,2,0,0,1,1", + "v4:16,256,8,16,16,16,8,1,1,2,0,0,1,1", + "v4:128,64,8,32,32,32,8,1,2,2,0,0,1,1", + "v4:256,256,8,32,128,16,8,1,1,2,0,0,1,1", + "v4:128,128,8,64,128,16,8,1,1,2,0,0,1,1", + "v4:128,256,8,64,128,32,4,1,1,2,0,0,1,1", + "v4:256,256,4,64,128,32,8,1,2,2,0,0,1,1", + "v4:128,128,8,32,32,32,8,1,2,2,0,0,1,1", + "v4:128,256,4,64,64,16,8,1,1,2,0,0,1,1", + "v4:64,32,4,64,32,16,16,1,3,2,0,0,1,1", + "v4:96,128,8,96,32,32,4,1,2,2,0,0,1,1", + "v4:192,256,8,192,32,16,4,1,2,2,0,0,1,1", + "v4:256,256,8,32,128,32,8,1,1,1,2,32,1,1", + "v4:128,128,8,64,64,32,4,1,1,0,1,64,1,1", + "v4:192,192,4,96,192,16,8,1,1,1,1,16,1,1" }; // END_GEMM_XDL_f16_gfx942_DEFS // 
BEGIN_GEMM_XDL_f16_gfx950_DEFS const StringRef PopulateParamsXDL::initParametersF16GemmGfx950[] = { - "v4:32,64,8,16,32,16,8,1,2,2,0,0,1,1", - "v4:16,32,8,16,16,16,8,1,2,2,0,0,1,1", - "v4:32,64,8,32,16,16,8,1,2,2,0,0,1,1", - "v4:64,16,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:16,32,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:64,64,8,32,32,32,8,1,1,2,0,0,1,1", - "v4:64,64,8,32,32,32,8,1,2,2,0,0,1,1", - "v4:32,16,8,16,16,16,8,1,2,2,0,0,1,1", - "v4:32,16,8,32,16,16,8,1,2,2,0,0,1,1", + "v4:16,32,8,16,16,16,16,1,2,2,0,0,1,1", + "v4:32,32,8,16,16,16,16,1,2,2,0,0,1,1", + "v4:32,64,8,32,32,32,16,1,2,2,0,0,1,1", + "v4:16,16,8,16,16,16,32,1,4,2,0,0,1,1", + "v4:32,32,4,32,16,16,16,1,4,2,0,0,1,1", "v4:128,64,8,32,64,16,8,1,2,2,0,0,1,1", - "v4:128,128,8,64,64,16,8,1,1,2,0,0,1,1", - "v4:256,128,4,128,64,32,8,1,2,2,0,0,1,1", - "v4:32,128,4,16,64,16,4,1,2,2,0,0,1,1", - "v4:32,128,8,32,32,32,8,1,1,2,0,0,1,1" + "v4:32,64,4,32,16,16,16,1,2,2,0,0,1,1", + "v4:16,16,8,16,16,16,16,1,3,2,0,0,1,1", + "v4:128,128,8,32,64,32,8,1,2,2,0,0,1,1", + "v4:64,128,8,32,32,32,8,1,2,2,0,0,1,1", + "v4:16,16,4,16,16,16,16,1,3,2,0,0,1,1", + "v4:64,64,8,16,32,16,16,1,1,2,0,0,1,1", + "v4:64,16,4,32,16,16,16,1,2,2,0,0,1,1", + "v4:64,64,4,64,32,32,8,1,4,2,0,0,1,1", + "v4:32,64,4,32,32,32,8,1,4,2,0,0,1,1", + "v4:32,64,8,32,16,16,8,1,4,2,0,0,1,1", + "v4:16,64,4,16,16,16,32,1,4,2,0,0,1,1", + "v4:32,128,4,32,64,16,16,1,2,2,0,0,1,1", + "v4:256,128,8,64,64,32,8,1,2,2,0,0,1,1", + "v4:128,128,4,128,64,16,8,1,4,2,0,0,1,1", + "v4:128,256,2,128,64,32,8,1,4,2,0,0,1,1", + "v4:256,128,4,256,32,16,8,1,4,2,0,0,1,1", + "v4:128,128,4,128,64,16,4,1,2,2,0,0,1,1", + "v4:256,128,8,64,128,16,8,1,1,2,0,0,1,1", + "v4:256,256,4,64,64,32,8,1,2,2,0,0,1,1", + "v4:256,256,2,128,64,32,16,1,2,2,0,0,1,1", + "v4:128,256,8,128,32,32,8,1,1,2,0,0,1,1", + "v4:96,48,8,48,48,16,8,1,4,1,2,4,1,1" }; // END_GEMM_XDL_f16_gfx950_DEFS // BEGIN_CONV_XDL_f16_gfx908_DEFS const StringRef PopulateParamsXDL::initParametersF16ConvGfx908[] = { - 
"v4:32,32,8,16,16,16,8,1,2,2,0,0,1,1", - "v4:64,32,8,32,16,16,8,1,2,2,0,0,1,1", + "v4:64,32,4,32,16,16,16,1,2,2,0,0,1,1", + "v4:64,16,4,16,16,16,16,1,2,2,0,0,1,1", + "v4:64,64,8,32,32,32,8,1,1,2,0,0,1,1", + "v4:128,128,8,128,32,32,4,1,1,2,0,0,1,1", + "v4:64,16,8,16,16,16,16,1,2,2,0,0,1,1", + "v4:64,64,4,32,32,32,16,1,2,2,0,0,1,1", + "v4:128,256,2,128,64,32,4,1,1,2,0,0,1,1", + "v4:64,128,8,64,32,32,4,1,1,2,0,0,1,1", + "v4:16,32,4,16,16,16,16,1,2,2,0,0,1,1", + "v4:32,64,8,16,16,16,16,1,2,2,0,0,1,1", "v4:64,64,4,32,32,32,8,1,2,2,0,0,1,1", - "v4:64,128,8,64,32,32,8,1,2,2,0,0,1,1", - "v4:64,128,8,32,64,32,4,1,1,2,0,0,1,1", - "v4:32,128,8,32,32,32,8,1,2,2,0,0,1,1", - "v4:64,16,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:64,64,8,16,64,16,8,1,1,2,0,0,1,1", - "v4:128,128,4,64,64,32,8,1,2,2,0,0,1,1", - "v4:16,16,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:16,64,4,16,16,16,8,1,1,2,0,0,1,1", + "v4:16,32,4,16,32,16,8,1,2,2,0,0,1,1", + "v4:64,128,8,32,32,32,8,1,2,2,0,0,1,1", + "v4:128,128,4,128,32,32,8,1,1,2,0,0,1,1", + "v4:32,128,8,32,32,32,4,1,2,2,0,0,1,1", + "v4:128,16,8,16,16,16,8,1,2,2,0,0,1,1", + "v4:16,16,4,16,16,16,16,1,2,2,0,0,1,1", + "v4:32,32,4,32,16,16,8,1,2,2,0,0,1,1", "v4:32,256,2,32,64,32,4,1,2,2,0,0,1,1", - "v4:128,256,2,128,64,32,4,1,1,2,0,0,1,1", - "v4:64,32,2,64,32,32,4,1,1,2,0,0,1,1" + "v4:128,128,2,64,32,32,8,1,2,2,0,0,1,1", + "v4:32,32,8,32,32,16,8,1,2,2,0,0,1,1", + "v4:128,128,2,64,64,32,16,1,1,2,0,0,1,1", + "v4:32,64,4,32,16,16,8,1,2,2,0,0,1,1", + "v4:64,64,4,32,16,16,4,1,2,2,0,0,1,1", + "v4:128,128,8,32,64,32,8,1,1,2,0,0,1,1", + "v4:128,32,8,32,32,32,8,1,2,2,0,0,1,1", + "v4:128,64,8,64,32,32,4,1,2,2,0,0,1,1", + "v4:64,64,4,64,32,32,8,1,1,2,0,0,1,1", + "v4:128,128,4,64,32,32,4,1,1,2,0,0,1,1", + "v4:128,32,4,32,32,32,16,1,1,2,0,0,1,1", + "v4:16,64,8,16,16,16,4,1,2,2,0,0,1,1", + "v4:128,128,2,32,32,32,4,1,2,2,0,0,1,1", + "v4:128,256,4,64,32,32,16,1,1,2,0,0,1,1", + "v4:128,48,8,16,48,16,4,1,2,2,0,0,1,1", + "v4:256,64,4,128,32,32,8,1,1,2,0,0,1,1", + 
"v4:64,64,4,16,16,16,4,1,2,2,0,0,1,1", + "v4:256,256,2,64,64,32,16,1,2,2,0,0,1,1", + "v4:128,256,8,128,64,32,4,1,1,2,0,0,1,1", + "v4:64,32,8,16,16,16,16,1,2,2,0,0,1,1", + "v4:256,64,4,64,32,32,8,1,2,2,0,0,1,1", + "v4:64,256,2,64,32,32,16,1,2,2,0,0,1,1", + "v4:128,64,2,128,32,32,16,1,1,2,0,0,1,1", + "v4:128,32,8,16,16,16,16,1,1,2,0,0,1,1", + "v4:256,64,8,32,64,32,8,1,1,2,0,0,1,1", + "v4:256,32,8,16,32,16,4,1,2,2,0,0,1,1", + "v4:192,64,4,96,32,16,8,1,2,2,0,0,1,1", + "v4:64,32,4,64,32,32,16,1,1,2,0,0,1,1", + "v4:128,128,8,32,32,32,8,1,2,2,0,0,1,1", + "v4:256,256,8,64,64,32,8,1,1,2,0,0,1,1", + "v4:256,128,8,32,64,32,8,1,1,2,0,0,1,1", + "v4:192,256,2,96,32,32,16,1,2,2,0,0,1,1", + "v4:160,64,4,160,32,32,8,1,2,1,8,16,1,1", + "v4:96,128,4,48,32,16,16,1,2,2,0,0,1,1", + "v4:16,32,4,16,32,16,8,1,2,1,1,4,1,1", + "v4:128,256,4,64,256,32,8,1,1,0,0,0,1,1", + "v4:192,64,8,96,16,16,8,1,1,1,8,32,1,1", + "v4:256,192,4,32,96,32,4,1,2,0,4,0,1,1", + "v4:32,16,4,16,16,16,8,1,2,1,1,8,1,1" }; // END_CONV_XDL_f16_gfx908_DEFS // BEGIN_CONV_XDL_f16_gfx90a_DEFS const StringRef PopulateParamsXDL::initParametersF16ConvGfx90a[] = { - "v4:32,32,8,16,16,16,8,1,2,2,0,0,1,1", + "v4:64,128,8,32,32,32,8,1,1,2,0,0,1,1", "v4:64,64,8,32,32,32,8,1,1,2,0,0,1,1", - "v4:64,64,8,32,32,32,8,1,2,2,0,0,1,1", - "v4:64,16,8,16,16,16,8,1,2,2,0,0,1,1", - "v4:64,32,8,32,32,32,8,1,1,2,0,0,1,1", - "v4:64,16,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:128,128,4,128,32,32,8,1,2,2,0,0,1,1", - "v4:64,64,2,32,32,32,8,1,1,2,0,0,1,1", - "v4:64,32,8,32,32,32,8,1,2,2,0,0,1,1", - "v4:16,16,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:16,64,4,16,16,16,4,1,1,2,0,0,1,1", - "v4:32,128,8,32,32,32,4,1,1,2,0,0,1,1", - "v4:16,128,4,16,32,16,4,1,1,2,0,0,1,1", + "v4:64,16,8,16,16,16,16,1,2,2,0,0,1,1", + "v4:128,256,2,128,64,32,4,1,1,2,0,0,1,1", + "v4:64,128,8,64,32,32,4,1,1,2,0,0,1,1", "v4:64,256,2,64,64,32,4,1,2,2,0,0,1,1", - "v4:128,64,2,128,32,32,4,1,1,2,0,0,1,1" + "v4:128,16,8,16,16,16,16,1,1,2,0,0,1,1", + "v4:128,128,4,128,32,16,8,1,1,2,0,0,1,1", + 
"v4:64,16,8,16,16,16,16,1,1,2,0,0,1,1", + "v4:32,256,2,32,64,32,4,1,2,2,0,0,1,1", + "v4:64,256,8,32,64,32,4,1,1,2,0,0,1,1", + "v4:32,32,8,16,16,16,16,1,2,2,0,0,1,1", + "v4:128,256,2,64,128,32,4,1,2,2,0,0,1,1", + "v4:32,32,8,32,16,16,8,1,2,2,0,0,1,1", + "v4:128,64,4,32,64,16,8,1,1,2,0,0,1,1", + "v4:128,16,4,16,16,16,16,1,2,2,0,0,1,1", + "v4:128,256,8,128,32,32,4,1,1,2,0,0,1,1", + "v4:16,32,4,16,16,16,16,1,2,2,0,0,1,1", + "v4:16,32,4,16,32,16,8,1,1,2,0,0,1,1", + "v4:64,32,4,32,16,16,16,1,2,2,0,0,1,1", + "v4:64,64,4,32,32,32,8,1,2,2,0,0,1,1", + "v4:128,16,8,16,16,16,8,1,1,2,0,0,1,1", + "v4:64,32,8,32,16,16,16,1,1,2,0,0,1,1", + "v4:32,64,4,32,32,32,8,1,2,2,0,0,1,1", + "v4:256,128,8,128,32,16,4,1,1,2,0,0,1,1", + "v4:128,128,4,64,64,32,8,1,1,2,0,0,1,1", + "v4:128,64,4,32,64,32,8,1,2,2,0,0,1,1", + "v4:32,16,4,16,16,16,16,1,2,2,0,0,1,1", + "v4:32,16,4,32,16,16,8,1,1,2,0,0,1,1", + "v4:128,128,2,32,32,32,4,1,2,2,0,0,1,1", + "v4:128,32,8,32,32,32,8,1,1,2,0,0,1,1", + "v4:128,32,8,32,32,16,8,1,2,2,0,0,1,1", + "v4:16,32,4,16,16,16,8,1,2,2,0,0,1,1", + "v4:256,64,4,128,32,32,8,1,1,2,0,0,1,1", + "v4:128,256,4,128,64,32,4,1,1,2,0,0,1,1", + "v4:64,256,4,32,64,16,8,1,1,2,0,0,1,1", + "v4:16,32,4,16,32,16,16,1,1,2,0,0,1,1", + "v4:32,32,8,16,16,16,4,1,1,2,0,0,1,1", + "v4:64,64,2,32,64,32,4,1,2,2,0,0,1,1", + "v4:64,64,4,16,16,16,4,1,2,2,0,0,1,1", + "v4:32,64,4,16,32,16,8,1,2,2,0,0,1,1", + "v4:64,128,8,32,32,32,4,1,2,2,0,0,1,1", + "v4:128,256,4,64,128,32,8,1,1,2,0,0,1,1", + "v4:128,64,8,32,32,16,8,1,1,2,0,0,1,1", + "v4:128,128,8,128,16,16,4,1,2,2,0,0,1,1", + "v4:64,64,4,32,64,16,8,1,1,2,0,0,1,1", + "v4:128,64,8,32,32,32,8,1,1,2,0,0,1,1", + "v4:128,128,8,32,64,16,4,1,1,2,0,0,1,1", + "v4:128,64,4,64,64,32,8,1,2,2,0,0,1,1", + "v4:128,256,4,32,128,32,8,1,1,2,0,0,1,1", + "v4:128,256,4,64,128,16,4,1,1,2,0,0,1,1", + "v4:192,32,4,48,32,16,16,1,1,2,0,0,1,1", + "v4:256,128,4,32,64,32,16,1,1,2,0,0,1,1", + "v4:128,32,4,16,32,16,16,1,2,2,0,0,1,1", + "v4:128,32,8,32,32,32,4,1,2,2,0,0,1,1", + 
"v4:256,32,4,16,32,16,8,1,2,2,0,0,1,1", + "v4:256,256,4,64,64,32,16,1,1,2,0,0,1,1", + "v4:128,64,4,32,64,32,16,1,2,2,0,0,1,1", + "v4:256,128,8,64,128,32,4,1,2,2,0,0,1,1", + "v4:64,128,8,16,64,16,4,1,1,2,0,0,1,1", + "v4:64,32,8,64,32,32,4,1,2,2,0,0,1,1", + "v4:128,128,4,32,128,32,16,1,1,2,0,0,1,1", + "v4:96,32,4,96,16,16,16,1,2,2,0,0,1,1", + "v4:64,256,4,64,16,16,16,1,1,2,0,0,1,1", + "v4:64,32,4,64,32,16,16,1,1,2,0,0,1,1", + "v4:128,256,4,32,256,16,4,1,1,2,0,0,1,1", + "v4:192,64,8,96,32,16,4,1,2,2,0,0,1,1", + "v4:256,48,8,16,48,16,8,1,1,1,1,4,1,1", + "v4:256,128,8,128,32,32,4,1,1,1,8,4,1,1", + "v4:256,192,8,64,96,32,8,1,1,0,1,64,1,1", + "v4:96,64,8,96,32,32,4,1,1,1,0,64,1,1" }; // END_CONV_XDL_f16_gfx90a_DEFS // BEGIN_CONV_XDL_f16_gfx942_DEFS const StringRef PopulateParamsXDL::initParametersF16ConvGfx942[] = { - "v4:16,32,8,16,16,16,8,1,2,2,0,0,1,1", - "v4:16,16,8,16,16,16,8,1,2,2,0,0,1,1", - "v4:32,16,8,16,16,16,8,1,2,2,0,0,1,1", - "v4:16,16,8,16,16,16,8,1,1,2,0,0,1,1", + "v4:32,16,8,16,16,16,16,1,2,2,0,0,1,1", + "v4:64,16,8,32,16,16,8,1,2,2,0,0,1,1", "v4:64,64,8,32,32,32,8,1,2,2,0,0,1,1", - "v4:64,64,8,32,32,32,8,1,1,2,0,0,1,1", - "v4:16,64,8,16,16,16,4,1,1,2,0,0,1,1", - "v4:64,128,4,32,64,32,8,1,2,2,0,0,1,1", - "v4:64,16,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:32,32,4,16,16,16,8,1,1,2,0,0,1,1", - "v4:32,64,8,32,16,16,4,1,1,2,0,0,1,1", - "v4:128,256,4,128,64,32,8,1,2,2,0,0,1,1", - "v4:64,64,8,16,64,16,4,1,1,2,0,0,1,1", - "v4:16,64,4,16,16,16,4,1,1,2,0,0,1,1", - "v4:64,32,4,32,32,32,8,1,2,2,0,0,1,1", - "v4:32,128,8,32,32,32,1,1,2,2,0,0,1,1", - "v4:128,64,2,64,32,32,8,1,2,2,0,0,1,1", - "v4:64,32,4,16,32,16,4,1,1,2,0,0,1,1", + "v4:128,16,8,16,16,16,4,1,2,2,0,0,1,1", + "v4:64,16,8,16,16,16,16,1,2,2,0,0,1,1", + "v4:16,16,8,16,16,16,8,1,2,2,0,0,1,1", + "v4:16,16,4,16,16,16,16,1,2,2,0,0,1,1", + "v4:16,16,8,16,16,16,16,1,2,2,0,0,1,1", + "v4:32,64,8,32,32,32,4,1,2,2,0,0,1,1", + "v4:16,32,4,16,16,16,16,1,2,2,0,0,1,1", + "v4:64,16,8,32,16,16,8,1,1,2,0,0,1,1", + 
"v4:64,128,8,64,32,16,4,1,2,2,0,0,1,1", "v4:32,256,2,32,64,32,4,1,2,2,0,0,1,1", - "v4:64,32,2,64,32,32,4,1,1,2,0,0,1,1" + "v4:16,128,4,16,32,16,8,1,2,2,0,0,1,1", + "v4:16,32,8,16,16,16,16,1,1,2,0,0,1,1", + "v4:128,64,8,64,32,16,4,1,1,2,0,0,1,1", + "v4:16,16,8,16,16,16,4,1,2,2,0,0,1,1", + "v4:128,128,2,128,64,32,4,1,2,2,0,0,1,1", + "v4:128,32,8,32,32,32,8,1,2,2,0,0,1,1", + "v4:32,64,8,32,16,16,4,1,2,2,0,0,1,1", + "v4:64,64,4,32,32,32,8,1,2,2,0,0,1,1", + "v4:32,32,8,16,16,16,8,1,2,2,0,0,1,1", + "v4:64,128,8,16,64,16,8,1,2,2,0,0,1,1", + "v4:128,64,8,32,32,32,8,1,2,2,0,0,1,1", + "v4:128,16,8,64,16,16,8,1,1,2,0,0,1,1", + "v4:128,64,4,64,64,16,8,1,1,2,0,0,1,1", + "v4:64,32,2,64,32,32,16,1,1,2,0,0,1,1", + "v4:128,32,2,32,32,32,4,1,2,2,0,0,1,1", + "v4:128,32,4,128,16,16,4,1,2,2,0,0,1,1", + "v4:128,16,8,16,16,16,8,1,2,2,0,0,1,1", + "v4:128,128,4,64,64,16,8,1,2,2,0,0,1,1", + "v4:48,32,8,48,16,16,4,1,2,2,0,0,1,1", + "v4:64,64,2,64,64,32,16,1,2,2,0,0,1,1", + "v4:64,128,4,64,128,16,4,1,1,2,0,0,1,1", + "v4:256,256,4,64,128,16,8,1,2,2,0,0,1,1", + "v4:64,256,4,64,64,32,4,1,1,2,0,0,1,1", + "v4:128,128,8,32,64,32,1,1,2,2,0,0,1,1", + "v4:32,128,4,32,128,16,4,1,1,2,0,0,1,1", + "v4:256,32,8,32,16,16,4,1,2,2,0,0,1,1", + "v4:32,256,4,32,32,16,8,1,1,2,0,0,1,1", + "v4:128,256,4,128,32,16,8,1,1,2,0,0,1,1", + "v4:128,256,8,128,32,16,4,1,2,2,0,0,1,1", + "v4:64,64,4,16,16,16,4,1,2,2,0,0,1,1", + "v4:128,256,4,128,64,16,8,1,1,2,0,0,1,1", + "v4:256,128,4,64,64,32,8,1,2,2,0,0,1,1", + "v4:256,128,8,128,32,16,4,1,1,2,0,0,1,1", + "v4:256,64,4,64,64,16,8,1,2,2,0,0,1,1", + "v4:32,64,4,16,16,16,4,1,2,2,0,0,1,1", + "v4:128,128,2,128,128,32,8,1,2,2,0,0,1,1", + "v4:128,16,8,16,16,16,4,1,1,2,0,0,1,1", + "v4:128,256,8,128,128,32,1,1,2,2,0,0,1,1", + "v4:128,64,8,64,64,16,8,1,1,2,0,0,1,1", + "v4:64,256,8,16,128,16,4,1,1,2,0,0,1,1", + "v4:32,128,8,32,32,32,4,1,3,2,0,0,1,1", + "v4:64,128,8,64,128,16,4,1,1,2,0,0,1,1", + "v4:128,16,8,16,16,16,8,1,1,2,0,0,1,1", + "v4:128,256,8,64,128,16,4,1,1,2,0,0,1,1", + 
"v4:192,32,4,48,32,16,16,1,1,2,0,0,1,1", + "v4:256,64,4,256,64,16,4,1,1,2,0,0,1,1", + "v4:96,16,8,48,16,16,16,1,2,2,0,0,1,1", + "v4:256,256,4,128,64,16,4,1,2,2,0,0,1,1", + "v4:256,256,4,64,64,32,4,1,1,2,0,0,1,1", + "v4:32,64,4,32,64,32,16,1,3,2,0,0,1,1", + "v4:128,64,4,128,16,16,16,1,1,2,0,0,1,1", + "v4:256,64,2,256,64,32,16,1,1,2,0,0,1,1", + "v4:128,256,2,128,64,32,8,1,4,2,0,0,1,1", + "v4:256,32,4,64,32,16,8,1,1,2,0,0,1,1", + "v4:128,256,2,32,256,32,16,1,1,2,0,0,1,1", + "v4:256,128,8,64,32,16,8,1,1,2,0,0,1,1", + "v4:16,16,4,16,16,16,8,1,2,0,0,4,1,1", + "v4:16,16,8,16,16,16,8,1,2,1,0,4,1,1", + "v4:160,64,4,160,32,32,8,1,2,2,0,0,1,1", + "v4:256,48,8,16,48,16,8,1,1,2,0,0,1,1", + "v4:160,64,8,80,32,16,8,1,1,0,0,4,1,1", + "v4:96,128,8,48,32,16,8,1,2,1,0,8,1,1", + "v4:96,64,4,48,16,16,16,1,2,0,8,4,1,1", + "v4:128,128,4,128,32,16,8,1,2,0,8,4,1,1", + "v4:128,256,2,32,256,32,4,1,1,0,0,64,1,1", + "v4:128,256,4,32,256,32,8,1,3,2,0,0,1,1", + "v4:192,32,8,48,32,16,8,1,1,0,8,16,1,1", + "v4:224,32,4,224,32,32,16,1,3,2,0,0,1,1", + "v4:256,256,2,32,128,32,16,1,2,2,0,0,1,1", + "v4:256,256,4,128,64,32,8,1,2,0,2,8,1,1", + "v4:32,64,8,32,32,16,16,1,3,2,0,0,1,1", + "v4:32,64,8,32,64,32,16,1,4,2,0,0,1,1", + "v4:64,192,4,16,192,16,8,1,2,2,0,0,1,1", + "v4:64,64,4,16,16,16,32,1,3,2,0,0,1,1" }; // END_CONV_XDL_f16_gfx942_DEFS // BEGIN_CONV_XDL_f16_gfx950_DEFS const StringRef PopulateParamsXDL::initParametersF16ConvGfx950[] = { - "v4:64,128,8,32,64,16,8,1,1,2,0,0,1,1", - "v4:64,16,8,32,16,16,8,1,2,2,0,0,1,1", - "v4:32,64,8,32,32,32,1,1,1,2,0,0,1,1", - "v4:128,128,8,64,64,32,8,1,2,2,0,0,1,1", - "v4:32,32,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:16,64,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:32,64,8,32,16,16,4,1,1,2,0,0,1,1", - "v4:32,16,8,16,16,16,8,1,2,2,0,0,1,1", - "v4:64,16,8,32,16,16,8,1,1,2,0,0,1,1", - "v4:64,16,8,16,16,16,8,1,2,2,0,0,1,1", - "v4:64,256,8,16,256,16,4,1,2,2,0,0,1,1", - "v4:128,128,4,128,32,16,8,1,1,2,0,0,1,1", - "v4:64,256,2,64,64,32,4,1,1,2,0,0,1,1", - 
"v4:32,64,8,32,16,16,8,1,2,2,0,0,1,1", - "v4:64,256,2,32,128,32,4,1,1,2,0,0,1,1", - "v4:64,64,8,64,16,16,8,1,2,2,0,0,1,1", - "v4:64,16,8,16,16,16,8,1,1,2,0,0,1,1", "v4:64,64,8,32,32,32,8,1,2,2,0,0,1,1", - "v4:64,64,4,32,32,32,8,1,2,2,0,0,1,1", + "v4:64,16,8,16,16,16,16,1,2,2,0,0,1,1", + "v4:128,16,8,16,16,16,16,1,2,2,0,0,1,1", + "v4:16,16,8,16,16,16,8,1,2,2,0,0,1,1", + "v4:32,16,4,16,16,16,16,1,2,2,0,0,1,1", + "v4:32,32,8,16,16,16,16,1,2,2,0,0,1,1", + "v4:16,32,4,16,16,16,16,1,1,2,0,0,1,1", + "v4:64,64,4,32,64,16,16,1,2,2,0,0,1,1", + "v4:256,16,4,16,16,16,16,1,2,2,0,0,1,1", + "v4:32,64,4,32,32,32,16,1,1,2,0,0,1,1", + "v4:16,32,8,16,32,16,4,1,1,2,0,0,1,1", + "v4:16,16,8,16,16,16,16,1,1,2,0,0,1,1", + "v4:64,64,2,64,64,32,4,1,1,2,0,0,1,1", + "v4:64,32,4,64,32,16,8,1,1,2,0,0,1,1", + "v4:16,64,8,16,32,16,4,1,2,2,0,0,1,1", + "v4:32,256,2,32,64,32,4,1,1,2,0,0,1,1", + "v4:128,64,8,64,32,16,8,1,1,2,0,0,1,1", + "v4:32,32,4,32,16,16,8,1,2,2,0,0,1,1", + "v4:128,128,4,64,64,16,8,1,2,2,0,0,1,1", + "v4:16,16,8,16,16,16,4,1,2,2,0,0,1,1", + "v4:32,128,4,32,64,32,8,1,2,2,0,0,1,1", + "v4:64,16,4,64,16,16,16,1,1,2,0,0,1,1", + "v4:64,64,8,64,32,16,4,1,2,2,0,0,1,1", + "v4:64,64,8,32,16,16,16,1,2,2,0,0,1,1", + "v4:128,128,2,128,32,32,16,1,1,2,0,0,1,1", + "v4:32,64,4,32,16,16,4,1,1,2,0,0,1,1", + "v4:128,64,4,128,32,32,8,1,1,2,0,0,1,1", + "v4:256,128,4,128,64,32,8,1,1,2,0,0,1,1", + "v4:32,128,8,32,32,32,16,1,2,2,0,0,1,1", + "v4:64,32,8,32,16,16,8,1,2,2,0,0,1,1", "v4:32,32,8,16,16,16,8,1,2,2,0,0,1,1", + "v4:32,64,8,16,32,16,4,1,2,2,0,0,1,1", + "v4:256,64,2,64,64,32,16,1,2,2,0,0,1,1", + "v4:32,128,4,32,128,16,8,1,1,2,0,0,1,1", + "v4:64,32,4,16,32,16,16,1,1,2,0,0,1,1", + "v4:128,32,4,32,32,16,16,1,2,2,0,0,1,1", + "v4:256,64,4,256,16,16,8,1,1,2,0,0,1,1", + "v4:64,128,8,32,32,16,4,1,2,2,0,0,1,1", + "v4:128,128,4,64,32,16,16,1,2,2,0,0,1,1", + "v4:128,32,4,32,32,32,16,1,1,2,0,0,1,1", + "v4:128,32,4,64,32,16,8,1,2,2,0,0,1,1", + "v4:16,64,8,16,64,16,16,1,1,2,0,0,1,1", + 
"v4:256,128,8,64,64,32,8,1,2,2,0,0,1,1", + "v4:64,64,4,16,16,16,4,1,2,2,0,0,1,1", + "v4:256,32,4,32,16,16,8,1,2,2,0,0,1,1", + "v4:256,64,8,64,32,32,8,1,2,2,0,0,1,1", + "v4:192,32,4,48,32,16,16,1,1,2,0,0,1,1", + "v4:32,64,4,32,16,16,4,1,3,2,0,0,1,1", + "v4:256,32,8,64,32,16,8,1,2,2,0,0,1,1", + "v4:256,128,4,64,64,16,16,1,1,2,0,0,1,1", + "v4:128,32,8,64,32,32,4,1,1,2,0,0,1,1", + "v4:256,256,8,256,16,16,4,1,2,2,0,0,1,1", + "v4:48,32,8,48,16,16,8,1,2,2,0,0,1,1", + "v4:16,16,4,16,16,16,8,1,2,0,1,4,1,1", + "v4:32,80,8,16,80,16,8,1,2,2,0,0,1,1", + "v4:48,48,4,48,48,16,8,1,2,0,1,32,1,1", + "v4:192,32,4,96,16,16,16,1,1,0,4,32,1,1", + "v4:192,64,8,96,32,32,8,1,2,2,0,0,1,1", + "v4:256,256,8,64,64,32,8,1,1,2,0,0,1,1", + "v4:128,256,4,64,128,16,16,1,2,2,0,0,1,1", + "v4:192,256,2,192,64,32,4,1,1,2,0,0,1,1", + "v4:192,32,8,96,16,16,8,1,2,2,0,0,1,1", + "v4:256,32,8,256,16,16,8,1,1,2,0,0,1,1", "v4:256,32,8,64,32,32,8,1,2,2,0,0,1,1", - "v4:32,128,4,32,32,32,8,1,1,2,0,0,1,1", - "v4:128,32,8,32,32,32,8,1,2,2,0,0,1,1", - "v4:256,128,4,128,64,32,8,1,2,2,0,0,1,1", - "v4:256,64,8,64,64,32,8,1,2,2,0,0,1,1", - "v4:32,32,2,32,32,32,4,1,1,2,0,0,1,1" + "v4:256,64,4,128,32,32,16,1,1,1,2,32,1,1", + "v4:96,16,8,48,16,16,8,1,2,2,0,0,1,1", + "v4:96,256,8,96,32,32,4,1,2,2,0,0,1,1", + "v4:96,64,8,48,32,16,16,1,2,0,4,4,1,1", + "v4:32,16,8,16,16,16,16,1,2,1,4,32,1,1", + "v4:64,96,8,32,96,16,8,1,2,1,4,4,1,1", + "v4:128,64,8,32,32,16,8,1,2,1,2,16,1,1", + "v4:16,128,8,16,64,16,32,1,4,2,0,0,1,1", + "v4:16,16,8,16,16,16,8,1,1,0,4,16,1,1", + "v4:192,32,8,48,32,16,8,1,2,1,8,64,1,1", + "v4:256,128,2,64,64,32,16,1,2,0,1,8,1,1", + "v4:256,128,4,128,64,16,8,1,1,1,8,8,1,1", + "v4:256,32,8,32,16,16,8,1,2,2,0,0,1,1", + "v4:64,192,4,32,96,32,8,1,2,0,8,32,1,1", + "v4:96,16,4,96,16,16,8,1,1,1,1,8,1,1", + "v4:96,48,4,96,48,16,8,1,1,0,0,8,1,1", + "v4:96,64,4,48,16,16,16,1,2,0,1,0,1,1" }; // END_CONV_XDL_f16_gfx950_DEFS @@ -928,64 +1341,126 @@ const StringRef PopulateParamsXDL::initParametersFp8ConvGfx900[] = { // 
BEGIN_GEMM_XDL_i8_gfx908_DEFS const StringRef PopulateParamsXDL::initParametersI8GemmGfx908[] = { - "v4:64,32,4,16,32,16,16,1,2,2,0,0,1,1", - "v4:64,64,4,32,32,32,16,1,2,2,0,0,1,1", - "v4:128,64,16,128,16,16,4,1,1,2,0,0,1,1", - "v4:128,128,8,128,32,32,8,1,2,2,0,0,1,1", "v4:32,32,16,16,16,16,16,1,2,2,0,0,1,1", - "v4:32,16,8,16,16,16,16,1,1,2,0,0,1,1", - "v4:32,64,32,32,16,16,8,1,1,2,0,0,1,1", - "v4:32,32,16,16,16,16,16,1,1,2,0,0,1,1" + "v4:32,64,16,16,16,16,16,1,1,2,0,0,1,1", + "v4:64,64,8,32,32,32,16,1,1,2,0,0,1,1", + "v4:32,32,32,16,16,16,16,1,1,2,0,0,1,1", + "v4:64,128,8,64,32,32,8,1,1,2,0,0,1,1", + "v4:128,128,16,128,32,32,4,1,1,2,0,0,1,1", + "v4:128,16,8,16,16,16,16,1,2,2,0,0,1,1", + "v4:64,64,16,16,32,16,16,1,2,2,0,0,1,1", + "v4:32,128,32,16,16,16,8,1,1,2,0,0,1,1", + "v4:32,256,32,32,32,32,1,1,2,2,0,0,1,1", + "v4:256,64,16,128,32,32,1,1,2,2,0,0,1,1", + "v4:48,64,8,48,16,16,16,1,1,2,0,0,1,1", + "v4:64,128,16,16,128,16,16,1,2,2,0,0,1,1", + "v4:128,256,4,64,64,32,16,1,2,1,0,0,1,1", + "v4:128,32,4,32,32,32,16,1,2,1,0,0,1,1", + "v4:48,256,16,48,16,16,8,1,1,0,0,4,1,1", + "v4:64,128,32,64,32,32,1,1,1,0,4,0,1,1", + "v4:80,128,16,80,16,16,8,1,2,1,8,8,1,1", + "v4:96,128,16,48,16,16,16,1,1,1,2,16,1,1" }; // END_GEMM_XDL_i8_gfx908_DEFS // BEGIN_GEMM_XDL_i8_gfx90a_DEFS const StringRef PopulateParamsXDL::initParametersI8GemmGfx90a[] = { - "v4:64,64,8,32,32,16,16,1,1,2,0,0,1,1", - "v4:32,64,8,32,16,16,8,1,1,2,0,0,1,1", - "v4:64,16,16,16,16,16,8,1,1,2,0,0,1,1", - "v4:64,16,16,16,16,16,16,1,2,2,0,0,1,1", - "v4:64,256,8,64,64,16,4,1,1,2,0,0,1,1", - "v4:32,16,8,16,16,16,16,1,1,2,0,0,1,1", - "v4:32,128,4,32,32,16,16,1,2,2,0,0,1,1", - "v4:128,64,8,32,64,32,8,1,1,2,0,0,1,1" + "v4:32,32,16,16,16,16,16,1,2,2,0,0,1,1", + "v4:16,16,16,16,16,16,16,1,2,2,0,0,1,1", + "v4:64,128,8,64,32,16,8,1,1,2,0,0,1,1", + "v4:32,64,32,16,16,16,8,1,1,2,0,0,1,1", + "v4:64,64,8,16,64,16,16,1,1,2,0,0,1,1", + "v4:64,128,16,64,32,16,4,1,1,2,0,0,1,1", + "v4:128,128,16,128,16,16,8,1,1,2,0,0,1,1", + 
"v4:128,128,16,64,64,16,4,1,1,2,0,0,1,1", + "v4:16,128,16,16,16,16,8,1,2,2,0,0,1,1", + "v4:64,64,32,16,16,16,16,1,1,2,0,0,1,1", + "v4:96,128,16,96,32,32,4,1,2,2,0,0,1,1", + "v4:256,16,4,16,16,16,16,1,2,2,0,0,1,1", + "v4:16,64,32,16,32,16,1,1,2,2,0,0,1,1", + "v4:32,128,32,32,32,32,1,1,2,2,0,0,1,1", + "v4:128,128,4,32,32,16,4,1,2,2,0,0,1,1", + "v4:128,32,16,32,16,16,8,1,2,2,0,0,1,1", + "v4:32,256,16,32,32,32,4,1,1,2,0,0,1,1", + "v4:128,128,16,32,64,16,8,1,2,2,0,0,1,1", + "v4:16,16,32,16,16,16,16,1,1,0,4,0,1,1", + "v4:64,64,32,32,64,32,1,1,1,2,0,0,1,1", + "v4:128,128,32,32,32,16,8,1,2,0,0,4,1,1", + "v4:128,16,32,32,16,16,8,1,1,2,0,0,1,1", + "v4:128,32,16,128,32,32,16,1,1,2,0,0,1,1", + "v4:64,128,32,16,32,16,8,1,1,0,2,32,1,1", + "v4:80,128,8,80,16,16,16,1,2,0,4,8,1,1" }; // END_GEMM_XDL_i8_gfx90a_DEFS // BEGIN_GEMM_XDL_i8_gfx942_DEFS const StringRef PopulateParamsXDL::initParametersI8GemmGfx942[] = { - "v4:64,16,8,16,16,16,16,1,1,2,0,0,1,1", - "v4:64,64,16,32,32,32,8,1,1,2,0,0,1,1", - "v4:128,128,8,32,128,16,16,1,1,2,0,0,1,1", - "v4:128,16,4,32,16,16,16,1,1,2,0,0,1,1", - "v4:16,16,16,16,16,16,16,1,1,2,0,0,1,1", - "v4:64,16,32,16,16,16,16,1,1,2,0,0,1,1", - "v4:16,64,4,16,16,16,8,1,2,2,0,0,1,1", - "v4:16,128,16,16,32,16,16,1,1,2,0,0,1,1", - "v4:32,64,8,32,16,16,8,1,1,2,0,0,1,1", - "v4:64,16,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:256,128,8,64,128,16,8,1,1,2,0,0,1,1", - "v4:128,32,16,32,32,32,8,1,2,2,0,0,1,1", - "v4:64,16,32,16,16,16,8,1,2,2,0,0,1,1" + "v4:32,32,32,16,16,16,16,1,2,2,0,0,1,1", + "v4:16,16,32,16,16,16,16,1,1,2,0,0,1,1", + "v4:64,64,8,32,32,16,16,1,1,2,0,0,1,1", + "v4:32,64,32,16,16,16,16,1,1,2,0,0,1,1", + "v4:32,128,8,32,32,16,16,1,2,2,0,0,1,1", + "v4:64,16,8,32,16,16,16,1,2,2,0,0,1,1", + "v4:128,128,8,32,64,32,16,1,1,2,0,0,1,1", + "v4:64,64,32,16,32,16,16,1,1,2,0,0,1,1", + "v4:128,256,8,64,128,16,8,1,1,2,0,0,1,1", + "v4:64,128,8,32,64,32,16,1,1,2,0,0,1,1", + "v4:128,128,16,128,16,16,8,1,1,2,0,0,1,1", + "v4:256,128,8,64,64,16,16,1,1,2,0,0,1,1", + 
"v4:128,64,16,32,32,16,16,1,1,2,0,0,1,1", + "v4:128,64,4,32,32,32,8,1,3,2,0,0,1,1", + "v4:128,256,8,16,256,16,8,1,2,2,0,0,1,1", + "v4:96,128,16,96,32,16,8,1,2,1,0,32,1,1", + "v4:256,256,16,128,32,32,8,1,2,2,0,0,1,1", + "v4:128,64,8,32,64,16,16,1,1,1,0,32,1,1", + "v4:192,128,8,96,64,16,16,1,1,0,1,8,1,1", + "v4:192,256,16,96,64,16,8,1,1,0,2,0,1,1", + "v4:192,64,8,96,32,16,16,1,1,0,2,0,1,1", + "v4:64,64,16,32,32,16,16,1,2,1,4,4,1,1" }; // END_GEMM_XDL_i8_gfx942_DEFS // BEGIN_GEMM_XDL_i8_gfx950_DEFS const StringRef PopulateParamsXDL::initParametersI8GemmGfx950[] = { - "v4:64,64,16,32,32,32,16,1,1,2,0,0,1,1", - "v4:32,64,16,32,16,16,16,1,2,2,0,0,1,1", - "v4:32,16,16,16,16,16,16,1,1,2,0,0,1,1", - "v4:64,64,32,32,32,32,16,1,2,2,0,0,1,1", - "v4:128,64,16,32,64,32,16,1,1,2,0,0,1,1", - "v4:64,64,4,32,32,32,8,1,1,2,0,0,1,1", - "v4:128,128,8,128,32,16,8,1,2,2,0,0,1,1", - "v4:32,16,16,16,16,16,8,1,1,2,0,0,1,1", - "v4:128,128,8,32,128,16,16,1,1,2,0,0,1,1", - "v4:64,16,32,16,16,16,16,1,1,2,0,0,1,1", - "v4:64,16,8,16,16,16,16,1,2,2,0,0,1,1", - "v4:16,16,16,16,16,16,16,1,2,2,0,0,1,1", - "v4:32,32,32,16,16,16,16,1,1,2,0,0,1,1", - "v4:16,16,32,16,16,16,16,1,1,2,0,0,1,1" + "v4:16,16,8,16,16,16,16,1,4,2,0,0,1,1", + "v4:32,16,32,16,16,16,16,1,1,2,0,0,1,1", + "v4:16,16,8,16,16,16,16,1,1,2,0,0,1,1", + "v4:16,64,4,16,16,16,16,1,4,2,0,0,1,1", + "v4:64,32,32,16,16,16,16,1,2,2,0,0,1,1", + "v4:32,16,16,16,16,16,8,1,2,2,0,0,1,1", + "v4:64,128,8,64,32,16,8,1,2,2,0,0,1,1", + "v4:64,64,16,32,32,16,16,1,1,2,0,0,1,1", + "v4:128,128,8,64,32,32,16,1,2,2,0,0,1,1", + "v4:64,128,8,16,64,16,16,1,2,2,0,0,1,1", + "v4:128,128,4,128,64,32,16,1,4,2,0,0,1,1", + "v4:128,16,8,64,16,16,16,1,2,2,0,0,1,1", + "v4:128,64,16,32,32,32,16,1,2,2,0,0,1,1", + "v4:256,64,8,64,64,32,16,1,1,2,0,0,1,1", + "v4:128,64,32,128,64,32,1,1,4,2,0,0,1,1", + "v4:16,32,8,16,32,16,8,1,3,2,0,0,1,1", + "v4:256,128,4,256,32,16,16,1,4,2,0,0,1,1", + "v4:32,128,8,16,64,16,16,1,1,2,0,0,1,1", + "v4:256,32,32,128,32,32,1,1,4,2,0,0,1,1", + 
"v4:64,64,16,32,32,16,8,1,1,2,0,0,1,1", + "v4:128,256,4,32,256,32,16,1,2,2,0,0,1,1", + "v4:128,64,16,64,32,32,16,1,2,2,0,0,1,1", + "v4:32,128,16,32,32,32,16,1,1,2,0,0,1,1", + "v4:64,64,16,32,64,16,16,1,1,2,0,0,1,1", + "v4:64,128,8,32,128,16,8,1,3,2,0,0,1,1", + "v4:256,256,8,256,32,16,16,1,2,2,0,0,1,1", + "v4:64,256,4,64,64,32,8,1,4,2,0,0,1,1", + "v4:64,128,16,64,32,32,16,1,1,2,0,0,1,1", + "v4:64,256,8,64,32,32,8,1,2,2,0,0,1,1", + "v4:64,64,32,16,32,16,16,1,2,2,0,0,1,1", + "v4:128,32,32,32,32,32,16,1,2,2,0,0,1,1", + "v4:192,64,8,96,32,32,16,1,2,2,0,0,1,1", + "v4:128,128,16,64,32,32,16,1,1,2,0,0,1,1", + "v4:256,128,32,128,32,16,1,1,1,2,0,0,1,1", + "v4:128,128,32,16,64,16,16,1,1,2,0,0,1,1", + "v4:128,256,8,128,32,16,16,1,4,2,0,0,1,1", + "v4:256,128,16,32,128,32,16,1,1,2,0,0,1,1", + "v4:128,128,16,32,64,16,16,1,2,1,1,4,1,1", + "v4:256,256,16,32,128,16,8,1,1,2,0,0,1,1" }; // END_GEMM_XDL_i8_gfx950_DEFS @@ -1010,70 +1485,144 @@ const StringRef PopulateParamsXDL::initParametersFp4GemmGfx950[] = { // BEGIN_CONV_XDL_i8_gfx908_DEFS const StringRef PopulateParamsXDL::initParametersI8ConvGfx908[] = { - "v4:32,32,8,16,16,16,16,1,1,2,0,0,1,1", - "v4:32,32,32,16,16,16,16,1,1,2,0,0,1,1", - "v4:16,16,8,16,16,16,16,1,2,2,0,0,1,1", - "v4:64,32,16,16,32,16,4,1,1,2,0,0,1,1", - "v4:64,16,16,16,16,16,8,1,1,2,0,0,1,1", - "v4:64,64,4,16,64,16,4,1,1,2,0,0,1,1", - "v4:32,256,4,32,64,16,4,1,2,2,0,0,1,1", - "v4:128,32,8,128,32,32,1,1,1,2,0,0,1,1" + "v4:32,32,32,16,16,16,8,1,1,2,0,0,1,1", + "v4:128,16,8,16,16,16,16,1,2,2,0,0,1,1", + "v4:64,16,16,16,16,16,16,1,2,2,0,0,1,1", + "v4:32,16,4,16,16,16,16,1,2,2,0,0,1,1", + "v4:64,128,16,32,32,32,8,1,1,2,0,0,1,1", + "v4:128,32,8,32,32,32,16,1,1,2,0,0,1,1", + "v4:64,256,4,64,64,32,4,1,1,2,0,0,1,1", + "v4:128,64,8,64,32,32,8,1,1,2,0,0,1,1", + "v4:64,64,32,16,16,16,16,1,1,2,0,0,1,1", + "v4:128,128,8,128,32,32,8,1,1,2,0,0,1,1", + "v4:128,64,16,64,32,32,4,1,1,2,0,0,1,1", + "v4:128,32,8,16,16,16,16,1,2,2,0,0,1,1", + "v4:256,128,8,256,32,32,4,1,1,2,0,0,1,1", 
+ "v4:64,64,4,16,16,16,4,1,2,2,0,0,1,1", + "v4:256,128,8,64,32,32,16,1,1,2,0,0,1,1", + "v4:192,16,16,48,16,16,8,1,2,2,0,0,1,1", + "v4:256,128,16,64,32,32,4,1,1,2,0,0,1,1", + "v4:256,64,8,32,32,32,16,1,1,2,0,0,1,1", + "v4:128,256,32,64,64,16,1,1,2,2,0,0,1,1" }; // END_CONV_XDL_i8_gfx908_DEFS // BEGIN_CONV_XDL_i8_gfx90a_DEFS const StringRef PopulateParamsXDL::initParametersI8ConvGfx90a[] = { - "v4:32,32,16,16,16,16,16,1,1,2,0,0,1,1", - "v4:32,32,8,16,16,16,16,1,2,2,0,0,1,1", - "v4:32,16,16,16,16,16,16,1,2,2,0,0,1,1", - "v4:64,16,8,16,16,16,16,1,1,2,0,0,1,1", - "v4:32,32,8,16,16,16,8,1,2,2,0,0,1,1", - "v4:64,64,8,32,32,32,1,1,1,2,0,0,1,1", - "v4:64,32,32,32,16,16,8,1,2,2,0,0,1,1", - "v4:16,256,4,16,64,16,4,1,1,2,0,0,1,1", - "v4:32,64,16,32,16,16,1,1,1,2,0,0,1,1" + "v4:32,32,32,16,16,16,8,1,1,2,0,0,1,1", + "v4:128,16,8,16,16,16,16,1,1,2,0,0,1,1", + "v4:32,64,32,16,16,16,8,1,2,2,0,0,1,1", + "v4:128,16,16,16,16,16,16,1,1,2,0,0,1,1", + "v4:64,16,8,16,16,16,8,1,2,2,0,0,1,1", + "v4:64,64,8,32,32,16,16,1,1,2,0,0,1,1", + "v4:128,32,8,32,32,32,16,1,1,2,0,0,1,1", + "v4:64,256,4,64,64,32,4,1,1,2,0,0,1,1", + "v4:64,16,32,16,16,16,16,1,1,2,0,0,1,1", + "v4:128,128,32,64,32,16,4,1,1,2,0,0,1,1", + "v4:128,128,8,128,32,16,8,1,1,2,0,0,1,1", + "v4:64,128,8,64,32,16,8,1,1,2,0,0,1,1", + "v4:256,64,8,128,16,16,8,1,1,2,0,0,1,1", + "v4:32,128,32,16,32,16,4,1,1,2,0,0,1,1", + "v4:128,32,16,32,32,32,4,1,2,2,0,0,1,1", + "v4:128,64,4,32,32,16,4,1,1,2,0,0,1,1", + "v4:128,64,4,32,64,32,16,1,1,2,0,0,1,1", + "v4:128,128,4,64,16,16,8,1,2,2,0,0,1,1", + "v4:256,128,8,64,32,32,8,1,2,2,0,0,1,1", + "v4:128,128,8,64,16,16,16,1,2,2,0,0,1,1", + "v4:256,128,8,64,64,32,16,1,1,2,0,0,1,1", + "v4:128,64,4,64,64,32,16,1,1,0,2,4,1,1", + "v4:256,128,4,256,32,32,16,1,1,2,0,0,1,1", + "v4:256,128,8,64,32,32,16,1,1,1,8,4,1,1", + "v4:32,128,8,32,128,32,4,1,2,2,0,0,1,1", + "v4:64,128,4,64,128,32,4,1,1,0,0,8,1,1", + "v4:64,16,16,16,16,16,4,1,2,0,1,8,1,1", + "v4:64,64,32,32,64,16,4,1,2,2,0,0,1,1" }; // 
END_CONV_XDL_i8_gfx90a_DEFS // BEGIN_CONV_XDL_i8_gfx942_DEFS const StringRef PopulateParamsXDL::initParametersI8ConvGfx942[] = { - "v4:16,16,16,16,16,16,16,1,1,2,0,0,1,1", - "v4:16,32,8,16,16,16,16,1,1,2,0,0,1,1", - "v4:32,32,16,16,16,16,16,1,2,2,0,0,1,1", - "v4:32,32,32,16,16,16,16,1,1,2,0,0,1,1", - "v4:32,16,16,16,16,16,16,1,1,2,0,0,1,1", - "v4:32,16,32,16,16,16,1,1,2,2,0,0,1,1", - "v4:64,16,32,16,16,16,16,1,2,2,0,0,1,1", - "v4:16,32,32,16,16,16,8,1,1,2,0,0,1,1", - "v4:64,64,16,64,32,32,1,1,2,2,0,0,1,1", - "v4:128,128,16,128,32,32,1,1,2,2,0,0,1,1", - "v4:32,64,32,32,16,16,1,1,2,2,0,0,1,1" + "v4:32,16,16,16,16,16,8,1,2,2,0,0,1,1", + "v4:32,16,16,32,16,16,16,1,2,2,0,0,1,1", + "v4:64,64,8,64,32,32,8,1,1,2,0,0,1,1", + "v4:128,128,8,64,64,16,8,1,1,2,0,0,1,1", + "v4:64,16,32,64,16,16,1,1,4,2,0,0,1,1", + "v4:128,64,8,32,64,32,16,1,1,2,0,0,1,1", + "v4:64,64,8,32,32,16,8,1,2,2,0,0,1,1", + "v4:32,64,16,32,32,32,1,1,1,2,0,0,1,1", + "v4:128,16,8,16,16,16,16,1,2,2,0,0,1,1", + "v4:128,32,8,32,32,16,16,1,1,2,0,0,1,1", + "v4:256,32,4,128,32,32,16,1,2,2,0,0,1,1", + "v4:256,64,4,256,32,16,16,1,1,2,0,0,1,1", + "v4:128,128,32,64,64,32,1,1,4,2,0,0,1,1", + "v4:128,64,8,32,64,32,8,1,2,2,0,0,1,1", + "v4:256,64,4,64,64,32,16,1,2,2,0,0,1,1", + "v4:64,128,8,16,64,16,16,1,2,2,0,0,1,1", + "v4:128,256,4,64,256,16,8,1,1,2,0,0,1,1", + "v4:128,32,8,64,32,16,8,1,3,2,0,0,1,1", + "v4:128,128,8,64,32,16,16,1,2,2,0,0,1,1", + "v4:256,128,4,128,32,32,16,1,1,2,0,0,1,1", + "v4:256,64,32,256,32,32,1,1,2,2,0,0,1,1", + "v4:128,256,4,128,128,16,8,1,1,2,0,0,1,1", + "v4:256,32,16,64,32,32,1,1,2,2,0,0,1,1", + "v4:128,64,4,32,32,16,16,1,1,2,0,0,1,1", + "v4:256,256,4,256,16,16,16,1,4,2,0,0,1,1", + "v4:256,64,32,64,32,32,1,1,2,2,0,0,1,1", + "v4:256,128,32,256,32,16,1,1,3,2,0,0,1,1", + "v4:64,192,4,32,96,16,8,1,2,2,0,0,1,1", + "v4:128,32,16,32,16,16,16,1,2,2,0,0,1,1", + "v4:256,16,8,16,16,16,8,1,2,0,0,64,1,1", + "v4:256,256,8,32,128,32,16,1,4,2,0,0,1,1", + "v4:256,80,8,16,80,16,8,1,2,0,2,64,1,1", + 
"v4:64,16,32,64,16,16,8,1,3,2,0,0,1,1" }; // END_CONV_XDL_i8_gfx942_DEFS // BEGIN_CONV_XDL_i8_gfx950_DEFS const StringRef PopulateParamsXDL::initParametersI8ConvGfx950[] = { - "v4:64,32,8,16,32,16,8,1,1,2,0,0,1,1", - "v4:32,32,16,32,32,32,1,1,1,2,0,0,1,1", - "v4:64,128,16,64,32,32,1,1,2,2,0,0,1,1", - "v4:128,16,8,64,16,16,16,1,2,2,0,0,1,1", - "v4:256,32,8,64,32,16,16,1,2,2,0,0,1,1", - "v4:64,64,32,32,32,16,1,1,1,2,0,0,1,1", - "v4:64,16,16,16,16,16,16,1,2,2,0,0,1,1", - "v4:32,16,32,16,16,16,16,1,2,2,0,0,1,1", - "v4:128,128,4,64,64,32,16,1,1,2,0,0,1,1", - "v4:256,64,4,128,32,32,16,1,2,2,0,0,1,1", + "v4:32,16,8,32,16,16,16,1,4,2,0,0,1,1", "v4:64,16,8,32,16,16,16,1,2,2,0,0,1,1", - "v4:64,256,4,64,64,32,8,1,2,2,0,0,1,1", - "v4:64,64,4,64,16,16,16,1,1,2,0,0,1,1", - "v4:64,32,4,32,16,16,16,1,2,2,0,0,1,1", + "v4:128,32,4,64,16,16,16,1,4,2,0,0,1,1", + "v4:128,128,8,128,16,16,8,1,4,2,0,0,1,1", + "v4:32,16,8,16,16,16,16,1,2,2,0,0,1,1", + "v4:32,64,16,32,32,32,8,1,3,2,0,0,1,1", + "v4:64,64,4,32,64,16,16,1,1,2,0,0,1,1", + "v4:128,32,4,128,32,16,16,1,2,2,0,0,1,1", + "v4:32,32,16,32,32,16,8,1,2,2,0,0,1,1", + "v4:64,128,32,32,32,32,1,1,4,2,0,0,1,1", + "v4:128,16,8,64,16,16,16,1,2,2,0,0,1,1", + "v4:64,32,4,64,32,16,8,1,2,2,0,0,1,1", + "v4:32,64,32,32,32,32,1,1,1,2,0,0,1,1", + "v4:128,32,4,64,32,16,8,1,1,2,0,0,1,1", + "v4:32,64,32,32,32,32,1,1,2,2,0,0,1,1", + "v4:64,256,4,64,64,16,8,1,2,2,0,0,1,1", + "v4:64,64,32,32,64,32,1,1,3,2,0,0,1,1", "v4:64,64,16,64,32,32,1,1,2,2,0,0,1,1", - "v4:16,32,32,16,16,16,16,1,2,2,0,0,1,1", - "v4:32,16,16,16,16,16,16,1,1,2,0,0,1,1", - "v4:32,16,16,16,16,16,8,1,2,2,0,0,1,1", - "v4:256,128,8,64,128,32,16,1,1,2,0,0,1,1", - "v4:64,256,4,64,64,32,8,1,1,2,0,0,1,1", - "v4:32,32,8,16,16,16,16,1,1,2,0,0,1,1" + "v4:32,256,4,32,64,32,8,1,1,2,0,0,1,1", + "v4:128,128,4,128,32,16,16,1,1,2,0,0,1,1", + "v4:256,64,4,64,64,16,16,1,2,2,0,0,1,1", + "v4:128,256,8,128,64,32,8,1,2,2,0,0,1,1", + "v4:64,64,8,64,64,32,8,1,2,2,0,0,1,1", + "v4:32,128,16,32,64,32,8,1,4,2,0,0,1,1", 
+ "v4:32,64,8,32,64,32,16,1,1,2,0,0,1,1", + "v4:128,128,8,32,64,16,16,1,2,2,0,0,1,1", + "v4:256,32,8,64,32,32,16,1,2,2,0,0,1,1", + "v4:256,64,8,64,32,32,16,1,2,2,0,0,1,1", + "v4:128,128,16,64,16,16,16,1,1,2,0,0,1,1", + "v4:128,32,32,32,32,16,16,1,1,2,0,0,1,1", + "v4:256,128,8,64,64,32,16,1,1,2,0,0,1,1", + "v4:32,128,4,16,128,16,8,1,3,2,0,0,1,1", + "v4:64,64,16,32,64,32,16,1,1,2,0,0,1,1", + "v4:128,256,4,32,128,32,16,1,2,2,0,0,1,1", + "v4:256,32,8,128,16,16,16,1,1,2,0,0,1,1", + "v4:256,64,4,64,32,32,16,1,2,2,0,0,1,1", + "v4:128,64,8,128,64,32,16,1,2,2,0,0,1,1", + "v4:256,16,4,16,16,16,16,1,2,0,1,4,1,1", + "v4:256,32,8,32,16,16,16,1,2,0,1,64,1,1", + "v4:256,80,8,16,80,16,8,1,2,0,4,32,1,1", + "v4:32,64,16,16,16,16,16,1,4,2,0,0,1,1", + "v4:32,64,16,16,32,16,16,1,4,2,0,0,1,1", + "v4:64,16,32,64,16,16,16,1,2,2,0,0,1,1" }; // END_CONV_XDL_i8_gfx950_DEFS @@ -1082,82 +1631,82 @@ const StringRef PopulateParamsXDL::initParametersI8ConvGfx950[] = { #ifdef XDL_DECLARATIONS_GEN // BEGIN_GEMM_XDL_f32_gfx908_DECS -static constexpr size_t nInitParametersF32GemmGfx908 = 8; +static constexpr size_t nInitParametersF32GemmGfx908 = 15; static const StringRef initParametersF32GemmGfx908[nInitParametersF32GemmGfx908]; // END_GEMM_XDL_f32_gfx908_DECS // BEGIN_GEMM_XDL_f32_gfx90a_DECS -static constexpr size_t nInitParametersF32GemmGfx90a = 7; +static constexpr size_t nInitParametersF32GemmGfx90a = 13; static const StringRef initParametersF32GemmGfx90a[nInitParametersF32GemmGfx90a]; // END_GEMM_XDL_f32_gfx90a_DECS // BEGIN_GEMM_XDL_f32_gfx942_DECS -static constexpr size_t nInitParametersF32GemmGfx942 = 9; +static constexpr size_t nInitParametersF32GemmGfx942 = 17; static const StringRef initParametersF32GemmGfx942[nInitParametersF32GemmGfx942]; // END_GEMM_XDL_f32_gfx942_DECS // BEGIN_GEMM_XDL_f32_gfx950_DECS -static constexpr size_t nInitParametersF32GemmGfx950 = 6; +static constexpr size_t nInitParametersF32GemmGfx950 = 10; static const StringRef 
initParametersF32GemmGfx950[nInitParametersF32GemmGfx950]; // END_GEMM_XDL_f32_gfx950_DECS // BEGIN_CONV_XDL_f32_gfx908_DECS -static constexpr size_t nInitParametersF32ConvGfx908 = 18; +static constexpr size_t nInitParametersF32ConvGfx908 = 39; static const StringRef initParametersF32ConvGfx908[nInitParametersF32ConvGfx908]; // END_CONV_XDL_f32_gfx908_DECS // BEGIN_CONV_XDL_f32_gfx90a_DECS -static constexpr size_t nInitParametersF32ConvGfx90a = 18; +static constexpr size_t nInitParametersF32ConvGfx90a = 47; static const StringRef initParametersF32ConvGfx90a[nInitParametersF32ConvGfx90a]; // END_CONV_XDL_f32_gfx90a_DECS // BEGIN_CONV_XDL_f32_gfx942_DECS -static constexpr size_t nInitParametersF32ConvGfx942 = 23; +static constexpr size_t nInitParametersF32ConvGfx942 = 60; static const StringRef initParametersF32ConvGfx942[nInitParametersF32ConvGfx942]; // END_CONV_XDL_f32_gfx942_DECS // BEGIN_CONV_XDL_f32_gfx950_DECS -static constexpr size_t nInitParametersF32ConvGfx950 = 33; +static constexpr size_t nInitParametersF32ConvGfx950 = 60; static const StringRef initParametersF32ConvGfx950[nInitParametersF32ConvGfx950]; // END_CONV_XDL_f32_gfx950_DECS // BEGIN_GEMM_XDL_f16_gfx908_DECS -static constexpr size_t nInitParametersF16GemmGfx908 = 17; +static constexpr size_t nInitParametersF16GemmGfx908 = 21; static const StringRef initParametersF16GemmGfx908[nInitParametersF16GemmGfx908]; // END_GEMM_XDL_f16_gfx908_DECS // BEGIN_GEMM_XDL_f16_gfx90a_DECS -static constexpr size_t nInitParametersF16GemmGfx90a = 17; +static constexpr size_t nInitParametersF16GemmGfx90a = 24; static const StringRef initParametersF16GemmGfx90a[nInitParametersF16GemmGfx90a]; // END_GEMM_XDL_f16_gfx90a_DECS // BEGIN_GEMM_XDL_f16_gfx942_DECS -static constexpr size_t nInitParametersF16GemmGfx942 = 21; +static constexpr size_t nInitParametersF16GemmGfx942 = 27; static const StringRef initParametersF16GemmGfx942[nInitParametersF16GemmGfx942]; // END_GEMM_XDL_f16_gfx942_DECS // 
BEGIN_GEMM_XDL_f16_gfx950_DECS -static constexpr size_t nInitParametersF16GemmGfx950 = 14; +static constexpr size_t nInitParametersF16GemmGfx950 = 28; static const StringRef initParametersF16GemmGfx950[nInitParametersF16GemmGfx950]; // END_GEMM_XDL_f16_gfx950_DECS // BEGIN_CONV_XDL_f16_gfx908_DECS -static constexpr size_t nInitParametersF16ConvGfx908 = 14; +static constexpr size_t nInitParametersF16ConvGfx908 = 58; static const StringRef initParametersF16ConvGfx908[nInitParametersF16ConvGfx908]; // END_CONV_XDL_f16_gfx908_DECS // BEGIN_CONV_XDL_f16_gfx90a_DECS -static constexpr size_t nInitParametersF16ConvGfx90a = 16; +static constexpr size_t nInitParametersF16ConvGfx90a = 71; static const StringRef initParametersF16ConvGfx90a[nInitParametersF16ConvGfx90a]; // END_CONV_XDL_f16_gfx90a_DECS // BEGIN_CONV_XDL_f16_gfx942_DECS -static constexpr size_t nInitParametersF16ConvGfx942 = 20; +static constexpr size_t nInitParametersF16ConvGfx942 = 87; static const StringRef initParametersF16ConvGfx942[nInitParametersF16ConvGfx942]; // END_CONV_XDL_f16_gfx942_DECS // BEGIN_CONV_XDL_f16_gfx950_DECS -static constexpr size_t nInitParametersF16ConvGfx950 = 26; +static constexpr size_t nInitParametersF16ConvGfx950 = 81; static const StringRef initParametersF16ConvGfx950[nInitParametersF16ConvGfx950]; // END_CONV_XDL_f16_gfx950_DECS @@ -1172,22 +1721,22 @@ static const StringRef initParametersFp8ConvGfx900[nInitParametersFp8ConvGfx900] // END_CONV_XDL_fp8_gfx900_DECS // BEGIN_GEMM_XDL_i8_gfx908_DECS -static constexpr size_t nInitParametersI8GemmGfx908 = 8; +static constexpr size_t nInitParametersI8GemmGfx908 = 19; static const StringRef initParametersI8GemmGfx908[nInitParametersI8GemmGfx908]; // END_GEMM_XDL_i8_gfx908_DECS // BEGIN_GEMM_XDL_i8_gfx90a_DECS -static constexpr size_t nInitParametersI8GemmGfx90a = 8; +static constexpr size_t nInitParametersI8GemmGfx90a = 25; static const StringRef initParametersI8GemmGfx90a[nInitParametersI8GemmGfx90a]; // END_GEMM_XDL_i8_gfx90a_DECS // 
BEGIN_GEMM_XDL_i8_gfx942_DECS -static constexpr size_t nInitParametersI8GemmGfx942 = 13; +static constexpr size_t nInitParametersI8GemmGfx942 = 22; static const StringRef initParametersI8GemmGfx942[nInitParametersI8GemmGfx942]; // END_GEMM_XDL_i8_gfx942_DECS // BEGIN_GEMM_XDL_i8_gfx950_DECS -static constexpr size_t nInitParametersI8GemmGfx950 = 14; +static constexpr size_t nInitParametersI8GemmGfx950 = 39; static const StringRef initParametersI8GemmGfx950[nInitParametersI8GemmGfx950]; // END_GEMM_XDL_i8_gfx950_DECS @@ -1197,22 +1746,22 @@ static const StringRef initParametersFp4GemmGfx950[nInitParametersFp4GemmGfx950] // END_GEMM_XDL_fp4_gfx950_DECS // BEGIN_CONV_XDL_i8_gfx908_DECS -static constexpr size_t nInitParametersI8ConvGfx908 = 8; +static constexpr size_t nInitParametersI8ConvGfx908 = 19; static const StringRef initParametersI8ConvGfx908[nInitParametersI8ConvGfx908]; // END_CONV_XDL_i8_gfx908_DECS // BEGIN_CONV_XDL_i8_gfx90a_DECS -static constexpr size_t nInitParametersI8ConvGfx90a = 9; +static constexpr size_t nInitParametersI8ConvGfx90a = 28; static const StringRef initParametersI8ConvGfx90a[nInitParametersI8ConvGfx90a]; // END_CONV_XDL_i8_gfx90a_DECS // BEGIN_CONV_XDL_i8_gfx942_DECS -static constexpr size_t nInitParametersI8ConvGfx942 = 11; +static constexpr size_t nInitParametersI8ConvGfx942 = 33; static const StringRef initParametersI8ConvGfx942[nInitParametersI8ConvGfx942]; // END_CONV_XDL_i8_gfx942_DECS // BEGIN_CONV_XDL_i8_gfx950_DECS -static constexpr size_t nInitParametersI8ConvGfx950 = 21; +static constexpr size_t nInitParametersI8ConvGfx950 = 43; static const StringRef initParametersI8ConvGfx950[nInitParametersI8ConvGfx950]; // END_CONV_XDL_i8_gfx950_DECS @@ -1242,32 +1791,6 @@ const StringRef PopulateParamsWmma::initParametersF16GemmGfx1000[] = { }; // END_GEMM_Wmma_f16_gfx1000_DEFS -// BEGIN_GEMM_Wmma_f16_gfx1100_DEFS -const StringRef PopulateParamsWmma::initParametersF16GemmGfx1100[] = { - "v4:128,64,4,128,16,16,8,1,1,2,0,0,1,1", - 
"v4:128,128,8,64,32,16,8,1,1,2,0,0,1,1", - "v4:32,16,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:16,32,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:32,32,8,32,16,16,8,1,1,2,0,0,1,1", - "v4:256,128,4,128,32,16,8,1,1,2,0,0,1,1", - "v4:128,128,2,128,32,16,8,1,2,2,0,0,1,1", - "v4:128,256,8,128,32,16,8,1,1,2,0,0,1,1", - "v4:32,256,8,32,32,16,8,1,1,2,0,0,1,1", - "v4:128,32,2,32,32,16,8,1,2,2,0,0,1,1", - "v4:16,16,4,16,16,16,4,1,1,2,0,0,1,1", - "v4:32,32,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:128,64,2,32,64,16,8,1,2,2,0,0,1,1", - "v4:32,32,8,32,32,16,16,1,1,2,0,0,1,1", - "v4:32,16,8,16,16,16,16,1,1,2,0,0,1,1", - "v4:16,32,8,16,16,16,16,1,1,2,0,0,1,1", - "v4:16,16,8,16,16,16,8,1,2,2,0,0,1,1", - "v4:64,64,4,16,16,16,8,1,1,2,0,0,1,1", - "v4:32,32,8,16,16,16,16,1,1,2,0,0,1,1", - "v4:32,16,8,16,16,16,8,1,2,2,0,0,1,1", - "v4:32,16,8,32,16,16,16,1,1,2,0,0,1,1" -}; -// END_GEMM_Wmma_f16_gfx1100_DEFS - // BEGIN_CONV_Wmma_f16_gfx1000_DEFS const StringRef PopulateParamsWmma::initParametersF16ConvGfx1000[] = { "v4:128,64,8,32,64,16,8,1,1,2,0,0,1,1", @@ -1299,38 +1822,6 @@ const StringRef PopulateParamsWmma::initParametersF16ConvGfx1000[] = { }; // END_CONV_Wmma_f16_gfx1000_DEFS -// BEGIN_CONV_Wmma_f16_gfx1100_DEFS -const StringRef PopulateParamsWmma::initParametersF16ConvGfx1100[] = { - "v4:64,128,8,32,32,16,8,1,1,2,0,0,1,1", - "v4:16,32,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:32,64,4,32,32,16,8,1,1,2,0,0,1,1", - "v4:256,64,8,32,32,16,8,1,1,2,0,0,1,1", - "v4:64,16,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:64,128,4,64,32,16,8,1,1,2,0,0,1,1", - "v4:32,32,8,32,16,16,8,1,2,2,0,0,1,1", - "v4:32,16,4,16,16,16,16,1,1,2,0,0,1,1", - "v4:64,32,4,64,16,16,8,1,1,2,0,0,1,1", - "v4:32,16,8,16,16,16,16,1,1,2,0,0,1,1", - "v4:16,16,8,16,16,16,8,1,2,2,0,0,1,1", - "v4:64,32,4,32,16,16,8,1,1,2,0,0,1,1", - "v4:128,64,2,32,64,16,8,1,2,2,0,0,1,1", - "v4:16,32,4,16,32,16,16,1,1,2,0,0,1,1", - "v4:64,64,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:128,256,2,32,64,16,8,1,1,2,0,0,1,1", - "v4:128,256,2,128,64,16,8,1,1,2,0,0,1,1", - 
"v4:128,128,8,16,64,16,8,1,1,2,0,0,1,1", - "v4:16,256,4,16,32,16,8,1,1,2,0,0,1,1", - "v4:128,128,2,64,32,16,16,1,1,2,0,0,1,1", - "v4:256,64,2,64,32,16,8,1,2,2,0,0,1,1", - "v4:32,128,8,16,16,16,8,1,2,2,0,0,1,1", - "v4:16,16,4,16,16,16,4,1,1,2,0,0,1,1", - "v4:64,256,2,32,64,16,8,1,2,2,0,0,1,1", - "v4:32,128,4,32,32,16,4,1,1,2,0,0,1,1", - "v4:32,256,2,32,64,16,8,1,2,2,0,0,1,1", - "v4:64,256,2,32,64,16,8,1,1,2,0,0,1,1" -}; -// END_CONV_Wmma_f16_gfx1100_DEFS - // BEGIN_GEMM_Wmma_fp8_gfx1000_DEFS const StringRef PopulateParamsWmma::initParametersFp8GemmGfx1000[] = { "v4:128,128,4,32,64,16,16,1,1,2,0,0,1,1", @@ -1400,20 +1891,6 @@ const StringRef PopulateParamsWmma::initParametersI8GemmGfx1000[] = { }; // END_GEMM_Wmma_i8_gfx1000_DEFS -// BEGIN_GEMM_Wmma_i8_gfx1100_DEFS -const StringRef PopulateParamsWmma::initParametersI8GemmGfx1100[] = { - "v4:64,16,8,16,16,16,16,1,1,2,0,0,1,1", - "v4:64,32,8,16,16,16,16,1,1,2,0,0,1,1", - "v4:128,32,4,64,16,16,16,1,1,2,0,0,1,1", - "v4:256,64,4,128,32,16,16,1,1,2,0,0,1,1", - "v4:128,32,2,32,32,16,16,1,1,2,0,0,1,1", - "v4:64,16,4,16,16,16,16,1,1,2,0,0,1,1", - "v4:32,16,4,16,16,16,8,1,2,2,0,0,1,1", - "v4:16,16,4,16,16,16,4,1,1,2,0,0,1,1", - "v4:128,64,8,16,16,16,4,1,2,2,0,0,1,1" -}; -// END_GEMM_Wmma_i8_gfx1100_DEFS - // BEGIN_CONV_Wmma_i8_gfx1000_DEFS const StringRef PopulateParamsWmma::initParametersI8ConvGfx1000[] = { "v4:128,64,8,32,64,16,16,1,1,2,0,0,1,1", @@ -1430,116 +1907,350 @@ const StringRef PopulateParamsWmma::initParametersI8ConvGfx1000[] = { }; // END_CONV_Wmma_i8_gfx1000_DEFS -// BEGIN_CONV_Wmma_i8_gfx1100_DEFS -const StringRef PopulateParamsWmma::initParametersI8ConvGfx1100[] = { - "v4:64,16,8,16,16,16,16,1,1,2,0,0,1,1", - "v4:16,64,8,16,16,16,16,1,1,2,0,0,1,1", - "v4:128,16,8,32,16,16,16,1,1,2,0,0,1,1", - "v4:256,16,8,16,16,16,16,1,1,2,0,0,1,1", - "v4:128,64,8,32,16,16,8,1,2,2,0,0,1,1", - "v4:64,128,8,16,32,16,16,1,2,2,0,0,1,1", - "v4:256,128,4,128,16,16,8,1,2,2,0,0,1,1", - "v4:256,64,8,16,64,16,8,1,1,2,0,0,1,1", - 
"v4:16,16,4,16,16,16,4,1,1,2,0,0,1,1", - "v4:32,128,2,32,32,16,8,1,1,2,0,0,1,1", - "v4:32,32,2,32,32,16,8,1,1,2,0,0,1,1", - "v4:128,128,4,128,128,16,4,1,2,2,0,0,1,1", - "v4:128,128,4,128,64,16,4,1,2,2,0,0,1,1" -}; -// END_CONV_Wmma_i8_gfx1100_DEFS - // BEGIN_GEMM_Wmma_f16_gfx1201_DEFS const StringRef PopulateParamsWmma::initParametersF16GemmGfx1201[] = { - "v4:128,64,4,64,32,16,8,1,1,2,0,0,1,1", - "v4:128,128,8,32,128,16,8,1,1,2,0,0,1,1", - "v4:32,64,8,16,32,16,8,1,2,2,0,0,1,1", - "v4:64,128,2,32,64,16,8,1,2,2,0,0,1,1", - "v4:16,32,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:128,256,4,64,64,16,8,1,2,2,0,0,1,1", - "v4:16,16,8,16,16,16,16,1,2,2,0,0,1,1", - "v4:128,128,4,128,32,16,8,1,1,2,0,0,1,1", - "v4:64,64,8,32,32,16,8,1,1,2,0,0,1,1", - "v4:32,64,8,32,16,16,8,1,1,2,0,0,1,1", - "v4:32,128,4,32,32,16,8,1,2,2,0,0,1,1", - "v4:256,64,4,32,64,16,8,1,1,2,0,0,1,1", + "v4:128,128,2,32,128,16,8,1,1,2,0,0,1,1", + "v4:128,128,4,32,64,16,8,1,1,2,0,0,1,1", + "v4:256,128,4,32,128,16,8,1,2,2,0,0,1,1", + "v4:128,128,4,128,32,16,8,1,2,2,0,0,1,1", + "v4:128,128,4,32,128,16,8,1,2,2,0,0,1,1", + "v4:64,64,4,32,64,16,8,1,1,2,0,0,1,1", + "v4:16,32,8,16,32,16,16,1,2,2,0,0,1,1", "v4:256,128,4,128,32,16,8,1,2,2,0,0,1,1", - "v4:64,64,8,64,32,16,8,1,1,2,0,0,1,1" + "v4:64,64,8,32,32,16,8,1,1,2,0,0,1,1", + "v4:128,128,4,64,64,16,4,1,1,2,0,0,1,1", + "v4:16,32,8,16,16,16,16,1,1,2,0,0,1,1", + "v4:16,128,8,16,32,16,8,1,2,2,0,0,1,1", + "v4:64,128,8,64,32,16,8,1,2,2,0,0,1,1", + "v4:96,32,8,48,32,16,8,1,1,2,0,0,1,1", + "v4:128,128,4,64,64,16,8,1,1,0,8,0,1,1", + "v4:80,64,8,80,16,16,8,1,1,2,0,0,1,1", + "v4:128,32,8,64,16,16,8,1,1,2,0,0,1,1", + "v4:16,64,8,16,16,16,16,1,1,2,0,0,1,1", + "v4:64,16,8,16,16,16,8,1,2,2,0,0,1,1", + "v4:64,16,8,64,16,16,16,1,2,2,0,0,1,1", + "v4:128,64,8,32,64,16,16,1,1,2,0,0,1,1", + "v4:16,64,4,16,64,16,16,1,1,2,0,0,1,1", + "v4:256,256,4,64,64,16,8,1,2,2,0,0,1,1", + "v4:128,128,2,128,32,16,16,1,2,2,0,0,1,1", + "v4:16,64,4,16,64,16,8,1,2,2,0,0,1,1", + 
"v4:16,32,4,16,32,16,16,1,2,2,0,0,1,1", + "v4:64,32,2,64,32,16,16,1,2,2,0,0,1,1", + "v4:64,64,4,64,64,16,16,1,1,2,0,0,1,1", + "v4:96,192,4,96,48,16,4,1,2,2,0,0,1,1", + "v4:128,128,2,64,128,16,8,1,1,2,0,0,1,1", + "v4:128,16,4,128,16,16,16,1,2,2,0,0,1,1", + "v4:128,256,4,64,128,16,4,1,2,2,0,0,1,1", + "v4:16,64,8,16,64,16,4,1,2,2,0,0,1,1", + "v4:192,64,4,48,32,16,8,1,2,2,0,0,1,1", + "v4:256,128,4,128,64,16,8,1,2,2,0,0,1,1", + "v4:256,256,4,256,32,16,4,1,2,2,0,0,1,1", + "v4:128,128,8,128,16,16,8,1,2,2,0,0,1,1", + "v4:128,160,8,32,160,16,4,1,1,2,0,0,1,1", + "v4:128,64,4,128,32,16,8,1,2,2,0,0,1,1", + "v4:16,128,4,16,128,16,4,1,1,2,0,0,1,1", + "v4:16,64,8,16,64,16,16,1,2,2,0,0,1,1", + "v4:256,128,2,128,64,16,16,1,2,2,0,0,1,1", + "v4:256,128,8,128,64,16,8,1,1,2,0,0,1,1", + "v4:256,256,8,128,32,16,4,1,2,2,0,0,1,1", + "v4:64,128,8,32,128,16,8,1,1,2,0,0,1,1", + "v4:64,256,2,64,128,16,16,1,1,2,0,0,1,1", + "v4:128,128,8,32,128,16,8,1,2,2,0,0,1,1", + "v4:128,32,2,64,32,16,16,1,2,2,0,0,1,1", + "v4:128,32,8,128,16,16,16,1,1,2,0,0,1,1", + "v4:16,96,4,16,96,16,16,1,2,2,0,0,1,1", + "v4:48,48,4,48,48,16,8,1,2,2,0,0,1,1", + "v4:64,16,8,64,16,16,4,1,2,2,0,0,1,1", + "v4:64,256,4,32,256,16,4,1,1,2,0,0,1,1", + "v4:64,256,8,64,64,16,8,1,1,2,0,0,1,1", + "v4:128,64,2,128,64,16,16,1,2,2,0,0,1,1", + "v4:32,64,8,16,32,16,8,1,2,1,0,16,1,1", + "v4:128,128,4,128,64,16,16,1,1,2,0,0,1,1", + "v4:128,128,8,128,64,16,16,1,2,2,0,0,1,1", + "v4:128,16,8,128,16,16,16,1,1,2,0,0,1,1", + "v4:192,96,4,48,96,16,16,1,1,2,0,0,1,1", + "v4:32,256,2,32,256,16,16,1,1,2,0,0,1,1", + "v4:64,128,8,32,128,16,16,1,1,2,0,0,1,1", + "v4:96,64,8,96,16,16,8,1,1,2,0,0,1,1", + "v4:240,32,8,240,16,16,8,1,2,2,0,0,1,1", + "v4:256,32,8,256,32,16,4,1,1,2,0,0,1,1" }; // END_GEMM_Wmma_f16_gfx1201_DEFS // BEGIN_GEMM_Wmma_i8_gfx1201_DEFS const StringRef PopulateParamsWmma::initParametersI8GemmGfx1201[] = { - "v4:128,64,4,64,32,16,16,1,1,2,0,0,1,1", - "v4:32,32,8,16,16,16,16,1,2,2,0,0,1,1", - "v4:128,128,4,64,64,16,16,1,1,2,0,0,1,1", - 
"v4:256,64,4,32,64,16,16,1,1,2,0,0,1,1", - "v4:128,32,4,32,32,16,16,1,1,2,0,0,1,1", - "v4:128,256,8,64,64,16,8,1,1,2,0,0,1,1", - "v4:64,32,8,32,16,16,16,1,1,2,0,0,1,1", + "v4:64,32,8,16,32,16,16,1,2,2,0,0,1,1", "v4:256,128,4,128,32,16,16,1,1,2,0,0,1,1", - "v4:16,128,8,16,32,16,16,1,1,2,0,0,1,1", + "v4:256,128,4,32,128,16,16,1,2,2,0,0,1,1", + "v4:192,64,4,96,32,16,16,1,1,2,0,0,1,1", "v4:256,128,8,64,64,16,8,1,2,2,0,0,1,1", - "v4:128,256,2,128,64,16,8,1,1,2,0,0,1,1", - "v4:256,256,8,64,32,16,8,1,2,2,0,0,1,1", - "v4:64,64,8,32,32,16,16,1,1,2,0,0,1,1" + "v4:16,32,8,16,32,16,16,1,1,2,0,0,1,1", + "v4:128,64,4,128,16,16,16,1,2,2,0,0,1,1", + "v4:128,128,8,128,32,16,8,1,2,2,0,0,1,1", + "v4:256,128,4,128,32,16,16,1,2,2,0,0,1,1", + "v4:256,128,4,64,64,16,16,1,2,2,0,0,1,1", + "v4:256,64,8,128,16,16,16,1,1,2,0,0,1,1", + "v4:128,128,4,32,64,16,16,1,1,2,0,0,1,1", + "v4:160,128,4,160,16,16,16,1,2,2,0,0,1,1", + "v4:256,128,4,64,32,16,16,1,2,2,0,0,1,1", + "v4:64,256,4,64,32,16,16,1,2,2,0,0,1,1", + "v4:128,64,4,64,64,16,16,1,1,2,0,0,1,1", + "v4:64,64,4,64,64,16,16,1,1,2,0,0,1,1", + "v4:128,128,8,16,128,16,16,1,2,2,0,0,1,1", + "v4:128,128,8,32,64,16,16,1,2,2,0,0,1,1", + "v4:128,256,4,64,64,16,16,1,2,2,0,0,1,1", + "v4:256,32,4,256,32,16,16,1,1,2,0,0,1,1", + "v4:64,64,8,32,64,16,16,1,2,2,0,0,1,1", + "v4:128,128,8,32,128,16,16,1,2,2,0,0,1,1", + "v4:128,16,8,128,16,16,16,1,2,2,0,0,1,1", + "v4:16,128,8,16,128,16,16,1,2,2,0,0,1,1", + "v4:160,256,8,160,32,16,8,1,1,2,0,0,1,1", + "v4:256,64,4,256,16,16,8,1,2,2,0,0,1,1", + "v4:64,128,8,64,128,16,16,1,1,2,0,0,1,1", + "v4:64,64,8,32,64,16,16,1,1,2,0,0,1,1", + "v4:128,16,4,64,16,16,16,1,2,2,0,0,1,1", + "v4:128,256,8,128,64,16,4,1,1,2,0,0,1,1", + "v4:128,256,8,32,256,16,4,1,1,2,0,0,1,1", + "v4:128,32,2,128,32,16,16,1,2,2,0,0,1,1", + "v4:128,32,4,128,32,16,8,1,2,2,0,0,1,1", + "v4:16,112,8,16,112,16,16,1,2,2,0,0,1,1", + "v4:192,192,8,48,192,16,16,1,1,2,0,0,1,1", + "v4:224,64,4,112,32,16,16,1,2,2,0,0,1,1", + "v4:256,128,2,64,128,16,16,1,2,2,0,0,1,1", 
+ "v4:256,128,4,64,128,16,16,1,2,2,0,0,1,1", + "v4:256,128,8,256,16,16,8,1,2,2,0,0,1,1", + "v4:256,16,4,128,16,16,16,1,2,2,0,0,1,1", + "v4:256,256,4,128,128,16,8,1,1,2,0,0,1,1", + "v4:256,32,2,256,32,16,8,1,2,2,0,0,1,1", + "v4:112,112,2,112,112,16,16,1,2,2,0,0,1,1", + "v4:128,256,2,32,256,16,8,1,1,2,0,0,1,1", + "v4:128,256,4,16,256,16,8,1,2,2,0,0,1,1", + "v4:16,80,4,16,80,16,16,1,1,2,0,0,1,1", + "v4:224,16,8,224,16,16,16,1,2,2,0,0,1,1", + "v4:240,192,4,240,48,16,16,1,2,2,0,0,1,1", + "v4:256,128,4,64,64,16,16,1,1,1,8,8,1,1", + "v4:256,128,8,32,128,16,4,1,2,2,0,0,1,1", + "v4:256,256,4,128,128,16,4,1,2,2,0,0,1,1", + "v4:256,256,8,64,128,16,16,1,1,2,0,0,1,1", + "v4:64,128,8,16,128,16,4,1,1,2,0,0,1,1", + "v4:64,64,8,64,64,16,8,1,2,2,0,0,1,1", + "v4:112,32,8,112,16,16,4,1,2,2,0,0,1,1", + "v4:192,32,8,96,16,16,16,1,2,1,0,8,1,1", + "v4:48,64,8,48,16,16,16,1,1,0,0,8,1,1", + "v4:64,256,8,64,256,16,16,1,1,2,0,0,1,1" }; // END_GEMM_Wmma_i8_gfx1201_DEFS // BEGIN_CONV_Wmma_f16_gfx1201_DEFS const StringRef PopulateParamsWmma::initParametersF16ConvGfx1201[] = { - "v4:128,128,4,128,32,16,8,1,1,2,0,0,1,1", - "v4:128,64,4,32,64,16,8,1,1,2,0,0,1,1", - "v4:128,64,8,32,64,16,8,1,1,2,0,0,1,1", - "v4:64,256,2,64,64,16,8,1,1,2,0,0,1,1", - "v4:128,32,8,32,32,16,8,1,1,2,0,0,1,1", - "v4:64,32,8,32,16,16,8,1,1,2,0,0,1,1", - "v4:128,128,2,128,32,16,8,1,1,2,0,0,1,1", + "v4:64,256,2,64,64,16,8,1,2,2,0,0,1,1", + "v4:160,128,8,80,32,16,8,1,1,2,0,0,1,1", + "v4:64,128,4,64,64,16,8,1,1,2,0,0,1,1", "v4:128,128,8,32,64,16,8,1,1,2,0,0,1,1", - "v4:256,128,4,64,64,16,8,1,1,2,0,0,1,1", - "v4:64,128,2,64,32,16,8,1,1,2,0,0,1,1", - "v4:256,64,4,128,32,16,8,1,1,2,0,0,1,1", + "v4:128,32,8,64,16,16,8,1,1,2,0,0,1,1", + "v4:256,128,8,64,64,16,8,1,1,2,0,0,1,1", + "v4:64,256,4,64,32,16,8,1,1,2,0,0,1,1", + "v4:192,256,8,96,32,16,8,1,1,2,0,0,1,1", "v4:128,16,8,32,16,16,8,1,1,2,0,0,1,1", - "v4:256,128,8,128,32,16,8,1,1,2,0,0,1,1", - "v4:32,64,8,32,16,16,8,1,2,2,0,0,1,1", - "v4:64,16,8,16,16,16,8,1,1,2,0,0,1,1", + 
"v4:64,32,8,32,16,16,8,1,1,2,0,0,1,1", + "v4:128,64,4,32,64,16,8,1,2,2,0,0,1,1", + "v4:64,128,2,64,64,16,16,1,1,2,0,0,1,1", + "v4:128,128,4,64,64,16,8,1,2,2,0,0,1,1", + "v4:128,256,2,128,32,16,8,1,1,2,0,0,1,1", + "v4:256,128,8,64,32,16,8,1,1,2,0,0,1,1", + "v4:64,64,8,32,32,16,8,1,2,2,0,0,1,1", + "v4:128,32,8,32,32,16,8,1,2,2,0,0,1,1", + "v4:192,256,4,192,32,16,8,1,1,2,0,0,1,1", + "v4:128,256,2,128,64,16,8,1,2,2,0,0,1,1", + "v4:64,128,4,32,128,16,4,1,1,2,0,0,1,1", + "v4:32,128,2,32,128,16,8,1,1,2,0,0,1,1", + "v4:64,128,8,32,64,16,8,1,1,2,0,0,1,1", + "v4:128,128,4,32,128,16,8,1,2,2,0,0,1,1", "v4:16,64,4,16,64,16,8,1,1,2,0,0,1,1", - "v4:32,128,4,32,32,16,8,1,2,2,0,0,1,1", - "v4:128,128,2,32,128,16,8,1,1,2,0,0,1,1", - "v4:64,128,4,16,128,16,8,1,1,2,0,0,1,1", + "v4:64,16,8,16,16,16,16,1,2,2,0,0,1,1", + "v4:64,128,8,64,32,16,8,1,1,2,0,0,1,1", + "v4:128,16,8,32,16,16,8,1,2,2,0,0,1,1", + "v4:32,256,2,32,32,16,8,1,2,2,0,0,1,1", + "v4:64,64,8,16,64,16,8,1,2,2,0,0,1,1", + "v4:128,128,4,128,32,16,8,1,2,2,0,0,1,1", + "v4:128,256,4,128,64,16,4,1,1,2,0,0,1,1", + "v4:256,128,8,32,128,16,8,1,1,2,0,0,1,1", + "v4:128,256,4,64,64,16,8,1,2,2,0,0,1,1", + "v4:256,128,4,32,128,16,8,1,1,2,0,0,1,1", + "v4:256,128,4,128,32,16,8,1,2,2,0,0,1,1", + "v4:64,128,8,16,64,16,8,1,2,2,0,0,1,1", "v4:16,64,8,16,16,16,8,1,2,2,0,0,1,1", - "v4:256,256,8,128,32,16,8,1,1,2,0,0,1,1", + "v4:32,64,8,32,16,16,8,1,2,2,0,0,1,1", "v4:32,64,8,16,16,16,8,1,2,2,0,0,1,1", - "v4:128,256,8,64,64,16,8,1,1,2,0,0,1,1", - "v4:32,128,2,32,128,16,8,1,1,2,0,0,1,1", - "v4:64,128,8,32,32,16,8,1,2,2,0,0,1,1", - "v4:64,16,8,16,16,16,16,1,1,2,0,0,1,1", - "v4:32,128,8,16,32,16,8,1,2,2,0,0,1,1", - "v4:32,64,8,32,32,16,8,1,2,2,0,0,1,1", - "v4:64,64,4,64,16,16,8,1,1,2,0,0,1,1", - "v4:16,32,4,16,16,16,8,1,2,2,0,0,1,1", - "v4:64,128,8,32,16,16,8,1,2,2,0,0,1,1", - "v4:32,16,8,16,16,16,16,1,1,2,0,0,1,1", - "v4:64,64,8,16,16,16,8,1,2,2,0,0,1,1", - "v4:32,32,4,16,16,16,16,1,2,2,0,0,1,1", - "v4:128,32,8,16,16,16,8,1,2,2,0,0,1,1", - 
"v4:16,16,8,16,16,16,16,1,2,2,0,0,1,1" + "v4:64,128,8,64,16,16,8,1,2,2,0,0,1,1", + "v4:128,256,4,128,32,16,8,1,2,2,0,0,1,1", + "v4:128,256,4,64,128,16,4,1,2,2,0,0,1,1", + "v4:256,64,2,64,64,16,16,1,1,2,0,0,1,1", + "v4:64,32,8,64,16,16,8,1,1,2,0,0,1,1", + "v4:256,128,4,64,64,16,8,1,2,2,0,0,1,1", + "v4:64,128,2,32,64,16,16,1,2,2,0,0,1,1", + "v4:128,32,8,32,16,16,8,1,2,2,0,0,1,1", + "v4:64,128,8,32,16,16,8,1,1,2,0,0,1,1", + "v4:128,128,2,128,32,16,16,1,2,2,0,0,1,1", + "v4:128,128,2,64,32,16,8,1,2,2,0,0,1,1", + "v4:128,64,8,32,32,16,8,1,2,2,0,0,1,1", + "v4:256,256,4,128,64,16,8,1,2,2,0,0,1,1", + "v4:256,256,4,64,128,16,4,1,2,2,0,0,1,1", + "v4:128,256,4,64,128,16,8,1,1,2,0,0,1,1", + "v4:16,64,4,16,64,16,16,1,1,2,0,0,1,1", + "v4:256,128,2,128,64,16,16,1,2,2,0,0,1,1", + "v4:32,48,8,16,48,16,8,1,2,2,0,0,1,1", + "v4:128,128,8,128,16,16,8,1,2,2,0,0,1,1", + "v4:128,256,8,32,128,16,4,1,2,2,0,0,1,1", + "v4:256,128,2,256,32,16,16,1,1,2,0,0,1,1", + "v4:256,128,4,64,128,16,4,1,2,2,0,0,1,1", + "v4:256,64,4,256,16,16,4,1,2,2,0,0,1,1", + "v4:128,128,4,32,128,16,16,1,1,2,0,0,1,1", + "v4:128,128,8,128,32,16,8,1,1,2,0,0,1,1", + "v4:128,256,4,128,64,16,8,1,2,2,0,0,1,1", + "v4:128,64,8,16,64,16,8,1,2,2,0,0,1,1", + "v4:128,64,8,32,64,16,8,1,2,2,0,0,1,1", + "v4:16,16,8,16,16,16,8,1,2,2,0,0,1,1", + "v4:160,80,2,160,80,16,8,1,1,2,0,0,1,1", + "v4:192,64,2,192,32,16,16,1,1,2,0,0,1,1", + "v4:256,32,8,32,32,16,8,1,1,2,0,0,1,1", + "v4:64,128,8,32,128,16,4,1,2,2,0,0,1,1", + "v4:64,32,4,64,32,16,8,1,2,2,0,0,1,1", + "v4:128,64,8,128,16,16,4,1,2,2,0,0,1,1", + "v4:16,32,4,16,32,16,16,1,2,2,0,0,1,1", + "v4:256,128,8,64,64,16,4,1,2,2,0,0,1,1", + "v4:256,256,8,64,128,16,8,1,1,2,0,0,1,1", + "v4:16,128,8,16,64,16,8,1,2,2,0,0,1,1", + "v4:256,64,2,128,32,16,16,1,2,2,0,0,1,1", + "v4:32,32,4,32,32,16,16,1,1,2,0,0,1,1", + "v4:64,256,4,64,128,16,4,1,2,2,0,0,1,1", + "v4:64,256,4,64,128,16,8,1,1,2,0,0,1,1", + "v4:128,256,8,64,128,16,4,1,1,2,0,0,1,1", + "v4:16,48,8,16,48,16,8,1,2,2,0,0,1,1", + 
"v4:256,128,2,32,128,16,16,1,2,2,0,0,1,1", + "v4:256,48,8,16,48,16,8,1,1,1,4,64,1,1", + "v4:32,16,8,32,16,16,8,1,2,2,0,0,1,1", + "v4:64,128,8,32,64,16,8,1,2,2,0,0,1,1", + "v4:80,64,8,80,16,16,8,1,1,0,2,16,1,1", + "v4:96,48,8,48,48,16,8,1,1,2,0,0,1,1", + "v4:128,64,4,128,64,16,4,1,1,2,0,0,1,1", + "v4:64,128,2,64,128,16,16,1,2,2,0,0,1,1", + "v4:64,64,8,32,32,16,16,1,2,2,0,0,1,1", + "v4:96,240,2,48,240,16,8,1,1,2,0,0,1,1", + "v4:128,160,8,32,160,16,4,1,1,2,0,0,1,1", + "v4:128,256,2,32,256,16,16,1,1,2,0,0,1,1", + "v4:16,256,4,16,256,16,4,1,1,2,0,0,1,1", + "v4:192,128,8,192,64,16,4,1,1,2,0,0,1,1", + "v4:256,64,8,64,64,16,8,1,1,2,0,0,1,1", + "v4:64,128,8,32,32,16,16,1,1,2,0,0,1,1", + "v4:64,16,4,64,16,16,8,1,2,2,0,0,1,1", + "v4:128,32,4,64,32,16,16,1,1,2,0,0,1,1", + "v4:192,96,8,48,48,16,8,1,1,2,0,0,1,1", + "v4:32,32,4,32,16,16,16,1,2,2,0,0,1,1", + "v4:64,128,2,64,32,16,8,1,2,1,0,4,1,1", + "v4:64,192,2,64,96,16,16,1,2,2,0,0,1,1", + "v4:96,32,8,48,32,16,8,1,2,2,0,0,1,1", + "v4:16,16,8,16,16,16,16,1,1,2,0,0,1,1", + "v4:192,256,2,192,64,16,8,1,1,2,0,0,1,1", + "v4:224,32,2,112,32,16,16,1,2,2,0,0,1,1", + "v4:32,240,4,16,240,16,16,1,2,2,0,0,1,1", + "v4:64,16,8,32,16,16,16,1,2,2,0,0,1,1", + "v4:64,32,4,64,16,16,16,1,2,2,0,0,1,1", + "v4:128,128,4,128,64,16,16,1,2,2,0,0,1,1", + "v4:128,128,8,64,64,16,8,1,1,0,8,32,1,1", + "v4:128,160,4,32,160,16,8,1,1,2,0,0,1,1", + "v4:16,80,8,16,80,16,16,1,1,2,0,0,1,1", + "v4:192,32,8,96,16,16,16,1,2,2,0,0,1,1", + "v4:192,32,8,96,16,16,8,1,2,2,0,0,1,1", + "v4:256,64,8,256,16,16,8,1,1,2,0,0,1,1", + "v4:64,128,8,32,128,16,16,1,1,2,0,0,1,1", + "v4:64,64,8,32,16,16,16,1,2,2,0,0,1,1", + "v4:96,64,4,48,16,16,16,1,2,2,0,0,1,1", + "v4:112,16,4,112,16,16,8,1,2,2,0,0,1,1", + "v4:128,128,4,128,64,16,16,1,1,2,0,0,1,1", + "v4:128,64,8,32,64,16,8,1,1,1,2,4,1,1", + "v4:128,64,8,64,64,16,16,1,1,2,0,0,1,1", + "v4:16,160,4,16,160,16,16,1,1,2,0,0,1,1", + "v4:160,160,2,80,160,16,16,1,1,0,4,4,1,1", + "v4:160,32,8,80,32,16,8,1,1,0,16,8,1,1", + 
"v4:32,128,8,32,16,16,8,1,2,1,0,4,1,1", + "v4:32,16,8,32,16,16,16,1,2,0,0,16,1,1", + "v4:32,256,4,32,256,16,8,1,1,2,0,0,1,1", + "v4:48,128,8,48,16,16,8,1,2,1,16,16,1,1", + "v4:64,128,8,16,64,16,16,1,1,2,0,0,1,1", + "v4:64,224,4,32,224,16,16,1,2,2,0,0,1,1", + "v4:64,256,2,64,128,16,8,1,1,1,0,4,1,1", + "v4:64,256,8,16,256,16,8,1,2,2,0,0,1,1", + "v4:64,64,8,16,16,16,8,1,2,0,2,16,1,1", + "v4:96,32,8,48,16,16,8,1,1,2,0,0,1,1", + "v4:96,32,8,96,32,16,16,1,2,2,0,0,1,1" }; // END_CONV_Wmma_f16_gfx1201_DEFS // BEGIN_CONV_Wmma_i8_gfx1201_DEFS const StringRef PopulateParamsWmma::initParametersI8ConvGfx1201[] = { - "v4:128,64,4,64,32,16,16,1,1,2,0,0,1,1", - "v4:256,64,4,32,64,16,16,1,2,2,0,0,1,1", - "v4:32,64,2,32,32,16,16,1,2,2,0,0,1,1", - "v4:64,16,8,16,16,16,16,1,1,2,0,0,1,1", + "v4:128,256,4,64,64,16,16,1,1,2,0,0,1,1", + "v4:128,64,4,32,64,16,16,1,2,2,0,0,1,1", + "v4:256,32,8,32,32,16,16,1,1,2,0,0,1,1", + "v4:256,64,8,32,64,16,16,1,1,2,0,0,1,1", "v4:64,16,8,16,16,16,16,1,2,2,0,0,1,1", - "v4:128,16,8,32,16,16,16,1,1,2,0,0,1,1", - "v4:128,128,4,64,64,16,8,1,2,2,0,0,1,1", - "v4:128,128,4,64,64,16,16,1,1,2,0,0,1,1", - "v4:256,32,8,64,32,16,16,1,1,2,0,0,1,1", - "v4:32,256,2,32,128,16,8,1,1,2,0,0,1,1", - "v4:64,128,2,64,32,16,8,1,1,2,0,0,1,1", - "v4:256,128,8,32,128,16,8,1,1,2,0,0,1,1" + "v4:128,64,4,128,32,16,8,1,2,2,0,0,1,1", + "v4:256,128,8,32,64,16,16,1,1,2,0,0,1,1", + "v4:64,128,2,64,128,16,8,1,1,2,0,0,1,1", + "v4:64,128,4,32,128,16,8,1,1,2,0,0,1,1", + "v4:128,256,8,128,64,16,4,1,1,2,0,0,1,1", + "v4:64,128,8,32,32,16,4,1,2,2,0,0,1,1", + "v4:64,64,8,64,64,16,8,1,1,2,0,0,1,1", + "v4:96,128,4,96,32,16,16,1,2,2,0,0,1,1", + "v4:128,128,8,32,64,16,16,1,2,2,0,0,1,1", + "v4:128,256,4,128,64,16,8,1,2,2,0,0,1,1", + "v4:256,64,4,256,16,16,8,1,2,2,0,0,1,1", + "v4:256,64,8,128,16,16,16,1,1,2,0,0,1,1", + "v4:128,16,4,128,16,16,16,1,1,2,0,0,1,1", + "v4:128,32,8,128,16,16,16,1,1,2,0,0,1,1", + "v4:192,64,2,192,32,16,16,1,1,2,0,0,1,1", + "v4:256,256,4,256,64,16,8,1,2,2,0,0,1,1", + 
"v4:32,128,4,32,64,16,16,1,1,2,0,0,1,1", + "v4:128,256,8,64,128,16,4,1,1,2,0,0,1,1", + "v4:128,32,8,64,32,16,16,1,2,2,0,0,1,1", + "v4:160,64,4,160,32,16,8,1,2,2,0,0,1,1", + "v4:256,128,2,64,128,16,16,1,2,2,0,0,1,1", + "v4:256,128,8,256,32,16,4,1,2,2,0,0,1,1", + "v4:256,128,8,64,64,16,8,1,2,2,0,0,1,1", + "v4:64,64,4,64,64,16,16,1,2,2,0,0,1,1", + "v4:128,16,4,128,16,16,16,1,2,2,0,0,1,1", + "v4:128,16,4,64,16,16,16,1,2,2,0,0,1,1", + "v4:128,16,8,64,16,16,16,1,2,2,0,0,1,1", + "v4:128,256,4,64,128,16,16,1,1,2,0,0,1,1", + "v4:128,32,4,128,32,16,16,1,1,2,0,0,1,1", + "v4:128,32,8,128,16,16,8,1,1,2,0,0,1,1", + "v4:256,128,4,64,64,16,16,1,2,2,0,0,1,1", + "v4:256,32,8,256,16,16,16,1,2,2,0,0,1,1", + "v4:256,64,2,256,64,16,8,1,2,2,0,0,1,1", + "v4:64,64,8,32,64,16,16,1,1,2,0,0,1,1", + "v4:128,128,8,64,64,16,16,1,1,2,0,0,1,1", + "v4:128,96,8,32,48,16,16,1,1,2,0,0,1,1", + "v4:256,16,8,64,16,16,16,1,1,2,0,0,1,1", + "v4:256,256,8,128,128,16,4,1,1,2,0,0,1,1", + "v4:32,128,8,16,64,16,16,1,2,2,0,0,1,1", + "v4:64,256,4,32,128,16,16,1,1,2,0,0,1,1", + "v4:96,192,8,96,96,16,8,1,2,2,0,0,1,1", + "v4:128,256,2,32,128,16,8,1,2,2,0,0,1,1", + "v4:128,256,8,64,128,16,8,1,1,2,0,0,1,1", + "v4:128,96,8,32,96,16,4,1,2,2,0,0,1,1", + "v4:16,16,8,16,16,16,16,1,2,2,0,0,1,1", + "v4:192,48,4,192,48,16,4,1,2,2,0,0,1,1", + "v4:256,128,4,64,128,16,8,1,2,2,0,0,1,1", + "v4:256,16,8,128,16,16,8,1,1,2,0,0,1,1", + "v4:128,16,4,128,16,16,8,1,2,2,0,0,1,1", + "v4:128,256,8,128,64,16,8,1,2,0,4,0,1,1", + "v4:16,128,8,16,128,16,8,1,1,2,0,0,1,1", + "v4:16,256,4,16,64,16,16,1,2,0,8,16,1,1", + "v4:256,128,8,128,64,16,16,1,1,2,0,0,1,1", + "v4:256,16,8,256,16,16,16,1,2,2,0,0,1,1", + "v4:256,256,4,64,256,16,16,1,1,2,0,0,1,1", + "v4:64,128,4,64,128,16,16,1,1,2,0,0,1,1", + "v4:64,256,2,64,256,16,16,1,1,2,0,0,1,1" }; // END_CONV_Wmma_i8_gfx1201_DEFS @@ -1681,139 +2392,351 @@ const StringRef PopulateParamsWmma::initParametersI8ConvGfx1151[] = { const StringRef PopulateParamsWmma::initParametersF16GemmGfx1150[] = { 
"v4:64,128,8,32,64,16,8,1,1,2,0,0,1,1", "v4:128,256,8,64,64,16,8,1,1,2,0,0,1,1", - "v4:128,256,4,128,32,16,8,1,1,2,0,0,1,1", + "v4:128,64,8,64,32,16,8,1,1,2,0,0,1,1", + "v4:64,256,8,64,32,16,8,1,1,2,0,0,1,1", + "v4:32,256,4,16,128,16,8,1,1,2,0,0,1,1", "v4:64,128,4,32,64,16,8,1,1,2,0,0,1,1", - "v4:64,64,4,64,64,16,8,1,1,2,0,0,1,1", - "v4:64,256,8,16,128,16,8,1,1,2,0,0,1,1", - "v4:16,16,8,16,16,16,16,1,2,2,0,0,1,1", - "v4:16,16,8,16,16,16,8,1,2,2,0,0,1,1", - "v4:128,64,8,32,32,16,8,1,1,2,0,0,1,1", - "v4:128,256,2,128,32,16,8,1,1,2,0,0,1,1", - "v4:64,64,8,32,64,16,8,1,1,2,0,0,1,1", "v4:32,16,8,16,16,16,16,1,2,2,0,0,1,1", - "v4:32,32,8,16,16,16,8,1,2,2,0,0,1,1", - "v4:256,128,4,128,64,16,4,1,1,2,0,0,1,1", - "v4:32,64,4,32,64,16,16,1,1,2,0,0,1,1", - "v4:256,128,4,64,64,16,16,1,1,2,0,0,1,1", - "v4:16,16,4,16,16,16,4,1,2,2,0,0,1,1", - "v4:128,64,4,32,32,16,4,1,1,2,0,0,1,1", - "v4:64,32,4,16,32,16,4,1,1,2,0,0,1,1", + "v4:64,32,4,64,32,16,8,1,1,2,0,0,1,1", + "v4:64,32,8,32,16,16,8,1,2,2,0,0,1,1", + "v4:32,64,8,32,32,16,8,1,2,2,0,0,1,1", + "v4:128,128,8,64,32,16,8,1,1,2,0,0,1,1", + "v4:128,128,2,128,32,16,8,1,1,2,0,0,1,1", + "v4:256,128,2,128,64,16,8,1,1,2,0,0,1,1", + "v4:128,256,4,128,32,16,16,1,1,2,0,0,1,1", + "v4:256,256,2,128,32,16,8,1,2,2,0,0,1,1", + "v4:256,256,2,256,32,16,8,1,1,2,0,0,1,1", "v4:128,128,8,64,32,16,16,1,1,2,0,0,1,1", - "v4:256,64,8,128,32,16,8,1,1,2,0,0,1,1" + "v4:256,128,4,128,32,16,16,1,1,2,0,0,1,1", + "v4:16,128,8,16,128,16,16,1,2,2,0,0,1,1", + "v4:256,64,8,64,32,16,8,1,1,2,0,0,1,1", + "v4:16,128,8,16,128,16,16,1,1,2,0,0,1,1" }; // END_GEMM_Wmma_f16_gfx1150_DEFS // BEGIN_GEMM_Wmma_i8_gfx1150_DEFS const StringRef PopulateParamsWmma::initParametersI8GemmGfx1150[] = { + "v4:64,128,4,32,64,16,16,1,1,2,0,0,1,1", "v4:128,256,4,128,32,16,16,1,1,2,0,0,1,1", - "v4:128,64,2,64,32,16,16,1,2,2,0,0,1,1", - "v4:16,16,8,16,16,16,16,1,2,2,0,0,1,1", - "v4:128,64,8,128,16,16,16,1,1,2,0,0,1,1", - "v4:128,128,4,128,32,16,16,1,1,2,0,0,1,1", - 
"v4:32,256,2,32,64,16,16,1,1,2,0,0,1,1", - "v4:32,64,4,32,16,16,16,1,2,2,0,0,1,1", - "v4:64,128,2,64,32,16,16,1,2,2,0,0,1,1", - "v4:256,256,4,128,32,16,16,1,1,2,0,0,1,1", - "v4:32,64,8,16,64,16,16,1,1,2,0,0,1,1", - "v4:128,128,8,32,64,16,16,1,1,2,0,0,1,1", + "v4:128,128,4,64,64,16,16,1,1,2,0,0,1,1", + "v4:64,128,8,32,64,16,16,1,1,2,0,0,1,1", + "v4:256,64,4,256,16,16,16,1,1,2,0,0,1,1", + "v4:128,64,2,128,32,16,16,1,1,2,0,0,1,1", + "v4:256,64,8,128,16,16,16,1,1,2,0,0,1,1", + "v4:64,16,8,32,16,16,16,1,2,2,0,0,1,1", + "v4:32,64,4,32,64,16,8,1,1,2,0,0,1,1", + "v4:32,64,8,16,64,16,8,1,1,2,0,0,1,1", + "v4:64,256,2,64,64,16,16,1,1,2,0,0,1,1", + "v4:256,256,2,128,32,16,16,1,2,2,0,0,1,1", + "v4:128,64,8,16,16,16,16,1,1,2,0,0,1,1", + "v4:128,64,8,16,32,16,16,1,2,2,0,0,1,1", "v4:64,256,8,16,128,16,16,1,1,2,0,0,1,1", - "v4:64,64,4,64,64,16,16,1,1,2,0,0,1,1", - "v4:256,32,4,128,16,16,16,1,1,2,0,0,1,1", - "v4:16,16,4,16,16,16,4,1,2,2,0,0,1,1", - "v4:256,64,4,128,16,16,16,1,1,2,0,0,1,1" + "v4:16,256,8,16,256,16,16,1,1,2,0,0,1,1", + "v4:16,256,8,16,256,16,16,1,2,2,0,0,1,1", + "v4:256,256,2,128,256,16,8,1,1,2,0,0,1,1", + "v4:256,256,2,256,128,16,16,1,2,2,0,0,1,1", + "v4:256,256,2,256,128,16,8,1,1,2,0,0,1,1", + "v4:256,256,4,128,256,16,16,1,2,2,0,0,1,1", + "v4:256,256,4,256,128,16,16,1,2,2,0,0,1,1" }; // END_GEMM_Wmma_i8_gfx1150_DEFS // BEGIN_CONV_Wmma_f16_gfx1150_DEFS const StringRef PopulateParamsWmma::initParametersF16ConvGfx1150[] = { - "v4:64,128,4,64,32,16,8,1,1,2,0,0,1,1", - "v4:128,64,8,128,16,16,8,1,1,2,0,0,1,1", - "v4:128,64,4,64,32,16,8,1,1,2,0,0,1,1", - "v4:64,256,2,64,64,16,8,1,1,2,0,0,1,1", "v4:128,128,4,128,32,16,8,1,1,2,0,0,1,1", - "v4:128,128,8,16,128,16,8,1,1,2,0,0,1,1", - "v4:64,256,4,64,32,16,8,1,1,2,0,0,1,1", - "v4:128,64,8,32,32,16,8,1,1,2,0,0,1,1", - "v4:32,64,4,32,64,16,4,1,1,2,0,0,1,1", - "v4:128,256,4,128,32,16,8,1,1,2,0,0,1,1", - "v4:128,128,2,128,32,16,16,1,1,2,0,0,1,1", - "v4:256,128,8,128,32,16,8,1,1,2,0,0,1,1", - "v4:128,64,2,128,32,16,8,1,1,2,0,0,1,1", 
- "v4:256,256,8,128,32,16,8,1,1,2,0,0,1,1", - "v4:64,128,8,64,32,16,8,1,1,2,0,0,1,1", - "v4:64,128,8,64,16,16,8,1,1,2,0,0,1,1", - "v4:32,128,4,32,64,16,8,1,1,2,0,0,1,1", - "v4:128,128,8,32,64,16,8,1,1,2,0,0,1,1", - "v4:256,128,8,64,32,16,8,1,1,2,0,0,1,1", - "v4:32,128,2,32,128,16,8,1,1,2,0,0,1,1", - "v4:64,128,2,64,32,16,8,1,2,2,0,0,1,1", - "v4:256,64,4,64,64,16,8,1,1,2,0,0,1,1", - "v4:64,128,4,64,16,16,8,1,2,2,0,0,1,1", - "v4:32,128,2,32,32,16,8,1,2,2,0,0,1,1", - "v4:128,64,4,128,32,16,4,1,1,2,0,0,1,1", - "v4:64,256,4,64,16,16,8,1,2,2,0,0,1,1", - "v4:256,128,8,64,64,16,8,1,1,2,0,0,1,1", - "v4:64,32,4,32,32,16,8,1,1,2,0,0,1,1", - "v4:16,128,4,16,128,16,8,1,1,2,0,0,1,1", - "v4:32,256,2,32,128,16,8,1,1,2,0,0,1,1", - "v4:128,64,2,32,32,16,8,1,1,2,0,0,1,1", - "v4:256,64,8,64,32,16,8,1,1,2,0,0,1,1", - "v4:32,32,4,32,16,16,8,1,2,2,0,0,1,1", - "v4:256,64,2,32,64,16,8,1,1,2,0,0,1,1", - "v4:64,64,8,16,64,16,8,1,2,2,0,0,1,1", "v4:64,32,8,32,32,16,8,1,1,2,0,0,1,1", - "v4:16,32,8,16,32,16,8,1,1,2,0,0,1,1", - "v4:64,64,8,16,32,16,8,1,2,2,0,0,1,1", - "v4:16,32,4,16,32,16,8,1,2,2,0,0,1,1", - "v4:128,256,8,128,32,16,8,1,1,2,0,0,1,1", - "v4:64,128,8,16,128,16,8,1,2,2,0,0,1,1", - "v4:32,32,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:256,32,4,64,32,16,8,1,1,2,0,0,1,1", - "v4:32,64,8,16,16,16,8,1,2,2,0,0,1,1", - "v4:256,128,8,32,64,16,8,1,1,2,0,0,1,1", - "v4:32,128,8,32,32,16,8,1,2,2,0,0,1,1", - "v4:32,16,4,16,16,16,8,1,1,2,0,0,1,1", + "v4:64,64,2,64,64,16,8,1,1,2,0,0,1,1", + "v4:128,64,4,128,32,16,8,1,1,2,0,0,1,1", + "v4:128,32,8,32,32,16,8,1,1,2,0,0,1,1", + "v4:64,128,8,32,64,16,8,1,1,2,0,0,1,1", + "v4:128,64,8,32,64,16,8,1,1,2,0,0,1,1", + "v4:128,64,4,64,32,16,16,1,1,2,0,0,1,1", + "v4:128,128,2,128,32,16,16,1,1,2,0,0,1,1", + "v4:128,64,2,128,32,16,16,1,1,2,0,0,1,1", + "v4:64,128,4,64,32,16,8,1,1,2,0,0,1,1", + "v4:128,128,2,64,64,16,8,1,2,2,0,0,1,1", + "v4:32,128,4,32,32,16,8,1,1,2,0,0,1,1", + "v4:256,64,2,64,64,16,16,1,1,2,0,0,1,1", + "v4:32,64,2,32,64,16,8,1,1,2,0,0,1,1", + 
"v4:32,128,8,16,32,16,8,1,1,2,0,0,1,1", + "v4:256,64,4,128,32,16,8,1,1,2,0,0,1,1", + "v4:64,64,8,32,32,16,4,1,2,2,0,0,1,1", + "v4:16,128,8,16,32,16,8,1,1,2,0,0,1,1", + "v4:256,128,4,128,32,16,8,1,1,2,0,0,1,1", + "v4:64,128,2,32,64,16,8,1,1,2,0,0,1,1", + "v4:256,128,2,64,64,16,8,1,2,2,0,0,1,1", + "v4:128,256,4,64,64,16,8,1,1,2,0,0,1,1", + "v4:128,32,8,128,16,16,8,1,1,2,0,0,1,1", + "v4:64,16,8,32,16,16,8,1,1,2,0,0,1,1", + "v4:16,128,4,16,32,16,8,1,1,2,0,0,1,1", + "v4:64,256,2,64,64,16,16,1,1,2,0,0,1,1", + "v4:128,128,4,128,64,16,4,1,1,2,0,0,1,1", + "v4:32,128,2,32,32,16,8,1,1,2,0,0,1,1", + "v4:64,128,2,64,64,16,16,1,1,2,0,0,1,1", + "v4:32,128,8,32,16,16,8,1,2,2,0,0,1,1", + "v4:256,128,4,256,32,16,4,1,1,2,0,0,1,1", + "v4:128,128,8,64,64,16,4,1,1,2,0,0,1,1", "v4:32,32,8,16,16,16,8,1,2,2,0,0,1,1", - "v4:16,64,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:16,32,4,16,16,16,4,1,1,2,0,0,1,1", - "v4:256,32,8,32,32,16,8,1,1,2,0,0,1,1", - "v4:16,128,8,16,64,16,8,1,1,2,0,0,1,1", - "v4:64,32,8,32,16,16,16,1,1,2,0,0,1,1", - "v4:16,16,4,16,16,16,4,1,1,2,0,0,1,1", - "v4:128,32,4,16,32,16,16,1,1,2,0,0,1,1", - "v4:16,16,8,16,16,16,16,1,1,2,0,0,1,1", - "v4:128,16,4,32,16,16,8,1,2,2,0,0,1,1", - "v4:64,16,8,32,16,16,16,1,1,2,0,0,1,1", - "v4:32,16,8,16,16,16,16,1,1,2,0,0,1,1" + "v4:128,128,2,128,64,16,16,1,1,2,0,0,1,1", + "v4:32,16,8,16,16,16,16,1,1,2,0,0,1,1", + "v4:128,256,8,64,64,16,8,1,1,2,0,0,1,1", + "v4:128,64,8,32,32,16,4,1,1,2,0,0,1,1", + "v4:256,128,8,256,16,16,8,1,1,2,0,0,1,1", + "v4:64,64,8,16,16,16,8,1,2,2,0,0,1,1", + "v4:16,64,8,16,32,16,8,1,2,2,0,0,1,1", + "v4:256,64,4,128,32,16,4,1,1,2,0,0,1,1", + "v4:128,256,2,32,256,16,8,1,1,2,0,0,1,1", + "v4:64,16,8,16,16,16,16,1,1,2,0,0,1,1", + "v4:64,128,2,64,128,16,8,1,1,2,0,0,1,1", + "v4:256,128,8,256,32,16,8,1,1,2,0,0,1,1", + "v4:16,128,8,16,128,16,16,1,1,2,0,0,1,1", + "v4:128,32,8,128,32,16,16,1,2,2,0,0,1,1" }; // END_CONV_Wmma_f16_gfx1150_DEFS // BEGIN_CONV_Wmma_i8_gfx1150_DEFS const StringRef 
PopulateParamsWmma::initParametersI8ConvGfx1150[] = { - "v4:128,32,4,32,32,16,16,1,2,2,0,0,1,1", - "v4:16,128,4,16,128,16,4,1,1,2,0,0,1,1", - "v4:256,32,2,32,32,16,16,1,1,2,0,0,1,1", - "v4:64,128,4,32,64,16,8,1,1,2,0,0,1,1", - "v4:256,64,2,64,64,16,16,1,2,2,0,0,1,1", - "v4:64,64,4,16,64,16,16,1,1,2,0,0,1,1", - "v4:128,256,2,128,64,16,8,1,1,2,0,0,1,1", - "v4:128,64,8,128,32,16,4,1,2,2,0,0,1,1", - "v4:64,64,8,16,64,16,16,1,1,2,0,0,1,1", - "v4:128,64,4,64,64,16,4,1,1,2,0,0,1,1", - "v4:128,64,8,64,16,16,16,1,1,2,0,0,1,1", - "v4:128,128,2,128,32,16,8,1,2,2,0,0,1,1", - "v4:256,64,8,64,64,16,8,1,1,2,0,0,1,1", - "v4:256,32,4,64,32,16,16,1,1,2,0,0,1,1", - "v4:256,32,8,16,32,16,16,1,1,2,0,0,1,1", - "v4:32,256,4,32,128,16,4,1,2,2,0,0,1,1", + "v4:256,64,4,128,32,16,16,1,1,2,0,0,1,1", + "v4:256,64,8,64,32,16,16,1,1,2,0,0,1,1", + "v4:64,32,8,16,32,16,16,1,1,2,0,0,1,1", + "v4:128,128,4,128,32,16,4,1,1,2,0,0,1,1", + "v4:64,32,4,64,32,16,4,1,1,2,0,0,1,1", + "v4:64,32,8,16,32,16,16,1,2,2,0,0,1,1", + "v4:64,128,8,32,16,16,16,1,2,2,0,0,1,1", "v4:64,128,2,32,64,16,16,1,2,2,0,0,1,1", - "v4:128,16,4,16,16,16,8,1,1,2,0,0,1,1", - "v4:16,16,8,16,16,16,8,1,1,2,0,0,1,1", - "v4:256,16,8,16,16,16,16,1,1,2,0,0,1,1", - "v4:256,32,8,128,32,16,16,1,1,2,0,0,1,1" + "v4:32,16,8,16,16,16,16,1,1,2,0,0,1,1", + "v4:64,128,4,32,32,16,8,1,1,2,0,0,1,1", + "v4:256,128,4,128,64,16,8,1,1,2,0,0,1,1", + "v4:256,32,8,32,32,16,16,1,1,2,0,0,1,1", + "v4:64,128,4,64,128,16,4,1,1,2,0,0,1,1", + "v4:256,16,8,32,16,16,16,1,1,2,0,0,1,1", + "v4:16,128,8,16,128,16,16,1,2,2,0,0,1,1", + "v4:16,256,4,16,256,16,16,1,1,2,0,0,1,1" }; // END_CONV_Wmma_i8_gfx1150_DEFS +// BEGIN_GEMM_Wmma_f16_gfx1101_DEFS +const StringRef PopulateParamsWmma::initParametersF16GemmGfx1101[] = { + "v4:128,64,4,64,32,16,8,1,1,2,0,0,1,1", + "v4:64,128,4,64,32,16,8,1,1,2,0,0,1,1", + "v4:64,128,4,32,64,16,8,1,1,2,0,0,1,1", + "v4:32,32,8,16,16,16,8,1,1,2,0,0,1,1", + "v4:16,64,8,16,16,16,8,1,2,2,0,0,1,1", + "v4:16,16,8,16,16,16,16,1,2,2,0,0,1,1", + 
"v4:128,128,4,64,64,16,8,1,1,2,0,0,1,1", + "v4:32,64,8,32,16,16,8,1,2,2,0,0,1,1", + "v4:16,96,8,16,48,16,8,1,1,2,0,0,1,1", + "v4:192,256,8,96,32,16,8,1,1,2,0,0,1,1", + "v4:256,128,8,32,128,16,8,1,1,2,0,0,1,1", + "v4:32,256,2,32,64,16,8,1,1,2,0,0,1,1", + "v4:128,32,4,32,32,16,8,1,1,2,0,0,1,1", + "v4:64,256,2,64,64,16,8,1,1,2,0,0,1,1", + "v4:32,128,2,32,64,16,8,1,2,2,0,0,1,1", + "v4:16,256,4,16,64,16,8,1,2,2,0,0,1,1", + "v4:128,16,8,64,16,16,8,1,1,2,0,0,1,1", + "v4:64,256,2,32,128,16,8,1,2,2,0,0,1,1", + "v4:16,128,8,16,32,16,16,1,1,2,0,0,1,1", + "v4:96,128,2,96,32,16,8,1,2,2,0,0,1,1", + "v4:224,64,4,112,32,16,8,1,1,0,1,8,1,1", + "v4:96,48,8,48,48,16,8,1,1,0,16,8,1,1", + "v4:128,128,8,64,64,16,8,1,1,0,8,32,1,1", + "v4:128,64,2,32,64,16,16,1,2,2,0,0,1,1", + "v4:256,32,8,64,32,16,8,1,1,2,0,0,1,1", + "v4:64,256,4,16,256,16,16,1,1,2,0,0,1,1", + "v4:128,128,8,64,64,16,8,1,1,0,16,0,1,1", + "v4:16,80,8,16,80,16,16,1,1,2,0,0,1,1", + "v4:192,64,8,48,32,16,8,1,1,1,4,0,1,1", + "v4:224,256,8,112,32,16,8,1,1,2,0,0,1,1", + "v4:224,64,8,112,32,16,8,1,1,1,8,64,1,1", + "v4:256,256,8,32,64,16,8,1,1,2,0,0,1,1", + "v4:256,64,8,64,64,16,4,1,1,2,0,0,1,1", + "v4:48,96,8,48,48,16,4,1,1,0,16,0,1,1", + "v4:64,128,8,32,128,16,8,1,1,2,0,0,1,1" +}; +// END_GEMM_Wmma_f16_gfx1101_DEFS + +// BEGIN_GEMM_Wmma_i8_gfx1101_DEFS +const StringRef PopulateParamsWmma::initParametersI8GemmGfx1101[] = { + "v4:48,64,4,48,16,16,16,1,1,2,0,0,1,1", + "v4:64,64,8,16,32,16,16,1,2,2,0,0,1,1", + "v4:128,64,4,32,64,16,16,1,1,2,0,0,1,1", + "v4:32,16,8,16,16,16,16,1,2,2,0,0,1,1", + "v4:16,128,8,16,16,16,16,1,2,2,0,0,1,1", + "v4:64,64,8,32,32,16,16,1,1,2,0,0,1,1", + "v4:128,128,2,32,64,16,16,1,1,2,0,0,1,1", + "v4:128,96,2,32,48,16,16,1,1,2,0,0,1,1", + "v4:64,128,2,64,64,16,16,1,1,2,0,0,1,1", + "v4:32,128,4,32,64,16,16,1,1,2,0,0,1,1", + "v4:144,64,4,144,16,16,8,1,1,2,0,0,1,1", + "v4:64,128,2,32,128,16,16,1,2,2,0,0,1,1", + "v4:64,256,8,16,128,16,8,1,2,2,0,0,1,1", + "v4:192,128,4,192,16,16,16,1,1,2,0,0,1,1", + 
"v4:256,32,8,128,16,16,16,1,1,2,0,0,1,1", + "v4:64,64,8,64,32,16,16,1,1,2,0,0,1,1", + "v4:256,32,4,32,32,16,16,1,1,0,16,16,1,1", + "v4:64,128,4,32,128,16,16,1,1,0,4,4,1,1" +}; +// END_GEMM_Wmma_i8_gfx1101_DEFS + +// BEGIN_CONV_Wmma_f16_gfx1101_DEFS +const StringRef PopulateParamsWmma::initParametersF16ConvGfx1101[] = { + "v4:64,128,4,64,32,16,8,1,1,2,0,0,1,1", + "v4:64,16,8,16,16,16,8,1,1,2,0,0,1,1", + "v4:64,128,2,64,32,16,8,1,1,2,0,0,1,1", + "v4:32,64,4,32,32,16,8,1,1,2,0,0,1,1", + "v4:64,64,8,32,32,16,8,1,1,2,0,0,1,1", + "v4:128,64,4,128,16,16,8,1,1,2,0,0,1,1", + "v4:48,64,8,48,16,16,8,1,1,2,0,0,1,1", + "v4:128,32,8,16,32,16,8,1,1,2,0,0,1,1", + "v4:64,128,2,64,32,16,8,1,2,2,0,0,1,1", + "v4:64,128,2,64,64,16,8,1,1,2,0,0,1,1", + "v4:16,16,8,16,16,16,8,1,1,2,0,0,1,1", + "v4:128,128,2,128,32,16,8,1,1,2,0,0,1,1", + "v4:16,64,4,16,64,16,8,1,1,2,0,0,1,1", + "v4:32,64,8,32,16,16,8,1,1,2,0,0,1,1", + "v4:128,128,8,64,32,16,8,1,1,2,0,0,1,1", + "v4:16,32,8,16,16,16,8,1,1,2,0,0,1,1", + "v4:64,32,2,32,32,16,8,1,2,2,0,0,1,1", + "v4:96,64,4,48,16,16,8,1,2,2,0,0,1,1", + "v4:128,64,4,32,64,16,8,1,2,2,0,0,1,1", + "v4:128,64,8,64,32,16,8,1,1,2,0,0,1,1", + "v4:64,64,8,16,64,16,8,1,2,2,0,0,1,1", + "v4:224,64,4,112,32,16,8,1,1,2,0,0,1,1", + "v4:64,128,8,64,32,16,8,1,1,2,0,0,1,1", + "v4:64,128,4,16,128,16,8,1,1,2,0,0,1,1", + "v4:64,128,8,32,64,16,8,1,1,2,0,0,1,1", + "v4:128,128,4,128,32,16,8,1,1,2,0,0,1,1", + "v4:192,64,4,48,32,16,8,1,2,2,0,0,1,1", + "v4:64,16,8,16,16,16,8,1,1,0,16,32,1,1", + "v4:128,128,2,64,64,16,8,1,1,2,0,0,1,1", + "v4:80,64,8,80,16,16,8,1,1,2,0,0,1,1", + "v4:256,64,4,128,16,16,8,1,1,2,0,0,1,1", + "v4:32,64,8,16,16,16,8,1,2,2,0,0,1,1", + "v4:64,256,8,16,128,16,8,1,1,2,0,0,1,1", + "v4:32,32,8,32,32,16,4,1,2,2,0,0,1,1", + "v4:16,32,4,16,32,16,16,1,2,2,0,0,1,1", + "v4:32,128,4,32,128,16,4,1,1,2,0,0,1,1", + "v4:128,32,8,64,32,16,8,1,1,2,0,0,1,1", + "v4:256,64,8,64,32,16,8,1,1,2,0,0,1,1", + "v4:16,16,8,16,16,16,16,1,2,2,0,0,1,1", + "v4:256,48,8,16,48,16,8,1,1,2,0,0,1,1", 
+ "v4:192,96,2,48,96,16,8,1,1,2,0,0,1,1", + "v4:64,256,2,32,128,16,8,1,2,2,0,0,1,1", + "v4:64,256,8,64,32,16,8,1,1,2,0,0,1,1", + "v4:96,32,8,48,32,16,8,1,1,0,16,32,1,1", + "v4:128,64,4,32,16,16,8,1,1,2,0,0,1,1", + "v4:32,16,8,16,16,16,16,1,2,2,0,0,1,1", + "v4:64,32,8,32,16,16,8,1,1,0,16,32,1,1", + "v4:48,64,8,48,16,16,8,1,1,0,16,32,1,1", + "v4:128,32,4,32,32,16,16,1,2,2,0,0,1,1", + "v4:192,64,8,48,32,16,8,1,1,0,16,0,1,1", + "v4:192,64,8,48,32,16,8,1,1,0,16,16,1,1", + "v4:256,128,4,128,32,16,8,1,1,2,0,0,1,1", + "v4:128,128,2,128,32,16,8,1,1,0,8,32,1,1", + "v4:128,128,8,64,32,16,8,1,1,0,4,0,1,1", + "v4:128,16,4,64,16,16,8,1,2,2,0,0,1,1", + "v4:256,128,8,32,128,16,8,1,1,0,16,64,1,1", + "v4:32,80,8,16,80,16,4,1,2,2,0,0,1,1", + "v4:64,16,8,16,16,16,8,1,2,1,16,4,1,1", + "v4:80,64,8,80,16,16,8,1,1,0,16,8,1,1", + "v4:128,128,8,128,16,16,8,1,1,0,16,16,1,1", + "v4:128,128,8,128,16,16,8,1,1,0,16,32,1,1", + "v4:128,16,8,32,16,16,8,1,1,0,16,8,1,1", + "v4:128,256,8,128,32,16,8,1,1,2,0,0,1,1", + "v4:128,256,8,64,64,16,8,1,1,2,0,0,1,1", + "v4:128,32,8,16,32,16,8,1,1,0,16,32,1,1", + "v4:144,64,8,144,16,16,8,1,1,2,0,0,1,1", + "v4:192,64,8,48,32,16,8,1,1,0,16,64,1,1", + "v4:128,128,2,128,32,16,8,1,1,0,2,64,1,1", + "v4:128,128,2,128,32,16,8,1,1,0,4,64,1,1", + "v4:128,128,8,128,16,16,8,1,1,1,16,0,1,1", + "v4:128,64,4,128,16,16,16,1,1,2,0,0,1,1", + "v4:128,64,8,64,32,16,4,1,2,2,0,0,1,1", + "v4:16,160,4,16,160,16,8,1,1,2,0,0,1,1", + "v4:16,64,4,16,64,16,8,1,1,0,16,64,1,1", + "v4:256,128,4,64,64,16,16,1,1,2,0,0,1,1", + "v4:256,128,8,128,32,16,8,1,1,2,0,0,1,1", + "v4:256,128,8,32,128,16,8,1,1,0,16,0,1,1", + "v4:64,128,4,64,32,16,16,1,1,2,0,0,1,1", + "v4:64,64,8,64,32,16,8,1,1,0,8,16,1,1", + "v4:96,128,4,96,32,16,8,1,1,0,16,32,1,1", + "v4:96,32,2,96,32,16,8,1,1,2,0,0,1,1", + "v4:128,128,8,64,64,16,8,1,1,0,4,0,1,1", + "v4:128,128,8,64,64,16,8,1,1,1,0,16,1,1", + "v4:128,128,8,64,64,16,8,1,1,1,2,8,1,1", + "v4:128,32,8,32,32,16,8,1,1,1,16,16,1,1", + "v4:128,64,4,128,32,16,16,1,1,2,0,0,1,1", + 
"v4:16,128,8,16,128,16,4,1,1,2,0,0,1,1", + "v4:16,64,4,16,64,16,8,1,1,0,16,8,1,1", + "v4:160,128,8,80,32,16,8,1,1,0,16,0,1,1", + "v4:192,64,4,96,32,16,8,1,1,2,0,0,1,1", + "v4:256,256,8,128,32,16,8,1,1,2,0,0,1,1", + "v4:64,128,4,16,128,16,8,1,1,0,16,0,1,1", + "v4:64,128,8,32,32,16,8,1,1,0,8,64,1,1", + "v4:64,16,8,16,16,16,16,1,1,0,16,8,1,1", + "v4:64,256,4,64,64,16,4,1,1,0,16,64,1,1", + "v4:64,64,4,64,64,16,4,1,1,0,16,16,1,1", + "v4:96,64,8,48,32,16,8,1,1,0,16,64,1,1", + "v4:112,64,8,112,16,16,8,1,1,0,16,32,1,1", + "v4:128,128,2,128,32,16,16,1,1,0,16,16,1,1", + "v4:128,128,2,128,32,16,8,1,2,0,2,32,1,1", + "v4:128,128,4,32,64,16,8,1,1,1,8,4,1,1", + "v4:128,128,8,64,64,16,8,1,1,0,2,8,1,1", + "v4:128,128,8,64,64,16,8,1,1,1,0,32,1,1", + "v4:128,128,8,64,64,16,8,1,1,1,4,32,1,1", + "v4:128,64,2,64,64,16,16,1,1,0,8,16,1,1", + "v4:128,64,8,64,64,16,4,1,2,2,0,0,1,1", + "v4:128,96,8,32,96,16,8,1,1,0,16,4,1,1", + "v4:16,224,4,16,224,16,4,1,1,2,0,0,1,1", + "v4:16,64,4,16,64,16,8,1,1,0,0,0,1,1", + "v4:160,128,8,80,32,16,8,1,1,0,16,64,1,1", + "v4:160,128,8,80,32,16,8,1,1,0,4,64,1,1", + "v4:192,64,4,96,32,16,16,1,1,0,16,8,1,1", + "v4:256,128,4,64,64,16,16,1,1,0,2,64,1,1", + "v4:256,128,4,64,64,16,16,1,1,0,8,16,1,1", + "v4:256,128,8,32,128,16,8,1,1,1,16,8,1,1", + "v4:32,256,4,32,128,16,8,1,2,2,0,0,1,1", + "v4:32,64,4,32,16,16,4,1,1,0,8,32,1,1", + "v4:48,48,2,48,48,16,8,1,1,1,0,64,1,1", + "v4:64,112,8,16,112,16,8,1,1,0,16,32,1,1", + "v4:64,128,2,64,64,16,8,1,1,0,16,0,1,1", + "v4:64,128,4,16,64,16,16,1,2,2,0,0,1,1", + "v4:64,128,4,64,16,16,16,1,2,2,0,0,1,1", + "v4:64,128,8,32,64,16,16,1,1,2,0,0,1,1", + "v4:64,256,4,16,256,16,16,1,1,2,0,0,1,1", + "v4:64,256,8,64,32,16,8,1,1,0,1,32,1,1", + "v4:64,48,4,16,48,16,8,1,2,1,16,4,1,1", + "v4:64,96,4,32,48,16,8,1,1,0,16,64,1,1", + "v4:96,192,2,48,48,16,8,1,1,0,1,64,1,1" +}; +// END_CONV_Wmma_f16_gfx1101_DEFS + +// BEGIN_CONV_Wmma_i8_gfx1101_DEFS +const StringRef PopulateParamsWmma::initParametersI8ConvGfx1101[] = { + 
"v4:256,32,4,64,32,16,16,1,1,2,0,0,1,1", + "v4:64,16,8,16,16,16,16,1,1,2,0,0,1,1", + "v4:128,16,8,16,16,16,16,1,1,2,0,0,1,1", + "v4:128,32,8,32,16,16,16,1,1,2,0,0,1,1", + "v4:128,64,4,32,64,16,16,1,1,2,0,0,1,1", + "v4:64,128,2,64,32,16,8,1,2,2,0,0,1,1", + "v4:64,128,4,64,64,16,4,1,1,2,0,0,1,1", + "v4:96,64,4,48,16,16,16,1,1,2,0,0,1,1", + "v4:256,64,4,128,32,16,16,1,1,2,0,0,1,1", + "v4:128,128,4,64,64,16,4,1,1,2,0,0,1,1", + "v4:128,128,4,64,64,16,16,1,1,2,0,0,1,1", + "v4:32,64,4,32,64,16,8,1,1,2,0,0,1,1", + "v4:128,128,8,64,64,16,8,1,2,2,0,0,1,1", + "v4:128,256,4,64,16,16,8,1,2,2,0,0,1,1", + "v4:256,64,2,256,32,16,16,1,2,2,0,0,1,1", + "v4:32,256,4,16,256,16,16,1,2,2,0,0,1,1", + "v4:64,256,4,32,128,16,16,1,1,2,0,0,1,1", + "v4:16,128,8,16,128,16,16,1,2,2,0,0,1,1", + "v4:16,16,8,16,16,16,4,1,2,1,8,4,1,1", + "v4:256,128,8,128,128,16,8,1,1,2,0,0,1,1", + "v4:32,64,8,32,64,16,16,1,2,2,0,0,1,1" +}; +// END_CONV_Wmma_i8_gfx1101_DEFS + // BEGIN_GEMM_Wmma_f16_gfx1152_DEFS const StringRef PopulateParamsWmma::initParametersF16GemmGfx1152[] = { "v4:256,128,8,64,64,16,8,1,1,2,0,0,1,1", @@ -2152,21 +3075,11 @@ static constexpr size_t nInitParametersF16GemmGfx1000 = 17; static const StringRef initParametersF16GemmGfx1000[nInitParametersF16GemmGfx1000]; // END_GEMM_Wmma_f16_gfx1000_DECS -// BEGIN_GEMM_Wmma_f16_gfx1100_DECS -static constexpr size_t nInitParametersF16GemmGfx1100 = 21; -static const StringRef initParametersF16GemmGfx1100[nInitParametersF16GemmGfx1100]; -// END_GEMM_Wmma_f16_gfx1100_DECS - // BEGIN_CONV_Wmma_f16_gfx1000_DECS static constexpr size_t nInitParametersF16ConvGfx1000 = 26; static const StringRef initParametersF16ConvGfx1000[nInitParametersF16ConvGfx1000]; // END_CONV_Wmma_f16_gfx1000_DECS -// BEGIN_CONV_Wmma_f16_gfx1100_DECS -static constexpr size_t nInitParametersF16ConvGfx1100 = 27; -static const StringRef initParametersF16ConvGfx1100[nInitParametersF16ConvGfx1100]; -// END_CONV_Wmma_f16_gfx1100_DECS - // BEGIN_GEMM_Wmma_fp8_gfx1000_DECS static constexpr 
size_t nInitParametersFp8GemmGfx1000 = 18; static const StringRef initParametersFp8GemmGfx1000[nInitParametersFp8GemmGfx1000]; @@ -2182,38 +3095,28 @@ static constexpr size_t nInitParametersI8GemmGfx1000 = 15; static const StringRef initParametersI8GemmGfx1000[nInitParametersI8GemmGfx1000]; // END_GEMM_Wmma_i8_gfx1000_DECS -// BEGIN_GEMM_Wmma_i8_gfx1100_DECS -static constexpr size_t nInitParametersI8GemmGfx1100 = 9; -static const StringRef initParametersI8GemmGfx1100[nInitParametersI8GemmGfx1100]; -// END_GEMM_Wmma_i8_gfx1100_DECS - // BEGIN_CONV_Wmma_i8_gfx1000_DECS static constexpr size_t nInitParametersI8ConvGfx1000 = 11; static const StringRef initParametersI8ConvGfx1000[nInitParametersI8ConvGfx1000]; // END_CONV_Wmma_i8_gfx1000_DECS -// BEGIN_CONV_Wmma_i8_gfx1100_DECS -static constexpr size_t nInitParametersI8ConvGfx1100 = 13; -static const StringRef initParametersI8ConvGfx1100[nInitParametersI8ConvGfx1100]; -// END_CONV_Wmma_i8_gfx1100_DECS - // BEGIN_GEMM_Wmma_f16_gfx1201_DECS -static constexpr size_t nInitParametersF16GemmGfx1201 = 14; +static constexpr size_t nInitParametersF16GemmGfx1201 = 65; static const StringRef initParametersF16GemmGfx1201[nInitParametersF16GemmGfx1201]; // END_GEMM_Wmma_f16_gfx1201_DECS // BEGIN_GEMM_Wmma_i8_gfx1201_DECS -static constexpr size_t nInitParametersI8GemmGfx1201 = 13; +static constexpr size_t nInitParametersI8GemmGfx1201 = 59; static const StringRef initParametersI8GemmGfx1201[nInitParametersI8GemmGfx1201]; // END_GEMM_Wmma_i8_gfx1201_DECS // BEGIN_CONV_Wmma_f16_gfx1201_DECS -static constexpr size_t nInitParametersF16ConvGfx1201 = 36; +static constexpr size_t nInitParametersF16ConvGfx1201 = 141; static const StringRef initParametersF16ConvGfx1201[nInitParametersF16ConvGfx1201]; // END_CONV_Wmma_f16_gfx1201_DECS // BEGIN_CONV_Wmma_i8_gfx1201_DECS -static constexpr size_t nInitParametersI8ConvGfx1201 = 12; +static constexpr size_t nInitParametersI8ConvGfx1201 = 62; static const StringRef 
initParametersI8ConvGfx1201[nInitParametersI8ConvGfx1201]; // END_CONV_Wmma_i8_gfx1201_DECS @@ -2243,20 +3146,40 @@ static const StringRef initParametersF16GemmGfx1150[nInitParametersF16GemmGfx115 // END_GEMM_Wmma_f16_gfx1150_DECS // BEGIN_GEMM_Wmma_i8_gfx1150_DECS -static constexpr size_t nInitParametersI8GemmGfx1150 = 16; +static constexpr size_t nInitParametersI8GemmGfx1150 = 22; static const StringRef initParametersI8GemmGfx1150[nInitParametersI8GemmGfx1150]; // END_GEMM_Wmma_i8_gfx1150_DECS // BEGIN_CONV_Wmma_f16_gfx1150_DECS -static constexpr size_t nInitParametersF16ConvGfx1150 = 59; +static constexpr size_t nInitParametersF16ConvGfx1150 = 48; static const StringRef initParametersF16ConvGfx1150[nInitParametersF16ConvGfx1150]; // END_CONV_Wmma_f16_gfx1150_DECS // BEGIN_CONV_Wmma_i8_gfx1150_DECS -static constexpr size_t nInitParametersI8ConvGfx1150 = 21; +static constexpr size_t nInitParametersI8ConvGfx1150 = 16; static const StringRef initParametersI8ConvGfx1150[nInitParametersI8ConvGfx1150]; // END_CONV_Wmma_i8_gfx1150_DECS +// BEGIN_GEMM_Wmma_f16_gfx1101_DECS +static constexpr size_t nInitParametersF16GemmGfx1101 = 35; +static const StringRef initParametersF16GemmGfx1101[nInitParametersF16GemmGfx1101]; +// END_GEMM_Wmma_f16_gfx1101_DECS + +// BEGIN_GEMM_Wmma_i8_gfx1101_DECS +static constexpr size_t nInitParametersI8GemmGfx1101 = 18; +static const StringRef initParametersI8GemmGfx1101[nInitParametersI8GemmGfx1101]; +// END_GEMM_Wmma_i8_gfx1101_DECS + +// BEGIN_CONV_Wmma_f16_gfx1101_DECS +static constexpr size_t nInitParametersF16ConvGfx1101 = 128; +static const StringRef initParametersF16ConvGfx1101[nInitParametersF16ConvGfx1101]; +// END_CONV_Wmma_f16_gfx1101_DECS + +// BEGIN_CONV_Wmma_i8_gfx1101_DECS +static constexpr size_t nInitParametersI8ConvGfx1101 = 21; +static const StringRef initParametersI8ConvGfx1101[nInitParametersI8ConvGfx1101]; +// END_CONV_Wmma_i8_gfx1101_DECS + // BEGIN_GEMM_Wmma_f16_gfx1152_DECS static constexpr size_t 
nInitParametersF16GemmGfx1152 = 42; static const StringRef initParametersF16GemmGfx1152[nInitParametersF16GemmGfx1152]; @@ -3114,12 +4037,8 @@ static const StringRef initParametersI8AttentionGfx1103[nInitParametersI8Attenti {"gfx1000_gemm_f32", {PopulateParams::initParametersF32GemmGfx1000, PopulateParams::nInitParametersF32GemmGfx1000}}, -{"gfx1100_gemm_f32", {PopulateParams::initParametersF32GemmGfx1100, PopulateParams::nInitParametersF32GemmGfx1100}}, - {"gfx1000_conv_f32", {PopulateParams::initParametersF32ConvGfx1000, PopulateParams::nInitParametersF32ConvGfx1000}}, -{"gfx1100_conv_f32", {PopulateParams::initParametersF32ConvGfx1100, PopulateParams::nInitParametersF32ConvGfx1100}}, - {"gfx1201_gemm_f32", {PopulateParams::initParametersF32GemmGfx1201, PopulateParams::nInitParametersF32GemmGfx1201}}, {"gfx1201_conv_f32", {PopulateParams::initParametersF32ConvGfx1201, PopulateParams::nInitParametersF32ConvGfx1201}}, @@ -3132,6 +4051,10 @@ static const StringRef initParametersI8AttentionGfx1103[nInitParametersI8Attenti {"gfx1150_conv_f32", {PopulateParams::initParametersF32ConvGfx1150, PopulateParams::nInitParametersF32ConvGfx1150}}, +{"gfx1101_gemm_f32", {PopulateParams::initParametersF32GemmGfx1101, PopulateParams::nInitParametersF32GemmGfx1101}}, + +{"gfx1101_conv_f32", {PopulateParams::initParametersF32ConvGfx1101, PopulateParams::nInitParametersF32ConvGfx1101}}, + {"gfx1152_gemm_f32", {PopulateParams::initParametersF32GemmGfx1152, PopulateParams::nInitParametersF32GemmGfx1152}}, {"gfx1152_conv_f32", {PopulateParams::initParametersF32ConvGfx1152, PopulateParams::nInitParametersF32ConvGfx1152}}, @@ -3200,24 +4123,16 @@ static const StringRef initParametersI8AttentionGfx1103[nInitParametersI8Attenti {"gfx1000_gemm_f16", {PopulateParamsWmma::initParametersF16GemmGfx1000, PopulateParamsWmma::nInitParametersF16GemmGfx1000}}, -{"gfx1100_gemm_f16", {PopulateParamsWmma::initParametersF16GemmGfx1100, PopulateParamsWmma::nInitParametersF16GemmGfx1100}}, - 
{"gfx1000_conv_f16", {PopulateParamsWmma::initParametersF16ConvGfx1000, PopulateParamsWmma::nInitParametersF16ConvGfx1000}}, -{"gfx1100_conv_f16", {PopulateParamsWmma::initParametersF16ConvGfx1100, PopulateParamsWmma::nInitParametersF16ConvGfx1100}}, - {"gfx1000_gemm_fp8", {PopulateParamsWmma::initParametersFp8GemmGfx1000, PopulateParamsWmma::nInitParametersFp8GemmGfx1000}}, {"gfx1000_conv_fp8", {PopulateParamsWmma::initParametersFp8ConvGfx1000, PopulateParamsWmma::nInitParametersFp8ConvGfx1000}}, {"gfx1000_gemm_i8", {PopulateParamsWmma::initParametersI8GemmGfx1000, PopulateParamsWmma::nInitParametersI8GemmGfx1000}}, -{"gfx1100_gemm_i8", {PopulateParamsWmma::initParametersI8GemmGfx1100, PopulateParamsWmma::nInitParametersI8GemmGfx1100}}, - {"gfx1000_conv_i8", {PopulateParamsWmma::initParametersI8ConvGfx1000, PopulateParamsWmma::nInitParametersI8ConvGfx1000}}, -{"gfx1100_conv_i8", {PopulateParamsWmma::initParametersI8ConvGfx1100, PopulateParamsWmma::nInitParametersI8ConvGfx1100}}, - {"gfx1201_gemm_f16", {PopulateParamsWmma::initParametersF16GemmGfx1201, PopulateParamsWmma::nInitParametersF16GemmGfx1201}}, {"gfx1201_gemm_i8", {PopulateParamsWmma::initParametersI8GemmGfx1201, PopulateParamsWmma::nInitParametersI8GemmGfx1201}}, @@ -3260,12 +4175,8 @@ static const StringRef initParametersI8AttentionGfx1103[nInitParametersI8Attenti {"gfx1000_gemm_bf16", {PopulateParamsWmma::initParametersF16GemmGfx1000, PopulateParamsWmma::nInitParametersF16GemmGfx1000}}, // alias -> f16 -{"gfx1100_gemm_bf16", {PopulateParamsWmma::initParametersF16GemmGfx1100, PopulateParamsWmma::nInitParametersF16GemmGfx1100}}, // alias -> f16 - {"gfx1000_conv_bf16", {PopulateParamsWmma::initParametersF16ConvGfx1000, PopulateParamsWmma::nInitParametersF16ConvGfx1000}}, // alias -> f16 -{"gfx1100_conv_bf16", {PopulateParamsWmma::initParametersF16ConvGfx1100, PopulateParamsWmma::nInitParametersF16ConvGfx1100}}, // alias -> f16 - {"gfx1201_gemm_bf16", {PopulateParamsWmma::initParametersF16GemmGfx1201, 
PopulateParamsWmma::nInitParametersF16GemmGfx1201}}, // alias -> f16 {"gfx1201_conv_bf16", {PopulateParamsWmma::initParametersF16ConvGfx1201, PopulateParamsWmma::nInitParametersF16ConvGfx1201}}, // alias -> f16 @@ -3278,6 +4189,18 @@ static const StringRef initParametersI8AttentionGfx1103[nInitParametersI8Attenti {"gfx1150_conv_bf16", {PopulateParamsWmma::initParametersF16ConvGfx1150, PopulateParamsWmma::nInitParametersF16ConvGfx1150}}, // alias -> f16 +{"gfx1101_gemm_f16", {PopulateParamsWmma::initParametersF16GemmGfx1101, PopulateParamsWmma::nInitParametersF16GemmGfx1101}}, + +{"gfx1101_gemm_i8", {PopulateParamsWmma::initParametersI8GemmGfx1101, PopulateParamsWmma::nInitParametersI8GemmGfx1101}}, + +{"gfx1101_conv_f16", {PopulateParamsWmma::initParametersF16ConvGfx1101, PopulateParamsWmma::nInitParametersF16ConvGfx1101}}, + +{"gfx1101_conv_i8", {PopulateParamsWmma::initParametersI8ConvGfx1101, PopulateParamsWmma::nInitParametersI8ConvGfx1101}}, + +{"gfx1101_gemm_bf16", {PopulateParamsWmma::initParametersF16GemmGfx1101, PopulateParamsWmma::nInitParametersF16GemmGfx1101}}, // alias -> f16 + +{"gfx1101_conv_bf16", {PopulateParamsWmma::initParametersF16ConvGfx1101, PopulateParamsWmma::nInitParametersF16ConvGfx1101}}, // alias -> f16 + {"gfx1152_gemm_f16", {PopulateParamsWmma::initParametersF16GemmGfx1152, PopulateParamsWmma::nInitParametersF16GemmGfx1152}}, {"gfx1152_gemm_i8", {PopulateParamsWmma::initParametersI8GemmGfx1152, PopulateParamsWmma::nInitParametersI8GemmGfx1152}}, diff --git a/mlir/lib/Dialect/Rock/Tuning/CMakeLists.txt b/mlir/lib/Dialect/Rock/Tuning/CMakeLists.txt index c1ec2fbe5568..6dc88817cc5c 100644 --- a/mlir/lib/Dialect/Rock/Tuning/CMakeLists.txt +++ b/mlir/lib/Dialect/Rock/Tuning/CMakeLists.txt @@ -5,6 +5,7 @@ add_rocmlir_dialect_library(MLIRRockTuning GridwiseGemmGemmParams.cpp RockTuningImpl.cpp ParamLookupTable.cpp + QuickTuningClassifier.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Rock/Tuning @@ -15,8 +16,17 @@ 
add_rocmlir_dialect_library(MLIRRockTuning MLIRRockPassIncGen ) +find_package(xgboost REQUIRED) + target_link_libraries(MLIRRockTuning PRIVATE MLIRRockUtility MLIRIR + xgboost::xgboost ) + +if(EXISTS ${CMAKE_SOURCE_DIR}/models) + install(DIRECTORY ${CMAKE_SOURCE_DIR}/models/ + DESTINATION bin/models + FILES_MATCHING PATTERN "*.ubj") +endif() diff --git a/mlir/lib/Dialect/Rock/Tuning/GridwiseGemmParams.cpp b/mlir/lib/Dialect/Rock/Tuning/GridwiseGemmParams.cpp index e3aa8873a586..f4b036214a3a 100644 --- a/mlir/lib/Dialect/Rock/Tuning/GridwiseGemmParams.cpp +++ b/mlir/lib/Dialect/Rock/Tuning/GridwiseGemmParams.cpp @@ -12,6 +12,7 @@ #include "mlir/Dialect/Rock/Tuning/GeneralGemmBlockStructure.h" #include "mlir/Dialect/Rock/utility/loweringUtils.h" #include "mlir/Dialect/Rock/utility/math.h" +#include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/Support/LogicalResult.h" @@ -52,11 +53,35 @@ PopulateParamsInfo PopulateParamsInfo::fromOp(RockGemmWrapperInterface op) { PopulateParamsInfo info{op.getGemmSize(), arch, features, op.getAType(), op.getBType(), op.getKernelType()}; + info.numCu = rock::getNumCUValue(op); if (auto convOp = dyn_cast(*op)) { auto convDims = ConvolutionDims::fromOp(op); - info.numCu = rock::getNumCUValue(convOp); info.batchSize = convDims.n; } + if (auto convIface = dyn_cast(*op)) { + auto convDims = ConvolutionDims::fromOp(op); + auto strides = + extractFromIntegerArrayAttr(convIface.getStrides()); + auto dilations = + extractFromIntegerArrayAttr(convIface.getDilations()); + auto padding = + extractFromIntegerArrayAttr(convIface.getPadding()); + ConvMeta cm; + cm.batchN = convDims.n; + cm.cChannels = convDims.c; + cm.kChannels = convDims.k; + cm.inH = convDims.in.size() > 0 ? convDims.in[0] : 1; + cm.inW = convDims.in.size() > 1 ? convDims.in[1] : 1; + cm.filterH = convDims.fil.size() > 0 ? convDims.fil[0] : 1; + cm.filterW = convDims.fil.size() > 1 ? convDims.fil[1] : 1; + cm.padH = padding.size() > 0 ? 
padding[0] : 0; + cm.padW = padding.size() > 2 ? padding[2] : (padding.size() > 1 ? padding[1] : 0); + cm.strideH = strides.size() > 0 ? strides[0] : 1; + cm.strideW = strides.size() > 1 ? strides[1] : 1; + cm.dilH = dilations.size() > 0 ? dilations[0] : 1; + cm.dilW = dilations.size() > 1 ? dilations[1] : 1; + info.convMeta = cm; + } func::FuncOp func = op->getParentOfType(); WalkResult wRes = func.walk( [&](ReduceOp rOp) -> WalkResult { return WalkResult::interrupt(); }); diff --git a/mlir/lib/Dialect/Rock/Tuning/QuickTuningClassifier.cpp b/mlir/lib/Dialect/Rock/Tuning/QuickTuningClassifier.cpp new file mode 100644 index 000000000000..fa2865c9cd4b --- /dev/null +++ b/mlir/lib/Dialect/Rock/Tuning/QuickTuningClassifier.cpp @@ -0,0 +1,418 @@ +//===- QuickTuningClassifier.cpp - XGBoost-based perfconfig ranking -------===// +// +// Part of the rocMLIR Project, under the Apache License v2.0 with LLVM +// Exceptions. See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (c) 2025 Advanced Micro Devices Inc. 
+//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Rock/Tuning/QuickTuningClassifier.h" +#include "mlir/Dialect/Rock/IR/GemmGemmSize.h" +#include "mlir/Dialect/Rock/IR/GetRockInfo.h" + +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/Path.h" + +#include + +#include +#include +#include +#include +#include +#include +#include + +#define DEBUG_TYPE "rock-quick-tune-classifier" + +using namespace mlir; +using namespace mlir::rock; + +unsigned QuickTuningClassifier::getTopN() { + if (const char *env = std::getenv("ROCMLIR_QUICK_TUNE_TOP_N")) { + int val = std::atoi(env); + if (val >= 0) + return static_cast(val); + } + return 30; +} + +// --------------------------------------------------------------------------- +// Model cache +// --------------------------------------------------------------------------- + +namespace { + +std::mutex &getModelCacheMutex() { + static std::mutex mu; + return mu; +} + +std::map &getModelCache() { + static std::map cache; + return cache; +} + +std::string getExecutableDir() { + llvm::SmallString<256> realPath; + auto ec = llvm::sys::fs::real_path("/proc/self/exe", realPath); + if (ec) + llvm::report_fatal_error( + llvm::Twine("QuickTuningClassifier: cannot resolve /proc/self/exe: ") + + ec.message()); + llvm::sys::path::remove_filename(realPath); + return std::string(realPath); +} + +BoosterHandle loadModel(const std::string &key) { + std::lock_guard lock(getModelCacheMutex()); + auto &cache = getModelCache(); + auto it = cache.find(key); + if (it != cache.end()) + return it->second; + + std::string dir = getExecutableDir(); + std::string path = dir + "/models/" + key + ".ubj"; + + if (!llvm::sys::fs::exists(path)) + llvm::report_fatal_error( + llvm::Twine("QuickTuningClassifier: model file not 
found: ") + path); + + BoosterHandle booster = nullptr; + if (XGBoosterCreate(nullptr, 0, &booster) != 0) + llvm::report_fatal_error( + llvm::Twine("QuickTuningClassifier: XGBoosterCreate failed for key '") + + key + "'"); + if (XGBoosterLoadModel(booster, path.c_str()) != 0) + llvm::report_fatal_error( + llvm::Twine("QuickTuningClassifier: XGBoosterLoadModel failed for ") + + path); + + LLVM_DEBUG(llvm::dbgs() << "QuickTuningClassifier: loaded model " << path + << "\n"); + cache[key] = booster; + return booster; +} + +// --------------------------------------------------------------------------- +// Model key construction (mirrors ParamLookupTable::makeKey) +// --------------------------------------------------------------------------- + +std::string normalizeArch(StringRef arch) { + auto gfxPos = arch.find("gfx"); + if (gfxPos == StringRef::npos) + llvm::report_fatal_error( + llvm::Twine("QuickTuningClassifier: invalid architecture string: ") + + arch); + auto remaining = arch.substr(gfxPos); + auto endPos = + remaining.find_if_not([](char c) { return llvm::isAlnum(c); }, 3); + return std::string(remaining.substr(0, endPos)); +} + +std::string kernelTypeStr(KernelType kt) { + switch (kt) { + case KernelType::ConvBwdData: + case KernelType::ConvBwdWeight: + return stringifyEnum(KernelType::Conv).lower(); + default: + return stringifyEnum(kt).lower(); + } +} + +std::string dataTypeStr(Type dataType) { + if (dataType.isBF16()) + return "bf16"; + if (dataType.isFloat()) { + unsigned bw = dataType.getIntOrFloatBitWidth(); + switch (bw) { + case 4: + case 8: + return "fp" + std::to_string(bw); + case 16: + case 32: + return "f" + std::to_string(bw); + default: + llvm::report_fatal_error("QuickTuningClassifier: unsupported float " + "bitwidth: " + + llvm::Twine(bw)); + } + } + if (dataType.isInteger()) { + unsigned bw = dataType.getIntOrFloatBitWidth(); + if (bw == 8) + return "i8"; + llvm::report_fatal_error("QuickTuningClassifier: unsupported integer " + "bitwidth: " 
+ + llvm::Twine(bw)); + } + llvm::report_fatal_error("QuickTuningClassifier: unsupported data type"); +} + +std::string makeModelKey(StringRef arch, KernelType kt, Type dataTypeA) { + return normalizeArch(arch) + "_" + kernelTypeStr(kt) + "_" + + dataTypeStr(dataTypeA); +} + +// --------------------------------------------------------------------------- +// Feature extraction: raw problem dims + raw perfconfig params. +// Each op type uses its own problem features. Must match the Python +// feature builders in trainQuickTuneClassifier.py exactly. +// --------------------------------------------------------------------------- + +/// Push raw conv problem dimensions (14 features) matching Python +/// _conv_problem_features: N,C,H,W,K,Y,X,padH,padW,strH,strW,dilH,dilW,dir. +void pushConvProblem(const PopulateParamsInfo &info, + llvm::SmallVectorImpl &feats) { + const auto &cm = *info.convMeta; + feats.push_back(static_cast(cm.batchN)); + feats.push_back(static_cast(cm.cChannels)); + feats.push_back(static_cast(cm.inH)); + feats.push_back(static_cast(cm.inW)); + feats.push_back(static_cast(cm.kChannels)); + feats.push_back(static_cast(cm.filterH)); + feats.push_back(static_cast(cm.filterW)); + feats.push_back(static_cast(cm.padH)); + feats.push_back(static_cast(cm.padW)); + feats.push_back(static_cast(cm.strideH)); + feats.push_back(static_cast(cm.strideW)); + feats.push_back(static_cast(cm.dilH)); + feats.push_back(static_cast(cm.dilW)); + float dir = 0.0f; // fwd + if (info.kernelType == KernelType::ConvBwdData) + dir = 1.0f; + else if (info.kernelType == KernelType::ConvBwdWeight) + dir = 2.0f; + feats.push_back(dir); +} + +/// Push raw GEMM problem dimensions (4 features): G, M, N, K. 
+void pushGemmProblem(const PopulateParamsInfo &info, + llvm::SmallVectorImpl &feats) { + const auto &gs = info.gemmSize; + feats.push_back(static_cast(gs.g)); + feats.push_back(static_cast(gs.m)); + feats.push_back(static_cast(gs.n)); + feats.push_back(static_cast(gs.k)); +} + +void buildAccelFeatures(const PopulateParamsInfo &info, AccelGemmParamsAttr p, + llvm::SmallVectorImpl &feats) { + if (info.convMeta.has_value()) + pushConvProblem(info, feats); + else + pushGemmProblem(info, feats); + + feats.push_back(static_cast(p.getMPerBlock())); + feats.push_back(static_cast(p.getNPerBlock())); + feats.push_back(static_cast(p.getKpackPerBlock())); + feats.push_back(static_cast(p.getMPerWave())); + feats.push_back(static_cast(p.getNPerWave())); + feats.push_back(static_cast(p.getMnPerXdl())); + feats.push_back(static_cast(p.getKpack())); + feats.push_back(static_cast(p.getForceUnroll())); + feats.push_back(static_cast(p.getScheduleVersion())); + feats.push_back(static_cast(p.getOutputSwizzle())); + feats.push_back(static_cast(p.getWavesPerEU())); + feats.push_back(static_cast(p.getGridGroupSize())); + feats.push_back(static_cast(p.getSplitKFactor())); +} + +void buildGeneralFeatures(const PopulateParamsInfo &info, + GeneralGemmParamsAttr p, + llvm::SmallVectorImpl &feats) { + if (info.convMeta.has_value()) + pushConvProblem(info, feats); + else + pushGemmProblem(info, feats); + + feats.push_back(static_cast(p.getBlockSize())); + feats.push_back(static_cast(p.getKPerBlock())); + feats.push_back(static_cast(p.getMPerBlock())); + feats.push_back(static_cast(p.getNPerBlock())); + feats.push_back(static_cast(p.getKPerThread())); + feats.push_back(static_cast(p.getMPerThread())); + feats.push_back(static_cast(p.getNPerThread())); + feats.push_back(static_cast(p.getKpack())); + feats.push_back(static_cast(p.getSplitKFactor())); + feats.push_back(static_cast(p.getScheduleVersion())); + feats.push_back(static_cast(p.getOutputSwizzle())); +} + +void 
buildGemmGemmFeatures(GemmGemmSize gs, GemmGemmParamsAttr p, + llvm::SmallVectorImpl &feats) { + feats.push_back(static_cast(gs.g)); + feats.push_back(static_cast(gs.m)); + feats.push_back(static_cast(gs.n)); + feats.push_back(static_cast(gs.k)); + + feats.push_back(static_cast(p.getMPerBlockG0())); + feats.push_back(static_cast(p.getMPerBlockG1())); + feats.push_back(static_cast(p.getNPerBlockG0())); + feats.push_back(static_cast(p.getKpackPerBlock())); + feats.push_back(static_cast(p.getMPerWave())); + feats.push_back(static_cast(p.getNPerWave())); + feats.push_back(static_cast(p.getMnPerXdl())); + feats.push_back(static_cast(p.getKpack())); + feats.push_back(static_cast(p.getSplitKFactor())); + feats.push_back(static_cast(p.getScheduleVersion())); + feats.push_back(static_cast(p.getOutputSwizzle())); + feats.push_back(static_cast(p.getWavesPerEU())); + feats.push_back(static_cast(p.getForceUnroll())); +} + +// --------------------------------------------------------------------------- +// Prediction + top-N selection +// --------------------------------------------------------------------------- + +template +std::vector +predictAndSelectTopN(BoosterHandle booster, unsigned topN, + llvm::ArrayRef candidates, + FeatFn &&featFn) { + size_t nCandidates = candidates.size(); + if (nCandidates == 0) + return {}; + + llvm::SmallVector firstRow; + featFn(candidates[0], firstRow); + size_t nFeats = firstRow.size(); + + std::vector featureMatrix(nCandidates * nFeats); + std::copy(firstRow.begin(), firstRow.end(), featureMatrix.begin()); + for (size_t i = 1; i < nCandidates; ++i) { + llvm::SmallVector row; + featFn(candidates[i], row); + assert(row.size() == nFeats); + std::copy(row.begin(), row.end(), featureMatrix.begin() + i * nFeats); + } + + DMatrixHandle dmat = nullptr; + if (XGDMatrixCreateFromMat( + featureMatrix.data(), nCandidates, nFeats, + /*missing=*/std::numeric_limits::quiet_NaN(), &dmat) != 0) + llvm::report_fatal_error( + llvm::Twine("QuickTuningClassifier: 
XGDMatrixCreateFromMat failed: ") + + XGBGetLastError()); + + bst_ulong outLen = 0; + const float *outResult = nullptr; + if (XGBoosterPredict(booster, dmat, 0, 0, 0, &outLen, &outResult) != 0) { + std::string err = XGBGetLastError(); + XGDMatrixFree(dmat); + llvm::report_fatal_error( + llvm::Twine("QuickTuningClassifier: XGBoosterPredict failed (") + + llvm::Twine(nFeats) + " features, " + llvm::Twine(nCandidates) + + " candidates): " + err); + } + + // Build indices sorted by predicted score descending. + std::vector indices(nCandidates); + std::iota(indices.begin(), indices.end(), 0); + size_t selectN = std::min(static_cast(topN), nCandidates); + std::partial_sort( + indices.begin(), indices.begin() + selectN, indices.end(), + [&](size_t a, size_t b) { return outResult[a] > outResult[b]; }); + + std::vector result; + result.reserve(selectN); + for (size_t i = 0; i < selectN; ++i) + result.push_back(candidates[indices[i]]); + + XGDMatrixFree(dmat); + return result; +} + +} // anonymous namespace + +// --------------------------------------------------------------------------- +// Public API -- Accel (XDL / WMMA) +// --------------------------------------------------------------------------- + +std::vector +QuickTuningClassifier::filterTopN(const PopulateParamsInfo &info, + llvm::ArrayRef cands) { + unsigned topN = getTopN(); + if (topN == 0 || cands.size() <= topN) { + LLVM_DEBUG(llvm::dbgs() + << "QuickTuningClassifier[Accel]: topN=" << topN + << ", candidates=" << cands.size() << ", skipping filter\n"); + return std::vector(cands.begin(), cands.end()); + } + + std::string key = makeModelKey(info.arch, info.kernelType, info.gemmAType); + BoosterHandle booster = loadModel(key); + + LLVM_DEBUG(llvm::dbgs() << "QuickTuningClassifier[Accel]: filtering " + << cands.size() << " -> " << topN << " using model '" + << key << "'\n"); + return predictAndSelectTopN( + booster, topN, cands, + [&](AccelGemmParamsAttr p, llvm::SmallVectorImpl &f) { + buildAccelFeatures(info, 
p, f); + }); +} + +// --------------------------------------------------------------------------- +// Public API -- NonAccel +// --------------------------------------------------------------------------- + +std::vector +QuickTuningClassifier::filterTopN(const PopulateParamsInfo &info, + llvm::ArrayRef cands) { + unsigned topN = getTopN(); + if (topN == 0 || cands.size() <= topN) { + LLVM_DEBUG(llvm::dbgs() + << "QuickTuningClassifier[General]: topN=" << topN + << ", candidates=" << cands.size() << ", skipping filter\n"); + return std::vector(cands.begin(), cands.end()); + } + + std::string key = makeModelKey(info.arch, info.kernelType, info.gemmAType); + BoosterHandle booster = loadModel(key); + + LLVM_DEBUG(llvm::dbgs() << "QuickTuningClassifier[General]: filtering " + << cands.size() << " -> " << topN << " using model '" + << key << "'\n"); + return predictAndSelectTopN( + booster, topN, cands, + [&](GeneralGemmParamsAttr p, llvm::SmallVectorImpl &f) { + buildGeneralFeatures(info, p, f); + }); +} + +// --------------------------------------------------------------------------- +// Public API -- GemmGemm (attention) +// --------------------------------------------------------------------------- + +std::vector +QuickTuningClassifier::filterTopN(RockGemmGemmWrapperInterface op, + llvm::ArrayRef cands) { + unsigned topN = getTopN(); + if (topN == 0 || cands.size() <= topN) { + LLVM_DEBUG(llvm::dbgs() + << "QuickTuningClassifier[GemmGemm]: topN=" << topN + << ", candidates=" << cands.size() << ", skipping filter\n"); + return std::vector(cands.begin(), cands.end()); + } + + StringAttr archAttr = rock::getArchValue(op); + std::string key = makeModelKey(archAttr, op.getKernelType(), op.getAType()); + BoosterHandle booster = loadModel(key); + + LLVM_DEBUG(llvm::dbgs() << "QuickTuningClassifier[GemmGemm]: filtering " + << cands.size() << " -> " << topN << " using model '" + << key << "'\n"); + GemmGemmSize gs = op.getGemmGemmSize(); + return predictAndSelectTopN( + 
booster, topN, cands, + [&](GemmGemmParamsAttr p, llvm::SmallVectorImpl &f) { + buildGemmGemmFeatures(gs, p, f); + }); +} diff --git a/mlir/lib/Dialect/Rock/Tuning/RockTuningImpl.cpp b/mlir/lib/Dialect/Rock/Tuning/RockTuningImpl.cpp index 18b59e015519..979637680738 100644 --- a/mlir/lib/Dialect/Rock/Tuning/RockTuningImpl.cpp +++ b/mlir/lib/Dialect/Rock/Tuning/RockTuningImpl.cpp @@ -20,6 +20,7 @@ #include "mlir/Dialect/Rock/IR/RockTypes.h" #include "mlir/Dialect/Rock/Tuning/GridwiseGemmGemmParams.h" #include "mlir/Dialect/Rock/Tuning/GridwiseGemmParams.h" +#include "mlir/Dialect/Rock/Tuning/QuickTuningClassifier.h" #include "mlir/Dialect/Rock/Tuning/RockTuning.h" #include "mlir/Dialect/Rock/utility/fusionUtils.h" #include "mlir/Dialect/Rock/utility/loweringUtils.h" @@ -717,35 +718,33 @@ static void createGemmTuningRangeQuick(TuningParamSet *newSpace, rock::AmdArchInfo archInfo = rock::lookupArchInfo(arch); if (archInfo.isMfma(gemmOp)) { PopulateParamsXDL tuningInfo; - - for (AccelGemmParamsAttr param : tuningInfo.orderParams( - tuningInfo.getTuningParameters(b, info.kernelType, info.gemmAType, - info.gemmBType, info.arch), - info.gemmSize)) { - if (succeeded(tuningInfo.paramsProbablyValid(b, info, param)) && - succeeded(tuningInfo.couldBePerformant(info, param))) + auto allParams = tuningInfo.getTuningParameters( + b, info.kernelType, info.gemmAType, info.gemmBType, info.arch); + auto filtered = QuickTuningClassifier::filterTopN(info, allParams); + for (AccelGemmParamsAttr param : + tuningInfo.orderParams(filtered, info.gemmSize)) { + if (succeeded(tuningInfo.paramsProbablyValid(b, info, param))) newSpace->tuningRange.insert(cast(param)); } } else if (archInfo.isWmma(gemmOp)) { - // Wmma PopulateParamsWmma tuningInfo; - for (AccelGemmParamsAttr param : tuningInfo.orderParams( - tuningInfo.getTuningParameters(b, info.kernelType, info.gemmAType, - info.gemmBType, info.arch), - info.gemmSize)) { - if (succeeded(tuningInfo.paramsProbablyValid(b, info, param)) && - 
succeeded(tuningInfo.couldBePerformant(info, param))) + auto allParams = tuningInfo.getTuningParameters( + b, info.kernelType, info.gemmAType, info.gemmBType, info.arch); + auto filtered = QuickTuningClassifier::filterTopN(info, allParams); + for (AccelGemmParamsAttr param : + tuningInfo.orderParams(filtered, info.gemmSize)) { + if (succeeded(tuningInfo.paramsProbablyValid(b, info, param))) newSpace->tuningRange.insert(cast(param)); } } else { // Non-XDLOPS PopulateParams tuningInfo; - for (GeneralGemmParamsAttr param : tuningInfo.orderParams( - tuningInfo.getTuningParameters(b, info.kernelType, info.gemmAType, - info.gemmBType, info.arch), - info.gemmSize)) { - if (succeeded(tuningInfo.paramsProbablyValid(b, info, param)) && - succeeded(tuningInfo.couldBePerformant(info, param))) + auto allParams = tuningInfo.getTuningParameters( + b, info.kernelType, info.gemmAType, info.gemmBType, info.arch); + auto filtered = QuickTuningClassifier::filterTopN(info, allParams); + for (GeneralGemmParamsAttr param : + tuningInfo.orderParams(filtered, info.gemmSize)) { + if (succeeded(tuningInfo.paramsProbablyValid(b, info, param))) newSpace->tuningRange.insert(cast(param)); } } @@ -755,8 +754,9 @@ static void createGemmGemmTuningRangeQuick(TuningParamSet *newSpace, RockGemmGemmWrapperInterface gemmGemmOp) { OpBuilder b(gemmGemmOp.getContext()); - for (GemmGemmParamsAttr params : - PopulateParamsGemmGemm::getTuningParameters(b, gemmGemmOp)) { + auto allParams = PopulateParamsGemmGemm::getTuningParameters(b, gemmGemmOp); + auto filtered = QuickTuningClassifier::filterTopN(gemmGemmOp, allParams); + for (GemmGemmParamsAttr params : filtered) { if (succeeded(PopulateParamsGemmGemm::paramsProbablyValid(b, gemmGemmOp, params))) { newSpace->tuningRange.insert(cast(params)); diff --git a/mlir/test/CAPI/mixr_full.c b/mlir/test/CAPI/mixr_full.c index 9d7fa2f59288..d5bcbfe9d39d 100644 --- a/mlir/test/CAPI/mixr_full.c +++ b/mlir/test/CAPI/mixr_full.c @@ -195,7 +195,7 @@ static bool 
constructAndTraverseIr(MlirContext ctx) { mlirRockTuningSpaceCreate(module, RocmlirTuningParamSetKindFull); printf("Got tuning space,\n"); unsigned fNum = mlirRockTuningGetNumParams(tuningSpace); - // CHECK: full set = 932 + // CHECK: full set = 937 printf("full set = %u\n", fNum); MlirRockTuningParam tuningParam = mlirRockTuningParamCreate(); MlirRockTuningTable tuningTable = mlirRockTuningTableCreate(); diff --git a/mlir/test/Dialect/Rock/affix_tuning_params.mlir b/mlir/test/Dialect/Rock/affix_tuning_params.mlir index 84d1b662bb1f..d38d4c46c152 100644 --- a/mlir/test/Dialect/Rock/affix_tuning_params.mlir +++ b/mlir/test/Dialect/Rock/affix_tuning_params.mlir @@ -10,7 +10,7 @@ // GRID-LABEL: rock_conv func.func @rock_conv(%filter : memref<1x128x8x3x3xf32>, %input : memref<128x1x8x32x32xf32>, %output : memref<128x1x128x30x30xf32>) attributes {rock.arch = "amdgcn-amd-amdhsa:gfx906"} { // CHECK: rock.conv - // CHECK-SAME: params = #rock.general_gemm_params + // CHECK-SAME: params = #rock.general_gemm_params // GRID: rock.gridwise_gemm // GRID-SAME: gridSize = 1800 rock.conv(%filter, %input, %output) features = none { @@ -28,7 +28,7 @@ func.func @rock_conv(%filter : memref<1x128x8x3x3xf32>, %input : memref<128x1x8x // GRID-LABEL: rock_conv_schedulev2 func.func @rock_conv_schedulev2(%filter : memref<1x128x8x3x3xf32>, %input : memref<128x1x8x32x32xf32>, %output : memref<128x1x128x30x30xf32>) attributes {rock.schedule_version = #rock.rock.schedule_version<2>, rock.arch = "amdgcn-amd-amdhsa:gfx906"} { // CHECK: rock.conv - // CHECK-SAME: params = #rock.general_gemm_params + // CHECK-SAME: params = #rock.general_gemm_params // GRID: rock.gridwise_gemm // GRID-SAME: gridSize = 1800 rock.conv(%filter, %input, %output) features = none { @@ -46,7 +46,7 @@ func.func @rock_conv_schedulev2(%filter : memref<1x128x8x3x3xf32>, %input : memr // GRID-LABEL: func.func @rock_conv_f16 func.func @rock_conv_f16(%filter : memref<1x128x8x3x3xf16>, %input : memref<128x1x8x32x32xf16>, %output : 
memref<128x1x128x30x30xf16>) attributes {rock.arch = "amdgcn-amd-amdhsa:gfx906"} { // CHECK: rock.conv - // CHECK-SAME: params = #rock.general_gemm_params + // CHECK-SAME: params = #rock.general_gemm_params // GRID: rock.gridwise_gemm // GRID-SAME: gridSize = 1800 rock.conv(%filter, %input, %output) features = none { @@ -64,10 +64,10 @@ func.func @rock_conv_f16(%filter : memref<1x128x8x3x3xf16>, %input : memref<128x // GRID-LABEL: func.func @rock_conv_i8 func.func @rock_conv_i8(%filter : memref<1x128x8x3x3xi8>, %input : memref<128x1x8x32x32xi8>, %output : memref<128x1x128x30x30xi32>) attributes {rock.arch = "amdgcn-amd-amdhsa:gfx908"} { // CHECK: rock.conv - // CHECK-SAME: derivedBlockSize = 64 - // CHECK-SAME: params = #rock.accel_gemm_params + // CHECK-SAME: derivedBlockSize = 256 + // CHECK-SAME: params = #rock.accel_gemm_params // GRID: rock.gridwise_gemm - // GRID-SAME: gridSize = 3600 + // GRID-SAME: gridSize = 900 rock.conv(%filter, %input, %output) features = mfma|dot|atomic_add|atomic_add_f16 { filter_layout = ["g", "k", "c", "0", "1"], input_layout = ["ni", "gi", "ci", "0i", "1i"], @@ -84,9 +84,9 @@ func.func @rock_conv_i8(%filter : memref<1x128x8x3x3xi8>, %input : memref<128x1x func.func @rock_conv_bwd_data(%filter: memref<1x1024x1024x1x1xf32>, %input: memref<128x1x1024x14x14xf32>, %output: memref<128x1x1024x14x14xf32>) attributes {rock.kernel = 0 : i32, rock.arch = "amdgcn-amd-amdhsa:gfx908"} { // CHECK: rock.conv_bwd_data // CHECK-SAME: derivedBlockSize = 256 - // CHECK-SAME: params = #rock.accel_gemm_params + // CHECK-SAME: params = #rock.accel_gemm_params // GRID: rock.gridwise_gemm - // GRID-SAME: gridSize = 25088 + // GRID-SAME: gridSize = 6272 rock.conv_bwd_data(%filter, %input, %output) features = mfma|dot|atomic_add|atomic_add_f16 { dilations = [1 : index, 1 : index], filter_layout = ["g", "k", "c", "0", "1"], @@ -105,9 +105,9 @@ func.func @rock_conv_bwd_data(%filter: memref<1x1024x1024x1x1xf32>, %input: memr func.func 
@rock_conv_bwd_data_f16(%filter: memref<1x1024x1024x1x1xf16>, %input: memref<128x1x1024x14x14xf16>, %output: memref<128x1x1024x14x14xf16>) attributes {rock.kernel = 0 : i32, rock.arch = "amdgcn-amd-amdhsa:gfx908"} { // CHECK: rock.conv_bwd_data // CHECK-SAME: derivedBlockSize = 256 - // CHECK-SAME: params = #rock.accel_gemm_params + // CHECK-SAME: params = #rock.accel_gemm_params // GRID: rock.gridwise_gemm - // GRID-SAME: gridSize = 25088 + // GRID-SAME: gridSize = 12544 rock.conv_bwd_data(%filter, %input, %output) features = mfma|dot|atomic_add|atomic_add_f16 { dilations = [1 : index, 1 : index], filter_layout = ["g", "k", "c", "0", "1"], @@ -125,9 +125,9 @@ func.func @rock_conv_bwd_data_f16(%filter: memref<1x1024x1024x1x1xf16>, %input: // GRID-LABEL: func.func @rock_conv_bwd_data_padMN func.func @rock_conv_bwd_data_padMN(%filter : memref<1x64x3x1x1xf32>, %input : memref<11x1x3x15x15xf32>, %output : memref<11x1x64x15x15xf32>) attributes {rock.arch = "amdgcn-amd-amdhsa:gfx906"} { // CHECK: rock.conv_bwd_data - // CHECK-SAME: params = #rock.general_gemm_params + // CHECK-SAME: params = #rock.general_gemm_params // GRID: rock.gridwise_gemm - // GRID-SAME: gridSize = 39 + // GRID-SAME: gridSize = 78 rock.conv_bwd_data(%filter, %input, %output) features = none { filter_layout = ["g", "k", "c", "0", "1"], input_layout = ["ni", "gi", "ci", "0i", "1i"], @@ -145,9 +145,9 @@ func.func @rock_conv_bwd_data_padMN(%filter : memref<1x64x3x1x1xf32>, %input : m // GRID-LABEL: @rock_conv_bwd_data_padMK func.func @rock_conv_bwd_data_padMK(%filter : memref<1x11x3x1x1xf32>, %input : memref<128x1x3x15x15xf32>, %output : memref<128x1x11x15x15xf32>) attributes {rock.arch = "amdgcn-amd-amdhsa:gfx906"} { // CHECK: rock.conv_bwd_data - // CHECK-SAME: params = #rock.general_gemm_params + // CHECK-SAME: params = #rock.general_gemm_params // GRID: rock.gridwise_gemm - // GRID-SAME: gridSize = 225 + // GRID-SAME: gridSize = 450 rock.conv_bwd_data(%filter, %input, %output) features = none { 
filter_layout = ["g", "k", "c", "0", "1"], input_layout = ["ni", "gi", "ci", "0i", "1i"], @@ -165,9 +165,9 @@ func.func @rock_conv_bwd_data_padMK(%filter : memref<1x11x3x1x1xf32>, %input : m // GRID-LABEL: @rock_conv_bwd_weight func.func @rock_conv_bwd_weight(%filter : memref<1x128x8x3x3xf32>, %input : memref<128x1x8x32x32xf32>, %output : memref<128x1x128x30x30xf32>) attributes {rock.arch = "amdgcn-amd-amdhsa:gfx906", numCU = 64 : i32} { // CHECK: rock.conv_bwd_weight - // CHECK-SAME: params = #rock.general_gemm_params + // CHECK-SAME: params = #rock.general_gemm_params // GRID: rock.gridwise_gemm - // GRID-SAME: gridSize = 6 + // GRID-SAME: gridSize = 12 rock.conv_bwd_weight(%filter, %input, %output) features = none { filter_layout = ["g", "k", "c", "0", "1"], input_layout = ["ni", "gi", "ci", "0i", "1i"], @@ -183,9 +183,9 @@ func.func @rock_conv_bwd_weight(%filter : memref<1x128x8x3x3xf32>, %input : memr // GRID-LABEL: @rock_conv_bwd_weight_f16 func.func @rock_conv_bwd_weight_f16(%filter : memref<1x128x8x3x3xf16>, %input : memref<128x1x8x32x32xf16>, %output : memref<128x1x128x30x30xf16>) attributes {rock.arch = "amdgcn-amd-amdhsa:gfx906", numCU = 64 : i32} { // CHECK: rock.conv_bwd_weight - // CHECK-SAME: params = #rock.general_gemm_params + // CHECK-SAME: params = #rock.general_gemm_params // GRID: rock.gridwise_gemm - // GRID-SAME: gridSize = 6 + // GRID-SAME: gridSize = 12 rock.conv_bwd_weight(%filter, %input, %output) features = none { filter_layout = ["g", "k", "c", "0", "1"], input_layout = ["ni", "gi", "ci", "0i", "1i"], @@ -201,7 +201,7 @@ func.func @rock_conv_bwd_weight_f16(%filter : memref<1x128x8x3x3xf16>, %input : // GRID-LABEL: func.func @rock_conv_bwd_weight_padALL func.func @rock_conv_bwd_weight_padALL(%filter : memref<1x20x8x3x3xf32>, %input : memref<7x1x8x32x32xf32>, %output : memref<7x1x20x30x30xf32>) attributes {rock.arch = "amdgcn-amd-amdhsa:gfx906", numCU = 64 : i32} { // CHECK: rock.conv_bwd_weight - // CHECK-SAME: params = 
#rock.general_gemm_params + // CHECK-SAME: params = #rock.general_gemm_params // GRID: rock.gridwise_gemm // GRID-SAME: gridSize = 3 rock.conv_bwd_weight(%filter, %input, %output) features = none { @@ -219,7 +219,7 @@ func.func @rock_conv_bwd_weight_padALL(%filter : memref<1x20x8x3x3xf32>, %input // GRID-LABEL: @rock_conv_bwd_weight_padALL_f16 func.func @rock_conv_bwd_weight_padALL_f16(%filter : memref<1x20x8x3x3xf16>, %input : memref<7x1x8x32x32xf16>, %output : memref<7x1x20x30x30xf16>) attributes {rock.arch = "amdgcn-amd-amdhsa:gfx906", numCU = 64 : i32} { // CHECK: rock.conv_bwd_weight - // CHECK-SAME: params = #rock.general_gemm_params + // CHECK-SAME: params = #rock.general_gemm_params // GRID: rock.gridwise_gemm // GRID-SAME: gridSize = 3 rock.conv_bwd_weight(%filter, %input, %output) features = none { @@ -259,10 +259,10 @@ func.func @rock_conv_7x7_tuning(%arg0: memref<1x64x3x7x7xf32>, %arg1: memref<256 // GRID-LABEL: @rock_conv_7x7 func.func @rock_conv_7x7(%arg0: memref<1x64x3x7x7xf32>, %arg1: memref<256x1x3x230x230xf32>, %arg2: memref<256x1x64x112x112xf32>) attributes {rock.arch = "amdgcn-amd-amdhsa:gfx908"} { // CHECK: rock.conv - // CHECK-SAME: derivedBlockSize = 64 - // CHECK-SAME: params = #rock.accel_gemm_params + // CHECK-SAME: derivedBlockSize = 256 + // CHECK-SAME: params = #rock.accel_gemm_params // GRID: rock.gridwise_gemm - // GRID-SAME: gridSize = 100352 + // GRID-SAME: gridSize = 12544 rock.conv(%arg0, %arg1, %arg2) features = mfma|dot|atomic_add|atomic_add_f16 { dilations = [1 : index, 1 : index], filter_layout = ["g", "k", "c", "0", "1"], @@ -279,7 +279,7 @@ func.func @rock_conv_7x7(%arg0: memref<1x64x3x7x7xf32>, %arg1: memref<256x1x3x23 func.func @rock_conv_bwd_weight_7x7(%arg0: memref<1x64x3x7x7xf32>, %arg1: memref<256x1x3x230x230xf32>, %arg2: memref<256x1x64x112x112xf32>) attributes {rock.kernel = 0 : i32, rock.arch = "amdgcn-amd-amdhsa:gfx908", numCU = 120 : i32} { // CHECK: rock.conv_bwd_weight // CHECK-SAME: derivedBlockSize = 256 - // 
CHECK-SAME: params = #rock.accel_gemm_params + // CHECK-SAME: params = #rock.accel_gemm_params // GRID: rock.gridwise_gemm // GRID-SAME: gridSize = 10 rock.conv_bwd_weight(%arg0, %arg1, %arg2) features = mfma|dot|atomic_add|atomic_add_f16 { @@ -319,10 +319,10 @@ func.func @rock_conv_bwd_data_7x7_tuning(%arg0: memref<1x64x3x7x7xf32>, %arg1: m // GRID-LABEL: @rock_conv_bwd_data_7x7 func.func @rock_conv_bwd_data_7x7(%arg0: memref<1x64x3x7x7xf32>, %arg1: memref<256x1x3x230x230xf32>, %arg2: memref<256x1x64x112x112xf32>) attributes {rock.kernel = 1 : i32, rock.arch = "amdgcn-amd-amdhsa:gfx908"} { // CHECK: rock.conv_bwd_data - // CHECK-SAME: derivedBlockSize = 64 - // CHECK-SAME: params = #rock.accel_gemm_params + // CHECK-SAME: derivedBlockSize = 256 + // CHECK-SAME: params = #rock.accel_gemm_params // GRID: rock.gridwise_gemm - // GRID-SAME: gridSize = 211600 + // GRID-SAME: gridSize = 52900 rock.conv_bwd_data(%arg0, %arg1, %arg2) features = mfma|dot|atomic_add|atomic_add_f16 { dilations = [1 : index, 1 : index], filter_layout = ["g", "k", "c", "0", "1"], @@ -340,9 +340,9 @@ func.func @rock_conv_bwd_data_7x7(%arg0: memref<1x64x3x7x7xf32>, %arg1: memref<2 // GRID-LABEL: @rock_gemm_from_conv func.func @rock_gemm_from_conv(%a : memref<1x72x128xf32>, %b : memref<1x72x115200xf32>, %c : memref<1x128x115200xf32>) attributes {rock.arch = "amdgcn-amd-amdhsa:gfx906", numCU = 64 : i32} { // CHECK: rock.gemm - // CHECK-SAME: params = #rock.general_gemm_params + // CHECK-SAME: params = #rock.general_gemm_params // GRID: rock.gridwise_gemm - // GRID-SAME: gridSize = 900 + // GRID-SAME: gridSize = 1800 rock.gemm %c = tr %a * %b features = none storeMethod = set : memref<1x128x115200xf32> = memref<1x72x128xf32> * memref<1x72x115200xf32> return @@ -352,10 +352,10 @@ func.func @rock_gemm_from_conv(%a : memref<1x72x128xf32>, %b : memref<1x72x11520 // GRID-LABEL: func.func @rock_gemm_from_i8_conv func.func @rock_gemm_from_i8_conv(%a : memref<1x72x128xi8>, %b : memref<1x72x115200xi8>, %c : 
memref<1x128x115200xi32>) attributes {rock.arch = "amdgcn-amd-amdhsa:gfx908", numCU = 120 : i32} { // CHECK: rock.gemm - // CHECK-SAME: derivedBlockSize = 256 - // CHECK-SAME: params = #rock.accel_gemm_params + // CHECK-SAME: derivedBlockSize = 512 + // CHECK-SAME: params = #rock.accel_gemm_params // GRID: rock.gridwise_gemm - // GRID-SAME: gridSize = 7200 + // GRID-SAME: gridSize = 1800 rock.gemm %c = tr %a * %b features = mfma|dot|atomic_add|atomic_add_f16 storeMethod = set : memref<1x128x115200xi32> = memref<1x72x128xi8> * memref<1x72x115200xi8> return @@ -365,10 +365,10 @@ func.func @rock_gemm_from_i8_conv(%a : memref<1x72x128xi8>, %b : memref<1x72x115 // GRID-LABEL: func.func @rock_gemm_from_i8_conv_schedule_v2 func.func @rock_gemm_from_i8_conv_schedule_v2(%a : memref<1x72x128xi8>, %b : memref<1x72x115200xi8>, %c : memref<1x128x115200xi32>) attributes {rock.schedule_version = #rock.rock.schedule_version<2>, rock.arch = "amdgcn-amd-amdhsa:gfx908", numCU = 120 : i32} { // CHECK: rock.gemm - // CHECK-SAME: derivedBlockSize = 256 - // CHECK-SAME: params = #rock.accel_gemm_params + // CHECK-SAME: derivedBlockSize = 512 + // CHECK-SAME: params = #rock.accel_gemm_params // GRID: rock.gridwise_gemm - // GRID-SAME: gridSize = 7200 + // GRID-SAME: gridSize = 1800 rock.gemm %c = tr %a * %b features = mfma|dot|atomic_add|atomic_add_f16 storeMethod = set : memref<1x128x115200xi32> = memref<1x72x128xi8> * memref<1x72x115200xi8> return @@ -381,10 +381,10 @@ func.func @rock_gemm_from_i8_conv_schedule_v2(%a : memref<1x72x128xi8>, %b : mem // GRID-LABEL: func.func @rock_gemm_from_i8_conv_gfx942 func.func @rock_gemm_from_i8_conv_gfx942(%a : memref<1x72x128xi8>, %b : memref<1x72x115200xi8>, %c : memref<1x128x115200xi32>) attributes {rock.arch = "amdgcn-amd-amdhsa:gfx942", numCU = 120 : i32} { // CHECK: rock.gemm - // CHECK-SAME: derivedBlockSize = 256 - // CHECK-SAME: params = #rock.accel_gemm_params + // CHECK-SAME: derivedBlockSize = 512 + // CHECK-SAME: params = 
#rock.accel_gemm_params // GRID: rock.gridwise_gemm - // GRID-SAME: gridSize = 14400 + // GRID-SAME: gridSize = 1800 rock.gemm %c = tr %a * %b features = mfma|dot|atomic_add|atomic_add_f16 storeMethod = set : memref<1x128x115200xi32> = memref<1x72x128xi8> * memref<1x72x115200xi8> return diff --git a/mlir/test/fusion/pr-e2e/gemm-layouts/noTransA-noTransB/broadcasted-k-e2e.mlir b/mlir/test/fusion/pr-e2e/gemm-layouts/noTransA-noTransB/broadcasted-k-e2e.mlir index 92be2f1994b5..8ba76962c546 100644 --- a/mlir/test/fusion/pr-e2e/gemm-layouts/noTransA-noTransB/broadcasted-k-e2e.mlir +++ b/mlir/test/fusion/pr-e2e/gemm-layouts/noTransA-noTransB/broadcasted-k-e2e.mlir @@ -7,7 +7,7 @@ // EMITKEY: -t f16 -out_datatype f16 -transA false -transB false -g 2 -m 4096 -n 640 -k 320 // VECTORIZATION: aVectorDim: GemmDimension::MorN -// VECTORIZATION-NEXT: aVectorLen: 4 +// VECTORIZATION-NEXT: aVectorLen: 2 // VECTORIZATION: bVectorDim: GemmDimension::MorN // VECTORIZATION-NEXT: bVectorLen: 2 diff --git a/mlir/test/fusion/pr-e2e/gemm-layouts/noTransA-transB/broadcasted-k-e2e.mlir b/mlir/test/fusion/pr-e2e/gemm-layouts/noTransA-transB/broadcasted-k-e2e.mlir index 853c3779e9af..778dbac1e10a 100644 --- a/mlir/test/fusion/pr-e2e/gemm-layouts/noTransA-transB/broadcasted-k-e2e.mlir +++ b/mlir/test/fusion/pr-e2e/gemm-layouts/noTransA-transB/broadcasted-k-e2e.mlir @@ -7,7 +7,7 @@ // EMITKEY: -t f16 -out_datatype f16 -transA false -transB true -g 2 -m 4096 -n 640 -k 320 // VECTORIZATION: aVectorDim: GemmDimension::MorN -// VECTORIZATION-NEXT: aVectorLen: 4 +// VECTORIZATION-NEXT: aVectorLen: 2 // VECTORIZATION: bVectorDim: GemmDimension::K // VECTORIZATION-NEXT: bVectorLen: 8 diff --git a/mlir/test/fusion/pr-e2e/gemm-layouts/transA-noTransB/gemm-k-e2e.mlir b/mlir/test/fusion/pr-e2e/gemm-layouts/transA-noTransB/gemm-k-e2e.mlir index 1c1f9e26dc68..6164549e560a 100644 --- a/mlir/test/fusion/pr-e2e/gemm-layouts/transA-noTransB/gemm-k-e2e.mlir +++ 
b/mlir/test/fusion/pr-e2e/gemm-layouts/transA-noTransB/gemm-k-e2e.mlir @@ -6,7 +6,7 @@ // EMITKEY: -t f16 -out_datatype f16 -transA true -transB false -g 2 -m 4096 -n 640 -k 320 // VECTORIZATION: aVectorDim: GemmDimension::MorN -// VECTORIZATION-NEXT: aVectorLen: 4 +// VECTORIZATION-NEXT: aVectorLen: 2 // VECTORIZATION: bVectorDim: GemmDimension::MorN // VECTORIZATION-NEXT: bVectorLen: 2 diff --git a/mlir/test/fusion/pr-e2e/gemm-layouts/transA-noTransB/sliced-k-e2e.mlir b/mlir/test/fusion/pr-e2e/gemm-layouts/transA-noTransB/sliced-k-e2e.mlir index 9a89a23dfe17..b35867a25f25 100644 --- a/mlir/test/fusion/pr-e2e/gemm-layouts/transA-noTransB/sliced-k-e2e.mlir +++ b/mlir/test/fusion/pr-e2e/gemm-layouts/transA-noTransB/sliced-k-e2e.mlir @@ -7,7 +7,7 @@ // EMITKEY: -t f16 -out_datatype f16 -transA true -transB false -g 2 -m 4096 -n 640 -k 320 // VECTORIZATION: aVectorDim: GemmDimension::MorN -// VECTORIZATION-NEXT: aVectorLen: 4 +// VECTORIZATION-NEXT: aVectorLen: 2 // VECTORIZATION: bVectorDim: GemmDimension::MorN // VECTORIZATION-NEXT: bVectorLen: 2 diff --git a/mlir/test/fusion/pr-e2e/gemm-layouts/transA-noTransB/unitdim-m-e2e.mlir b/mlir/test/fusion/pr-e2e/gemm-layouts/transA-noTransB/unitdim-m-e2e.mlir index 2eb79a3eab78..a22cf6f17803 100644 --- a/mlir/test/fusion/pr-e2e/gemm-layouts/transA-noTransB/unitdim-m-e2e.mlir +++ b/mlir/test/fusion/pr-e2e/gemm-layouts/transA-noTransB/unitdim-m-e2e.mlir @@ -9,7 +9,7 @@ // VECTORIZATION: aVectorDim: GemmDimension::K // VECTORIZATION-NEXT: aVectorLen: 8 // VECTORIZATION: bVectorDim: GemmDimension::MorN -// VECTORIZATION-NEXT: bVectorLen: 4 +// VECTORIZATION-NEXT: bVectorLen: 8 module { func.func @test(%arg0: !migraphx.shaped<2x1x320xf16, 320x1x1>, %arg1: !migraphx.shaped<2x640x320xf16, 204800x1x640>, %arg2: !migraphx.shaped<2x64x10xf16, 0x10x1>) -> !migraphx.shaped<2x64x10xf16, 640x10x1> { diff --git a/mlir/test/fusion/pr-e2e/gemm-layouts/transA-transB/gemm-k-e2e.mlir 
b/mlir/test/fusion/pr-e2e/gemm-layouts/transA-transB/gemm-k-e2e.mlir index 5753af9c6330..62dcc2df3117 100644 --- a/mlir/test/fusion/pr-e2e/gemm-layouts/transA-transB/gemm-k-e2e.mlir +++ b/mlir/test/fusion/pr-e2e/gemm-layouts/transA-transB/gemm-k-e2e.mlir @@ -7,7 +7,7 @@ // EMITKEY: -t f16 -out_datatype f16 -transA true -transB true -g 2 -m 4096 -n 640 -k 320 // VECTORIZATION: aVectorDim: GemmDimension::MorN -// VECTORIZATION-NEXT: aVectorLen: 4 +// VECTORIZATION-NEXT: aVectorLen: 2 // VECTORIZATION: bVectorDim: GemmDimension::K // VECTORIZATION-NEXT: bVectorLen: 8 diff --git a/mlir/test/fusion/pr-e2e/gemm-layouts/transA-transB/sliced-k-e2e.mlir b/mlir/test/fusion/pr-e2e/gemm-layouts/transA-transB/sliced-k-e2e.mlir index b19e6a1ec7df..3f95dfc8caae 100644 --- a/mlir/test/fusion/pr-e2e/gemm-layouts/transA-transB/sliced-k-e2e.mlir +++ b/mlir/test/fusion/pr-e2e/gemm-layouts/transA-transB/sliced-k-e2e.mlir @@ -7,7 +7,7 @@ // EMITKEY: -t f16 -out_datatype f16 -transA true -transB true -g 2 -m 4096 -n 640 -k 320 // VECTORIZATION: aVectorDim: GemmDimension::MorN -// VECTORIZATION-NEXT: aVectorLen: 4 +// VECTORIZATION-NEXT: aVectorLen: 2 // VECTORIZATION: bVectorDim: GemmDimension::K // VECTORIZATION-NEXT: bVectorLen: 8 diff --git a/mlir/utils/performance/analysis/models/gfx1201_conv_f16.ubj b/mlir/utils/performance/analysis/models/gfx1201_conv_f16.ubj new file mode 100644 index 000000000000..ab573c6efc94 Binary files /dev/null and b/mlir/utils/performance/analysis/models/gfx1201_conv_f16.ubj differ diff --git a/mlir/utils/performance/analysis/models/gfx1201_conv_f32.ubj b/mlir/utils/performance/analysis/models/gfx1201_conv_f32.ubj new file mode 100644 index 000000000000..d4241d89e530 Binary files /dev/null and b/mlir/utils/performance/analysis/models/gfx1201_conv_f32.ubj differ diff --git a/mlir/utils/performance/analysis/models/gfx1201_conv_i8.ubj b/mlir/utils/performance/analysis/models/gfx1201_conv_i8.ubj new file mode 100644 index 000000000000..4fe31bc9d19b Binary files 
/dev/null and b/mlir/utils/performance/analysis/models/gfx1201_conv_i8.ubj differ diff --git a/mlir/utils/performance/analysis/models/gfx942_conv_f16.ubj b/mlir/utils/performance/analysis/models/gfx942_conv_f16.ubj new file mode 100644 index 000000000000..159a2df590a6 Binary files /dev/null and b/mlir/utils/performance/analysis/models/gfx942_conv_f16.ubj differ diff --git a/mlir/utils/performance/analysis/models/gfx942_conv_f32.ubj b/mlir/utils/performance/analysis/models/gfx942_conv_f32.ubj new file mode 100644 index 000000000000..6be7414f1517 Binary files /dev/null and b/mlir/utils/performance/analysis/models/gfx942_conv_f32.ubj differ diff --git a/mlir/utils/performance/analysis/models/gfx942_conv_i8.ubj b/mlir/utils/performance/analysis/models/gfx942_conv_i8.ubj new file mode 100644 index 000000000000..7a9ce7c3eb06 Binary files /dev/null and b/mlir/utils/performance/analysis/models/gfx942_conv_i8.ubj differ diff --git a/mlir/utils/performance/analysis/quickTuningGen.py b/mlir/utils/performance/analysis/quickTuningGen.py index ed0b745363a5..51622e9a74f6 100644 --- a/mlir/utils/performance/analysis/quickTuningGen.py +++ b/mlir/utils/performance/analysis/quickTuningGen.py @@ -132,7 +132,7 @@ def validate_files(files): sys.exit(1) -def load_data(files, no_splitk): +def load_data(files, no_splitk, usecols): """Load tuning data from files or stdin.""" if files: validate_files(files) @@ -141,7 +141,7 @@ def load_data(files, no_splitk): for f in files: print(f" {f}") - dfs = [pd.read_csv(f, sep='\t', index_col=None) for f in files] + dfs = [pd.read_csv(f, sep='\t', index_col=None, usecols=usecols) for f in files] df = pd.concat(dfs, ignore_index=True) else: # Read TSV content from stdin @@ -470,7 +470,8 @@ def main(args=None): # Generate quick-tune lists if pargs.op: - df = load_data(pargs.files, pargs.no_splitk) + needed = set(get_target_columns(pargs.op)) | {'Chip', 'DataType', 'PerfConfig', 'TFlops'} + df = load_data(pargs.files, pargs.no_splitk, usecols=needed) if 
not df.empty: archs = sorted(df['Chip'].unique()) print(f"Processing {len(archs)} architecture(s): {', '.join(archs)}")
diff --git a/mlir/utils/performance/analysis/trainQuickTuneClassifier.py b/mlir/utils/performance/analysis/trainQuickTuneClassifier.py
new file mode 100755
index 000000000000..5c3552d645ef
--- /dev/null
+++ b/mlir/utils/performance/analysis/trainQuickTuneClassifier.py
@@ -0,0 +1,402 @@
+#!/usr/bin/env python3
+"""Quick-Tune Classifier Training Script
+
+Trains XGBoost regressors to predict the performance ratio of quick-tune-list
+perfconfigs for a given problem size. Produces one .ubj model file per
+(arch, op, dtype) combination.
+
+Training data is filtered to only include perfconfigs present in the
+quick-tune list (obtained via ``rocmlir-gen --emit-tuning-space=quick``).
+"""
+
+import argparse
+import os
+import subprocess
+import sys
+from pathlib import Path
+
+import numpy as np
+from xgboost import XGBRegressor
+
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+import perfRunner  # noqa: E402
+
+from quickTuningGen import (  # noqa: E402
+    get_target_columns, load_data, parse_perfconfig,
+)
+
+DIRECTION_MAP = {'fwd': 0, 'bwd': 1, 'wrw': 2}
+
+# ---------------------------------------------------------------------------
+# Quick-tune list via rocmlir-gen
+# ---------------------------------------------------------------------------
+
+
+def get_quick_tune_configs(rocmlir_gen, arch, op, dtype):
+    """Run ``rocmlir-gen --emit-tuning-space=quick`` and return perfconfigs.
+
+    Returns the non-empty output lines as a list of strings, or ``None`` when
+    the tool cannot be run, times out, exits non-zero, or prints nothing.
+    """
+    cmd = [
+        rocmlir_gen,
+        '-p',
+        '--arch',
+        arch,
+        f'--operation={op}',
+        '-t',
+        dtype,
+        '--emit-tuning-space=quick',
+    ]
+    # ROCMLIR_QUICK_TUNE_TOP_N=0 disables the classifier, so the full
+    # (unfiltered) quick-tune list is emitted.
+    env = {**os.environ, 'ROCMLIR_QUICK_TUNE_TOP_N': '0'}
+    try:
+        result = subprocess.run(cmd, capture_output=True, text=True, timeout=10, env=env)
+    except (subprocess.TimeoutExpired, OSError) as e:
+        # OSError also covers FileNotFoundError and PermissionError.
+        print(f' rocmlir-gen failed: {e}', file=sys.stderr)
+        return None
+    if result.returncode != 0:
+        # Surface the tool's diagnostics instead of failing silently.
+        print(f' rocmlir-gen exited with status {result.returncode}: '
+              f'{result.stderr.strip()}',
+              file=sys.stderr)
+        return None
+    configs = [line.strip() for line in result.stdout.splitlines() if line.strip()]
+    return configs if configs else None
+
+
+# ---------------------------------------------------------------------------
+# Feature extraction: raw problem dims + raw perfconfig params.
+# Each op type uses its own problem features; perfconfig params are always
+# the raw numeric values from the perfconfig string. Must match the C++
+# feature builders in QuickTuningClassifier.cpp exactly.
+# ---------------------------------------------------------------------------
+
+
+def _perfconfig_params(perfconfig):
+    """Extract raw numeric params from a perfconfig string."""
+    _, _, params = parse_perfconfig(perfconfig)
+    return [int(x) for x in params]
+
+
+def _conv_problem_features(df):
+    """Raw conv dimensions as feature columns (14 features)."""
+    # Direction strings missing from DIRECTION_MAP are encoded as 0.
+    direction = np.array([DIRECTION_MAP.get(d, 0)
+                          for d in df['Direction'].values],
+                         dtype=np.float32)
+    return np.column_stack([
+        df['N'].values, df['C'].values, df['H'].values, df['W'].values,
+        df['K'].values, df['Y'].values, df['X'].values,
+        df['PaddingH'].values, df['PaddingW'].values,
+        df['StrideH'].values, df['StrideW'].values,
+        df['DilationH'].values, df['DilationW'].values,
+        direction,
+    ]).astype(np.float32)
+
+
+def _gemm_problem_features(df):
+    """Raw GEMM dimensions as feature columns (4 features)."""
+    return np.column_stack([
+        df['G'].values, df['M'].values, df['N'].values, df['K'].values,
+    ]).astype(np.float32)
+
+
+def _attention_problem_features(df):
+    """Raw attention dimensions as feature columns (7 features)."""
+    return np.column_stack([
+        df['G'].values, df['SeqLenQ'].values, df['SeqLenK'].values,
+        df['NumHeadsQ'].values, df['NumHeadsKV'].values,
+        df['HeadDimQK'].values, df['HeadDimV'].values,
+    ]).astype(np.float32)
+
+
+def build_feature_matrix(df, op):
+    """Raw problem dims + raw perfconfig params.
+
+    Returns ``(features, n_features)``; problem columns come first.
+    """
+    pc_raw = np.array([_perfconfig_params(pc) for pc in df['PerfConfig']],
+                      dtype=np.float32)
+    if op == 'conv':
+        problem = _conv_problem_features(df)
+    elif op == 'attention':
+        problem = _attention_problem_features(df)
+    else:
+        problem = _gemm_problem_features(df)
+    feats = np.column_stack([problem, pc_raw]).astype(np.float32)
+    return feats, feats.shape[1]
+
+
+# ---------------------------------------------------------------------------
+# Coverage evaluation
+# ---------------------------------------------------------------------------
+
+
+def _topn_ratios(model, df, problem_cols, op, top_n):
+    """For each problem, pick top-N by predicted score and return the best
+    actual TFlops ratio found per problem."""
+    features, _ = build_feature_matrix(df, op)
+    pred = model.predict(features)
+
+    df = df.copy()
+    df['_pred'] = pred
+
+    ratios = []
+    for _, grp in df.groupby(problem_cols):
+        top = grp.nlargest(min(top_n, len(grp)), '_pred')
+        ratios.append(top['ratio'].max())
+    return np.array(ratios)
+
+
+def evaluate_coverage(model, df, problem_cols, op, top_n):
+    """Returns (mean_ratio, min_ratio) on the given data."""
+    ratios = _topn_ratios(model, df, problem_cols, op, top_n)
+    return float(ratios.mean()), float(ratios.min())
+
+
+def cross_validate(df, problem_cols, op, top_n, n_folds=5, model_params=None):
+    """K-fold cross-validation split by problem (not by sample).
+
+    Trains on (k-1)/k of the problems, evaluates on the held-out 1/k.
+    Returns per-problem coverage ratios for all held-out problems. The
+    result is empty when every fold is skipped, which happens when the data
+    holds fewer than two distinct problems.
+    """
+    df = df.copy()
+    df['_problem_id'] = df.groupby(problem_cols).ngroup()
+    unique_problems = np.array(sorted(df['_problem_id'].unique()))
+
+    rng = np.random.RandomState(42)
+    shuffled = rng.permutation(unique_problems)
+    folds = np.array_split(shuffled, n_folds)
+
+    all_ratios = []
+
+    for fold_idx, test_problem_ids in enumerate(folds):
+        test_set = set(test_problem_ids)
+        train_mask = ~df['_problem_id'].isin(test_set)
+        test_mask = df['_problem_id'].isin(test_set)
+
+        df_train = df[train_mask].reset_index(drop=True)
+        df_test = df[test_mask].reset_index(drop=True)
+
+        if df_train.empty or df_test.empty:
+            continue
+
+        X_train, _ = build_feature_matrix(df_train, op)
+        y_train = df_train['ratio'].values.astype(np.float32)
+
+        fold_model = XGBRegressor(**(model_params or {}))
+        fold_model.fit(X_train, y_train)
+
+        fold_ratios = _topn_ratios(fold_model, df_test, problem_cols, op, top_n)
+        n_test = len(test_set)
+        print(f' fold {fold_idx+1}/{n_folds}: {n_test} problems, '
+              f'mean={float(fold_ratios.mean()):.4f} min={float(fold_ratios.min()):.4f}',
+              flush=True)
+        all_ratios.extend(fold_ratios.tolist())
+
+    return np.array(all_ratios)
+
+
+# ---------------------------------------------------------------------------
+# Training loop
+# ---------------------------------------------------------------------------
+
+
+def train_models(df, op, top_n, output_dir, rocmlir_gen):
+    """Train, evaluate and save a model for each usable (arch, dtype) in ``df``.
+
+    Returns a list of per-model summary dicts (key, sample count, train and
+    cross-validation coverage).
+    """
+    problem_cols = get_target_columns(op)
+
+    summary = []
+
+    for arch in sorted(df['Chip'].unique()):
+        df_arch = df[df['Chip'] == arch]
+
+        for dtype in sorted(df_arch['DataType'].unique()):
+            key = f'{arch}_{op}_{dtype}'
+
+            print(f'\n {key}: fetching quick-tune list ...')
+            qt_list = get_quick_tune_configs(rocmlir_gen, arch, op, dtype)
+            if qt_list is None:
+                print(' no quick-tune list available, skipping')
+                continue
+
+            qt_set = set(qt_list)
+            mask = (df_arch['DataType'] == dtype) & df_arch['PerfConfig'].isin(qt_set)
+            df_sub = df_arch[mask].copy()
+
+            if df_sub.empty:
+                print(' no matching rows in data, skipping')
+                continue
+
+            n_pc_params = len(_perfconfig_params(df_sub['PerfConfig'].iloc[0]))
+            bad_rows = df_sub['PerfConfig'].apply(
+                lambda pc, n=n_pc_params: len(_perfconfig_params(pc)) != n)
+            if bad_rows.any():
+                print(f' dropping {bad_rows.sum()} rows with inconsistent '
+                      f'perfconfig param counts')
+                df_sub = df_sub[~bad_rows]
+
+            df_best = df_sub.groupby(problem_cols + ['PerfConfig'], as_index=False)['TFlops'].max()
+            df_best = df_best[df_best['TFlops'].notna() & (df_best['TFlops'] > 0)]
+            if df_best.empty:
+                # Nothing to fit; also avoids reducing over empty arrays below.
+                print(' no rows with positive TFlops, skipping')
+                continue
+
+            max_tf = df_best.groupby(
+                problem_cols, as_index=False)['TFlops'].max().rename(columns={'TFlops': '_max_tf'})
+            df_labeled = df_best.merge(max_tf, on=problem_cols)
+            df_labeled['ratio'] = df_labeled['TFlops'] / df_labeled['_max_tf']
+
+            y = df_labeled['ratio'].values.astype(np.float32)
+
+            features, n_feats = build_feature_matrix(df_labeled, op)
+
+            print(f' samples: {len(y)}', flush=True)
+            print(f' features: {n_feats} (raw problem + perfconfig)', flush=True)
+            n_problems = len(max_tf)
+            configs_per_problem = len(y) // n_problems if n_problems else 0
+            print(f' quick-tune configs: {len(qt_set)}, '
+                  f'unique problems: {n_problems}, '
+                  f'~{configs_per_problem} configs/problem',
+                  flush=True)
+
+            model_params = dict(
+                objective='reg:squarederror',
+                n_estimators=1000,
+                max_depth=12,
+                learning_rate=0.1,
+                subsample=1.0,
+                colsample_bytree=1.0,
+                n_jobs=min(os.cpu_count() or 1, 16),
+                random_state=42,
+                verbosity=0,
+            )
+            model = XGBRegressor(**model_params)
+
+            print(' training (reg:squarederror)...', flush=True)
+            model.fit(features, y)
+
+            mean_r, min_r = evaluate_coverage(model, df_labeled, problem_cols, op, top_n)
+            print(f' train top-{top_n}: mean={mean_r:.4f} min={min_r:.4f}', flush=True)
+
+            print(f' cross-validating (5-fold by problem)...', flush=True)
+            cv_ratios = cross_validate(df_labeled, problem_cols, op, top_n,
+                                       n_folds=5, model_params=model_params)
+            if cv_ratios.size:
+                cv_mean = float(cv_ratios.mean())
+                cv_min = float(cv_ratios.min())
+            else:
+                # Every fold was skipped; min() of an empty array would raise.
+                cv_mean = cv_min = float('nan')
+            print(f' CV top-{top_n}: mean={cv_mean:.4f} min={cv_min:.4f}', flush=True)
+
+            model_path = Path(output_dir) / f'{key}.ubj'
+            model_path.parent.mkdir(parents=True, exist_ok=True)
+            model.save_model(str(model_path))
+            print(f' saved: {model_path}')
+
+            summary.append({
+                'key': key,
+                'samples': len(y),
+                'mean_ratio': mean_r,
+                'min_ratio': min_r,
+                'cv_mean': cv_mean,
+                'cv_min': cv_min,
+            })
+
+    return summary
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+
+def main(args=None):
+    """Parse CLI arguments, load the tuning data and train the models.
+
+    Returns the process exit status: 1 when no data was loaded, else 0.
+    """
+    parser = argparse.ArgumentParser(
+        prog='trainQuickTuneClassifier.py',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description='Train XGBoost classifiers for quick-tune perfconfig ranking.',
+        epilog='''
+Examples:
+  %(prog)s --op gemm \\
+      /data/gfx942/gemm-exhaustive.tsv.debug --output-dir models/
+  %(prog)s --op conv \\
+      /data/gfx942/conv-tier1.tsv.debug --output-dir models/
+  %(prog)s --op attention \\
+      /data/gfx942/attn-exhaustive.tsv.debug --output-dir models/
+''')
+
+    parser.add_argument('files',
+                        nargs='+',
+                        metavar='FILE',
+                        help='.debug TSV files produced by tuningRunner.py')
+    parser.add_argument('--op',
+                        required=True,
+                        choices=['gemm', 'conv', 'attention'],
+                        help='Operation type')
+    parser.add_argument('--top-n',
+                        type=int,
+                        default=30,
+                        help='Number of top candidates for coverage evaluation (default: 30)')
+    parser.add_argument('--output-dir',
+                        default='models/',
+                        help='Output directory for .ubj model files (default: models/)')
+    parser.add_argument('--mlir-build-dir',
+                        default=perfRunner.find_mlir_build_dir(),
+                        metavar='DIR',
+                        help='MLIR build directory (auto-detected if omitted)')
+
+    pargs = parser.parse_args(args)
+    if pargs.top_n < 1:
+        # A non-positive N selects no candidates, making coverage undefined.
+        parser.error('--top-n must be a positive integer')
+
+    paths = perfRunner.create_paths(None, pargs.mlir_build_dir)
+    if paths.mlir_paths is None:
+        parser.error('Cannot find rocmlir-gen. '
+                     'Use --mlir-build-dir to specify the build directory.')
+    rocmlir_gen = paths.mlir_paths.rocmlir_gen_path
+
+    needed = set(get_target_columns(
+        pargs.op)) | {'Chip', 'DataType', 'PerfConfig', 'TFlops'}
+    print(f'Loading {len(pargs.files)} file(s)...', flush=True)
+    df = load_data(pargs.files, no_splitk=False, usecols=needed)
+    if df.empty:
+        print('ERROR: no data loaded', file=sys.stderr)
+        return 1
+
+    print(f'Loaded {len(df)} rows', flush=True)
+    print(f'Architectures: {sorted(df["Chip"].unique())}')
+    print(f'Data types: {sorted(df["DataType"].unique())}')
+
+    results = train_models(df, pargs.op, pargs.top_n, pargs.output_dir, rocmlir_gen)
+
+    if results:
+        print(f'\n{"=" * 60}')
+        print(f'Trained {len(results)} model(s):')
+        for r in results:
+            print(f' {r["key"]:30s} train: mean={r["mean_ratio"]:.4f} min={r["min_ratio"]:.4f}'
+                  f' | CV: mean={r["cv_mean"]:.4f} min={r["cv_min"]:.4f}')
+    else:
+        print('\nNo models trained.')
+
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main())