diff --git a/projects/hipblaslt/clients/tests/data/matmul_gtest.yaml b/projects/hipblaslt/clients/tests/data/matmul_gtest.yaml index 84db82eabe6d..d47da80e9fdf 100755 --- a/projects/hipblaslt/clients/tests/data/matmul_gtest.yaml +++ b/projects/hipblaslt/clients/tests/data/matmul_gtest.yaml @@ -1574,7 +1574,7 @@ Tests: beta: [ 0.0, 2.0 ] unit_check: 1 algo_method: [2] - gpu_arch_exclude: '1[1-2]\d{2}' + gpu_arch_exclude: '11\d{2}|120\d{1}' - name: matmul_groupedgemm category: pre_checkin @@ -1589,7 +1589,7 @@ Tests: beta: [ 0.0, 2.0 ] unit_check: 1 algo_method: [0, 2] - gpu_arch_exclude: '1[1-2]\d{2}' + gpu_arch_exclude: '11\d{2}|120\d{1}' - name: matmul_groupedgemm_f8_fnuz category: pre_checkin diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1250/GridBased/gfx1250_Cijk_Ailk_Bjlk_HHS_BH_Bias_GG_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1250/GridBased/gfx1250_Cijk_Ailk_Bjlk_HHS_BH_Bias_GG_HA_S_SAV_UserArgs.yaml new file mode 100644 index 000000000000..7bc5b30772cc --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1250/GridBased/gfx1250_Cijk_Ailk_Bjlk_HHS_BH_Bias_GG_HA_S_SAV_UserArgs.yaml @@ -0,0 +1,1247 @@ +- {MinimumRequiredVersion: 5.0.0} +- gfx1250 +- gfx1250 +- [Device 73f0] +- Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 4] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DataTypeMXSA: 22 + DataTypeMXSB: 22 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: true + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MXBlockA: 0 + MXBlockB: 0 + MacDataTypeA: 4 + MacDataTypeB: 4 + MetadataLayout: 0 + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMXSA: [] + MirrorDimsMXSB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + isMixMode: false +- - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Ailk_Bjlk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT2RdeBVnY9LsdfU3UWF21lSGIg87uBBqtjbk3MlUl_5oo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB1_GSUAMB_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT8_4_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SGROB0_SIA0_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS0_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS32_WG32_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 32 + LVCB: 128 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 229376 + LdsInitCVgprs: false + LdsNumBytes: 229376 + LdsNumElementsAlignedA: 65536 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 0 + LdsNumElementsAlignedMXSB: 0 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 65536 + LdsOffsetB_Blk: 196608 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 65536 + LdsOffsetMXSA_Blk: 196608 + LdsOffsetMXSB: 65536 + LdsOffsetMXSB_Blk: 196608 + LdsOffsetMetadata: 65536 + LdsOffsetMetadata_Blk: 196608 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 8 + LocalReadVectorWidthB: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 16 + MIInputPerThreadA: 16 + MIInputPerThreadB: 16 + MIInputPerThreadMetadata: 16 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 327680 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 1 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 32 + NumLoadsB: 128 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 128 + NumThreads: 128 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: false + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 0 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_GG_HA_S_SAV_UserArgs_MT256x128x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT8_4_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SGROB0_SIA0_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS0_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS32_WG32_4_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TDMInst: 3 + TDMSplit: false + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 0 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + Use64bShadowLimit: true + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + enableLDSTrMetadata: 0 + enableTDMA: true + enableTDMB: true + ldsNumBytesA: 65536 + ldsNumBytesB: 32768 + ldsNumBytesMXSA: 0 + ldsNumBytesMXSB: 0 + ldsNumBytesMetadata: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Ailk_Bjlk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT3kEX5yYoNZsA0zsf9ueqyg0Ro7mJZHUoj6JMMCT52iNI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB1_GSUAMB_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT1_1_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SGROB0_SIA0_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS0_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG32_4_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 4 + LVCA: 4 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 8192 + LdsInitCVgprs: false + LdsNumBytes: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedMXSA: 0 + LdsNumElementsAlignedMXSB: 0 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 2048 + LdsOffsetMXSA_Blk: 6144 + LdsOffsetMXSB: 2048 + LdsOffsetMXSB_Blk: 6144 + LdsOffsetMetadata: 2048 + LdsOffsetMetadata_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 8 + LocalReadVectorWidthB: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 16 + MIInputPerThreadA: 16 + MIInputPerThreadB: 16 + MIInputPerThreadMetadata: 16 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 327680 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 1 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: false + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 0 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_GG_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT1_1_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SGROB0_SIA0_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS0_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG32_4_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TDMInst: 3 + TDMSplit: false + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + Use64bShadowLimit: true + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + enableLDSTrMetadata: 0 + enableTDMA: true + enableTDMB: true + ldsNumBytesA: 2048 + ldsNumBytesB: 2048 + ldsNumBytesMXSA: 0 + ldsNumBytesMXSB: 0 + ldsNumBytesMetadata: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Ailk_Bjlk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT3pg5utUeMZAIeeeg3x6_6dFi2BSgNo4tvTwknAik6K20= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB1_GSUAMB_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT1_1_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SGROB0_SIA0_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS0_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG32_4_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 4 + LVCA: 4 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 0 + LdsNumElementsAlignedMXSB: 0 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 16384 + LdsOffsetMXSB_Blk: 49152 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 8 + LocalReadVectorWidthB: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 16 + MIInputPerThreadA: 16 + MIInputPerThreadB: 16 + MIInputPerThreadMetadata: 16 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 327680 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 1 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 64 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 64 + NumThreads: 128 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: false + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 0 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_GG_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT1_1_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SGROB0_SIA0_SS0_SU32_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS0_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG32_4_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 512.0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TDMInst: 3 + TDMSplit: false + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + Use64bShadowLimit: true + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + enableLDSTrMetadata: 0 + enableTDMA: true + enableTDMB: true + ldsNumBytesA: 16384 + ldsNumBytesB: 16384 + ldsNumBytesMXSA: 0 + ldsNumBytesMXSB: 0 + ldsNumBytesMetadata: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Ailk_Bjlk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT2zd7fMMojE6tyfJ33owyYyBJeTByPBbdCSSeNE82XGao= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB1_GSUAMB_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT8_8_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB2_ONLL1_PGR2_PLR0_PKA0_SGROB0_SIA0_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS0_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS32_WG32_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 32 + LVCB: 128 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 0 + LdsNumElementsAlignedMXSB: 0 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 16384 + LdsOffsetMXSB_Blk: 49152 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 8 + LocalReadVectorWidthB: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 16 + MIInputPerThreadA: 16 + MIInputPerThreadB: 16 + MIInputPerThreadMetadata: 16 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 327680 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 1 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 512 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 64 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 32 + NumThreads: 128 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: false + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 0 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_GG_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT8_8_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB2_ONLL1_PGR2_PLR0_PKA0_SGROB0_SIA0_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS0_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS32_WG32_4_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TDMInst: 3 + TDMSplit: false + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 8 + ThreadTileA: 64 + ThreadTileB: 8 + TransposeLDS: 0 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + Use64bShadowLimit: true + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + enableLDSTrMetadata: 0 + enableTDMA: true + enableTDMB: true + ldsNumBytesA: 16384 + ldsNumBytesB: 16384 + ldsNumBytesMXSA: 0 + ldsNumBytesMXSB: 0 + ldsNumBytesMetadata: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- [2, 3, 0, 1] +- - - [256, 256, 1, 128] + - [0, 0.0] + - - [32, 32, 1, 128] + - [1, 0.0] + - - [32, 32, 1, 256] + - [2, 0.0] + - - [512, 512, 1, 32] + - [3, 0.0] +- null +- null +- DeviceEfficiency +- GridBased diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1250/GridBased/gfx1250_Cijk_Ailk_Bjlk_HSS_BH_Bias_GG_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1250/GridBased/gfx1250_Cijk_Ailk_Bjlk_HSS_BH_Bias_GG_HA_S_SAV_UserArgs.yaml new file mode 100644 index 000000000000..3879f5b3c2ff --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1250/GridBased/gfx1250_Cijk_Ailk_Bjlk_HSS_BH_Bias_GG_HA_S_SAV_UserArgs.yaml @@ -0,0 +1,1247 @@ +- {MinimumRequiredVersion: 5.0.0} +- gfx1250 +- gfx1250 +- [Device 73f0] +- Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: true + BiasDataTypeList: [0, 4] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 0 + DataTypeMXSA: 22 + DataTypeMXSB: 22 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: true + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MXBlockA: 0 + MXBlockB: 0 + MacDataTypeA: 4 + MacDataTypeB: 4 + MetadataLayout: 0 + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMXSA: [] + MirrorDimsMXSB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + isMixMode: false +- - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Ailk_Bjlk_HSS_BH_Bias_HA_S_SAV_UserArgs_MT2RdeBVnY9LsdfU3UWF21lSGIg87uBBqtjbk3MlUl_5oo= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_HSS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB1_GSUAMB_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT8_4_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SGROB0_SIA0_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS0_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS32_WG32_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 32 + LVCB: 128 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 229376 + LdsInitCVgprs: false + LdsNumBytes: 229376 + LdsNumElementsAlignedA: 65536 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 0 + LdsNumElementsAlignedMXSB: 0 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 65536 + LdsOffsetB_Blk: 196608 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 65536 + LdsOffsetMXSA_Blk: 196608 + LdsOffsetMXSB: 65536 + LdsOffsetMXSB_Blk: 196608 + LdsOffsetMetadata: 65536 + LdsOffsetMetadata_Blk: 196608 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 8 + LocalReadVectorWidthB: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 16 + MIInputPerThreadA: 16 + MIInputPerThreadB: 16 + MIInputPerThreadMetadata: 16 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 327680 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 1 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 32 + NumLoadsB: 128 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 128 + NumThreads: 128 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: false + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 0 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bjlk_HSS_BH_Bias_GG_HA_S_SAV_UserArgs_MT256x128x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT8_4_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SGROB0_SIA0_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS0_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS32_WG32_4_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TDMInst: 3 + TDMSplit: false + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 0 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + Use64bShadowLimit: true + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + enableLDSTrMetadata: 0 + enableTDMA: true + enableTDMB: true + ldsNumBytesA: 65536 + ldsNumBytesB: 32768 + ldsNumBytesMXSA: 0 + ldsNumBytesMXSB: 0 + ldsNumBytesMetadata: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Ailk_Bjlk_HSS_BH_Bias_HA_S_SAV_UserArgs_MT3kEX5yYoNZsA0zsf9ueqyg0Ro7mJZHUoj6JMMCT52iNI= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_HSS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB1_GSUAMB_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT1_1_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SGROB0_SIA0_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS0_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG32_4_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 4 + LVCA: 4 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 8192 + LdsInitCVgprs: false + LdsNumBytes: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedMXSA: 0 + LdsNumElementsAlignedMXSB: 0 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 2048 + LdsOffsetMXSA_Blk: 6144 + LdsOffsetMXSB: 2048 + LdsOffsetMXSB_Blk: 6144 + LdsOffsetMetadata: 2048 + LdsOffsetMetadata_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 8 + LocalReadVectorWidthB: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 16 + MIInputPerThreadA: 16 + MIInputPerThreadB: 16 + MIInputPerThreadMetadata: 16 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 327680 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 1 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: false + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 0 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bjlk_HSS_BH_Bias_GG_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT1_1_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SGROB0_SIA0_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS0_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG32_4_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TDMInst: 3 + TDMSplit: false + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + Use64bShadowLimit: true + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + enableLDSTrMetadata: 0 + enableTDMA: true + enableTDMB: true + ldsNumBytesA: 2048 + ldsNumBytesB: 2048 + ldsNumBytesMXSA: 0 + ldsNumBytesMXSB: 0 + ldsNumBytesMetadata: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Ailk_Bjlk_HSS_BH_Bias_HA_S_SAV_UserArgs_MT3pg5utUeMZAIeeeg3x6_6dFi2BSgNo4tvTwknAik6K20= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_HSS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB1_GSUAMB_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT1_1_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SGROB0_SIA0_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS0_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG32_4_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 4 + LVCA: 4 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 0 + LdsNumElementsAlignedMXSB: 0 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 16384 + LdsOffsetMXSB_Blk: 49152 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 8 + LocalReadVectorWidthB: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 16 + MIInputPerThreadA: 16 + MIInputPerThreadB: 16 + MIInputPerThreadMetadata: 16 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 327680 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 1 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 64 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 64 + NumThreads: 128 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: false + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 0 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bjlk_HSS_BH_Bias_GG_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT1_1_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SGROB0_SIA0_SS0_SU32_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS0_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG32_4_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 512.0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TDMInst: 3 + TDMSplit: false + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 0 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + Use64bShadowLimit: true + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + enableLDSTrMetadata: 0 + enableTDMA: true + enableTDMB: true + ldsNumBytesA: 16384 + ldsNumBytesB: 16384 + ldsNumBytesMXSA: 0 + ldsNumBytesMXSB: 0 + ldsNumBytesMetadata: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Ailk_Bjlk_HSS_BH_Bias_HA_S_SAV_UserArgs_MT2zd7fMMojE6tyfJ33owyYyBJeTByPBbdCSSeNE82XGao= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bjlk_HSS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB1_GSUAMB_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT8_8_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB2_ONLL1_PGR2_PLR0_PKA0_SGROB0_SIA0_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS0_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS32_WG32_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 4 + LSPB: 1 + LVCA: 32 + LVCB: 128 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 0 + LdsNumElementsAlignedMXSB: 0 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 16384 + LdsOffsetMXSB_Blk: 49152 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 8 + LocalReadVectorWidthB: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 16 + MIInputPerThreadA: 16 + MIInputPerThreadB: 16 + MIInputPerThreadMetadata: 16 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 327680 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 1 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 512 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 64 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 32 + NumThreads: 128 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: false + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 0 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bjlk_HSS_BH_Bias_GG_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT8_8_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB2_ONLL1_PGR2_PLR0_PKA0_SGROB0_SIA0_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS0_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS32_WG32_4_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TDMInst: 3 + TDMSplit: false + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 8 + ThreadTileA: 64 + ThreadTileB: 8 + TransposeLDS: 0 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: 0 + UnrollMajorLDSB: 0 + Use64bShadowLimit: true + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + enableLDSTrMetadata: 0 + enableTDMA: true + enableTDMB: true + ldsNumBytesA: 16384 + ldsNumBytesB: 16384 + ldsNumBytesMXSA: 0 + ldsNumBytesMXSB: 0 + ldsNumBytesMetadata: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- [2, 3, 0, 1] +- - - [256, 256, 1, 128] + - [0, 0.0] + - - [32, 32, 1, 128] + - [1, 0.0] + - - [32, 32, 1, 256] + - [2, 0.0] + - - [512, 512, 1, 32] + - [3, 0.0] +- null +- null +- DeviceEfficiency +- GridBased diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1250/GridBased/gfx1250_Cijk_Ailk_Bljk_HHS_BH_Bias_GG_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1250/GridBased/gfx1250_Cijk_Ailk_Bljk_HHS_BH_Bias_GG_HA_S_SAV_UserArgs.yaml new file mode 100644 index 000000000000..5f668b3aaf00 --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1250/GridBased/gfx1250_Cijk_Ailk_Bljk_HHS_BH_Bias_GG_HA_S_SAV_UserArgs.yaml @@ -0,0 +1,1247 @@ +- {MinimumRequiredVersion: 5.0.0} +- gfx1250 +- gfx1250 +- [Device 73f0] +- Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 4] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DataTypeMXSA: 22 + DataTypeMXSB: 22 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: true + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MXBlockA: 0 + MXBlockB: 0 + MacDataTypeA: 4 + MacDataTypeB: 4 + MetadataLayout: 0 + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMXSA: [] + MirrorDimsMXSB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + isMixMode: false +- - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Ailk_Bljk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT2RYmfcLGBqY7vBi04wQyk9vhTlR1BTjtOip516nv2tmE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT8_4_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SGROB0_SIA0_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS32_WG32_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 229376 + LdsInitCVgprs: false + LdsNumBytes: 229376 + LdsNumElementsAlignedA: 65536 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 0 + LdsNumElementsAlignedMXSB: 0 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 65536 + LdsOffsetB_Blk: 196608 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 65536 + LdsOffsetMXSA_Blk: 196608 + LdsOffsetMXSB: 65536 + LdsOffsetMXSB_Blk: 196608 + LdsOffsetMetadata: 65536 + LdsOffsetMetadata_Blk: 196608 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 8 + LocalReadVectorWidthB: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 16 + MIInputPerThreadA: 16 + MIInputPerThreadB: 16 + MIInputPerThreadMetadata: 16 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 327680 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 1 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 32 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 16 + NumThreads: 128 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: false + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 0 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_GG_HA_S_SAV_UserArgs_MT256x128x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT8_4_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SGROB0_SIA0_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS32_WG32_4_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TDMInst: 3 + TDMSplit: false + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + Use64bShadowLimit: true + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + enableLDSTrMetadata: 0 + enableTDMA: true + enableTDMB: true + ldsNumBytesA: 65536 + ldsNumBytesB: 32768 + ldsNumBytesMXSA: 0 + ldsNumBytesMXSB: 0 + ldsNumBytesMetadata: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Ailk_Bljk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT3kZyxxoKvzvQDQ3D6r_QvOcs-yB2tGza1myO-x7aluMY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT1_1_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SGROB0_SIA0_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG32_4_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 8192 + LdsInitCVgprs: false + LdsNumBytes: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedMXSA: 0 + LdsNumElementsAlignedMXSB: 0 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 2048 + LdsOffsetMXSA_Blk: 6144 + LdsOffsetMXSB: 2048 + LdsOffsetMXSB_Blk: 6144 + LdsOffsetMetadata: 2048 + LdsOffsetMetadata_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 8 + LocalReadVectorWidthB: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 16 + MIInputPerThreadA: 16 + MIInputPerThreadB: 16 + MIInputPerThreadMetadata: 16 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 327680 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 1 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: false + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 0 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_GG_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT1_1_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SGROB0_SIA0_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG32_4_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TDMInst: 3 + TDMSplit: false + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + Use64bShadowLimit: true + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + enableLDSTrMetadata: 0 + enableTDMA: true + enableTDMB: true + ldsNumBytesA: 2048 + ldsNumBytesB: 2048 + ldsNumBytesMXSA: 0 + ldsNumBytesMXSB: 0 + ldsNumBytesMetadata: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Ailk_Bljk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT3r8pYiAlhSTuS52LL0GNKk0fa2PsysIFaHKR_cB2VN-8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT1_1_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SGROB0_SIA0_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG32_4_1 + LDSTrInst: false + LSCA: 32 + LSCB: 256 + LSPA: 32 + LSPB: 4 + LVCA: 4 + LVCB: 32 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 0 + LdsNumElementsAlignedMXSB: 0 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 16384 + LdsOffsetMXSB_Blk: 49152 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 8 + LocalReadVectorWidthB: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 16 + MIInputPerThreadA: 16 + MIInputPerThreadB: 16 + MIInputPerThreadMetadata: 16 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 327680 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 1 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: false + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 0 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_GG_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT1_1_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SGROB0_SIA0_SS0_SU32_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG32_4_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 512.0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TDMInst: 3 + TDMSplit: false + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + Use64bShadowLimit: true + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + enableLDSTrMetadata: 0 + enableTDMA: true + enableTDMB: true + ldsNumBytesA: 16384 + ldsNumBytesB: 16384 + ldsNumBytesMXSA: 0 + ldsNumBytesMXSB: 0 + ldsNumBytesMetadata: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Ailk_Bljk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT2IqXuzLHdIcXTpWEIa58GVTNVJI8Da6twwPr3geNxYNU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT8_8_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SGROB0_SIA0_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS32_WG32_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 0 + LdsNumElementsAlignedMXSB: 0 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 16384 + LdsOffsetMXSB_Blk: 49152 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 8 + LocalReadVectorWidthB: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 16 + MIInputPerThreadA: 16 + MIInputPerThreadB: 16 + MIInputPerThreadMetadata: 16 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 327680 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 1 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 512 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: false + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 0 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bljk_HHS_BH_Bias_GG_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT8_8_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SGROB0_SIA0_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS32_WG32_4_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TDMInst: 3 + TDMSplit: false + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 8 + ThreadTileA: 64 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + Use64bShadowLimit: true + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + enableLDSTrMetadata: 0 + enableTDMA: true + enableTDMB: true + ldsNumBytesA: 16384 + ldsNumBytesB: 16384 + ldsNumBytesMXSA: 0 + ldsNumBytesMXSB: 0 + ldsNumBytesMetadata: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- [2, 3, 0, 1] +- - - [256, 256, 1, 128] + - [0, 0.0] + - - [32, 32, 1, 128] + - [1, 0.0] + - - [32, 32, 1, 256] + - [2, 0.0] + - - [512, 512, 1, 32] + - [3, 0.0] +- null +- null +- DeviceEfficiency +- GridBased diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1250/GridBased/gfx1250_Cijk_Ailk_Bljk_HSS_BH_Bias_GG_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1250/GridBased/gfx1250_Cijk_Ailk_Bljk_HSS_BH_Bias_GG_HA_S_SAV_UserArgs.yaml new file mode 100644 index 000000000000..e83589de1bf4 --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1250/GridBased/gfx1250_Cijk_Ailk_Bljk_HSS_BH_Bias_GG_HA_S_SAV_UserArgs.yaml @@ -0,0 +1,1247 @@ +- {MinimumRequiredVersion: 5.0.0} +- gfx1250 +- gfx1250 +- [Device 73f0] +- Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: true + BiasDataTypeList: [0, 4] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 0 + DataTypeMXSA: 22 + DataTypeMXSB: 22 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: true + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [0, 3, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 1 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MXBlockA: 0 + MXBlockB: 0 + MacDataTypeA: 4 + MacDataTypeB: 4 + MetadataLayout: 0 + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMXSA: [] + MirrorDimsMXSB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: true + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 0 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + isMixMode: false +- - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Ailk_Bljk_HSS_BH_Bias_HA_S_SAV_UserArgs_MT2RYmfcLGBqY7vBi04wQyk9vhTlR1BTjtOip516nv2tmE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_HSS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT8_4_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SGROB0_SIA0_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS32_WG32_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 128 + LSPA: 4 + LSPB: 8 + LVCA: 32 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 229376 + LdsInitCVgprs: false + LdsNumBytes: 229376 + LdsNumElementsAlignedA: 65536 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 0 + LdsNumElementsAlignedMXSB: 0 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 65536 + LdsOffsetB_Blk: 196608 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 65536 + LdsOffsetMXSA_Blk: 196608 + LdsOffsetMXSB: 65536 + LdsOffsetMXSB_Blk: 196608 + LdsOffsetMetadata: 65536 + LdsOffsetMetadata_Blk: 196608 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 8 + LocalReadVectorWidthB: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 16 + MIInputPerThreadA: 16 + MIInputPerThreadB: 16 + MIInputPerThreadMetadata: 16 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 327680 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 1 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 32 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 16 + NumThreads: 128 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: false + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 0 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Ailk_Bljk_HSS_BH_Bias_GG_HA_S_SAV_UserArgs_MT256x128x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT8_4_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SGROB0_SIA0_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS32_WG32_4_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TDMInst: 3 + TDMSplit: false + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + Use64bShadowLimit: true + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + enableLDSTrMetadata: 0 + enableTDMA: true + enableTDMB: true + ldsNumBytesA: 65536 + ldsNumBytesB: 32768 + ldsNumBytesMXSA: 0 + ldsNumBytesMXSB: 0 + ldsNumBytesMetadata: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Ailk_Bljk_HSS_BH_Bias_HA_S_SAV_UserArgs_MT3kZyxxoKvzvQDQ3D6r_QvOcs-yB2tGza1myO-x7aluMY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_HSS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT1_1_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SGROB0_SIA0_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG32_4_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 8192 + LdsInitCVgprs: false + LdsNumBytes: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedMXSA: 0 + LdsNumElementsAlignedMXSB: 0 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 2048 + LdsOffsetMXSA_Blk: 6144 + LdsOffsetMXSB: 2048 + LdsOffsetMXSB_Blk: 6144 + LdsOffsetMetadata: 2048 + LdsOffsetMetadata_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 8 + LocalReadVectorWidthB: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 16 + MIInputPerThreadA: 16 + MIInputPerThreadB: 16 + MIInputPerThreadMetadata: 16 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 327680 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 1 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: false + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 0 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Ailk_Bljk_HSS_BH_Bias_GG_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT1_1_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SGROB0_SIA0_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG32_4_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TDMInst: 3 + TDMSplit: false + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + Use64bShadowLimit: true + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + enableLDSTrMetadata: 0 + enableTDMA: true + enableTDMB: true + ldsNumBytesA: 2048 + ldsNumBytesB: 2048 + ldsNumBytesMXSA: 0 + ldsNumBytesMXSB: 0 + ldsNumBytesMetadata: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Ailk_Bljk_HSS_BH_Bias_HA_S_SAV_UserArgs_MT3r8pYiAlhSTuS52LL0GNKk0fa2PsysIFaHKR_cB2VN-8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_HSS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT1_1_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SGROB0_SIA0_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG32_4_1 + LDSTrInst: false + LSCA: 32 + LSCB: 256 + LSPA: 32 + LSPB: 4 + LVCA: 4 + LVCB: 32 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 0 + LdsNumElementsAlignedMXSB: 0 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 16384 + LdsOffsetMXSB_Blk: 49152 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 8 + LocalReadVectorWidthB: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 16 + MIInputPerThreadA: 16 + MIInputPerThreadB: 16 + MIInputPerThreadMetadata: 16 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 327680 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 1 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: false + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 0 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Ailk_Bljk_HSS_BH_Bias_GG_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT1_1_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SGROB0_SIA0_SS0_SU32_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG32_4_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 512.0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TDMInst: 3 + TDMSplit: false + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + Use64bShadowLimit: true + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + enableLDSTrMetadata: 0 + enableTDMA: true + enableTDMB: true + ldsNumBytesA: 16384 + ldsNumBytesB: 16384 + ldsNumBytesMXSA: 0 + ldsNumBytesMXSB: 0 + ldsNumBytesMetadata: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Ailk_Bljk_HSS_BH_Bias_HA_S_SAV_UserArgs_MT2IqXuzLHdIcXTpWEIa58GVTNVJI8Da6twwPr3geNxYNU= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: false + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Ailk_Bljk_HSS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT8_8_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SGROB0_SIA0_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS32_WG32_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 32 + LSPA: 4 + LSPB: 32 + LVCA: 32 + LVCB: 4 + LVPA: 1 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 0 + LdsNumElementsAlignedMXSB: 0 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 16384 + LdsOffsetMXSB_Blk: 49152 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 8 + LocalReadVectorWidthB: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 16 + MIInputPerThreadA: 16 + MIInputPerThreadB: 16 + MIInputPerThreadMetadata: 16 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 327680 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 1 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 512 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: false + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 0 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Ailk_Bljk_HSS_BH_Bias_GG_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT8_8_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SGROB0_SIA0_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS32_WG32_4_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TDMInst: 3 + TDMSplit: false + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 8 + ThreadTileA: 64 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: false + UnrollMajorLDSB: true + Use64bShadowLimit: true + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: false + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + enableLDSTrMetadata: 0 + enableTDMA: true + enableTDMB: true + ldsNumBytesA: 16384 + ldsNumBytesB: 16384 + ldsNumBytesMXSA: 0 + ldsNumBytesMXSB: 0 + ldsNumBytesMetadata: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- [2, 3, 0, 1] +- - - [256, 256, 1, 128] + - [0, 0.0] + - - [32, 32, 1, 128] + - [1, 0.0] + - - [32, 32, 1, 256] + - [2, 0.0] + - - [512, 512, 1, 32] + - [3, 0.0] +- null +- null +- DeviceEfficiency +- GridBased diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1250/GridBased/gfx1250_Cijk_Alik_Bjlk_HHS_BH_Bias_GG_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1250/GridBased/gfx1250_Cijk_Alik_Bjlk_HHS_BH_Bias_GG_HA_S_SAV_UserArgs.yaml new file mode 100644 index 000000000000..62fea46cc035 --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1250/GridBased/gfx1250_Cijk_Alik_Bjlk_HHS_BH_Bias_GG_HA_S_SAV_UserArgs.yaml @@ -0,0 +1,1247 @@ +- {MinimumRequiredVersion: 5.0.0} +- gfx1250 +- gfx1250 +- [Device 73f0] +- Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 4] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DataTypeMXSA: 22 + DataTypeMXSB: 22 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: true + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MXBlockA: 0 + MXBlockB: 0 + MacDataTypeA: 4 + MacDataTypeB: 4 + MetadataLayout: 0 + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMXSA: [] + MirrorDimsMXSB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + isMixMode: false +- - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bjlk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT2CNu0YgWj9ufYXKdMCgWFo6UUS26oqBlbaFlMCLxn4ww= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB1_GSUAMB_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT8_4_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SGROB0_SIA0_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS32_WG32_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 16 + LVCB: 128 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 229376 + LdsInitCVgprs: false + LdsNumBytes: 229376 + LdsNumElementsAlignedA: 65536 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 0 + LdsNumElementsAlignedMXSB: 0 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 65536 + LdsOffsetB_Blk: 196608 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 65536 + LdsOffsetMXSA_Blk: 196608 + LdsOffsetMXSB: 65536 + LdsOffsetMXSB_Blk: 196608 + LdsOffsetMetadata: 65536 + LdsOffsetMetadata_Blk: 196608 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 8 + LocalReadVectorWidthB: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 16 + MIInputPerThreadA: 16 + MIInputPerThreadB: 16 + MIInputPerThreadMetadata: 16 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 327680 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 1 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 32 + NumLoadsB: 128 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 128 + NumThreads: 128 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: false + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 0 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_GG_HA_S_SAV_UserArgs_MT256x128x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT8_4_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SGROB0_SIA0_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS32_WG32_4_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TDMInst: 3 + TDMSplit: false + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + Use64bShadowLimit: true + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + enableLDSTrMetadata: 0 + enableTDMA: true + enableTDMB: true + ldsNumBytesA: 65536 + ldsNumBytesB: 32768 + ldsNumBytesMXSA: 0 + ldsNumBytesMXSB: 0 + ldsNumBytesMetadata: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bjlk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT3KK4p6OfcvLNiyqVfhkx6kgohteb7oC1FbgPOzRA_frg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB1_GSUAMB_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT1_1_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SGROB0_SIA0_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG32_4_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 4 + LVCA: 4 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 8192 + LdsInitCVgprs: false + LdsNumBytes: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedMXSA: 0 + LdsNumElementsAlignedMXSB: 0 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 2048 + LdsOffsetMXSA_Blk: 6144 + LdsOffsetMXSB: 2048 + LdsOffsetMXSB_Blk: 6144 + LdsOffsetMetadata: 2048 + LdsOffsetMetadata_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 8 + LocalReadVectorWidthB: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 16 + MIInputPerThreadA: 16 + MIInputPerThreadB: 16 + MIInputPerThreadMetadata: 16 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 327680 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 1 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: false + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 0 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_GG_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT1_1_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SGROB0_SIA0_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG32_4_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TDMInst: 3 + TDMSplit: false + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + Use64bShadowLimit: true + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + enableLDSTrMetadata: 0 + enableTDMA: true + enableTDMB: true + ldsNumBytesA: 2048 + ldsNumBytesB: 2048 + ldsNumBytesMXSA: 0 + ldsNumBytesMXSB: 0 + ldsNumBytesMetadata: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bjlk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT3m133YIaemW2s2Dy_YvFAHLSwBb4OgdWrieWLA7PkEXQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB1_GSUAMB_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT1_1_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SGROB0_SIA0_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG32_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 0 + LdsNumElementsAlignedMXSB: 0 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 16384 + LdsOffsetMXSB_Blk: 49152 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 8 + LocalReadVectorWidthB: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 16 + MIInputPerThreadA: 16 + MIInputPerThreadB: 16 + MIInputPerThreadMetadata: 16 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 327680 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 1 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 64 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 64 + NumThreads: 128 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: false + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 0 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_GG_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT1_1_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SGROB0_SIA0_SS0_SU32_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG32_4_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 512.0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TDMInst: 3 + TDMSplit: false + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + Use64bShadowLimit: true + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + enableLDSTrMetadata: 0 + enableTDMA: true + enableTDMB: true + ldsNumBytesA: 16384 + ldsNumBytesB: 16384 + ldsNumBytesMXSA: 0 + ldsNumBytesMXSB: 0 + ldsNumBytesMetadata: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bjlk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT2kRjysn5Y5tLCNtQjmb3fCtwipfRjAOWQIwW_Bt1zYPw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: false + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB1_GSUAMB_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT8_8_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB2_ONLL1_PGR2_PLR0_PKA0_SGROB0_SIA0_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS32_WG32_4_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 32 + LSPB: 1 + LVCA: 4 + LVCB: 128 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 0 + LdsNumElementsAlignedMXSB: 0 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 16384 + LdsOffsetMXSB_Blk: 49152 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 8 + LocalReadVectorWidthB: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 16 + MIInputPerThreadA: 16 + MIInputPerThreadB: 16 + MIInputPerThreadMetadata: 16 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 327680 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 1 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 512 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 64 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 32 + NumThreads: 128 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: false + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 0 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bjlk_HHS_BH_Bias_GG_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT8_8_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB2_ONLL1_PGR2_PLR0_PKA0_SGROB0_SIA0_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS32_WG32_4_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TDMInst: 3 + TDMSplit: false + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 8 + ThreadTileA: 64 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + Use64bShadowLimit: true + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + enableLDSTrMetadata: 0 + enableTDMA: true + enableTDMB: true + ldsNumBytesA: 16384 + ldsNumBytesB: 16384 + ldsNumBytesMXSA: 0 + ldsNumBytesMXSB: 0 + ldsNumBytesMetadata: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- [2, 3, 0, 1] +- - - [256, 256, 1, 128] + - [0, 0.0] + - - [32, 32, 1, 128] + - [1, 0.0] + - - [32, 32, 1, 256] + - [2, 0.0] + - - [512, 512, 1, 32] + - [3, 0.0] +- null +- null +- DeviceEfficiency +- GridBased diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1250/GridBased/gfx1250_Cijk_Alik_Bjlk_HSS_BH_Bias_GG_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1250/GridBased/gfx1250_Cijk_Alik_Bjlk_HSS_BH_Bias_GG_HA_S_SAV_UserArgs.yaml new file mode 100644 index 000000000000..ef4f29603127 --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1250/GridBased/gfx1250_Cijk_Alik_Bjlk_HSS_BH_Bias_GG_HA_S_SAV_UserArgs.yaml @@ -0,0 +1,1247 @@ +- {MinimumRequiredVersion: 5.0.0} +- gfx1250 +- gfx1250 +- [Device 73f0] +- Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: true + BiasDataTypeList: [0, 4] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 0 + DataTypeMXSA: 22 + DataTypeMXSB: 22 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: true + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [1, 3, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 1 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MXBlockA: 0 + MXBlockB: 0 + MacDataTypeA: 4 + MacDataTypeB: 4 + MetadataLayout: 0 + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMXSA: [] + MirrorDimsMXSB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: true + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 1 + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + isMixMode: false +- - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bjlk_HSS_BH_Bias_HA_S_SAV_UserArgs_MT2CNu0YgWj9ufYXKdMCgWFo6UUS26oqBlbaFlMCLxn4ww= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_HSS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB1_GSUAMB_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT8_4_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SGROB0_SIA0_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS32_WG32_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 1 + LVCA: 16 + LVCB: 128 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 229376 + LdsInitCVgprs: false + LdsNumBytes: 229376 + LdsNumElementsAlignedA: 65536 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 0 + LdsNumElementsAlignedMXSB: 0 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 65536 + LdsOffsetB_Blk: 196608 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 65536 + LdsOffsetMXSA_Blk: 196608 + LdsOffsetMXSB: 65536 + LdsOffsetMXSB_Blk: 196608 + LdsOffsetMetadata: 65536 + LdsOffsetMetadata_Blk: 196608 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 8 + LocalReadVectorWidthB: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 16 + MIInputPerThreadA: 16 + MIInputPerThreadB: 16 + MIInputPerThreadMetadata: 16 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 327680 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 1 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 32 + NumLoadsB: 128 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 128 + NumThreads: 128 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: false + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 0 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bjlk_HSS_BH_Bias_GG_HA_S_SAV_UserArgs_MT256x128x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT8_4_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SGROB0_SIA0_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS32_WG32_4_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TDMInst: 3 + TDMSplit: false + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + Use64bShadowLimit: true + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + enableLDSTrMetadata: 0 + enableTDMA: true + enableTDMB: true + ldsNumBytesA: 65536 + ldsNumBytesB: 32768 + ldsNumBytesMXSA: 0 + ldsNumBytesMXSB: 0 + ldsNumBytesMetadata: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bjlk_HSS_BH_Bias_HA_S_SAV_UserArgs_MT3KK4p6OfcvLNiyqVfhkx6kgohteb7oC1FbgPOzRA_frg= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_HSS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB1_GSUAMB_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT1_1_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SGROB0_SIA0_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG32_4_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 4 + LVCA: 4 + LVCB: 32 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 8192 + LdsInitCVgprs: false + LdsNumBytes: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedMXSA: 0 + LdsNumElementsAlignedMXSB: 0 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 2048 + LdsOffsetMXSA_Blk: 6144 + LdsOffsetMXSB: 2048 + LdsOffsetMXSB_Blk: 6144 + LdsOffsetMetadata: 2048 + LdsOffsetMetadata_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 8 + LocalReadVectorWidthB: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 16 + MIInputPerThreadA: 16 + MIInputPerThreadB: 16 + MIInputPerThreadMetadata: 16 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 327680 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 1 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: false + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 0 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bjlk_HSS_BH_Bias_GG_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT1_1_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SGROB0_SIA0_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG32_4_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TDMInst: 3 + TDMSplit: false + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + Use64bShadowLimit: true + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + enableLDSTrMetadata: 0 + enableTDMA: true + enableTDMB: true + ldsNumBytesA: 2048 + ldsNumBytesB: 2048 + ldsNumBytesMXSA: 0 + ldsNumBytesMXSB: 0 + ldsNumBytesMetadata: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bjlk_HSS_BH_Bias_HA_S_SAV_UserArgs_MT3m133YIaemW2s2Dy_YvFAHLSwBb4OgdWrieWLA7PkEXQ= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_HSS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB1_GSUAMB_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT1_1_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SGROB0_SIA0_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG32_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 32 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 0 + LdsNumElementsAlignedMXSB: 0 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 16384 + LdsOffsetMXSB_Blk: 49152 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 8 + LocalReadVectorWidthB: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 16 + MIInputPerThreadA: 16 + MIInputPerThreadB: 16 + MIInputPerThreadMetadata: 16 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 327680 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 1 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 64 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 64 + NumThreads: 128 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: false + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 0 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bjlk_HSS_BH_Bias_GG_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT1_1_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SGROB0_SIA0_SS0_SU32_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG32_4_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 512.0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TDMInst: 3 + TDMSplit: false + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + Use64bShadowLimit: true + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + enableLDSTrMetadata: 0 + enableTDMA: true + enableTDMB: true + ldsNumBytesA: 16384 + ldsNumBytesB: 16384 + ldsNumBytesMXSA: 0 + ldsNumBytesMXSB: 0 + ldsNumBytesMetadata: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bjlk_HSS_BH_Bias_HA_S_SAV_UserArgs_MT2kRjysn5Y5tLCNtQjmb3fCtwipfRjAOWQIwW_Bt1zYPw= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 1 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bjlk_HSS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB1_GSUAMB_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT8_8_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB2_ONLL1_PGR2_PLR0_PKA0_SGROB0_SIA0_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS32_WG32_4_1 + LDSTrInst: false + LSCA: 32 + LSCB: 128 + LSPA: 32 + LSPB: 1 + LVCA: 4 + LVCB: 128 + LVPA: 4 + LVPB: 1 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 0 + LdsNumElementsAlignedMXSB: 0 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 16384 + LdsOffsetMXSB_Blk: 49152 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 8 + LocalReadVectorWidthB: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 16 + MIInputPerThreadA: 16 + MIInputPerThreadB: 16 + MIInputPerThreadMetadata: 16 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 327680 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 1 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 512 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 64 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 2 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 32 + NumThreads: 128 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: false + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 0 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bjlk_HSS_BH_Bias_GG_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT8_8_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB2_ONLL1_PGR2_PLR0_PKA0_SGROB0_SIA0_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS32_WG32_4_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TDMInst: 3 + TDMSplit: false + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 8 + ThreadTileA: 64 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: false + Use64bShadowLimit: true + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + enableLDSTrMetadata: 0 + enableTDMA: true + enableTDMB: true + ldsNumBytesA: 16384 + ldsNumBytesB: 16384 + ldsNumBytesMXSA: 0 + ldsNumBytesMXSB: 0 + ldsNumBytesMetadata: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- [2, 3, 0, 1] +- - - [256, 256, 1, 128] + - [0, 0.0] + - - [32, 32, 1, 128] + - [1, 0.0] + - - [32, 32, 1, 256] + - [2, 0.0] + - - [512, 512, 1, 32] + - [3, 0.0] +- null +- null +- DeviceEfficiency +- GridBased diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1250/GridBased/gfx1250_Cijk_Alik_Bljk_HHS_BH_Bias_GG_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1250/GridBased/gfx1250_Cijk_Alik_Bljk_HHS_BH_Bias_GG_HA_S_SAV_UserArgs.yaml new file mode 100644 index 000000000000..8785129398eb --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1250/GridBased/gfx1250_Cijk_Alik_Bljk_HHS_BH_Bias_GG_HA_S_SAV_UserArgs.yaml @@ -0,0 +1,1247 @@ +- {MinimumRequiredVersion: 5.0.0} +- gfx1250 +- gfx1250 +- [Device 73f0] +- Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: false + BiasDataTypeList: [0, 4] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 4 + DataTypeMXSA: 22 + DataTypeMXSB: 22 + DestDataType: 4 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: true + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MXBlockA: 0 + MXBlockB: 0 + MacDataTypeA: 4 + MacDataTypeB: 4 + MetadataLayout: 0 + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMXSA: [] + MirrorDimsMXSB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + isMixMode: false +- - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT2RYmfcLGBqY7vBi04wQyk9vhTlR1BTjtOip516nv2tmE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT8_4_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SGROB0_SIA0_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS32_WG32_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 229376 + LdsInitCVgprs: false + LdsNumBytes: 229376 + LdsNumElementsAlignedA: 65536 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 0 + LdsNumElementsAlignedMXSB: 0 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 65536 + LdsOffsetB_Blk: 196608 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 65536 + LdsOffsetMXSA_Blk: 196608 + LdsOffsetMXSB: 65536 + LdsOffsetMXSB_Blk: 196608 + LdsOffsetMetadata: 65536 + LdsOffsetMetadata_Blk: 196608 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 8 + LocalReadVectorWidthB: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 16 + MIInputPerThreadA: 16 + MIInputPerThreadB: 16 + MIInputPerThreadMetadata: 16 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 327680 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 1 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 32 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 16 + NumThreads: 128 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: false + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 0 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_GG_HA_S_SAV_UserArgs_MT256x128x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT8_4_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SGROB0_SIA0_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS32_WG32_4_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TDMInst: 3 + TDMSplit: false + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + Use64bShadowLimit: true + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + enableLDSTrMetadata: 0 + enableTDMA: true + enableTDMB: true + ldsNumBytesA: 65536 + ldsNumBytesB: 32768 + ldsNumBytesMXSA: 0 + ldsNumBytesMXSB: 0 + ldsNumBytesMetadata: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT3kZyxxoKvzvQDQ3D6r_QvOcs-yB2tGza1myO-x7aluMY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT1_1_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SGROB0_SIA0_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG32_4_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 8192 + LdsInitCVgprs: false + LdsNumBytes: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedMXSA: 0 + LdsNumElementsAlignedMXSB: 0 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 2048 + LdsOffsetMXSA_Blk: 6144 + LdsOffsetMXSB: 2048 + LdsOffsetMXSB_Blk: 6144 + LdsOffsetMetadata: 2048 + LdsOffsetMetadata_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 8 + LocalReadVectorWidthB: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 16 + MIInputPerThreadA: 16 + MIInputPerThreadB: 16 + MIInputPerThreadMetadata: 16 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 327680 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 1 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: false + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 0 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_GG_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT1_1_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SGROB0_SIA0_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG32_4_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TDMInst: 3 + TDMSplit: false + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + Use64bShadowLimit: true + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + enableLDSTrMetadata: 0 + enableTDMA: true + enableTDMB: true + ldsNumBytesA: 2048 + ldsNumBytesB: 2048 + ldsNumBytesMXSA: 0 + ldsNumBytesMXSB: 0 + ldsNumBytesMetadata: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT3r8pYiAlhSTuS52LL0GNKk0fa2PsysIFaHKR_cB2VN-8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT1_1_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SGROB0_SIA0_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG32_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 0 + LdsNumElementsAlignedMXSB: 0 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 16384 + LdsOffsetMXSB_Blk: 49152 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 8 + LocalReadVectorWidthB: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 16 + MIInputPerThreadA: 16 + MIInputPerThreadB: 16 + MIInputPerThreadMetadata: 16 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 327680 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 1 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: false + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 0 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_GG_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT1_1_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SGROB0_SIA0_SS0_SU32_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG32_4_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 512.0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TDMInst: 3 + TDMSplit: false + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + Use64bShadowLimit: true + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + enableLDSTrMetadata: 0 + enableTDMA: true + enableTDMB: true + ldsNumBytesA: 16384 + ldsNumBytesB: 16384 + ldsNumBytesMXSA: 0 + ldsNumBytesMXSB: 0 + ldsNumBytesMetadata: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT22UlkBq8K_PUu2RVaD_geadXpwKll6hQQcYz306D1nE8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT8_8_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SGROB0_SIA0_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS32_WG32_4_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 0 + LdsNumElementsAlignedMXSB: 0 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 16384 + LdsOffsetMXSB_Blk: 49152 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 8 + LocalReadVectorWidthB: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 16 + MIInputPerThreadA: 16 + MIInputPerThreadB: 16 + MIInputPerThreadMetadata: 16 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 327680 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 1 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 512 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: false + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 0 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bljk_HHS_BH_Bias_GG_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT8_8_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SGROB0_SIA0_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS32_WG32_4_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TDMInst: 3 + TDMSplit: false + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 8 + ThreadTileA: 64 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + Use64bShadowLimit: true + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + enableLDSTrMetadata: 0 + enableTDMA: true + enableTDMB: true + ldsNumBytesA: 16384 + ldsNumBytesB: 16384 + ldsNumBytesMXSA: 0 + ldsNumBytesMXSB: 0 + ldsNumBytesMetadata: 0 + numSubTiles: 2 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- [2, 3, 0, 1] +- - - [256, 256, 1, 128] + - [0, 0.0] + - - [32, 32, 1, 128] + - [1, 0.0] + - - [32, 32, 1, 256] + - [2, 0.0] + - - [512, 512, 1, 32] + - [3, 0.0] +- null +- null +- DeviceEfficiency +- GridBased diff --git a/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1250/GridBased/gfx1250_Cijk_Alik_Bljk_HSS_BH_Bias_GG_HA_S_SAV_UserArgs.yaml b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1250/GridBased/gfx1250_Cijk_Alik_Bljk_HSS_BH_Bias_GG_HA_S_SAV_UserArgs.yaml new file mode 100644 index 000000000000..72d27adee9d4 --- /dev/null +++ b/projects/hipblaslt/library/src/amd_detail/rocblaslt/src/Tensile/Logic/asm_full/gfx1250/GridBased/gfx1250_Cijk_Alik_Bljk_HSS_BH_Bias_GG_HA_S_SAV_UserArgs.yaml @@ -0,0 +1,1247 @@ +- {MinimumRequiredVersion: 5.0.0} +- gfx1250 +- gfx1250 +- [Device 73f0] +- Activation: true + ActivationComputeDataType: 0 + ActivationNoGuard: false + ActivationType: hipblaslt_all + AllowNoFreeDims: false + AssignedDerivedParameters: true + Batched: true + BetaOnlyUseBias: true + BiasDataTypeList: [0, 4] + BiasSrc: D + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 4 + DataTypeA: 4 + DataTypeAmaxD: 0 + DataTypeB: 4 + DataTypeE: 0 + DataTypeMXSA: 22 + DataTypeMXSB: 22 + DestDataType: 0 + F32XdlMathOp: 0 + Gradient: false + GroupedGemm: true + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexAssignmentsMetadata: [3, 0, 2] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndexUnrollM: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + MXBlockA: 0 + MXBlockB: 0 + MacDataTypeA: 4 + MacDataTypeB: 4 + MetadataLayout: 0 + MirrorDimsA: [] + MirrorDimsB: [] + MirrorDimsMXSA: [] + MirrorDimsMXSB: [] + MirrorDimsMetadata: [] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + OutputAmaxD: false + SetConstStrideA: [] + SetConstStrideB: [] + SetConstStrideBias: [] + SilentHighPrecisionAccumulate: false + Sparse: 0 + StochasticRounding: false + StridedBatched: true + SupportUserArgs: true + SwizzleTensorA: false + SwizzleTensorB: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: 1 + TransposeB: 0 + UseBeta: true + UseBias: 1 + UseE: false + UseInitialStridesAB: false + UseInitialStridesCD: false + UseScaleAB: '' + UseScaleAlphaVec: 1 + UseScaleCD: false + isMixMode: false +- - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_HSS_BH_Bias_HA_S_SAV_UserArgs_MT2RYmfcLGBqY7vBi04wQyk9vhTlR1BTjtOip516nv2tmE= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 128 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_HSS_BH_Bias_HA_S_SAV_UserArgs_MT256x128x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT8_4_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SGROB0_SIA0_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS32_WG32_4_1 + LDSTrInst: false + LSCA: 128 + LSCB: 128 + LSPA: 8 + LSPB: 8 + LVCA: 16 + LVCB: 16 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 229376 + LdsInitCVgprs: false + LdsNumBytes: 229376 + LdsNumElementsAlignedA: 65536 + LdsNumElementsAlignedB: 32768 + LdsNumElementsAlignedMXSA: 0 + LdsNumElementsAlignedMXSB: 0 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 131072 + LdsOffsetB: 65536 + LdsOffsetB_Blk: 196608 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 65536 + LdsOffsetMXSA_Blk: 196608 + LdsOffsetMXSB: 65536 + LdsOffsetMXSB_Blk: 196608 + LdsOffsetMetadata: 65536 + LdsOffsetMetadata_Blk: 196608 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 8 + LocalReadVectorWidthB: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 4 + LoopUnroll: 128 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 16 + MIInputPerThreadA: 16 + MIInputPerThreadB: 16 + MIInputPerThreadMetadata: 16 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 4] + MIWaveTileA: 8 + MIWaveTileB: 4 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 327680 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 1 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 256 + NumGlobalWriteVectorsPerThread: 32 + NumLdsBlk: 2 + NumLoadsA: 32 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 32 + NumLoadsPerpendicularB: 16 + NumThreads: 128 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: false + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 0 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_HSS_BH_Bias_GG_HA_S_SAV_UserArgs_MT256x128x128_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT8_4_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SGROB0_SIA0_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA8_VWB4_WSGRA0_WSGRB0_WS32_WG32_4_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TDMInst: 3 + TDMSplit: false + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 4 + ThreadTileA: 64 + ThreadTileB: 4 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + Use64bShadowLimit: true + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 4 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 128 + _DepthUA: 128 + _DepthUB: 128 + _DepthUMetadata: 128 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 0 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + enableLDSTrMetadata: 0 + enableTDMA: true + enableTDMB: true + ldsNumBytesA: 65536 + ldsNumBytesB: 32768 + ldsNumBytesMXSA: 0 + ldsNumBytesMXSB: 0 + ldsNumBytesMetadata: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_HSS_BH_Bias_HA_S_SAV_UserArgs_MT3kZyxxoKvzvQDQ3D6r_QvOcs-yB2tGza1myO-x7aluMY= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 0 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_HSS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT1_1_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SGROB0_SIA0_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG32_4_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 8192 + LdsInitCVgprs: false + LdsNumBytes: 8192 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 2048 + LdsNumElementsAlignedMXSA: 0 + LdsNumElementsAlignedMXSB: 0 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 4096 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 6144 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 2048 + LdsOffsetMXSA_Blk: 6144 + LdsOffsetMXSB: 2048 + LdsOffsetMXSB_Blk: 6144 + LdsOffsetMetadata: 2048 + LdsOffsetMetadata_Blk: 6144 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 8 + LocalReadVectorWidthB: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 16 + MIInputPerThreadA: 16 + MIInputPerThreadB: 16 + MIInputPerThreadMetadata: 16 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 327680 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 1 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLdsBlk: 2 + NumLoadsA: 1 + NumLoadsB: 1 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 1 + NumLoadsPerpendicularB: 1 + NumThreads: 128 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 0 + PreloadKernArgs: false + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 0 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bljk_HSS_BH_Bias_GG_HA_S_SAV_UserArgs_MT32x32x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT1_1_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SGROB0_SIA0_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG32_4_1_WGM8_WGMXCC1_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TDMInst: 3 + TDMSplit: false + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + Use64bShadowLimit: true + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 1 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + enableLDSTrMetadata: 0 + enableTDMA: true + enableTDMB: true + ldsNumBytesA: 2048 + ldsNumBytesB: 2048 + ldsNumBytesMXSA: 0 + ldsNumBytesMXSB: 0 + ldsNumBytesMetadata: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_HSS_BH_Bias_HA_S_SAV_UserArgs_MT3r8pYiAlhSTuS52LL0GNKk0fa2PsysIFaHKR_cB2VN-8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 256 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: false + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_HSS_BH_Bias_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT1_1_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SGROB0_SIA0_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG32_4_1 + LDSTrInst: false + LSCA: 256 + LSCB: 256 + LSPA: 4 + LSPB: 4 + LVCA: 32 + LVCB: 32 + LVPA: 1 + LVPB: 1 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 0 + LdsNumElementsAlignedMXSB: 0 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 16384 + LdsOffsetMXSB_Blk: 49152 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 8 + LocalReadVectorWidthB: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 8 + LoopUnroll: 256 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 16 + MIInputPerThreadA: 16 + MIInputPerThreadB: 16 + MIInputPerThreadMetadata: 16 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [1, 1] + MIWaveTileA: 1 + MIWaveTileB: 1 + MIWaveTileMetadata: 0 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 327680 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 1 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 8 + NumGlobalWriteVectorsPerThread: 8 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: false + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 0 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bljk_HSS_BH_Bias_GG_HA_S_SAV_UserArgs_MT32x32x256_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT1_1_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SGROB0_SIA0_SS0_SU32_SUM0_SUS512_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG32_4_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 512.0 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TDMInst: 3 + TDMSplit: false + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 8 + ThreadTile1: 1 + ThreadTileA: 8 + ThreadTileB: 1 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + Use64bShadowLimit: true + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 1 + VectorWidthB: 1 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 256 + _DepthUA: 256 + _DepthUB: 256 + _DepthUMetadata: 256 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 0 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + enableLDSTrMetadata: 0 + enableTDMA: true + enableTDMB: true + ldsNumBytesA: 16384 + ldsNumBytesB: 16384 + ldsNumBytesMXSA: 0 + ldsNumBytesMXSB: 0 + ldsNumBytesMetadata: 0 + numSubTiles: 1 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false + - 1LDSBuffer: 0 + ActivationAlt: false + ActivationFuncCall: true + ActivationFused: true + AdaptiveGemm: 0 + AssertAIGreaterThanEqual: -1 + AssertAILessThanEqual: -1 + AssertFree0ElementMultiple: 1 + AssertFree1DivByMT1LowbitGT1: 0 + AssertFree1ElementMultiple: 1 + AssertKRingShiftTailWrapOnly: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: true + AssignedProblemIndependentDerivedParameters: true + BAddrInterleave: false + BaseName: Cijk_Alik_Bljk_HSS_BH_Bias_HA_S_SAV_UserArgs_MT22UlkBq8K_PUu2RVaD_geadXpwKll6hQQcYz306D1nE8= + BufferLoad: true + BufferStore: true + CUCount: null + CUOccupancy: -1 + ClusterLocalRead: 1 + CodeObjectVersion: 4 + ConvertAfterDS: false + CustomKernelName: '' + DebugStreamK: 0 + DepthU: 32 + DirectToLds: 0 + DirectToLdsA: false + DirectToLdsB: false + DirectToVgprA: false + DirectToVgprB: false + DirectToVgprMXSA: false + DirectToVgprMXSB: false + DirectToVgprSparseMetadata: false + DtlPlusLdsBuf: 0 + EdgeType: ShiftPtr + EnableF32XdlMathOp: false + EnableMatrixInstruction: true + ExpandPointerSwap: 0 + ExpertSchedulingMode: 0 + ExtraLatencyForLR: 0 + ExtraMiLatencyLeft: -1 + ForceDisableShadowInit: false + ForceUnrollSubIter: true + GlobalReadPerMfma: 1 + GlobalReadVectorWidthA: 8 + GlobalReadVectorWidthB: 8 + GlobalSplitU: 1 + GlobalSplitUAlgorithm: MultipleBuffer + GlobalSplitUCoalesced: false + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 8 + GroupLoadStore: false + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + GuaranteeNoPartialMetadata: true + ISA: [12, 5, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true, + SupportUserGSU: true, UseSFC: false, UseUniversalArgs: true} + KRingShift: false + Kernel: true + KernelLanguage: Assembly + KernelNameMin: Cijk_Alik_Bljk_HSS_BH_Bias_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSUAMB_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT8_8_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SGROB0_SIA0_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS32_WG32_4_1 + LDSTrInst: false + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 4 + LVCB: 4 + LVPA: 4 + LVPB: 4 + LdsBlockSizePerPadA: 0 + LdsBlockSizePerPadB: 0 + LdsBlockSizePerPadMXSA: 0 + LdsBlockSizePerPadMXSB: 0 + LdsBlockSizePerPadMetadata: 0 + LdsBytesNoAmax: 65536 + LdsInitCVgprs: false + LdsNumBytes: 65536 + LdsNumElementsAlignedA: 16384 + LdsNumElementsAlignedB: 16384 + LdsNumElementsAlignedMXSA: 0 + LdsNumElementsAlignedMXSB: 0 + LdsNumElementsAlignedMetadata: 0 + LdsOffsetA: 0 + LdsOffsetA_Blk: 32768 + LdsOffsetB: 16384 + LdsOffsetB_Blk: 49152 + LdsOffsetBias: 0 + LdsOffsetBiasGSU: 0 + LdsOffsetBiasNonGSU: 0 + LdsOffsetMXSA: 16384 + LdsOffsetMXSA_Blk: 49152 + LdsOffsetMXSB: 16384 + LdsOffsetMXSB_Blk: 49152 + LdsOffsetMetadata: 16384 + LdsOffsetMetadata_Blk: 49152 + LdsPadA: 0 + LdsPadB: 0 + LdsPadMXSA: 0 + LdsPadMXSB: 0 + LdsPadMetadata: 0 + LocalReadVectorWidth: -1 + LocalReadVectorWidthA: 8 + LocalReadVectorWidthB: 8 + LocalSplitU: 1 + LocalSplitUReuseLDS: 1 + LocalWritePerMfma: -1 + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopIters: 1 + LoopUnroll: 32 + MFMA_BF16_1K: 0 + MIArchVgpr: true + MIBlock: [16, 16, 32, 1, 1, 1] + MIInputPerThread: 16 + MIInputPerThreadA: 16 + MIInputPerThreadB: 16 + MIInputPerThreadMetadata: 16 + MIOutputVectorWidth: 8 + MIRegPerOut: 1 + MIWaveGroup: [2, 2] + MIWaveTile: [8, 8] + MIWaveTileA: 8 + MIWaveTileB: 8 + MIWaveTileMetadata: 0 + MacroTile0: 256 + MacroTile1: 256 + MacroTileA: 256 + MacroTileB: 256 + MagicDivAlg: 2 + MathClocksUnrolledLoop: 0 + MatrixInstB: 1 + MatrixInstBM: 1 + MatrixInstBN: 1 + MatrixInstK: 32 + MatrixInstM: 16 + MatrixInstN: 16 + MatrixInstruction: [16, 16, 32, 1] + MaxLDS: 327680 + MaxOccupancy: 40 + MbskPrefetchMethod: 0 + MfmaInitCVgprs: false + MinGRIncPerMfma: 1 + NoLdsWriteCode: true + NoReject: false + NoTailLoop: false + NonDTLTailLoopA: false + NonDTLTailLoopB: false + NonTemporal: -1 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NonTemporalD: 0 + NonTemporalE: 0 + NonTemporalMXSA: 0 + NonTemporalMXSB: 0 + NonTemporalMetadata: 0 + NonTemporalWS: 0 + NumElementsPerBatchStore: 0 + NumElementsPerThread: 512 + NumGlobalWriteVectorsPerThread: 64 + NumLdsBlk: 2 + NumLoadsA: 8 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 8 + NumThreads: 128 + NumTotalPackedLoadsA: -1 + NumTotalPackedLoadsB: -1 + NumWaveSplitK: 1 + OptNoLoadLoop: 1 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PrefetchGlobalRead: 2 + PrefetchLocalRead: 1 + PreloadKernArgs: false + SFCWGM: + - [1, 1] + - [1, 1] + ScheduleGROverBarrier: 0 + ScheduleGlobalRead: 1 + ScheduleIterAlg: 0 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bljk_HSS_BH_Bias_GG_HA_S_SAV_UserArgs_MT256x256x32_MI16x16x1_SN_LDSB0_AFC1_AG0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_DTVMXSA0_DTVMXSB0_DTVSM0_DPLB0_EPS0_ELFLR0_EMLLn1_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1250_IU1_K1_LDSTI0_LBSPPA0_LBSPPB0_LBSPPM0_LPA0_LPB0_LPM0_LRVWn1_LWPMn1_MIAV1_MIWT8_8_MO40_MGRIPM1_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA0_SGROB0_SIA0_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKFTR0_SKXCCM0_SGRO0_TDMI3_TDMS0_TIN0_TLDS1_TLDSMn1_ULSGRO0_USL1_UIOFGRO0_UPLRP0_USFGROn1_VSn1_VWA8_VWB8_WSGRA0_WSGRB0_WS32_WG32_4_1_WGM8_WGMXCC8_WGMXCCGn1 + SourceSwap: false + SpaceFillingAlgo: [] + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + StorePriorityOpt: false + StoreRemapVectorWidth: 0 + StoreSwapAddr: false + StoreSyncOpt: 0 + StoreVectorWidth: 8 + StreamK: 0 + StreamKAtomic: 0 + StreamKFixupTreeReduction: 0 + StreamKXCCMapping: 0 + SubGroup0: 4 + SubGroup1: 32 + SubGroupA: 4 + SubGroupB: 32 + SuppressNoLoadLoop: false + SwapGlobalReadOrder: 0 + TDMInst: 3 + TDMSplit: false + TailloopInNll: false + ThreadTile: [1, 1] + ThreadTile0: 64 + ThreadTile1: 8 + ThreadTileA: 64 + ThreadTileB: 8 + TransposeLDS: 1 + TransposeLDSMetadata: -1 + ULSGRODoubleG2L: 0 + UnrollLoopSwapGlobalReadOrder: 0 + UnrollMajorLDSA: true + UnrollMajorLDSB: true + Use64bShadowLimit: true + UseCustomMainLoopSchedule: 0 + UseDirect32XEmulation: false + UseDot2F32XEmulation: false + UseDotInstruction: false + UseF32XEmulation: false + UseGeneralizedNLCOneA: false + UseGeneralizedNLCOneB: false + UseGeneralizedNLCOneMXSA: false + UseGeneralizedNLCOneMXSB: false + UseGeneralizedNLCOneMetadata: false + UseInstOffsetForGRO: 0 + UseMFMAF32XEmulation: false + UsePLRPack: 0 + UseSgprForGRO: -1 + Valid: true + VectorStore: -1 + VectorWidthA: 8 + VectorWidthB: 8 + WaveSeparateGlobalReadA: 0 + WaveSeparateGlobalReadB: 0 + WaveSeparateGlobalReadMetadata: 0 + WaveSplitK: false + WavefrontSize: 32 + WorkGroup: [32, 4, 1] + WorkGroupMapping: 8 + WorkGroupMappingXCC: 8 + WorkGroupMappingXCCGroup: -1 + WorkGroupReduction: false + WorkspaceCheck: [4, 0, 1] + _DepthU: 32 + _DepthUA: 32 + _DepthUB: 32 + _DepthUMetadata: 32 + _GlobalAccumulation: MultipleBuffer + _UseSgprForGRO: 1 + _VectorStore: 1 + _WorkspaceSizePerElemBias: 0 + _WorkspaceSizePerElemC: 4 + _staggerStrideShift: 2 + enableGLTrA: false + enableGLTrB: false + enableLDSTrA: false + enableLDSTrB: false + enableLDSTrMXSA: false + enableLDSTrMXSB: false + enableLDSTrMetadata: 0 + enableTDMA: true + enableTDMB: true + ldsNumBytesA: 16384 + ldsNumBytesB: 16384 + ldsNumBytesMXSA: 0 + ldsNumBytesMXSB: 0 + ldsNumBytesMetadata: 0 + numSubTiles: 2 + reorderGRInstForDTVA: false + reorderGRInstForDTVB: false + tailLoopOptA: false + tailLoopOptB: false +- [2, 3, 0, 1] +- - - [256, 256, 1, 128] + - [0, 0.0] + - - [32, 32, 1, 128] + - [1, 0.0] + - - [32, 32, 1, 256] + - [2, 0.0] + - - [512, 512, 1, 32] + - [3, 0.0] +- null +- null +- DeviceEfficiency +- GridBased