diff --git a/projects/hipblaslt/tensilelite/Tensile/KernelWriter.py b/projects/hipblaslt/tensilelite/Tensile/KernelWriter.py index f6e15a1dd46b..d2b56157548b 100644 --- a/projects/hipblaslt/tensilelite/Tensile/KernelWriter.py +++ b/projects/hipblaslt/tensilelite/Tensile/KernelWriter.py @@ -3498,12 +3498,16 @@ def noLoadLoop( self, kernel, tensorParametersA, tensorParametersB, isOptNLL, is module.add(self._wait(kernel, tensorParametersA, tensorParametersB, vlcntVal, -1, -1, "10wait for global read")) if not kernel["NoLdsWriteCode"]: module.add(self._wait(kernel, tensorParametersA, tensorParametersB, -1, 0, -1, "4wait for local write")) - module.add(self._syncThreads(kernel, "Wait GR->LW done, sync LDS%u"%self.states.ldsWriteTokenIdx, memoryToken=[self.states.ldsWriteTokenIdx])) - - if kernel["enableTDMA"] and kernel["enableTDMB"]: + module.add(self._syncThreads(kernel, "wait for local write done, sync LDS%u"%self.states.ldsBarrierTokenIdx, memoryToken=[self.states.ldsBarrierTokenIdx])) + # swap barrier token, locked by OptNll + if not isOptNLL: + self.states.ldsBarrierTokenIdx = self.states.memTokenLdsBuffer1 if self.states.ldsBarrierTokenIdx == self.states.memTokenLdsBuffer0 else self.states.memTokenLdsBuffer0 + elif kernel["enableTDMA"] and kernel["enableTDMB"]: module.add(self._wait(kernel, tensorParametersA, tensorParametersB, 0, -1, -1, "wait for tensor load to finish")) - module.add(self._syncThreads(kernel)) - + module.add(self._syncThreads(kernel, "wait for tensor load done, sync LDS%u"%self.states.ldsBarrierTokenIdx, memoryToken=[self.states.ldsBarrierTokenIdx])) + # swap barrier token + if not isOptNLL: + self.states.ldsBarrierTokenIdx = self.states.memTokenLdsBuffer1 if self.states.ldsBarrierTokenIdx == self.states.memTokenLdsBuffer0 else self.states.memTokenLdsBuffer0 # generate no Load Loop Body code module.add(self.noLoadLoopBody(kernel, tensorParametersA, tensorParametersB, pack, packPre, isOptNLL, isNGLL, NLLfirst, NLLlast, NLLindex=NLLindex, \ NLLnum=NLLnum, useTailloopInNll=useTailloopInNll, remainPgr=remainPgr)) @@ -3540,13 +3544,14 @@ def _loopBody( self, kernel, tensorParametersA, tensorParametersB, pack, packPre module.add(self._wait(kernel, tensorParametersA, tensorParametersB, vlcntVal, -1, -1, "11wait for global read")) if not kernel["NoLdsWriteCode"]: module.add(self._wait(kernel, tensorParametersA, tensorParametersB, 1, 0, -1, "1wait for local write")) - module.add(self._syncThreads(kernel, "4sync for global read, PGR->LW needs sync LDS0", memoryToken=[self.states.ldsBarrierTokenIdx])) + module.add(self._syncThreads(kernel, "4sync for global read, PGR->LW needs sync LDS%u"%(self.states.ldsBarrierTokenIdx), memoryToken=[self.states.ldsBarrierTokenIdx])) # swap barrier token self.states.ldsBarrierTokenIdx = self.states.memTokenLdsBuffer1 if self.states.ldsBarrierTokenIdx == self.states.memTokenLdsBuffer0 else self.states.memTokenLdsBuffer0 - - if kernel["PrefetchGlobalRead"] and kernel["enableTDMA"] and kernel["enableTDMB"]: + elif kernel["PrefetchGlobalRead"] and kernel["enableTDMA"] and kernel["enableTDMB"]: module.add(self._wait(kernel, tensorParametersA, tensorParametersB, 0, -1, -1, "wait for tensor load to finish")) - module.add(self._syncThreads(kernel)) + module.add(self._syncThreads(kernel, "wait for tensor load to finish, PGR->LW needs sync LDS%u"%(self.states.ldsBarrierTokenIdx), memoryToken=[self.states.ldsBarrierTokenIdx])) + # swap barrier token + self.states.ldsBarrierTokenIdx = self.states.memTokenLdsBuffer1 if self.states.ldsBarrierTokenIdx == self.states.memTokenLdsBuffer0 else self.states.memTokenLdsBuffer0 module.addComment1("Begin Each Unroll: Check VGPR.checkin for INT8 LW") @@ -4983,6 +4988,10 @@ def kernelBody( self, kernel, tensorParametersA, tensorParametersB ): # loop body code generation finalLoop = lc == loopCopies - 1 loop.add(self._loopBody( kernel, tensorParametersA, tensorParametersB, pack, packPre, lc, loopCopies, finalLoop, isDTVGRSecondBuf=isDTVGRSecondBuf )) + if self.states.numItersPLR == 0 and not finalLoop: + # swap LDS read buffer + self.states.ldsReadTokenIdx = self.states.memTokenLdsBuffer1 if self.states.ldsReadTokenIdx == self.states.memTokenLdsBuffer0 else self.states.memTokenLdsBuffer0 + module.add(loop) if kernel["ExpertSchedulingMode"] > 0: diff --git a/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py b/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py index d036d7c328de..a15a5b39de1c 100644 --- a/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py +++ b/projects/hipblaslt/tensilelite/Tensile/KernelWriterAssembly.py @@ -6993,7 +6993,7 @@ def closeLoop(self, kernel, tPA, tPB, loopIdx, finalLoop, emitEndLabelOnly=False if kernel["enableTDMA"] and kernel["enableTDMB"] and not kernel["PrefetchGlobalRead"]: module.add(SWaitCnt(dscnt=0, comment="TDM PGR=0: wait all ds_reads before TDM overwrite")) - module.add(SBarrier(comment="TDM PGR=0: signal+wait done reading LDS")) + module.add(SBarrier(comment="TDM PGR=0: signal+wait done reading LDS", memoryToken=[self.states.memTokenLdsBuffer0])) # If PrefetchGlobalRead=1 the loads in the loop prefetch next macro-tile # For the final trip through the unroll loop we need to ensure those loads stay in bounds. diff --git a/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx12/stinky_sia4.yaml b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx12/stinky_sia4.yaml new file mode 100644 index 000000000000..be55ba9271ee --- /dev/null +++ b/projects/hipblaslt/tensilelite/Tensile/Tests/common/gemm/gfx12/stinky_sia4.yaml @@ -0,0 +1,523 @@ +TestParameters: + marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201] # not supported by arch +GlobalParameters: + NumElementsToValidate: -1 + KernelTime: False + BoundsCheck: False + ValidationMaxToPrint: 4 # maximum number of mismatches to print + ValidationPrintValids: False # print matches too + PruneSparseMode: 0 + DataInitTypeA: 3 + DataInitTypeB: 3 + DataInitTypeD: 2 + DataInitTypeAlpha: 1 + DataInitTypeBeta: 0 + # PrintTensorC: 1 + # PrintTensorD: 1 + # PrintTensorRef: 1 + NumWarmups: 0 + EnqueuesPerSync: 1 + PrintWinnersOnly: True + Architecture: gfx1250 + CpuThreads: 1 + KeepBuildTmp: True + PrintLevel: 2 + CodeObjectVersion: 4 + ROCmAgentEnumeratorPath: rocm_agent_enumerator + +BenchmarkProblems: + - + - # ProblemType, TN + OperationType: GEMM + DataType: b + DestDataType: b + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: True + TransposeB: False + UseBeta: True + Batched: True + Activation: False + SupportUserArgs: True + UseBias: 0 + + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + ForkParameters: + - MatrixInstruction: + - [16, 16, 32, 1, 1, 1, 1, 1, 1 ] + - [16, 16, 32, 1, 1, 1, 1, 2, 2 ] + # - [16, 16, 32, 1, 1, 8, 8, 2, 2 ] + # - [16, 16, 32, 1, 1, 1, 1, 4, 1 ] + # - [16, 16, 32, 1, 1, 2, 2, 2, 2 ] + # - [16, 16, 32, 1, 1, 2, 2, 2, 2] + - DepthU: [64, 128] + - TransposeLDS: [-1] + - LdsPadA: [0] + - LdsPadB: [0] + - LdsBlockSizePerPadA: [0] + - LdsBlockSizePerPadB: [0] + - LdsPadMetadata: [0] + - TDMInst: [3] + - LDSTrInst: [False] + - StaggerU: [0] + - ScheduleIterAlg: [4] + - PrefetchLocalRead: [1] + - PrefetchGlobalRead: [2] + - StoreRemapVectorWidth: [0] + - GlobalSplitU: [1] + - GlobalSplitUAlgorithm: [MultipleBuffer] + - GlobalReadVectorWidthA: [-1] + - GlobalReadVectorWidthB: [-1] + - VectorWidthA: [1] + - VectorWidthB: [1] + - LocalReadVectorWidth: [8] + - 1LDSBuffer: [0] + - SourceSwap: [0] + - DirectToVgprSparseMetadata: [0] + - WorkGroupMapping: [1] + - StoreVectorWidth: [-1] + - DirectToVgprA: [False] + - DirectToVgprB: [False] + - WavefrontSize: [32] + # - AssertFree0ElementMultiple: [ 16 ] + # - AssertFree1ElementMultiple: [ 16 ] + # - AssertSummationElementMultiple: [ 32 ] + - ExpandPointerSwap: [0] + - GroupLoadStore: [False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [64, 64, 1, 512] + - Exact: [64, 64, 1, 127] + - Exact: [127, 127, 1, 128] + # - Exact: [512, 512, 1, 256] + # - Exact: [2048, 1024, 1, 4096] + # - Exact: [16, 16, 1, 256] + # - Exact: [64, 64, 1, 255] + + - ActivationArgs: + - [Enum: none] + - + - # ProblemType, NT + TLDS1 => tr-load-ab + OperationType: GEMM + DataType: b + DestDataType: b + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: False + TransposeB: True + UseBeta: True + Batched: True + Activation: False + SupportUserArgs: True + UseBias: 0 + + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + ForkParameters: + - MatrixInstruction: + - [16, 16, 32, 1, 1, 1, 1, 1, 1 ] + - [16, 16, 32, 1, 1, 1, 1, 2, 2 ] + # - [16, 16, 32, 1, 1, 8, 8, 2, 2 ] + # - [16, 16, 32, 1, 1, 1, 1, 4, 1 ] + # - [16, 16, 32, 1, 1, 2, 2, 2, 2 ] + # - [16, 16, 32, 1, 1, 2, 2, 2, 2] + - DepthU: [64, 128] + - TransposeLDS: [-1] + - LdsPadA: [0] + - LdsPadB: [0] + - LdsBlockSizePerPadA: [0] + - LdsBlockSizePerPadB: [0] + - LdsPadMetadata: [0] + - TDMInst: [3] + - LDSTrInst: [True] + - StaggerU: [0] + - ScheduleIterAlg: [4] + - PrefetchLocalRead: [1] + - PrefetchGlobalRead: [2] + - StoreRemapVectorWidth: [0] + - GlobalSplitU: [1] + - GlobalSplitUAlgorithm: [MultipleBuffer] + - GlobalReadVectorWidthA: [-1] + - GlobalReadVectorWidthB: [-1] + - VectorWidthA: [1] + - VectorWidthB: [1] + - LocalReadVectorWidth: [8] + - 1LDSBuffer: [0] + - SourceSwap: [0] + - DirectToVgprSparseMetadata: [0] + - WorkGroupMapping: [1] + - StoreVectorWidth: [-1] + - DirectToVgprA: [False] + - DirectToVgprB: [False] + - WavefrontSize: [32] + # - AssertFree0ElementMultiple: [ 16 ] + # - AssertFree1ElementMultiple: [ 16 ] + # - AssertSummationElementMultiple: [ 32 ] + - ExpandPointerSwap: [0] + - GroupLoadStore: [False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [64, 64, 1, 512] + - Exact: [64, 64, 1, 127] + - Exact: [127, 127, 1, 128] + # - Exact: [512, 512, 1, 256] + # - Exact: [2048, 1024, 1, 4096] + # - Exact: [16, 16, 1, 256] + # - Exact: [64, 64, 1, 255] + + - ActivationArgs: + - [Enum: none] + + - + - # ProblemType, NN + TLDS1 => tr-load-a + OperationType: GEMM + DataType: b + DestDataType: b + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: False + TransposeB: False + UseBeta: True + Batched: True + Activation: False + SupportUserArgs: True + UseBias: 0 + + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + ForkParameters: + - MatrixInstruction: + - [16, 16, 32, 1, 1, 1, 1, 1, 1 ] + - [16, 16, 32, 1, 1, 1, 1, 2, 2 ] + # - [16, 16, 32, 1, 1, 8, 8, 2, 2 ] + # - [16, 16, 32, 1, 1, 1, 1, 4, 1 ] + # - [16, 16, 32, 1, 1, 2, 2, 2, 2 ] + # - [16, 16, 32, 1, 1, 2, 2, 2, 2] + - DepthU: [64, 128] + - TransposeLDS: [-1] + - LdsPadA: [0] + - LdsPadB: [0] + - LdsBlockSizePerPadA: [0] + - LdsBlockSizePerPadB: [0] + - LdsPadMetadata: [0] + - TDMInst: [3] + - LDSTrInst: [True] + - StaggerU: [0] + - ScheduleIterAlg: [4] + - PrefetchLocalRead: [1] + - PrefetchGlobalRead: [2] + - StoreRemapVectorWidth: [0] + - GlobalSplitU: [1] + - GlobalSplitUAlgorithm: [MultipleBuffer] + - GlobalReadVectorWidthA: [-1] + - GlobalReadVectorWidthB: [-1] + - VectorWidthA: [1] + - VectorWidthB: [1] + - LocalReadVectorWidth: [8] + - 1LDSBuffer: [0] + - SourceSwap: [0] + - DirectToVgprSparseMetadata: [0] + - WorkGroupMapping: [1] + - StoreVectorWidth: [-1] + - DirectToVgprA: [False] + - DirectToVgprB: [False] + - WavefrontSize: [32] + # - AssertFree0ElementMultiple: [ 16 ] + # - AssertFree1ElementMultiple: [ 16 ] + # - AssertSummationElementMultiple: [ 32 ] + - ExpandPointerSwap: [0] + - GroupLoadStore: [False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [64, 64, 1, 512] + - Exact: [64, 64, 1, 127] + - Exact: [127, 127, 1, 128] + # - Exact: [512, 512, 1, 256] + # - Exact: [2048, 1024, 1, 4096] + # - Exact: [16, 16, 1, 256] + # - Exact: [64, 64, 1, 255] + + - ActivationArgs: + - [Enum: none] + - + - # ProblemType, TT + TLDS1 => tr-load-b + OperationType: GEMM + DataType: b + DestDataType: b + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: True + TransposeB: True + UseBeta: True + Batched: True + Activation: False + SupportUserArgs: True + UseBias: 0 + + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + ForkParameters: + - MatrixInstruction: + - [16, 16, 32, 1, 1, 1, 1, 1, 1 ] + - [16, 16, 32, 1, 1, 1, 1, 2, 2 ] + # - [16, 16, 32, 1, 1, 8, 8, 2, 2 ] + # - [16, 16, 32, 1, 1, 1, 1, 4, 1 ] + # - [16, 16, 32, 1, 1, 2, 2, 2, 2 ] + # - [16, 16, 32, 1, 1, 2, 2, 2, 2] + - DepthU: [64, 128] + - TransposeLDS: [-1] + - LdsPadA: [0] + - LdsPadB: [0] + - LdsBlockSizePerPadA: [0] + - LdsBlockSizePerPadB: [0] + - LdsPadMetadata: [0] + - TDMInst: [3] + - LDSTrInst: [True] + - StaggerU: [0] + - ScheduleIterAlg: [4] + - PrefetchLocalRead: [1] + - PrefetchGlobalRead: [2] + - StoreRemapVectorWidth: [0] + - GlobalSplitU: [1] + - GlobalSplitUAlgorithm: [MultipleBuffer] + - GlobalReadVectorWidthA: [-1] + - GlobalReadVectorWidthB: [-1] + - VectorWidthA: [1] + - VectorWidthB: [1] + - LocalReadVectorWidth: [8] + - 1LDSBuffer: [0] + - SourceSwap: [0] + - DirectToVgprSparseMetadata: [0] + - WorkGroupMapping: [1] + - StoreVectorWidth: [-1] + - DirectToVgprA: [False] + - DirectToVgprB: [False] + - WavefrontSize: [32] + # - AssertFree0ElementMultiple: [ 16 ] + # - AssertFree1ElementMultiple: [ 16 ] + # - AssertSummationElementMultiple: [ 32 ] + - ExpandPointerSwap: [0] + - GroupLoadStore: [False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [64, 64, 1, 512] + - Exact: [64, 64, 1, 127] + - Exact: [127, 127, 1, 128] + # - Exact: [512, 512, 1, 256] + # - Exact: [2048, 1024, 1, 4096] + # - Exact: [16, 16, 1, 256] + # - Exact: [64, 64, 1, 255] + + - ActivationArgs: + - [Enum: none] + + - + - # ProblemType, TN + OperationType: GEMM + DataType: h + DestDataType: s + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: True + TransposeB: False + UseBeta: True + Batched: True + Activation: False + SupportUserArgs: True + UseBias: 0 + + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + ForkParameters: + - MatrixInstruction: + - [16, 16, 32, 1, 1, 2, 2, 2, 2] + - DepthU: [32] + - TransposeLDS: [-1] + - LdsPadA: [0] + - LdsPadB: [0] + - LdsBlockSizePerPadA: [0] + - LdsBlockSizePerPadB: [0] + - LdsPadMetadata: [0] + - LDSTrInst: [False] + - TDMInst: [3] + - StaggerU: [0] + - ScheduleIterAlg: [4] + - PrefetchGlobalRead: [1] + - PrefetchLocalRead: [1] + - ClusterLocalRead: [0] + - StoreRemapVectorWidth: [0] + - GlobalSplitU: [1] + - GlobalSplitUAlgorithm: [MultipleBuffer] + - GlobalReadVectorWidthA: [-1] + - GlobalReadVectorWidthB: [-1] + - VectorWidthA: [-1] + - VectorWidthB: [-1] + - LocalReadVectorWidth: [-1] + - 1LDSBuffer: [0] + - SourceSwap: [0] + - DirectToVgprSparseMetadata: [0] + - WorkGroupMapping: [1] + - StoreVectorWidth: [-1] + - DirectToVgprA: [False] + - DirectToVgprB: [False] + - WavefrontSize: [32] + - ExpandPointerSwap: [1] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [64, 64, 1, 256] + - Exact: [64, 64, 1, 127] + - Exact: [127, 127, 1, 128] + - ActivationArgs: + - [Enum: none] + + - + - # ProblemType, TN + OperationType: GEMM + DataType: s + DestDataType: s + ComputeDataType: s + HighPrecisionAccumulate: False + TransposeA: True + TransposeB: False + UseBeta: True + Batched: True + Activation: False + SupportUserArgs: True + UseBias: 0 + + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + ForkParameters: + - MatrixInstruction: + - [16, 16, 4, 1, 1, 2, 2, 1, 1 ] + - [16, 16, 4, 1, 1, 1, 1, 2, 2 ] + - DepthU: [16] + - TransposeLDS: [-1] + - LdsPadA: [0] + - LdsPadB: [0] + - LdsBlockSizePerPadA: [0] + - LdsBlockSizePerPadB: [0] + - LdsPadMetadata: [0] + - TDMInst: [3] + - LDSTrInst: [False] + - StaggerU: [0] + - ScheduleIterAlg: [4] + - PrefetchLocalRead: [1] + - PrefetchGlobalRead: [2] + - StoreRemapVectorWidth: [0] + - GlobalSplitU: [1] + - GlobalSplitUAlgorithm: [MultipleBuffer] + - GlobalReadVectorWidthA: [-1] + - GlobalReadVectorWidthB: [-1] + - VectorWidthA: [1] + - VectorWidthB: [1] + #- LocalReadVectorWidth: [8] + - 1LDSBuffer: [0] + - SourceSwap: [0] + - DirectToVgprSparseMetadata: [0] + - WorkGroupMapping: [1] + - StoreVectorWidth: [-1] + - DirectToVgprA: [False] + - DirectToVgprB: [False] + - WavefrontSize: [32] + # - AssertFree0ElementMultiple: [ 16 ] + # - AssertFree1ElementMultiple: [ 16 ] + # - AssertSummationElementMultiple: [ 32 ] + - ExpandPointerSwap: [0] + - GroupLoadStore: [False] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [1, 1, 1, 127] # 7DU + tail + - Exact: [64, 64, 1, 128] + - Exact: [64, 64, 1, 129] + - ActivationArgs: + - [Enum: none] + - + - # ProblemType + OperationType: GEMM + DataType: b + DestDataType: b + ComputeDataType: s + HighPrecisionAccumulate: True + TransposeA: True + TransposeB: False + UseBeta: True + Batched: True + SupportUserArgs: True + UseBias: 0 + Activation: False + UseScaleAlphaVec: 0 + + - # BenchmarkProblemSizeGroup - Standard + InitialSolutionParameters: + BenchmarkCommonParameters: + - KernelLanguage: ["Assembly"] + ForkParameters: + - MatrixInstruction: + - [16, 16, 32, 1, 1, 8, 8, 2, 2 ] + - DepthU: [128] + - ClusterLocalRead: [0] + - TransposeLDS: [-1] + - LDSTrInst: [False] + - LdsPadA: [0] + - LdsPadB: [0] + - LdsBlockSizePerPadA: [256] + - LdsBlockSizePerPadB: [256] + - LdsPadMetadata: [0] + - StaggerU: [0] + - ScheduleIterAlg: [4] + - PrefetchLocalRead: [0, 1] + - PrefetchGlobalRead: [1, 2] + - GlobalSplitU: [1] + - GlobalSplitUAlgorithm: [MultipleBuffer] + - GlobalReadVectorWidthA: [-1] + - GlobalReadVectorWidthB: [-1] + - 1LDSBuffer: [-1] + - TDMInst: [3] + - VectorWidthA: [1] + - VectorWidthB: [1] + - SourceSwap: [0] + - DirectToVgprSparseMetadata: [0] + - WorkGroupMapping: [1] + - StoreVectorWidth: [-1] + - DirectToVgprA: [False] + - WavefrontSize: [32] + - AssertFree0ElementMultiple: [1] + - AssertFree1ElementMultiple: [1] + BenchmarkForkParameters: + JoinParameters: + BenchmarkJoinParameters: + BenchmarkFinalParameters: + - ProblemSizes: + - Exact: [256, 256, 1, 192] \ No newline at end of file