Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 19 additions & 21 deletions tools/clang/unittests/HLSLExec/LinAlgTests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,11 +73,13 @@ struct MatrixParams {
bool Enable16Bit;
bool EmulateTest;

size_t strideBytes() const {
size_t rowStride() const {
uint32_t ES = elementSize(CompType);
if (Layout == LinalgMatrixLayout::RowMajor)
return N * ES;
return M * ES;
if (Layout == LinalgMatrixLayout::ColumnMajor)
return M * ES;
return 0;
}

size_t totalElements() const { return M * N; }
Expand All @@ -94,7 +96,7 @@ static std::string buildCompilerArgs(const MatrixParams &Params,
SS << " -DN_DIM=" << Params.N;
SS << " -DUSE=" << static_cast<int>(Params.Use);
SS << " -DSCOPE=" << static_cast<int>(Params.Scope);
SS << " -DSTRIDE=" << Params.strideBytes();
SS << " -DSTRIDE=" << Params.rowStride();
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The stride is a problem for group shared load and store: per the spec, the stride for group shared memory is a count of elements, not a byte count, so it should be N or M for group shared.

These calls need to be fixed:
__builtin_LinAlg_MatrixLoadFromMemory(
Mat, GsData, OFFSET, STRIDE, LAYOUT);
__builtin_LinAlg_MatrixStoreToMemory(
Mat, GsData, OFFSET, STRIDE, LAYOUT);

Also, the group shared offset is set to 0 in the test, so it's okay here, but I guess the offset for group shared is also a count of elements?

SS << " -DLAYOUT=" << static_cast<int>(Params.Layout);
SS << " -DELEM_SIZE=" << static_cast<int>(elementSize(Params.CompType));
SS << " -DNUMTHREADS=" << Params.NumThreads;
Expand Down Expand Up @@ -320,7 +322,6 @@ class DxilConf_SM610_LinAlg {
TEST_METHOD(LoadStoreDescriptor_Wave_16x16_F16);
TEST_METHOD(SplatStore_Wave_16x16_F16);
TEST_METHOD(AccumulateDescriptor_Wave_16x16_F16);
TEST_METHOD(AccumulateDescriptor_Thread_16x16_F16);

// Load/Store/Accumulate Memory
TEST_METHOD(LoadMemory_Wave_16x16_F16);
Expand Down Expand Up @@ -537,6 +538,9 @@ void DxilConf_SM610_LinAlg::SplatStore_Wave_16x16_F16() {
runSplatStore(D3DDevice, DxcSupport, Params, 42.0f, VerboseLogging);
}

// Since MatrixAccumulateToDescriptor requires an accumulator matrix and
// MatrixLoadFromDescriptor always returns an A matrix when loading a Thread
// matrix this shader only makes sense for Wave/ThreadGroup
static const char AccumulateDescriptorShader[] = R"(
#define USE_ACC 2

Expand Down Expand Up @@ -613,19 +617,6 @@ void DxilConf_SM610_LinAlg::AccumulateDescriptor_Wave_16x16_F16() {
runAccumulateDescriptor(D3DDevice, DxcSupport, Params, 12, VerboseLogging);
}

void DxilConf_SM610_LinAlg::AccumulateDescriptor_Thread_16x16_F16() {
MatrixParams Params = {};
Params.CompType = ComponentType::F16;
Params.M = 16;
Params.N = 16;
Params.Use = MatrixUse::Accumulator;
Params.Scope = MatrixScope::Thread;
Params.Layout = LinalgMatrixLayout::RowMajor;
Params.NumThreads = 1;
Params.Enable16Bit = true;
runAccumulateDescriptor(D3DDevice, DxcSupport, Params, 19, VerboseLogging);
}

static const char ElementAccessShader[] = R"(
RWByteAddressBuffer Input : register(u0);
RWByteAddressBuffer Output : register(u1);
Expand Down Expand Up @@ -1324,8 +1315,14 @@ void DxilConf_SM610_LinAlg::MatVecMulAdd_Thread_16x16_F16() {
}

static const char OuterProductShader[] = R"(
#define USE_A 0
// OuterProduct Matrix must be Thread scope
#define SCOPE_THREAD 0
// OuterProduct/Accumulate must be Accumulator use
#define USE_ACC 2
// Accumulate Layout must be OuterProductOptimal
#define LAYOUT_OUTER_PROD_OPT 4
// Accumulate Stride msut be 0 for non Row/Col Major
#define STRIDE 0

RWByteAddressBuffer Input : register(u0);
RWByteAddressBuffer Output : register(u1);
Expand All @@ -1345,12 +1342,12 @@ static const char OuterProductShader[] = R"(
}

__builtin_LinAlgMatrix
[[__LinAlgMatrix_Attributes(COMP_TYPE, M_DIM, N_DIM, USE_A, SCOPE_THREAD)]]
[[__LinAlgMatrix_Attributes(COMP_TYPE, M_DIM, N_DIM, USE_ACC, SCOPE_THREAD)]]
Mat;
__builtin_LinAlg_MatrixOuterProduct(Mat, VecA, VecB);

__builtin_LinAlg_MatrixAccumulateToDescriptor(
Mat, Output, 0, STRIDE, LAYOUT, 128);
Mat, Output, 0, STRIDE, LAYOUT_OUTER_PROD_OPT, 128);
}
)";

Expand Down Expand Up @@ -1398,8 +1395,9 @@ void DxilConf_SM610_LinAlg::OuterProduct_Thread_16x16_F16() {
Params.CompType = ComponentType::F16;
Params.M = 16;
Params.N = 16;
Params.Use = MatrixUse::Accumulator;
Params.Scope = MatrixScope::Thread;
Params.Layout = LinalgMatrixLayout::RowMajor;
Params.Layout = LinalgMatrixLayout::OuterProductOptimal;
Params.NumThreads = 1;
Params.Enable16Bit = true;
runOuterProduct(D3DDevice, DxcSupport, Params, VerboseLogging);
Expand Down
Loading