Skip to content

Commit 0ba4950

Browse files
TimDettmers and claude committed
perf: Remove unnecessary workspace allocation from NVFP4 GEMM
CUTLASS get_workspace_size() returns 0 for our GEMM configuration (no split-k, simple epilogue). Pass nullptr instead of constructing a cutlass::device_memory::allocation, which calls cudaMalloc/cudaFree on every invocation. This makes the GEMM kernel CUDA-graph-capturable.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 9d2f452 commit 0ba4950

File tree

1 file changed

+2
-5
lines changed

1 file changed

+2
-5
lines changed

csrc/qutlass/gemm_nvfp4_sm120.cu

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -98,9 +98,6 @@ static int runGemm(
9898

9999
Gemm gemm;
100100

101-
size_t workspace_size = Gemm::get_workspace_size(arguments);
102-
cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
103-
104101
cutlass::Status status;
105102

106103
status = gemm.can_implement(arguments);
@@ -109,13 +106,13 @@ static int runGemm(
109106
return -1;
110107
}
111108

112-
status = gemm.initialize(arguments, workspace.get(), stream);
109+
status = gemm.initialize(arguments, nullptr, stream);
113110
if (status != cutlass::Status::kSuccess) {
114111
fprintf(stderr, "CUTLASS GEMM initialize failed: %d\n", (int)status);
115112
return -2;
116113
}
117114

118-
status = gemm.run(arguments, workspace.get(), stream);
115+
status = gemm.run(arguments, nullptr, stream);
119116
if (status != cutlass::Status::kSuccess) {
120117
fprintf(stderr, "CUTLASS GEMM run failed: %d\n", (int)status);
121118
return -3;

0 commit comments

Comments
 (0)