Skip to content

Commit 05b0c84

Browse files
Apply suggestions from code review
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
1 parent bca0c0b commit 05b0c84

File tree

2 files changed

+4
-4
lines changed

2 files changed

+4
-4
lines changed

ggml/src/ggml-cuda/mmq.cuh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -362,7 +362,7 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
362362
x_qs[i*MMQ_MMA_TILE_X_K_Q8_0 + dst_offset + j] = unpacked_bytes[j];
363363
#else
364364
x_qs[i*(2*MMQ_TILE_NE_K + 1) + dst_offset + j] = unpacked_bytes[j];
365-
#endif
365+
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
366366
}
367367
}
368368

@@ -383,7 +383,7 @@ template <int mmq_y, bool need_check> static __device__ __forceinline__ void loa
383383
x_df[i*MMQ_MMA_TILE_X_K_Q8_0 + ksx] = bxi->d;
384384
#else
385385
x_df[i*(2*MMQ_TILE_NE_K/QI8_0) + i/(QI8_0/2) + ksx] = bxi->d;
386-
#endif
386+
#endif // defined(AMD_MFMA_AVAILABLE) || defined(TURING_MMA_AVAILABLE) || defined(AMD_WMMA_AVAILABLE)
387387
}
388388
}
389389

ggml/src/ggml-cuda/vecdotq.cuh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -713,8 +713,8 @@ static __device__ __forceinline__ float vec_dot_q1_0_q8_1(
713713
}
714714

715715
// Apply Q1_0's single scale and this chunk's Q8_1 scale
716-
const float2 ds8f = __half22float2(bq8_1_chunk->ds);
717-
return d1 * ds8f.x * sumi;
716+
const float d8 = __low2float(bq8_1_chunk->ds);
717+
return d1 * d8 * sumi;
718718
}
719719

720720
static __device__ __forceinline__ float vec_dot_q4_0_q8_1(

0 commit comments

Comments (0)