Fix for issue ggml-org#22974. Cast both operands to float before adding them, then cast the sum to the destination type. This avoids the half+half operator ambiguity. (ggml-org#22994)

scutler-nv · web-flow · commit 7f3f843c31cd · 2026-05-13T22:36:14.000+02:00
diff --git a/ggml/src/ggml-cuda/allreduce.cu b/ggml/src/ggml-cuda/allreduce.cu
@@ -184,13 +184,15 @@ static __global__ void ggml_cuda_ar_kernel(
             #pragma unroll
             for (int k = 0; k < ELEMS_PER_VEC; ++k) {
                 const T_wire d_low = ggml_cuda_cast<T_wire>(sendbuf[off + k]);
-                recvbuf[off + k] = ggml_cuda_cast<T_dst>(d_low) + ggml_cuda_cast<T_dst>(wire[k]);
+                recvbuf[off + k] = ggml_cuda_cast<T_dst>(
+                    ggml_cuda_cast<float>(d_low) + ggml_cuda_cast<float>(wire[k]));
             }
         }
         if (bid == 0 && tid < count - tail) {
             const T_wire d_low = ggml_cuda_cast<T_wire>(sendbuf[tail + tid]);
-            recvbuf[tail + tid] =
-                ggml_cuda_cast<T_dst>(d_low) + ggml_cuda_cast<T_dst>(host_other[tail + tid]);
+            recvbuf[tail + tid] = ggml_cuda_cast<T_dst>(
+                ggml_cuda_cast<float>(d_low) +
+                ggml_cuda_cast<float>(host_other[tail + tid]));
         }
     }
 }
@@ -210,7 +212,8 @@ static __global__ void ggml_cuda_ar_add_kernel(
     const int nt  = gridDim.x * blockDim.x;
     for (int i = tid; i < count; i += nt) {
         const T_src d_low = ggml_cuda_cast<T_src>(dst[i]);
-        dst[i] = ggml_cuda_cast<T_dst>(d_low) + ggml_cuda_cast<T_dst>(src[i]);
+        dst[i] = ggml_cuda_cast<T_dst>(
+            ggml_cuda_cast<float>(d_low) + ggml_cuda_cast<float>(src[i]));
     }
 }
 

Original file line number	Diff line number	Diff line change
`@@ -184,13 +184,15 @@ static __global__ void ggml_cuda_ar_kernel(`
`184`	`184`	`#pragma unroll`
`185`	`185`	`for (int k = 0; k < ELEMS_PER_VEC; ++k) {`
`186`	`186`	`const T_wire d_low = ggml_cuda_cast<T_wire>(sendbuf[off + k]);`
`187`		`- recvbuf[off + k] = ggml_cuda_cast<T_dst>(d_low) + ggml_cuda_cast<T_dst>(wire[k]);`
	`187`	`+ recvbuf[off + k] = ggml_cuda_cast<T_dst>(`
	`188`	`+ ggml_cuda_cast<float>(d_low) + ggml_cuda_cast<float>(wire[k]));`
`188`	`189`	`}`
`189`	`190`	`}`
`190`	`191`	`if (bid == 0 && tid < count - tail) {`
`191`	`192`	`const T_wire d_low = ggml_cuda_cast<T_wire>(sendbuf[tail + tid]);`
`192`		`- recvbuf[tail + tid] =`
`193`		`- ggml_cuda_cast<T_dst>(d_low) + ggml_cuda_cast<T_dst>(host_other[tail + tid]);`
	`193`	`+ recvbuf[tail + tid] = ggml_cuda_cast<T_dst>(`
	`194`	`+ ggml_cuda_cast<float>(d_low) +`
	`195`	`+ ggml_cuda_cast<float>(host_other[tail + tid]));`
`194`	`196`	`}`
`195`	`197`	`}`
`196`	`198`	`}`
`@@ -210,7 +212,8 @@ static __global__ void ggml_cuda_ar_add_kernel(`
`210`	`212`	`const int nt = gridDim.x * blockDim.x;`
`211`	`213`	`for (int i = tid; i < count; i += nt) {`
`212`	`214`	`const T_src d_low = ggml_cuda_cast<T_src>(dst[i]);`
`213`		`- dst[i] = ggml_cuda_cast<T_dst>(d_low) + ggml_cuda_cast<T_dst>(src[i]);`
	`215`	`+ dst[i] = ggml_cuda_cast<T_dst>(`
	`216`	`+ ggml_cuda_cast<float>(d_low) + ggml_cuda_cast<float>(src[i]));`
`214`	`217`	`}`
`215`	`218`	`}`
`216`	`219`