Apply clang-format auto-formatting

ORippler · ORippler · commit 9296d1f85433 · 2025-08-07T00:19:11.000-07:00
diff --git a/ggml/src/ggml-cuda/mean.cu b/ggml/src/ggml-cuda/mean.cu
@@ -15,11 +15,11 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const int64_t nrows = ggml_nrows(src0);
 
     const dim3 block_nums(nrows, 1, 1);
-    if ((nrows / ctx.sm_count)< 2){
+    if ((nrows / ctx.sm_count) < 2) {
         constexpr dim3 block_dims(512, 1, 1);
-        reduce_rows_f32</*norm=*/ true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
+        reduce_rows_f32</*norm=*/true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
     } else {
         const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);
-        reduce_rows_f32</*norm=*/ true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
+        reduce_rows_f32</*norm=*/true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
     }
 }
diff --git a/ggml/src/ggml-cuda/reduce_rows.cuh b/ggml/src/ggml-cuda/reduce_rows.cuh
@@ -1,40 +1,39 @@
 #include "common.cuh"
 
 // Row reduction kernel template - compute sum (norm=false) or mean (norm=true)
-template<bool norm>
+template <bool norm>
 static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __restrict__ dst, const int ncols) {
     const int row = blockIdx.x;
     const int col = threadIdx.x;
 
-    float sum = 0.0f;
+    float     sum        = 0.0f;
     const int num_unroll = 8;
-    float temp[num_unroll];
-    float sum_temp[num_unroll] = {0.0f};
+    float     temp[num_unroll];
+    float     sum_temp[num_unroll] = { 0.0f };
     for (int i = col; i < ncols;) {
-        for (int j = 0; j < num_unroll; ++j){
-            if (i < ncols){
+        for (int j = 0; j < num_unroll; ++j) {
+            if (i < ncols) {
                 temp[j] = x[row * ncols + i];
-            }
-            else {
+            } else {
                 temp[j] = 0;
             }
             i += blockDim.x;
         }
-        for (int j = 0; j < num_unroll; ++j){
+        for (int j = 0; j < num_unroll; ++j) {
             sum_temp[j] += temp[j];
         }
     }
-    for (int j = 0; j < num_unroll; ++j){
-            sum += sum_temp[j];
+    for (int j = 0; j < num_unroll; ++j) {
+        sum += sum_temp[j];
     }
 
     // sum up partial sums
     sum = warp_reduce_sum(sum);
     if (blockDim.x > WARP_SIZE) {
         assert((blockDim.x <= 1024) && (blockDim.x % WARP_SIZE) == 0);
         __shared__ float s_sum[32];
-        const int warp_id = threadIdx.x / WARP_SIZE;
-        const int lane_id = threadIdx.x % WARP_SIZE;
+        const int        warp_id = threadIdx.x / WARP_SIZE;
+        const int        lane_id = threadIdx.x % WARP_SIZE;
         if (lane_id == 0) {
             s_sum[warp_id] = sum;
         }
@@ -51,4 +50,4 @@ static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __r
     }
 
     dst[row] = norm ? sum / ncols : sum;
-}
+}
diff --git a/ggml/src/ggml-cuda/sum.cuh b/ggml/src/ggml-cuda/sum.cuh
@@ -1,5 +1,10 @@
 #include "common.cuh"
 
-void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream, int & n_sum);
+void sum_f32_cuda(ggml_cuda_pool & pool,
+                  const float *    x,
+                  float *          dst,
+                  const int64_t    ne,
+                  cudaStream_t     stream,
+                  int &            n_sm);
 
 void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
diff --git a/ggml/src/ggml-cuda/sumrows.cu b/ggml/src/ggml-cuda/sumrows.cu
@@ -1,9 +1,14 @@
-#include "sumrows.cuh"
 #include "reduce_rows.cuh"
+#include "sumrows.cuh"
 
-void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream, int & n_sm) {
+void sum_rows_f32_cuda(const float * x,
+                       float *       dst,
+                       const int     ncols,
+                       const int     nrows,
+                       cudaStream_t  stream,
+                       int &         n_sm) {
     const dim3 block_nums(nrows, 1, 1);
-    if ((nrows / n_sm)< 2){
+    if ((nrows / n_sm) < 2) {
         const dim3 block_dims(512, 1, 1);
         reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
     } else {
@@ -26,8 +31,8 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const int64_t nrows = ggml_nrows(src0);
 
     const dim3 block_nums(nrows, 1, 1);
-    
-    if ((nrows / ctx.sm_count)< 2){
+
+    if ((nrows / ctx.sm_count) < 2) {
         // Increase num threads to 512 for small nrows to better hide the latency
         const dim3 block_dims(512, 1, 1);
         reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);

Original file line number	Diff line number	Diff line change
`@@ -15,11 +15,11 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {`
`15`	`15`	`const int64_t nrows = ggml_nrows(src0);`
`16`	`16`
`17`	`17`	`const dim3 block_nums(nrows, 1, 1);`
`18`		`- if ((nrows / ctx.sm_count)< 2){`
	`18`	`+ if ((nrows / ctx.sm_count) < 2) {`
`19`	`19`	`constexpr dim3 block_dims(512, 1, 1);`
`20`		`- reduce_rows_f32</*norm=*/ true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);`
	`20`	`+ reduce_rows_f32</*norm=*/true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);`
`21`	`21`	`} else {`
`22`	`22`	`const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);`
`23`		`- reduce_rows_f32</*norm=*/ true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);`
	`23`	`+ reduce_rows_f32</*norm=*/true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);`
`24`	`24`	`}`
`25`	`25`	`}`