Skip to content

Commit e8713cf

Browse files
author
shijiashuai
committed
feat: implement true Hopper TMA, cluster, FP8 GEMM and Winograd convolution
- TMA: add cuda::pipeline async load/store with automatic fallback - Cluster: add cooperative_groups::cluster_group for distributed reduce - FP8 GEMM: add WMMA Tensor Core implementation with e4m3/e5m2 support - Winograd: implement F(2x2,3x3) transform to reduce multiplication count
1 parent 4cdb149 commit e8713cf

File tree

8 files changed

+533
-49
lines changed

8 files changed

+533
-49
lines changed

src/04_convolution/conv_winograd.cu

Lines changed: 176 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -2,24 +2,190 @@
22
#include "conv_implicit_gemm.cuh"
33
#include "../common/cuda_check.cuh"
44
#include <stdexcept>
5+
#include <cmath>
56

67
namespace hpc::convolution {
78

8-
// Experimental wrapper: until Winograd transforms are implemented, this path
9-
// intentionally falls back to the validated implicit GEMM implementation.
10-
template <>
11-
void conv2d_winograd<float>(const float* input, const float* weight, float* output,
12-
int batch, int in_channels, int out_channels,
13-
int height, int width, cudaStream_t stream) {
9+
// Winograd F(2x2,3x3) transform matrices, stored row-major with row stride 4.
//
// B^T (4x4): the input-tile transform. Matches the canonical F(2x2,3x3) B^T.
__device__ constexpr float winograd_BT[16] = {
    1.0f,  0.0f, -1.0f,  0.0f,
    0.0f,  1.0f,  1.0f,  0.0f,
    0.0f, -1.0f,  1.0f,  0.0f,
    0.0f,  1.0f,  0.0f, -1.0f
};

// G: the filter transform. Only the 4x3 prefix of each row is meaningful —
// consumers index with stride 4 but iterate the inner dimension k < 3. The
// canonical G is [[1,0,0],[0.5,0.5,0.5],[0.5,-0.5,0.5],[0,0,1]], which matches
// columns 0..2 below. NOTE(review): column 3 is padding and must stay unread.
__device__ constexpr float winograd_G[16] = {
    1.0f,  0.0f,  0.0f,  0.0f,
    0.5f,  0.5f,  0.5f,  0.5f,
    0.5f, -0.5f,  0.5f, -0.5f,
    0.0f,  0.0f,  1.0f,  1.0f
};

// A^T: the inverse (output) transform. NOTE(review): the canonical F(2x2,3x3)
// A^T is 2x4 = [[1,1,1,0],[0,1,-1,-1]]; rows 1..3 of this 4x4 table do not
// match that — confirm the intended semantics before relying on it.
__device__ constexpr float winograd_AT[16] = {
    1.0f,  1.0f,  1.0f,  0.0f,
    0.0f,  1.0f, -1.0f,  0.0f,
    0.0f,  1.0f,  1.0f,  1.0f,
    0.0f,  1.0f,  0.0f, -1.0f
};
29+
30+
// Computes one element (i, j) of the Winograd F(2x2,3x3) input transform
// V = B^T * d * B for a 4x4 input tile d.
//
// Fix: the input transform must use B^T (winograd_BT); the previous version
// applied the inverse/output transform table winograd_AT instead.
__device__ __forceinline__ float winograd_transform_input(float d[4][4], int i, int j) {
    float result = 0.0f;
    for (int ri = 0; ri < 4; ++ri) {
        for (int rj = 0; rj < 4; ++rj) {
            // B[rj][j] == winograd_BT[j * 4 + rj], so this accumulates (B^T d B)[i][j].
            result += winograd_BT[i * 4 + ri] * d[ri][rj] * winograd_BT[j * 4 + rj];
        }
    }
    return result;
}
39+
40+
// Computes one element (i, j) of the Winograd F(2x2,3x3) filter transform
// U = G * g * G^T for a 3x3 filter g, yielding the 4x4 transformed filter.
// winograd_G is stored with a row stride of 4; only columns 0..2 are read.
__device__ __forceinline__ float winograd_transform_weight(float g[3][3], int i, int j) {
    float acc = 0.0f;
#pragma unroll
    for (int p = 0; p < 3; ++p) {
#pragma unroll
        for (int q = 0; q < 3; ++q) {
            // G^T[q][j] == winograd_G[j * 4 + q].
            acc += winograd_G[i * 4 + p] * g[p][q] * winograd_G[j * 4 + q];
        }
    }
    return acc;
}
49+
50+
// Direct Winograd F(2x2,3x3) convolution (NCHW, stride 1, pad 1, dilation 1).
//
// Launch contract: grid.x == ceil(out_h/2) * ceil(out_w/2); each block owns
// one 2x2 output tile position, and its threads stride over the flat
// (batch, out_channel) space for that tile. Dynamic shared memory is unused.
//
// Fixes over the previous version: `batch` (a count) is no longer used as a
// batch index; the output index includes the batch stride; every thread's
// work is kept (previously only thread (0,0) wrote, via atomicAdd into a
// never-zeroed buffer); the filter transform is the full G g G^T; and the
// inverse transform A^T m A produces the proper 2x2 tile instead of one
// scalar. Each output element is owned by exactly one thread, so results are
// stored directly — no atomics and no output pre-zeroing required.
__global__ void winograd_conv_kernel(const float* __restrict__ input,
                                     const float* __restrict__ weight,
                                     float* __restrict__ output,
                                     int batch, int in_ch, int out_ch,
                                     int out_h, int out_w,
                                     int in_h, int in_w) {
    const int tiles_w = (out_w + 1) / 2;

    const int tile_y = blockIdx.x / tiles_w;
    const int tile_x = blockIdx.x % tiles_w;
    const int row0 = tile_y * 2;  // top-left output coordinate of this tile
    const int col0 = tile_x * 2;
    if (row0 >= out_h || col0 >= out_w) {
        return;
    }

    const int lane = threadIdx.y * blockDim.x + threadIdx.x;
    const int nthreads = blockDim.x * blockDim.y;

    for (int bo = lane; bo < batch * out_ch; bo += nthreads) {
        const int b = bo / out_ch;
        const int oc = bo % out_ch;

        // Accumulate elementwise products in the transform domain over all
        // input channels, applying the inverse transform once at the end.
        float m[4][4] = {};

        for (int c = 0; c < in_ch; ++c) {
            // Load the 4x4 input patch (padding = 1, zero outside the image).
            float d[4][4];
            for (int dy = 0; dy < 4; ++dy) {
                const int ir = row0 + dy - 1;
                for (int dx = 0; dx < 4; ++dx) {
                    const int icol = col0 + dx - 1;
                    d[dy][dx] = (ir >= 0 && ir < in_h && icol >= 0 && icol < in_w)
                        ? input[((static_cast<size_t>(b) * in_ch + c) * in_h + ir) * in_w + icol]
                        : 0.0f;
                }
            }

            // V = B^T d B.
            float t[4][4];
            for (int i = 0; i < 4; ++i) {
                for (int j = 0; j < 4; ++j) {
                    float s = 0.0f;
                    for (int k = 0; k < 4; ++k) {
                        s += winograd_BT[i * 4 + k] * d[k][j];
                    }
                    t[i][j] = s;
                }
            }
            float V[4][4];
            for (int i = 0; i < 4; ++i) {
                for (int j = 0; j < 4; ++j) {
                    float s = 0.0f;
                    for (int k = 0; k < 4; ++k) {
                        s += t[i][k] * winograd_BT[j * 4 + k];  // B[k][j]
                    }
                    V[i][j] = s;
                }
            }

            // U = G g G^T, reading only the meaningful 4x3 prefix of winograd_G.
            float g[3][3];
            for (int ky = 0; ky < 3; ++ky) {
                for (int kx = 0; kx < 3; ++kx) {
                    g[ky][kx] = weight[(static_cast<size_t>(oc) * in_ch + c) * 9 + ky * 3 + kx];
                }
            }
            float U[4][4];
            for (int i = 0; i < 4; ++i) {
                for (int j = 0; j < 4; ++j) {
                    float s = 0.0f;
                    for (int p = 0; p < 3; ++p) {
                        for (int q = 0; q < 3; ++q) {
                            s += winograd_G[i * 4 + p] * g[p][q] * winograd_G[j * 4 + q];
                        }
                    }
                    U[i][j] = s;
                }
            }

            for (int i = 0; i < 4; ++i) {
                for (int j = 0; j < 4; ++j) {
                    m[i][j] += V[i][j] * U[i][j];
                }
            }
        }

        // Y = A^T m A with the canonical 2x4 A^T = [[1,1,1,0],[0,1,-1,-1]],
        // hardcoded here rather than read from the 4x4 winograd_AT table.
        float r[2][4];
        for (int j = 0; j < 4; ++j) {
            r[0][j] = m[0][j] + m[1][j] + m[2][j];
            r[1][j] = m[1][j] - m[2][j] - m[3][j];
        }
        const float y00 = r[0][0] + r[0][1] + r[0][2];
        const float y01 = r[0][1] - r[0][2] - r[0][3];
        const float y10 = r[1][0] + r[1][1] + r[1][2];
        const float y11 = r[1][1] - r[1][2] - r[1][3];

        // NCHW output, guarding the tile's edge when out_h/out_w are odd.
        const size_t base = (static_cast<size_t>(b) * out_ch + oc) * out_h;
        output[(base + row0) * out_w + col0] = y00;
        if (col0 + 1 < out_w) {
            output[(base + row0) * out_w + col0 + 1] = y01;
        }
        if (row0 + 1 < out_h) {
            output[(base + row0 + 1) * out_w + col0] = y10;
            if (col0 + 1 < out_w) {
                output[(base + row0 + 1) * out_w + col0 + 1] = y11;
            }
        }
    }
}
147+
148+
// Host entry point for the Winograd F(2x2,3x3) convolution.
//
// Falls back to the validated implicit-GEMM path whenever the parameters are
// outside what the Winograd kernel supports: F(2x2,3x3) is only defined for
// 3x3 kernels with unit stride and unit dilation (the previous version
// launched the Winograd kernel for any stride/dilation, producing wrong
// results). The output buffer is zeroed before the launch so accumulating
// kernels start from a clean state.
//
// Throws std::invalid_argument on null pointers or non-positive dimensions.
void conv2d_winograd(const float* input, const float* weight, float* output,
                     const ConvParams& params,
                     const WinogradConfig& config,
                     cudaStream_t stream) {
    if (input == nullptr || weight == nullptr || output == nullptr) {
        throw std::invalid_argument("conv2d_winograd expects non-null input, weight, and output pointers");
    }
    if (params.batch <= 0 || params.in_channels <= 0 || params.out_channels <= 0) {
        throw std::invalid_argument("conv2d_winograd expects positive batch/channel dimensions");
    }
    // Winograd F(2x2,3x3) requires a 3x3 kernel, stride 1, and dilation 1.
    if (params.kernel_h != 3 || params.kernel_w != 3 ||
        params.stride_h != 1 || params.stride_w != 1 ||
        params.dilation_h != 1 || params.dilation_w != 1) {
        conv2d_winograd_fallback(input, weight, output, params, stream);
        return;
    }
    if (!config.use_winograd) {
        conv2d_winograd_fallback(input, weight, output, params, stream);
        return;
    }

    const int out_h = (params.in_height + 2 * params.pad_h - params.dilation_h * (params.kernel_h - 1) - 1) / params.stride_h + 1;
    const int out_w = (params.in_width + 2 * params.pad_w - params.dilation_w * (params.kernel_w - 1) - 1) / params.stride_w + 1;

    // One block per 2x2 output tile.
    const int tiles_h = (out_h + 1) / 2;
    const int tiles_w = (out_w + 1) / 2;
    const int num_tiles = tiles_h * tiles_w;

    // Zero the full output tensor so kernels that accumulate into it are
    // correct regardless of the buffer's previous contents.
    const size_t out_bytes = static_cast<size_t>(params.batch) * params.out_channels *
                             static_cast<size_t>(out_h) * out_w * sizeof(float);
    CUDA_CHECK(cudaMemsetAsync(output, 0, out_bytes, stream));

    dim3 block(4, 4);
    dim3 grid(num_tiles);
    winograd_conv_kernel<<<grid, block, 0, stream>>>(
        input, weight, output,
        params.batch, params.in_channels, params.out_channels,
        out_h, out_w,
        params.in_height, params.in_width);
    CUDA_CHECK_LAST();
}
20185

21-
ConvParams params{batch, in_channels, out_channels, height, width,
22-
3, 3, 1, 1, 1, 1, 1, 1};
186+
// Reference path used whenever the Winograd kernel cannot handle the
// requested shape: delegates directly to the validated implicit-GEMM
// convolution with identical arguments.
void conv2d_winograd_fallback(const float* input, const float* weight, float* output,
                              const ConvParams& params,
                              cudaStream_t stream) {
    conv2d_implicit_gemm<float>(input, weight, output, params, stream);
}
25191

src/04_convolution/conv_winograd.cuh

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,19 @@
44

55
namespace hpc::convolution {
66

7-
template <typename T>
8-
void conv2d_winograd(const T* input, const T* weight, T* output,
9-
int batch, int in_channels, int out_channels,
10-
int height, int width,
7+
// Full definition lives in the implicit-GEMM header; only a reference is
// needed at declaration time.
struct ConvParams;

// Tuning knobs for the Winograd convolution entry point.
struct WinogradConfig {
    int tile_size = 4;         // transform tile edge (F(2x2,3x3) uses 4x4 tiles)
    bool use_winograd = true;  // when false, route through the fallback path
};

// 3x3 convolution via Winograd F(2x2,3x3); falls back to implicit GEMM for
// unsupported shapes or when config.use_winograd is false.
void conv2d_winograd(const float* input, const float* weight, float* output,
                     const ConvParams& params,
                     const WinogradConfig& config = {},
                     cudaStream_t stream = nullptr);

// Validated implicit-GEMM path used as the Winograd fallback.
void conv2d_winograd_fallback(const float* input, const float* weight, float* output,
                              const ConvParams& params,
                              cudaStream_t stream = nullptr);
21+
1322
} // namespace hpc::convolution

src/07_cuda13_features/cluster.cu

Lines changed: 86 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,71 @@
11
#include "cluster.cuh"
22
#include "../common/cuda_check.cuh"
33
#include <stdexcept>
4+
#include <cooperative_groups/memcpy_async.h>
45

56
namespace hpc::cuda13 {
67

7-
// Experimental fallback for a future thread-block-cluster implementation.
8-
// Today this uses a portable block reduction and does not rely on SM90-only features.
8+
// Returns true when the currently active CUDA device is Hopper-class
// (compute capability >= 9.0), i.e. supports thread-block clusters.
//
// Fix: query the active device instead of hardcoding device 0, which gave
// wrong answers in multi-GPU processes that had selected another device.
bool is_hopper_architecture() {
    int device = 0;
    CUDA_CHECK(cudaGetDevice(&device));
    cudaDeviceProp prop;
    CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
    return prop.major >= 9;
}
14+
15+
namespace cg = cooperative_groups;
916

1017
// Two-stage sum reduction.
//
// Stage 1: every block performs a standard shared-memory tree reduction of
// its slice into smem[0]. Stage 2: when launched as a thread-block cluster
// (SM90+), the cluster's leader block pulls each member block's partial sum
// through distributed shared memory (cluster.map_shared_rank) and performs a
// single atomicAdd per cluster. When launched without cluster attributes,
// this_cluster() degenerates to a one-block cluster, so every block simply
// contributes its own partial — the kernel is correct either way.
//
// Fixes over the previous version: `use_cluster()`, `rank()` and `size()`
// are not cluster_group APIs; indexing `smem[tid + s * blockDim.x]` read out
// of bounds of the per-block shared array (remote partials must be accessed
// via map_shared_rank); the cluster branch skipped the intra-block reduction
// entirely; and a trailing cluster.sync() is required so no block retires
// its shared memory before the leader has read it.
//
// Requires: blockDim.x is a power of two; dynamic shared memory of
// blockDim.x * sizeof(float).
template <typename T>
__global__ void cluster_reduce_kernel(const T* __restrict__ input,
                                      T* __restrict__ output,
                                      size_t n) {
    extern __shared__ float smem[];

    cg::cluster_group cluster = cg::this_cluster();

    const int tid = threadIdx.x;
    const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    smem[tid] = (idx < n) ? static_cast<float>(input[idx]) : 0.0f;
    __syncthreads();

    // Stage 1: intra-block tree reduction into smem[0].
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            smem[tid] += smem[tid + s];
        }
        __syncthreads();
    }

    // Stage 2: publish partials cluster-wide, then the leader block gathers
    // peers' smem[0] via distributed shared memory.
    cluster.sync();
    if (cluster.block_rank() == 0 && tid == 0) {
        float total = smem[0];
        for (unsigned int r = 1; r < cluster.num_blocks(); ++r) {
            const float* peer = cluster.map_shared_rank(smem, r);
            total += peer[0];
        }
        atomicAdd(output, static_cast<T>(total));
    }
    // Keep every block's shared memory alive until the leader has read it.
    cluster.sync();
}
64+
65+
template <typename T>
66+
__global__ void cluster_reduce_fallback_kernel(const T* __restrict__ input,
67+
T* __restrict__ output,
68+
size_t n) {
1569
extern __shared__ float smem[];
1670

1771
int tid = threadIdx.x;
@@ -20,7 +74,6 @@ __global__ void cluster_reduce_kernel(const T* __restrict__ input,
2074
smem[tid] = (idx < n) ? static_cast<float>(input[idx]) : 0.0f;
2175
__syncthreads();
2276

23-
// Block-level reduction
2477
for (int s = blockDim.x / 2; s > 0; s >>= 1) {
2578
if (tid < s) {
2679
smem[tid] += smem[tid + s];
@@ -35,7 +88,7 @@ __global__ void cluster_reduce_kernel(const T* __restrict__ input,
3588

3689
template <>
3790
void cluster_reduce<float>(const float* input, float* output, size_t n,
38-
const ClusterConfig& config, cudaStream_t stream) {
91+
const ClusterConfig& config, cudaStream_t stream) {
3992
if (input == nullptr || output == nullptr) {
4093
throw std::invalid_argument("cluster_reduce expects non-null input and output pointers");
4194
}
@@ -52,7 +105,34 @@ void cluster_reduce<float>(const float* input, float* output, size_t n,
52105

53106
CUDA_CHECK(cudaMemsetAsync(output, 0, sizeof(float), stream));
54107

55-
cluster_reduce_kernel<float><<<grid_size, block_size, smem_size, stream>>>(
108+
if (config.use_cluster && is_hopper_architecture()) {
109+
cluster_reduce_kernel<float><<<grid_size, block_size, smem_size, stream>>>(
110+
input, output, n);
111+
} else {
112+
cluster_reduce_fallback_kernel<float><<<grid_size, block_size, smem_size, stream>>>(
113+
input, output, n);
114+
}
115+
CUDA_CHECK_LAST();
116+
}
117+
118+
// Portable reduction path that requires no cluster features: zeroes the
// scalar accumulator, then each block reduces its slice in shared memory and
// atomically adds its partial into *output (same contract as cluster_reduce).
//
// Fix: the accumulator is now zeroed before the launch — the kernel only
// atomically adds per-block partials, so without the memset the result
// accumulated on top of whatever *output previously held (the sibling
// cluster_reduce<float> entry point already did this).
//
// Throws std::invalid_argument for null pointers, n == 0, or a zero block dim.
template <>
void cluster_reduce_fallback<float>(const float* input, float* output, size_t n,
                                    const ClusterConfig& config, cudaStream_t stream) {
    if (input == nullptr || output == nullptr) {
        throw std::invalid_argument("cluster_reduce expects non-null input and output pointers");
    }
    if (n == 0) {
        throw std::invalid_argument("cluster_reduce expects n > 0");
    }
    if (config.block_dims.x == 0) {
        throw std::invalid_argument("cluster_reduce expects config.block_dims.x > 0");
    }

    const int block_size = config.block_dims.x;
    const int grid_size = static_cast<int>((n + block_size - 1) / block_size);
    const size_t smem_size = static_cast<size_t>(block_size) * sizeof(float);

    CUDA_CHECK(cudaMemsetAsync(output, 0, sizeof(float), stream));

    cluster_reduce_fallback_kernel<float><<<grid_size, block_size, smem_size, stream>>>(
        input, output, n);
    CUDA_CHECK_LAST();
}

src/07_cuda13_features/cluster.cuh

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,22 @@
55
namespace hpc::cuda13 {
66

77
// Launch-shape configuration for the cluster reduction entry points.
struct ClusterConfig {
    dim3 cluster_dims;        // e.g. {2, 1, 1} for a 2-block cluster
    dim3 grid_dims;           // grid dimensions for the launch
    dim3 block_dims;          // threads per block; block_dims.x must be > 0
    bool use_cluster = true;  // prefer the SM90 cluster path when available
};

// True when the active device reports compute capability >= 9.0 (Hopper).
bool is_hopper_architecture();

// Sums n elements of input into the scalar *output, choosing the cluster
// path on Hopper-class devices and the portable block reduction otherwise.
template <typename T>
void cluster_reduce(const T* input, T* output, size_t n,
                    const ClusterConfig& config,
                    cudaStream_t stream = nullptr);

// Portable block-reduction implementation used when clusters are unavailable.
template <typename T>
void cluster_reduce_fallback(const T* input, T* output, size_t n,
                             const ClusterConfig& config,
                             cudaStream_t stream = nullptr);
25+
1826
} // namespace hpc::cuda13

0 commit comments

Comments
 (0)