
Commit 920acd2

[rocm-libraries] ROCm/rocm-libraries#5168 (commit 8b5afcb)
[CK] [CK_Tile] Add GroupConv to Kernel Dispatcher

## Motivation

This PR adds CK Tile grouped convolution (forward, backward-data, backward-weight) support to the kernel dispatcher, matching and unifying with the existing dispatcher GEMM infrastructure in architecture and usability. The dispatcher provides a unified kernel dispatch system with both C++ and Python frontends, and until now supported only GEMM operations. This PR lets framework integrators use the same declarative kernel workflow for convolutions as for GEMM: declare kernels, JIT-build a registry, select kernels from the registry at runtime, and dispatch to the GPU. Future PRs will add runtime kernel-selection heuristics for autotuning kernel parameters based on (problem, hardware arch).

## Technical Details

Grouped convolution support has been added to the CK Tile Dispatcher:

- `generated_conv_backend.hpp` enables `dispatcher.run(in, wei, out, problem)` for all 6 conv variants (fwd/bwd-data/bwd-weight x 2D/3D), runtime heuristic kernel selection, and `GroupedConvKernelKey` with full `ConvConfigBase` fields.
- The Python side adds parallel JIT via `registry.build(max_workers)` and heuristic `registry.select()`.
- Includes 7 C++ and 6 Python examples covering all directions with CPU reference validation, plus shared infrastructure improvements (BaseRegistry CRTP, structured exceptions).

As a sanity check, JIT compile time for a single kernel remains the same, and multiple kernels benefit from parallelism:

| Kernels | 1 worker | 8 workers |
|---------|----------|-----------|
| 1       | 7.7 s    | 7.7 s     |
| 2       | 15.9 s   | 8.2 s     |
| 4       | 33.4 s   | 9.7 s     |
| 6       | 52.3 s   | 10.2 s    |

## Test Plan

145 ephemeral unit tests have been added to cover basic functionality. All 30 examples/integration tests run end-to-end on gfx950 (MI350): 7 C++ conv, 7 C++ GEMM, 6 Python conv, 10 Python GEMM. CPU reference validation for forward, backward-data, and backward-weight (2D) passes in both the C++ and Python examples.

## Test Result

All 30 examples pass.

Peak performance: 132 TFLOPS (batch-32 forward 56x56), 53 TFLOPS (pointwise 1x1). CPU reference accuracy: max_abs_diff < 0.002 for all directions (fp16 vs fp32 reference).

## Submission Checklist

- [x] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.
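The declare / build / select / dispatch workflow described above can be sketched with a toy registry and a stand-in selection heuristic. Everything here — `ConvKernelKey`, `select_kernel`, the padded-waste metric — is illustrative only, not the dispatcher's actual API.

```python
from dataclasses import dataclass

@dataclass(frozen=True)
class ConvKernelKey:
    """Illustrative stand-in for GroupedConvKernelKey (real fields differ)."""
    direction: str     # "fwd", "bwd_data", or "bwd_weight"
    ndim_spatial: int  # 2 or 3
    tile_m: int
    tile_n: int

def select_kernel(registry, direction, ndim_spatial, m, n):
    """Pick the matching-variant kernel that wastes the least padded work
    on an implicit-GEMM view of the problem (a toy heuristic, not the
    dispatcher's real selection logic)."""
    candidates = [k for k in registry
                  if k.direction == direction and k.ndim_spatial == ndim_spatial]
    if not candidates:
        raise LookupError(f"no kernel for {direction}/{ndim_spatial}D")
    def padded_waste(k):
        pad_m = (-m) % k.tile_m  # rows added to reach a tile multiple
        pad_n = (-n) % k.tile_n  # cols added to reach a tile multiple
        return pad_m * n + pad_n * m
    return min(candidates, key=padded_waste)

# A registry would normally be JIT-built; here it is a plain list.
registry = [
    ConvKernelKey("fwd", 2, 256, 256),
    ConvKernelKey("fwd", 2, 128, 128),
    ConvKernelKey("bwd_data", 2, 128, 128),
]
best = select_kernel(registry, "fwd", 2, m=3136, n=64)
print(best.tile_m, best.tile_n)  # -> 128 128
```

In the real dispatcher the selected kernel would then be dispatched via `dispatcher.run(in, wei, out, problem)`; the future heuristics PRs mentioned above would replace a metric like `padded_waste` with tuned per-architecture models.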
1 parent 4c0e73a commit 920acd2

86 files changed

Lines changed: 15501 additions & 1463 deletions


dispatcher/README.md

Lines changed: 104 additions & 37 deletions
@@ -1,6 +1,6 @@
 # CK Tile Dispatcher
 
-A unified kernel dispatch system for AMD GPUs with C++ and Python frontends.
+A unified kernel dispatch system for AMD GPUs with C++ and Python frontends, supporting GEMM and Grouped Convolution operations.
 
 **Validated Platform:** AMD Instinct MI300 series (gfx942)
 
@@ -342,8 +342,8 @@ ls examples/libdispatcher_gemm_lib.so
 | `CMAKE_PREFIX_PATH` | - | ROCm installation path |
 | `CMAKE_CXX_COMPILER` | - | Path to hipcc compiler |
 
-⚠️ **Important:** Always use `-DCMAKE_BUILD_TYPE=Release` for benchmarking. Debug builds are slower.
-⚠️ **Important:** Note that the current system provides single GPU target support for architecture-based kernel filtering, please do not use multiple GPU targets at a time (if necessary, please compile into different build directories).
+WARNING: **Important:** Always use `-DCMAKE_BUILD_TYPE=Release` for benchmarking. Debug builds are slower.
+WARNING: **Important:** Note that the current system provides single GPU target support for architecture-based kernel filtering, please do not use multiple GPU targets at a time (if necessary, please compile into different build directories).
 
 ---

@@ -363,6 +363,15 @@ cd build/examples
 ./gemm_04_heuristics       # Heuristic kernel selection
 ./gemm_05_json_export      # Registry JSON export
 ./gemm_06_multi_registry   # Multiple registries
+
+# Grouped Convolution Examples
+./grouped_conv_01_basic          # Declaration patterns + GPU execution
+./grouped_conv_02_all_dirs       # Forward/BwdData/BwdWeight with GPU
+./grouped_conv_03_bench_val      # Benchmark + CPU reference validation
+./grouped_conv_04_registry_json  # Heuristic selection + JSON export
+./grouped_conv_05_bwd_data       # Backward data + CPU validation
+./grouped_conv_06_bwd_weight     # Backward weight + CPU validation
+./grouped_conv_07_benchmark      # Multi-tile ResNet benchmark
 ```
 
 ### Python Examples
@@ -375,8 +384,16 @@ cd /path/to/composable_kernel/dispatcher
 # GEMM Examples
 python3 examples/gemm/python/01_basic_gemm.py   # Basic multi-kernel GEMM
 python3 examples/gemm/python/04_validation.py   # CPU reference validation
-python3 examples/gemm/python/07_stress_test.py  # Stress test (48 kernels)
+python3 examples/gemm/python/07_stress_test.py  # Stress test
 python3 examples/gemm/python/08_heuristics.py   # Heuristic selection
+
+# Grouped Convolution Examples
+python3 examples/grouped_conv/python/01_basic_grouped_conv.py  # Config patterns + registry + GPU
+python3 examples/grouped_conv/python/02_forward.py             # Forward 2D/3D + CPU ref
+python3 examples/grouped_conv/python/03_bwd_data.py            # Backward data + CPU ref
+python3 examples/grouped_conv/python/04_bwd_weight.py          # Backward weight + CPU ref
+python3 examples/grouped_conv/python/05_benchmark.py           # Multi-problem benchmark
+python3 examples/grouped_conv/python/06_registry_json.py       # Heuristic selection + JSON
 ```
 
 ### Example Output
@@ -647,7 +664,7 @@ lib = DispatcherLib.load("/absolute/path/to/libdispatcher_gemm_lib.so")
 ### Data Flow
 
 ```
-KernelConfig Registry Dispatcher GPU Execution
+KernelConfig -> Registry -> Dispatcher -> GPU Execution
 ```
 
 1. **KernelConfig**: Defines kernel parameters (tile sizes, data types, layouts)
@@ -843,31 +860,49 @@ make -j$(nproc)
 
 ```
 dispatcher/
-├── README.md                      # This file
-├── CMakeLists.txt                 # Build configuration
-
-├── include/ck_tile/dispatcher/    # C++ headers
-│   ├── dispatcher.hpp             # GEMM dispatcher
-│   ├── registry.hpp               # Kernel registry
-│   └── kernel_key.hpp             # Kernel configuration
-
-├── src/                           # C++ implementation
-
-├── codegen/                       # Kernel generation
-│   ├── unified_gemm_codegen.py    # GEMM kernel generator
-│   └── arch_specs.json            # GPU specifications
-
-├── bindings/ctypes/               # Python ctypes interface
-│   └── gemm_ctypes_lib.cpp        # GEMM Python library
-
-├── examples/                      # Examples
-│   └── gemm/
-│       ├── cpp/                   # C++ GEMM examples (01-06)
-│       └── python/                # Python GEMM examples (01-11)
-
-├── scripts/                       # Build scripts
-
-└── tests/                         # Unit tests
+|---- README.md                              # This file
+|---- CMakeLists.txt                         # Build configuration
+|
+|---- include/ck_tile/dispatcher/            # C++ headers
+|     |---- dispatcher.hpp                   # Main dispatcher include
+|     |---- registry.hpp                     # GEMM kernel registry
+|     |---- kernel_key.hpp                   # Kernel configuration
+|     |---- grouped_conv_config.hpp          # Grouped conv configuration
+|     |---- grouped_conv_problem.hpp         # Grouped conv problem (with builder)
+|     |---- grouped_conv_kernel_decl.hpp     # Grouped conv kernel declarations
+|     |---- grouped_conv_registry.hpp        # Grouped conv registry (thread-safe)
+|     +---- grouped_conv_utils.hpp           # Grouped conv utilities
+|
+|---- src/                                   # C++ implementation
+|
+|---- codegen/                               # Kernel generation
+|     |---- codegen_common.py                # Shared: TileConfig, TraitConfigBase, type mappings
+|     |---- unified_gemm_codegen.py          # GEMM kernel generator
+|     |---- unified_grouped_conv_codegen.py  # Grouped conv kernel generator
+|     +---- arch_specs.json                  # GPU specifications
+|
+|---- python/                                # Python utilities
+|     |---- dispatcher_common.py             # Shared: paths, validation, Colors, phased output
+|     |---- ctypes_utils.py                  # GEMM ctypes utilities
+|     +---- grouped_conv_utils.py            # Grouped conv utilities
+|
+|---- scripts/                               # Build scripts
+|     |---- compile_gemm_examples.py         # GEMM build script
+|     +---- compile_grouped_conv_examples.py # Grouped conv build script
+|
+|---- bindings/ctypes/                       # Python ctypes interface
+|     |---- gemm_ctypes_lib.cpp              # GEMM Python library
+|     +---- conv_ctypes_lib.cpp              # Grouped conv Python library
+|
+|---- examples/                              # Examples
+|     |---- gemm/
+|     |     |---- cpp/                       # C++ GEMM examples (01-07)
+|     |     +---- python/                    # Python GEMM examples (01-11)
+|     +---- grouped_conv/
+|           |---- cpp/                       # C++ Grouped Conv examples (01-07)
+|           +---- python/                    # Python Grouped Conv examples (01-06)
+|
++---- tests/                                 # Unit tests (C++ and Python)
 ```
 
 ---
@@ -879,17 +914,49 @@ dispatcher/
 | GEMM C++ | [examples/gemm/cpp/README.md](examples/gemm/cpp/README.md) |
 | GEMM Python | [examples/gemm/python/README.md](examples/gemm/python/README.md) |
 | Codegen | [codegen/README.md](codegen/README.md) |
+| Python Utils | [python/README.md](python/README.md) |
+| C++ Headers | [include/ck_tile/dispatcher/README.md](include/ck_tile/dispatcher/README.md) |
 
 ---
 
-## Archived Content
+## Grouped Convolution Support
+
+Grouped convolution is fully supported alongside GEMM, with shared infrastructure to eliminate duplication.
+
+### Python
+
+```bash
+# Generate grouped conv kernels
+python3 codegen/unified_grouped_conv_codegen.py \
+    --output-dir build/generated_kernels \
+    --datatype fp16 --variant forward --ndim-spatial 2
+
+# Build grouped conv examples
+python3 scripts/compile_grouped_conv_examples.py examples/grouped_conv/cpp/01_basic_grouped_conv.cpp
+```
+
+### Key Files
+
+| Component | File |
+|-----------|------|
+| C++ Headers | `include/ck_tile/dispatcher/grouped_conv_*.hpp` |
+| Python Codegen | `codegen/unified_grouped_conv_codegen.py` |
+| Python Utils | `python/grouped_conv_utils.py` |
+| Build Script | `scripts/compile_grouped_conv_examples.py` |
+| Shared Codegen | `codegen/codegen_common.py` |
+| Shared Utils | `python/dispatcher_common.py` |
+
+### Variants
+
+- **Forward** (`grouped_conv_fwd`) - Standard grouped convolution
+- **Backward Data** (`grouped_conv_bwd_data`) - Gradient w.r.t. input
+- **Backward Weight** (`grouped_conv_bwd_weight`) - Gradient w.r.t. weights
+
+### Shared Infrastructure
 
-Convolution examples and utilities have been archived to `ck-2/conv_archive/dispatcher/`:
-- `examples/conv/cpp/` - 11 C++ convolution examples
-- `examples/conv/python/` - 14 Python convolution examples
-- `codegen/unified_conv_codegen.py` - Conv kernel generator
-- `include/ck_tile/dispatcher/conv_*.hpp` - Conv headers
-- `python/conv_utils.py` - Conv Python utilities
+GEMM and grouped convolution share common code to avoid duplication:
+- `codegen/codegen_common.py` - TileConfig, TraitConfigBase, type mappings, parallel generation, arch-aware expansion
+- `python/dispatcher_common.py` - Path helpers, validation, auto-correction, Colors, phased output
 
 ---

dispatcher/bindings/README.md

Lines changed: 15 additions & 9 deletions
@@ -6,13 +6,13 @@ This directory contains language bindings for the CK Tile Dispatcher.
 
 ```
 bindings/
-├── ctypes/                    # Python ctypes bindings (C API)
-├── gemm_ctypes_lib.cpp        # GEMM dispatcher C API
-├── conv_ctypes_lib.cpp        # Convolution dispatcher C API (fwd + bwd_data)
-├── conv_bwdw_ctypes_lib.cpp   # Convolution backward weight C API
-├── gpu_helper.cpp             # CLI helper for Python
-└── CMakeLists.txt
-└── README.md
+|---- ctypes/                        # Python ctypes bindings (C API)
+|     |---- gemm_ctypes_lib.cpp      # GEMM dispatcher C API
+|     |---- conv_ctypes_lib.cpp      # Grouped conv dispatcher C API (fwd + bwd_data)
+|     |---- conv_bwdw_ctypes_lib.cpp # Grouped conv backward weight C API (separate library)
+|     |---- gpu_helper.cpp           # CLI helper for Python
+|     +---- CMakeLists.txt
++---- README.md
 ```
 
 ## ctypes Bindings
@@ -65,7 +65,7 @@ lib.dispatcher_cleanup()
 | `dispatcher_export_registry_json()` | Export registry as JSON |
 | `dispatcher_cleanup()` | Release resources |
 
-### Convolution API
+### Grouped Convolution API
 
 | Function | Description |
 |----------|-------------|
@@ -105,5 +105,11 @@ Output is JSON for easy parsing:
 See the examples that use these bindings:
 
 - **GEMM**: `dispatcher/examples/gemm/python/`
-- **Conv**: `dispatcher/examples/conv/python/`
+
+### Grouped Convolution
+
+Grouped convolution C++ headers and Python utilities are in:
+- **C++ Headers**: `dispatcher/include/ck_tile/dispatcher/grouped_conv_*.hpp`
+- **Python Utils**: `dispatcher/python/grouped_conv_utils.py`
+- **Build Script**: `dispatcher/scripts/compile_grouped_conv_examples.py`


dispatcher/bindings/ctypes/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
@@ -78,7 +78,7 @@ endif()
 # Look for forward kernels
 file(GLOB CONV_FWD_KERNEL_HEADERS "${CMAKE_BINARY_DIR}/generated_kernels/conv_fwd_*.hpp")
 # Look for backward data kernels
-file(GLOB CONV_BWDD_KERNEL_HEADERS "${CMAKE_BINARY_DIR}/generated_kernels/conv_bwdd_*.hpp")
+file(GLOB CONV_BWDD_KERNEL_HEADERS "${CMAKE_BINARY_DIR}/generated_kernels/conv_bwd_data_*.hpp")
 # Fallback: any conv kernel (for backwards compatibility)
 file(GLOB CONV_KERNEL_HEADERS "${CMAKE_BINARY_DIR}/generated_kernels/conv_*.hpp")

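The glob-with-fallback logic in the hunk above (prefer a `conv_bwd_data_*` header, fall back to any `conv_*` header) can be mimicked in a few lines; the file names here are made up for the demo, and this is an illustration of the selection rule, not part of the build.

```python
import pathlib
import tempfile

def pick_bwdd_header(kernel_dir: pathlib.Path):
    """Prefer a backward-data-specific generated header; otherwise fall
    back to any conv header (mirrors the CMake GLOB fallback)."""
    specific = sorted(kernel_dir.glob("conv_bwd_data_*.hpp"))
    if specific:
        return specific[0]
    generic = sorted(kernel_dir.glob("conv_*.hpp"))
    return generic[0] if generic else None

with tempfile.TemporaryDirectory() as d:
    root = pathlib.Path(d)
    (root / "conv_fwd_fp16.hpp").touch()       # hypothetical generated kernel
    (root / "conv_bwd_data_fp16.hpp").touch()  # hypothetical generated kernel
    print(pick_bwdd_header(root).name)  # -> conv_bwd_data_fp16.hpp
```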
@@ -112,7 +112,7 @@ endif()
 # Add backward data kernel if available
 if(CONV_BWDD_KERNEL_HEADERS)
     list(GET CONV_BWDD_KERNEL_HEADERS 0 CONV_BWDD_KERNEL_HEADER)
-    message(STATUS "Found Conv BWD_DATA kernel for ctypes lib: ${CONV_BWDD_KERNEL_HEADER}")
+    message(STATUS "Found Conv BWD_DATA kernel for ctypes lib: ${CONV_BWD_DATA_KERNEL_HEADER}")
     target_compile_options(dispatcher_conv_lib PRIVATE -include ${CONV_BWDD_KERNEL_HEADER})
     target_compile_definitions(dispatcher_conv_lib PRIVATE CONV_BWD_DATA_AVAILABLE)
 endif()

dispatcher/bindings/ctypes/conv_bwdw_ctypes_lib.cpp

Lines changed: 2 additions & 2 deletions
@@ -53,6 +53,7 @@ struct ConvBwdwProblemC
     int stride_d, stride_h, stride_w;
     int pad_d, pad_h, pad_w;
     int dilation_d, dilation_h, dilation_w;
+    int split_k;
 };
 
 // =============================================================================
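Because `ConvBwdwProblemC` is passed across the ctypes boundary, appending `split_k` means the Python-side `ctypes.Structure` must declare the same fields in the same order. A partial, hypothetical mirror (the real struct has additional leading fields — shapes, channels, etc. — not visible in this hunk, so this is an illustration of the layout rule, not a drop-in binding):

```python
import ctypes

class ConvBwdwProblemC(ctypes.Structure):
    # Partial, hypothetical mirror of the C struct in conv_bwdw_ctypes_lib.cpp.
    # Field order must match the C declaration exactly; new fields like
    # split_k are appended at the end, just as in the C++ diff.
    _fields_ = [
        ("stride_d", ctypes.c_int), ("stride_h", ctypes.c_int), ("stride_w", ctypes.c_int),
        ("pad_d", ctypes.c_int), ("pad_h", ctypes.c_int), ("pad_w", ctypes.c_int),
        ("dilation_d", ctypes.c_int), ("dilation_h", ctypes.c_int), ("dilation_w", ctypes.c_int),
        ("split_k", ctypes.c_int),  # newly appended field
    ]

# Unspecified fields default to 0; split_k=1 disables split-K accumulation.
prob = ConvBwdwProblemC(stride_h=1, stride_w=1, pad_h=1, pad_w=1,
                        dilation_h=1, dilation_w=1, split_k=1)
print(prob.split_k, ctypes.sizeof(ConvBwdwProblemC))
```

Keeping the struct append-only is what lets an older Python caller and a newer library coexist only if the library tolerates a short struct; in practice both sides should be updated together.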
@@ -108,8 +109,7 @@ static float run_bwd_weight_impl(const void* input_ptr,
         grad_weight_ptr,   // wei_ptr = grad_weight (output)
         {},                // ds_ptr
         grad_output_ptr,   // out_ptr = grad_output
-        1                  // k_batch
-        );
+        (prob->split_k > 1) ? prob->split_k : 1);
 
     ck_tile::stream_config stream_cfg{static_cast<hipStream_t>(stream), true, 1, 3, 10};
 