
Commit eb04f93 (parent: a57a64c)
Author: shijiashuai

feat: align runtime surface with GEMM baseline implementation

- Update README/docs to match actual Python binding surface (elementwise, reduction, gemm)
- Implement CUTLASS GEMM baseline for supported configurations
- Replace placeholder benchmark with real GEMM execution
- Add comprehensive runtime surface contract tests
- Sync OpenSpec delta specs to main specs
- Archive completed change

22 files changed: 1315 additions, 168 deletions

README.md — 32 additions, 15 deletions

@@ -156,7 +156,7 @@ ctest --output-on-failure
 ./examples/gemm/gemm_benchmark

 # Python example (if bindings enabled)
-python ../examples/python/basic_usage.py
+python3 ../examples/python/basic_usage.py
 ```

 <details>

@@ -279,34 +279,49 @@ hpc-ai-optimization-lab/
 #include "common/tensor.cuh"

 // Allocate GPU tensors
-auto A = hpc::common::make_tensor<float>({1024, 1024});
-auto B = hpc::common::make_tensor<float>({1024, 1024});
-auto C = hpc::common::make_tensor<float>({1024, 1024});
+constexpr int M = 1024;
+constexpr int N = 1024;
+constexpr int K = 1024;

-// Launch optimized kernel
-hpc::gemm::gemm<float, hpc::gemm::OptLevel::Advanced>(
-    A.data(), B.data(), C.data(), 1024, 1024, 1024);
+hpc::Tensor<float> A(M * K);
+hpc::Tensor<float> B(K * N);
+hpc::Tensor<float> C(M * N);
+C.zero();
+
+// Launch the current shared-memory-tiling GEMM path
+hpc::gemm::gemm<float, hpc::gemm::GemmOpt::SharedMemTiling>(
+    A.data(), B.data(), C.data(), M, N, K);

 // Automatic memory cleanup when tensors go out of scope
 ```

 ### Python API

+Current Python bindings expose `elementwise`, `reduction`, and `gemm`.
+
 ```python
 import hpc_ai_opt
-import numpy as np
+import torch
+
+# Create CUDA tensors
+a = torch.randn(128, 64, device="cuda", dtype=torch.float32)
+b = torch.randn(64, 96, device="cuda", dtype=torch.float32)
+c = torch.zeros(128, 96, device="cuda", dtype=torch.float32)
+
+# Execute the currently shipped GEMM binding
+hpc_ai_opt.gemm.matmul(a, b, c, 128, 96, 64, 1.0, 0.0)

-# Create input data
-A = np.random.randn(1024, 1024).astype(np.float32)
-B = np.random.randn(1024, 1024).astype(np.float32)
+print(c.shape)
+```

-# Execute optimized GEMM
-C = hpc_ai_opt.gemm(A, B)
+Current phase benchmark CLI:

-print(f"Result shape: {C.shape}")
-print(f"Performance: {hpc_ai_opt.last_tflops:.1f} TFLOPS")
+```bash
+python3 python/benchmark/benchmark.py --suite gemm --sizes 256,512 --output results.json
+```

+The Python benchmark entrypoint currently wires the GEMM suite by default and emits reports only from measured result sets.
+
 ---

 ## Testing

@@ -378,6 +393,8 @@ The repository is in a finishing-and-hardening phase.
 | Attention ||| - | - | - | Stable |
 | Quantization ||| - || 🚧 | Stable |

+The support matrix describes the C++/CUDA core. In this phase, Python bindings cover `elementwise`, `reduction`, and `gemm` only.
+
 🚧 = Partial support / In development

 ---
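The `--sizes 256,512` flag in the new benchmark command implies a size-list parsing step somewhere in the CLI. A minimal sketch of how such a flag could be handled; the `parse_sizes` helper and the assumption that each size denotes a square M=N=K GEMM shape are illustrative, not taken from the repository:

```python
import argparse


def parse_sizes(spec: str) -> list[tuple[int, int, int]]:
    """Parse a comma-separated size list like '256,512' into (M, N, K)
    GEMM shapes. Square shapes are an assumption of this sketch."""
    shapes = []
    for token in spec.split(","):
        n = int(token.strip())
        if n <= 0:
            raise ValueError(f"benchmark size must be positive, got {n}")
        shapes.append((n, n, n))
    return shapes


parser = argparse.ArgumentParser()
parser.add_argument("--suite", default="gemm")
parser.add_argument("--sizes", type=parse_sizes, default=parse_sizes("256,512"))
args = parser.parse_args(["--suite", "gemm", "--sizes", "256,512"])
print(args.suite, args.sizes)  # gemm [(256, 256, 256), (512, 512, 512)]
```

Rejecting non-positive sizes at parse time keeps the later "reports only from measured result sets" contract honest: a malformed request fails before any result object exists.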

README.zh-CN.md — 32 additions, 15 deletions

@@ -156,7 +156,7 @@ ctest --output-on-failure
 ./examples/gemm/gemm_benchmark

 # Python example (if bindings enabled)
-python ../examples/python/basic_usage.py
+python3 ../examples/python/basic_usage.py
 ```

 <details>

@@ -279,13 +279,18 @@ hpc-ai-optimization-lab/
 #include "common/tensor.cuh"

 // Allocate GPU tensors
-auto A = hpc::common::make_tensor<float>({1024, 1024});
-auto B = hpc::common::make_tensor<float>({1024, 1024});
-auto C = hpc::common::make_tensor<float>({1024, 1024});
+constexpr int M = 1024;
+constexpr int N = 1024;
+constexpr int K = 1024;

-// Launch the optimized kernel
-hpc::gemm::gemm<float, hpc::gemm::OptLevel::Advanced>(
-    A.data(), B.data(), C.data(), 1024, 1024, 1024);
+hpc::Tensor<float> A(M * K);
+hpc::Tensor<float> B(K * N);
+hpc::Tensor<float> C(M * N);
+C.zero();
+
+// Launch the current shared-memory-tiling GEMM path
+hpc::gemm::gemm<float, hpc::gemm::GemmOpt::SharedMemTiling>(
+    A.data(), B.data(), C.data(), M, N, K);

 // Memory is released automatically when tensors go out of scope
 ```

@@ -294,19 +299,29 @@ hpc::gemm::gemm<float, hpc::gemm::OptLevel::Advanced>(

 ```python
 import hpc_ai_opt
-import numpy as np
+import torch
+
+# Create CUDA tensors
+a = torch.randn(128, 64, device="cuda", dtype=torch.float32)
+b = torch.randn(64, 96, device="cuda", dtype=torch.float32)
+c = torch.zeros(128, 96, device="cuda", dtype=torch.float32)
+
+# Call the currently shipped GEMM binding
+hpc_ai_opt.gemm.matmul(a, b, c, 128, 96, 64, 1.0, 0.0)
+
+print(c.shape)
+```

-# Create input data
-A = np.random.randn(1024, 1024).astype(np.float32)
-B = np.random.randn(1024, 1024).astype(np.float32)
+Current Python bindings expose `elementwise`, `reduction`, and `gemm`.

-# Execute the optimized GEMM
-C = hpc_ai_opt.gemm(A, B)
+Benchmark CLI for the current phase:

-print(f"Result shape: {C.shape}")
-print(f"Performance: {hpc_ai_opt.last_tflops:.1f} TFLOPS")
+```bash
+python3 python/benchmark/benchmark.py --suite gemm --sizes 256,512 --output results.json
+```

+The Python benchmark entrypoint currently wires only the GEMM suite by default, and generates reports only from real measured results.
+
 ---

 ## Testing

@@ -380,6 +395,8 @@ git push origin feature/my-optimization

 🚧 = Partial support / In development

+The support matrix describes the C++/CUDA core capabilities. In the current phase, Python bindings cover only `elementwise`, `reduction`, and `gemm`.
+
 ---

 ## License

docs/404.md — 1 addition, 1 deletion

@@ -39,7 +39,7 @@ onMounted(() => {
 <ul>
 <li><a href="/en/guide/installation">Installation Guide</a></li>
 <li><a href="/en/guide/quick-start">Quick Start</a></li>
-<li><a href="/en/API_REFERENCE">API Reference</a></li>
+<li><a href="/en/api/index">API Reference</a></li>
 <li><a href="/en/guide/gemm">GEMM Optimization</a></li>
 <li><a href="/en/guide/profiling">Profiling Guide</a></li>
 </ul>

examples/python/basic_usage.py — 2 additions, 1 deletion

@@ -63,7 +63,7 @@ def example_gemm(device: torch.device) -> None:
     m, n, k = 128, 96, 64
     a = torch.randn(m, k, device=device, dtype=torch.float32)
     b = torch.randn(k, n, device=device, dtype=torch.float32)
-    c = torch.empty(m, n, device=device, dtype=torch.float32)
+    c = torch.zeros(m, n, device=device, dtype=torch.float32)

     opt.gemm.matmul(a, b, c, m, n, k, 1.0, 0.0)
     torch.testing.assert_close(c, a @ b, rtol=1e-4, atol=1e-4)

@@ -73,6 +73,7 @@ def example_gemm(device: torch.device) -> None:
 def main() -> None:
     device = require_cuda()
     print("Running hpc_ai_opt examples on", torch.cuda.get_device_name(device))
+    print("Current shipped modules: elementwise, reduction, gemm")
     example_elementwise(device)
     example_reduction(device)
     example_gemm(device)
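The switch from `torch.empty` to `torch.zeros` matters because the `matmul(a, b, c, m, n, k, alpha, beta)` signature follows BLAS-style `C = alpha*A@B + beta*C` semantics, under which `c` is read as well as written whenever `beta != 0`. A pure-Python reference of that contract, as a sketch for illustration rather than the repository's kernel:

```python
def gemm_ref(a, b, c, m, n, k, alpha=1.0, beta=0.0):
    """BLAS-style GEMM reference:
    c[i][j] = alpha * sum_p(a[i][p] * b[p][j]) + beta * c[i][j]
    a is m*k, b is k*n, c is m*n, all as nested lists."""
    for i in range(m):
        for j in range(n):
            acc = 0.0
            for p in range(k):
                acc += a[i][p] * b[p][j]
            c[i][j] = alpha * acc + beta * c[i][j]
    return c


# With beta = 0 the prior contents of c are multiplied by zero, so zeroing is
# not strictly required -- unless c held NaN/Inf garbage from an uninitialized
# buffer, since 0.0 * nan is still nan. Zero-initializing c avoids that hazard.
a = [[1.0, 2.0], [3.0, 4.0]]  # 2x2
b = [[5.0, 6.0], [7.0, 8.0]]  # 2x2
c = [[0.0, 0.0], [0.0, 0.0]]
gemm_ref(a, b, c, 2, 2, 2)
print(c)  # [[19.0, 22.0], [43.0, 50.0]]
```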
New file — 2 additions, 0 deletions

+schema: spec-driven
+created: 2026-04-27
New file — 95 additions, 0 deletions

## Context

The repository already contains the ingredients for a credible runtime-facing CUDA lab, but its public surface currently overstates what is actually wired together. The README shows a simplified Python API that the nanobind module does not export, the Python benchmark framework contains device modeling and report generation but still stops at placeholder execution, and the existing GEMM API advertises a CUTLASS comparison hook without implementing it. This creates a trust gap in exactly the area the repository now wants to harden: what users can actually run, compare, and learn from.

This change is intentionally bounded. It targets the first runtime-facing slice that can be made truthful and demonstrable without expanding into a broad new product surface: shipped Python API alignment, executable benchmark entrypoints, and a GEMM baseline path using the CUTLASS dependency that is already fetched by CMake.

## Goals / Non-Goals

**Goals:**
- Make the documented Python surface match the nanobind module and shipped examples.
- Turn the benchmark entrypoint into a real, bounded execution path for GEMM-first comparisons.
- Implement a CUTLASS-backed GEMM baseline/fallback path that can be consumed by benchmarks and comparison tests.
- Add enough validation that docs, bindings, examples, and benchmark behavior stay aligned.
- Express the above as OpenSpec capabilities so later runtime work can build on a stable contract.

**Non-Goals:**
- Expanding Python bindings to every existing CUDA module in this change.
- Implementing Hopper/TMA/FP8 systemization, paged attention, or broader inference-runtime features.
- Rewriting the project into a packaging-first Python distribution in this phase.
- Replacing the educational GEMM path with CUTLASS; CUTLASS is introduced as baseline/fallback, not as the primary pedagogical implementation.

## Decisions

### 1. Split the change into a truthfulness surface and a benchmark baseline within one bounded runtime phase

**Decision:** This change will combine the public-surface corrections (README, examples, support descriptions, benchmark entrypoint) with the minimum kernel-side work needed to make those corrections meaningful: a real GEMM benchmark path and a CUTLASS baseline.

**Rationale:** Doing only documentation cleanup would leave the benchmark surface weak, while doing only CUTLASS work would still leave the repository telling an inaccurate story. Bundling these two pieces creates one coherent "trust restoration + baseline establishment" change.

**Alternatives considered:**
- **Docs-only first:** lower implementation cost, but still leaves the benchmark path unconvincing.
- **CUTLASS-only first:** improves internals, but does not fix the current public mismatch.

### 2. Treat the current nanobind module as the source of truth, then expand only what this change explicitly adds

**Decision:** The shipped Python surface will be defined by the actual exported module structure (`elementwise`, `reduction`, `gemm`) plus any deliberate additions made in this bounded change. Documentation and examples will be rewritten to that surface rather than to aspirational convenience APIs.

**Rationale:** The current trust issue comes from docs getting ahead of implementation. Reversing that relationship makes later growth safer.

**Alternatives considered:**
- **Add convenience APIs just to preserve the README shape:** rejected because it would expand scope unnecessarily.
- **Keep aspirational examples with disclaimers:** rejected because they still blur what is shipped.

### 3. Implement CUTLASS as a comparison path, not the default GEMM teaching path

**Decision:** `gemm_cutlass()` will be implemented as a dedicated baseline/fallback path that sits alongside the existing staged GEMM implementations, and benchmarks will compare against it explicitly.

**Rationale:** The repository's educational value depends on keeping the 7-step GEMM progression visible. CUTLASS should strengthen credibility and benchmarking, not collapse the learning narrative into a single black-box kernel.

**Alternatives considered:**
- **Replace advanced GEMM paths with CUTLASS:** rejected because it weakens the lab's pedagogical structure.
- **Leave CUTLASS unused:** rejected because the dependency already exists and the missing baseline is currently a notable gap.

### 4. Make the benchmark CLI execute real workloads with explicit bounded scope

**Decision:** The benchmark framework entrypoint will execute real kernels for the suites supported by this phase, starting with GEMM and any already-wired adjacent comparisons, and it will emit JSON/HTML/chart outputs only from real result sets.

**Rationale:** The repository already has result formatting, roofline analysis, and report generation. The missing value is not more framework code but actual kernel invocation and a truthful execution contract.

**Alternatives considered:**
- **Keep the framework as a library and defer the CLI:** rejected because the user-facing gap is specifically at the entrypoint.
- **Support every suite immediately:** rejected because it would turn a bounded change into a broad expansion.

### 5. Use tests and examples to lock the public/runtime contract together

**Decision:** This change will add or adjust tests and examples so README snippets, Python examples, benchmark execution, and the CUTLASS path are all validated against the same bounded contract.

**Rationale:** Public-surface drift is likely to recur unless it is enforced through executable checks close to the changed code.

**Alternatives considered:**
- **Rely on manual review only:** rejected because the mismatch already survived without machine-enforced checks.

## Risks / Trade-offs

- **[Risk] CUTLASS integration adds complexity or architecture-specific constraints** → Mitigation: keep the initial path limited to bounded GEMM shapes/types already supported by the repository and document any constraints explicitly.
- **[Risk] Benchmark CLI grows into a large product surface** → Mitigation: scope this phase to real execution for a bounded suite rather than to universal benchmark coverage.
- **[Risk] README truthfulness work looks like a feature reduction** → Mitigation: pair the doc alignment with an actually improved benchmark and baseline story so the change clearly increases credibility.
- **[Risk] Python ergonomics remain limited after alignment** → Mitigation: document that broader Python-surface expansion is a later sequential change and avoid overloading this one.
- **[Risk] Existing tests do not fully cover public-surface drift** → Mitigation: add focused tests or executable example coverage around the changed API and benchmark paths.

## Migration Plan

1. Define the runtime-facing OpenSpec requirements for the Python surface and GEMM baseline/benchmark contract.
2. Align README, support descriptions, and Python examples with the bounded shipped surface.
3. Implement `gemm_cutlass()` and any supporting benchmark wiring needed for real comparisons.
4. Turn the benchmark entrypoint into a real execution path for the supported suite(s).
5. Add or update validation around bindings, examples, and benchmark outputs.
6. Leave later Python-surface expansion and advanced Hopper/attention work for subsequent changes.

Rollback is straightforward at the repository level: the public-surface edits and benchmark wiring can be reverted together if the CUTLASS or benchmark path proves too unstable, restoring the previous bounded behavior while keeping the OpenSpec history.

## Open Questions

- Whether the first benchmarked GEMM baseline should be limited to `float` or immediately include the repository's current half/int8 support envelope.
- Whether benchmark result artifacts should be committed only as documentation/examples or remain purely generated output.
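Decision 4's "explicit bounded scope" can be sketched as a registry that dispatches only to suites wired to real execution and raises on anything else; the names `SUITES`, `run_gemm`, and `run_suite` are illustrative assumptions, not the repository's actual CLI code:

```python
import time


def run_gemm(m: int, n: int, k: int) -> dict:
    """Stand-in workload; the real CLI would launch and synchronize the
    CUDA kernel and its CUTLASS baseline here, then record timings."""
    start = time.perf_counter()
    # ... kernel launch + device synchronize would happen here ...
    elapsed = time.perf_counter() - start
    return {"suite": "gemm", "shape": (m, n, k), "seconds": elapsed}


# Only suites wired to real execution in this phase appear in the registry.
SUITES = {"gemm": run_gemm}


def run_suite(name: str, shapes: list[tuple[int, int, int]]) -> list[dict]:
    if name not in SUITES:
        # Fail clearly instead of printing placeholder-success guidance.
        raise ValueError(
            f"suite '{name}' is not wired to real execution; "
            f"supported: {sorted(SUITES)}"
        )
    return [SUITES[name](*shape) for shape in shapes]


results = run_suite("gemm", [(256, 256, 256), (512, 512, 512)])
print(len(results))  # 2
```

Keeping the registry as the single source of truth means "supported suite" is defined by code, so the CLI cannot drift back into advertising suites it does not run.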
New file — 27 additions, 0 deletions

## Why

The repository currently presents a more complete Python and benchmarking surface than it actually ships: the README shows a Python API shape that does not match the current nanobind module, support claims blur the line between C++-only and Python-accessible functionality, and the benchmark framework entrypoint is still placeholder-driven. This change is needed now to restore trust in the public surface while establishing a concrete GEMM baseline path that can support credible performance work.

## What Changes

- Align README Python examples, support claims, and user-facing docs with the bindings and kernels that are actually shipped.
- Define and implement the supported Python kernel surface for the first bounded runtime-facing phase, including how examples and docs must reflect that surface.
- Turn the benchmark framework from a placeholder entrypoint into a real executable path for bounded kernel suites, starting with GEMM and adjacent comparison hooks.
- Add a CUTLASS-backed GEMM baseline/fallback path and make it available to benchmark and comparison workflows.
- Update tests and validation surfaces so documentation, bindings, and benchmark behavior remain consistent after the change.

## Capabilities

### New Capabilities
- `python-kernel-surface`: Define the shipped Python-facing kernel API, its documented examples, and the consistency rules between bindings, examples, and public claims.
- `gemm-baseline-benchmarking`: Define a real GEMM comparison and benchmarking path, including a CUTLASS baseline/fallback and reproducible benchmark outputs.

### Modified Capabilities
- `documentation-rationalization`: Tighten active documentation requirements so runtime-facing README examples and support descriptions reflect the actually shipped API surface rather than aspirational or placeholder behavior.
- `stabilization-sweep`: Capture runtime-surface trust issues discovered in docs, bindings, and benchmark entrypoints as first-class stabilization work rather than leaving them as informal follow-up.

## Impact

- Affected areas include `README.md`, Python examples and bindings under `python/` and `examples/python/`, GEMM implementation files under `src/gemm/`, benchmark tooling, tests, and the new OpenSpec capability set for runtime-facing behavior.
- This change introduces a CUTLASS-backed baseline path on top of the existing dependency already fetched through CMake, but it does not broaden the project into a new runtime family beyond the bounded GEMM and public-surface scope.
- Public-facing behavior will become stricter: documentation and examples may be reduced or rewritten where the current repository overstates shipped functionality.
New file — 12 additions, 0 deletions

## ADDED Requirements

### Requirement: Runtime-facing examples reflect the shipped API surface
Active runtime-facing documentation SHALL show only examples, signatures, and support descriptions that match the repository's shipped bindings and executable benchmark surface for the current change.

#### Scenario: Python API documentation is reviewed
- **WHEN** README or active guides present Python usage or performance-reporting examples
- **THEN** the examples reflect the actual binding structure, callable signatures, and supported reporting outputs shipped by the repository

#### Scenario: Runtime support is described
- **WHEN** active documentation summarizes module or benchmark availability
- **THEN** it avoids aspirational claims and describes unsupported or C++-only surfaces in a way that does not mislead Python users
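The scenarios above lend themselves to an executable check: extract the module names a README sentence claims and compare them against the binding's actual submodules. A sketch using a stand-in namespace object; the README string and the `SimpleNamespace` stub are illustrative, and a real contract test would import `hpc_ai_opt` directly:

```python
import re
from types import SimpleNamespace

# The README sentence under test (here inlined; a real test would read the file).
readme_line = "Current Python bindings expose `elementwise`, `reduction`, and `gemm`."
claimed = set(re.findall(r"`(\w+)`", readme_line))

# Stand-in for the real nanobind module; a real test would use the import.
hpc_ai_opt = SimpleNamespace(elementwise=object(), reduction=object(), gemm=object())
shipped = {name for name in vars(hpc_ai_opt) if not name.startswith("_")}

# Symmetric difference surfaces both over-claimed and undocumented modules.
assert claimed == shipped, f"docs/bindings drift: {sorted(claimed ^ shipped)}"
print("surface aligned:", sorted(shipped))
```

Because the check compares sets in both directions, it catches a README that over-claims a module as well as a shipped module the docs forgot to mention.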
New file — 30 additions, 0 deletions

## ADDED Requirements

### Requirement: GEMM baseline path is executable through a CUTLASS-backed implementation
The repository SHALL provide an executable GEMM baseline path backed by CUTLASS so benchmark and comparison workflows can run against a non-placeholder reference implementation.

#### Scenario: GEMM baseline is requested
- **WHEN** benchmark or comparison code requests the CUTLASS GEMM path
- **THEN** the repository executes a real CUTLASS-backed GEMM implementation for the supported type and shape envelope of this change

#### Scenario: CUTLASS baseline is unavailable for a requested case
- **WHEN** a benchmark or test requests an unsupported CUTLASS configuration
- **THEN** the repository fails clearly or scopes the request explicitly rather than silently pretending to run a reference baseline

### Requirement: Benchmark entrypoints execute real workloads for supported suites
The benchmark framework SHALL run real kernel workloads for the suites supported by this change and SHALL emit reports only from real measured result sets.

#### Scenario: Benchmark CLI is invoked for a supported suite
- **WHEN** a user runs the benchmark entrypoint for a supported benchmark suite in this change
- **THEN** the CLI executes the actual kernel and baseline functions, measures results, and can emit the configured JSON, HTML, or chart outputs from those measurements

#### Scenario: Benchmark CLI is invoked for an unsupported suite
- **WHEN** the benchmark entrypoint is asked to run a suite that this change does not wire to real execution
- **THEN** it reports the unsupported state explicitly instead of printing placeholder-success guidance

### Requirement: Public benchmark examples align with the executable benchmark contract
Examples and documentation SHALL describe the benchmark surface in terms of the workloads and outputs that the repository can actually execute in this change.

#### Scenario: Benchmark documentation is updated
- **WHEN** active docs or examples describe benchmark usage
- **THEN** they point to supported benchmark commands and output shapes that match the implemented CLI behavior
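The throughput figures in those measured result sets follow from the timing alone: a GEMM of shape M×N×K performs 2·M·N·K floating-point operations (one multiply and one add per inner-product term). A hedged sketch of that reduction; the function name is illustrative, not the repository's reporting code:

```python
def gemm_tflops(m: int, n: int, k: int, seconds: float) -> float:
    """Achieved throughput of one GEMM run: 2*M*N*K flops over measured time."""
    if seconds <= 0:
        # Reports must come from real, positive measurements.
        raise ValueError(f"invalid measured time: {seconds}")
    return (2.0 * m * n * k) / seconds / 1e12


# Example: a 1024^3 GEMM measured at 2 ms -> ~1.07 TFLOPS.
print(round(gemm_tflops(1024, 1024, 1024, 2e-3), 2))  # 1.07
```

Guarding against non-positive times is one small way the "only from real measured result sets" rule can be enforced at the reporting boundary rather than by convention.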
