fix(hpc): validate int8_gemm_amx_tiled slice lengths (codex P1)

claude · claude · commit e8f9ce07a059 · 2026-05-21T07:12:06.000Z
Per codex review on PR #185: `int8_gemm_amx_tiled` is a safe public function (no `unsafe` in the signature) but its inner loop read `b_i8` via `core::slice::from_raw_parts(b_i8.as_ptr().add(row), 16)` without any length check. Callers passing mismatched (m, n, k) vs slice lengths could trigger out-of-bounds reads / UB instead of a panic. Before PR #185 this logic lived only in `matmul_i8_to_i32`'s private AMX arm (where the public `pack_contig` preceded it and bounded everything), but the factored helper is now reachable from `gemm_u8_i8` and any future caller. Fix: 1. Add three boundary assertions at function entry matching `gemm_u8_i8`'s contract: `a_u8.len() >= m * k`, `b_i8.len() >= k * n`, and `c.len() >= m * n`. These panic with descriptive messages on undersized input — the safety contract is now enforced at the public function boundary, not at the unsafe pointer-arithmetic site inside the hot loop. 2. Replace the `unsafe { core::slice::from_raw_parts(...) }` B-pack line with safe `b_tile[..].copy_from_slice(&b_i8[row..row + 16])`. When `n` is a multiple of 16 the bounds check inside the loop is redundant given the function-entry assertions, and the compiler may elide it where it can prove the invariant; either way the code becomes panic-safe instead of UB-on-misuse. 3. Update the doc-comment `# Panics` section to list the boundary panics alongside the existing debug-only AMX / alignment assertions. New regression test `amx_tiled_panics_on_undersized_b`: * Constructs `b: Vec<i8>` one full j_tile (`k * 16` elements) shorter than the claimed `k * n`. * Calls `int8_gemm_amx_tiled` and asserts the expected panic fires before any unsafe slice arithmetic. * `#[should_panic(expected = "b_i8.len()")]` matches a substring of the assertion message; works on any host (the boundary check fires before the `debug_assert!(amx_available())` so the test passes on AMX-less CI runners too). Verification: * 2097 lib tests pass (was 2096 — +1 new regression test). * cargo clippy --lib --tests --features rayon,native -- -D warnings clean. 
* cargo fmt --all --check clean. The matmul_i8_to_i32 path that delegates to int8_gemm_amx_tiled inherits the assertions transparently via the call chain. No behavior change for valid input — only mismatched-shape callers that would have hit UB now get a clean panic instead. https://claude.ai/code/session_01HbqooFZHAjaUtFEzhA1R2u
diff --git a/src/hpc/int8_tile_gemm.rs b/src/hpc/int8_tile_gemm.rs
@@ -341,10 +341,22 @@ fn fallback_path(a_u8: &[u8], b_i8: &[i8], c: &mut [i32], k: usize) {
 /// is pure overwrite.)
 ///
 /// # Panics
-/// Debug-asserts AMX availability and the 16/16/64 shape constraints.
-/// Production builds rely on the caller's runtime check
-/// (`crate::hpc::amx_matmul::amx_available()`).
+/// Panics if `a_u8`, `b_i8`, or `c` are too small for the requested
+/// `(m, n, k)`, mirroring the boundary contract from `gemm_u8_i8`. Also
+/// panics in debug builds when AMX isn't OS-enabled or when the shape
+/// alignment constraints aren't met (production builds skip those for
+/// performance — callers must runtime-check
+/// `crate::hpc::amx_matmul::amx_available()` and the 16/16/64
+/// alignment themselves).
 pub fn int8_gemm_amx_tiled(a_u8: &[u8], b_i8: &[i8], c: &mut [i32], m: usize, n: usize, k: usize) {
+    // Length assertions (codex P1 from PR #185 — the function reads
+    // `b_i8` via a 16-wide window per (kk, j_tile) iteration and a_u8
+    // via a 16-row slice per i_tile, so mismatched shapes would
+    // trigger out-of-bounds reads without these gates).
+    assert!(a_u8.len() >= m * k, "int8_gemm_amx_tiled: a_u8.len()={} < m*k={}", a_u8.len(), m * k);
+    assert!(b_i8.len() >= k * n, "int8_gemm_amx_tiled: b_i8.len()={} < k*n={}", b_i8.len(), k * n);
+    assert!(c.len() >= m * n, "int8_gemm_amx_tiled: c.len()={} < m*n={}", c.len(), m * n);
+
     debug_assert!(crate::hpc::amx_matmul::amx_available());
     debug_assert_eq!(m % 16, 0, "int8_gemm_amx_tiled: M must be multiple of 16");
     debug_assert_eq!(n % 16, 0, "int8_gemm_amx_tiled: N must be multiple of 16");
@@ -354,12 +366,13 @@ pub fn int8_gemm_amx_tiled(a_u8: &[u8], b_i8: &[i8], c: &mut [i32], m: usize, n:
     let mut tile_c = vec![0i32; 256];
 
     for j_tile in (0..n).step_by(16) {
-        // Pack B[0..k, j_tile..j_tile+16] into 16-wide K-rows (contiguous
-        // memory for int8_tile_gemm_16x16's input shape).
+        // Pack B[0..k, j_tile..j_tile+16] into 16-wide K-rows
+        // (contiguous memory for int8_tile_gemm_16x16's input shape).
+        // Safe slicing — for 16-aligned `n`, row..row+16 stays within the
+        // asserted `b_i8.len() >= k * n`; otherwise the index check guards it.
         for kk in 0..k {
             let row = kk * n + j_tile;
-            b_tile[kk * 16..(kk + 1) * 16]
-                .copy_from_slice(unsafe { core::slice::from_raw_parts(b_i8.as_ptr().add(row), 16) });
+            b_tile[kk * 16..(kk + 1) * 16].copy_from_slice(&b_i8[row..row + 16]);
         }
         for i_tile in (0..m).step_by(16) {
             let a_tile = &a_u8[i_tile * k..(i_tile + 16) * k];
@@ -513,6 +526,26 @@ mod tests {
         }
     }
 
+    /// Codex P1 regression on PR #185: `int8_gemm_amx_tiled` is a
+    /// safe public function — mismatched (m, n, k) vs slice lengths
+    /// must panic at the function boundary, not trigger UB in the
+    /// inner packing loop. Only `b` is undersized here (`a` and `c`
+    /// match the claimed shape), so the `b_i8.len()` assertion is
+    /// the one that must fire; `expected` matches on that substring.
+    #[test]
+    #[should_panic(expected = "b_i8.len()")]
+    fn amx_tiled_panics_on_undersized_b() {
+        let m = 16;
+        let n = 32;
+        let k = 64;
+        let a = vec![0u8; m * k];
+        let b = vec![0i8; k * (n - 16)]; // one full 16-column j_tile (k * 16 elements) short
+        let mut c = vec![0i32; m * n];
+        // The length assertions come before the debug-only
+        // `amx_available()` check, so this panics on non-AMX hosts too.
+        int8_gemm_amx_tiled(&a, &b, &mut c, m, n, k);
+    }
+
     /// Direct test for the VPDPBUSD-ymm arm (AVX-VNNI tier of
     /// `matmul_i8_to_i32`). Same shape / bit-exactness contract as
     /// the zmm version's test, just on the narrower 8-wide kernel.