Fix FP4 packed access and GEMM K tile validation

TerminusAkivili · TerminusAkivili · commit 33266aea4e49 · 2026-05-11T01:58:14.000+08:00
Lower packed FP4 vector loads/stores whose base index is odd or symbolic (i.e. not provably even) to per-lane nibble operations, avoiding silent miscompiles.

Reject T.gemm K tiles that are not divisible by the MMA instruction K tile so FP4/A8W4 block_K tails cannot be silently skipped.
diff --git a/src/backend/cuda/codegen/codegen_cuda.cc b/src/backend/cuda/codegen/codegen_cuda.cc
@@ -2039,6 +2039,30 @@ std::string CodeGenTileLangCUDA::GetVecLoad(DataType t,
     return os.str();
   }
 
+  if (IsFp4PackedStorage(buffer_var, buffer->dtype) && t.is_float4_e2m1fn() &&
+      t.lanes() > 1) {
+    arith::Analyzer analyzer;
+    bool base_aligned = is_zero(analyzer.Simplify(truncmod(base, 2)));
+    if (!base_aligned) {
+      // Packed FP4 vector reinterpret is only byte-aligned for provably even
+      // logical bases. Odd or symbolic bases need per-lane nibble selection.
+      std::string vid = GetVarID(buffer_var);
+      std::ostringstream os;
+      os << "make_fp4_e2_" << t.lanes() << "_t(";
+      for (int i = 0; i < t.lanes(); ++i) {
+        if (i != 0) {
+          os << ", ";
+        }
+        PrimExpr index = analyzer.Simplify(
+            base + IntImm(base.dtype(), static_cast<int64_t>(i)));
+        os << "tl_fp4_packed_load((fp4_e2_2_t*)" << vid << ", "
+           << PrintExpr(index) << ")";
+      }
+      os << ")";
+      return os.str();
+    }
+  }
+
   std::string scope;
   if (alloc_storage_scope_.count(buffer_var)) {
     scope = alloc_storage_scope_.at(buffer_var);
@@ -2133,6 +2157,30 @@ void CodeGenTileLangCUDA::PrintVecStore(const BufferNode *buffer, DataType t,
     return;
   }
 
+  if (IsFp4PackedStorage(buffer_var, buffer->dtype) && t.is_float4_e2m1fn() &&
+      t.lanes() > 1) {
+    arith::Analyzer analyzer;
+    bool base_aligned = is_zero(analyzer.Simplify(truncmod(base, 2)));
+    if (!base_aligned) {
+      std::ostringstream vec_type;
+      PrintType(t, vec_type);
+      std::string vid = GetVarID(buffer_var);
+      this->PrintIndent();
+      this->stream << "{ " << vec_type.str() << " __tl_fp4_vec = " << value
+                   << "; ";
+      for (int i = 0; i < t.lanes(); ++i) {
+        std::ostringstream elem;
+        PrintVecElemLoad("__tl_fp4_vec", t, i, elem);
+        PrimExpr index = analyzer.Simplify(
+            base + IntImm(base.dtype(), static_cast<int64_t>(i)));
+        this->stream << "tl_fp4_packed_store((fp4_e2_2_t*)" << vid << ", "
+                     << PrintExpr(index) << ", " << elem.str() << "); ";
+      }
+      this->stream << "}\n";
+      return;
+    }
+  }
+
   std::string scope;
   if (alloc_storage_scope_.count(buffer_var)) {
     scope = alloc_storage_scope_.at(buffer_var);
diff --git a/tilelang/cuda/op/gemm/gemm_mma.py b/tilelang/cuda/op/gemm/gemm_mma.py
@@ -57,6 +57,11 @@ def _make_mma_emitter(self, target: Target, thread_nums: int, thread_var: tir.Va
             chunk=self.chunk,
             thread_var=thread_var,
         )
+        if self.chunk % emitter.micro_size_k != 0:
+            raise ValueError(
+                f"T.gemm K tile ({self.chunk}) must be divisible by MMA instruction K tile "
+                f"({emitter.micro_size_k}) for A={self.A.dtype}, B={self.B.dtype}"
+            )
         return emitter
 
     def infer_layout(self, target: Target, thread_nums: int):