tile-ai
diff --git a/‎CMakeLists.txt‎
Lines changed: 7 additions & 0 deletions b/‎CMakeLists.txt‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎benchmark/matmul_metal/benchmark_matmul_metal.py‎
Lines changed: 119 additions & 0 deletions b/‎benchmark/matmul_metal/benchmark_matmul_metal.py‎
Lines changed: 119 additions & 0 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 1 addition & 0 deletions b/‎pyproject.toml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎requirements-dev.txt‎
Lines changed: 1 addition & 0 deletions b/‎requirements-dev.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎requirements.txt‎
Lines changed: 1 addition & 0 deletions b/‎requirements.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/op/copy.cc‎
Lines changed: 97 additions & 3 deletions b/‎src/op/copy.cc‎
Lines changed: 97 additions & 3 deletions
diff --git a/‎src/op/copy.h‎
Lines changed: 17 additions & 4 deletions b/‎src/op/copy.h‎
Lines changed: 17 additions & 4 deletions
diff --git a/‎src/op/fill.cc‎
Lines changed: 24 additions & 1 deletion b/‎src/op/fill.cc‎
Lines changed: 24 additions & 1 deletion
diff --git a/‎src/op/gemm.cc‎
Lines changed: 7 additions & 2 deletions b/‎src/op/gemm.cc‎
Lines changed: 7 additions & 2 deletions
@@ -190,6 +190,13 @@ list(APPEND TILE_LANG_SRCS
   src/runtime/error_helpers.cc
 )
 
+# Metal codegen is pure C++ (no Apple frameworks) and can generate Metal shader
+# source on any platform.  Always compile it so that "target.build.tilelang_metal"
+# is available for cross-compilation on Linux/Windows.
+list(APPEND TILE_LANG_SRCS
+  src/target/codegen_metal.cc
+)
+
 set(TILELANG_OUTPUT_TARGETS tilelang tvm)
 
 # Track if the user explicitly selected a backend via cache options.
 
@@ -0,0 +1,119 @@
+import argparse
+import logging
+import time
+
+import torch
+
+import tilelang
+import tilelang.language as T
+
+# Only emit tilelang log records at WARNING level or above.
+logging.getLogger("tilelang").setLevel(logging.WARNING)
+
+# (block_M, block_N, block_K) tile shapes benchmarked when --sweep is passed.
+BLOCK_CONFIGS = [
+    (16, 16, 16),
+    (32, 32, 16),
+    (32, 32, 32),
+    (64, 64, 32),
+]
+
+
+# Builds a tiled GEMM kernel over A (M x K), B (K x N) and C (M x N).
+# A and B use `dtype`; the accumulator tile and C use `accum_dtype`.
+# block_M / block_N / block_K are the tile sizes along M, N and K.
+@tilelang.jit
+def matmul_simdgroup(M, N, K, block_M=64, block_N=64, block_K=32, dtype=T.float16, accum_dtype=T.float32):
+
+    @T.prim_func
+    def gemm_kernel(
+        A: T.Tensor((M, K), dtype),
+        B: T.Tensor((K, N), dtype),
+        C: T.Tensor((M, N), accum_dtype),
+    ):
+        # Launch grid: ceildiv(N, block_N) x ceildiv(M, block_M), 128 threads
+        # each; bx indexes tiles along N and by indexes tiles along M.
+        with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by):
+            # Staging tiles for A and B, allocated with scope="shared".
+            A_shared = T.alloc_shared((block_M, block_K), dtype, scope="shared")
+            B_shared = T.alloc_shared((block_K, block_N), dtype, scope="shared")
+            # Accumulator tile, cleared before the K loop.
+            C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
+            T.clear(C_local)
+            # Step through K in block_K chunks; num_stages=0 is passed to
+            # T.Pipelined (presumably disabling software pipelining -- confirm).
+            for ko in T.Pipelined(T.ceildiv(K, block_K), num_stages=0):
+                T.copy(A[by * block_M, ko * block_K], A_shared)
+                T.copy(B[ko * block_K, bx * block_N], B_shared)
+                # Presumably accumulates A_shared @ B_shared into C_local.
+                T.gemm(A_shared, B_shared, C_local)
+            # Write the accumulator tile to its (by, bx) position in C.
+            T.copy(C_local, C[by * block_M, bx * block_N])
+
+    return gemm_kernel
+
+
+def _tflops(M, N, K, seconds):
+    """Return TFLOPS for an M x N x K matmul that took `seconds`.
+
+    The operation count is taken as 2 * M * N * K.
+    """
+    flop_count = 2.0 * M * N * K
+    return flop_count / seconds / 1e12
+
+
+def _bench(fn, warmup, repeats):
+    """Return the average wall-clock seconds per call of `fn` on MPS.
+
+    Args:
+        fn: Zero-argument callable that enqueues the work to time.
+        warmup: Number of untimed calls made before timing starts.
+        repeats: Number of timed calls; must be positive.
+
+    Raises:
+        ValueError: If `repeats` is not positive. Without this check a zero
+            count divides by zero and a negative count yields a negative time.
+    """
+    if repeats <= 0:
+        raise ValueError(f"repeats must be a positive integer, got {repeats}")
+    for _ in range(warmup):
+        fn()
+    # Drain warmup work so it is not attributed to the timed region.
+    torch.mps.synchronize()
+    start = time.perf_counter()
+    for _ in range(repeats):
+        fn()
+    # Wait for all timed work to finish before reading the clock.
+    torch.mps.synchronize()
+    elapsed = time.perf_counter() - start
+    return elapsed / repeats
+
+
+def bench_torch_mps(M, N, K, warmup, repeats):
+    """Benchmark `torch.mm` on float16 MPS tensors and return its TFLOPS."""
+    lhs = torch.randn(M, K, dtype=torch.float16, device="mps")
+    rhs = torch.randn(K, N, dtype=torch.float16, device="mps")
+
+    def run_reference():
+        return torch.mm(lhs, rhs)
+
+    seconds_per_call = _bench(run_reference, warmup, repeats)
+    return _tflops(M, N, K, seconds_per_call)
+
+
+def bench_tilelang(M, N, K, block_M, block_N, block_K, warmup, repeats):
+    """Benchmark the TileLang kernel for one block config and return its TFLOPS.
+
+    Inputs are float16 MPS tensors; the output buffer is float32.
+    """
+    gemm = matmul_simdgroup(M, N, K, block_M, block_N, block_K)
+    lhs = torch.randn(M, K, dtype=torch.float16, device="mps")
+    rhs = torch.randn(K, N, dtype=torch.float16, device="mps")
+    out = torch.zeros(M, N, dtype=torch.float32, device="mps")
+
+    def run_kernel():
+        return gemm(lhs, rhs, out)
+
+    seconds_per_call = _bench(run_kernel, warmup, repeats)
+    return _tflops(M, N, K, seconds_per_call)
+
+
+if __name__ == "__main__":
+    # Command-line entry point: time torch.mm on MPS as the reference, then
+    # time the TileLang kernel for one block config (or all with --sweep) and
+    # print each result as TFLOPS and as a percentage of the reference.
+    parser = argparse.ArgumentParser(description="Metal GEMM Benchmark (simdgroup)")
+    parser.add_argument("--m", type=int, default=4096)
+    parser.add_argument("--n", type=int, default=4096)
+    parser.add_argument("--k", type=int, default=4096)
+    parser.add_argument("--warmup", type=int, default=10)
+    parser.add_argument("--repeats", type=int, default=100)
+    parser.add_argument("--sweep", action="store_true", help="Sweep all block configs instead of using default (64,64,32)")
+    args = parser.parse_args()
+
+    M, N, K = args.m, args.n, args.k
+
+    print(f"torch:    {torch.__version__}")
+    print(f"tilelang: {tilelang.__version__}")
+    print(f"MPS:      {torch.backends.mps.is_available()}")
+    print(f"M={M}, N={N}, K={K}, warmup={args.warmup}, repeats={args.repeats}")
+    print()
+
+    ref_tflops = bench_torch_mps(M, N, K, args.warmup, args.repeats)
+    print(f"PyTorch MPS (torch.mm fp16): {ref_tflops:.1f} TFLOPS")
+    print()
+
+    configs = BLOCK_CONFIGS if args.sweep else [(64, 64, 32)]
+
+    print(f"{'block (M,N,K)':>16s} | {'TileLang':>14s} | {'Ratio':>6s}")
+    print("-" * 44)
+
+    best_tflops = 0.0
+    # Stays None until some config reports non-zero throughput, so a sweep in
+    # which everything fails is not summarized as if configs[0] had won.
+    best_config = None
+    for bM, bN, bK in configs:
+        label = f"({bM},{bN},{bK})"
+        try:
+            tl = bench_tilelang(M, N, K, bM, bN, bK, args.warmup, args.repeats)
+            ratio = tl / ref_tflops * 100
+            if tl > best_tflops:
+                best_tflops = tl
+                best_config = (bM, bN, bK)
+            print(f"{label:>16s} | {tl:>10.1f} TFLOPS | {ratio:>5.0f}%")
+        except Exception as e:
+            # Best-effort sweep: report the failing config and keep going.
+            print(f"{label:>16s} | {'FAILED':>14s} | {e}")
+
+    if args.sweep:
+        print()
+        if best_config is None:
+            print("Best config: none (no configuration completed with non-zero throughput)")
+        else:
+            print(f"Best config: {best_config}")
+            print(f"Best TFlops: {best_tflops:.1f}")
+        print(f"Reference TFlops (PyTorch MPS): {ref_tflops:.1f}")
@@ -31,6 +31,7 @@ dependencies = [
     # requirement as wide as possible to be compatible with other libraries
     # pip will try to use latest version whenever possible.
     "apache-tvm-ffi~=0.1.0,>=0.1.2",
+    "apache-tvm-ffi<0.1.8; platform_system == 'Darwin'",
     # torch-c-dlpack-ext provides prebuilt torch extensions.
     # Without it, TVM FFI may require JIT compilation on first import.
     "torch-c-dlpack-ext; python_version < '3.14'",
 
@@ -1,6 +1,7 @@
 # Requirements to run local build with `--no-build-isolation` or other developments
 
 apache-tvm-ffi~=0.1.0,>=0.1.2
+apache-tvm-ffi<0.1.8; platform_system == 'Darwin'
 build
 cmake>=3.26
 cython>=3.1.0
 
@@ -1,6 +1,7 @@
 # Runtime requirements
 
 apache-tvm-ffi~=0.1.0,>=0.1.2
+apache-tvm-ffi<0.1.8; platform_system == 'Darwin'
 torch-c-dlpack-ext; python_version < '3.14'
 cloudpickle
 ml-dtypes
 
@@ -517,6 +517,10 @@ LayoutMap CopyNode::InferLayout(const LayoutInferArgs &T,
     return result_map;
   }
 
+  if (copy_inst == CopyInst::kMetalSIMDGroup) {
+    return {};
+  }
+
   // for LDSM/STSM, the layout was deduced from register layout
   // so we can directly apply the layout of normal copy
   // Use parallel op to infer the layout
@@ -792,11 +796,16 @@ bool CopyNode::CheckCPAsyncCopy(Target target, const LayoutMap &layout_map,
   if (!CheckCPAsyncCopyPreconditions()) {
     return false;
   }
-  // Skip vectorize size check here because, during the Infer Layout stage,
-  // the layout is not stable and the vectorized size cannot be determined.
   return true;
 }
 
+// Returns true when this copy can take the Metal simdgroup path: the target is
+// Metal, the source is a simdgroup buffer, and the destination is a shared or
+// global buffer.
+bool CopyNode::CheckSIMDGroupCopy(Target target) const {
+  const bool from_simdgroup_on_metal =
+      TargetIsMetal(target) && IsSIMDGroupBuffer(src);
+  if (!from_simdgroup_on_metal) {
+    return false;
+  }
+  return IsSharedBuffer(dst) || IsGlobalBuffer(dst);
+}
+
 // Selects the most specific copy instruction for the given target and buffers.
 // Priority: BulkLoad1D, BulkStore1D, BulkLoad, BulkStore, LDSM, STSM,
 // TMemLoad, TMemStore, CPAsync, Normal.
@@ -864,6 +873,8 @@ CopyInst CopyNode::GetCopyInst(Target target, const LayoutMap &layout_map,
     return CopyInst::kTMemLoad;
   } else if (CheckTMemStore(target)) {
     return CopyInst::kTMemStore;
+  } else if (CheckSIMDGroupCopy(target)) {
+    return CopyInst::kMetalSIMDGroup;
   } else {
     return CopyInst::kNormal;
   }
@@ -897,6 +908,8 @@ Stmt CopyNode::Lower(const LowerArgs &T, arith::Analyzer *analyzer) const {
     auto cp_async_copy = LowerCPAsyncCopy(T, analyzer);
     ICHECK(cp_async_copy.defined()) << "Failed to lower cp.async copy";
     return cp_async_copy;
+  } else if (copy_inst == CopyInst::kMetalSIMDGroup) {
+    return LowerSIMDGroupCopy(T, analyzer);
   } else if (copy_inst == CopyInst::kNormal) {
     return LowerNormalCopy(T, analyzer);
   } else {
@@ -982,7 +995,88 @@ Stmt CopyNode::LowerCPAsyncCopy(const LowerArgs &T,
   return cp_async_loop;
 }
 
-// Lowers the copy using standard load/store with loop transformations.
+// Lowers a copy whose source is a Metal simdgroup buffer into a sequence of
+// simdgroup_store calls, one per 8x8 tile handled by the calling warp.
+//
+// The M x N source region is split across an m_warp x n_warp grid of warps,
+// chosen among the factorizations of the warp count so that the per-warp tile
+// aspect ratio is closest to M / N. Each warp then stores
+// (M / m_warp / 8) x (N / n_warp / 8) tiles into `dst`, starting at the
+// destination region's minimum.
+//
+// Every buffer shape, region extent and the thread extent must be a
+// compile-time constant, and the region must divide evenly into 8x8 tiles
+// across the chosen warp grid; any violation fails an ICHECK instead of
+// dereferencing null or silently dropping rows/columns.
+Stmt CopyNode::LowerSIMDGroupCopy(const LowerArgs &T,
+                                  arith::Analyzer *analyzer) const {
+  ICHECK(IsSIMDGroupBuffer(src));
+  // Accumulate in 64 bits: IntImm values are int64_t.
+  int64_t total_elements = 1;
+  for (auto s : src->shape) {
+    auto imm = s.as<IntImmNode>();
+    ICHECK(imm) << "simdgroup buffer must have constant shape";
+    total_elements *= imm->value;
+  }
+  ICHECK(total_elements % 64 == 0)
+      << "simdgroup buffer size must be multiple of 64 (8x8), got "
+      << total_elements;
+
+  ICHECK(src_range.size() == 2) << "Expected 2D source for simdgroup store";
+  ICHECK(dst_range.size() == 2)
+      << "Expected 2D destination for simdgroup store";
+  PrimExpr dst_row_base = dst_range[0]->min;
+  PrimExpr dst_col_base = dst_range[1]->min;
+  // The innermost destination dimension is passed as the row stride.
+  PrimExpr dst_stride = dst->shape[dst->shape.size() - 1];
+
+  int warp_size = TargetGetWarpSize(T.target);
+  ICHECK(warp_size > 0) << "Invalid warp size " << warp_size;
+  auto block_extent = T.thread_bounds->extent.as<IntImmNode>();
+  ICHECK(block_extent) << "simdgroup store requires a constant thread extent";
+  int block_size = static_cast<int>(block_extent->value);
+  int num_warps = block_size / warp_size;
+  // With zero warps n_warp would be zero and N / n_warp below would trap.
+  ICHECK(num_warps > 0) << "simdgroup store requires at least one full warp, "
+                        << "got " << block_size << " threads with warp size "
+                        << warp_size;
+  PrimExpr warp_id = FloorDiv(T.thread_var, warp_size);
+
+  auto m_extent = src_range[0]->extent.as<IntImmNode>();
+  auto n_extent = src_range[1]->extent.as<IntImmNode>();
+  ICHECK(m_extent && n_extent)
+      << "simdgroup store requires constant source extents";
+  int M = static_cast<int>(m_extent->value);
+  int N = static_cast<int>(n_extent->value);
+
+  constexpr int kMPerWarp = 8;
+  constexpr int kNPerWarp = 8;
+  // Fallback when no factorization fits; rejected by the ICHECK below if it
+  // cannot tile the region.
+  int m_warp = 1, n_warp = num_warps;
+  int max_m = M / kMPerWarp;
+  int max_n = N / kNPerWarp;
+  float ideal = N > 0 ? static_cast<float>(M) / N : 1.f;
+  float best_score = std::numeric_limits<float>::max();
+  for (int m = 1; m <= std::min(num_warps, max_m); ++m) {
+    if (num_warps % m != 0)
+      continue;
+    int n = num_warps / m;
+    if (n > max_n)
+      continue;
+    float m_per = static_cast<float>(M) / (m * kMPerWarp);
+    float n_per = static_cast<float>(N) / (n * kNPerWarp);
+    float score = std::abs(m_per / n_per - ideal);
+    if (score < best_score) {
+      best_score = score;
+      m_warp = m;
+      n_warp = n;
+    }
+  }
+
+  ICHECK(M >= m_warp * kMPerWarp && N >= n_warp * kNPerWarp)
+      << "Cannot partition " << M << "x" << N << " matrix across " << m_warp
+      << "x" << n_warp << " warps with 8x8 simdgroup tiles";
+  // Integer division below would otherwise drop the remainder rows/columns
+  // and leave part of the destination unwritten.
+  ICHECK(M % (m_warp * kMPerWarp) == 0 && N % (n_warp * kNPerWarp) == 0)
+      << "Matrix " << M << "x" << N << " is not evenly divisible into 8x8 "
+      << "simdgroup tiles across " << m_warp << "x" << n_warp << " warps";
+  int warp_row_tiles = M / m_warp / kMPerWarp;
+  int warp_col_tiles = N / n_warp / kNPerWarp;
+  ICHECK(warp_row_tiles > 0 && warp_col_tiles > 0);
+  ICHECK(static_cast<int64_t>(warp_row_tiles) * warp_col_tiles * 64 <=
+         total_elements)
+      << "Warp partition produces more tiles than buffer capacity";
+
+  // Warps are laid out with the M index varying fastest.
+  PrimExpr warp_m = FloorMod(warp_id, m_warp);
+  PrimExpr warp_n = FloorDiv(warp_id, m_warp);
+
+  Array<Stmt> stmts;
+  for (int i = 0; i < warp_row_tiles; i++) {
+    for (int j = 0; j < warp_col_tiles; j++) {
+      // Tiles are indexed row-major within the warp's sub-block.
+      int tile_idx = i * warp_col_tiles + j;
+      PrimExpr row = dst_row_base + warp_m * (warp_row_tiles * 8) + i * 8;
+      PrimExpr col = dst_col_base + warp_n * (warp_col_tiles * 8) + j * 8;
+      PrimExpr ptr = Call(DataType::Handle(), builtin::address_of(),
+                          {BufferLoad(dst, {row, col})});
+      // Arguments: source data, tile index, destination pointer, stride,
+      // tile dimensions 8 and 8, and a false flag (presumably "transpose" --
+      // confirm against the simdgroup_store builtin).
+      stmts.push_back(Evaluate(
+          Call(DataType::Handle(), builtin::simdgroup_store(),
+               {src->data, IntImm(DataType::Int(32), tile_idx), ptr, dst_stride,
+                IntImm(DataType::Int(32), 8), IntImm(DataType::Int(32), 8),
+                Cast(DataType::Bool(), IntImm(DataType::Int(32), 0))})));
+    }
+  }
+  if (stmts.size() == 1)
+    return stmts[0];
+  return SeqStmt(stmts);
+}
+
+
 Stmt CopyNode::LowerNormalCopy(const LowerArgs &T,
                                arith::Analyzer *analyzer) const {
   bool is_cpu_target = T.target->GetTargetDeviceType() == kDLCPU;
 
@@ -24,10 +24,11 @@ enum class CopyInst : uint8_t {
   kCPAsync = 5,   // cp.async global->shared copy
   // we should separate the bulk load and store for 1d and multi-dim
   // as they have different memory access patterns
-  kBulkLoad1D = 6,  // utilize tma load 1d
-  kBulkStore1D = 7, // utilize tma store 1d
-  kTMemLoad = 8,    // tcgen05.ld (tensor memory -> register)
-  kTMemStore = 9,   // tcgen05.st (register -> tensor memory)
+  kBulkLoad1D = 6,      // utilize tma load 1d
+  kBulkStore1D = 7,     // utilize tma store 1d
+  kTMemLoad = 8,        // tcgen05.ld (tensor memory -> register)
+  kTMemStore = 9,       // tcgen05.st (register -> tensor memory)
+  kMetalSIMDGroup = 10, // Metal simdgroup load/store
 };
 
 /// Convert CopyInst enum to string for debugging
@@ -53,6 +54,8 @@ inline const char *CopyInstToString(CopyInst inst) {
     return "TMemLoad";
   case CopyInst::kTMemStore:
     return "TMemStore";
+  case CopyInst::kMetalSIMDGroup:
+    return "MetalSIMDGroup";
   default:
     return "Unknown";
   }
@@ -290,6 +293,11 @@ class CopyNode : public TileOperatorNode {
                         arith::Analyzer *analyzer) const;
 
 protected:
+  /*!
+   * \brief Check if copy from Metal simdgroup to shared/global is supported.
+   */
+  bool CheckSIMDGroupCopy(Target target) const;
+
   /*!
    * \brief Get the copy instruction type.
    */
@@ -331,6 +339,11 @@ class CopyNode : public TileOperatorNode {
    */
   Stmt LowerCPAsyncCopy(const LowerArgs &T, arith::Analyzer *analyzer) const;
 
+  /*!
+   * \brief Generate lowering for simdgroup store.
+   */
+  Stmt LowerSIMDGroupCopy(const LowerArgs &T, arith::Analyzer *analyzer) const;
+
   /*!
    * \brief Generate SIMT (thread-level) loop for copying.
    */
 
@@ -156,7 +156,30 @@ For FillNode::MakeSIMTLoop(arith::Analyzer *analyzer) const {
  * @return Stmt The lowered TIR statement implementing the fill.
  */
 Stmt FillNode::Lower(const LowerArgs &T, arith::Analyzer *analyzer) const {
-  if (IsFragmentBuffer(dst)) {
+  if (IsSIMDGroupBuffer(dst)) {
+    int region_elements = 1;
+    for (auto r : region) {
+      auto imm = r->extent.as<IntImmNode>();
+      ICHECK(imm) << "simdgroup fill region must have constant extents";
+      region_elements *= imm->value;
+    }
+    int total_elements = region_elements;
+    ICHECK(total_elements % 64 == 0)
+        << "simdgroup buffer size must be multiple of 64 (8x8), got "
+        << total_elements;
+    int num_matrices = total_elements / 64;
+    PrimExpr fill_value = Cast(dst->dtype, value);
+    Array<Stmt> stmts;
+    for (int i = 0; i < num_matrices; i++) {
+      stmts.push_back(Evaluate(
+          Call(DataType::Handle(), builtin::make_filled_simdgroup_matrix(),
+               {dst->data, IntImm(DataType::Int(32), i), fill_value,
+                IntImm(DataType::Int(32), 8), IntImm(DataType::Int(32), 8)})));
+    }
+    if (stmts.size() == 1)
+      return stmts[0];
+    return SeqStmt(stmts);
+  } else if (IsFragmentBuffer(dst)) {
     auto par_op = ParallelOp(MakeSIMTLoop(analyzer));
     par_op->InferLayout({T.target,
                          T.thread_bounds,
 
@@ -183,6 +183,8 @@ GemmInst GemmNode::getGemmInst(int block_size, Target target) const {
     return GemmInst::kMMA;
   } else if (TargetIsCPU(target)) {
     return GemmInst::kScalar;
+  } else if (TargetIsMetal(target)) {
+    return GemmInst::kMetalSimdgroup;
   } else {
     ICHECK(0) << "Unsupported target for gemm: " << target->str();
     return GemmInst::kMMA;
@@ -199,8 +201,11 @@ std::pair<int, int> GemmWarpPolicyNode::computeWarpPartition(
   }
 
   int m_warp = 1, n_warp = 1;
-  constexpr int kMPerWarp = 16; // Rows processed by a single warp
-  int kNPerWarp = 8;            // Columns processed by a single warp
+  int kMPerWarp = 16; // Rows processed by a single warp
+  if (TargetIsMetal(target)) {
+    kMPerWarp = 8;
+  }
+  int kNPerWarp = 8; // Columns processed by a single warp
   if (TargetIsVolta(target)) {
     kNPerWarp = 16;
   } else if (TargetIsCDNA(target)) {