DatasunriseOU
diff --git a/‎docs/upstream/tilelang_metal_emit_metal_builtins/0001-metal-emit-builtins-directly.patch‎
Lines changed: 75 additions & 0 deletions b/‎docs/upstream/tilelang_metal_emit_metal_builtins/0001-metal-emit-builtins-directly.patch‎
Lines changed: 75 additions & 0 deletions
diff --git a/‎docs/upstream/tilelang_metal_emit_metal_builtins/0002-tvm-metal-emit-builtins-directly.patch‎
Lines changed: 69 additions & 0 deletions b/‎docs/upstream/tilelang_metal_emit_metal_builtins/0002-tvm-metal-emit-builtins-directly.patch‎
Lines changed: 69 additions & 0 deletions
diff --git a/‎docs/upstream/tilelang_metal_emit_metal_builtins/README.md‎
Lines changed: 135 additions & 0 deletions b/‎docs/upstream/tilelang_metal_emit_metal_builtins/README.md‎
Lines changed: 135 additions & 0 deletions
@@ -0,0 +1,75 @@
+diff --git a/src/target/codegen_metal.cc b/src/target/codegen_metal.cc
+index faa43a35..e924ef99 100644
+--- a/src/target/codegen_metal.cc
++++ b/src/target/codegen_metal.cc
+@@ -153,8 +153,15 @@ void CodeGenTileLangMetal::AddFunction(const GlobalVar &gvar,
+     decl_stream << "};\n\n";
+   }
+   // Setup the thread group info.
++  // Reserve the CUDA-style alias names so user code or downstream passes
++  // cannot accidentally collide with them, even though the kernel itself
++  // emits Metal builtin names directly (no `blockIdx`/`threadIdx` aliases).
+   ICHECK_EQ(name_supply_->FreshName("threadIdx"), "threadIdx");
+   ICHECK_EQ(name_supply_->FreshName("blockIdx"), "blockIdx");
++  ICHECK_EQ(name_supply_->FreshName("threadgroup_position_in_grid"),
++            "threadgroup_position_in_grid");
++  ICHECK_EQ(name_supply_->FreshName("thread_position_in_threadgroup"),
++            "thread_position_in_threadgroup");
+   int work_dim = 0;
+   auto launch_params =
+       func->GetAttr<ffi::Array<ffi::String>>(tir::attr::kKernelLaunchParams)
+@@ -167,13 +174,22 @@ void CodeGenTileLangMetal::AddFunction(const GlobalVar &gvar,
+   }
+ 
+   if (work_dim != 0) {
+-    // use ushort by default for now
++    // Emit Metal builtin names directly as the kernel parameter identifiers
++    // rather than using CUDA-style `blockIdx`/`threadIdx` aliases. This means
++    // body references are emitted as e.g. `threadgroup_position_in_grid.x`
++    // instead of `blockIdx.x`, which:
++    //   - matches Apple's MSL convention,
++    //   - eliminates a redundant naming layer that downstream MSL passes had
++    //     to undo with regex-based string substitution (see cppmega.mlx
++    //     `_msl_transform.py::_canonicalize_tilelang_builtin_aliases`),
++    //   - removes intermediate dead-alias declarations that callers had to
++    //     strip manually.
+     stream << "  ";
+     PrintType(DataType::UInt(thread_index_bits_, work_dim), stream);
+-    stream << " blockIdx [[threadgroup_position_in_grid]],\n";
++    stream << " threadgroup_position_in_grid [[threadgroup_position_in_grid]],\n";
+     stream << "  ";
+     PrintType(DataType::UInt(thread_index_bits_, work_dim), stream);
+-    stream << " threadIdx [[thread_position_in_threadgroup]]\n";
++    stream << " thread_position_in_threadgroup [[thread_position_in_threadgroup]]\n";
+   }
+   thread_work_dim_ = work_dim;
+ 
+@@ -188,11 +204,24 @@ void CodeGenTileLangMetal::AddFunction(const GlobalVar &gvar,
+ 
+ void CodeGenTileLangMetal::BindThreadIndex(const IterVar &iv) {
+   ICHECK(!var_idmap_.count(iv->var.get()));
+-  // if we only have threadIdx.x
+-  // metal will directly print as threadIdx
++  // The thread_tag is the CUDA-style name (e.g. "threadIdx.x", "blockIdx.y").
++  // Translate to the Metal builtin reference so emitted body references
++  // resolve directly against the kernel parameters declared in AddFunction
++  // (which now use the Metal builtin names verbatim instead of the
+  // blockIdx/threadIdx aliases). The axis suffix is kept only if work_dim > 1.
+   std::string vname = iv->thread_tag;
+-  if (thread_work_dim_ <= 1) {
+-    vname = vname.substr(0, iv->thread_tag.length() - 2);
++  std::string axis;
++  if (vname.length() >= 2 && vname[vname.length() - 2] == '.') {
++    axis = vname.substr(vname.length() - 2);  // ".x" / ".y" / ".z"
++    vname = vname.substr(0, vname.length() - 2);
++  }
++  if (vname == "threadIdx") {
++    vname = "thread_position_in_threadgroup";
++  } else if (vname == "blockIdx") {
++    vname = "threadgroup_position_in_grid";
++  }
++  if (thread_work_dim_ > 1) {
++    vname += axis;
+   }
+   var_idmap_[iv->var.get()] =
+       CastFromTo(vname, DataType::UInt(thread_index_bits_), iv->var.dtype());
@@ -0,0 +1,69 @@
+diff --git a/src/target/source/codegen_metal.cc b/src/target/source/codegen_metal.cc
+index 0104277..2645c8e 100644
+--- a/src/target/source/codegen_metal.cc
++++ b/src/target/source/codegen_metal.cc
+@@ -146,8 +146,15 @@ void CodeGenMetal::AddFunction(const GlobalVar& gvar, const PrimFunc& func) {
+     decl_stream << "};\n\n";
+   }
+   // Setup the thread group info.
++  // Reserve the CUDA-style alias names so user code or downstream passes
++  // cannot accidentally collide with them, even though the kernel itself
++  // emits Metal builtin names directly (no `blockIdx`/`threadIdx` aliases).
+   ICHECK_EQ(name_supply_->FreshName("threadIdx"), "threadIdx");
+   ICHECK_EQ(name_supply_->FreshName("blockIdx"), "blockIdx");
++  ICHECK_EQ(name_supply_->FreshName("threadgroup_position_in_grid"),
++            "threadgroup_position_in_grid");
++  ICHECK_EQ(name_supply_->FreshName("thread_position_in_threadgroup"),
++            "thread_position_in_threadgroup");
+   int work_dim = 0;
+   auto launch_params =
+       func->GetAttr<ffi::Array<ffi::String>>(tir::attr::kKernelLaunchParams).value();
+@@ -159,13 +166,16 @@ void CodeGenMetal::AddFunction(const GlobalVar& gvar, const PrimFunc& func) {
+   }
+ 
+   if (work_dim != 0) {
+-    // use ushort by default for now
++    // Emit Metal builtin names directly as the kernel parameter identifiers
++    // rather than using CUDA-style `blockIdx`/`threadIdx` aliases. This keeps
++    // body references aligned with Apple's MSL convention and avoids forcing
++    // downstream passes to canonicalize the alias back to the Metal builtin.
+     stream << "  ";
+     PrintType(DataType::UInt(thread_index_bits_, work_dim), stream);
+-    stream << " blockIdx [[threadgroup_position_in_grid]],\n";
++    stream << " threadgroup_position_in_grid [[threadgroup_position_in_grid]],\n";
+     stream << "  ";
+     PrintType(DataType::UInt(thread_index_bits_, work_dim), stream);
+-    stream << " threadIdx [[thread_position_in_threadgroup]]\n";
++    stream << " thread_position_in_threadgroup [[thread_position_in_threadgroup]]\n";
+   }
+   thread_work_dim_ = work_dim;
+ 
+@@ -180,11 +190,24 @@ void CodeGenMetal::AddFunction(const GlobalVar& gvar, const PrimFunc& func) {
+ 
+ void CodeGenMetal::BindThreadIndex(const IterVar& iv) {
+   ICHECK(!var_idmap_.count(iv->var.get()));
+-  // if we only have threadIdx.x
+-  // metal will directly print as threadIdx
++  // The thread_tag is the CUDA-style name (e.g. "threadIdx.x", "blockIdx.y").
++  // Translate to the Metal builtin reference so emitted body references
++  // resolve directly against the kernel parameters declared in AddFunction
++  // (which now use the Metal builtin names verbatim instead of the
+  // blockIdx/threadIdx aliases). The axis suffix is kept only if work_dim > 1.
+   std::string vname = iv->thread_tag;
+-  if (thread_work_dim_ <= 1) {
+-    vname = vname.substr(0, iv->thread_tag.length() - 2);
++  std::string axis;
++  if (vname.length() >= 2 && vname[vname.length() - 2] == '.') {
++    axis = vname.substr(vname.length() - 2);  // ".x" / ".y" / ".z"
++    vname = vname.substr(0, vname.length() - 2);
++  }
++  if (vname == "threadIdx") {
++    vname = "thread_position_in_threadgroup";
++  } else if (vname == "blockIdx") {
++    vname = "threadgroup_position_in_grid";
++  }
++  if (thread_work_dim_ > 1) {
++    vname += axis;
+   }
+   var_idmap_[iv->var.get()] =
+       CastFromTo(vname, DataType::UInt(thread_index_bits_), iv->var.dtype());
@@ -0,0 +1,135 @@
+# TileLang Metal codegen: emit Metal builtins directly
+
+Filed PR: https://github.com/tile-ai/tilelang/pull/2143
+Branch: `apstenku123:cppmega/metal-emit-builtins-directly`
+Stacks on: tile-ai/tilelang#2130 (jorgecurious metal-gemm-upstream-rebase)
+
+## What this fixes
+
+TileLang's Metal codegen names the thread/block kernel-launch parameters
+using the CUDA-style identifiers `blockIdx` and `threadIdx`. The MSL output
+of `lower(prim_func, target='metal')` therefore looks like:
+
+```cpp
+kernel void smoke_kernel(
+    device const half4* A [[ buffer(0) ]],
+    device half4* C [[ buffer(1) ]],
+    uint3 blockIdx [[threadgroup_position_in_grid]],
+    uint3 threadIdx [[thread_position_in_threadgroup]]
+) {
+    C[((((int)threadIdx.x) * 4) / 4)] = A[((((int)threadIdx.x) * 4) / 4)];
+}
+```
+
+The named parameters mirror CUDA's `blockIdx.x`/`threadIdx.x`, but downstream
+consumers that inline the body of `kernel void` into another kernel (e.g.
+the cppmega.mlx Path C ports that splice TileLang-emitted bodies into
+`mx.fast.metal_kernel` `source=` strings) end up having to:
+
+* Inject `uint3 blockIdx = threadgroup_position_in_grid;` and
+  `uint3 threadIdx = thread_position_in_threadgroup;` shims so the body's
+  references still bind, then
+* Regex-substitute every `((int)threadIdx.x)` etc. back to
+  `((int)thread_position_in_threadgroup.x)`, then
+* Drop the now-dead alias declarations.
+
+See the canonicalization helpers in cppmega.mlx
+(`cppmega_mlx/nn/_tilelang/_msl_transform.py`):
+
+* `_metal_builtin_for_tilelang_alias`
+* `_rewrite_tilelang_builtin_axis`
+* `_rewrite_tilelang_builtin_axis_cast`
+* `_canonicalize_tilelang_builtin_aliases`
+* `_drop_alias_decl_if_unused`
+
+The whole chain is pure overhead: every consumer either lives with the
+alias or post-processes it back to the Metal builtin.
+
+## What this PR does
+
+In TileLang's Metal codegen, the thread/block kernel parameters are now
+declared using the Metal builtin identifiers themselves:
+
+```cpp
+uint3 threadgroup_position_in_grid [[threadgroup_position_in_grid]],
+uint3 thread_position_in_threadgroup [[thread_position_in_threadgroup]]
+```
+
+`BindThreadIndex` translates the CUDA-style `IterVar::thread_tag`
+(`"blockIdx.x"`, `"threadIdx.y"`, ...) to the matching Metal builtin
+reference (`threadgroup_position_in_grid.x`, ...) before recording it in
+`var_idmap_`. The body therefore emits `((int)threadgroup_position_in_grid.x)`
+directly. `name_supply_` reserves the new Metal builtin names and also keeps
+the legacy `blockIdx`/`threadIdx` names blocked, so the rest of the kernel
+cannot collide with either set.
+
+### Before / after MSL
+
+Before:
+
+```cpp
+kernel void smoke_kernel(
+    device const half4* A [[ buffer(0) ]],
+    device half4* C [[ buffer(1) ]],
+    uint3 blockIdx [[threadgroup_position_in_grid]],
+    uint3 threadIdx [[thread_position_in_threadgroup]]
+) {
+    C[((((int)threadIdx.x) * 4) / 4)] = A[((((int)threadIdx.x) * 4) / 4)];
+}
+```
+
+After:
+
+```cpp
+kernel void smoke_kernel(
+    device const half4* A [[ buffer(0) ]],
+    device half4* C [[ buffer(1) ]],
+    uint3 threadgroup_position_in_grid [[threadgroup_position_in_grid]],
+    uint3 thread_position_in_threadgroup [[thread_position_in_threadgroup]]
+) {
+    C[((((int)thread_position_in_threadgroup.x) * 4) / 4)] = A[((((int)thread_position_in_threadgroup.x) * 4) / 4)];
+}
+```
+
+## Files in this directory
+
+* `0001-metal-emit-builtins-directly.patch` - the TileLang half. Touches
+  `src/target/codegen_metal.cc`. ~36 LOC.
+* `0002-tvm-metal-emit-builtins-directly.patch` - the vendored TVM half.
+  Touches `3rdparty/tvm/src/target/source/codegen_metal.cc`. ~30 LOC.
+
+The TileLang half is filed as one PR to `tile-ai/tilelang`. The TVM half
+needs to land in `TileLang/tvm` (the vendored fork) before a TileLang
+release that bumps the submodule.
+
+## Stacking
+
+This PR depends on `jorgecurious/tilelang:metal-gemm-upstream-rebase`
+(PR tile-ai/tilelang#2130) for the simdgroup-store hardening that the
+Path C ports rely on. The diff applies cleanly on top of that branch.
+
+## Verification
+
+* Build: TileLang `ninja -j8` succeeds against the patched
+  `src/target/codegen_metal.cc` and submodule `codegen_metal.cc`.
+* Smoke test: `lower(prim_func, target='metal')` no longer emits
+  `int blockIdx_x = ...;`-style aliases; the Metal builtin name appears
+  directly in body references.
+* cppmega Path C tests still pass: the regex helpers in
+  `_msl_transform.py` become no-ops because the emitted MSL already uses
+  the Metal builtin names; the helpers are kept (idempotent) so the
+  fallback works against unpatched TileLang releases as well.
+
+## Risk
+
+* Limited to the `tilelang_metal` and `metal` codegen paths in TileLang
+  and the vendored TVM fork respectively. CUDA, ROCm, OpenCL, WebGPU,
+  and CPU codegen are unaffected.
+* MSL parameter names are user-chosen identifiers; renaming the
+  parameter does not change semantics as the `[[threadgroup_position_in_grid]]`
+  attribute is what binds the value. Apple's MSL spec permits the
+  parameter identifier to match the attribute name.
+* `name_supply_` reserves both the new Metal builtin names and the
+  legacy `blockIdx`/`threadIdx` names so a future user-defined symbol
+  cannot collide, and the existing assertion-based contract on the
+  reservation order is preserved.