[Kernels][stdlib] Replace DeviceContextPtr with direct use of DeviceContext

sabauma · modularbot · commit a1550d87c83c · 2026-05-16T07:19:33.000Z

The `DeviceContextPtr` type is a legacy type which was needed to satisfy
the requirement that kernel arguments be register passable. Internally,
`DeviceContextPtr` is structurally equivalent to `DeviceContext`, with
the caveat that it does not increment the reference count on the
`DeviceContext` C++ implementation.

Now that we have the EmitMojo backend, kernels are able to accept
`DeviceContext` or containers thereof. This change replaces
`DeviceContextPtr` with direct uses of `DeviceContext`.

Summary of changes:

* Kernel registrations now take a `DeviceContext` argument rather than a
`DeviceContextPtr` argument.
* Multi-device kernels now take a `DeviceContextList` type, rather than
`DeviceContextPtrList`.
* Kernel implementations which used the nullability of
`DeviceContextPtr` now take `Optional[DeviceContext]`. Where possible,
these cases were eliminated, since a null `DeviceContextPtr` is uncommon
now that `CpuDeviceContext` exists.

MODULAR_ORIG_COMMIT_REV_ID: fad38ebea8c999c67dd807ce13e5589d641ba38a
diff --git a/book/i18n/ko/src/puzzle_17/puzzle_17.md b/book/i18n/ko/src/puzzle_17/puzzle_17.md
@@ -189,7 +189,7 @@ Verification passed: Custom kernel results match NumPy calculation
            output: OutputTensor[rank=1],
            input: InputTensor[dtype = output.dtype, rank = output.rank],
            kernel: InputTensor[dtype = output.dtype, rank = output.rank],
-           ctx: DeviceContextPtr,
+           ctx: DeviceContext,
        ) raises:
            # 구현
    ```
@@ -248,7 +248,7 @@ struct Conv1DCustomOp:
         output: OutputTensor[rank=1],
         input: InputTensor[dtype = output.dtype, rank = output.rank],
         kernel: InputTensor[type = output.dtype, rank = output.rank],
-        ctx: DeviceContextPtr,
+        ctx: DeviceContext,
     ) raises:
         # 구현
 ```
@@ -259,7 +259,7 @@ struct Conv1DCustomOp:
   때 사용하는 이름
 - **구조체**에는 올바른 시그니처를 가진 `execute` 메서드가 있어야 함
 - **OutputTensor**와 **InputTensor** 타입이 파이썬 데이터와의 인터페이스를 정의
-- **DeviceContextPtr**이 실행 환경에 대한 접근을 제공
+- **DeviceContext**가 실행 환경에 대한 접근을 제공
 
 ### 커스텀 op 패키징
 
diff --git a/book/src/puzzle_17/puzzle_17.md b/book/src/puzzle_17/puzzle_17.md
@@ -192,7 +192,7 @@ Let's break down how this works in the larger context:
            output: OutputTensor[rank=1],
            input: InputTensor[dtype = output.dtype, rank = output.rank],
            kernel: InputTensor[dtype = output.dtype, rank = output.rank],
-           ctx: DeviceContextPtr,
+           ctx: DeviceContext,
        ) raises:
            # Implementation
    ```
@@ -252,7 +252,7 @@ struct Conv1DCustomOp:
         output: OutputTensor[rank=1],
         input: InputTensor[dtype = output.dtype, rank = output.rank],
         kernel: InputTensor[type = output.dtype, rank = output.rank],
-        ctx: DeviceContextPtr,
+        ctx: DeviceContext,
     ) raises:
         # Implementation here
 ```
@@ -264,7 +264,7 @@ Key components of the registration:
 - The **struct** must have an `execute` method with the correct signature
 - **OutputTensor** and **InputTensor** types define the interface for Python
   data
-- **DeviceContextPtr** provides access to the execution environment
+- **DeviceContext** provides access to the execution environment
 
 ### Packaging custom ops
 
diff --git a/problems/p17/op/conv1d.mojo b/problems/p17/op/conv1d.mojo
@@ -68,7 +68,7 @@ def conv1d_kernel[
 
 # ANCHOR: conv1d_custom_op
 import compiler
-from std.runtime.asyncrt import DeviceContextPtr
+
 from tensor import InputTensor, OutputTensor
 from std.memory import UnsafePointer
 from std.gpu.host import DeviceBuffer
@@ -88,14 +88,14 @@ struct Conv1DCustomOp:
         input: InputTensor[rank=output.rank, static_spec=_],
         kernel: InputTensor[rank=output.rank, static_spec=_],
         # the context is needed for some GPU calls
-        ctx: DeviceContextPtr,
+        ctx: DeviceContext,
     ) raises:
         var output_tensor = output.to_layout_tensor()
         var input_tensor = input.to_layout_tensor()
         var kernel_tensor = kernel.to_layout_tensor()
 
         comptime if target == "gpu":
-            var gpu_ctx = ctx.get_device_context()
+            var gpu_ctx = ctx
             # making sure the output tensor is zeroed out before the kernel is called
             gpu_ctx.enqueue_memset(
                 DeviceBuffer[output_tensor.dtype](
diff --git a/problems/p18/op/softmax.mojo b/problems/p18/op/softmax.mojo
@@ -60,7 +60,7 @@ def softmax_cpu_kernel[
 # ANCHOR_END: softmax_cpu_kernel
 
 import compiler
-from std.runtime.asyncrt import DeviceContextPtr
+
 from tensor import InputTensor, OutputTensor
 
 
@@ -74,7 +74,7 @@ struct SoftmaxCustomOp:
     ](
         output: OutputTensor[rank=1, static_spec=_],
         input: InputTensor[rank=output.rank, static_spec=_],
-        ctx: DeviceContextPtr,
+        ctx: DeviceContext,
     ) raises:
         # Note: rebind is necessary now but it shouldn't be!
         var output_tensor = rebind[
@@ -85,7 +85,7 @@ struct SoftmaxCustomOp:
         ](input.to_layout_tensor())
 
         comptime if target == "gpu":
-            var gpu_ctx = ctx.get_device_context()
+            var gpu_ctx = ctx
             # making sure the output tensor is zeroed out before the kernel is called
             gpu_ctx.enqueue_memset(
                 DeviceBuffer[output_tensor.dtype](
diff --git a/problems/p19/op/attention.mojo b/problems/p19/op/attention.mojo
@@ -15,7 +15,7 @@ from std.math import exp
 from std.bit import log2_ceil
 from std.utils.numerics import max_finite, min_finite
 import compiler
-from std.runtime.asyncrt import DeviceContextPtr
+
 from tensor import InputTensor, OutputTensor
 
 comptime SEQ_LEN = 16  # This must be equal to SEQ_LEN in p19.py
@@ -269,7 +269,7 @@ struct AttentionCustomOp:
         q: InputTensor[rank=1, static_spec=_],  # Query vector (d,)
         k: InputTensor[rank=2, static_spec=_],  # Key matrix (seq_len, d)
         v: InputTensor[rank=2, static_spec=_],  # Value matrix (seq_len, d)
-        ctx: DeviceContextPtr,
+        ctx: DeviceContext,
     ) raises:
         # Define layouts
         comptime layout_q = row_major[d]()
diff --git a/problems/p20/op/conv1d.mojo b/problems/p20/op/conv1d.mojo
@@ -71,7 +71,7 @@ def conv1d_kernel[
 # ANCHOR_END: conv1d_kernel
 
 import compiler
-from std.runtime.asyncrt import DeviceContextPtr
+
 from tensor import InputTensor, OutputTensor
 from std.memory import UnsafePointer
 from std.gpu.host import DeviceBuffer
@@ -91,14 +91,14 @@ struct Conv1DCustomOp:
         input: InputTensor[dtype=dtype, rank=output.rank, static_spec=_],
         kernel: InputTensor[dtype=dtype, rank=output.rank, static_spec=_],
         # the context is needed for some GPU calls
-        ctx: DeviceContextPtr,
+        ctx: DeviceContext,
     ) raises:
         var out_tensor = output.to_layout_tensor()
         var input_tensor = input.to_layout_tensor()
         var kernel_tensor = kernel.to_layout_tensor()
 
         comptime if target == "gpu":
-            var gpu_ctx = ctx.get_device_context()
+            var gpu_ctx = ctx
             # making sure the output tensor is zeroed out before the kernel is called
             gpu_ctx.enqueue_memset(
                 DeviceBuffer[output.dtype](
diff --git a/problems/p21/op/embedding.mojo b/problems/p21/op/embedding.mojo
@@ -103,7 +103,7 @@ def embedding_kernel_2d[
 # ANCHOR_END: embedding_kernel_2d
 
 import compiler
-from std.runtime.asyncrt import DeviceContextPtr
+
 from tensor import InputTensor, OutputTensor
 from std.memory import UnsafePointer
 from std.gpu.host import DeviceBuffer
@@ -128,14 +128,14 @@ struct EmbeddingCustomOp:
         weights: InputTensor[
             dtype=output.dtype, rank=2, static_spec=_
         ],  # [vocab_size, embed_dim]
-        ctx: DeviceContextPtr,
+        ctx: DeviceContext,
     ) raises:
         var output_tensor = output.to_layout_tensor()
         var indices_tensor = indices.to_layout_tensor()
         var weights_tensor = weights.to_layout_tensor()
 
         comptime if target == "gpu":
-            var gpu_ctx = ctx.get_device_context()
+            var gpu_ctx = ctx
 
             # Zero out output tensor
             gpu_ctx.enqueue_memset(
@@ -203,14 +203,14 @@ struct Embedding2DCustomOp:
         weights: InputTensor[
             dtype=output.dtype, rank=2, static_spec=_
         ],  # [vocab_size, embed_dim]
-        ctx: DeviceContextPtr,
+        ctx: DeviceContext,
     ) raises:
         var output_tensor = output.to_layout_tensor()
         var indices_tensor = indices.to_layout_tensor()
         var weights_tensor = weights.to_layout_tensor()
 
         comptime if target == "gpu":
-            var gpu_ctx = ctx.get_device_context()
+            var gpu_ctx = ctx
 
             # Zero out output tensor
             gpu_ctx.enqueue_memset(
diff --git a/problems/p22/op/layernorm_linear.mojo b/problems/p22/op/layernorm_linear.mojo
@@ -12,7 +12,9 @@ from layout.tile_layout import row_major, TensorLayout
 from layout.tile_tensor import stack_allocation
 from layout.layout_tensor import copy_dram_to_sram_async
 import compiler
-from std.runtime.asyncrt import DeviceContextPtr
+
+from std.gpu.host import DeviceContext
+
 from tensor import InputTensor, OutputTensor
 from std.utils import StaticTuple
 
@@ -371,7 +373,7 @@ struct LayerNormLinearCustomOp:
         ln_bias: InputTensor[dtype=DType.float32, rank=1, static_spec=_],
         linear_weight: InputTensor[dtype=DType.float32, rank=2, static_spec=_],
         linear_bias: InputTensor[dtype=DType.float32, rank=1, static_spec=_],
-        ctx: DeviceContextPtr,
+        ctx: DeviceContext,
     ) raises:
         comptime input_layout = input.static_spec.to_layout()
         comptime ln_params_layout = ln_weight.static_spec.to_layout()
@@ -405,7 +407,7 @@ struct LayerNormLinearCustomOp:
         ](linear_bias.to_layout_tensor())
 
         comptime if target == "gpu":
-            var gpu_ctx = ctx.get_device_context()
+            var gpu_ctx = ctx
 
             # ANCHOR: layernorm_linear_custom_op
             comptime if algorithm == "fused":
@@ -613,7 +615,7 @@ struct LayerNormLinearBackwardCustomOp:
         ln_weight: InputTensor[dtype=DType.float32, rank=1, static_spec=_],
         ln_bias: InputTensor[dtype=DType.float32, rank=1, static_spec=_],
         linear_weight: InputTensor[dtype=DType.float32, rank=2, static_spec=_],
-        ctx: DeviceContextPtr,
+        ctx: DeviceContext,
     ) raises:
         comptime grad_output_layout = grad_output.static_spec.to_layout()
         comptime input_layout = input.static_spec.to_layout()
@@ -666,7 +668,7 @@ struct LayerNormLinearBackwardCustomOp:
         ](linear_weight.to_layout_tensor())
 
         comptime if target == "gpu":
-            var gpu_ctx = ctx.get_device_context()
+            var gpu_ctx = ctx
 
             # Launch backward kernel
             comptime kernel = minimal_fused_kernel_backward[
diff --git a/solutions/p17/op/conv1d.mojo b/solutions/p17/op/conv1d.mojo
@@ -72,7 +72,7 @@ def conv1d_kernel[
 
 
 import compiler
-from std.runtime.asyncrt import DeviceContextPtr
+
 from tensor import InputTensor, OutputTensor
 from std.memory import UnsafePointer
 from std.gpu.host import DeviceBuffer
@@ -92,7 +92,7 @@ struct Conv1DCustomOp:
         input: InputTensor[dtype=dtype, rank=output.rank, static_spec=_],
         kernel: InputTensor[dtype=dtype, rank=output.rank, static_spec=_],
         # the context is needed for some GPU calls
-        ctx: DeviceContextPtr,
+        ctx: DeviceContext,
     ) raises:
         comptime out_layout_val = row_major[input_size]()
         comptime OutLayout = type_of(out_layout_val)
@@ -110,7 +110,7 @@ struct Conv1DCustomOp:
         ](kernel.unsafe_ptr(), conv_layout_val)
 
         comptime if target == "gpu":
-            var gpu_ctx = ctx.get_device_context()
+            var gpu_ctx = ctx
             # making sure the output tensor is zeroed out before the kernel is called
             gpu_ctx.enqueue_memset(
                 DeviceBuffer[output_tensor.dtype](
diff --git a/solutions/p18/op/softmax.mojo b/solutions/p18/op/softmax.mojo
@@ -119,7 +119,7 @@ def softmax_cpu_kernel[
 # ANCHOR_END: softmax_cpu_kernel_solution
 
 import compiler
-from std.runtime.asyncrt import DeviceContextPtr
+
 from tensor import InputTensor, OutputTensor
 
 
@@ -133,7 +133,7 @@ struct SoftmaxCustomOp:
     ](
         output: OutputTensor[dtype=dtype, rank=1, static_spec=_],
         input: InputTensor[dtype=dtype, rank=output.rank, static_spec=_],
-        ctx: DeviceContextPtr,
+        ctx: DeviceContext,
     ) raises:
         var output_tensor = TileTensor[
             mut=True, dtype, LayoutType, MutAnyOrigin
@@ -143,7 +143,7 @@ struct SoftmaxCustomOp:
         ](input.unsafe_ptr(), layout)
 
         comptime if target == "gpu":
-            var gpu_ctx = ctx.get_device_context()
+            var gpu_ctx = ctx
             # making sure the output tensor is zeroed out before the kernel is called
             gpu_ctx.enqueue_memset(
                 DeviceBuffer[dtype](
diff --git a/solutions/p19/op/attention.mojo b/solutions/p19/op/attention.mojo
@@ -14,7 +14,7 @@ from std.math import exp
 from std.bit import log2_ceil
 from std.utils.numerics import max_finite, min_finite
 import compiler
-from std.runtime.asyncrt import DeviceContextPtr
+
 from tensor import InputTensor, OutputTensor
 
 comptime SEQ_LEN = 16  # This must be equal to SEQ_LEN in p19.py
@@ -308,7 +308,7 @@ struct AttentionCustomOp:
         v: InputTensor[
             dtype=dtype, rank=2, static_spec=_
         ],  # Value matrix (seq_len, d)
-        ctx: DeviceContextPtr,
+        ctx: DeviceContext,
     ) raises:
         # Define layouts
         comptime layout_q = row_major[d]()
@@ -337,8 +337,6 @@ struct AttentionCustomOp:
         )
 
         comptime if target == "gpu":
-            var gpu_ctx = rebind[DeviceContext](ctx[])
-
             # Define layouts for matrix multiplication
             # Q reshaped to (1, d)
             comptime layout_q_2d = row_major[1, d]()
@@ -384,10 +382,10 @@ struct AttentionCustomOp:
             ) // MATMUL_BLOCK_DIM_XY
 
             # Allocate minimal temporary buffers - reuse same buffer for different shapes
-            var k_t_buf = gpu_ctx.enqueue_create_buffer[dtype](
+            var k_t_buf = ctx.enqueue_create_buffer[dtype](
                 seq_len * d
             )  # K^T as (d, seq_len)
-            var scores_weights_buf = gpu_ctx.enqueue_create_buffer[dtype](
+            var scores_weights_buf = ctx.enqueue_create_buffer[dtype](
                 seq_len
             )  # Reused for scores and weights
 
@@ -402,7 +400,7 @@ struct AttentionCustomOp:
             comptime kernel = transpose_kernel[
                 seq_len, d, KTLayout, KLayout, dtype
             ]
-            gpu_ctx.enqueue_function[kernel](
+            ctx.enqueue_function[kernel](
                 k_t,
                 k_tensor,
                 grid_dim=transpose_blocks_per_grid,
@@ -422,7 +420,7 @@ struct AttentionCustomOp:
                 KTLayout,
                 dtype,
             ]
-            gpu_ctx.enqueue_function[kernel2](
+            ctx.enqueue_function[kernel2](
                 scores_2d,
                 q_2d,
                 k_t,
@@ -443,7 +441,7 @@ struct AttentionCustomOp:
             var weights_in = TileTensor[
                 mut=True, dtype, ScoresLayout, MutAnyOrigin
             ](scores_weights_buf, layout_scores)
-            gpu_ctx.enqueue_function[kernel3](
+            ctx.enqueue_function[kernel3](
                 weights_out,
                 weights_in,
                 grid_dim=softmax_blocks_per_grid,
@@ -465,7 +463,7 @@ struct AttentionCustomOp:
                 VLayout,
                 dtype,
             ]
-            gpu_ctx.enqueue_function[kernel4](
+            ctx.enqueue_function[kernel4](
                 result_2d,
                 weights_2d,
                 v_tensor,
diff --git a/solutions/p20/op/conv1d.mojo b/solutions/p20/op/conv1d.mojo
@@ -72,7 +72,7 @@ def conv1d_kernel[
 
 
 import compiler
-from std.runtime.asyncrt import DeviceContextPtr
+
 from tensor import InputTensor, OutputTensor
 from std.memory import UnsafePointer
 from std.gpu.host import DeviceBuffer
@@ -92,7 +92,7 @@ struct Conv1DCustomOp:
         input: InputTensor[dtype=dtype, rank=output.rank, static_spec=_],
         kernel: InputTensor[dtype=dtype, rank=output.rank, static_spec=_],
         # the context is needed for some GPU calls
-        ctx: DeviceContextPtr,
+        ctx: DeviceContext,
     ) raises:
         comptime out_layout_val = row_major[input_size]()
         comptime OutLayout = type_of(out_layout_val)
@@ -110,7 +110,7 @@ struct Conv1DCustomOp:
         ](kernel.unsafe_ptr(), conv_layout_val)
 
         comptime if target == "gpu":
-            var gpu_ctx = ctx.get_device_context()
+            var gpu_ctx = ctx
             # making sure the output tensor is zeroed out before the kernel is called
             gpu_ctx.enqueue_memset(
                 DeviceBuffer[output.dtype](
diff --git a/solutions/p21/op/embedding.mojo b/solutions/p21/op/embedding.mojo
diff --git a/solutions/p22/op/layernorm_linear.mojo b/solutions/p22/op/layernorm_linear.mojo