Skip to content

Commit a1550d8

Browse files
sabauma authored and modularbot committed
[Kernels][stdlib] Replace DeviceContextPtr with direct use of DeviceContext

The `DeviceContextPtr` type is a legacy type which was needed to satisfy the requirement that kernel arguments be register passable. Internally, `DeviceContextPtr` is structurally equivalent to `DeviceContext`, with the caveat that it does not increment the reference count on the `DeviceContext` C++ implementation.

Now that we have the EmitMojo backend, kernels are able to accept `DeviceContext` or containers thereof. This change replaces `DeviceContextPtr` with direct uses of `DeviceContext`.

Summary of changes:

* Kernel registrations now take a `DeviceContext` argument rather than a `DeviceContextPtr` argument.
* Multi-device kernels now take a `DeviceContextList` type, rather than `DeviceContextPtrList`.
* Kernel implementations which used the nullability of `DeviceContextPtr` now take `Optional[DeviceContext]`. Where possible, these cases were eliminated as a null `DeviceContextPtr` is uncommon with the introduction of `CpuDeviceContext`.

MODULAR_ORIG_COMMIT_REV_ID: fad38ebea8c999c67dd807ce13e5589d641ba38a
1 parent 6478d0e commit a1550d8

14 files changed

Lines changed: 58 additions & 56 deletions

File tree

book/i18n/ko/src/puzzle_17/puzzle_17.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,7 @@ Verification passed: Custom kernel results match NumPy calculation
189189
output: OutputTensor[rank=1],
190190
input: InputTensor[dtype = output.dtype, rank = output.rank],
191191
kernel: InputTensor[dtype = output.dtype, rank = output.rank],
192-
ctx: DeviceContextPtr,
192+
ctx: DeviceContext,
193193
) raises:
194194
# 구현
195195
```
@@ -248,7 +248,7 @@ struct Conv1DCustomOp:
248248
output: OutputTensor[rank=1],
249249
input: InputTensor[dtype = output.dtype, rank = output.rank],
250250
kernel: InputTensor[type = output.dtype, rank = output.rank],
251-
ctx: DeviceContextPtr,
251+
ctx: DeviceContext,
252252
) raises:
253253
# 구현
254254
```
@@ -259,7 +259,7 @@ struct Conv1DCustomOp:
259259
때 사용하는 이름
260260
- **구조체**에는 올바른 시그니처를 가진 `execute` 메서드가 있어야 함
261261
- **OutputTensor**와 **InputTensor** 타입이 파이썬 데이터와의 인터페이스를 정의
262-
- **DeviceContextPtr** 실행 환경에 대한 접근을 제공
262+
- **DeviceContext** 실행 환경에 대한 접근을 제공
263263

264264
### 커스텀 op 패키징
265265

book/src/puzzle_17/puzzle_17.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ Let's break down how this works in the larger context:
192192
output: OutputTensor[rank=1],
193193
input: InputTensor[dtype = output.dtype, rank = output.rank],
194194
kernel: InputTensor[dtype = output.dtype, rank = output.rank],
195-
ctx: DeviceContextPtr,
195+
ctx: DeviceContext,
196196
) raises:
197197
# Implementation
198198
```
@@ -252,7 +252,7 @@ struct Conv1DCustomOp:
252252
output: OutputTensor[rank=1],
253253
input: InputTensor[dtype = output.dtype, rank = output.rank],
254254
kernel: InputTensor[type = output.dtype, rank = output.rank],
255-
ctx: DeviceContextPtr,
255+
ctx: DeviceContext,
256256
) raises:
257257
# Implementation here
258258
```
@@ -264,7 +264,7 @@ Key components of the registration:
264264
- The **struct** must have an `execute` method with the correct signature
265265
- **OutputTensor** and **InputTensor** types define the interface for Python
266266
data
267-
- **DeviceContextPtr** provides access to the execution environment
267+
- **DeviceContext** provides access to the execution environment
268268

269269
### Packaging custom ops
270270

problems/p17/op/conv1d.mojo

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ def conv1d_kernel[
6868

6969
# ANCHOR: conv1d_custom_op
7070
import compiler
71-
from std.runtime.asyncrt import DeviceContextPtr
71+
7272
from tensor import InputTensor, OutputTensor
7373
from std.memory import UnsafePointer
7474
from std.gpu.host import DeviceBuffer
@@ -88,14 +88,14 @@ struct Conv1DCustomOp:
8888
input: InputTensor[rank=output.rank, static_spec=_],
8989
kernel: InputTensor[rank=output.rank, static_spec=_],
9090
# the context is needed for some GPU calls
91-
ctx: DeviceContextPtr,
91+
ctx: DeviceContext,
9292
) raises:
9393
var output_tensor = output.to_layout_tensor()
9494
var input_tensor = input.to_layout_tensor()
9595
var kernel_tensor = kernel.to_layout_tensor()
9696

9797
comptime if target == "gpu":
98-
var gpu_ctx = ctx.get_device_context()
98+
var gpu_ctx = ctx
9999
# making sure the output tensor is zeroed out before the kernel is called
100100
gpu_ctx.enqueue_memset(
101101
DeviceBuffer[output_tensor.dtype](

problems/p18/op/softmax.mojo

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ def softmax_cpu_kernel[
6060
# ANCHOR_END: softmax_cpu_kernel
6161

6262
import compiler
63-
from std.runtime.asyncrt import DeviceContextPtr
63+
6464
from tensor import InputTensor, OutputTensor
6565

6666

@@ -74,7 +74,7 @@ struct SoftmaxCustomOp:
7474
](
7575
output: OutputTensor[rank=1, static_spec=_],
7676
input: InputTensor[rank=output.rank, static_spec=_],
77-
ctx: DeviceContextPtr,
77+
ctx: DeviceContext,
7878
) raises:
7979
# Note: rebind is necessary now but it shouldn't be!
8080
var output_tensor = rebind[
@@ -85,7 +85,7 @@ struct SoftmaxCustomOp:
8585
](input.to_layout_tensor())
8686

8787
comptime if target == "gpu":
88-
var gpu_ctx = ctx.get_device_context()
88+
var gpu_ctx = ctx
8989
# making sure the output tensor is zeroed out before the kernel is called
9090
gpu_ctx.enqueue_memset(
9191
DeviceBuffer[output_tensor.dtype](

problems/p19/op/attention.mojo

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ from std.math import exp
1515
from std.bit import log2_ceil
1616
from std.utils.numerics import max_finite, min_finite
1717
import compiler
18-
from std.runtime.asyncrt import DeviceContextPtr
18+
1919
from tensor import InputTensor, OutputTensor
2020

2121
comptime SEQ_LEN = 16 # This must be equal to SEQ_LEN in p19.py
@@ -269,7 +269,7 @@ struct AttentionCustomOp:
269269
q: InputTensor[rank=1, static_spec=_], # Query vector (d,)
270270
k: InputTensor[rank=2, static_spec=_], # Key matrix (seq_len, d)
271271
v: InputTensor[rank=2, static_spec=_], # Value matrix (seq_len, d)
272-
ctx: DeviceContextPtr,
272+
ctx: DeviceContext,
273273
) raises:
274274
# Define layouts
275275
comptime layout_q = row_major[d]()

problems/p20/op/conv1d.mojo

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ def conv1d_kernel[
7171
# ANCHOR_END: conv1d_kernel
7272

7373
import compiler
74-
from std.runtime.asyncrt import DeviceContextPtr
74+
7575
from tensor import InputTensor, OutputTensor
7676
from std.memory import UnsafePointer
7777
from std.gpu.host import DeviceBuffer
@@ -91,14 +91,14 @@ struct Conv1DCustomOp:
9191
input: InputTensor[dtype=dtype, rank=output.rank, static_spec=_],
9292
kernel: InputTensor[dtype=dtype, rank=output.rank, static_spec=_],
9393
# the context is needed for some GPU calls
94-
ctx: DeviceContextPtr,
94+
ctx: DeviceContext,
9595
) raises:
9696
var out_tensor = output.to_layout_tensor()
9797
var input_tensor = input.to_layout_tensor()
9898
var kernel_tensor = kernel.to_layout_tensor()
9999

100100
comptime if target == "gpu":
101-
var gpu_ctx = ctx.get_device_context()
101+
var gpu_ctx = ctx
102102
# making sure the output tensor is zeroed out before the kernel is called
103103
gpu_ctx.enqueue_memset(
104104
DeviceBuffer[output.dtype](

problems/p21/op/embedding.mojo

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ def embedding_kernel_2d[
103103
# ANCHOR_END: embedding_kernel_2d
104104

105105
import compiler
106-
from std.runtime.asyncrt import DeviceContextPtr
106+
107107
from tensor import InputTensor, OutputTensor
108108
from std.memory import UnsafePointer
109109
from std.gpu.host import DeviceBuffer
@@ -128,14 +128,14 @@ struct EmbeddingCustomOp:
128128
weights: InputTensor[
129129
dtype=output.dtype, rank=2, static_spec=_
130130
], # [vocab_size, embed_dim]
131-
ctx: DeviceContextPtr,
131+
ctx: DeviceContext,
132132
) raises:
133133
var output_tensor = output.to_layout_tensor()
134134
var indices_tensor = indices.to_layout_tensor()
135135
var weights_tensor = weights.to_layout_tensor()
136136

137137
comptime if target == "gpu":
138-
var gpu_ctx = ctx.get_device_context()
138+
var gpu_ctx = ctx
139139

140140
# Zero out output tensor
141141
gpu_ctx.enqueue_memset(
@@ -203,14 +203,14 @@ struct Embedding2DCustomOp:
203203
weights: InputTensor[
204204
dtype=output.dtype, rank=2, static_spec=_
205205
], # [vocab_size, embed_dim]
206-
ctx: DeviceContextPtr,
206+
ctx: DeviceContext,
207207
) raises:
208208
var output_tensor = output.to_layout_tensor()
209209
var indices_tensor = indices.to_layout_tensor()
210210
var weights_tensor = weights.to_layout_tensor()
211211

212212
comptime if target == "gpu":
213-
var gpu_ctx = ctx.get_device_context()
213+
var gpu_ctx = ctx
214214

215215
# Zero out output tensor
216216
gpu_ctx.enqueue_memset(

problems/p22/op/layernorm_linear.mojo

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,9 @@ from layout.tile_layout import row_major, TensorLayout
1212
from layout.tile_tensor import stack_allocation
1313
from layout.layout_tensor import copy_dram_to_sram_async
1414
import compiler
15-
from std.runtime.asyncrt import DeviceContextPtr
15+
16+
from std.gpu.host import DeviceContext
17+
1618
from tensor import InputTensor, OutputTensor
1719
from std.utils import StaticTuple
1820

@@ -371,7 +373,7 @@ struct LayerNormLinearCustomOp:
371373
ln_bias: InputTensor[dtype=DType.float32, rank=1, static_spec=_],
372374
linear_weight: InputTensor[dtype=DType.float32, rank=2, static_spec=_],
373375
linear_bias: InputTensor[dtype=DType.float32, rank=1, static_spec=_],
374-
ctx: DeviceContextPtr,
376+
ctx: DeviceContext,
375377
) raises:
376378
comptime input_layout = input.static_spec.to_layout()
377379
comptime ln_params_layout = ln_weight.static_spec.to_layout()
@@ -405,7 +407,7 @@ struct LayerNormLinearCustomOp:
405407
](linear_bias.to_layout_tensor())
406408

407409
comptime if target == "gpu":
408-
var gpu_ctx = ctx.get_device_context()
410+
var gpu_ctx = ctx
409411

410412
# ANCHOR: layernorm_linear_custom_op
411413
comptime if algorithm == "fused":
@@ -613,7 +615,7 @@ struct LayerNormLinearBackwardCustomOp:
613615
ln_weight: InputTensor[dtype=DType.float32, rank=1, static_spec=_],
614616
ln_bias: InputTensor[dtype=DType.float32, rank=1, static_spec=_],
615617
linear_weight: InputTensor[dtype=DType.float32, rank=2, static_spec=_],
616-
ctx: DeviceContextPtr,
618+
ctx: DeviceContext,
617619
) raises:
618620
comptime grad_output_layout = grad_output.static_spec.to_layout()
619621
comptime input_layout = input.static_spec.to_layout()
@@ -666,7 +668,7 @@ struct LayerNormLinearBackwardCustomOp:
666668
](linear_weight.to_layout_tensor())
667669

668670
comptime if target == "gpu":
669-
var gpu_ctx = ctx.get_device_context()
671+
var gpu_ctx = ctx
670672

671673
# Launch backward kernel
672674
comptime kernel = minimal_fused_kernel_backward[

solutions/p17/op/conv1d.mojo

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ def conv1d_kernel[
7272

7373

7474
import compiler
75-
from std.runtime.asyncrt import DeviceContextPtr
75+
7676
from tensor import InputTensor, OutputTensor
7777
from std.memory import UnsafePointer
7878
from std.gpu.host import DeviceBuffer
@@ -92,7 +92,7 @@ struct Conv1DCustomOp:
9292
input: InputTensor[dtype=dtype, rank=output.rank, static_spec=_],
9393
kernel: InputTensor[dtype=dtype, rank=output.rank, static_spec=_],
9494
# the context is needed for some GPU calls
95-
ctx: DeviceContextPtr,
95+
ctx: DeviceContext,
9696
) raises:
9797
comptime out_layout_val = row_major[input_size]()
9898
comptime OutLayout = type_of(out_layout_val)
@@ -110,7 +110,7 @@ struct Conv1DCustomOp:
110110
](kernel.unsafe_ptr(), conv_layout_val)
111111

112112
comptime if target == "gpu":
113-
var gpu_ctx = ctx.get_device_context()
113+
var gpu_ctx = ctx
114114
# making sure the output tensor is zeroed out before the kernel is called
115115
gpu_ctx.enqueue_memset(
116116
DeviceBuffer[output_tensor.dtype](

solutions/p18/op/softmax.mojo

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ def softmax_cpu_kernel[
119119
# ANCHOR_END: softmax_cpu_kernel_solution
120120

121121
import compiler
122-
from std.runtime.asyncrt import DeviceContextPtr
122+
123123
from tensor import InputTensor, OutputTensor
124124

125125

@@ -133,7 +133,7 @@ struct SoftmaxCustomOp:
133133
](
134134
output: OutputTensor[dtype=dtype, rank=1, static_spec=_],
135135
input: InputTensor[dtype=dtype, rank=output.rank, static_spec=_],
136-
ctx: DeviceContextPtr,
136+
ctx: DeviceContext,
137137
) raises:
138138
var output_tensor = TileTensor[
139139
mut=True, dtype, LayoutType, MutAnyOrigin
@@ -143,7 +143,7 @@ struct SoftmaxCustomOp:
143143
](input.unsafe_ptr(), layout)
144144

145145
comptime if target == "gpu":
146-
var gpu_ctx = ctx.get_device_context()
146+
var gpu_ctx = ctx
147147
# making sure the output tensor is zeroed out before the kernel is called
148148
gpu_ctx.enqueue_memset(
149149
DeviceBuffer[dtype](

0 commit comments

Comments
 (0)