Skip to content

Commit 33ced70

Browse files
danielbyrnes authored and
modularbot committed
[Revert][Kernels][stdlib] Replace DeviceContextPtr with direct use of
DeviceContext. Reverting the replacement of DeviceContextPtr with direct use of DeviceContext since it causes a segfault. MODULAR_ORIG_COMMIT_REV_ID: 0d48f848a7c2b9ecc58efbea1c23152a8cab8efe
1 parent a1550d8 commit 33ced70

14 files changed

Lines changed: 56 additions & 58 deletions

File tree

book/i18n/ko/src/puzzle_17/puzzle_17.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,7 @@ Verification passed: Custom kernel results match NumPy calculation
189189
output: OutputTensor[rank=1],
190190
input: InputTensor[dtype = output.dtype, rank = output.rank],
191191
kernel: InputTensor[dtype = output.dtype, rank = output.rank],
192-
ctx: DeviceContext,
192+
ctx: DeviceContextPtr,
193193
) raises:
194194
# 구현
195195
```
@@ -248,7 +248,7 @@ struct Conv1DCustomOp:
248248
output: OutputTensor[rank=1],
249249
input: InputTensor[dtype = output.dtype, rank = output.rank],
250250
kernel: InputTensor[type = output.dtype, rank = output.rank],
251-
ctx: DeviceContext,
251+
ctx: DeviceContextPtr,
252252
) raises:
253253
# 구현
254254
```
@@ -259,7 +259,7 @@ struct Conv1DCustomOp:
259259
때 사용하는 이름
260260
- **구조체**에는 올바른 시그니처를 가진 `execute` 메서드가 있어야 함
261261
- **OutputTensor**와 **InputTensor** 타입이 파이썬 데이터와의 인터페이스를 정의
262-
- **DeviceContext** 실행 환경에 대한 접근을 제공
262+
- **DeviceContextPtr** 실행 환경에 대한 접근을 제공
263263

264264
### 커스텀 op 패키징
265265

book/src/puzzle_17/puzzle_17.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ Let's break down how this works in the larger context:
192192
output: OutputTensor[rank=1],
193193
input: InputTensor[dtype = output.dtype, rank = output.rank],
194194
kernel: InputTensor[dtype = output.dtype, rank = output.rank],
195-
ctx: DeviceContext,
195+
ctx: DeviceContextPtr,
196196
) raises:
197197
# Implementation
198198
```
@@ -252,7 +252,7 @@ struct Conv1DCustomOp:
252252
output: OutputTensor[rank=1],
253253
input: InputTensor[dtype = output.dtype, rank = output.rank],
254254
kernel: InputTensor[type = output.dtype, rank = output.rank],
255-
ctx: DeviceContext,
255+
ctx: DeviceContextPtr,
256256
) raises:
257257
# Implementation here
258258
```
@@ -264,7 +264,7 @@ Key components of the registration:
264264
- The **struct** must have an `execute` method with the correct signature
265265
- **OutputTensor** and **InputTensor** types define the interface for Python
266266
data
267-
- **DeviceContext** provides access to the execution environment
267+
- **DeviceContextPtr** provides access to the execution environment
268268

269269
### Packaging custom ops
270270

problems/p17/op/conv1d.mojo

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ def conv1d_kernel[
6868

6969
# ANCHOR: conv1d_custom_op
7070
import compiler
71-
71+
from std.runtime.asyncrt import DeviceContextPtr
7272
from tensor import InputTensor, OutputTensor
7373
from std.memory import UnsafePointer
7474
from std.gpu.host import DeviceBuffer
@@ -88,14 +88,14 @@ struct Conv1DCustomOp:
8888
input: InputTensor[rank=output.rank, static_spec=_],
8989
kernel: InputTensor[rank=output.rank, static_spec=_],
9090
# the context is needed for some GPU calls
91-
ctx: DeviceContext,
91+
ctx: DeviceContextPtr,
9292
) raises:
9393
var output_tensor = output.to_layout_tensor()
9494
var input_tensor = input.to_layout_tensor()
9595
var kernel_tensor = kernel.to_layout_tensor()
9696

9797
comptime if target == "gpu":
98-
var gpu_ctx = ctx
98+
var gpu_ctx = ctx.get_device_context()
9999
# making sure the output tensor is zeroed out before the kernel is called
100100
gpu_ctx.enqueue_memset(
101101
DeviceBuffer[output_tensor.dtype](

problems/p18/op/softmax.mojo

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ def softmax_cpu_kernel[
6060
# ANCHOR_END: softmax_cpu_kernel
6161

6262
import compiler
63-
63+
from std.runtime.asyncrt import DeviceContextPtr
6464
from tensor import InputTensor, OutputTensor
6565

6666

@@ -74,7 +74,7 @@ struct SoftmaxCustomOp:
7474
](
7575
output: OutputTensor[rank=1, static_spec=_],
7676
input: InputTensor[rank=output.rank, static_spec=_],
77-
ctx: DeviceContext,
77+
ctx: DeviceContextPtr,
7878
) raises:
7979
# Note: rebind is necessary now but it shouldn't be!
8080
var output_tensor = rebind[
@@ -85,7 +85,7 @@ struct SoftmaxCustomOp:
8585
](input.to_layout_tensor())
8686

8787
comptime if target == "gpu":
88-
var gpu_ctx = ctx
88+
var gpu_ctx = ctx.get_device_context()
8989
# making sure the output tensor is zeroed out before the kernel is called
9090
gpu_ctx.enqueue_memset(
9191
DeviceBuffer[output_tensor.dtype](

problems/p19/op/attention.mojo

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ from std.math import exp
1515
from std.bit import log2_ceil
1616
from std.utils.numerics import max_finite, min_finite
1717
import compiler
18-
18+
from std.runtime.asyncrt import DeviceContextPtr
1919
from tensor import InputTensor, OutputTensor
2020

2121
comptime SEQ_LEN = 16 # This must be equal to SEQ_LEN in p19.py
@@ -269,7 +269,7 @@ struct AttentionCustomOp:
269269
q: InputTensor[rank=1, static_spec=_], # Query vector (d,)
270270
k: InputTensor[rank=2, static_spec=_], # Key matrix (seq_len, d)
271271
v: InputTensor[rank=2, static_spec=_], # Value matrix (seq_len, d)
272-
ctx: DeviceContext,
272+
ctx: DeviceContextPtr,
273273
) raises:
274274
# Define layouts
275275
comptime layout_q = row_major[d]()

problems/p20/op/conv1d.mojo

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ def conv1d_kernel[
7171
# ANCHOR_END: conv1d_kernel
7272

7373
import compiler
74-
74+
from std.runtime.asyncrt import DeviceContextPtr
7575
from tensor import InputTensor, OutputTensor
7676
from std.memory import UnsafePointer
7777
from std.gpu.host import DeviceBuffer
@@ -91,14 +91,14 @@ struct Conv1DCustomOp:
9191
input: InputTensor[dtype=dtype, rank=output.rank, static_spec=_],
9292
kernel: InputTensor[dtype=dtype, rank=output.rank, static_spec=_],
9393
# the context is needed for some GPU calls
94-
ctx: DeviceContext,
94+
ctx: DeviceContextPtr,
9595
) raises:
9696
var out_tensor = output.to_layout_tensor()
9797
var input_tensor = input.to_layout_tensor()
9898
var kernel_tensor = kernel.to_layout_tensor()
9999

100100
comptime if target == "gpu":
101-
var gpu_ctx = ctx
101+
var gpu_ctx = ctx.get_device_context()
102102
# making sure the output tensor is zeroed out before the kernel is called
103103
gpu_ctx.enqueue_memset(
104104
DeviceBuffer[output.dtype](

problems/p21/op/embedding.mojo

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,7 @@ def embedding_kernel_2d[
103103
# ANCHOR_END: embedding_kernel_2d
104104

105105
import compiler
106-
106+
from std.runtime.asyncrt import DeviceContextPtr
107107
from tensor import InputTensor, OutputTensor
108108
from std.memory import UnsafePointer
109109
from std.gpu.host import DeviceBuffer
@@ -128,14 +128,14 @@ struct EmbeddingCustomOp:
128128
weights: InputTensor[
129129
dtype=output.dtype, rank=2, static_spec=_
130130
], # [vocab_size, embed_dim]
131-
ctx: DeviceContext,
131+
ctx: DeviceContextPtr,
132132
) raises:
133133
var output_tensor = output.to_layout_tensor()
134134
var indices_tensor = indices.to_layout_tensor()
135135
var weights_tensor = weights.to_layout_tensor()
136136

137137
comptime if target == "gpu":
138-
var gpu_ctx = ctx
138+
var gpu_ctx = ctx.get_device_context()
139139

140140
# Zero out output tensor
141141
gpu_ctx.enqueue_memset(
@@ -203,14 +203,14 @@ struct Embedding2DCustomOp:
203203
weights: InputTensor[
204204
dtype=output.dtype, rank=2, static_spec=_
205205
], # [vocab_size, embed_dim]
206-
ctx: DeviceContext,
206+
ctx: DeviceContextPtr,
207207
) raises:
208208
var output_tensor = output.to_layout_tensor()
209209
var indices_tensor = indices.to_layout_tensor()
210210
var weights_tensor = weights.to_layout_tensor()
211211

212212
comptime if target == "gpu":
213-
var gpu_ctx = ctx
213+
var gpu_ctx = ctx.get_device_context()
214214

215215
# Zero out output tensor
216216
gpu_ctx.enqueue_memset(

problems/p22/op/layernorm_linear.mojo

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,7 @@ from layout.tile_layout import row_major, TensorLayout
1212
from layout.tile_tensor import stack_allocation
1313
from layout.layout_tensor import copy_dram_to_sram_async
1414
import compiler
15-
16-
from std.gpu.host import DeviceContext
17-
15+
from std.runtime.asyncrt import DeviceContextPtr
1816
from tensor import InputTensor, OutputTensor
1917
from std.utils import StaticTuple
2018

@@ -373,7 +371,7 @@ struct LayerNormLinearCustomOp:
373371
ln_bias: InputTensor[dtype=DType.float32, rank=1, static_spec=_],
374372
linear_weight: InputTensor[dtype=DType.float32, rank=2, static_spec=_],
375373
linear_bias: InputTensor[dtype=DType.float32, rank=1, static_spec=_],
376-
ctx: DeviceContext,
374+
ctx: DeviceContextPtr,
377375
) raises:
378376
comptime input_layout = input.static_spec.to_layout()
379377
comptime ln_params_layout = ln_weight.static_spec.to_layout()
@@ -407,7 +405,7 @@ struct LayerNormLinearCustomOp:
407405
](linear_bias.to_layout_tensor())
408406

409407
comptime if target == "gpu":
410-
var gpu_ctx = ctx
408+
var gpu_ctx = ctx.get_device_context()
411409

412410
# ANCHOR: layernorm_linear_custom_op
413411
comptime if algorithm == "fused":
@@ -615,7 +613,7 @@ struct LayerNormLinearBackwardCustomOp:
615613
ln_weight: InputTensor[dtype=DType.float32, rank=1, static_spec=_],
616614
ln_bias: InputTensor[dtype=DType.float32, rank=1, static_spec=_],
617615
linear_weight: InputTensor[dtype=DType.float32, rank=2, static_spec=_],
618-
ctx: DeviceContext,
616+
ctx: DeviceContextPtr,
619617
) raises:
620618
comptime grad_output_layout = grad_output.static_spec.to_layout()
621619
comptime input_layout = input.static_spec.to_layout()
@@ -668,7 +666,7 @@ struct LayerNormLinearBackwardCustomOp:
668666
](linear_weight.to_layout_tensor())
669667

670668
comptime if target == "gpu":
671-
var gpu_ctx = ctx
669+
var gpu_ctx = ctx.get_device_context()
672670

673671
# Launch backward kernel
674672
comptime kernel = minimal_fused_kernel_backward[

solutions/p17/op/conv1d.mojo

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ def conv1d_kernel[
7272

7373

7474
import compiler
75-
75+
from std.runtime.asyncrt import DeviceContextPtr
7676
from tensor import InputTensor, OutputTensor
7777
from std.memory import UnsafePointer
7878
from std.gpu.host import DeviceBuffer
@@ -92,7 +92,7 @@ struct Conv1DCustomOp:
9292
input: InputTensor[dtype=dtype, rank=output.rank, static_spec=_],
9393
kernel: InputTensor[dtype=dtype, rank=output.rank, static_spec=_],
9494
# the context is needed for some GPU calls
95-
ctx: DeviceContext,
95+
ctx: DeviceContextPtr,
9696
) raises:
9797
comptime out_layout_val = row_major[input_size]()
9898
comptime OutLayout = type_of(out_layout_val)
@@ -110,7 +110,7 @@ struct Conv1DCustomOp:
110110
](kernel.unsafe_ptr(), conv_layout_val)
111111

112112
comptime if target == "gpu":
113-
var gpu_ctx = ctx
113+
var gpu_ctx = ctx.get_device_context()
114114
# making sure the output tensor is zeroed out before the kernel is called
115115
gpu_ctx.enqueue_memset(
116116
DeviceBuffer[output_tensor.dtype](

solutions/p18/op/softmax.mojo

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ def softmax_cpu_kernel[
119119
# ANCHOR_END: softmax_cpu_kernel_solution
120120

121121
import compiler
122-
122+
from std.runtime.asyncrt import DeviceContextPtr
123123
from tensor import InputTensor, OutputTensor
124124

125125

@@ -133,7 +133,7 @@ struct SoftmaxCustomOp:
133133
](
134134
output: OutputTensor[dtype=dtype, rank=1, static_spec=_],
135135
input: InputTensor[dtype=dtype, rank=output.rank, static_spec=_],
136-
ctx: DeviceContext,
136+
ctx: DeviceContextPtr,
137137
) raises:
138138
var output_tensor = TileTensor[
139139
mut=True, dtype, LayoutType, MutAnyOrigin
@@ -143,7 +143,7 @@ struct SoftmaxCustomOp:
143143
](input.unsafe_ptr(), layout)
144144

145145
comptime if target == "gpu":
146-
var gpu_ctx = ctx
146+
var gpu_ctx = ctx.get_device_context()
147147
# making sure the output tensor is zeroed out before the kernel is called
148148
gpu_ctx.enqueue_memset(
149149
DeviceBuffer[dtype](

0 commit comments

Comments
 (0)