55# cuda_tile.load_ptr_tko
@eval Intrinsics begin
    """
        load_ptr_tko(ptrs, latency=nothing, mask=nothing, padding=nothing)

    Load values from a tile of pointers.
    If mask is provided, masked-out positions return the padding value.
    Compiled to cuda_tile.load_ptr_tko.

    # Arguments
    - `ptrs::Tile{Ptr{T}, S}`: tile of pointers to load from.
    - `latency::Union{Int, Nothing}`: optional latency hint; `nothing` means no hint.
    - `mask::Union{Tile{Bool, S}, Nothing}`: optional mask with the same shape as `ptrs`.
    - `padding::Union{Tile{T, S}, Nothing}`: optional padding tile used for masked-out
      positions.

    # Returns
    A `Tile{T, S}`.

    Note: TMA (allow_tma) is not applicable for pointer-based loads as they
    support irregular access patterns incompatible with TMA requirements.
    """
    @noinline function load_ptr_tko(ptrs::Tile{Ptr{T}, S},
                                    latency::Union{Int, Nothing}=nothing,
                                    mask::Union{Tile{Bool, S}, Nothing}=nothing,
                                    padding::Union{Tile{T, S}, Nothing}=nothing) where {T, S}
        # Keep every argument observably used so the call and its operands are not
        # optimized away before the matching `emit_intrinsic!` method handles it.
        donotdelete(ptrs, latency, mask, padding)
        return Tile{T, S}()
    end
end
2125function emit_intrinsic! (ctx:: CGCtx , :: typeof (Intrinsics. load_ptr_tko), args)
2226 cb = ctx. cb
2327 tt = ctx. tt
2428
29+ # args: (ptrs, latency, mask?, padding?)
2530 # Get pointer tile (arg 1)
2631 ptrs_tv = emit_value! (ctx, args[1 ])
2732 ptrs_tv === nothing && error (" load_ptr_tko: cannot resolve pointer tile" )
@@ -36,29 +41,37 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.load_ptr_tko), args)
3641 result_tile_type = tile_type! (tt, dtype, tile_shape)
3742 token_type = Token (tt)
3843
39- # Check if mask is provided (arg 2 is not nothing)
40- has_mask = length (args) >= 2 && get_constant (ctx, args[2 ]) != = nothing
44+ # Extract latency hint (args[2])
45+ latency = get_constant (ctx, args[2 ])
46+
47+ # Create optimization hints if provided
48+ optimization_hints = create_optimization_hints (ctx, latency)
49+
50+ # Check if mask is provided (arg 3 is not nothing)
51+ has_mask = length (args) >= 3 && get_constant (ctx, args[3 ]) != = nothing
4152
4253 if has_mask
43- # Get mask tile (arg 2 )
44- mask_tv = emit_value! (ctx, args[2 ])
54+ # Get mask tile (arg 3 )
55+ mask_tv = emit_value! (ctx, args[3 ])
4556 mask_tv === nothing && error (" load_ptr_tko: cannot resolve mask tile" )
4657 mask = mask_tv. v
4758
48- # Get padding tile (arg 3 )
49- padding_tv = emit_value! (ctx, args[3 ])
59+ # Get padding tile (arg 4 )
60+ padding_tv = emit_value! (ctx, args[4 ])
5061 padding_tv === nothing && error (" load_ptr_tko: cannot resolve padding tile" )
5162 padding = padding_tv. v
5263
5364 # Load with mask and padding
5465 tile_val, new_token = encode_LoadPtrTkoOp! (cb, result_tile_type, token_type, pointers;
5566 mask= mask,
5667 padding_value= padding,
57- token= ctx. token)
68+ token= ctx. token,
69+ optimization_hints)
5870 else
5971 # Load without mask
6072 tile_val, new_token = encode_LoadPtrTkoOp! (cb, result_tile_type, token_type, pointers;
61- token= ctx. token)
73+ token= ctx. token,
74+ optimization_hints)
6275 end
6376 ctx. token = new_token
6477
7184# cuda_tile.store_ptr_tko
@eval Intrinsics begin
    """
        store_ptr_tko(ptrs, values, latency, mask=nothing)

    Store values to a tile of pointers.
    If mask is provided, masked-out positions are not written.
    Compiled to cuda_tile.store_ptr_tko.

    # Arguments
    - `ptrs::Tile{Ptr{T}, S}`: tile of destination pointers.
    - `values::Tile{T, S}`: tile of values to store, same shape as `ptrs`.
    - `latency::Union{Int, Nothing}`: latency hint; pass `nothing` for no hint. This
      argument is positional and has no default.
    - `mask::Union{Tile{Bool, S}, Nothing}`: optional mask with the same shape as `ptrs`.

    # Returns
    `nothing`.

    Note: TMA (allow_tma) is not applicable for pointer-based stores as they
    support irregular access patterns incompatible with TMA requirements.
    """
    @noinline function store_ptr_tko(ptrs::Tile{Ptr{T}, S}, values::Tile{T, S},
                                     latency::Union{Int, Nothing},
                                     mask::Union{Tile{Bool, S}, Nothing}=nothing) where {T, S}
        # Keep every argument observably used so the call and its operands are not
        # optimized away before the matching `emit_intrinsic!` method handles it.
        donotdelete(ptrs, values, latency, mask)
        return nothing
    end
end
86103function emit_intrinsic! (ctx:: CGCtx , :: typeof (Intrinsics. store_ptr_tko), args)
87104 cb = ctx. cb
88105 tt = ctx. tt
89106
107+ # args: (ptrs, values, latency, mask?)
90108 # Get pointer tile (arg 1)
91109 ptrs_tv = emit_value! (ctx, args[1 ])
92110 ptrs_tv === nothing && error (" store_ptr_tko: cannot resolve pointer tile" )
@@ -99,23 +117,31 @@ function emit_intrinsic!(ctx::CGCtx, ::typeof(Intrinsics.store_ptr_tko), args)
99117
100118 token_type = Token (tt)
101119
102- # Check if mask is provided (arg 3 is not nothing)
103- has_mask = length (args) >= 3 && get_constant (ctx, args[3 ]) != = nothing
120+ # Extract latency hint (args[3])
121+ latency = get_constant (ctx, args[3 ])
122+
123+ # Create optimization hints if provided
124+ optimization_hints = create_optimization_hints (ctx, latency)
125+
126+ # Check if mask is provided (arg 4 is not nothing)
127+ has_mask = length (args) >= 4 && get_constant (ctx, args[4 ]) != = nothing
104128
105129 if has_mask
106- # Get mask tile (arg 3 )
107- mask_tv = emit_value! (ctx, args[3 ])
130+ # Get mask tile (arg 4 )
131+ mask_tv = emit_value! (ctx, args[4 ])
108132 mask_tv === nothing && error (" store_ptr_tko: cannot resolve mask tile" )
109133 mask = mask_tv. v
110134
111135 # Store with mask
112136 new_token = encode_StorePtrTkoOp! (cb, token_type, pointers, values;
113137 mask= mask,
114- token= ctx. token)
138+ token= ctx. token,
139+ optimization_hints)
115140 else
116141 # Store without mask
117142 new_token = encode_StorePtrTkoOp! (cb, token_type, pointers, values;
118- token= ctx. token)
143+ token= ctx. token,
144+ optimization_hints)
119145 end
120146 ctx. token = new_token
121147
0 commit comments