forked from JuliaSIMD/LoopVectorization.jl
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathconstructors.jl
More file actions
478 lines (431 loc) · 14.8 KB
/
constructors.jl
File metadata and controls
478 lines (431 loc) · 14.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
### This file contains convenience functions for constructing LoopSets.
# function strip_unneeded_const_deps!(ls::LoopSet)
# for op ∈ operations(ls)
# if isconstant(op) && iszero(length(reducedchildren(op)))
# op.dependencies = NODEPENDENCY
# end
# end
# ls
# end
function _copyto!(ls::LoopSet, q::Expr)
q.head === :for || throw("Expression must be a for loop.")
add_loop!(ls, q, 8)
# strip_unneeded_const_deps!(ls)
end
function add_ci_call!(
q::Expr,
@nospecialize(f),
args::Vector{Any},
syms::Vector{Symbol},
i::Int,
@nospecialize(valarg) = nothing,
@nospecialize(mod) = nothing
)
call = if f isa Core.SSAValue
Expr(:call, syms[f.id])
else
Expr(:call, f)
end
for arg ∈ @view(args[2:end])
if arg isa Core.SSAValue
push!(call.args, syms[arg.id])
else
push!(call.args, arg)
end
end
if mod !== nothing # indicates it's `vmaterialize(!)`
push!(call.args, Expr(:call, Expr(:curly, :Val, QuoteNode(mod))), valarg)
end
push!(q.args, Expr(:(=), syms[i], call))
end
function substitute_broadcast(
q::Expr,
mod::Symbol,
inline::Bool,
u₁::Int8,
u₂::Int8,
v::Int8,
threads::Int,
warncheckarg::Int,
safe::Bool
)
ci = first(Meta.lower(LoopVectorization, q).args).code
nargs = length(ci) - 1
lb = Expr(:block)
syms = Vector{Symbol}(undef, nargs)
configarg = (inline, u₁, u₂, v, true, threads, warncheckarg, safe)
unroll_param_tup =
Expr(:call, lv(:avx_config_val), :(Val{$configarg}()), staticexpr(0))
for n ∈ 1:nargs
_ciₙ = ci[n]
syms[n] = Symbol('%', n)
if _ciₙ isa Core.SSAValue
push!(lb.args, Expr(:(=), syms[n], syms[_ciₙ.id]))
elseif _ciₙ isa GlobalRef
if _ciₙ.mod === Base || _ciₙ.mod === Core
push!(lb.args, Expr(:(=), syms[n], lv(_ciₙ.name)))
else
push!(lb.args, Expr(:(=), syms[n], _ciₙ.name))
end
elseif _ciₙ isa Expr && _ciₙ.head === :call
f = first(_ciₙ.args)
if isglobalref(f, Base, :materialize!)
add_ci_call!(lb, lv(:vmaterialize!), _ciₙ.args, syms, n, unroll_param_tup, mod)
elseif isglobalref(f, Base, :materialize)
add_ci_call!(lb, lv(:vmaterialize), _ciₙ.args, syms, n, unroll_param_tup, mod)
else
add_ci_call!(lb, f, _ciₙ.args, syms, n)
end
else
push!(lb.args, Expr(:(=), syms[n], _ciₙ))
end
end
ret::Expr = pop!(lb.args)::Expr
if Meta.isexpr(ret, :(=), 2)
ret = (ret.args[2])::Expr
end
esc(Expr(:let, lb, Expr(:block, ret)))
end
function LoopSet(q::Expr, mod::Symbol = :Main)
ls = LoopSet(mod)
check_inputs!(q, ls.prepreamble)
contract_pass!(q)
_copyto!(ls, q)
resize!(ls.loop_order, num_loops(ls))
ls
end
LoopSet(q::Expr, m::Module) = LoopSet(macroexpand(m, q)::Expr, Symbol(m))
function loopset(q::Expr) # for interactive use only
ls = LoopSet(q)
set_hw!(ls)
ls
end
function check_macro_kwarg(
arg,
inline::Bool,
check_empty::Bool,
u₁::Int8,
u₂::Int8,
v::Int8,
threads::Int,
warncheckarg::Int,
safe::Bool
)
((arg.head === :(=)) && (length(arg.args) == 2)) ||
throw(ArgumentError("macro kwarg should be of the form `argname = value`."))
kw = (arg.args[1])::Symbol
value = (arg.args[2])
if kw === :inline
inline = value::Bool
elseif kw === :unroll
if value isa Integer
u₁ = convert(Int8, value)::Int8
elseif Meta.isexpr(value, :tuple, 2)
u₁ = convert(Int8, value.args[1])::Int8
u₂ = convert(Int8, value.args[2])::Int8
else
throw(
ArgumentError("Don't know how to process argument in `unroll=$value`.")
)
end
elseif kw === :vectorize
v = convert(Int8, value)
elseif kw === :check_empty
check_empty = value::Bool
elseif kw === :thread
if value isa Bool
threads = Core.ifelse(value::Bool, -1, 1)
elseif value isa Integer
threads = max(1, convert(Int, value)::Int)
else
throw(
ArgumentError("Don't know how to process argument in `thread=$value`.")
)
end
elseif kw === :warn_check_args
warncheckarg = convert(Int, value)::Int
elseif kw === :safe
safe = convert(Bool, value)
else
throw(
ArgumentError(
"Received unrecognized keyword argument $kw. Recognized arguments include:\n`inline`, `unroll`, `check_empty`, and `thread`."
)
)
end
inline, check_empty, u₁, u₂, v, threads, warncheckarg, safe
end
function process_args(
args;
inline::Bool = false,
check_empty::Bool = false,
u₁::Int8 = zero(Int8),
u₂::Int8 = zero(Int8),
v::Int8 = zero(Int8),
threads::Int = 1,
warncheckarg::Int = 1,
safe::Bool = true
)
for arg ∈ args
inline, check_empty, u₁, u₂, v, threads, warncheckarg, safe =
check_macro_kwarg(
arg,
inline,
check_empty,
u₁,
u₂,
v,
threads,
warncheckarg,
safe
)
end
inline, check_empty, u₁, u₂, v, threads, warncheckarg, safe
end
# check if the body of loop is a block, if not convert it to a block issue#395
# and check if the range of loop is an enumerate, if it is replace it, issue#393
function check_inputs!(q, prepreamble)
if Meta.isexpr(q, :for)
if !Meta.isexpr(q.args[2], :block)
q.args[2] = Expr(:block, q.args[2])
replace_enumerate!(q, prepreamble) # must after warp block
else # maybe inner loops in block
replace_enumerate!(q, prepreamble)
for arg in q.args[2].args
check_inputs!(arg, prepreamble) # check recursively for inner loop
end
end
end
return q
end
function replace_enumerate!(q, prepreamble)
looprange = q.args[1]
if Meta.isexpr(looprange, :block)
for i = 1:length(looprange.args)
replace_single_enumerate!(q, prepreamble, i)
end
else
replace_single_enumerate!(q, prepreamble)
end
return q
end
function replace_single_enumerate!(q, prepreamble, i = nothing)
if isnothing(i) # not nest loop
looprange, body = q.args[1]::Expr, q.args[2]
else # nest loop
looprange, body = q.args[1].args[i]::Expr, q.args[2]
end
@assert Meta.isexpr(looprange, :(=), 2)
itersyms, r = looprange.args
if Meta.isexpr(r, :call, 2) && r.args[1] === :enumerate
_iter = r.args[2]
if _iter isa Symbol
iter = _iter
else # name complex expr
iter = gensym(:iter)
push!(prepreamble.args, :($iter = $_iter))
end
if Meta.isexpr(itersyms, :tuple, 2)
indsym, varsym = itersyms.args[1]::Symbol, itersyms.args[2]::Symbol
_replace_looprange!(q, i, indsym, iter)
pushfirst!(body.args, :($varsym = $iter[$indsym+firstindex($iter)-1]))
elseif Meta.isexpr(itersyms, :tuple, 1) # like `for (i,) in enumerate(...)`
indsym = itersyms.args[1]::Symbol
_replace_looprange!(q, i, indsym, iter)
elseif itersyms isa Symbol # if itersyms are not unbox in loop range
throw(
ArgumentError("`for $itersyms in enumerate($r)` is not supported,
please use `for ($(itersyms)_i, $(itersyms)_v) in enumerate($r)` instead.")
)
else
throw(ArgumentError("Don't know how to handle expression `$itersyms`."))
end
end
return q
end
_replace_looprange!(q, ::Nothing, indsym, iter) =
q.args[1] = :($indsym = Base.OneTo(length($iter)))
_replace_looprange!(q, i::Int, indsym, iter) =
q.args[1].args[i] = :($indsym = Base.OneTo(length($iter)))
function turbo_macro(mod, src, q, args...)
q = macroexpand(mod, q)
if q.head === :for
ls = LoopSet(q, mod)
inline, check_empty, u₁, u₂, v, threads, warncheckarg, safe =
process_args(args)
esc(
setup_call(
ls,
q,
src,
inline,
check_empty,
u₁,
u₂,
v,
threads,
warncheckarg,
safe
)
)
else
inline, check_empty, u₁, u₂, v, threads, warncheckarg, safe =
process_args(args; inline = true)
substitute_broadcast(
q,
Symbol(mod),
inline,
u₁,
u₂,
v,
threads,
warncheckarg,
safe
)
end
end
"""
@turbo
Annotate a `for` loop, or a set of nested `for` loops whose bounds are constant across iterations, to optimize the computation. For example:
function AmulB!(C, A, B)
@turbo for m ∈ indices((A,C), 1), n ∈ indices((B,C), 2) # indices((A,C),1) == axes(A,1) == axes(C,1)
Cₘₙ = zero(eltype(C))
for k ∈ indices((A,B), (2,1)) # indices((A,B), (2,1)) == axes(A,2) == axes(B,1)
Cₘₙ += A[m,k] * B[k,n]
end
C[m,n] = Cₘₙ
end
end
The macro models the set of nested loops, and chooses an ordering of the three loops to minimize predicted computation time.
Current limitations:
1. It assumes that loop iterations are independent.
2. It does not perform bounds checks.
3. It assumes that each loop iterates at least once. (Use `@turbo check_empty=true` to lift this assumption.)
4. That there is only one loop at each level of the nest.
It may also apply to broadcasts:
```jldoctest
julia> using LoopVectorization
julia> a = rand(100);
julia> b = @turbo exp.(2 .* a);
julia> c = similar(b);
julia> @turbo @. c = exp(2a);
julia> b ≈ c
true
```
# Extended help
Advanced users can customize the implementation of the `@turbo`-annotated block
using keyword arguments:
```julia
@turbo inline = false unroll = 2 thread = 4 body
```
where `body` is the code of the block (e.g., `for ... end`).
`thread` is either a Boolean, or an integer.
The integer's value indicates the number of threads to use.
It is clamped to be between `1` and `min(Threads.nthreads(),LoopVectorization.num_cores())`.
`false` is equivalent to `1`, and `true` is equivalent to `min(Threads.nthreads(),LoopVectorization.num_cores())`.
`safe` (defaults to `true`) will cause `@turbo` to fall back to `@inbounds @fastmath` if `can_turbo` returns false for any of the functions called in the loop. You can disable the associated warning with `warn_check_args=false`.
Setting the keyword argument `warn_check_args=true` (e.g. `@turbo warn_check_args=true for ...`) in a loop or
broadcast statement will cause it to warn once if `LoopVectorization.check_args` fails and the fallback
loop is executed instead of the LoopVectorization-optimized loop.
Setting it to an integer > 0 will warn that many times, while setting it to a negative integer will warn
an unlimited amount of times. The default is `warn_check_args = 1`. Failure means that there may have been an array with unsupported type, unsupported element types, or (if `safe=true`) a function for which `can_turbo` returned `false`.
`inline` is a Boolean. When `true`, `body` will be directly inlined
into the function (via a forced-inlining call to `_turbo_!`).
When `false`, it wont force inlining of the call to `_turbo_!` instead, letting Julia's own inlining engine
determine whether the call to `_turbo_!` should be inlined. (Typically, it won't.)
Sometimes not inlining can lead to substantially worse code generation, and >40% regressions, even in very
large problems (2-d convolutions are a case where this has been observed).
One can find some circumstances where `inline=true` is faster, and other circumstances
where `inline=false` is faster, so the best setting may require experimentation. By default, the macro
tries to guess. Currently the algorithm is simple: roughly, if there are more than two dynamically sized loops
or and no convolutions, it will probably not force inlining. Otherwise, it probably will.
`check_empty` (default is `false`) determines whether or not it will check if any of the iterators are empty.
If false, you must ensure yourself that they are not empty, else the behavior of the loop is undefined and
(like with `@inbounds`) segmentation faults are likely.
`unroll` is an integer that specifies the loop unrolling factor, or a
tuple `(u₁, u₂) = (4, 2)` signaling that the generated code should unroll more than
one loop. `u₁` is the unrolling factor for the first unrolled loop and `u₂` for the next (if present),
but it applies to the loop ordering and unrolling that will be chosen by LoopVectorization,
*not* the order in `body`.
`uᵢ=0` (the default) indicates that LoopVectorization should pick its own value,
and `uᵢ=-1` disables unrolling for the correspond loop.
The `@turbo` macro also checks the array arguments using `LoopVectorization.check_args` to try and determine
if they are compatible with the macro. If `check_args` returns false, a fall back loop annotated with `@inbounds`
and `@fastmath` is generated. Note that `VectorizationBase` provides functions such as `vadd` and `vmul` that will
ignore `@fastmath`, preserving IEEE semantics both within `@turbo` and `@fastmath`.
`check_args` currently returns false for some wrapper types like `LinearAlgebra.UpperTriangular`, requiring you to
use their `parent`. Triangular loops aren't yet supported.
"""
macro turbo(args...)
turbo_macro(__module__, __source__, last(args), Base.front(args)...)
end
"""
@tturbo
Equivalent to `@turbo`, except it adds `thread=true` as the first keyword argument.
Note that later arguments take precedence.
Meant for convenience, as `@tturbo` is shorter than `@turbo thread=true`.
"""
macro tturbo(args...)
turbo_macro(
__module__,
__source__,
last(args),
:(thread = true),
Base.front(args)...
)
end
function def_outer_reduct_types!(ls::LoopSet)
for or ∈ ls.outer_reductions
op = operations(ls)[or]
pushpreamble!(
ls,
Expr(:(=), outer_reduct_init_typename(op), typeof_expr(op))
)
end
end
"""
@_turbo
This macro mostly exists for debugging/testing purposes. It does not support many of the use cases of [`@turbo`](@ref).
It emits loops directly, rather than punting to an `@generated` function, meaning it doesn't have access to type
information when generating code or analyzing the loops, often leading to bad performance.
This macro accepts the `inline` and `unroll` keyword arguments like `@turbo`, but ignores the `check_empty` argument.
"""
macro _turbo(q)
q = macroexpand(__module__, q)
ls = LoopSet(q, __module__)
set_hw!(ls)
def_outer_reduct_types!(ls)
esc(Expr(:block, ls.prepreamble, lower_and_split_loops(ls, -1)))
end
macro _turbo(arg, q)
@assert q.head === :for
q = macroexpand(__module__, q)
inline, check_empty, u₁, u₂, v = check_macro_kwarg(
arg,
false,
false,
zero(Int8),
zero(Int8),
zero(Int8),
1,
0,
true
)
ls = LoopSet(q, __module__)
set_hw!(ls)
def_outer_reduct_types!(ls)
esc(Expr(:block, ls.prepreamble, lower(ls, u₁ % Int, u₂ % Int, v % Int, -1)))
end
"""
@turbo_debug
Returns a `LoopSet` object instead of evaluating the loops. Useful for debugging and introspection.
"""
macro turbo_debug(q)
q = macroexpand(__module__, q)
ls = LoopSet(q, __module__)
esc(LoopVectorization.setup_call_debug(ls))
end
# define aliases
const var"@avx" = var"@turbo"
const var"@avxt" = var"@tturbo"
const var"@avx_debug" = var"@turbo_debug"