LoopVectorization.jl/src/constructors.jl at db7af8ca6a737f9bd3711a77b23bdbb33caf69d4 · mxpoch/LoopVectorization.jl · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478

### This file contains convenience functions for constructing LoopSets.

# function strip_unneeded_const_deps!(ls::LoopSet)
#     for op ∈ operations(ls)
#         if isconstant(op) && iszero(length(reducedchildren(op)))
#             op.dependencies = NODEPENDENCY
#         end
#     end
#     ls
# end

function _copyto!(ls::LoopSet, q::Expr)
  q.head === :for || throw("Expression must be a for loop.")
  add_loop!(ls, q, 8)
  # strip_unneeded_const_deps!(ls)
end

function add_ci_call!(
  q::Expr,
  @nospecialize(f),
  args::Vector{Any},
  syms::Vector{Symbol},
  i::Int,
  @nospecialize(valarg) = nothing,
  @nospecialize(mod) = nothing
)
  call = if f isa Core.SSAValue
    Expr(:call, syms[f.id])
  else
    Expr(:call, f)
  end
  for arg ∈ @view(args[2:end])
    if arg isa Core.SSAValue
      push!(call.args, syms[arg.id])
    else
      push!(call.args, arg)
    end
  end
  if mod !== nothing # indicates it's `vmaterialize(!)`
    push!(call.args, Expr(:call, Expr(:curly, :Val, QuoteNode(mod))), valarg)
  end
  push!(q.args, Expr(:(=), syms[i], call))
end

function substitute_broadcast(
  q::Expr,
  mod::Symbol,
  inline::Bool,
  u₁::Int8,
  u₂::Int8,
  v::Int8,
  threads::Int,
  warncheckarg::Int,
  safe::Bool
)
  ci = first(Meta.lower(LoopVectorization, q).args).code
  nargs = length(ci) - 1
  lb = Expr(:block)
  syms = Vector{Symbol}(undef, nargs)
  configarg = (inline, u₁, u₂, v, true, threads, warncheckarg, safe)
  unroll_param_tup =
    Expr(:call, lv(:avx_config_val), :(Val{$configarg}()), staticexpr(0))

  for n ∈ 1:nargs
    _ciₙ = ci[n]
    syms[n] = Symbol('%', n)

    if _ciₙ isa Core.SSAValue
      push!(lb.args, Expr(:(=), syms[n], syms[_ciₙ.id]))

    elseif _ciₙ isa GlobalRef
      if _ciₙ.mod === Base || _ciₙ.mod === Core
        push!(lb.args, Expr(:(=), syms[n], lv(_ciₙ.name)))
      else
        push!(lb.args, Expr(:(=), syms[n], _ciₙ.name))
      end

    elseif _ciₙ isa Expr && _ciₙ.head === :call
      f = first(_ciₙ.args)
      if isglobalref(f, Base, :materialize!)
        add_ci_call!(lb, lv(:vmaterialize!), _ciₙ.args, syms, n, unroll_param_tup, mod)
      elseif isglobalref(f, Base, :materialize)
        add_ci_call!(lb, lv(:vmaterialize), _ciₙ.args, syms, n, unroll_param_tup, mod)
      else
        add_ci_call!(lb, f, _ciₙ.args, syms, n)
      end

    else
      push!(lb.args, Expr(:(=), syms[n], _ciₙ))
    end
  end

  ret::Expr = pop!(lb.args)::Expr
  if Meta.isexpr(ret, :(=), 2)
    ret = (ret.args[2])::Expr
  end

  esc(Expr(:let, lb, Expr(:block, ret)))
end

function LoopSet(q::Expr, mod::Symbol = :Main)
  ls = LoopSet(mod)
  check_inputs!(q, ls.prepreamble)
  contract_pass!(q)
  _copyto!(ls, q)
  resize!(ls.loop_order, num_loops(ls))
  ls
end
LoopSet(q::Expr, m::Module) = LoopSet(macroexpand(m, q)::Expr, Symbol(m))

function loopset(q::Expr) # for interactive use only
  ls = LoopSet(q)
  set_hw!(ls)
  ls
end

function check_macro_kwarg(
  arg,
  inline::Bool,
  check_empty::Bool,
  u₁::Int8,
  u₂::Int8,
  v::Int8,
  threads::Int,
  warncheckarg::Int,
  safe::Bool
)
  ((arg.head === :(=)) && (length(arg.args) == 2)) ||
    throw(ArgumentError("macro kwarg should be of the form `argname = value`."))
  kw = (arg.args[1])::Symbol
  value = (arg.args[2])
  if kw === :inline
    inline = value::Bool
  elseif kw === :unroll
    if value isa Integer
      u₁ = convert(Int8, value)::Int8
    elseif Meta.isexpr(value, :tuple, 2)
      u₁ = convert(Int8, value.args[1])::Int8
      u₂ = convert(Int8, value.args[2])::Int8
    else
      throw(
        ArgumentError("Don't know how to process argument in `unroll=$value`.")
      )
    end
  elseif kw === :vectorize
    v = convert(Int8, value)
  elseif kw === :check_empty
    check_empty = value::Bool
  elseif kw === :thread
    if value isa Bool
      threads = Core.ifelse(value::Bool, -1, 1)
    elseif value isa Integer
      threads = max(1, convert(Int, value)::Int)
    else
      throw(
        ArgumentError("Don't know how to process argument in `thread=$value`.")
      )
    end
  elseif kw === :warn_check_args
    warncheckarg = convert(Int, value)::Int
  elseif kw === :safe
    safe = convert(Bool, value)
  else
    throw(
      ArgumentError(
        "Received unrecognized keyword argument $kw. Recognized arguments include:\n`inline`, `unroll`, `check_empty`, and `thread`."
      )
    )
  end
  inline, check_empty, u₁, u₂, v, threads, warncheckarg, safe
end
function process_args(
  args;
  inline::Bool = false,
  check_empty::Bool = false,
  u₁::Int8 = zero(Int8),
  u₂::Int8 = zero(Int8),
  v::Int8 = zero(Int8),
  threads::Int = 1,
  warncheckarg::Int = 1,
  safe::Bool = true
)
  for arg ∈ args
    inline, check_empty, u₁, u₂, v, threads, warncheckarg, safe =
      check_macro_kwarg(
        arg,
        inline,
        check_empty,
        u₁,
        u₂,
        v,
        threads,
        warncheckarg,
        safe
      )
  end
  inline, check_empty, u₁, u₂, v, threads, warncheckarg, safe
end
# check if the body of loop is a block, if not convert it to a block issue#395
# and check if the range of loop is an enumerate, if it is replace it, issue#393
function check_inputs!(q, prepreamble)
  if Meta.isexpr(q, :for)
    if !Meta.isexpr(q.args[2], :block)
      q.args[2] = Expr(:block, q.args[2])
      replace_enumerate!(q, prepreamble) # must after warp block
    else # maybe inner loops in block
      replace_enumerate!(q, prepreamble)
      for arg in q.args[2].args
        check_inputs!(arg, prepreamble) # check recursively for inner loop
      end
    end
  end
  return q
end
function replace_enumerate!(q, prepreamble)
  looprange = q.args[1]
  if Meta.isexpr(looprange, :block)
    for i = 1:length(looprange.args)
      replace_single_enumerate!(q, prepreamble, i)
    end
  else
    replace_single_enumerate!(q, prepreamble)
  end
  return q
end
function replace_single_enumerate!(q, prepreamble, i = nothing)
  if isnothing(i) # not nest loop
    looprange, body = q.args[1]::Expr, q.args[2]
  else # nest loop
    looprange, body = q.args[1].args[i]::Expr, q.args[2]
  end
  @assert Meta.isexpr(looprange, :(=), 2)
  itersyms, r = looprange.args
  if Meta.isexpr(r, :call, 2) && r.args[1] === :enumerate
    _iter = r.args[2]
    if _iter isa Symbol
      iter = _iter
    else # name complex expr
      iter = gensym(:iter)
      push!(prepreamble.args, :($iter = $_iter))
    end
    if Meta.isexpr(itersyms, :tuple, 2)
      indsym, varsym = itersyms.args[1]::Symbol, itersyms.args[2]::Symbol
      _replace_looprange!(q, i, indsym, iter)
      pushfirst!(body.args, :($varsym = $iter[$indsym+firstindex($iter)-1]))
    elseif Meta.isexpr(itersyms, :tuple, 1) # like `for (i,) in enumerate(...)`
      indsym = itersyms.args[1]::Symbol
      _replace_looprange!(q, i, indsym, iter)
    elseif itersyms isa Symbol # if itersyms are not unbox in loop range
      throw(
        ArgumentError("`for $itersyms in enumerate($r)` is not supported,
    please use `for ($(itersyms)_i, $(itersyms)_v) in enumerate($r)` instead.")
      )
    else
      throw(ArgumentError("Don't know how to handle expression `$itersyms`."))
    end
  end
  return q
end
_replace_looprange!(q, ::Nothing, indsym, iter) =
  q.args[1] = :($indsym = Base.OneTo(length($iter)))
_replace_looprange!(q, i::Int, indsym, iter) =
  q.args[1].args[i] = :($indsym = Base.OneTo(length($iter)))

function turbo_macro(mod, src, q, args...)
  q = macroexpand(mod, q)
  if q.head === :for
    ls = LoopSet(q, mod)
    inline, check_empty, u₁, u₂, v, threads, warncheckarg, safe =
      process_args(args)
    esc(
      setup_call(
        ls,
        q,
        src,
        inline,
        check_empty,
        u₁,
        u₂,
        v,
        threads,
        warncheckarg,
        safe
      )
    )
  else
    inline, check_empty, u₁, u₂, v, threads, warncheckarg, safe =
      process_args(args; inline = true)
    substitute_broadcast(
      q,
      Symbol(mod),
      inline,
      u₁,
      u₂,
      v,
      threads,
      warncheckarg,
      safe
    )
  end
end
"""
    @turbo

Annotate a `for` loop, or a set of nested `for` loops whose bounds are constant across iterations, to optimize the computation. For example:

    function AmulB!(C, A, B)
        @turbo for m ∈ indices((A,C), 1), n ∈ indices((B,C), 2) # indices((A,C),1) == axes(A,1) == axes(C,1)
            Cₘₙ = zero(eltype(C))
            for k ∈ indices((A,B), (2,1)) # indices((A,B), (2,1)) == axes(A,2) == axes(B,1)
                Cₘₙ += A[m,k] * B[k,n]
            end
            C[m,n] = Cₘₙ
        end
    end

The macro models the set of nested loops, and chooses an ordering of the three loops to minimize predicted computation time.

Current limitations:

 1. It assumes that loop iterations are independent.
 2. It does not perform bounds checks.
 3. It assumes that each loop iterates at least once. (Use `@turbo check_empty=true` to lift this assumption.)
 4. That there is only one loop at each level of the nest.

It may also apply to broadcasts:

```jldoctest
julia> using LoopVectorization

julia> a = rand(100);

julia> b = @turbo exp.(2 .* a);

julia> c = similar(b);

julia> @turbo @. c = exp(2a);

julia> b ≈ c
true
```

# Extended help

Advanced users can customize the implementation of the `@turbo`-annotated block
using keyword arguments:

```julia
@turbo inline = false unroll = 2 thread = 4 body
```

where `body` is the code of the block (e.g., `for ... end`).

`thread` is either a Boolean, or an integer.
The integer's value indicates the number of threads to use.
It is clamped to be between `1` and `min(Threads.nthreads(),LoopVectorization.num_cores())`.
`false` is equivalent to `1`, and `true` is equivalent to `min(Threads.nthreads(),LoopVectorization.num_cores())`.

`safe` (defaults to `true`) will cause `@turbo` to fall back to `@inbounds @fastmath` if `can_turbo` returns false for any of the functions called in the loop. You can disable the associated warning with `warn_check_args=false`.

Setting the keyword argument `warn_check_args=true` (e.g. `@turbo warn_check_args=true for ...`) in a loop or
broadcast statement will cause it to warn once if `LoopVectorization.check_args` fails and the fallback
loop is executed instead of the LoopVectorization-optimized loop.
Setting it to an integer > 0 will warn that many times, while setting it to a negative integer will warn
an unlimited amount of times. The default is `warn_check_args = 1`. Failure means that there may have been an array with unsupported type, unsupported element types, or (if `safe=true`) a function for which `can_turbo` returned `false`.

`inline` is a Boolean. When `true`, `body` will be directly inlined
into the function (via a forced-inlining call to `_turbo_!`).
When `false`, it wont force inlining of the call to `_turbo_!` instead, letting Julia's own inlining engine
determine whether the call to `_turbo_!` should be inlined. (Typically, it won't.)
Sometimes not inlining can lead to substantially worse code generation, and >40% regressions, even in very
large problems (2-d convolutions are a case where this has been observed).
One can find some circumstances where `inline=true` is faster, and other circumstances
where `inline=false` is faster, so the best setting may require experimentation. By default, the macro
tries to guess. Currently the algorithm is simple: roughly, if there are more than two dynamically sized loops
or and no convolutions, it will probably not force inlining. Otherwise, it probably will.

`check_empty` (default is `false`) determines whether or not it will check if any of the iterators are empty.
If false, you must ensure yourself that they are not empty, else the behavior of the loop is undefined and
(like with `@inbounds`) segmentation faults are likely.

`unroll` is an integer that specifies the loop unrolling factor, or a
tuple `(u₁, u₂) = (4, 2)` signaling that the generated code should unroll more than
one loop. `u₁` is the unrolling factor for the first unrolled loop and `u₂` for the next (if present),
but it applies to the loop ordering and unrolling that will be chosen by LoopVectorization,
*not* the order in `body`.
`uᵢ=0` (the default) indicates that LoopVectorization should pick its own value,
and `uᵢ=-1` disables unrolling for the correspond loop.

The `@turbo` macro also checks the array arguments using `LoopVectorization.check_args` to try and determine
if they are compatible with the macro. If `check_args` returns false, a fall back loop annotated with `@inbounds`
and `@fastmath` is generated. Note that `VectorizationBase` provides functions such as `vadd` and `vmul` that will
ignore `@fastmath`, preserving IEEE semantics both within `@turbo` and `@fastmath`.
`check_args` currently returns false for some wrapper types like `LinearAlgebra.UpperTriangular`, requiring you to
use their `parent`. Triangular loops aren't yet supported.
"""
macro turbo(args...)
  turbo_macro(__module__, __source__, last(args), Base.front(args)...)
end
"""
    @tturbo

Equivalent to `@turbo`, except it adds `thread=true` as the first keyword argument.
Note that later arguments take precedence.

Meant for convenience, as `@tturbo` is shorter than `@turbo thread=true`.
"""
macro tturbo(args...)
  turbo_macro(
    __module__,
    __source__,
    last(args),
    :(thread = true),
    Base.front(args)...
  )
end

function def_outer_reduct_types!(ls::LoopSet)
  for or ∈ ls.outer_reductions
    op = operations(ls)[or]
    pushpreamble!(
      ls,
      Expr(:(=), outer_reduct_init_typename(op), typeof_expr(op))
    )
  end
end
"""
    @_turbo

This macro mostly exists for debugging/testing purposes. It does not support many of the use cases of [`@turbo`](@ref).
It emits loops directly, rather than punting to an `@generated` function, meaning it doesn't have access to type
information when generating code or analyzing the loops, often leading to bad performance.

This macro accepts the `inline` and `unroll` keyword arguments like `@turbo`, but ignores the `check_empty` argument.
"""
macro _turbo(q)
  q = macroexpand(__module__, q)
  ls = LoopSet(q, __module__)
  set_hw!(ls)
  def_outer_reduct_types!(ls)
  esc(Expr(:block, ls.prepreamble, lower_and_split_loops(ls, -1)))
end
macro _turbo(arg, q)
  @assert q.head === :for
  q = macroexpand(__module__, q)
  inline, check_empty, u₁, u₂, v = check_macro_kwarg(
    arg,
    false,
    false,
    zero(Int8),
    zero(Int8),
    zero(Int8),
    1,
    0,
    true
  )
  ls = LoopSet(q, __module__)
  set_hw!(ls)
  def_outer_reduct_types!(ls)
  esc(Expr(:block, ls.prepreamble, lower(ls, u₁ % Int, u₂ % Int, v % Int, -1)))
end

"""
    @turbo_debug

Returns a `LoopSet` object instead of evaluating the loops. Useful for debugging and introspection.
"""
macro turbo_debug(q)
  q = macroexpand(__module__, q)
  ls = LoopSet(q, __module__)
  esc(LoopVectorization.setup_call_debug(ls))
end

# define aliases
const var"@avx" = var"@turbo"
const var"@avxt" = var"@tturbo"
const var"@avx_debug" = var"@turbo_debug"