#ifndef HALIDE_CODEGEN_LLVM_H
#define HALIDE_CODEGEN_LLVM_H
/** \file
*
 * Defines the base class for all architecture-specific code
 * generators that use LLVM.
*/
namespace llvm {
class Value;
class Module;
class Function;
class FunctionType;
class IRBuilderDefaultInserter;
class ConstantFolder;
template<typename, typename>
class IRBuilder;
class LLVMContext;
class Type;
class PointerType;
class StructType;
class Instruction;
class CallInst;
class ExecutionEngine;
class AllocaInst;
class Constant;
class Triple;
class MDNode;
class NamedMDNode;
class DataLayout;
class BasicBlock;
class GlobalVariable;
class VectorType;
} // namespace llvm
#include <functional>
#include <map>
#include <memory>
#include <optional>
#include <string>
#include <variant>
#include <vector>
#include "IRVisitor.h"
#include "Module.h"
#include "Scope.h"
#include "Target.h"
namespace Halide {
struct ExternSignature;
namespace Internal {
/** A code generator abstract base class. Actual code generators
* (e.g. CodeGen_X86) inherit from this. This class is responsible
* for taking a Halide Stmt and producing llvm bitcode, machine
* code in an object file, or machine code accessible through a
* function pointer.
*/
class CodeGen_LLVM : public IRVisitor {
public:
/** Create an instance of CodeGen_LLVM suitable for the target. */
static std::unique_ptr<CodeGen_LLVM> new_for_target(const Target &target, llvm::LLVMContext &context);
/** Takes a Halide Module and compiles it to an llvm Module. */
virtual std::unique_ptr<llvm::Module> compile(const Module &module);
/** The target we're generating code for */
const Target &get_target() const {
return target;
}
/** Tell the code generator which LLVM context to use. */
void set_context(llvm::LLVMContext &context);
/** Initialize internal llvm state for the enabled targets. */
static void initialize_llvm();
static std::unique_ptr<llvm::Module> compile_trampolines(
const Target &target,
llvm::LLVMContext &context,
const std::string &suffix,
const std::vector<std::pair<std::string, ExternSignature>> &externs);
size_t get_requested_alloca_total() const {
return requested_alloca_total;
}
protected:
CodeGen_LLVM(const Target &t);
/** Compile a specific halide declaration into the llvm Module. */
// @{
virtual void compile_func(const LoweredFunc &func, const std::string &simple_name, const std::string &extern_name);
virtual void compile_buffer(const Buffer<> &buffer);
// @}
/** Helper functions for compiling Halide functions to llvm
* functions. begin_func performs all the work necessary to begin
* generating code for a function with a given argument list with
the IRBuilder. A call to begin_func should be followed by a
* call to end_func with the same arguments, to generate the
* appropriate cleanup code. */
// @{
virtual void begin_func(LinkageType linkage, const std::string &simple_name,
const std::string &extern_name, const std::vector<LoweredArgument> &args);
virtual void end_func(const std::vector<LoweredArgument> &args);
// @}
/** What should be passed as -mcpu (warning: implies attrs!), -mattrs,
* and related for compilation. The architecture-specific code generator
* should define these.
*
* `mcpu_target()` - target this specific CPU, in the sense of the allowed
* ISA sets *and* the CPU-specific tuning/assembly instruction scheduling.
*
* `mcpu_tune()` - expect that we will be running on this specific CPU,
* so perform CPU-specific tuning/assembly instruction scheduling, *but*
* DON'T sacrifice portability: the code must still run on other CPUs,
* using only the ISAs that are enabled by `mcpu_target()`+`mattrs()`.
*/
// @{
virtual std::string mcpu_target() const = 0;
virtual std::string mcpu_tune() const = 0;
virtual std::string mattrs() const = 0;
virtual std::string mabi() const;
virtual bool use_soft_float_abi() const = 0;
virtual bool use_pic() const;
// @}
/** Should indexing math be promoted to 64-bit on platforms with
* 64-bit pointers? */
virtual bool promote_indices() const {
return true;
}
/** What's the natural vector bit-width to use for loads, stores, etc. */
virtual int native_vector_bits() const = 0;
/** Used to decide whether to break a vector up into multiple smaller
* operations. This is the largest size the architecture supports. */
virtual int maximum_vector_bits() const {
return native_vector_bits();
}
/** For architectures that have vscale vectors, return the constant vscale to use.
* Default of 0 means do not use vscale vectors. Generally will depend on
* the target flags and vector_bits settings.
*/
virtual int target_vscale() const {
return 0;
}
/** Return the type in which arithmetic should be done for the
* given storage type. */
virtual Type upgrade_type_for_arithmetic(const Type &) const;
/** Return the type that a given Halide type should be
* stored/loaded from memory as. */
virtual Type upgrade_type_for_storage(const Type &) const;
/** Return the type that a Halide type should be passed in and out
* of functions as. */
virtual Type upgrade_type_for_argument_passing(const Type &) const;
void set_effective_vscale(int vscale);
std::unique_ptr<llvm::Module> module;
llvm::Function *function = nullptr;
llvm::LLVMContext *context = nullptr;
std::unique_ptr<llvm::IRBuilder<llvm::ConstantFolder, llvm::IRBuilderDefaultInserter>> builder;
llvm::Value *value = nullptr;
llvm::MDNode *very_likely_branch = nullptr;
llvm::MDNode *fast_fp_math_md = nullptr;
llvm::MDNode *strict_fp_math_md = nullptr;
std::vector<LoweredArgument> current_function_args;
bool in_strict_float = false;
bool any_strict_float = false;
/** Change floating-point math op emission to use fast flags. */
void set_fast_fp_math();
/** Change floating-point math op emission to use strict flags. */
void set_strict_fp_math();
/** If any_strict_float is true, sets fast math flags for the lifetime of
* this object, then sets them to strict on destruction. If any_strict_float
* is false, does nothing. Any call to an IRBuilder method that starts with
* "CreateF" should probably be wrapped in one of these, but it's safe to
* miss one - we just miss out on some optimizations. In this way codegen is
* designed to fail safe. */
struct ScopedFastMath {
CodeGen_LLVM *codegen;
ScopedFastMath(CodeGen_LLVM *);
~ScopedFastMath();
};
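/** A minimal usage sketch (illustrative only; `a` and `b` stand for
 * previously codegenned llvm::Value operands):
 * \code
 * {
 *     ScopedFastMath guard(this);         // fast flags while in scope
 *     value = builder->CreateFAdd(a, b);  // emitted under fast-math flags
 * }  // destructor restores strict flags (when any_strict_float is true)
 * \endcode
 */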
/** The target we're generating code for */
Halide::Target target;
/** Grab all the context specific internal state. */
virtual void init_context();
/** Initialize the CodeGen_LLVM internal state to compile a fresh
module. This allows reuse of one CodeGen_LLVM object to compile
* multiple related modules (e.g. multiple device kernels). */
virtual void init_module();
/** Run all of llvm's optimization passes on the module. */
void optimize_module();
/** Add an entry to the symbol table, hiding previous entries with
* the same name. Call this when new values come into scope. */
void sym_push(const std::string &name, llvm::Value *value);
/** Remove an entry from the symbol table, revealing any previous
* entries with the same name. Call this when values go out of
* scope. */
void sym_pop(const std::string &name);
/** Fetch an entry from the symbol table. If the symbol is not
* found, it either errors out (if the second arg is true), or
* returns nullptr. */
llvm::Value *sym_get(const std::string &name,
bool must_succeed = true) const;
/** Test if an item exists in the symbol table. */
bool sym_exists(const std::string &name) const;
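/** A sketch of the push/pop discipline, as used when codegenning a
 * hypothetical Let node `op`:
 * \code
 * sym_push(op->name, codegen(op->value));  // bind, hiding any outer binding
 * llvm::Value *body = codegen(op->body);   // the body sees the new binding
 * sym_pop(op->name);                       // outer binding visible again
 * \endcode
 */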
/** Given a Halide ExternSignature, return the equivalent llvm::FunctionType. */
llvm::FunctionType *signature_to_type(const ExternSignature &signature);
/** Some useful llvm types */
// @{
llvm::Type *void_t = nullptr, *i1_t = nullptr, *i8_t = nullptr, *i16_t = nullptr, *i32_t = nullptr, *i64_t = nullptr, *f16_t = nullptr, *f32_t = nullptr, *f64_t = nullptr;
llvm::PointerType *ptr_t = nullptr;
llvm::StructType *halide_buffer_t_type = nullptr,
*type_t_type = nullptr,
*dimension_t_type,
*metadata_t_type = nullptr,
*argument_t_type = nullptr,
*scalar_value_t_type = nullptr,
*device_interface_t_type = nullptr,
*pseudostack_slot_t_type = nullptr,
*semaphore_t_type = nullptr;
// @}
/** Some wildcard variables used for peephole optimizations in
* subclasses */
// @{
Expr wild_u1x_, wild_i8x_, wild_u8x_, wild_i16x_, wild_u16x_;
Expr wild_i32x_, wild_u32x_, wild_i64x_, wild_u64x_;
Expr wild_f32x_, wild_f64x_;
// Wildcards for scalars.
Expr wild_u1_, wild_i8_, wild_u8_, wild_i16_, wild_u16_;
Expr wild_i32_, wild_u32_, wild_i64_, wild_u64_;
Expr wild_f32_, wild_f64_;
// @}
/** Emit code that evaluates an expression, and return the llvm
* representation of the result of the expression. */
llvm::Value *codegen(const Expr &);
/** Emit code that runs a statement. */
void codegen(const Stmt &);
/** Codegen a vector Expr by codegenning each lane and combining. */
void scalarize(const Expr &);
/** Some destructors should always be called. Others should only
* be called if the pipeline is exiting with an error code. */
enum DestructorType { Always, OnError, OnSuccess };
/** Call this at the location of object creation to register how an
* object should be destroyed. This does three things:
* 1) Emits code here that puts the object in a unique
* null-initialized stack slot
* 2) Adds an instruction to the destructor block that calls the
* destructor on that stack slot if it's not null.
* 3) Returns that stack slot, so you can neuter the destructor
* (by storing null to the stack slot) or destroy the object early
* (by calling trigger_destructor).
*/
llvm::Value *register_destructor(llvm::Function *destructor_fn, llvm::Value *obj, DestructorType when);
/** Call a destructor early. Pass in the value returned by register destructor. */
void trigger_destructor(llvm::Function *destructor_fn, llvm::Value *stack_slot);
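/** An illustrative sketch (assumes a hypothetical destructor function
 * `free_fn` and a freshly created object `obj`):
 * \code
 * // Arrange for obj to be destroyed on every exit path:
 * llvm::Value *slot = register_destructor(free_fn, obj, Always);
 * // ... emit code that uses obj ...
 * // Either destroy it early:
 * trigger_destructor(free_fn, slot);
 * // ... or neuter the destructor by nulling out the stack slot:
 * // builder->CreateStore(llvm::Constant::getNullValue(obj->getType()), slot);
 * \endcode
 */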
/** Retrieves the block containing the error handling
* code. Creates it if it doesn't already exist for this
* function. */
llvm::BasicBlock *get_destructor_block();
/** Codegen an assertion. If the condition is false, returns the error
 * code (if not null), or else evaluates and returns the message, which
 * must be an Int(32) expression. */
// @{
void create_assertion(llvm::Value *condition, const Expr &message, llvm::Value *error_code = nullptr);
// @}
/** Codegen a block of asserts with pure conditions */
void codegen_asserts(const std::vector<const AssertStmt *> &asserts);
/** Return from the pipeline with the given error code. Will run
* the destructor block. */
void return_with_error_code(llvm::Value *error_code);
/** Put a string constant in the module as a global variable and return a pointer to it. */
llvm::Constant *create_string_constant(const std::string &str);
/** Put a binary blob in the module as a global variable and return a pointer to it. */
llvm::Constant *create_binary_blob(const std::vector<char> &data, const std::string &name, bool constant = true);
/** Widen an llvm scalar into an llvm vector with the given number of lanes. */
llvm::Value *create_broadcast(llvm::Value *, int lanes);
/** Generate a pointer into a named buffer at a given index, of a
* given type. The index counts according to the scalar type of
* the type passed in. */
// @{
llvm::Value *codegen_buffer_pointer(const std::string &buffer, Type type, llvm::Value *index);
llvm::Value *codegen_buffer_pointer(const std::string &buffer, Type type, Expr index);
llvm::Value *codegen_buffer_pointer(llvm::Value *base_address, Type type, Expr index);
llvm::Value *codegen_buffer_pointer(llvm::Value *base_address, Type type, llvm::Value *index);
// @}
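/** E.g. a pointer to element `idx` of a hypothetical float buffer "buf"
 * (the index counts in 32-bit floats, not bytes):
 * \code
 * llvm::Value *ptr = codegen_buffer_pointer("buf", Float(32), idx);
 * \endcode
 */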
/** Return type string for LLVM type using LLVM IR intrinsic type mangling.
* E.g. ".i32 or ".f32" for scalars, ".p0" for pointers,
* ".nxv4i32" for a scalable vector of four 32-bit integers,
* or ".v4f32" for a fixed vector of four 32-bit floats.
* The dot is included in the result.
*/
std::string mangle_llvm_type(llvm::Type *type);
/** Turn a Halide Type into an llvm::Value representing a constant halide_type_t */
llvm::Value *make_halide_type_t(const Type &);
/** Mark a load or store with type-based-alias-analysis metadata
* so that llvm knows it can reorder loads and stores across
* different buffers */
void add_tbaa_metadata(llvm::Instruction *inst, std::string buffer, const Expr &index);
/** Get a unique name for the actual block of memory that an
* allocate node uses. Used so that alias analysis understands
* when multiple Allocate nodes share the same memory. */
virtual std::string get_allocation_name(const std::string &n) {
return n;
}
/** Add the appropriate function attribute to tell LLVM that the function
* doesn't access memory. */
void function_does_not_access_memory(llvm::Function *fn);
using IRVisitor::visit;
/** Generate code for various IR nodes. These can be overridden by
* architecture-specific code to perform peephole
* optimizations. The result of each is stored in \ref value */
// @{
void visit(const IntImm *) override;
void visit(const UIntImm *) override;
void visit(const FloatImm *) override;
void visit(const StringImm *) override;
void visit(const Cast *) override;
void visit(const Reinterpret *) override;
void visit(const Variable *) override;
void visit(const Add *) override;
void visit(const Sub *) override;
void visit(const Mul *) override;
void visit(const Div *) override;
void visit(const Mod *) override;
void visit(const Min *) override;
void visit(const Max *) override;
void visit(const EQ *) override;
void visit(const NE *) override;
void visit(const LT *) override;
void visit(const LE *) override;
void visit(const GT *) override;
void visit(const GE *) override;
void visit(const And *) override;
void visit(const Or *) override;
void visit(const Not *) override;
void visit(const Select *) override;
void visit(const Load *) override;
void visit(const Ramp *) override;
void visit(const Broadcast *) override;
void visit(const Call *) override;
void visit(const Let *) override;
void visit(const LetStmt *) override;
void visit(const AssertStmt *) override;
void visit(const ProducerConsumer *) override;
void visit(const For *) override;
void visit(const Store *) override;
void visit(const Block *) override;
void visit(const IfThenElse *) override;
void visit(const Evaluate *) override;
void visit(const Shuffle *) override;
void visit(const VectorReduce *) override;
void visit(const Prefetch *) override;
void visit(const Atomic *) override;
// @}
/** Generate code for an allocate node. It has no default
* implementation - it must be handled in an architecture-specific
* way. */
void visit(const Allocate *) override = 0;
/** Generate code for a free node. It has no default
* implementation and must be handled in an architecture-specific
* way. */
void visit(const Free *) override = 0;
/** These IR nodes should have been removed during
* lowering. CodeGen_LLVM will error out if they are present */
// @{
void visit(const Provide *) override;
void visit(const Realize *) override;
// @}
/** Get the llvm type equivalent to the given halide type in the
* current context. */
virtual llvm::Type *llvm_type_of(const Type &) const;
/** Get the llvm type equivalent to a given halide type. If
* effective_vscale is nonzero and the type is a vector type with lanes
* a multiple of effective_vscale, a scalable vector type is generated
* with total lanes divided by effective_vscale. That is a scalable
* vector intended to be used with a fixed vscale of effective_vscale.
*/
llvm::Type *llvm_type_of(llvm::LLVMContext *context, Halide::Type t,
int effective_vscale) const;
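/** For example, with effective_vscale == 4, Float(32, 8) maps to
 * <vscale x 2 x float>: 8 lanes / 4 = 2 scalable lanes, which yields
 * 8 total lanes when vscale is fixed at 4. */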
/** Perform an alloca at the function entrypoint. Will be cleaned up
 * on function exit. */
llvm::Value *create_alloca_at_entry(llvm::Type *type, int n,
bool zero_initialize = false,
const std::string &name = "");
/** A (very) conservative guess at the size of all alloca() storage requested
* (including alignment padding). It's currently meant only to be used as
* a very coarse way to ensure there is enough stack space when testing
* on the WebAssembly backend.
*
* It is *not* meant to be a useful proxy for "stack space needed", for a
* number of reasons:
* - allocas with non-overlapping lifetimes will share space
* - on some backends, LLVM may promote register-sized allocas into registers
* - while this accounts for alloca() calls we know about, it doesn't attempt
* to account for stack spills, function call overhead, etc.
*/
size_t requested_alloca_total = 0;
/** The user_context argument. May be a constant null if the
* function is being compiled without a user context. */
llvm::Value *get_user_context() const;
/** Implementation of the intrinsic call to
* interleave_vectors. This implementation allows for interleaving
* an arbitrary number of vectors.*/
virtual llvm::Value *interleave_vectors(const std::vector<llvm::Value *> &);
/** Description of an intrinsic function overload. Overloads are resolved
* using both argument and return types. The scalar types of the arguments
* and return type must match exactly for an overload resolution to succeed. */
struct Intrinsic {
Type result_type;
std::vector<Type> arg_types;
llvm::Function *impl;
Intrinsic(Type result_type, std::vector<Type> arg_types, llvm::Function *impl)
: result_type(result_type), arg_types(std::move(arg_types)), impl(impl) {
}
};
using IntrinsicsMap = std::map<std::string, std::vector<Intrinsic>>;
/** Mapping of intrinsic functions to the various overloads implementing it. */
IntrinsicsMap intrinsics;
/** Get an LLVM intrinsic declaration. If it doesn't exist, it will be created. */
llvm::Function *get_llvm_intrin(const Type &ret_type, const std::string &name, const std::vector<Type> &arg_types, bool scalars_are_vectors = false);
llvm::Function *get_llvm_intrin(llvm::Type *ret_type, const std::string &name, const std::vector<llvm::Type *> &arg_types);
/** Declare an intrinsic function that participates in overload resolution. */
llvm::Function *declare_intrin_overload(const std::string &name, const Type &ret_type, const std::string &impl_name, std::vector<Type> arg_types, bool scalars_are_vectors = false);
void declare_intrin_overload(const std::string &name, const Type &ret_type, llvm::Function *impl, std::vector<Type> arg_types);
/** Call an overloaded intrinsic function. Returns nullptr if no suitable overload is found. */
virtual llvm::Value *call_overloaded_intrin(const Type &result_type, const std::string &name, const std::vector<Expr> &args);
/** Call an overloaded intrinsic function. Returns nullptr if no suitable overload is found.
 * Looks up the corresponding intrinsic in the given overloaded_intrinsics map. */
llvm::Value *call_overloaded_intrin(const Type &result_type, const std::string &name, const std::vector<Expr> &args,
const IntrinsicsMap &overloaded_intrinsics);
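/** A sketch of how a backend might register and call an overload. The
 * intrinsic and its names here are hypothetical, and `op` stands for a
 * Call node being matched in a peephole:
 * \code
 * declare_intrin_overload("halving_add", UInt(8, 16),
 *                         "llvm.example.uhadd.v16i8",
 *                         {UInt(8, 16), UInt(8, 16)});
 * // Later, when the pattern matches:
 * value = call_overloaded_intrin(op->type, "halving_add",
 *                                {op->args[0], op->args[1]});
 * \endcode
 */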
/** Generate a call to a vector intrinsic or runtime inlined
* function. The arguments are sliced up into vectors of the width
* given by 'intrin_lanes', the intrinsic is called on each
* piece, then the results (if any) are concatenated back together
* into the original type 't'. For the version that takes an
* llvm::Type *, the type may be void, so the vector width of the
* arguments must be specified explicitly as
* 'intrin_lanes'. */
// @{
llvm::Value *call_intrin(const Type &t, int intrin_lanes,
const std::string &name, std::vector<Expr>);
llvm::Value *call_intrin(const Type &t, int intrin_lanes,
llvm::Function *intrin, std::vector<Expr>);
llvm::Value *call_intrin(const llvm::Type *t, int intrin_lanes,
const std::string &name, std::vector<llvm::Value *>,
bool scalable_vector_result = false, bool is_reduction = false);
llvm::Value *call_intrin(const llvm::Type *t, int intrin_lanes,
llvm::Function *intrin, std::vector<llvm::Value *>,
bool is_reduction = false);
// @}
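/** E.g. applying a hypothetical 4-lane intrinsic to a 16-lane expression
 * `arg`; the argument is sliced into four 4-lane pieces and the results
 * are concatenated back into a 16-lane value:
 * \code
 * value = call_intrin(UInt(16, 16), 4, "some_intrin_x4", {arg});
 * \endcode
 */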
/** Take a slice of lanes out of an llvm vector. Pads with undefs
* if you ask for more lanes than the vector has. */
virtual llvm::Value *slice_vector(llvm::Value *vec, int start, int extent);
/** Use an arithmetic fence to prevent LLVM from fusing operations
* across this barrier. Works by bitcasting to float, applying
* llvm.arithmetic.fence, and bitcasting back. */
virtual llvm::Value *optimization_fence(llvm::Value *);
/** Concatenate a bunch of llvm vectors. Must be of the same type. */
virtual llvm::Value *concat_vectors(const std::vector<llvm::Value *> &);
/** Reverse elements in a vector */
llvm::Value *reverse_vector(llvm::Value *vec);
/** Create an LLVM shuffle vectors instruction. Takes a combination of
* fixed or scalable vectors as input, so long as the effective lengths match,
* but always returns a fixed vector. */
virtual llvm::Value *shuffle_vectors(llvm::Value *a, llvm::Value *b,
const std::vector<int> &indices);
/** Shorthand for shuffling a single vector. */
llvm::Value *shuffle_vectors(llvm::Value *v, const std::vector<int> &indices);
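/** For instance, extracting the even lanes of a hypothetical 8-lane
 * vector `v`:
 * \code
 * llvm::Value *evens = shuffle_vectors(v, {0, 2, 4, 6});
 * \endcode
 */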
bool is_power_of_two(int x) const;
bool is_scalable_vector(llvm::Value *v) const;
/** Go looking for a vector version of a runtime function. Will
* return the best match. Matches in the following order:
*
* 1) The requested vector width.
*
* 2) The width which is the smallest power of two
* greater than or equal to the vector width.
*
* 3) All the factors of the width from 2) that are greater than one, in decreasing order.
*
* 4) The smallest power of two not yet tried.
*
* So for a 5-wide vector, it tries: 5, 8, 4, 2, 16.
*
* If there's no match, returns (nullptr, 0).
*/
std::pair<llvm::Function *, int> find_vector_runtime_function(const std::string &name, int lanes);
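/** A sketch of its use for a hypothetical runtime function "fast_op",
 * where `op` stands for the node being codegenned:
 * \code
 * std::pair<llvm::Function *, int> vec_fn =
 *     find_vector_runtime_function("fast_op", op->type.lanes());
 * if (vec_fn.first) {
 *     // Slice the call into pieces of the matched width.
 *     value = call_intrin(op->type, vec_fn.second, vec_fn.first, {op->args[0]});
 * }
 * \endcode
 */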
virtual bool supports_atomic_add(const Type &t) const;
/** Compile a horizontal reduction that starts with an explicit
* initial value. There are lots of complex ways to peephole
* optimize this pattern, especially with the proliferation of
* dot-product instructions, and they can usefully share logic
* across backends. */
virtual void codegen_vector_reduce(const VectorReduce *op, const Expr &init);
/** Are we inside an atomic node that uses mutex locks?
This is used for detecting deadlocks from nested atomics & illegal vectorization. */
bool inside_atomic_mutex_node = false;
/** Emit atomic store instructions? */
bool emit_atomic_stores = false;
/** Can we call this operation with float16 type?
 This is used to avoid "emulated" equivalent code-gen when the target has the FP16 feature. */
virtual bool supports_call_as_float16(const Call *op) const;
/** call_intrin does far too much to be useful and generally breaks things
* when one has carefully set things up for a specific architecture. This
* just does the bare minimum. call_intrin should be refactored and could
* call this, possibly with renaming of the methods. */
llvm::Value *simple_call_intrin(const std::string &intrin,
const std::vector<llvm::Value *> &args,
llvm::Type *result_type);
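/** Sketch: a population count over a hypothetical <4 x i32> value `v`,
 * using LLVM's llvm.ctpop intrinsic with explicit type mangling:
 * \code
 * value = simple_call_intrin("llvm.ctpop.v4i32", {v}, v->getType());
 * \endcode
 */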
/** Ensure that a vector value is either fixed or vscale, as needed to
 * match desired_type. */
llvm::Value *normalize_fixed_scalable_vector_type(llvm::Type *desired_type, llvm::Value *result);
/** Convert between two LLVM vector values that may differ in both
 * scalability (fixed vs. scalable) and size. Used to handle converting
 * to/from fixed vectors that are smaller than the minimum-size scalable
 * vector. */
llvm::Value *convert_fixed_or_scalable_vector_type(llvm::Value *arg,
llvm::Type *desired_type);
/** Convert an LLVM fixed vector value to the corresponding vscale vector value. */
llvm::Value *fixed_to_scalable_vector_type(llvm::Value *fixed);
/** Convert an LLVM vscale vector value to the corresponding fixed vector value. */
llvm::Value *scalable_to_fixed_vector_type(llvm::Value *scalable);
/** Work around LLVM's inability to lower vector insert/extract for i1
* element types (getVectorSubVecPointer computes byte offsets via integer
* division, truncating for i1: 1/8=0). Widens the i1 vector arg to i8,
* applies fn to the widened value, and truncates the result back to
* result_i1_type. */
llvm::Value *handle_bool_as_i8(llvm::Value *arg, llvm::VectorType *result_i1_type,
const std::function<llvm::Value *(llvm::Value *)> &fn);
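/** Sketch: slicing a hypothetical i1 vector `mask` through the i8
 * workaround (`half_mask_type` is the desired i1 result type):
 * \code
 * value = handle_bool_as_i8(mask, half_mask_type, [&](llvm::Value *w) {
 *     return slice_vector(w, 0, 4);  // operates on the widened i8 vector
 * });
 * \endcode
 */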
/** Get number of vector elements, taking into account scalable vectors. Returns 1 for scalars. */
// @{
int get_vector_num_elements(const llvm::Type *t);
int get_vector_num_elements(const llvm::Value *v);
// @}
/** Interface to abstract vector code generation, as LLVM now provides
 * multiple options for expressing even simple vector operations:
 * traditional fixed-length vectors, vscale-based variable-length
 * vectors, and the vector-predicate-based approach, where an explicit
 * length is passed with each instruction.
 */
// @{
enum class VectorTypeConstraint {
None,   ///< Use the default for the current target.
Fixed,  ///< Force use of fixed-size vectors.
VScale, ///< Force use of scalable vectors.
};
llvm::Type *get_vector_type(llvm::Type *, int n,
VectorTypeConstraint type_constraint = VectorTypeConstraint::None) const;
// @}
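/** E.g. a four-lane i32 vector type, forced to be fixed-width:
 * \code
 * llvm::Type *v4i32 = get_vector_type(i32_t, 4, VectorTypeConstraint::Fixed);
 * \endcode
 */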
llvm::Constant *get_splat(int lanes, llvm::Constant *value,
VectorTypeConstraint type_constraint = VectorTypeConstraint::None) const;
/** Make sure a value type has the same scalable/fixed vector type as a guide. */
// @{
llvm::Value *match_vector_type_scalable(llvm::Value *value, VectorTypeConstraint constraint);
llvm::Value *match_vector_type_scalable(llvm::Value *value, llvm::Type *guide);
llvm::Value *match_vector_type_scalable(llvm::Value *value, llvm::Value *guide);
// @}
/** Support for generating LLVM vector predication intrinsics
* ("@llvm.vp.*" and "@llvm.experimental.vp.*")
*/
// @{
/** Struct to hold descriptor for an argument to a vector
* predicated intrinsic. This includes the value, whether the
* type of the argument should be mangled into the intrinsic name
* and if so, where, and the alignment for pointer arguments. */
struct VPArg {
llvm::Value *value;
// If provided, put argument's type into the intrinsic name via LLVM IR type mangling.
std::optional<size_t> mangle_index;
int alignment;
VPArg(llvm::Value *value, std::optional<size_t> mangle_index = std::nullopt, int32_t alignment = 0)
: value(value), mangle_index(mangle_index), alignment(alignment) {
}
};
/** Type indicating an intrinsic does not take a mask. */
struct NoMask {
};
/** Type indicating mask to use is all true -- all lanes enabled. */
struct AllEnabledMask {
};
/** Predication mask using the above two types for special cases
* and an llvm::Value for the general one. */
using MaskVariant = std::variant<NoMask, AllEnabledMask, llvm::Value *>;
/** Generate a vector predicated comparison intrinsic call if
 * use_llvm_vp_intrinsics is true and result_type is a vector
 * type. If an instruction is generated, assigns the result of the
 * vp intrinsic to value and returns true; otherwise returns
 * false. */
bool try_vector_predication_comparison(const std::string &name, const Type &result_type,
MaskVariant mask, llvm::Value *a, llvm::Value *b,
const char *cmp_op);
struct VPResultType {
llvm::Type *type;
std::optional<size_t> mangle_index;
VPResultType(llvm::Type *type, std::optional<size_t> mangle_index = std::nullopt)
: type(type), mangle_index(mangle_index) {
}
};
/** Generate an intrinsic call if use_llvm_vp_intrinsics is true
 * and length is greater than 1. If an instruction is generated,
 * assigns the result of the vp intrinsic to value and returns
 * true; otherwise returns false. */
bool try_vector_predication_intrinsic(const std::string &name, VPResultType result_type,
int32_t length, MaskVariant mask, std::vector<VPArg> args);
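/** A sketch of emitting a predicated add over hypothetical llvm::Value
 * operands `a` and `b` with `lanes` active lanes. The mangle index here
 * is illustrative; the right placement depends on the particular
 * intrinsic's mangling rules:
 * \code
 * std::vector<VPArg> args{VPArg(a), VPArg(b)};
 * if (!try_vector_predication_intrinsic("llvm.vp.add",
 *                                       VPResultType(a->getType(), 0),
 *                                       lanes, AllEnabledMask(), args)) {
 *     value = builder->CreateAdd(a, b);  // fall back to ordinary codegen
 * }
 * \endcode
 */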
/** Controls use of vector predicated intrinsics for vector operations.
* Will be set by certain backends (e.g. RISC V) to control codegen. */
bool use_llvm_vp_intrinsics = false;
// @}
/** Generate a basic dense vector load, with an optional predicate and
* control over whether or not we should slice the load into native
* vectors. Used by CodeGen_ARM to help with vld2/3/4 emission. */
llvm::Value *codegen_dense_vector_load(const Load *load, llvm::Value *vpred = nullptr, bool slice_to_native = true);
/** Warning messages which we want to avoid displaying more than once */
enum class WarningKind {
EmulatedFloat16,
};
std::map<WarningKind, std::string> onetime_warnings;
private:
/** All the values in scope at the current code location during
* codegen. Use sym_push and sym_pop to access. */
Scope<llvm::Value *> symbol_table;
/** String constants already emitted to the module. Tracked to
* prevent emitting the same string many times. */
std::map<std::string, llvm::Constant *> string_constants;
/** A basic block to branch to on error that triggers all
* destructors. As destructors are registered, code gets added
* to this block. */
llvm::BasicBlock *destructor_block = nullptr;
/** Turn off all unsafe math flags in scopes while this is set. */
bool strict_float;
/** Use the LLVM large code model when this is set. */
bool llvm_large_code_model;
/** Cache the result of target_vscale from the architecture-specific
 * implementation, as it is used on every Halide-to-LLVM type conversion.
 */
int effective_vscale = 0;
/** Assign a unique ID to each producer-consumer and for-loop node. The IDs
* are printed as comments in assembly and used to link visualizations with
* the generated assembly code within `StmtToViz`
*/
int producer_consumer_id = 0;
int for_loop_id = 0;
/** Embed an instance of halide_filter_metadata_t in the code, using
* the given name (by convention, this should be ${FUNCTIONNAME}_metadata)
* with extern "C" linkage. Note that the return value is a function-returning-
* pointer-to-constant-data.
*/
llvm::Function *embed_metadata_getter(const std::string &metadata_getter_name,
const std::string &function_name, const std::vector<LoweredArgument> &args,
const MetadataNameMap &metadata_name_map);
/** Embed a constant expression as a global variable. */
llvm::Constant *embed_constant_expr(Expr e, llvm::Type *t);
llvm::Constant *embed_constant_scalar_value_t(const Expr &e);
llvm::Function *add_argv_wrapper(llvm::Function *fn, const std::string &name,
bool result_in_argv, std::vector<bool> &arg_is_buffer);
llvm::Value *codegen_vector_load(const Type &type, const std::string &name, const Expr &base,
const Buffer<> &image, const Parameter ¶m, const ModulusRemainder &alignment,
llvm::Value *vpred = nullptr, bool slice_to_native = true, llvm::Value *stride = nullptr);
virtual void codegen_predicated_load(const Load *op);
virtual void codegen_predicated_store(const Store *op);
void codegen_atomic_rmw(const Store *op);
void init_codegen(const std::string &name);
std::unique_ptr<llvm::Module> finish_codegen();
/** A helper routine for generating folded vector reductions. */
template<typename Op>
bool try_to_fold_vector_reduce(const Expr &a, Expr b);
/** Records the StructType for pointer values returned from
* make_struct intrinsic. Required for opaque pointer support.
* This map should never grow without bound as each entry
* represents a unique struct type created by a closure or similar.
*/
std::map<llvm::Value *, llvm::Type *> struct_type_recovery;
};
} // namespace Internal
/** Given a Halide module, generate an llvm::Module. */
std::unique_ptr<llvm::Module> codegen_llvm(const Module &module,
llvm::LLVMContext &context);
} // namespace Halide
#endif