codegen: fix ARM SVE2 lowering and i1 vector handling

alexreinking · alexreinking · commit 9372a802bfb3 · 2026-03-05T08:32:25.000-05:00
diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp
@@ -56,6 +56,10 @@ Target complete_arm_target(Target t) {
         }
     };
 
+    // ARMFp16 implies ARMv8.2-A; we don't know of any devices where
+    // that doesn't hold. The cascade loop below will set ARMv81a and ARMv8a.
+    add_implied_feature_if_supported(t, Target::ARMFp16, Target::ARMv82a);
+
     constexpr int num_arm_v8_features = 10;
     static const Target::Feature arm_v8_features[num_arm_v8_features] = {
         Target::ARMv89a,
@@ -1681,6 +1685,7 @@ void CodeGen_ARM::visit(const Store *op) {
                 vpred_val = convert_fixed_or_scalable_vector_type(vpred_val, pred_type);
                 if (is_predicated_store) {
                     Value *sliced_store_vpred_val = slice_vector(store_pred_val, i, natural_lanes);
+                    sliced_store_vpred_val = convert_fixed_or_scalable_vector_type(sliced_store_vpred_val, pred_type);
                     vpred_val = builder->CreateAnd(vpred_val, sliced_store_vpred_val);
                 }
 
@@ -1854,6 +1859,7 @@ void CodeGen_ARM::visit(const Load *op) {
                 Value *vpred_val = codegen(vpred);
                 if (is_predicated_load) {
                     Value *sliced_load_vpred_val = slice_vector(load_pred_val, i, natural_lanes);
+                    sliced_load_vpred_val = convert_fixed_or_scalable_vector_type(sliced_load_vpred_val, vpred_val->getType());
                     vpred_val = builder->CreateAnd(vpred_val, sliced_load_vpred_val);
                 }
 
@@ -1904,8 +1910,14 @@ Value *CodeGen_ARM::interleave_vectors(const std::vector<Value *> &vecs) {
         return CodeGen_Posix::interleave_vectors(vecs);
     }
 
-    // Lower into llvm.vector.interleave intrinsic
+    // Lower into llvm.vector.interleave intrinsic.
+    // LLVM only supports non-power-of-2 strides (e.g. 3) for scalable
+    // vectors starting in LLVM 22.
+#if LLVM_VERSION >= 220
     const std::set<int> supported_strides{2, 3, 4, 8};
+#else
+    const std::set<int> supported_strides{2, 4, 8};
+#endif
     const int stride = vecs.size();
     const int src_lanes = get_vector_num_elements(vecs[0]->getType());
 
@@ -1957,7 +1969,11 @@ Value *CodeGen_ARM::shuffle_vectors(Value *a, Value *b, const std::vector<int> &
         }
 
         // Lower slice with stride into llvm.vector.deinterleave intrinsic
+#if LLVM_VERSION >= 220
         const std::set<int> supported_strides{2, 3, 4, 8};
+#else
+        const std::set<int> supported_strides{2, 4, 8};
+#endif
         if (supported_strides.find(slice_stride) != supported_strides.end() &&
             dst_lanes * slice_stride == src_lanes &&
             indices.front() < slice_stride &&  // Start position cannot be larger than stride
@@ -2410,6 +2426,10 @@ string CodeGen_ARM::mcpu_target() const {
     if (target.bits == 32) {
         if (target.has_feature(Target::ARMv7s)) {
             return "swift";
+        } else if (target.has_feature(Target::ARMv82a)) {
+            return "cortex-a55";
+        } else if (target.has_feature(Target::ARMv8a)) {
+            return "cortex-a32";
         } else {
             return "cortex-a9";
         }
@@ -2436,7 +2456,10 @@ string CodeGen_ARM::mattrs() const {
         attrs.emplace_back("+fullfp16");
     }
     if (target.has_feature(Target::ARMv8a)) {
-        attrs.emplace_back("+v8a");
+        // The ARM (32-bit) backend calls this feature "v8"; the AArch64
+        // backend calls it "v8a". The dotted sub-versions (v8.1a, v8.2a,
+        // etc.) use the same names in both backends.
+        attrs.emplace_back(target.bits == 32 ? "+v8" : "+v8a");
     }
     if (target.has_feature(Target::ARMv81a)) {
         attrs.emplace_back("+v8.1a");
diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp
@@ -1515,6 +1515,17 @@ void CodeGen_LLVM::visit(const Reinterpret *op) {
         llvm::Type *llvm_dst_fixed = get_vector_type(llvm_type_of(dst.element_of()), dst.lanes(), VectorTypeConstraint::Fixed);
         value = builder->CreateBitOrPointerCast(value, llvm_dst_fixed);
         value = fixed_to_scalable_vector_type(value);
+    } else if (isa<FixedVectorType>(value->getType()) && isa<ScalableVectorType>(llvm_dst)) {
+        // Cannot bitcast/ptrtoint directly between fixed and scalable vectors.
+        // First cast to a fixed vector of the destination element type, then convert to scalable.
+        llvm::Type *llvm_dst_fixed = get_vector_type(llvm_dst->getScalarType(), dst.lanes(), VectorTypeConstraint::Fixed);
+        value = builder->CreateBitOrPointerCast(value, llvm_dst_fixed);
+        value = fixed_to_scalable_vector_type(value);
+    } else if (isa<ScalableVectorType>(value->getType()) && isa<FixedVectorType>(llvm_dst)) {
+        // Cannot bitcast/ptrtoint directly between scalable and fixed vectors.
+        // First convert to a fixed vector of the source element type, then cast.
+        value = scalable_to_fixed_vector_type(value);
+        value = builder->CreateBitOrPointerCast(value, llvm_dst);
     } else {
         // Our `Reinterpret` expr directly maps to LLVM IR bitcast/ptrtoint/inttoptr
         // instructions with no additional handling required:
@@ -4314,10 +4325,12 @@ void CodeGen_LLVM::codegen_vector_reduce(const VectorReduce *op, const Expr &ini
         const int input_lanes = val.type().lanes();
         const int input_bytes = input_lanes * val.type().bytes();
         const int vscale = std::max(effective_vscale, 1);
+        // LLVM added VECREDUCE_MUL/FMUL lowering for SVE in LLVM 22.
+        const bool mul_ok = LLVM_VERSION >= 220 || effective_vscale == 0;
         const bool llvm_has_intrinsic =
             // Must be one of these ops
             ((op->op == VectorReduce::Add ||
-              op->op == VectorReduce::Mul ||
+              (op->op == VectorReduce::Mul && mul_ok) ||
               op->op == VectorReduce::Min ||
               op->op == VectorReduce::Max) &&
              (use_llvm_vp_intrinsics ||
@@ -4920,6 +4933,13 @@ Value *CodeGen_LLVM::slice_vector(Value *vec, int start, int size) {
         // otherwise.
         llvm::Type *scalar_type = vec->getType()->getScalarType();
 
+        if (scalar_type->isIntegerTy(1)) {
+            auto *result_type = cast<VectorType>(get_vector_type(scalar_type, size / effective_vscale, VectorTypeConstraint::VScale));
+            return handle_bool_as_i8(vec, result_type, [&](Value *v) {
+                return slice_vector(v, start, size);
+            });
+        }
+
         int intermediate_lanes = std::min(size, vec_lanes - start);
         llvm::Type *intermediate_type = get_vector_type(scalar_type, intermediate_lanes, VectorTypeConstraint::Fixed);
 
@@ -5190,6 +5210,18 @@ llvm::Value *CodeGen_LLVM::match_vector_type_scalable(llvm::Value *value, llvm::
     return match_vector_type_scalable(value, guide->getType());
 }
 
+llvm::Value *CodeGen_LLVM::handle_bool_as_i8(llvm::Value *arg, llvm::VectorType *result_i1_type,
+                                             const std::function<llvm::Value *(llvm::Value *)> &fn) {
+    // Build the i8 twin of arg's type: same lane count, same fixed/scalable kind.
+    auto *src_ty = cast<llvm::VectorType>(arg->getType());
+    const auto *vscale_ty = dyn_cast<llvm::ScalableVectorType>(src_ty);
+    const int lanes = vscale_ty ? vscale_ty->getMinNumElements() : cast<llvm::FixedVectorType>(src_ty)->getNumElements();
+    llvm::Type *src_as_i8 = get_vector_type(i8_t, lanes, vscale_ty ? VectorTypeConstraint::VScale : VectorTypeConstraint::Fixed);
+    // Zero-extend i1 -> i8 going in, run the caller's operation, truncate i8 -> i1 coming out.
+    llvm::Value *as_i8 = fn(builder->CreateZExt(arg, src_as_i8));
+    return builder->CreateTrunc(as_i8, result_i1_type);
+}
+
 llvm::Value *CodeGen_LLVM::convert_fixed_or_scalable_vector_type(llvm::Value *arg,
                                                                  llvm::Type *desired_type) {
     llvm::Type *arg_type = arg->getType();
@@ -5199,6 +5231,18 @@ llvm::Value *CodeGen_LLVM::convert_fixed_or_scalable_vector_type(llvm::Value *ar
     }
 
     internal_assert(arg_type->getScalarType() == desired_type->getScalarType());
+
+    if (arg_type->isVectorTy() && desired_type->isVectorTy() &&
+        arg_type->getScalarType()->isIntegerTy(1)) {
+        bool dst_scalable = isa<llvm::ScalableVectorType>(desired_type);
+        int dst_elts = get_vector_num_elements(desired_type);
+        llvm::Type *dst_i8 = get_vector_type(i8_t, dst_scalable ? dst_elts / effective_vscale : dst_elts,
+                                             dst_scalable ? VectorTypeConstraint::VScale : VectorTypeConstraint::Fixed);
+        return handle_bool_as_i8(arg, cast<VectorType>(desired_type), [&](Value *v) {
+            return convert_fixed_or_scalable_vector_type(v, dst_i8);
+        });
+    }
+
     if (!arg_type->isVectorTy()) {
         arg = create_broadcast(arg, 1);
         arg_type = arg->getType();
@@ -5280,6 +5324,12 @@ llvm::Value *CodeGen_LLVM::fixed_to_scalable_vector_type(llvm::Value *fixed_arg)
     internal_assert(fixed_type->getElementType() == scalable_type->getElementType());
     internal_assert(lanes == (scalable_type->getMinNumElements() * effective_vscale));
 
+    if (fixed_type->getElementType()->isIntegerTy(1)) {
+        return handle_bool_as_i8(fixed_arg, scalable_type, [&](Value *v) {
+            return fixed_to_scalable_vector_type(v);
+        });
+    }
+
     // E.g. <vscale x 2 x i64> llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64>, <4 x i64>, i64)
     const char *type_designator;
     if (fixed_type->getElementType()->isIntegerTy()) {
@@ -5297,7 +5347,7 @@ llvm::Value *CodeGen_LLVM::fixed_to_scalable_vector_type(llvm::Value *fixed_arg)
 
     std::vector<llvm::Value *> args;
     args.push_back(result_vec);
-    args.push_back(value);
+    args.push_back(fixed_arg);
     args.push_back(ConstantInt::get(i64_t, 0));
 
     return simple_call_intrin(intrin, args, scalable_type);
@@ -5316,6 +5366,12 @@ llvm::Value *CodeGen_LLVM::scalable_to_fixed_vector_type(llvm::Value *scalable_a
     internal_assert(fixed_type->getElementType() == scalable_type->getElementType());
     internal_assert(fixed_type->getNumElements() == (scalable_type->getMinNumElements() * effective_vscale));
 
+    if (scalable_type->getElementType()->isIntegerTy(1)) {
+        return handle_bool_as_i8(scalable_arg, fixed_type, [&](Value *v) {
+            return scalable_to_fixed_vector_type(v);
+        });
+    }
+
     // E.g. <64 x i8> @llvm.vector.extract.v64i8.nxv8i8(<vscale x 8 x i8> %vresult, i64 0)
     const char *type_designator;
     if (scalable_type->getElementType()->isIntegerTy()) {
diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h
@@ -31,8 +31,10 @@ class NamedMDNode;
 class DataLayout;
 class BasicBlock;
 class GlobalVariable;
+class VectorType;
 }  // namespace llvm
 
+#include <functional>
 #include <map>
 #include <memory>
 #include <optional>
@@ -589,6 +591,14 @@ class CodeGen_LLVM : public IRVisitor {
     /** Convert an LLVM vscale vector value to the corresponding fixed vector value. */
     llvm::Value *scalable_to_fixed_vector_type(llvm::Value *scalable);
 
+    /** Apply fn to an i1 vector via i8. Works around LLVM's inability to lower
+     * vector insert/extract for i1 elements (getVectorSubVecPointer computes
+     * byte offsets via integer division, which truncates for i1: 1/8=0).
+     * Zero-extends arg to an i8 vector of the same lane count and kind, calls
+     * fn on it, and returns fn's result truncated to result_i1_type. */
+    llvm::Value *handle_bool_as_i8(llvm::Value *arg, llvm::VectorType *result_i1_type,
+                                   const std::function<llvm::Value *(llvm::Value *)> &fn);
+
     /** Get number of vector elements, taking into account scalable vectors. Returns 1 for scalars. */
     int get_vector_num_elements(const llvm::Type *t);
 
diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt
@@ -319,6 +319,8 @@ tests(GROUPS correctness
       strict_float.cpp
       strict_float_bounds.cpp
       strided_load.cpp
+      sve_codegen_predicated.cpp
+      sve_codegen_reinterpret.cpp
       target.cpp
       target_query.cpp
       tiled_matmul.cpp
diff --git a/test/correctness/sve_codegen_predicated.cpp b/test/correctness/sve_codegen_predicated.cpp
@@ -0,0 +1,26 @@
+#include "Halide.h"
+#include "halide_test_dirs.h"
+
+#include <cstdio>
+#include <string>
+
+using namespace Halide;
+
+int main(int argc, char **argv) {
+    // Regression test: a dense store whose width is not a multiple of the
+    // natural vector width needs a tail predicate. That predicate is an i1
+    // vector converted from fixed to scalable, which used to hit an LLVM
+    // assertion in getVectorSubVecPointer ("Converting bits to bytes lost
+    // precision") because the byte offset truncates for i1 (1/8=0).
+    const Target sve2_target("arm-64-linux-arm_dot_prod-arm_fp16-sve2-vector_bits_128");
+    const std::string object_path = Internal::get_test_tmp_dir() + "sve_dense_pred_store.o";
+
+    Func doubled("dense_pred_store");
+    Var i("x");
+    doubled(i) = cast<uint8_t>(i * 2);
+    doubled.vectorize(i, 24);  // 16 uint8 lanes are natural at 128-bit SVE; 24 is not a multiple
+    doubled.compile_to_object(object_path, {}, "dense_pred_store", sve2_target);
+
+    printf("Success!\n");
+    return 0;
+}
diff --git a/test/correctness/sve_codegen_reinterpret.cpp b/test/correctness/sve_codegen_reinterpret.cpp
@@ -0,0 +1,32 @@
+#include "Halide.h"
+#include "halide_test_dirs.h"
+
+#include <cstdio>
+#include <string>
+
+using namespace Halide;
+
+int main(int argc, char **argv) {
+    // Regression test: vectorized Reinterpret between a Handle (pointer) and
+    // an integer type. Pointers produce fixed vectors (<4 x ptr>) while the
+    // integer side may be scalable (<vscale x 4 x i64>), so a fixed/scalable
+    // conversion has to happen before the cast. This used to fail in
+    // ConstantExpr::getCast ("Invalid constantexpr cast!"):
+    // CreateBitOrPointerCast cannot mix fixed and scalable vector types, and
+    // fixed_to_scalable_vector_type handed llvm.vector.insert the wrong value.
+    const Target sve2_target("arm-64-linux-arm_dot_prod-arm_fp16-sve2-vector_bits_128");
+    const std::string object_path = Internal::get_test_tmp_dir() + "sve_handle_cast.o";
+    std::string text = "hello!\n";
+
+    Func pointers("handle_cast"), copied("copy"), result("out");
+    Var i("x");
+    pointers(i) = cast<char *>(text);
+    pointers.compute_root().vectorize(i, 4);
+    copied(i) = pointers(i);
+    copied.compute_root();
+    result(i) = copied(i);
+    result.compile_to_object(object_path, {}, "handle_cast", sve2_target);
+
+    printf("Success!\n");
+    return 0;
+}