diff --git a/Makefile b/Makefile
index 4e5cb36364f8..7edddd719f81 100644
--- a/Makefile
+++ b/Makefile
@@ -491,6 +491,7 @@ SOURCE_FILES = \
   Debug.cpp \
   DebugArguments.cpp \
   DebugToFile.cpp \
+  DecomposeVectorShuffle.cpp \
   Definition.cpp \
   Deinterleave.cpp \
   Derivative.cpp \
@@ -687,6 +688,7 @@ HEADER_FILES = \
   Debug.h \
   DebugArguments.h \
   DebugToFile.h \
+  DecomposeVectorShuffle.h \
   Definition.h \
   Deinterleave.h \
   Derivative.h \
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 036b92651667..63297410f2ce 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -95,6 +95,7 @@ target_sources(
     Debug.h
     DebugArguments.h
     DebugToFile.h
+    DecomposeVectorShuffle.h
     Definition.h
     Deinterleave.h
     Derivative.h
@@ -279,6 +280,7 @@ target_sources(
     Debug.cpp
     DebugArguments.cpp
    DebugToFile.cpp
+    DecomposeVectorShuffle.cpp
     Definition.cpp
     Deinterleave.cpp
     Derivative.cpp
diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp
index 7178e82965d8..30ae3249fca9 100644
--- a/src/CodeGen_ARM.cpp
+++ b/src/CodeGen_ARM.cpp
@@ -6,6 +6,7 @@
 #include "CodeGen_Posix.h"
 #include "ConciseCasts.h"
 #include "Debug.h"
+#include "DecomposeVectorShuffle.h"
 #include "DistributeShifts.h"
 #include "IREquality.h"
 #include "IRMatch.h"
@@ -209,8 +210,25 @@ class CodeGen_ARM : public CodeGen_Posix {
     void visit(const Call *) override;
     void visit(const LT *) override;
     void visit(const LE *) override;
+
+    llvm::Type *get_vector_type_from_value(llvm::Value *vec_or_scalar, int n);
+    Value *concat_vectors(const std::vector<Value *> &) override;
+    Value *slice_vector(Value *vec, int start, int extent) override;
+    Value *create_undef_vector_like(Value *ref_vec, int lanes);
+
+    /** Extract a sub-vector from a vector; every element of the sub-vector must come from the src vector.
+     * Specialized for scalable vectors. */
+    Value *extract_scalable_vector(Value *vec, int start, int extract_size);
+
+    /** Insert a vector at the "start" position of a base vector.
+     * Specialized for scalable vectors. */
+    Value *insert_scalable_vector(Value *base_vec, Value *new_vec, int start);
+
     Value *interleave_vectors(const std::vector<Value *> &) override;
     Value *shuffle_vectors(Value *a, Value *b, const std::vector<int> &indices) override;
+    Value *shuffle_scalable_vectors_general(Value *a, Value *b, const std::vector<int> &indices);
+    Value *codegen_shuffle_indices(int bits, const std::vector<int> &indices);
+    Value *codegen_whilelt(int total_lanes, int start, int end);
     void codegen_vector_reduce(const VectorReduce *, const Expr &) override;
     bool codegen_dot_product_vector_reduce(const VectorReduce *, const Expr &);
     bool codegen_pairwise_vector_reduce(const VectorReduce *, const Expr &);
@@ -231,6 +249,7 @@ class CodeGen_ARM : public CodeGen_Posix {
     };
     vector<Pattern> casts, calls, negations;
 
+    int natural_vector_size(const Halide::Type &t) const;
     string mcpu_target() const override;
     string mcpu_tune() const override;
     string mattrs() const override;
@@ -261,6 +280,8 @@ class CodeGen_ARM : public CodeGen_Posix {
             return Shuffle::make_concat({const_true(true_lanes), const_false(false_lanes)});
         }
     }
+
+    friend struct DecomposeVectorShuffle<CodeGen_ARM, llvm::Value *>;
 };
 
 CodeGen_ARM::CodeGen_ARM(const Target &target)
@@ -1901,11 +1922,224 @@ void CodeGen_ARM::visit(const Shuffle *op) {
         value = codegen_dense_vector_load(load, nullptr, /* slice_to_native */ false);
         value = CodeGen_Posix::shuffle_vectors(value, op->indices);
-    } else {
+        return;
+    }
+
+    if (target_vscale() == 0) {
         CodeGen_Posix::visit(op);
+        return;
+    }
+
+    const int total_lanes = op->type.lanes();
+    if (op->type.bits() == 1) {
+        // Peephole pattern that matches the SVE "whilelt" instruction, which represents a
+        // particular vector-predicate pattern, e.g. 11100000 (active_lanes=3, all_lanes=8)
+        if (op->is_concat() && op->vectors.size() == 2 &&
+            op->type.is_int_or_uint() &&
+            is_power_of_two(total_lanes) &&
+            total_lanes >= 2 * target_vscale() && total_lanes <= 16 * target_vscale() &&
+            is_const_one(op->vectors[0]) && is_const_zero(op->vectors[1])) {
+
+            int active_lanes = op->vectors[0].type().lanes();
+            value = codegen_whilelt(op->type.lanes(), 0, active_lanes);
+            return;
+        } else {
+            // Rewrite to process the 1-bit vector as an 8-bit vector, then cast back.
+            std::vector<Expr> vecs_i8;
+            vecs_i8.reserve(op->vectors.size());
+            for (const auto &vec_i1 : op->vectors) {
+                Type upgraded_type = vec_i1.type().with_bits(8);
+                vecs_i8.emplace_back(Cast::make(upgraded_type, vec_i1));
+            }
+            Expr equiv = Shuffle::make(vecs_i8, op->indices);
+            equiv = Cast::make(op->type, equiv);
+            equiv = common_subexpression_elimination(equiv);
+            value = codegen(equiv);
+            return;
+        }
+    } else if (op->is_concat() && op->vectors.size() == 2) {
+        // Here we deal with some specific patterns of concat(a, b).
+        // Everything else is decomposed by CodeGen_LLVM first,
+        // which in turn calls CodeGen_ARM::concat_vectors().
+
+        if (const Broadcast *bc_1 = op->vectors[1].as<Broadcast>()) {
+            // Common pattern where padding is appended to align lanes.
+            // Create a broadcast of the padding value with dst lanes, then insert vectors[0] at lane 0.
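+            // Illustrative walk-through with hypothetical values: for
+            // concat(v5, broadcast(0.f, 3)) producing 8 output lanes, the code
+            // below splats 0.f across all 8 lanes and then inserts the 5
+            // payload lanes of v5 at lane 0.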
+            Value *val_0 = codegen(op->vectors[0]);
+            Value *val_1_scalar = codegen(bc_1->value);
+            Value *padding = builder->CreateVectorSplat(llvm::ElementCount::getScalable(total_lanes / target_vscale()), val_1_scalar);
+            value = insert_scalable_vector(padding, val_0, 0);
+            return;
+        }
+    }
+
+    CodeGen_Posix::visit(op);
+}
+
+llvm::Type *CodeGen_ARM::get_vector_type_from_value(Value *vec_or_scalar, int n) {
+    llvm::Type *t = vec_or_scalar->getType();
+    llvm::Type *elt = t->isVectorTy() ? get_vector_element_type(t) : t;
+    return CodeGen_Posix::get_vector_type(elt, n);
+}
+
+Value *CodeGen_ARM::concat_vectors(const vector<Value *> &vecs) {
+    // Override only for scalable vectors, which includes
+    // the case where scalars are concatenated into a scalable vector.
+    if (target_vscale() == 0 ||
+        vecs.size() <= 1 ||
+        isa<FixedVectorType>(vecs[0]->getType())) {
+        return CodeGen_Posix::concat_vectors(vecs);
+    }
+
+    int total_lanes = 0;
+    for (auto *v : vecs) {
+        total_lanes += get_vector_num_elements(v->getType());
+    }
+
+    llvm::Type *concat_type = get_vector_type(get_vector_element_type(vecs[0]->getType()), total_lanes);
+    Value *ret = UndefValue::get(concat_type);
+    int insert_index = 0;
+    for (auto *v : vecs) {
+        ret = insert_scalable_vector(ret, v, insert_index);
+        insert_index += get_vector_num_elements(v->getType());
+    }
+    return ret;
+}
+
+Value *CodeGen_ARM::slice_vector(llvm::Value *vec, int start, int slice_size) {
+    // Override only for scalable vectors.
+    if (target_vscale() == 0 ||
+        !is_scalable_vector(vec)) {
+        return CodeGen_Posix::slice_vector(vec, start, slice_size);
+    }
+
+    const int vec_lanes = get_vector_num_elements(vec->getType());
+    if (slice_size == 1) {
+        return builder->CreateExtractElement(vec, ConstantInt::get(i64_t, start, true));
+    } else if (start == 0) {
+        if (vec_lanes == slice_size) {
+            return vec;
+        } else if (vec_lanes < slice_size) {
+            return insert_scalable_vector(UndefValue::get(get_vector_type_from_value(vec, slice_size)), vec, 0);
+        } else {
+            auto *dst_type = get_vector_type_from_value(vec, slice_size);
+            Value *val_index = ConstantInt::get(i64_t, 0, true);
+            return builder->CreateExtractVector(dst_type, vec, val_index);
+        }
+    } else {
+        const int extract_size = std::min(vec_lanes - start, slice_size);
+        Value *extracted = extract_scalable_vector(vec, start, extract_size);
+        if (slice_size == extract_size) {
+            return extracted;
+        } else {
+            Value *sliced = UndefValue::get(get_vector_type_from_value(vec, slice_size));
+            sliced = insert_scalable_vector(sliced, extracted, 0);
+            return sliced;
+        }
+    }
+}
+
+Value *CodeGen_ARM::create_undef_vector_like(Value *ref_vec, int lanes) {
+    llvm::Type *elt = get_vector_element_type(ref_vec->getType());
+    return PoisonValue::get(get_vector_type(elt, lanes));
+}
+
+Value *CodeGen_ARM::extract_scalable_vector(Value *vec, int start, int extract_size) {
+    internal_assert(target_vscale() > 0 && is_scalable_vector(vec));
+    internal_assert(start + extract_size <= get_vector_num_elements(vec->getType()));  // No overrun
+
+    if (extract_size == 1) {
+        return builder->CreateExtractElement(vec, ConstantInt::get(i64_t, start, true));
+    } else {
+        // To follow the requirement of the 'llvm.vector.extract' intrinsic that
+        // idx must be a constant multiple of the known-minimum vector length of the result type,
+        // the extraction is performed as multiple sub-extractions, where the worst case is a scalar extraction.
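+        // Worked example with hypothetical values: extracting 6 lanes starting
+        // at lane 2 decomposes into a 2-lane sub-extraction at position 2
+        // followed by a 4-lane sub-extraction at position 4, since each
+        // sub-extraction position must be a multiple of its size.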
+        std::vector<Value *> sub_slices;
+        int i = 0;
+        while (i < extract_size) {
+            int sub_extract_pos = start + i;
+            for (int sub_extract_size = extract_size - i; sub_extract_size > 0; --sub_extract_size) {
+                if (sub_extract_pos % sub_extract_size == 0) {
+                    Value *sub_extracted;
+                    if (sub_extract_size == 1) {
+                        sub_extracted = builder->CreateExtractElement(vec, sub_extract_pos);
+                    } else {
+                        // In vector operations, the index needs to be normalized by vscale.
+                        internal_assert(sub_extract_pos % target_vscale() == 0);
+                        Value *idx_val = ConstantInt::get(i64_t, sub_extract_pos / target_vscale(), true);
+                        llvm::Type *sub_extract_type = get_vector_type_from_value(vec, sub_extract_size);
+                        sub_extracted = builder->CreateExtractVector(sub_extract_type, vec, idx_val);
+                    }
+                    sub_slices.push_back(sub_extracted);
+
+                    i += sub_extract_size;
+                    break;
+                }
+            }
+        }
+        Value *extracted = concat_vectors(sub_slices);
+        return extracted;
+    }
+}
+
+Value *CodeGen_ARM::insert_scalable_vector(Value *base_vec, Value *new_vec, int start) {
+    const int base_lanes = get_vector_num_elements(base_vec->getType());
+    const int new_vec_lanes = get_vector_num_elements(new_vec->getType());
+    llvm::Type *element_type = get_vector_element_type(base_vec->getType());
+
+    internal_assert(start + new_vec_lanes <= base_lanes);
+
+    if (base_lanes == 1 && new_vec_lanes == 1) {
+        return new_vec;
+    }
+
+    internal_assert(target_vscale() > 0 && is_scalable_vector(base_vec));
+
+    if (!new_vec->getType()->isVectorTy()) {
+        return builder->CreateInsertElement(base_vec, new_vec, start);
+    } else if (start % new_vec_lanes == 0) {
+        // Most ordinary use cases fall into this pattern.
+        // In vector operations, the index needs to be normalized by vscale.
+        Value *val_start_index = ConstantInt::get(i64_t, start / target_vscale(), true);
+        return builder->CreateInsertVector(base_vec->getType(), base_vec, new_vec, val_start_index);
+    }
+
+    // To follow the requirement of the 'llvm.vector.insert' intrinsic that
+    // idx must be a constant multiple of subvec's known-minimum vector length,
+    // insertion is performed in multiple sub-slices.
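+    // Worked example with hypothetical values: inserting an 8-lane vector at
+    // start == 4 proceeds as two 4-lane sub-inserts (lanes 0..3 of new_vec go
+    // to lane 4, lanes 4..7 to lane 8), because both the extract and insert
+    // positions must be multiples of the sub-slice size.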
+    Value *ret = base_vec;
+    int extract_index = 0;
+    int insert_index = start;
+    int sub_slice_size = std::min(start, new_vec_lanes);
+
+    while (extract_index < new_vec_lanes) {
+        if (extract_index + sub_slice_size <= new_vec_lanes &&  // Condition to not overrun
+            extract_index % sub_slice_size == 0 &&              // Requirement of the LLVM intrinsic
+            insert_index % sub_slice_size == 0) {               // Requirement of the LLVM intrinsic
+
+            if (sub_slice_size == 1) {
+                Value *sub_slice = builder->CreateExtractElement(new_vec, extract_index);
+                ret = builder->CreateInsertElement(ret, sub_slice, insert_index);
+            } else {
+                // In vector operations, the index needs to be normalized by vscale.
+                internal_assert(extract_index % target_vscale() == 0);
+                internal_assert(insert_index % target_vscale() == 0);
+                Value *val_extract_index = ConstantInt::get(i64_t, extract_index / target_vscale(), true);
+                Value *val_insert_index = ConstantInt::get(i64_t, insert_index / target_vscale(), true);
+                llvm::Type *sub_sliced_type = get_vector_type(element_type, sub_slice_size);
+                Value *sub_slice = builder->CreateExtractVector(sub_sliced_type, new_vec, val_extract_index);
+                ret = builder->CreateInsertVector(base_vec->getType(), ret, sub_slice, val_insert_index);
+            }
+            insert_index += sub_slice_size;
+            extract_index += sub_slice_size;
+        } else {
+            // Move on to the next candidate size.
+            --sub_slice_size;
+        }
+    }
+    return ret;
+}
+
 Value *CodeGen_ARM::interleave_vectors(const std::vector<Value *> &vecs) {
     if (simd_intrinsics_disabled() ||
         target_vscale() == 0 ||
         vecs.size() < 2 ||
@@ -1952,56 +2186,153 @@ Value *CodeGen_ARM::shuffle_vectors(Value *a, Value *b, const std::vector<int> &indices) {
     }
 
     internal_assert(a->getType() == b->getType());
-
-    llvm::Type *elt = get_vector_element_type(a->getType());
-    const int src_lanes = get_vector_num_elements(a->getType());
+    llvm::Type *src_type = a->getType();
+    llvm::Type *elt = get_vector_element_type(src_type);
+    const int bits = elt->getScalarSizeInBits();
+    // note: lanes are multiplied by vscale
+    const int natural_lanes = natural_vector_size(Int(bits));
+    const int src_lanes = get_vector_num_elements(src_type);
     const int dst_lanes = indices.size();
 
-    // Check if deinterleaved slice
-    {
-        // Get the stride of slice
-        int slice_stride = 0;
-        const int start_index = indices[0];
-        if (dst_lanes > 1) {
-            const int stride = indices[1] - start_index;
-            bool stride_equal = true;
-            for (int i = 2; i < dst_lanes; ++i) {
-                stride_equal &= (indices[i] == start_index + i * stride);
-            }
-            slice_stride = stride_equal ? stride : 0;
-        }
-
-        // Lower slice with stride into llvm.vector.deinterleave intrinsic
+    if (src_type->isVectorTy()) {
+        // i1 -> shuffle with i8 -> i1
+        if (src_type->getScalarSizeInBits() == 1) {
+            internal_assert(src_type->isIntOrIntVectorTy()) << "a 1-bit floating-point type is unexpected\n";
+            a = builder->CreateIntCast(a, VectorType::get(i8_t, dyn_cast<VectorType>(src_type)), false);
+            b = builder->CreateIntCast(b, VectorType::get(i8_t, dyn_cast<VectorType>(src_type)), false);
+            Value *v = shuffle_vectors(a, b, indices);
+            return builder->CreateIntCast(v, VectorType::get(i1_t, dyn_cast<VectorType>(v->getType())), false);
+        }
+
+        // Check if this is a deinterleaving slice
+        {
+            // Get the stride of the slice
+            int slice_stride = 0;
+            const int start_index = indices[0];
+            if (dst_lanes > 1) {
+                const int stride = indices[1] - start_index;
+                bool stride_equal = true;
+                for (int i = 2; i < dst_lanes; ++i) {
+                    stride_equal &= (indices[i] == start_index + i * stride);
+                }
+                slice_stride = stride_equal ? stride : 0;
+            }
+
+            // Lower a strided slice into the llvm.vector.deinterleave intrinsic
 #if LLVM_VERSION >= 220
-        const std::set<int> supported_strides{2, 3, 4, 8};
+            const std::set<int> supported_strides{2, 3, 4, 8};
 #else
-        const std::set<int> supported_strides{2, 4, 8};
+            const std::set<int> supported_strides{2, 4, 8};
 #endif
-        if (supported_strides.find(slice_stride) != supported_strides.end() &&
-            dst_lanes * slice_stride == src_lanes &&
-            indices.front() < slice_stride &&  // Start position cannot be larger than stride
-            is_power_of_two(dst_lanes) &&
-            dst_lanes % target_vscale() == 0 &&
-            dst_lanes / target_vscale() > 1) {
-
-            std::string instr = concat_strings("llvm.vector.deinterleave", slice_stride, mangle_llvm_type(a->getType()));
-
-            // We cannot mix FixedVector and ScalableVector, so dst_type must be scalable
-            llvm::Type *dst_type = get_vector_type(elt, dst_lanes / target_vscale(), VectorTypeConstraint::VScale);
-            StructType *sret_type = StructType::get(*context, std::vector<llvm::Type *>(slice_stride, dst_type));
-            std::vector<llvm::Type *> arg_types{a->getType()};
-            llvm::FunctionType *fn_type = FunctionType::get(sret_type, arg_types, false);
-            FunctionCallee fn = module->getOrInsertFunction(instr, fn_type);
-
-            CallInst *deinterleave = builder->CreateCall(fn, {a});
-            // extract one element out of the returned struct
-            Value *extracted = builder->CreateExtractValue(deinterleave, indices.front());
-
-            return extracted;
+            if (supported_strides.find(slice_stride) != supported_strides.end() &&
+                dst_lanes * slice_stride == src_lanes &&
+                indices.front() < slice_stride &&  // The start position cannot be larger than the stride
+                is_power_of_two(dst_lanes) &&
+                dst_lanes % target_vscale() == 0 &&
+                dst_lanes / target_vscale() > 1) {
+
+                std::string instr = concat_strings("llvm.vector.deinterleave", slice_stride, mangle_llvm_type(a->getType()));
+
+                // We cannot mix FixedVector and ScalableVector, so dst_type must be scalable
+                llvm::Type *dst_type = get_vector_type(elt, dst_lanes / target_vscale(), VectorTypeConstraint::VScale);
+                StructType *sret_type = StructType::get(*context, std::vector<llvm::Type *>(slice_stride, dst_type));
+                std::vector<llvm::Type *> arg_types{a->getType()};
+                llvm::FunctionType *fn_type = FunctionType::get(sret_type, arg_types, false);
+                FunctionCallee fn = module->getOrInsertFunction(instr, fn_type);
+
+                CallInst *deinterleave = builder->CreateCall(fn, {a});
+                // Extract one element out of the returned struct
+                Value *extracted = builder->CreateExtractValue(deinterleave, indices.front());
+
+                return extracted;
+            }
         }
     }
 
-    return CodeGen_Posix::shuffle_vectors(a, b, indices);
+    // Perform the vector shuffle by decomposing the operation into multiple native
+    // shuffle steps; each step calls shuffle_scalable_vectors_general(), which
+    // emits a TBL/TBL2 instruction.
+    DecomposeVectorShuffle shuffler(*this, a, b, get_vector_num_elements(a->getType()), natural_lanes);
+    return shuffler.run(indices);
+}
+
+Value *CodeGen_ARM::shuffle_scalable_vectors_general(Value *a, Value *b, const std::vector<int> &indices) {
+    internal_assert(a) << "Must provide a valid vector operand";
+    internal_assert(!indices.empty()) << "Cannot shuffle with empty indices";
+
+    llvm::Type *elt = get_vector_element_type(a->getType());
+    const int bits = elt->getScalarSizeInBits();
+    const int natural_lanes = natural_vector_size(Int(bits));
+    const int src_lanes = get_vector_num_elements(a->getType());
+    const int dst_lanes = indices.size();
+    llvm::Type *dst_type = get_vector_type(elt, dst_lanes);
+
+    internal_assert(target_vscale() > 0 && is_scalable_vector(a)) << "Only deal with scalable vectors\n";
+    internal_assert(src_lanes == natural_lanes && dst_lanes == natural_lanes)
+        << "Only deal with vectors with natural_lanes\n";
+
+    // We select the TBL or TBL2 intrinsic depending on the range of the indices.
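+    // e.g. with natural_lanes == 4, indices {2, 1, 3, 0} stay within 'a' and
+    // lower to a single TBL, while {2, 5, 7, 0} also reference lanes of 'b'
+    // (index >= src_lanes) and require TBL2 (illustrative values).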
vectors\n"; + internal_assert(src_lanes == natural_lanes && dst_lanes == natural_lanes) + << "Only deal with vector with natural_lanes\n"; + + // We select TBL or TBL2 intrinsic depending on indices range + int highest_lane = *std::max_element(indices.begin(), indices.end()); + internal_assert(highest_lane >= 0) + << "highest_lane was " + << (highest_lane == SliceIndexNone ? "SliceIndexNone" : + highest_lane == SliceIndexCarryPrevResult ? "SliceIndexCarryPrevResult" : + "") + << " (" << highest_lane << ")"; + + bool use_tbl = highest_lane < src_lanes; + internal_assert(use_tbl || b) << "'b' must be valid in case of tbl2\n"; + + auto instr = concat_strings("llvm.aarch64.sve.", use_tbl ? "tbl" : "tbl2", mangle_llvm_type(dst_type)); + + Value *val_indices = codegen_shuffle_indices(bits, indices); + llvm::Type *vt_natural = get_vector_type(elt, natural_lanes); + std::vector llvm_arg_types; + std::vector llvm_arg_vals; + if (use_tbl) { + llvm_arg_types = {vt_natural, val_indices->getType()}; + llvm_arg_vals = {a, val_indices}; + } else { + llvm_arg_types = {vt_natural, vt_natural, val_indices->getType()}; + llvm_arg_vals = {a, b, val_indices}; + } + llvm::FunctionType *fn_type = FunctionType::get(vt_natural, llvm_arg_types, false); + FunctionCallee fn = module->getOrInsertFunction(instr, fn_type); + + Value *v = builder->CreateCall(fn, llvm_arg_vals); + return v; +} + +Value *CodeGen_ARM::codegen_shuffle_indices(int bits, const std::vector &indices) { + const int lanes = indices.size(); + llvm::Type *index_type = IntegerType::get(module->getContext(), bits); + llvm::Type *index_vec_type = get_vector_type(index_type, lanes); + + std::vector llvm_indices(lanes); + for (int i = 0; i < lanes; i++) { + int idx = indices[i]; + llvm_indices[i] = idx >= 0 ? ConstantInt::get(index_type, idx) : UndefValue::get(index_type); + } + + Value *v = ConstantVector::get(llvm_indices); + v = builder->CreateInsertVector(index_vec_type, UndefValue::get(index_vec_type), + v, ConstantInt::get(i64_t, 0)); + return v; +} + +Value *CodeGen_ARM::codegen_whilelt(int total_lanes, int start, int end) { + // Generates SVE "whilelt" instruction which represents vector predicate pattern of + // e.g. 
+    Value *v = ConstantVector::get(llvm_indices);
+    v = builder->CreateInsertVector(index_vec_type, UndefValue::get(index_vec_type),
+                                    v, ConstantInt::get(i64_t, 0));
+    return v;
+}
+
+Value *CodeGen_ARM::codegen_whilelt(int total_lanes, int start, int end) {
+    // Generates the SVE "whilelt" instruction, which represents a vector predicate pattern,
+    // e.g. 11100000 (total_lanes = 8, start = 0, end = 3)
+    //      -> @llvm.aarch64.sve.whilelt.nxv8i1.i32(i32 0, i32 3)
+    internal_assert(target_vscale() > 0);
+    internal_assert(total_lanes % target_vscale() == 0);
+    std::string instr = concat_strings("llvm.aarch64.sve.whilelt.nxv", total_lanes / target_vscale(), "i1.i32");
+
+    llvm::Type *pred_type = get_vector_type(llvm_type_of(Int(1)), total_lanes);
+    llvm::FunctionType *fn_type = FunctionType::get(pred_type, {i32_t, i32_t}, false);
+    FunctionCallee fn = module->getOrInsertFunction(instr, fn_type);
+
+    value = builder->CreateCall(fn, {ConstantInt::get(i32_t, start), ConstantInt::get(i32_t, end)});
+    return value;
+}
 
 void CodeGen_ARM::visit(const Ramp *op) {
@@ -2425,6 +2756,11 @@ Type CodeGen_ARM::upgrade_type_for_storage(const Type &t) const {
     return CodeGen_Posix::upgrade_type_for_storage(t);
 }
 
+int CodeGen_ARM::natural_vector_size(const Halide::Type &t) const {
+    internal_assert(t.bits() > 1) << "natural_vector_size requested for a 1-bit type\n";
+    return native_vector_bits() / t.bits();
+}
+
 string CodeGen_ARM::mcpu_target() const {
     if (target.bits == 32) {
         if (target.has_feature(Target::ARMv7s)) {
diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp
index 07d8cbb31a08..98837d27b0d4 100644
--- a/src/CodeGen_LLVM.cpp
+++ b/src/CodeGen_LLVM.cpp
@@ -4155,7 +4155,9 @@ void CodeGen_LLVM::visit(const Shuffle *op) {
             } else {
                 internal_assert(op->indices[0] == 0);
             }
-            value = create_broadcast(value, op->indices.size());
+            if (op->indices.size() > 1) {
+                value = create_broadcast(value, op->indices.size());
+            }
             return;
         }
     }
@@ -5445,6 +5447,10 @@ int CodeGen_LLVM::get_vector_num_elements(const llvm::Type *t) {
     }
 }
 
+int CodeGen_LLVM::get_vector_num_elements(const llvm::Value *v) {
+    return get_vector_num_elements(v->getType());
+}
+
 llvm::Type *CodeGen_LLVM::llvm_type_of(LLVMContext *c, Halide::Type t,
                                        int effective_vscale) const {
     if (t.lanes() == 1) {
@@ -5481,23 +5487,7 @@ llvm::Type *CodeGen_LLVM::get_vector_type(llvm::Type *t, int n,
     switch (type_constraint) {
     case VectorTypeConstraint::None:
         if (effective_vscale > 0) {
-            bool wide_enough = true;
-            // TODO(https://github.com/halide/Halide/issues/8119): Architecture
-            // specific code should not go here. Ideally part of this can go
-            // away via LLVM fixes and modifying intrinsic selection to handle
-            // scalable vs. fixed vectors. Making this method virtual is
-            // possibly expensive.
-            if (target.arch == Target::ARM) {
-                if (!target.has_feature(Target::NoNEON)) {
-                    // force booleans into bytes. TODO(https://github.com/halide/Halide/issues/8119): figure out a better way to do this.
-                    int bit_size = std::max((int)t->getScalarSizeInBits(), 8);
-                    wide_enough = (bit_size * n) > 128;
-                } else {
-                    // TODO(https://github.com/halide/Halide/issues/8119): AArch64 SVE2 support is crashy with scalable vectors of min size 1.
-                    wide_enough = (n / effective_vscale) > 1;
-                }
-            }
-            scalable = wide_enough && ((n % effective_vscale) == 0);
+            scalable = (n % effective_vscale) == 0;
             if (scalable) {
                 n = n / effective_vscale;
             }
diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h
index bdd267020f1a..240114977f82 100644
--- a/src/CodeGen_LLVM.h
+++ b/src/CodeGen_LLVM.h
@@ -605,7 +605,10 @@ class CodeGen_LLVM : public IRVisitor {
                                  const std::function &fn);
 
     /** Get number of vector elements, taking into account scalable vectors. Returns 1 for scalars. */
+    // @{
     int get_vector_num_elements(const llvm::Type *t);
+    int get_vector_num_elements(const llvm::Value *v);
+    // @}
 
     /** Interface to abstract vector code generation as LLVM is now
      * providing multiple options to express even simple vector
diff --git a/src/DecomposeVectorShuffle.cpp b/src/DecomposeVectorShuffle.cpp
new file mode 100644
index 000000000000..8b0fb0fa05cf
--- /dev/null
+++ b/src/DecomposeVectorShuffle.cpp
@@ -0,0 +1,80 @@
+#include "DecomposeVectorShuffle.h"
+
+#include <unordered_map>
+
+namespace Halide::Internal {
+
+std::vector<std::vector<NativeShuffle>> decompose_to_native_shuffles(
+    int src_lanes, const std::vector<int> &indices, int vl) {
+
+    int dst_lanes = static_cast<int>(indices.size());
+    int src_lanes_aligned = align_up(src_lanes, vl);
+
+    // Adjust the indices so that the src vectors are aligned up to a multiple of vl.
+    std::vector<int> aligned_indices = indices;
+    for (int &idx : aligned_indices) {
+        if (idx >= src_lanes) {
+            idx += src_lanes_aligned - src_lanes;
+        }
+    }
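+    // e.g. with src_lanes = 17 and vl = 4, src_lanes_aligned is 20, so an index
+    // of 18 (lane 1 of the second source) is shifted to 21, which falls in
+    // slice 5, lane 1 -- the first slice of the aligned 'b' vector
+    // (illustrative values).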
+
+    const int num_dst_slices = align_up(dst_lanes, vl) / vl;
+    std::vector<std::vector<NativeShuffle>> all_steps(num_dst_slices);
+
+    for (int dst_slice = 0; dst_slice < num_dst_slices; dst_slice++) {
+        std::unordered_map<int, int> slice_to_step;
+        auto &steps = all_steps[dst_slice];
+        const int dst_start = dst_slice * vl;
+
+        for (int dst_index = dst_start; dst_index < dst_start + vl && dst_index < dst_lanes; ++dst_index) {
+            const int src_index = aligned_indices[dst_index];
+            if (src_index < 0) {
+                continue;
+            }
+
+            const int src_slice = src_index / vl;
+            const int lane_in_src_slice = src_index % vl;
+            const int lane_in_dst_slice = dst_index - dst_start;
+
+            if (steps.empty()) {
+                // The first slice in this block
+                slice_to_step[src_slice] = 0;
+                steps.emplace_back(vl, src_slice, SliceIndexNone);
+                steps.back().lane_map[lane_in_dst_slice] = lane_in_src_slice;
+
+            } else if (auto itr = slice_to_step.find(src_slice); itr != slice_to_step.end()) {
+                // A slice we have already seen
+                NativeShuffle &step = steps[itr->second];
+                bool is_a = (step.slice_a != SliceIndexCarryPrevResult && step.slice_a == src_slice);
+                int offset = is_a ? 0 : vl;
+                step.lane_map[lane_in_dst_slice] = lane_in_src_slice + offset;
+
+            } else if (steps[0].slice_b == SliceIndexNone) {
+                // Add as 'b' of the first step if b is unused
+                slice_to_step[src_slice] = 0;
+                steps[0].slice_b = src_slice;
+                steps[0].lane_map[lane_in_dst_slice] = lane_in_src_slice + vl;
+
+            } else {
+                // Otherwise chain a new step
+                slice_to_step[src_slice] = static_cast<int>(steps.size());
+                // The new step uses the previous result as 'a', so we use 'b' for this slice
+                steps.emplace_back(vl, SliceIndexCarryPrevResult, src_slice);
+
+                // Except for the first step, we need to arrange the indices
+                // so that the output carried from the previous step is kept.
+                auto &lane_map = steps.back().lane_map;
+                // Initialize lane_map as an identity mapping
+                for (size_t lane_idx = 0; lane_idx < lane_map.size(); ++lane_idx) {
+                    lane_map[lane_idx] = lane_idx;
+                }
+                // Update for this index
+                lane_map[lane_in_dst_slice] = lane_in_src_slice + vl;
+            }
+        }
+    }
+
+    return all_steps;
+}
+
+}  // namespace Halide::Internal
diff --git a/src/DecomposeVectorShuffle.h b/src/DecomposeVectorShuffle.h
new file mode 100644
index 000000000000..e3a60e3cd4fa
--- /dev/null
+++ b/src/DecomposeVectorShuffle.h
@@ -0,0 +1,163 @@
+#ifndef HALIDE_DECOMPOSE_VECTOR_SHUFFLE_H
+#define HALIDE_DECOMPOSE_VECTOR_SHUFFLE_H
+
+/** \file
+ *
+ * Perform a vector shuffle by decomposing the operation into a sequence of
+ * sub-shuffle steps, where each step:
+ * - Takes one or two slices as input (slice_a and slice_b)
+ * - Produces one slice (the dst slice)
+ * - Operates on slices whose length equals the target's native vector length (vl)
+ *
+ * The sequence of steps is structured as:
+ * 1. An outer loop that iterates over the slices of the dst vector.
+ * 2. An inner loop that iterates over the native shuffle steps needed to
+ *    complete a single dst slice. Multiple steps may be required because a
+ *    single native shuffle can read from at most 2 slices (native vector
+ *    length x 2), while we may need to fetch from a wider region of the src vector.
+ *
+ * The following example (a log from the test code) illustrates how it works:
+ *
+ * src_lanes: 17, dst_lanes: 7, vl: 4
+ * input a: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, ]
+ * input b: [170, 180, 190, 200, 210, 220, 230, 240, 250, 260, 270, 280, 290, 300, 310, 320, 330, ]
+ * indices: [6, 13, 24, 14, 7, 11, 5, ]
+ *
+ * slice a:[40, 50, 60, 70, ], slice b:[120, 130, 140, 150, ], indices:[2, 5, -1, 6, ]
+ *     => slice output:[60, 130, -559038801, 140, ]
+ * slice a:[60, 130, -559038801, 140, ], slice b:[210, 220, 230, 240, ], indices:[0, 1, 7, 3, ]
+ *     => slice output:[60, 130, 240, 140, ]
+ * slice a:[40, 50, 60, 70, ], slice b:[80, 90, 100, 110, ], indices:[3, 7, 1, -1, ]
+ *     => slice output:[70, 110, 50, -559038801, ]
+ *
+ * output: [60, 130, 240, 140, 70, 110, 50, ]
+ */
+
+#include "Error.h"
+#include "Util.h"
+
+#include <optional>
+#include <type_traits>
+#include <vector>
+
+namespace Halide {
+namespace Internal {
+
+/** Enum to represent the special cases of a slice index */
+enum {
+    SliceIndexNone = -1,
+    SliceIndexCarryPrevResult = -2,
+};
+
+struct NativeShuffle {
+    int slice_a;
+    int slice_b;
+    std::vector<int> lane_map;
+
+    NativeShuffle(int vl, int a, int b)
+        : slice_a(a), slice_b(b) {
+        lane_map.resize(vl, SliceIndexNone);
+    }
+};
+
+std::vector<std::vector<NativeShuffle>> decompose_to_native_shuffles(
+    int src_lanes, const std::vector<int> &indices, int vl);
+
+/** Algorithm logic for shuffle decomposition, parameterized on the vector type
+ * and a codegen-like class that provides primitive vector operations.
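+ *
+ * A minimal usage sketch (the template arguments are spelled out here for
+ * illustration; the in-tree call sites rely on class template argument
+ * deduction, and STLShuffleCodeGen is the pure-STL model used by
+ * test/correctness/decompose_vector_shuffle.cpp):
+ *
+ *     STLShuffleCodeGen ops;
+ *     DecomposeVectorShuffle<STLShuffleCodeGen, std::vector<int>> shuffler(ops, a, b, src_lanes, vl);
+ *     std::vector<int> result = shuffler.run(indices);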
+ */
+template<typename CodeGenTy, typename VecTy>
+struct DecomposeVectorShuffle {
+    // TODO: when upgrading to C++20, replace with a concept.
+    // get_vector_num_elements may be overloaded (e.g. on Type* and Value*), so use
+    // expression SFINAE rather than a method pointer to handle overload resolution.
+    static_assert(std::is_convertible_v<decltype(std::declval<CodeGenTy>().get_vector_num_elements(std::declval<VecTy>())), int>,
+                  "CodeGenTy must provide: int get_vector_num_elements(VecTy)");
+    static_assert(std::is_invocable_r_v<VecTy, decltype(&CodeGenTy::slice_vector), CodeGenTy &, const VecTy &, int, int>,
+                  "CodeGenTy must provide: VecTy slice_vector(const VecTy &, int, int)");
+    static_assert(std::is_invocable_r_v<VecTy, decltype(&CodeGenTy::concat_vectors), CodeGenTy &, const std::vector<VecTy> &>,
+                  "CodeGenTy must provide: VecTy concat_vectors(const std::vector<VecTy> &)");
+    static_assert(std::is_invocable_r_v<VecTy, decltype(&CodeGenTy::shuffle_scalable_vectors_general), CodeGenTy &, const VecTy &, const VecTy &, const std::vector<int> &>,
+                  "CodeGenTy must provide: VecTy shuffle_scalable_vectors_general(const VecTy &, const VecTy &, const std::vector<int> &)");
+    static_assert(std::is_invocable_r_v<VecTy, decltype(&CodeGenTy::create_undef_vector_like), CodeGenTy &, const VecTy &, int>,
+                  "CodeGenTy must provide: VecTy create_undef_vector_like(const VecTy &, int)");
+
+    DecomposeVectorShuffle(CodeGenTy &codegen, const VecTy &src_a, const VecTy &src_b, int src_lanes, int vl)
+        : codegen(codegen),
+          vl(vl),
+          src_a(align_up_vector(src_a, vl)),
+          src_b(align_up_vector(src_b, vl)),
+          src_lanes(src_lanes),
+          src_lanes_aligned(align_up(src_lanes, vl)) {
+    }
+
+    VecTy run(const std::vector<int> &indices) {
+        auto shuffle_plan = decompose_to_native_shuffles(src_lanes, indices, vl);
+        int dst_lanes = static_cast<int>(indices.size());
+
+        // Process each block divided by vl
+        std::vector<VecTy> shuffled_dst_slices;
+        shuffled_dst_slices.reserve(shuffle_plan.size());
+
+        for (const auto &steps_for_dst_slice : shuffle_plan) {
+            std::optional<VecTy> dst_slice = std::nullopt;
+            for (const auto &step : steps_for_dst_slice) {
+                // Obtain the 1st slice, a
+                VecTy a;
+                if (step.slice_a == SliceIndexCarryPrevResult) {
+                    internal_assert(dst_slice.has_value()) << "Tried to carry from an undefined previous result";
+                    a = *dst_slice;
+                } else {
+                    a = get_vl_slice(step.slice_a);
+                }
+                // Obtain the 2nd slice, b
+                std::optional<VecTy> b;
+                if (step.slice_b == SliceIndexNone) {
+                    b = std::nullopt;
+                } else {
+                    b = std::optional<VecTy>(get_vl_slice(step.slice_b));
+                }
+                // Perform the shuffle, for which the vector length is aligned
+                dst_slice = codegen.shuffle_scalable_vectors_general(a, b.value_or(VecTy{}), step.lane_map);
+            }
+            if (!dst_slice.has_value()) {
+                // There was no shuffle step for this slice, i.e. all the indices are -1
+                dst_slice = codegen.create_undef_vector_like(src_a, vl);
+            }
+            shuffled_dst_slices.push_back(*dst_slice);
+        }
+
+        return codegen.slice_vector(codegen.concat_vectors(shuffled_dst_slices), 0, dst_lanes);
+    }
+
+private:
+    // Helper to extract a slice with lanes == vl
+    VecTy get_vl_slice(int slice_index) {
+        const int num_slices_a = src_lanes_aligned / vl;
+        int start_index = slice_index * vl;
+        if (slice_index < num_slices_a) {
+            return codegen.slice_vector(src_a, start_index, vl);
+        } else {
+            start_index -= src_lanes_aligned;
+            return codegen.slice_vector(src_b, start_index, vl);
+        }
+    }
+
+    VecTy align_up_vector(const VecTy &v, int align) {
+        int len = codegen.get_vector_num_elements(v);
+        return codegen.slice_vector(v, 0, align_up(len, align));
+    }
+
+    CodeGenTy &codegen;
+    int vl;
+    VecTy src_a;
+    VecTy src_b;
+    int src_lanes;
+    int src_lanes_aligned;
+};
+
+}  // namespace Internal
+}  // namespace Halide
+
+#endif
diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt
index e1c44561be00..a25b077a1abb 100644
--- a/test/correctness/CMakeLists.txt
+++ b/test/correctness/CMakeLists.txt
@@ -80,6 +80,7 @@ tests(GROUPS correctness
       debug_to_file.cpp
       debug_to_file_multiple_outputs.cpp
       debug_to_file_reorder.cpp
+      decompose_vector_shuffle.cpp
       deferred_loop_level.cpp
       deinterleave4.cpp
       device_buffer_copies_with_profile.cpp
diff --git a/test/correctness/decompose_vector_shuffle.cpp b/test/correctness/decompose_vector_shuffle.cpp
new file mode 100644
index 000000000000..81b958e04ba6
--- /dev/null
+++ b/test/correctness/decompose_vector_shuffle.cpp
@@ -0,0 +1,155 @@
+#include <Halide.h>
+
+#include <algorithm>
+#include <optional>
+#include <random>
+#include <string>
+#include <vector>
+
+using namespace Halide;
+using namespace Halide::Internal;
+
+using std::optional;
+using std::vector;
+
+namespace {
+
+constexpr int UNDEF_VALUE = 0xdeadbeef;
+
+vector<int> shuffle_without_divided(const vector<int> &a, const vector<int> &b, const vector<int> &indices) {
+    int src_lanes = static_cast<int>(a.size());
+    vector<int> dst(indices.size(), 0x1234abcd);
+    for (size_t i = 0; i < indices.size(); ++i) {
+        int idx = indices[i];
+        if (idx < 0) {
+            dst[i] = UNDEF_VALUE;
+        } else if (idx < src_lanes) {
+            dst[i] = a[idx];
+        } else {
+            int idx_b = idx - src_lanes;
+            internal_assert(idx_b < static_cast<int>(b.size()));
+            dst[i] = b[idx_b];
+        }
+    }
+    return dst;
+}
+
+struct STLShuffleCodeGen {
+    int get_vector_num_elements(const vector<int> &v) {
+        return static_cast<int>(v.size());
+    }
+
+    vector<int> slice_vector(const vector<int> &v, int start, int lanes) {
+        auto result = vector<int>(v.begin() + start, v.begin() + std::min(start + lanes, static_cast<int>(v.size())));
+        result.resize(lanes);
+        return result;
+    }
+
+    vector<int> concat_vectors(const vector<vector<int>> &vecs) {
+        vector<int> out;
+        for (const auto &v : vecs) {
+            out.insert(out.end(), v.begin(), v.end());
+        }
+        return out;
+    }
+
+    vector<int> shuffle_scalable_vectors_general(const vector<int> &a, const vector<int> &b, const vector<int> &indices) {
+        internal_assert(a.size() == indices.size());
+
+        auto result = shuffle_without_divided(a, b, indices);
+
+        debug(1) << "slice a: " << PrintSpan{a} << ", "
+                 << "slice b: " << PrintSpan{b} << ", "
+                 << "indices: " << PrintSpan{indices} << "\n"
+                 << "\t=> slice output: " << PrintSpan{result} << "\n";
+
+        return result;
+    }
+
+    vector<int> create_undef_vector_like(const vector<int> &ref, int lanes) {
+        return vector<int>(lanes, UNDEF_VALUE);
+    }
+};
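+
+// Note: STLShuffleCodeGen is a pure-STL stand-in for CodeGen_ARM. It provides
+// exactly the five primitives checked by the static_asserts in
+// DecomposeVectorShuffle.h, so the decomposition logic can be unit-tested
+// without LLVM in the loop.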
+
+void generate_data(int src_lanes, int dst_lanes,
+                   vector<int> &a, vector<int> &b, vector<int> &indices) {
+    // The input vector values are fixed, for readability.
+    // The index values are random, in the range [-1, src_lanes*2).
+    a.resize(src_lanes);
+    b.resize(src_lanes);
+    for (int i = 0; i < src_lanes; ++i) {
+        a[i] = i * 10;
+        b[i] = (i + src_lanes) * 10;
+    }
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_int_distribution<int> dist(-1, src_lanes * 2 - 1);
+    indices.resize(dst_lanes);
+    for (int i = 0; i < dst_lanes; ++i) {
+        indices[i] = dist(gen);
+    }
+
+    debug(1) << "input a: " << PrintSpan{a} << "\n"
+             << "input b: " << PrintSpan{b} << "\n"
+             << "indices: " << PrintSpan{indices} << "\n\n";
+}
+
+void assert_vectors_equal(const vector<int> &expected, const vector<int> &actual) {
+    internal_assert(expected.size() == actual.size())
+        << "Vector sizes are different\n"
+        << "expected: " << PrintSpan{expected} << "\n"
+        << "  actual: " << PrintSpan{actual} << "\n";
+
+    for (size_t i = 0; i < expected.size(); ++i) {
+        internal_assert(expected[i] == actual[i])
+            << "Mismatch: expected[" << i << "] = " << expected[i] << ", actual[" << i << "] = " << actual[i] << "\n"
+            << "expected: " << PrintSpan{expected} << "\n"
+            << "  actual: " << PrintSpan{actual} << "\n";
+    }
+}
+
+void run_single_test(int src_lanes, int dst_lanes, int vl) {
+    vector<int> a, b, indices;
+    generate_data(src_lanes, dst_lanes, a, b, indices);
+
+    auto expected = shuffle_without_divided(a, b, indices);
+
+    STLShuffleCodeGen ops;
+    DecomposeVectorShuffle shuffler(ops, a, b, src_lanes, vl);
+    auto actual = shuffler.run(indices);
+
+    assert_vectors_equal(expected, actual);
+}
+
+void run_test(int src_lanes, int dst_lanes, int vl, int repeat) {
+    debug(2) << "Running " << repeat << " tests for\n"
+             << "  src_lanes: " << src_lanes
+             << ", dst_lanes: " << dst_lanes
+             << ", vl: " << vl << "\n";
+
+    for (int t = 0; t < repeat; ++t) {
+        run_single_test(src_lanes, dst_lanes, vl);
+    }
+}
+
+}  // namespace
+
+int main(int argc, char *argv[]) {
+    int repeat = 100;
+
+    if (argc >= 3) {
+        int src_lanes = std::stoi(argv[1]);
+        int dst_lanes = std::stoi(argv[2]);
+        int vl = (argc >= 4) ? std::stoi(argv[3]) : 4;
+        repeat = (argc >= 5) ? std::stoi(argv[4]) : repeat;
+        internal_assert(popcount64(vl) == 1 && vl > 1) << "vl must be a power of 2 greater than 1";
+        run_test(src_lanes, dst_lanes, vl, repeat);
+    } else {
+        run_test(8, 8, 4, repeat);
+        run_test(19, 9, 4, repeat);
+        run_test(5, 3, 8, repeat);
+    }
+
+    printf("Success!\n");
+    return 0;
+}
diff --git a/test/correctness/interleave.cpp b/test/correctness/interleave.cpp
index 0d758428e2cd..cbee263f5487 100644
--- a/test/correctness/interleave.cpp
+++ b/test/correctness/interleave.cpp
@@ -74,6 +74,17 @@ Expr element(FuncRef f, int i) {
 int main(int argc, char **argv) {
     Var x, y, c;
 
+    // The SVE2 backend hits the LLVM issue below, which was fixed in LLVM 22.
+    // "LLVM ERROR: Don't know how to widen the operands for INSERT_SUBVECTOR"
+    // https://github.com/llvm/llvm-project/issues/160134
+    // https://github.com/llvm/llvm-project/issues/169300
+    if (Internal::get_llvm_version() < 220 &&
+        get_jit_target_from_environment().has_feature(Target::SVE2)) {
+        printf("[SKIP] LLVM %d has known SVE backend bugs for this test.\n",
+               Internal::get_llvm_version());
+        return 0;
+    }
+
     // TODO: Is this still true?
     // As of May 26 2016, this test causes a segfault due to
     // permissions failure on ARM-32 trying to execute a
diff --git a/test/correctness/predicated_store_load_single_lane.cpp b/test/correctness/predicated_store_load_single_lane.cpp
index 3e1f3b3b4ca0..64fa8cf86713 100644
--- a/test/correctness/predicated_store_load_single_lane.cpp
+++ b/test/correctness/predicated_store_load_single_lane.cpp
@@ -3,6 +3,16 @@
 using namespace Halide;
 
 int main(int argc, char **argv) {
+    // The SVE2 backend hits the LLVM issue below, which was fixed in LLVM 22.
+    // "LLVM ERROR: Unable to widen vector store"
+    // https://github.com/llvm/llvm-project/issues/54424
+    if (Internal::get_llvm_version() < 220 &&
+        get_jit_target_from_environment().has_feature(Target::SVE2)) {
+        printf("[SKIP] LLVM %d has known SVE backend bugs for this test.\n",
+               Internal::get_llvm_version());
+        return 0;
+    }
+
     // This test exercises predicated vector loads and stores with a single
     // lane. These require special handling because Halide's IR does not
     // distinguish between scalars and single-element vectors, while LLVM
diff --git a/test/correctness/simd_op_check_sve2.cpp b/test/correctness/simd_op_check_sve2.cpp
index f0183412323a..467c7a5f7794 100644
--- a/test/correctness/simd_op_check_sve2.cpp
+++ b/test/correctness/simd_op_check_sve2.cpp
@@ -21,37 +21,35 @@ using CastFuncTy = function<Expr(Expr)>;
 
 class SimdOpCheckArmSve : public SimdOpCheckTest {
 public:
-    SimdOpCheckArmSve(Target t, int w = 384, int h = 32)
+    SimdOpCheckArmSve(Target t, int w = 512, int h = 16)
         : SimdOpCheckTest(t, w, h),
           debug_mode(Internal::get_env_variable("HL_DEBUG_SIMDOPCHECK")) {
 
         // Determine and hold can_run_the_code
-        // TODO: Since features of Arm CPU cannot be obtained automatically from get_host_target(),
-        // it is necessary to set some feature (e.g. "arm_fp16") explicitly to HL_JIT_TARGET.
-        // Halide throws error if there is unacceptable mismatch between jit_target and host_target.
-
         Target host = get_host_target();
         Target jit_target = get_jit_target_from_environment();
         cout << "host is: " << host.to_string() << endl;
         cout << "HL_TARGET is: " << target.to_string() << endl;
         cout << "HL_JIT_TARGET is: " << jit_target.to_string() << endl;
-        auto is_same_triple = [](const Target &t1, const Target &t2) -> bool {
-            return t1.arch == t2.arch && t1.bits == t2.bits && t1.os == t2.os && t1.vector_bits == t2.vector_bits;
+        auto is_runtime_compatible = [](const Target &t1, const Target &t2) -> bool {
+            bool yes = true;
+            yes &= (t1.arch == t2.arch && t1.bits == t2.bits && t1.os == t2.os);
+            yes &= (t1.vector_bits == t2.vector_bits);
+
+            // A bunch of feature flags also need to match between the
+            // compiled code and the host in order to run the code.
+            for (Target::Feature f : {Target::SVE2}) {
+                yes &= (t1.has_feature(f) == t2.has_feature(f));
+            }
+            return yes;
         };
-        can_run_the_code = is_same_triple(host, target) && is_same_triple(jit_target, target);
+        can_run_the_code = is_runtime_compatible(host, target) && is_runtime_compatible(jit_target, target);
 
-        // A bunch of feature flags also need to match between the
-        // compiled code and the host in order to run the code.
-        for (Target::Feature f : {Target::ARMv7s, Target::ARMFp16, Target::NoNEON, Target::SVE2}) {
-            if (target.has_feature(f) != jit_target.has_feature(f)) {
-                can_run_the_code = false;
-            }
-        }
         if (!can_run_the_code) {
-            cout << "[WARN] To perform verification of realization, "
-                 << R"(the target triple "arm--" and key feature "arm_fp16")"
-                 << " must be the same between HL_TARGET and HL_JIT_TARGET" << endl;
+            debug(0) << "[WARN] To perform verification of realization, "
+                     << R"(the target triple "arm--", vector_bits, and feature "sve2")"
+                     << " must be the same between HL_TARGET and HL_JIT_TARGET" << endl;
         }
     }
 
@@ -563,13 +561,20 @@ class SimdOpCheckArmSve : public SimdOpCheckTest {
                 continue;
             }
 
-            vector<int> total_bits_params = {256};  // {64, 128, 192, 256};
+            std::vector<int> simd_bit_widths;
+            if (has_sve()) {
+                simd_bit_widths.push_back(target.vector_bits);
+            } else if (has_neon()) {
+                simd_bit_widths.push_back(64);
+                simd_bit_widths.push_back(128);
+            }
+
             if (bits != 64) {
                 // Add the scalar case to verify the float16 native operation
-                total_bits_params.push_back(bits);
+                simd_bit_widths.push_back(bits);
             }
 
-            for (auto total_bits : total_bits_params) {
+            for (auto &total_bits : simd_bit_widths) {
                 const int vf = total_bits / bits;
                 const bool is_vector = vf > 1;
@@ -720,6 +725,20 @@ class SimdOpCheckArmSve : public SimdOpCheckTest {
                 }
             }
 
+            // TBL - Structured load with stride=5
+            {
+                constexpr int stride = 5;
+                const int vector_lanes = base_vec_bits * 4 / bits;
+
+                AddTestFunctor add(*this, bits, vector_lanes);
+
+                Expr load_n = in_im(x * stride) + in_im(x * stride + stride - 1);
+
+                if (has_sve()) {
+                    add("tbl", load_n);
+                }
+            }
+
             // ST2 - Store two-element structures
             for (int factor : {1, 2}) {
                 const int width = base_vec_bits * 2 * factor;
@@ -829,7 +848,8 @@ class SimdOpCheckArmSve : public SimdOpCheckTest {
 
         // SVE Gather/Scatter
        if (has_sve()) {
-            for (int width = 64; width <= 64 * 4; width *= 2) {
+            for (float factor : {0.5f, 1.f, 2.f}) {
+                const int width = base_vec_bits * factor;
                 const int total_lanes = width / bits;
                 const int instr_lanes = min(total_lanes, 128 / bits);
                 if (instr_lanes < 2 || (total_lanes / vscale < 2)) continue;  // bail out on scalar and sub-native cases
@@ -846,6 +866,14 @@ class SimdOpCheckArmSve : public SimdOpCheckTest {
                 const int index_bits = std::max(32, bits);
                 add({get_sve_ls_instr("ld1", bits, index_bits, "uxtw")}, total_lanes, gather);
                 add({get_sve_ls_instr("st1", bits, index_bits, "uxtw")}, total_lanes, scatter);
+
+                // When the lane count is shorter than the native width, the predicate
+                // pattern is generated by the "whilelt" intrinsic, e.g.
+                // @llvm.aarch64.sve.whilelt.nxv8i1.i32(i32 0, i32 4)
+                if (factor == 0.5f) {
+                    string constraint("vl" + to_string(total_lanes));
+                    add("whilelt", {get_ptrue_instr_with_constraint(bits, constraint)}, total_lanes, scatter);
+                }
             }
         }
     }
@@ -871,9 +899,12 @@ class SimdOpCheckArmSve : public SimdOpCheckTest {
             {64, in_i64, in_u64, i64, i64, u64, u64},
         };
 
+        const int base_vec_bits = has_sve() ? target.vector_bits : 128;
+        const int vscale = base_vec_bits / 128;
+
         for (const auto &[bits, in_i, in_u, widen_i, widenx4_i, widen_u, widenx4_u] : test_params) {
 
-            for (auto &total_bits : {64, 128}) {
+            for (auto &total_bits : {base_vec_bits / 2, base_vec_bits}) {
                 const int vf = total_bits / bits;
                 const int instr_lanes = Instruction::get_force_vectorized_instr_lanes(bits, vf, target);
                 AddTestFunctor add(*this, bits, instr_lanes, vf, !(is_arm32() && bits == 64));  // 64 bit is unavailable in 32-bit NEON
@@ -945,11 +976,13 @@ class SimdOpCheckArmSve : public SimdOpCheckTest {
 
                 // UDOT/SDOT
                 if (is_arm_dot_prod_available) {
-                    const int factor_32bit = vf / 4;
+                    const int factor_reduced = vf / 4;
+                    if (factor_reduced / vscale < 2) continue;  // bail out on scalar and sub-native cases
+
                     for (int f : {4, 8}) {
                         // checks the vector register for the narrow src data type (i.e. 8 or 16 bit)
-                        const int lanes_src = Instruction::get_instr_lanes(bits, f * factor_32bit, target);
-                        AddTestFunctor add_dot(*this, bits, lanes_src, factor_32bit);
+                        const int lanes_src = Instruction::get_instr_lanes(bits, f * factor_reduced, target);
+                        AddTestFunctor add_dot(*this, bits, lanes_src, factor_reduced);
                         RDom r(0, f);
 
                         add_dot("udot", sum(widenx4_u(in_u(f * x + r)) * in_u(f * x + r + 32)));
@@ -1048,13 +1081,13 @@ class SimdOpCheckArmSve : public SimdOpCheckTest {
             return opcode_pattern + R"(\s.*\b)" + operand_pattern + R"(\b.*)";
         }
 
-        // TODO Fix this for SVE2
-        static int natural_lanes(int bits) {
-            return 128 / bits;
+        static int natural_lanes(int bits, const Target &t) {
+            const int base_vector_bits = std::max(t.vector_bits, 128);
+            return base_vector_bits / bits;
         }
 
         static int get_instr_lanes(int bits, int vec_factor, const Target &target) {
-            return min(natural_lanes(bits), vec_factor);
+            return min(natural_lanes(bits, target), vec_factor);
         }
 
         static int get_force_vectorized_instr_lanes(int bits, int vec_factor, const Target &target) {
@@ -1063,10 +1096,10 @@ class SimdOpCheckArmSve : public SimdOpCheckTest {
                 if (vec_factor == 1) {
                     return 1;
                 } else {
-                    return natural_lanes(bits);
+                    return natural_lanes(bits, target);
                 }
             } else {
-                int min_lanes = std::max(2, natural_lanes(bits) / 2);  // 64-bit-wide VL
+                int min_lanes = std::max(2, natural_lanes(bits, target) / 2);  // 64-bit-wide VL
                 return max(min_lanes, get_instr_lanes(bits, vec_factor, target));
             }
         }
@@ -1075,7 +1108,7 @@ class SimdOpCheckArmSve : public SimdOpCheckTest {
             return opcode + to_string(bits.value());
         }
 
-        const char *get_bits_designator() const {
+        static const char *get_bits_designator(int bits) {
             static const map<int, const char *> designators{
                 // NOTE: vector or float only
                 {8, "b"},
                 {16, "h"},
                 {32, "s"},
                 {64, "d"},
             };
-            auto iter = designators.find(bits.value());
+            auto iter = designators.find(bits);
             assert(iter != designators.end());
             return iter->second;
         }
@@ -1092,7 +1125,7 @@ class SimdOpCheckArmSve : public SimdOpCheckTest {
             if (pattern_lanes == ANY_LANES) {
                 return R"((z\d\d?\.[bhsd])|(s\d\d?))";
             } else {
-                const char *bits_designator = get_bits_designator();
+                const char *bits_designator = get_bits_designator(bits.value());
                 // TODO(need issue): This should only match the scalar register, and likely a NEON instruction opcode.
                 // Generating a full SVE vector instruction for a scalar operation is inefficient. However this is
                 // happening and fixing it involves changing intrinsic selection. Likely to use NEON intrinsics where
             }
         }
 
         string get_reg_neon64() const {
-            const char *bits_designator = get_bits_designator();
+            const char *bits_designator = get_bits_designator(bits.value());
             if (pattern_lanes == 1) {
                 return std::string(bits_designator) + R"(\d\d?)";  // e.g. "h15"
             } else if (pattern_lanes == ANY_LANES) {
@@ -1149,6 +1182,15 @@ class SimdOpCheckArmSve : public SimdOpCheckTest {
             return get_sve_ls_instr(base_opcode, bits, bits, "");
         }
 
+        Instruction get_ptrue_instr_with_constraint(int bits, const string &constraint) {
+            // The special predicate pattern is generated by the "whilelt" intrinsic, e.g.
+            // @llvm.aarch64.sve.whilelt.nxv8i1.i32(i32 0, i32 4)
+            // LLVM compiles this to the instruction below:
+            //   ptrue p0.h, vl4
+            string operand = R"(p\d\d?\.)" + string(Instruction::get_bits_designator(bits));
+            return Instruction("ptrue", operand + R"(,\s.*\b)" + constraint);
+        }
+
         // Helper functor to add test cases
         class AddTestFunctor {
         public:
@@ -1379,5 +1421,6 @@ int main(int argc, char **argv) {
                Target("arm-64-linux-sve2-no_neon-vector_bits_128"),
                Target("arm-64-linux-sve2-no_neon-vector_bits_256"),
+               Target("arm-64-linux-sve2-no_neon-vector_bits_512"),
            });
 }
diff --git a/test/correctness/stmt_to_html.cpp b/test/correctness/stmt_to_html.cpp
index dae0cdc95527..0dad8820d0c3 100644
--- a/test/correctness/stmt_to_html.cpp
+++ b/test/correctness/stmt_to_html.cpp
@@ -6,6 +6,16 @@
 using namespace Halide;
 
 int main() {
+    // The SVE2 backend hits the LLVM issue below, which was fixed in LLVM 22.
+    // "Request for a fixed element count on a scalable object"
+    // https://github.com/llvm/llvm-project/issues/160127
+    if (Internal::get_llvm_version() < 220 &&
+        get_jit_target_from_environment().has_feature(Target::SVE2)) {
+        printf("[SKIP] LLVM %d has known SVE backend bugs for this test.\n",
+               Internal::get_llvm_version());
+        return 0;
+    }
+
     Var x, y;
 
     // The gradient function and schedule from tutorial lesson 5.
diff --git a/test/performance/boundary_conditions.cpp b/test/performance/boundary_conditions.cpp
index 04c525ee9554..0dd7c98077e8 100644
--- a/test/performance/boundary_conditions.cpp
+++ b/test/performance/boundary_conditions.cpp
@@ -91,6 +91,13 @@ int main(int argc, char **argv) {
         return 0;
     }
 
+    if (Halide::Internal::get_llvm_version() < 220 &&
+        get_jit_target_from_environment().has_feature(Target::SVE2)) {
+        printf("[SKIP] LLVM %d has known SVE backend bugs for this test.\n",
+               Halide::Internal::get_llvm_version());
+        return 0;
+    }
+
     ImageParam input(Float(32), 2);
     ImageParam padded_input(Float(32), 2);
 
diff --git a/tutorial/CMakeLists.txt b/tutorial/CMakeLists.txt
index e90ce173798a..9b911dc1c552 100644
--- a/tutorial/CMakeLists.txt
+++ b/tutorial/CMakeLists.txt
@@ -51,7 +51,20 @@ add_tutorial(lesson_01_basics.cpp)
 add_tutorial(lesson_02_input_image.cpp WITH_IMAGE_IO)
 add_tutorial(lesson_03_debugging_1.cpp)
 add_tutorial(lesson_04_debugging_2.cpp GROUPS multithreaded)
-add_tutorial(lesson_05_scheduling_1.cpp GROUPS multithreaded)
+
+if (Halide_BUILDING_IN_CI AND
+    Halide_LLVM_VERSION VERSION_LESS 22 AND
+    Halide_HOST_TARGET MATCHES "arm-64-linux")
+    # We can't reliably detect SVE2 without having first built Halide, but the
+    # SVE2 backend has the LLVM issue below, which was fixed in LLVM 22.
+    # This issue breaks lesson 5 on the GitHub Actions ARM64 runners, which have
+    # SVE2 support.
+ # "Request for a fixed element count on a scalable object" + # See: https://github.com/llvm/llvm-project/issues/160127 +else () + add_tutorial(lesson_05_scheduling_1.cpp GROUPS multithreaded) +endif () + add_tutorial(lesson_06_realizing_over_shifted_domains.cpp) add_tutorial(lesson_07_multi_stage_pipelines.cpp WITH_IMAGE_IO) add_tutorial(lesson_08_scheduling_2.cpp WITH_IMAGE_IO WITH_OPENMP GROUPS multithreaded)