diff --git a/Makefile b/Makefile
index 4e5cb36364f8..7edddd719f81 100644
--- a/Makefile
+++ b/Makefile
@@ -491,6 +491,7 @@ SOURCE_FILES = \
   Debug.cpp \
   DebugArguments.cpp \
   DebugToFile.cpp \
+  DecomposeVectorShuffle.cpp \
   Definition.cpp \
   Deinterleave.cpp \
   Derivative.cpp \
@@ -687,6 +688,7 @@ HEADER_FILES = \
   Debug.h \
   DebugArguments.h \
   DebugToFile.h \
+  DecomposeVectorShuffle.h \
   Definition.h \
   Deinterleave.h \
   Derivative.h \
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 036b92651667..63297410f2ce 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -95,6 +95,7 @@ target_sources(
     Debug.h
     DebugArguments.h
     DebugToFile.h
+    DecomposeVectorShuffle.h
     Definition.h
     Deinterleave.h
     Derivative.h
@@ -279,6 +280,7 @@ target_sources(
     Debug.cpp
     DebugArguments.cpp
    DebugToFile.cpp
+    DecomposeVectorShuffle.cpp
     Definition.cpp
     Deinterleave.cpp
     Derivative.cpp
diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp
index 7178e82965d8..30ae3249fca9 100644
--- a/src/CodeGen_ARM.cpp
+++ b/src/CodeGen_ARM.cpp
@@ -6,6 +6,7 @@
 #include "CodeGen_Posix.h"
 #include "ConciseCasts.h"
 #include "Debug.h"
+#include "DecomposeVectorShuffle.h"
 #include "DistributeShifts.h"
 #include "IREquality.h"
 #include "IRMatch.h"
@@ -209,8 +210,25 @@ class CodeGen_ARM : public CodeGen_Posix {
     void visit(const Call *) override;
     void visit(const LT *) override;
     void visit(const LE *) override;
+
+    llvm::Type *get_vector_type_from_value(llvm::Value *vec_or_scalar, int n);
+    Value *concat_vectors(const std::vector<Value *> &) override;
+    Value *slice_vector(Value *vec, int start, int extent) override;
+    Value *create_undef_vector_like(Value *ref_vec, int lanes);
+
+    /** Extract a sub-vector from a vector; every element of the sub-vector must come from the src vector.
+     * Specialized for scalable vectors. */
+    Value *extract_scalable_vector(Value *vec, int start, int extract_size);
+
+    /** Insert a vector at the "start" position of a base vector.
+     * Specialized for scalable vectors. */
+    Value *insert_scalable_vector(Value *base_vec, Value *new_vec, int start);
+
     Value *interleave_vectors(const std::vector<Value *> &) override;
     Value *shuffle_vectors(Value *a, Value *b, const std::vector<int> &indices) override;
+    Value *shuffle_scalable_vectors_general(Value *a, Value *b, const std::vector<int> &indices);
+    Value *codegen_shuffle_indices(int bits, const std::vector<int> &indices);
+    Value *codegen_whilelt(int total_lanes, int start, int end);
     void codegen_vector_reduce(const VectorReduce *, const Expr &) override;
     bool codegen_dot_product_vector_reduce(const VectorReduce *, const Expr &);
     bool codegen_pairwise_vector_reduce(const VectorReduce *, const Expr &);
@@ -231,6 +249,7 @@ class CodeGen_ARM : public CodeGen_Posix {
     };
     vector<Pattern> casts, calls, negations;
 
+    int natural_vector_size(const Halide::Type &t) const;
     string mcpu_target() const override;
     string mcpu_tune() const override;
     string mattrs() const override;
@@ -261,6 +280,8 @@ class CodeGen_ARM : public CodeGen_Posix {
             return Shuffle::make_concat({const_true(true_lanes), const_false(false_lanes)});
         }
     }
+
+    friend struct DecomposeVectorShuffle<CodeGen_ARM, llvm::Value *>;
 };
 
 CodeGen_ARM::CodeGen_ARM(const Target &target)
@@ -1901,11 +1922,224 @@ void CodeGen_ARM::visit(const Shuffle *op) {
         value = codegen_dense_vector_load(load, nullptr, /* slice_to_native */ false);
         value = CodeGen_Posix::shuffle_vectors(value, op->indices);
-    } else {
+        return;
+    }
+
+    if (target_vscale() == 0) {
         CodeGen_Posix::visit(op);
+        return;
+    }
+
+    const int total_lanes = op->type.lanes();
+    if (op->type.bits() == 1) {
+        // Peephole pattern that matches the SVE "whilelt" instruction, which represents a
+        // particular vector-predicate pattern, e.g. 11100000 (active_lanes=3, all_lanes=8)
+        if (op->is_concat() && op->vectors.size() == 2 &&
+            op->type.is_int_or_uint() &&
+            is_power_of_two(total_lanes) &&
+            total_lanes >= 2 * target_vscale() && total_lanes <= 16 * target_vscale() &&
+            is_const_one(op->vectors[0]) && is_const_zero(op->vectors[1])) {
+
+            int active_lanes = op->vectors[0].type().lanes();
+            value = codegen_whilelt(op->type.lanes(), 0, active_lanes);
+            return;
+        } else {
+            // Rewrite to process the 1-bit vector as an 8-bit vector, then cast back.
+            std::vector<Expr> vecs_i8;
+            vecs_i8.reserve(op->vectors.size());
+            for (const auto &vec_i1 : op->vectors) {
+                Type upgraded_type = vec_i1.type().with_bits(8);
+                vecs_i8.emplace_back(Cast::make(upgraded_type, vec_i1));
+            }
+            Expr equiv = Shuffle::make(vecs_i8, op->indices);
+            equiv = Cast::make(op->type, equiv);
+            equiv = common_subexpression_elimination(equiv);
+            value = codegen(equiv);
+            return;
+        }
+    } else if (op->is_concat() && op->vectors.size() == 2) {
+        // Here we deal with some specific patterns of concat(a, b).
+        // Everything else is decomposed by CodeGen_LLVM first,
+        // which in turn calls CodeGen_ARM::concat_vectors().
+
+        if (const Broadcast *bc_1 = op->vectors[1].as<Broadcast>()) {
+            // Common pattern where padding is appended to align lanes.
+            // Create a broadcast of the padding value with dst lanes, then insert vectors[0] at lane 0.
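+            // Illustrative walk-through with hypothetical values: for
+            // concat(v5, broadcast(0.f, 3)) producing 8 output lanes, the code
+            // below splats 0.f across all 8 lanes and then inserts the 5
+            // payload lanes of v5 at lane 0.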
+            Value *val_0 = codegen(op->vectors[0]);
+            Value *val_1_scalar = codegen(bc_1->value);
+            Value *padding = builder->CreateVectorSplat(llvm::ElementCount::getScalable(total_lanes / target_vscale()), val_1_scalar);
+            value = insert_scalable_vector(padding, val_0, 0);
+            return;
+        }
+    }
+
+    CodeGen_Posix::visit(op);
+}
+
+llvm::Type *CodeGen_ARM::get_vector_type_from_value(Value *vec_or_scalar, int n) {
+    llvm::Type *t = vec_or_scalar->getType();
+    llvm::Type *elt = t->isVectorTy() ? get_vector_element_type(t) : t;
+    return CodeGen_Posix::get_vector_type(elt, n);
+}
+
+Value *CodeGen_ARM::concat_vectors(const vector<Value *> &vecs) {
+    // Override only for scalable vectors, which includes
+    // the case where scalars are concatenated into a scalable vector.
+    if (target_vscale() == 0 ||
+        vecs.size() <= 1 ||
+        isa<FixedVectorType>(vecs[0]->getType())) {
+        return CodeGen_Posix::concat_vectors(vecs);
+    }
+
+    int total_lanes = 0;
+    for (auto *v : vecs) {
+        total_lanes += get_vector_num_elements(v->getType());
+    }
+
+    llvm::Type *concat_type = get_vector_type(get_vector_element_type(vecs[0]->getType()), total_lanes);
+    Value *ret = UndefValue::get(concat_type);
+    int insert_index = 0;
+    for (auto *v : vecs) {
+        ret = insert_scalable_vector(ret, v, insert_index);
+        insert_index += get_vector_num_elements(v->getType());
+    }
+    return ret;
+}
+
+Value *CodeGen_ARM::slice_vector(llvm::Value *vec, int start, int slice_size) {
+    // Override only for scalable vectors.
+    if (target_vscale() == 0 ||
+        !is_scalable_vector(vec)) {
+        return CodeGen_Posix::slice_vector(vec, start, slice_size);
+    }
+
+    const int vec_lanes = get_vector_num_elements(vec->getType());
+    if (slice_size == 1) {
+        return builder->CreateExtractElement(vec, ConstantInt::get(i64_t, start, true));
+    } else if (start == 0) {
+        if (vec_lanes == slice_size) {
+            return vec;
+        } else if (vec_lanes < slice_size) {
+            return insert_scalable_vector(UndefValue::get(get_vector_type_from_value(vec, slice_size)), vec, 0);
+        } else {
+            auto *dst_type = get_vector_type_from_value(vec, slice_size);
+            Value *val_index = ConstantInt::get(i64_t, 0, true);
+            return builder->CreateExtractVector(dst_type, vec, val_index);
+        }
+    } else {
+        const int extract_size = std::min(vec_lanes - start, slice_size);
+        Value *extracted = extract_scalable_vector(vec, start, extract_size);
+        if (slice_size == extract_size) {
+            return extracted;
+        } else {
+            Value *sliced = UndefValue::get(get_vector_type_from_value(vec, slice_size));
+            sliced = insert_scalable_vector(sliced, extracted, 0);
+            return sliced;
+        }
+    }
+}
+
+Value *CodeGen_ARM::create_undef_vector_like(Value *ref_vec, int lanes) {
+    llvm::Type *elt = get_vector_element_type(ref_vec->getType());
+    return PoisonValue::get(get_vector_type(elt, lanes));
+}
+
+Value *CodeGen_ARM::extract_scalable_vector(Value *vec, int start, int extract_size) {
+    internal_assert(target_vscale() > 0 && is_scalable_vector(vec));
+    internal_assert(start + extract_size <= get_vector_num_elements(vec->getType()));  // No overrun
+
+    if (extract_size == 1) {
+        return builder->CreateExtractElement(vec, ConstantInt::get(i64_t, start, true));
+    } else {
+        // To follow the requirement of the 'llvm.vector.extract' intrinsic that
+        // idx must be a constant multiple of the known-minimum vector length of the result type,
+        // the extraction is performed as multiple sub-extractions, where the worst case is a scalar extraction.
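+        // Worked example with hypothetical values: extracting 6 lanes starting
+        // at lane 2 decomposes into a 2-lane sub-extraction at position 2
+        // followed by a 4-lane sub-extraction at position 4, since each
+        // sub-extraction position must be a multiple of its size.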
+        std::vector<Value *> sub_slices;
+        int i = 0;
+        while (i < extract_size) {
+            int sub_extract_pos = start + i;
+            for (int sub_extract_size = extract_size - i; sub_extract_size > 0; --sub_extract_size) {
+                if (sub_extract_pos % sub_extract_size == 0) {
+                    Value *sub_extracted;
+                    if (sub_extract_size == 1) {
+                        sub_extracted = builder->CreateExtractElement(vec, sub_extract_pos);
+                    } else {
+                        // In vector operations, the index needs to be normalized by vscale.
+                        internal_assert(sub_extract_pos % target_vscale() == 0);
+                        Value *idx_val = ConstantInt::get(i64_t, sub_extract_pos / target_vscale(), true);
+                        llvm::Type *sub_extract_type = get_vector_type_from_value(vec, sub_extract_size);
+                        sub_extracted = builder->CreateExtractVector(sub_extract_type, vec, idx_val);
+                    }
+                    sub_slices.push_back(sub_extracted);
+
+                    i += sub_extract_size;
+                    break;
+                }
+            }
+        }
+        Value *extracted = concat_vectors(sub_slices);
+        return extracted;
+    }
+}
+
+Value *CodeGen_ARM::insert_scalable_vector(Value *base_vec, Value *new_vec, int start) {
+    const int base_lanes = get_vector_num_elements(base_vec->getType());
+    const int new_vec_lanes = get_vector_num_elements(new_vec->getType());
+    llvm::Type *element_type = get_vector_element_type(base_vec->getType());
+
+    internal_assert(start + new_vec_lanes <= base_lanes);
+
+    if (base_lanes == 1 && new_vec_lanes == 1) {
+        return new_vec;
+    }
+
+    internal_assert(target_vscale() > 0 && is_scalable_vector(base_vec));
+
+    if (!new_vec->getType()->isVectorTy()) {
+        return builder->CreateInsertElement(base_vec, new_vec, start);
+    } else if (start % new_vec_lanes == 0) {
+        // Most ordinary use cases fall into this pattern.
+        // In vector operations, the index needs to be normalized by vscale.
+        Value *val_start_index = ConstantInt::get(i64_t, start / target_vscale(), true);
+        return builder->CreateInsertVector(base_vec->getType(), base_vec, new_vec, val_start_index);
+    }
+
+    // To follow the requirement of the 'llvm.vector.insert' intrinsic that
+    // idx must be a constant multiple of subvec's known-minimum vector length,
+    // insertion is performed in multiple sub-slices.
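+    // Worked example with hypothetical values: inserting an 8-lane vector at
+    // start == 4 proceeds as two 4-lane sub-inserts (lanes 0..3 of new_vec go
+    // to lane 4, lanes 4..7 to lane 8), because both the extract and insert
+    // positions must be multiples of the sub-slice size.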
+    Value *ret = base_vec;
+    int extract_index = 0;
+    int insert_index = start;
+    int sub_slice_size = std::min(start, new_vec_lanes);
+
+    while (extract_index < new_vec_lanes) {
+        if (extract_index + sub_slice_size <= new_vec_lanes &&  // Condition to not overrun
+            extract_index % sub_slice_size == 0 &&              // Requirement of the LLVM intrinsic
+            insert_index % sub_slice_size == 0) {               // Requirement of the LLVM intrinsic
+
+            if (sub_slice_size == 1) {
+                Value *sub_slice = builder->CreateExtractElement(new_vec, extract_index);
+                ret = builder->CreateInsertElement(ret, sub_slice, insert_index);
+            } else {
+                // In vector operations, the index needs to be normalized by vscale.
+                internal_assert(extract_index % target_vscale() == 0);
+                internal_assert(insert_index % target_vscale() == 0);
+                Value *val_extract_index = ConstantInt::get(i64_t, extract_index / target_vscale(), true);
+                Value *val_insert_index = ConstantInt::get(i64_t, insert_index / target_vscale(), true);
+                llvm::Type *sub_sliced_type = get_vector_type(element_type, sub_slice_size);
+                Value *sub_slice = builder->CreateExtractVector(sub_sliced_type, new_vec, val_extract_index);
+                ret = builder->CreateInsertVector(base_vec->getType(), ret, sub_slice, val_insert_index);
+            }
+            insert_index += sub_slice_size;
+            extract_index += sub_slice_size;
+        } else {
+            // Move on to the next candidate size.
+            --sub_slice_size;
+        }
+    }
+    return ret;
+}
+
 Value *CodeGen_ARM::interleave_vectors(const std::vector<Value *> &vecs) {
     if (simd_intrinsics_disabled() ||
         target_vscale() == 0 ||
         vecs.size() < 2 ||
@@ -1952,56 +2186,153 @@ Value *CodeGen_ARM::shuffle_vectors(Value *a, Value *b, const std::vector<int> &indices) {
     }
 
     internal_assert(a->getType() == b->getType());
-
-    llvm::Type *elt = get_vector_element_type(a->getType());
-    const int src_lanes = get_vector_num_elements(a->getType());
+    llvm::Type *src_type = a->getType();
+    llvm::Type *elt = get_vector_element_type(src_type);
+    const int bits = elt->getScalarSizeInBits();
+    // note: lanes are multiplied by vscale
+    const int natural_lanes = natural_vector_size(Int(bits));
+    const int src_lanes = get_vector_num_elements(src_type);
     const int dst_lanes = indices.size();
 
-    // Check if deinterleaved slice
-    {
-        // Get the stride of slice
-        int slice_stride = 0;
-        const int start_index = indices[0];
-        if (dst_lanes > 1) {
-            const int stride = indices[1] - start_index;
-            bool stride_equal = true;
-            for (int i = 2; i < dst_lanes; ++i) {
-                stride_equal &= (indices[i] == start_index + i * stride);
-            }
-            slice_stride = stride_equal ? stride : 0;
-        }
-
-        // Lower slice with stride into llvm.vector.deinterleave intrinsic
+    if (src_type->isVectorTy()) {
+        // i1 -> shuffle with i8 -> i1
+        if (src_type->getScalarSizeInBits() == 1) {
+            internal_assert(src_type->isIntOrIntVectorTy()) << "a 1-bit floating-point type is unexpected\n";
+            a = builder->CreateIntCast(a, VectorType::get(i8_t, dyn_cast<VectorType>(src_type)), false);
+            b = builder->CreateIntCast(b, VectorType::get(i8_t, dyn_cast<VectorType>(src_type)), false);
+            Value *v = shuffle_vectors(a, b, indices);
+            return builder->CreateIntCast(v, VectorType::get(i1_t, dyn_cast<VectorType>(v->getType())), false);
+        }
+
+        // Check if this is a deinterleaving slice
+        {
+            // Get the stride of the slice
+            int slice_stride = 0;
+            const int start_index = indices[0];
+            if (dst_lanes > 1) {
+                const int stride = indices[1] - start_index;
+                bool stride_equal = true;
+                for (int i = 2; i < dst_lanes; ++i) {
+                    stride_equal &= (indices[i] == start_index + i * stride);
+                }
+                slice_stride = stride_equal ? stride : 0;
+            }
+
+            // Lower a strided slice into the llvm.vector.deinterleave intrinsic
 #if LLVM_VERSION >= 220
-        const std::set<int> supported_strides{2, 3, 4, 8};
+            const std::set<int> supported_strides{2, 3, 4, 8};
 #else
-        const std::set<int> supported_strides{2, 4, 8};
+            const std::set<int> supported_strides{2, 4, 8};
 #endif
-        if (supported_strides.find(slice_stride) != supported_strides.end() &&
-            dst_lanes * slice_stride == src_lanes &&
-            indices.front() < slice_stride &&  // Start position cannot be larger than stride
-            is_power_of_two(dst_lanes) &&
-            dst_lanes % target_vscale() == 0 &&
-            dst_lanes / target_vscale() > 1) {
-
-            std::string instr = concat_strings("llvm.vector.deinterleave", slice_stride, mangle_llvm_type(a->getType()));
-
-            // We cannot mix FixedVector and ScalableVector, so dst_type must be scalable
-            llvm::Type *dst_type = get_vector_type(elt, dst_lanes / target_vscale(), VectorTypeConstraint::VScale);
-            StructType *sret_type = StructType::get(*context, std::vector<llvm::Type *>(slice_stride, dst_type));
-            std::vector<llvm::Type *> arg_types{a->getType()};
-            llvm::FunctionType *fn_type = FunctionType::get(sret_type, arg_types, false);
-            FunctionCallee fn = module->getOrInsertFunction(instr, fn_type);
-
-            CallInst *deinterleave = builder->CreateCall(fn, {a});
-            // extract one element out of the returned struct
-            Value *extracted = builder->CreateExtractValue(deinterleave, indices.front());
-
-            return extracted;
+            if (supported_strides.find(slice_stride) != supported_strides.end() &&
+                dst_lanes * slice_stride == src_lanes &&
+                indices.front() < slice_stride &&  // The start position cannot be larger than the stride
+                is_power_of_two(dst_lanes) &&
+                dst_lanes % target_vscale() == 0 &&
+                dst_lanes / target_vscale() > 1) {
+
+                std::string instr = concat_strings("llvm.vector.deinterleave", slice_stride, mangle_llvm_type(a->getType()));
+
+                // We cannot mix FixedVector and ScalableVector, so dst_type must be scalable
+                llvm::Type *dst_type = get_vector_type(elt, dst_lanes / target_vscale(), VectorTypeConstraint::VScale);
+                StructType *sret_type = StructType::get(*context, std::vector<llvm::Type *>(slice_stride, dst_type));
+                std::vector<llvm::Type *> arg_types{a->getType()};
+                llvm::FunctionType *fn_type = FunctionType::get(sret_type, arg_types, false);
+                FunctionCallee fn = module->getOrInsertFunction(instr, fn_type);
+
+                CallInst *deinterleave = builder->CreateCall(fn, {a});
+                // Extract one element out of the returned struct
+                Value *extracted = builder->CreateExtractValue(deinterleave, indices.front());
+
+                return extracted;
+            }
         }
     }
 
-    return CodeGen_Posix::shuffle_vectors(a, b, indices);
+    // Perform the vector shuffle by decomposing the operation into multiple native
+    // shuffle steps; each step calls shuffle_scalable_vectors_general(), which
+    // emits a TBL/TBL2 instruction.
+    DecomposeVectorShuffle shuffler(*this, a, b, get_vector_num_elements(a->getType()), natural_lanes);
+    return shuffler.run(indices);
+}
+
+Value *CodeGen_ARM::shuffle_scalable_vectors_general(Value *a, Value *b, const std::vector<int> &indices) {
+    internal_assert(a) << "Must provide a valid vector operand";
+    internal_assert(!indices.empty()) << "Cannot shuffle with empty indices";
+
+    llvm::Type *elt = get_vector_element_type(a->getType());
+    const int bits = elt->getScalarSizeInBits();
+    const int natural_lanes = natural_vector_size(Int(bits));
+    const int src_lanes = get_vector_num_elements(a->getType());
+    const int dst_lanes = indices.size();
+    llvm::Type *dst_type = get_vector_type(elt, dst_lanes);
+
+    internal_assert(target_vscale() > 0 && is_scalable_vector(a)) << "Only deal with scalable vectors\n";
+    internal_assert(src_lanes == natural_lanes && dst_lanes == natural_lanes)
+        << "Only deal with vectors with natural_lanes\n";
+
+    // We select the TBL or TBL2 intrinsic depending on the range of the indices.
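+    // e.g. with natural_lanes == 4, indices {2, 1, 3, 0} stay within 'a' and
+    // lower to a single TBL, while {2, 5, 7, 0} also reference lanes of 'b'
+    // (index >= src_lanes) and require TBL2 (illustrative values).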
vectors\n"; + internal_assert(src_lanes == natural_lanes && dst_lanes == natural_lanes) + << "Only deal with vector with natural_lanes\n"; + + // We select TBL or TBL2 intrinsic depending on indices range + int highest_lane = *std::max_element(indices.begin(), indices.end()); + internal_assert(highest_lane >= 0) + << "highest_lane was " + << (highest_lane == SliceIndexNone ? "SliceIndexNone" : + highest_lane == SliceIndexCarryPrevResult ? "SliceIndexCarryPrevResult" : + "") + << " (" << highest_lane << ")"; + + bool use_tbl = highest_lane < src_lanes; + internal_assert(use_tbl || b) << "'b' must be valid in case of tbl2\n"; + + auto instr = concat_strings("llvm.aarch64.sve.", use_tbl ? "tbl" : "tbl2", mangle_llvm_type(dst_type)); + + Value *val_indices = codegen_shuffle_indices(bits, indices); + llvm::Type *vt_natural = get_vector_type(elt, natural_lanes); + std::vector llvm_arg_types; + std::vector llvm_arg_vals; + if (use_tbl) { + llvm_arg_types = {vt_natural, val_indices->getType()}; + llvm_arg_vals = {a, val_indices}; + } else { + llvm_arg_types = {vt_natural, vt_natural, val_indices->getType()}; + llvm_arg_vals = {a, b, val_indices}; + } + llvm::FunctionType *fn_type = FunctionType::get(vt_natural, llvm_arg_types, false); + FunctionCallee fn = module->getOrInsertFunction(instr, fn_type); + + Value *v = builder->CreateCall(fn, llvm_arg_vals); + return v; +} + +Value *CodeGen_ARM::codegen_shuffle_indices(int bits, const std::vector &indices) { + const int lanes = indices.size(); + llvm::Type *index_type = IntegerType::get(module->getContext(), bits); + llvm::Type *index_vec_type = get_vector_type(index_type, lanes); + + std::vector llvm_indices(lanes); + for (int i = 0; i < lanes; i++) { + int idx = indices[i]; + llvm_indices[i] = idx >= 0 ? ConstantInt::get(index_type, idx) : UndefValue::get(index_type); + } + + Value *v = ConstantVector::get(llvm_indices); + v = builder->CreateInsertVector(index_vec_type, UndefValue::get(index_vec_type), + v, ConstantInt::get(i64_t, 0)); + return v; +} + +Value *CodeGen_ARM::codegen_whilelt(int total_lanes, int start, int end) { + // Generates SVE "whilelt" instruction which represents vector predicate pattern of + // e.g. 
+    Value *v = ConstantVector::get(llvm_indices);
+    v = builder->CreateInsertVector(index_vec_type, UndefValue::get(index_vec_type),
+                                    v, ConstantInt::get(i64_t, 0));
+    return v;
+}
+
+Value *CodeGen_ARM::codegen_whilelt(int total_lanes, int start, int end) {
+    // Generates the SVE "whilelt" instruction, which represents a vector predicate pattern,
+    // e.g. 11100000 (total_lanes = 8, start = 0, end = 3)
+    //      -> @llvm.aarch64.sve.whilelt.nxv8i1.i32(i32 0, i32 3)
+    internal_assert(target_vscale() > 0);
+    internal_assert(total_lanes % target_vscale() == 0);
+    std::string instr = concat_strings("llvm.aarch64.sve.whilelt.nxv", total_lanes / target_vscale(), "i1.i32");
+
+    llvm::Type *pred_type = get_vector_type(llvm_type_of(Int(1)), total_lanes);
+    llvm::FunctionType *fn_type = FunctionType::get(pred_type, {i32_t, i32_t}, false);
+    FunctionCallee fn = module->getOrInsertFunction(instr, fn_type);
+
+    value = builder->CreateCall(fn, {ConstantInt::get(i32_t, start), ConstantInt::get(i32_t, end)});
+    return value;
+}
 
 void CodeGen_ARM::visit(const Ramp *op) {
@@ -2425,6 +2756,11 @@ Type CodeGen_ARM::upgrade_type_for_storage(const Type &t) const {
     return CodeGen_Posix::upgrade_type_for_storage(t);
 }
 
+int CodeGen_ARM::natural_vector_size(const Halide::Type &t) const {
+    internal_assert(t.bits() > 1) << "natural_vector_size requested for a 1-bit type\n";
+    return native_vector_bits() / t.bits();
+}
+
 string CodeGen_ARM::mcpu_target() const {
     if (target.bits == 32) {
         if (target.has_feature(Target::ARMv7s)) {
diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp
index 07d8cbb31a08..98837d27b0d4 100644
--- a/src/CodeGen_LLVM.cpp
+++ b/src/CodeGen_LLVM.cpp
@@ -4155,7 +4155,9 @@ void CodeGen_LLVM::visit(const Shuffle *op) {
             } else {
                 internal_assert(op->indices[0] == 0);
             }
-            value = create_broadcast(value, op->indices.size());
+            if (op->indices.size() > 1) {
+                value = create_broadcast(value, op->indices.size());
+            }
             return;
         }
     }
@@ -5445,6 +5447,10 @@ int CodeGen_LLVM::get_vector_num_elements(const llvm::Type *t) {
     }
 }
 
+int CodeGen_LLVM::get_vector_num_elements(const llvm::Value *v) {
+    return get_vector_num_elements(v->getType());
+}
+
 llvm::Type *CodeGen_LLVM::llvm_type_of(LLVMContext *c, Halide::Type t,
                                        int effective_vscale) const {
     if (t.lanes() == 1) {
@@ -5481,23 +5487,7 @@ llvm::Type *CodeGen_LLVM::get_vector_type(llvm::Type *t, int n,
     switch (type_constraint) {
     case VectorTypeConstraint::None:
         if (effective_vscale > 0) {
-            bool wide_enough = true;
-            // TODO(https://github.com/halide/Halide/issues/8119): Architecture
-            // specific code should not go here. Ideally part of this can go
-            // away via LLVM fixes and modifying intrinsic selection to handle
-            // scalable vs. fixed vectors. Making this method virtual is
-            // possibly expensive.
-            if (target.arch == Target::ARM) {
-                if (!target.has_feature(Target::NoNEON)) {
-                    // force booleans into bytes. TODO(https://github.com/halide/Halide/issues/8119): figure out a better way to do this.
-                    int bit_size = std::max((int)t->getScalarSizeInBits(), 8);
-                    wide_enough = (bit_size * n) > 128;
-                } else {
-                    // TODO(https://github.com/halide/Halide/issues/8119): AArch64 SVE2 support is crashy with scalable vectors of min size 1.
-                    wide_enough = (n / effective_vscale) > 1;
-                }
-            }
-            scalable = wide_enough && ((n % effective_vscale) == 0);
+            scalable = (n % effective_vscale) == 0;
             if (scalable) {
                 n = n / effective_vscale;
             }
diff --git a/src/CodeGen_LLVM.h b/src/CodeGen_LLVM.h
index bdd267020f1a..240114977f82 100644
--- a/src/CodeGen_LLVM.h
+++ b/src/CodeGen_LLVM.h
@@ -605,7 +605,10 @@ class CodeGen_LLVM : public IRVisitor {
                                  const std::function &fn);
 
     /** Get number of vector elements, taking into account scalable vectors. Returns 1 for scalars. */
+    // @{
     int get_vector_num_elements(const llvm::Type *t);
+    int get_vector_num_elements(const llvm::Value *v);
+    // @}
 
     /** Interface to abstract vector code generation as LLVM is now
      * providing multiple options to express even simple vector
diff --git a/src/DecomposeVectorShuffle.cpp b/src/DecomposeVectorShuffle.cpp
new file mode 100644
index 000000000000..8b0fb0fa05cf
--- /dev/null
+++ b/src/DecomposeVectorShuffle.cpp
@@ -0,0 +1,80 @@
+#include "DecomposeVectorShuffle.h"
+
+#include <unordered_map>
+
+namespace Halide::Internal {
+
+std::vector<std::vector<NativeShuffle>> decompose_to_native_shuffles(
+    int src_lanes, const std::vector<int> &indices, int vl) {
+
+    int dst_lanes = static_cast<int>(indices.size());
+    int src_lanes_aligned = align_up(src_lanes, vl);
+
+    // Adjust the indices so that the src vectors are aligned up to a multiple of vl.
+    std::vector<int> aligned_indices = indices;
+    for (int &idx : aligned_indices) {
+        if (idx >= src_lanes) {
+            idx += src_lanes_aligned - src_lanes;
+        }
+    }
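+    // e.g. with src_lanes = 17 and vl = 4, src_lanes_aligned is 20, so an index
+    // of 18 (lane 1 of the second source) is shifted to 21, which falls in
+    // slice 5, lane 1 -- the first slice of the aligned 'b' vector
+    // (illustrative values).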
+
+    const int num_dst_slices = align_up(dst_lanes, vl) / vl;
+    std::vector<std::vector<NativeShuffle>> all_steps(num_dst_slices);
+
+    for (int dst_slice = 0; dst_slice < num_dst_slices; dst_slice++) {
+        std::unordered_map<int, int> slice_to_step;
+        auto &steps = all_steps[dst_slice];
+        const int dst_start = dst_slice * vl;
+
+        for (int dst_index = dst_start; dst_index < dst_start + vl && dst_index < dst_lanes; ++dst_index) {
+            const int src_index = aligned_indices[dst_index];
+            if (src_index < 0) {
+                continue;
+            }
+
+            const int src_slice = src_index / vl;
+            const int lane_in_src_slice = src_index % vl;
+            const int lane_in_dst_slice = dst_index - dst_start;
+
+            if (steps.empty()) {
+                // The first slice in this block
+                slice_to_step[src_slice] = 0;
+                steps.emplace_back(vl, src_slice, SliceIndexNone);
+                steps.back().lane_map[lane_in_dst_slice] = lane_in_src_slice;
+
+            } else if (auto itr = slice_to_step.find(src_slice); itr != slice_to_step.end()) {
+                // A slice we have already seen
+                NativeShuffle &step = steps[itr->second];
+                bool is_a = (step.slice_a != SliceIndexCarryPrevResult && step.slice_a == src_slice);
+                int offset = is_a ? 0 : vl;
+                step.lane_map[lane_in_dst_slice] = lane_in_src_slice + offset;
+
+            } else if (steps[0].slice_b == SliceIndexNone) {
+                // Add as 'b' of the first step if b is unused
+                slice_to_step[src_slice] = 0;
+                steps[0].slice_b = src_slice;
+                steps[0].lane_map[lane_in_dst_slice] = lane_in_src_slice + vl;
+
+            } else {
+                // Otherwise chain a new step
+                slice_to_step[src_slice] = static_cast<int>(steps.size());
+                // The new step uses the previous result as 'a', so we use 'b' for this slice
+                steps.emplace_back(vl, SliceIndexCarryPrevResult, src_slice);
+
+                // Except for the first step, we need to arrange the indices
+                // so that the output carried from the previous step is kept.
+                auto &lane_map = steps.back().lane_map;
+                // Initialize lane_map as an identity mapping
+                for (size_t lane_idx = 0; lane_idx < lane_map.size(); ++lane_idx) {
+                    lane_map[lane_idx] = lane_idx;
+                }
+                // Update for this index
+                lane_map[lane_in_dst_slice] = lane_in_src_slice + vl;
+            }
+        }
+    }
+
+    return all_steps;
+}
+
+}  // namespace Halide::Internal
diff --git a/src/DecomposeVectorShuffle.h b/src/DecomposeVectorShuffle.h
new file mode 100644
index 000000000000..e3a60e3cd4fa
--- /dev/null
+++ b/src/DecomposeVectorShuffle.h
@@ -0,0 +1,163 @@
+#ifndef HALIDE_DECOMPOSE_VECTOR_SHUFFLE_H
+#define HALIDE_DECOMPOSE_VECTOR_SHUFFLE_H
+
+/** \file
+ *
+ * Perform a vector shuffle by decomposing the operation into a sequence of
+ * sub-shuffle steps, where each step:
+ * - Takes one or two slices as input (slice_a and slice_b)
+ * - Produces one slice (the dst slice)
+ * - Operates on slices whose length equals the target's native vector length (vl)
+ *
+ * The sequence of steps is structured as:
+ * 1. An outer loop that iterates over the slices of the dst vector.
+ * 2. An inner loop that iterates over the native shuffle steps needed to
+ *    complete a single dst slice. Multiple steps may be required because a
+ *    single native shuffle can read from at most 2 slices (native vector
+ *    length x 2), while we may need to fetch from a wider region of the src vector.
+ *
+ * The following example (a log from the test code) illustrates how it works:
+ *
+ * src_lanes: 17, dst_lanes: 7, vl: 4
+ * input a: [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, ]
+ * input b: [170, 180, 190, 200, 210, 220, 230, 240, 250, 260, 270, 280, 290, 300, 310, 320, 330, ]
+ * indices: [6, 13, 24, 14, 7, 11, 5, ]
+ *
+ * slice a:[40, 50, 60, 70, ], slice b:[120, 130, 140, 150, ], indices:[2, 5, -1, 6, ]
+ *     => slice output:[60, 130, -559038801, 140, ]
+ * slice a:[60, 130, -559038801, 140, ], slice b:[210, 220, 230, 240, ], indices:[0, 1, 7, 3, ]
+ *     => slice output:[60, 130, 240, 140, ]
+ * slice a:[40, 50, 60, 70, ], slice b:[80, 90, 100, 110, ], indices:[3, 7, 1, -1, ]
+ *     => slice output:[70, 110, 50, -559038801, ]
+ *
+ * output: [60, 130, 240, 140, 70, 110, 50, ]
+ */
+
+#include "Error.h"
+#include "Util.h"
+
+#include <optional>
+#include <type_traits>
+#include <vector>
+
+namespace Halide {
+namespace Internal {
+
+/** Enum to represent the special cases of a slice index */
+enum {
+    SliceIndexNone = -1,
+    SliceIndexCarryPrevResult = -2,
+};
+
+struct NativeShuffle {
+    int slice_a;
+    int slice_b;
+    std::vector<int> lane_map;
+
+    NativeShuffle(int vl, int a, int b)
+        : slice_a(a), slice_b(b) {
+        lane_map.resize(vl, SliceIndexNone);
+    }
+};
+
+std::vector<std::vector<NativeShuffle>> decompose_to_native_shuffles(
+    int src_lanes, const std::vector<int> &indices, int vl);
+
+/** Algorithm logic for shuffle decomposition, parameterized on the vector type
+ * and a codegen-like class that provides primitive vector operations.
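+ *
+ * A minimal usage sketch (the template arguments are spelled out here for
+ * illustration; the in-tree call sites rely on class template argument
+ * deduction, and STLShuffleCodeGen is the pure-STL model used by
+ * test/correctness/decompose_vector_shuffle.cpp):
+ *
+ *     STLShuffleCodeGen ops;
+ *     DecomposeVectorShuffle<STLShuffleCodeGen, std::vector<int>> shuffler(ops, a, b, src_lanes, vl);
+ *     std::vector<int> result = shuffler.run(indices);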
+ */
+template<typename CodeGenTy, typename VecTy>
+struct DecomposeVectorShuffle {
+    // TODO: when upgrading to C++20, replace with a concept.
+    // get_vector_num_elements may be overloaded (e.g. on Type* and Value*), so use
+    // expression SFINAE rather than a method pointer to handle overload resolution.
+    static_assert(std::is_convertible_v<decltype(std::declval<CodeGenTy>().get_vector_num_elements(std::declval<VecTy>())), int>,
+                  "CodeGenTy must provide: int get_vector_num_elements(VecTy)");
+    static_assert(std::is_invocable_r_v<VecTy, decltype(&CodeGenTy::slice_vector), CodeGenTy &, const VecTy &, int, int>,
+                  "CodeGenTy must provide: VecTy slice_vector(const VecTy &, int, int)");
+    static_assert(std::is_invocable_r_v<VecTy, decltype(&CodeGenTy::concat_vectors), CodeGenTy &, const std::vector<VecTy> &>,
+                  "CodeGenTy must provide: VecTy concat_vectors(const std::vector<VecTy> &)");
+    static_assert(std::is_invocable_r_v<VecTy, decltype(&CodeGenTy::shuffle_scalable_vectors_general), CodeGenTy &, const VecTy &, const VecTy &, const std::vector<int> &>,
+                  "CodeGenTy must provide: VecTy shuffle_scalable_vectors_general(const VecTy &, const VecTy &, const std::vector<int> &)");
+    static_assert(std::is_invocable_r_v<VecTy, decltype(&CodeGenTy::create_undef_vector_like), CodeGenTy &, const VecTy &, int>,
+                  "CodeGenTy must provide: VecTy create_undef_vector_like(const VecTy &, int)");
+
+    DecomposeVectorShuffle(CodeGenTy &codegen, const VecTy &src_a, const VecTy &src_b, int src_lanes, int vl)
+        : codegen(codegen),
+          vl(vl),
+          src_a(align_up_vector(src_a, vl)),
+          src_b(align_up_vector(src_b, vl)),
+          src_lanes(src_lanes),
+          src_lanes_aligned(align_up(src_lanes, vl)) {
+    }
+
+    VecTy run(const std::vector<int> &indices) {
+        auto shuffle_plan = decompose_to_native_shuffles(src_lanes, indices, vl);
+        int dst_lanes = static_cast<int>(indices.size());
+
+        // Process each block divided by vl
+        std::vector<VecTy> shuffled_dst_slices;
+        shuffled_dst_slices.reserve(shuffle_plan.size());
+
+        for (const auto &steps_for_dst_slice : shuffle_plan) {
+            std::optional<VecTy> dst_slice = std::nullopt;
+            for (const auto &step : steps_for_dst_slice) {
+                // Obtain the 1st slice, a
+                VecTy a;
+                if (step.slice_a == SliceIndexCarryPrevResult) {
+                    internal_assert(dst_slice.has_value()) << "Tried to carry from an undefined previous result";
+                    a = *dst_slice;
+                } else {
+                    a = get_vl_slice(step.slice_a);
+                }
+                // Obtain the 2nd slice, b
+                std::optional<VecTy> b;
+                if (step.slice_b == SliceIndexNone) {
+                    b = std::nullopt;
+                } else {
+                    b = std::optional<VecTy>(get_vl_slice(step.slice_b));
+                }
+                // Perform the shuffle, for which the vector length is aligned
+                dst_slice = codegen.shuffle_scalable_vectors_general(a, b.value_or(VecTy{}), step.lane_map);
+            }
+            if (!dst_slice.has_value()) {
+                // There was no shuffle step for this slice, i.e. all the indices are -1
+                dst_slice = codegen.create_undef_vector_like(src_a, vl);
+            }
+            shuffled_dst_slices.push_back(*dst_slice);
+        }
+
+        return codegen.slice_vector(codegen.concat_vectors(shuffled_dst_slices), 0, dst_lanes);
+    }
+
+private:
+    // Helper to extract a slice with lanes == vl
+    VecTy get_vl_slice(int slice_index) {
+        const int num_slices_a = src_lanes_aligned / vl;
+        int start_index = slice_index * vl;
+        if (slice_index < num_slices_a) {
+            return codegen.slice_vector(src_a, start_index, vl);
+        } else {
+            start_index -= src_lanes_aligned;
+            return codegen.slice_vector(src_b, start_index, vl);
+        }
+    }
+
+    VecTy align_up_vector(const VecTy &v, int align) {
+        int len = codegen.get_vector_num_elements(v);
+        return codegen.slice_vector(v, 0, align_up(len, align));
+    }
+
+    CodeGenTy &codegen;
+    int vl;
+    VecTy src_a;
+    VecTy src_b;
+    int src_lanes;
+    int src_lanes_aligned;
+};
+
+}  // namespace Internal
+}  // namespace Halide
+
+#endif
diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt
index e1c44561be00..a25b077a1abb 100644
--- a/test/correctness/CMakeLists.txt
+++ b/test/correctness/CMakeLists.txt
@@ -80,6 +80,7 @@ tests(GROUPS correctness
       debug_to_file.cpp
       debug_to_file_multiple_outputs.cpp
       debug_to_file_reorder.cpp
+      decompose_vector_shuffle.cpp
       deferred_loop_level.cpp
       deinterleave4.cpp
       device_buffer_copies_with_profile.cpp
diff --git a/test/correctness/decompose_vector_shuffle.cpp b/test/correctness/decompose_vector_shuffle.cpp
new file mode 100644
index 000000000000..81b958e04ba6
--- /dev/null
+++ b/test/correctness/decompose_vector_shuffle.cpp
@@ -0,0 +1,155 @@
+#include <Halide.h>
+
+#include <algorithm>
+#include <optional>
+#include <random>
+#include <string>
+#include <vector>
+
+using namespace Halide;
+using namespace Halide::Internal;
+
+using std::optional;
+using std::vector;
+
+namespace {
+
+constexpr int UNDEF_VALUE = 0xdeadbeef;
+
+vector<int> shuffle_without_divided(const vector<int> &a, const vector<int> &b, const vector<int> &indices) {
+    int src_lanes = static_cast<int>(a.size());
+    vector<int> dst(indices.size(), 0x1234abcd);
+    for (size_t i = 0; i < indices.size(); ++i) {
+        int idx = indices[i];
+        if (idx < 0) {
+            dst[i] = UNDEF_VALUE;
+        } else if (idx < src_lanes) {
+            dst[i] = a[idx];
+        } else {
+            int idx_b = idx - src_lanes;
+            internal_assert(idx_b < static_cast<int>(b.size()));
+            dst[i] = b[idx_b];
+        }
+    }
+    return dst;
+}
+
+struct STLShuffleCodeGen {
+    int get_vector_num_elements(const vector<int> &v) {
+        return static_cast<int>(v.size());
+    }
+
+    vector<int> slice_vector(const vector<int> &v, int start, int lanes) {
+        auto result = vector<int>(v.begin() + start, v.begin() + std::min(start + lanes, static_cast<int>(v.size())));
+        result.resize(lanes);
+        return result;
+    }
+
+    vector<int> concat_vectors(const vector<vector<int>> &vecs) {
+        vector<int> out;
+        for (const auto &v : vecs) {
+            out.insert(out.end(), v.begin(), v.end());
+        }
+        return out;
+    }
+
+    vector<int> shuffle_scalable_vectors_general(const vector<int> &a, const vector<int> &b, const vector<int> &indices) {
+        internal_assert(a.size() == indices.size());
+
+        auto result = shuffle_without_divided(a, b, indices);
+
+        debug(1) << "slice a: " << PrintSpan{a} << ", "
+                 << "slice b: " << PrintSpan{b} << ", "
+                 << "indices: " << PrintSpan{indices} << "\n"
+                 << "\t=> slice output: " << PrintSpan{result} << "\n";
+
+        return result;
+    }
+
+    vector<int> create_undef_vector_like(const vector<int> &ref, int lanes) {
+        return vector<int>(lanes, UNDEF_VALUE);
+    }
+};
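+
+// Note: STLShuffleCodeGen is a pure-STL stand-in for CodeGen_ARM. It provides
+// exactly the five primitives checked by the static_asserts in
+// DecomposeVectorShuffle.h, so the decomposition logic can be unit-tested
+// without LLVM in the loop.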
+
+void generate_data(int src_lanes, int dst_lanes,
+                   vector<int> &a, vector<int> &b, vector<int> &indices) {
+    // The input vector values are fixed, for readability.
+    // The index values are random, in the range [-1, src_lanes*2).
+    a.resize(src_lanes);
+    b.resize(src_lanes);
+    for (int i = 0; i < src_lanes; ++i) {
+        a[i] = i * 10;
+        b[i] = (i + src_lanes) * 10;
+    }
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_int_distribution<int> dist(-1, src_lanes * 2 - 1);
+    indices.resize(dst_lanes);
+    for (int i = 0; i < dst_lanes; ++i) {
+        indices[i] = dist(gen);
+    }
+
+    debug(1) << "input a: " << PrintSpan{a} << "\n"
+             << "input b: " << PrintSpan{b} << "\n"
+             << "indices: " << PrintSpan{indices} << "\n\n";
+}
+
+void assert_vectors_equal(const vector<int> &expected, const vector<int> &actual) {
+    internal_assert(expected.size() == actual.size())
+        << "Vector sizes are different\n"
+        << "expected: " << PrintSpan{expected} << "\n"
+        << "  actual: " << PrintSpan{actual} << "\n";
+
+    for (size_t i = 0; i < expected.size(); ++i) {
+        internal_assert(expected[i] == actual[i])
+            << "Mismatch: expected[" << i << "] = " << expected[i] << ", actual[" << i << "] = " << actual[i] << "\n"
+            << "expected: " << PrintSpan{expected} << "\n"
+            << "  actual: " << PrintSpan{actual} << "\n";
+    }
+}
+
+void run_single_test(int src_lanes, int dst_lanes, int vl) {
+    vector<int> a, b, indices;
+    generate_data(src_lanes, dst_lanes, a, b, indices);
+
+    auto expected = shuffle_without_divided(a, b, indices);
+
+    STLShuffleCodeGen ops;
+    DecomposeVectorShuffle shuffler(ops, a, b, src_lanes, vl);
+    auto actual = shuffler.run(indices);
+
+    assert_vectors_equal(expected, actual);
+}
+
+void run_test(int src_lanes, int dst_lanes, int vl, int repeat) {
+    debug(2) << "Running " << repeat << " tests for\n"
+             << "  src_lanes: " << src_lanes
+             << ", dst_lanes: " << dst_lanes
+             << ", vl: " << vl << "\n";
+
+    for (int t = 0; t < repeat; ++t) {
+        run_single_test(src_lanes, dst_lanes, vl);
+    }
+}
+
+}  // namespace
+
+int main(int argc, char *argv[]) {
+    int repeat = 100;
+
+    if (argc >= 3) {
+        int src_lanes = std::stoi(argv[1]);
+        int dst_lanes = std::stoi(argv[2]);
+        int vl = (argc >= 4) ? std::stoi(argv[3]) : 4;
+        repeat = (argc >= 5) ? std::stoi(argv[4]) : repeat;
+        internal_assert(popcount64(vl) == 1 && vl > 1) << "vl must be a power of 2 greater than 1";
+        run_test(src_lanes, dst_lanes, vl, repeat);
+    } else {
+        run_test(8, 8, 4, repeat);
+        run_test(19, 9, 4, repeat);
+        run_test(5, 3, 8, repeat);
+    }
+
+    printf("Success!\n");
+    return 0;
+}
diff --git a/test/correctness/interleave.cpp b/test/correctness/interleave.cpp
index 0d758428e2cd..cbee263f5487 100644
--- a/test/correctness/interleave.cpp
+++ b/test/correctness/interleave.cpp
@@ -74,6 +74,17 @@ Expr element(FuncRef f, int i) {
 int main(int argc, char **argv) {
     Var x, y, c;
 
+    // The SVE2 backend hits the LLVM issue below, which was fixed in LLVM 22.
+    // "LLVM ERROR: Don't know how to widen the operands for INSERT_SUBVECTOR"
+    // https://github.com/llvm/llvm-project/issues/160134
+    // https://github.com/llvm/llvm-project/issues/169300
+    if (Internal::get_llvm_version() < 220 &&
+        get_jit_target_from_environment().has_feature(Target::SVE2)) {
+        printf("[SKIP] LLVM %d has known SVE backend bugs for this test.\n",
+               Internal::get_llvm_version());
+        return 0;
+    }
+
     // TODO: Is this still true?
     // As of May 26 2016, this test causes a segfault due to
     // permissions failure on ARM-32 trying to execute a
diff --git a/test/correctness/predicated_store_load_single_lane.cpp b/test/correctness/predicated_store_load_single_lane.cpp
index 3e1f3b3b4ca0..64fa8cf86713 100644
--- a/test/correctness/predicated_store_load_single_lane.cpp
+++ b/test/correctness/predicated_store_load_single_lane.cpp
@@ -3,6 +3,16 @@
 using namespace Halide;
 
 int main(int argc, char **argv) {
+    // The SVE2 backend hits the LLVM issue below, which was fixed in LLVM 22.
+    // "LLVM ERROR: Unable to widen vector store"
+    // https://github.com/llvm/llvm-project/issues/54424
+    if (Internal::get_llvm_version() < 220 &&
+        get_jit_target_from_environment().has_feature(Target::SVE2)) {
+        printf("[SKIP] LLVM %d has known SVE backend bugs for this test.\n",
+               Internal::get_llvm_version());
+        return 0;
+    }
+
     // This test exercises predicated vector loads and stores with a single
     // lane. These require special handling because Halide's IR does not
     // distinguish between scalars and single-element vectors, while LLVM
diff --git a/test/correctness/simd_op_check_sve2.cpp b/test/correctness/simd_op_check_sve2.cpp
index f0183412323a..467c7a5f7794 100644
--- a/test/correctness/simd_op_check_sve2.cpp
+++ b/test/correctness/simd_op_check_sve2.cpp
@@ -21,37 +21,35 @@ using CastFuncTy = function<Expr(Expr)>;
 
 class SimdOpCheckArmSve : public SimdOpCheckTest {
 public:
-    SimdOpCheckArmSve(Target t, int w = 384, int h = 32)
+    SimdOpCheckArmSve(Target t, int w = 512, int h = 16)
         : SimdOpCheckTest(t, w, h),
           debug_mode(Internal::get_env_variable("HL_DEBUG_SIMDOPCHECK")) {
 
         // Determine and hold can_run_the_code
-        // TODO: Since features of Arm CPU cannot be obtained automatically from get_host_target(),
-        // it is necessary to set some feature (e.g. "arm_fp16") explicitly to HL_JIT_TARGET.
-        // Halide throws error if there is unacceptable mismatch between jit_target and host_target.
-
         Target host = get_host_target();
         Target jit_target = get_jit_target_from_environment();
         cout << "host is: " << host.to_string() << endl;
         cout << "HL_TARGET is: " << target.to_string() << endl;
         cout << "HL_JIT_TARGET is: " << jit_target.to_string() << endl;
-        auto is_same_triple = [](const Target &t1, const Target &t2) -> bool {
-            return t1.arch == t2.arch && t1.bits == t2.bits && t1.os == t2.os && t1.vector_bits == t2.vector_bits;
+        auto is_runtime_compatible = [](const Target &t1, const Target &t2) -> bool {
+            bool yes = true;
+            yes &= (t1.arch == t2.arch && t1.bits == t2.bits && t1.os == t2.os);
+            yes &= (t1.vector_bits == t2.vector_bits);
+
+            // A bunch of feature flags also need to match between the
+            // compiled code and the host in order to run the code.
+            for (Target::Feature f : {Target::SVE2}) {
+                yes &= (t1.has_feature(f) == t2.has_feature(f));
+            }
+            return yes;
         };
-        can_run_the_code = is_same_triple(host, target) && is_same_triple(jit_target, target);
+        can_run_the_code = is_runtime_compatible(host, target) && is_runtime_compatible(jit_target, target);
 
-        // A bunch of feature flags also need to match between the
-        // compiled code and the host in order to run the code.
-        for (Target::Feature f : {Target::ARMv7s, Target::ARMFp16, Target::NoNEON, Target::SVE2}) {
-            if (target.has_feature(f) != jit_target.has_feature(f)) {
-                can_run_the_code = false;
-            }
-        }
         if (!can_run_the_code) {
-            cout << "[WARN] To perform verification of realization, "
-                 << R"(the target triple "arm--" and key feature "arm_fp16")"
-                 << " must be the same between HL_TARGET and HL_JIT_TARGET" << endl;
+            debug(0) << "[WARN] To perform verification of realization, "
+                     << R"(the target triple "arm--", vector_bits, and feature "sve2")"
+                     << " must be the same between HL_TARGET and HL_JIT_TARGET" << endl;
         }
     }
 
@@ -563,13 +561,20 @@ class SimdOpCheckArmSve : public SimdOpCheckTest {
                 continue;
             }
 
-            vector<int> total_bits_params = {256};  // {64, 128, 192, 256};
+            std::vector<int> simd_bit_widths;
+            if (has_sve()) {
+                simd_bit_widths.push_back(target.vector_bits);
+            } else if (has_neon()) {
+                simd_bit_widths.push_back(64);
+                simd_bit_widths.push_back(128);
+            }
+
             if (bits != 64) {
                 // Add the scalar case to verify the float16 native operation
-                total_bits_params.push_back(bits);
+                simd_bit_widths.push_back(bits);
             }
 
-            for (auto total_bits : total_bits_params) {
+            for (auto &total_bits : simd_bit_widths) {
                 const int vf = total_bits / bits;
                 const bool is_vector = vf > 1;
@@ -720,6 +725,20 @@ class SimdOpCheckArmSve : public SimdOpCheckTest {
                 }
             }
 
+            // TBL - Structured load with stride=5
+            {
+                constexpr int stride = 5;
+                const int vector_lanes = base_vec_bits * 4 / bits;
+
+                AddTestFunctor add(*this, bits, vector_lanes);
+
+                Expr load_n = in_im(x * stride) + in_im(x * stride + stride - 1);
+
+                if (has_sve()) {
+                    add("tbl", load_n);
+                }
+            }
+
             // ST2 - Store two-element structures
             for (int factor : {1, 2}) {
                 const int width = base_vec_bits * 2 * factor;
@@ -829,7 +848,8 @@ class SimdOpCheckArmSve : public SimdOpCheckTest {
 
         // SVE Gather/Scatter
        if (has_sve()) {
-            for (int width = 64; width <= 64 * 4; width *= 2) {
+            for (float factor : {0.5f, 1.f, 2.f}) {
+                const int width = base_vec_bits * factor;
                 const int total_lanes = width / bits;
                 const int instr_lanes = min(total_lanes, 128 / bits);
                 if (instr_lanes < 2 || (total_lanes / vscale < 2)) continue;  // bail out on scalar and sub-native cases
@@ -846,6 +866,14 @@ class SimdOpCheckArmSve : public SimdOpCheckTest {
                 const int index_bits = std::max(32, bits);
                 add({get_sve_ls_instr("ld1", bits, index_bits, "uxtw")}, total_lanes, gather);
                 add({get_sve_ls_instr("st1", bits, index_bits, "uxtw")}, total_lanes, scatter);
+
+                // When the lane count is shorter than the native width, the predicate
+                // pattern is generated by the "whilelt" intrinsic, e.g.
+                // @llvm.aarch64.sve.whilelt.nxv8i1.i32(i32 0, i32 4)
+                if (factor == 0.5f) {
+                    string constraint("vl" + to_string(total_lanes));
+                    add("whilelt", {get_ptrue_instr_with_constraint(bits, constraint)}, total_lanes, scatter);
+                }
             }
         }
     }
@@ -871,9 +899,12 @@ class SimdOpCheckArmSve : public SimdOpCheckTest {
             {64, in_i64, in_u64, i64, i64, u64, u64},
         };
 
+        const int base_vec_bits = has_sve() ? target.vector_bits : 128;
+        const int vscale = base_vec_bits / 128;
+
         for (const auto &[bits, in_i, in_u, widen_i, widenx4_i, widen_u, widenx4_u] : test_params) {
 
-            for (auto &total_bits : {64, 128}) {
+            for (auto &total_bits : {base_vec_bits / 2, base_vec_bits}) {
                 const int vf = total_bits / bits;
                 const int instr_lanes = Instruction::get_force_vectorized_instr_lanes(bits, vf, target);
                 AddTestFunctor add(*this, bits, instr_lanes, vf, !(is_arm32() && bits == 64));  // 64 bit is unavailable in 32-bit NEON
@@ -945,11 +976,13 @@ class SimdOpCheckArmSve : public SimdOpCheckTest {
 
                 // UDOT/SDOT
                 if (is_arm_dot_prod_available) {
-                    const int factor_32bit = vf / 4;
+                    const int factor_reduced = vf / 4;
+                    if (factor_reduced / vscale < 2) continue;  // bail out on scalar and sub-native cases
+
                     for (int f : {4, 8}) {
                         // checks the vector register for the narrow src data type (i.e. 8 or 16 bit)
-                        const int lanes_src = Instruction::get_instr_lanes(bits, f * factor_32bit, target);
-                        AddTestFunctor add_dot(*this, bits, lanes_src, factor_32bit);
+                        const int lanes_src = Instruction::get_instr_lanes(bits, f * factor_reduced, target);
+                        AddTestFunctor add_dot(*this, bits, lanes_src, factor_reduced);
                         RDom r(0, f);
 
                         add_dot("udot", sum(widenx4_u(in_u(f * x + r)) * in_u(f * x + r + 32)));
@@ -1048,13 +1081,13 @@ class SimdOpCheckArmSve : public SimdOpCheckTest {
             return opcode_pattern + R"(\s.*\b)" + operand_pattern + R"(\b.*)";
         }
 
-        // TODO Fix this for SVE2
-        static int natural_lanes(int bits) {
-            return 128 / bits;
+        static int natural_lanes(int bits, const Target &t) {
+            const int base_vector_bits = std::max(t.vector_bits, 128);
+            return base_vector_bits / bits;
         }
 
         static int get_instr_lanes(int bits, int vec_factor, const Target &target) {
-            return min(natural_lanes(bits), vec_factor);
+            return min(natural_lanes(bits, target), vec_factor);
         }
 
         static int get_force_vectorized_instr_lanes(int bits, int vec_factor, const Target &target) {
@@ -1063,10 +1096,10 @@ class SimdOpCheckArmSve : public SimdOpCheckTest {
                 if (vec_factor == 1) {
                     return 1;
                 } else {
-                    return natural_lanes(bits);
+                    return natural_lanes(bits, target);
                 }
             } else {
-                int min_lanes = std::max(2, natural_lanes(bits) / 2);  // 64-bit-wide VL
+                int min_lanes = std::max(2, natural_lanes(bits, target) / 2);  // 64-bit-wide VL
                 return max(min_lanes, get_instr_lanes(bits, vec_factor, target));
             }
         }
@@ -1075,7 +1108,7 @@ class SimdOpCheckArmSve : public SimdOpCheckTest {
             return opcode + to_string(bits.value());
         }
 
-        const char *get_bits_designator() const {
+        static const char *get_bits_designator(int bits) {
             static const map<int, const char *> designators{
                 // NOTE: vector or float only
                 {8, "b"},
                 {16, "h"},
                 {32, "s"},
                 {64, "d"},
             };
-            auto iter = designators.find(bits.value());
+            auto iter = designators.find(bits);
             assert(iter != designators.end());
             return iter->second;
         }
@@ -1092,7 +1125,7 @@ class SimdOpCheckArmSve : public SimdOpCheckTest {
             if (pattern_lanes == ANY_LANES) {
                 return R"((z\d\d?\.[bhsd])|(s\d\d?))";
             } else {
-                const char *bits_designator = get_bits_designator();
+                const char *bits_designator = get_bits_designator(bits.value());
                 // TODO(need issue): This should only match the scalar register, and likely a NEON instruction opcode.
                 // Generating a full SVE vector instruction for a scalar operation is inefficient. However this is
                 // happening and fixing it involves changing intrinsic selection. Likely to use NEON intrinsics where
             }
         }
 
         string get_reg_neon64() const {
-            const char *bits_designator = get_bits_designator();
+            const char *bits_designator = get_bits_designator(bits.value());
             if (pattern_lanes == 1) {
                 return std::string(bits_designator) + R"(\d\d?)";  // e.g. "h15"
             } else if (pattern_lanes == ANY_LANES) {
@@ -1149,6 +1182,15 @@ class SimdOpCheckArmSve : public SimdOpCheckTest {
             return get_sve_ls_instr(base_opcode, bits, bits, "");
         }
 
+        Instruction get_ptrue_instr_with_constraint(int bits, const string &constraint) {
+            // The special predicate pattern is generated by the "whilelt" intrinsic, e.g.
+            // @llvm.aarch64.sve.whilelt.nxv8i1.i32(i32 0, i32 4)
+            // LLVM compiles this to the instruction below:
+            //   ptrue p0.h, vl4
+            string operand = R"(p\d\d?\.)" + string(Instruction::get_bits_designator(bits));
+            return Instruction("ptrue", operand + R"(,\s.*\b)" + constraint);
+        }
+
         // Helper functor to add test cases
         class AddTestFunctor {
         public:
@@ -1379,5 +1421,6 @@ int main(int argc, char **argv) {
                Target("arm-64-linux-sve2-no_neon-vector_bits_128"),
                Target("arm-64-linux-sve2-no_neon-vector_bits_256"),
+               Target("arm-64-linux-sve2-no_neon-vector_bits_512"),
            });
 }
diff --git a/test/correctness/stmt_to_html.cpp b/test/correctness/stmt_to_html.cpp
index dae0cdc95527..0dad8820d0c3 100644
--- a/test/correctness/stmt_to_html.cpp
+++ b/test/correctness/stmt_to_html.cpp
@@ -6,6 +6,16 @@
 using namespace Halide;
 
 int main() {
+    // The SVE2 backend hits the LLVM issue below, which was fixed in LLVM 22.
+    // "Request for a fixed element count on a scalable object"
+    // https://github.com/llvm/llvm-project/issues/160127
+    if (Internal::get_llvm_version() < 220 &&
+        get_jit_target_from_environment().has_feature(Target::SVE2)) {
+        printf("[SKIP] LLVM %d has known SVE backend bugs for this test.\n",
+               Internal::get_llvm_version());
+        return 0;
+    }
+
     Var x, y;
 
     // The gradient function and schedule from tutorial lesson 5.
diff --git a/test/performance/boundary_conditions.cpp b/test/performance/boundary_conditions.cpp
index 04c525ee9554..0dd7c98077e8 100644
--- a/test/performance/boundary_conditions.cpp
+++ b/test/performance/boundary_conditions.cpp
@@ -91,6 +91,13 @@ int main(int argc, char **argv) {
         return 0;
     }
 
+    if (Halide::Internal::get_llvm_version() < 220 &&
+        get_jit_target_from_environment().has_feature(Target::SVE2)) {
+        printf("[SKIP] LLVM %d has known SVE backend bugs for this test.\n",
+               Halide::Internal::get_llvm_version());
+        return 0;
+    }
+
     ImageParam input(Float(32), 2);
     ImageParam padded_input(Float(32), 2);
 
diff --git a/tutorial/CMakeLists.txt b/tutorial/CMakeLists.txt
index e90ce173798a..9b911dc1c552 100644
--- a/tutorial/CMakeLists.txt
+++ b/tutorial/CMakeLists.txt
@@ -51,7 +51,20 @@ add_tutorial(lesson_01_basics.cpp)
 add_tutorial(lesson_02_input_image.cpp WITH_IMAGE_IO)
 add_tutorial(lesson_03_debugging_1.cpp)
 add_tutorial(lesson_04_debugging_2.cpp GROUPS multithreaded)
-add_tutorial(lesson_05_scheduling_1.cpp GROUPS multithreaded)
+
+if (Halide_BUILDING_IN_CI AND
+    Halide_LLVM_VERSION VERSION_LESS 22 AND
+    Halide_HOST_TARGET MATCHES "arm-64-linux")
+    # We can't reliably detect SVE2 without having first built Halide, but the
+    # SVE2 backend has the LLVM issue below, which was fixed in LLVM 22.
+    # This issue breaks lesson 5 on the GitHub Actions ARM64 runners, which have
+    # SVE2 support.
+ # "Request for a fixed element count on a scalable object" + # See: https://github.com/llvm/llvm-project/issues/160127 +else () + add_tutorial(lesson_05_scheduling_1.cpp GROUPS multithreaded) +endif () + add_tutorial(lesson_06_realizing_over_shifted_domains.cpp) add_tutorial(lesson_07_multi_stage_pipelines.cpp WITH_IMAGE_IO) add_tutorial(lesson_08_scheduling_2.cpp WITH_IMAGE_IO WITH_OPENMP GROUPS multithreaded)