halide
diff --git a/‎.github/workflows/pip.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/pip.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎apps/iir_blur/Makefile‎
Lines changed: 1 addition & 1 deletion b/‎apps/iir_blur/Makefile‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎apps/iir_blur/iir_blur_generator.cpp‎
Lines changed: 13 additions & 6 deletions b/‎apps/iir_blur/iir_blur_generator.cpp‎
Lines changed: 13 additions & 6 deletions
diff --git a/‎src/CSE.cpp‎
Lines changed: 32 additions & 3 deletions b/‎src/CSE.cpp‎
Lines changed: 32 additions & 3 deletions
diff --git a/‎src/CodeGen_ARM.cpp‎
Lines changed: 6 additions & 4 deletions b/‎src/CodeGen_ARM.cpp‎
Lines changed: 6 additions & 4 deletions
diff --git a/‎src/CodeGen_Hexagon.cpp‎
Lines changed: 7 additions & 4 deletions b/‎src/CodeGen_Hexagon.cpp‎
Lines changed: 7 additions & 4 deletions
@@ -47,7 +47,7 @@ jobs:
           fetch-tags: true
 
       - uses: ilammy/msvc-dev-cmd@v1
-      - uses: lukka/get-cmake@v4.3.1
+      - uses: lukka/get-cmake@v4.3.2
         with:
           cmakeVersion: "~3.28.0"
 
 
@@ -25,7 +25,7 @@ $(BIN)/%/filter: filter.cpp $(BIN)/%/iir_blur.a $(BIN)/%/iir_blur_auto_schedule.
 	$(CXX) $(CXXFLAGS) -I$(BIN)/$* -Wall -O3 $^ -o $@ $(LDFLAGS) $(IMAGE_IO_FLAGS) $(CUDA_LDFLAGS) $(OPENCL_LDFLAGS)
 
 $(BIN)/%/out.png: $(BIN)/%/filter
-	$< ../images/rgba.png $(BIN)/$*/out.png
+	$< ../images/rgb.png $(BIN)/$*/out.png
 
 clean:
 	rm -rf $(BIN)
@@ -36,19 +36,26 @@ Func blur_cols_transpose(Func input, Expr height, Expr alpha, bool skip_schedule
     if (!skip_schedule) {
         if (!target.has_gpu_feature()) {
             // CPU schedule.
-            // 8.2ms on an Intel i9-9960X using 16 threads
+            // 9.7ms on an Intel i9-9960X at 3.1 GHz using 16 threads
             // Split the transpose into tiles of rows. Parallelize over channels
-            // and strips (Halide supports nested parallelism).
-            Var xo, yo, t;
+            // and strips.
+            Var xo, yo, t, yi;
             transpose.compute_root()
                 .tile(x, y, xo, yo, x, y, vec, vec * 4)
+                .split(y, y, yi, vec)
+                .vectorize(yi)
                 .vectorize(x)
-                .parallel(yo)
-                .parallel(c);
+                .fuse(yo, c, t)
+                .parallel(t);
+
+            blur.in(transpose)
+                .compute_at(transpose, y)
+                .vectorize(x)
+                .unroll(y);
 
             // Run the filter on each row of tiles (which corresponds to a strip of
             // columns in the input).
-            blur.compute_at(transpose, yo);
+            blur.compute_at(transpose, t);
 
             // Vectorize computations within the strips.
             blur.update(0)
 
@@ -239,10 +239,39 @@ class CSEEveryExprInStmt : public IRMutator {
         }
         const Call *bundle = Call::as_intrinsic(dummy, {Call::bundle});
         internal_assert(bundle && bundle->args.size() == 2);
-        Stmt s = Store::make(op->name, bundle->args[0], bundle->args[1],
+
+        Expr value = bundle->args[0], index = bundle->args[1];
+
+        // Figure out which of the lets are actually needed by the index
+
+        auto add_all_vars_to_set = [&](const Expr &e, std::set<std::string> &s) {
+            visit_with(e, [&](auto *, const Variable *var) {
+                s.insert(var->name);
+            });
+        };
+
+        std::set<string> index_lets;
+        add_all_vars_to_set(index, index_lets);
+        for (const auto &[var, val] : reverse_view(lets)) {
+            if (index_lets.count(var)) {
+                add_all_vars_to_set(val, index_lets);
+            }
+        }
+
+        vector<pair<string, Expr>> deferred;
+        for (const auto &[var, val] : reverse_view(lets)) {
+            if (index_lets.count(var)) {
+                deferred.emplace_back(var, val);
+            } else {
+                value = Let::make(var, val, value);
+            }
+        }
+
+        Stmt s = Store::make(op->name, value, index,
                              op->param, mutate(op->predicate), op->alignment);
-        for (const auto &[var, value] : reverse_view(lets)) {
-            s = LetStmt::make(var, value, s);
+
+        for (const auto &[var, val] : deferred) {
+            s = LetStmt::make(var, val, s);
         }
         return s;
     }
 
@@ -1499,10 +1499,11 @@ void CodeGen_ARM::visit(const Store *op) {
         intrin_type = t;
         Type elt = t.element_of();
         int vec_bits = t.bits() * t.lanes();
-        if (elt == Float(32) || elt == Float(64) ||
-            is_float16_and_has_feature(elt) ||
-            elt == Int(8) || elt == Int(16) || elt == Int(32) || elt == Int(64) ||
-            elt == UInt(8) || elt == UInt(16) || elt == UInt(32) || elt == UInt(64)) {
+        if (t.bits() <= target.bits &&
+            (elt == Float(32) || elt == Float(64) ||
+             is_float16_and_has_feature(elt) ||
+             elt == Int(8) || elt == Int(16) || elt == Int(32) || elt == Int(64) ||
+             elt == UInt(8) || elt == UInt(16) || elt == UInt(32) || elt == UInt(64))) {
             const int target_vector_bits = native_vector_bits();
             if (vec_bits % 128 == 0) {
                 type_ok_for_vst = true;
@@ -1978,6 +1979,7 @@ void CodeGen_ARM::visit(const Shuffle *op) {
     if (target.os != Target::IOS && target.os != Target::OSX &&
         load &&
         op->vectors.size() == 1 &&
+        op->is_slice() &&
         2 <= stride && stride <= 4 &&
         op->slice_begin() < stride &&
         load->type.lanes() == stride * op->type.lanes()) {
 
@@ -95,6 +95,7 @@ class CodeGen_Hexagon : public CodeGen_CPU {
     llvm::Value *interleave_vectors(const std::vector<llvm::Value *> &v) override;
     llvm::Value *shuffle_vectors(llvm::Value *a, llvm::Value *b,
                                  const std::vector<int> &indices) override;
+    llvm::Value *optimization_fence(llvm::Value *v) override;
     using CodeGen_CPU::shuffle_vectors;
     ///@}
 
@@ -1301,6 +1302,12 @@ Value *CodeGen_Hexagon::shuffle_vectors(Value *a, Value *b,
     return vdelta(concat_vectors({a, b}), indices);
 }
 
+Value *CodeGen_Hexagon::optimization_fence(Value *v) {
+    // As of LLVM 21, the base class version seems to trip up LLVM's Hexagon
+    // backend (possibly because it relies on a floating point type): no-op.
+    return v;
+}
+
 Value *CodeGen_Hexagon::vlut256(Value *lut, Value *idx, int min_index,
                                 int max_index) {
     llvm::Type *lut_ty = lut->getType();
@@ -1409,10 +1416,6 @@ Value *CodeGen_Hexagon::vlut256(Value *lut, Value *idx, int min_index,
     return slice_vector(concat_vectors(result), 0, idx_elements);
 }
 
-bool is_power_of_two(int x) {
-    return (x & (x - 1)) == 0;
-}
-
 // vdelta and vrdelta are instructions that take an input vector and
 // pass it through a network made up of levels. Each element x at each
 // level i can either take the element from the previous level at the