Skip to content

Commit 72360f6

Browse files
authored
Merge branch 'halide:main' into main
2 parents eaa2054 + dd187a2 commit 72360f6

37 files changed

Lines changed: 1862 additions & 200 deletions

.github/workflows/pip.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -47,7 +47,7 @@ jobs:
4747
fetch-tags: true
4848

4949
- uses: ilammy/msvc-dev-cmd@v1
50-
- uses: lukka/get-cmake@v4.3.1
50+
- uses: lukka/get-cmake@v4.3.2
5151
with:
5252
cmakeVersion: "~3.28.0"
5353

apps/iir_blur/Makefile

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,7 @@ $(BIN)/%/filter: filter.cpp $(BIN)/%/iir_blur.a $(BIN)/%/iir_blur_auto_schedule.
2525
$(CXX) $(CXXFLAGS) -I$(BIN)/$* -Wall -O3 $^ -o $@ $(LDFLAGS) $(IMAGE_IO_FLAGS) $(CUDA_LDFLAGS) $(OPENCL_LDFLAGS)
2626

2727
$(BIN)/%/out.png: $(BIN)/%/filter
28-
$< ../images/rgba.png $(BIN)/$*/out.png
28+
$< ../images/rgb.png $(BIN)/$*/out.png
2929

3030
clean:
3131
rm -rf $(BIN)

apps/iir_blur/iir_blur_generator.cpp

Lines changed: 13 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -36,19 +36,26 @@ Func blur_cols_transpose(Func input, Expr height, Expr alpha, bool skip_schedule
3636
if (!skip_schedule) {
3737
if (!target.has_gpu_feature()) {
3838
// CPU schedule.
39-
// 8.2ms on an Intel i9-9960X using 16 threads
39+
// 9.7ms on an Intel i9-9960X at 3.1 GHz using 16 threads
4040
// Split the transpose into tiles of rows. Parallelize over channels
41-
// and strips (Halide supports nested parallelism).
42-
Var xo, yo, t;
41+
// and strips.
42+
Var xo, yo, t, yi;
4343
transpose.compute_root()
4444
.tile(x, y, xo, yo, x, y, vec, vec * 4)
45+
.split(y, y, yi, vec)
46+
.vectorize(yi)
4547
.vectorize(x)
46-
.parallel(yo)
47-
.parallel(c);
48+
.fuse(yo, c, t)
49+
.parallel(t);
50+
51+
blur.in(transpose)
52+
.compute_at(transpose, y)
53+
.vectorize(x)
54+
.unroll(y);
4855

4956
// Run the filter on each row of tiles (which corresponds to a strip of
5057
// columns in the input).
51-
blur.compute_at(transpose, yo);
58+
blur.compute_at(transpose, t);
5259

5360
// Vectorize computations within the strips.
5461
blur.update(0)

src/CSE.cpp

Lines changed: 32 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -239,10 +239,39 @@ class CSEEveryExprInStmt : public IRMutator {
239239
}
240240
const Call *bundle = Call::as_intrinsic(dummy, {Call::bundle});
241241
internal_assert(bundle && bundle->args.size() == 2);
242-
Stmt s = Store::make(op->name, bundle->args[0], bundle->args[1],
242+
243+
Expr value = bundle->args[0], index = bundle->args[1];
244+
245+
// Figure out which ones are actually needed by the index
246+
247+
auto add_all_vars_to_set = [&](const Expr &e, std::set<std::string> &s) {
248+
visit_with(e, [&](auto *, const Variable *var) {
249+
s.insert(var->name);
250+
});
251+
};
252+
253+
std::set<string> index_lets;
254+
add_all_vars_to_set(index, index_lets);
255+
for (const auto &[var, val] : reverse_view(lets)) {
256+
if (index_lets.count(var)) {
257+
add_all_vars_to_set(val, index_lets);
258+
}
259+
}
260+
261+
vector<pair<string, Expr>> deferred;
262+
for (const auto &[var, val] : reverse_view(lets)) {
263+
if (index_lets.count(var)) {
264+
deferred.emplace_back(var, val);
265+
} else {
266+
value = Let::make(var, val, value);
267+
}
268+
}
269+
270+
Stmt s = Store::make(op->name, value, index,
243271
op->param, mutate(op->predicate), op->alignment);
244-
for (const auto &[var, value] : reverse_view(lets)) {
245-
s = LetStmt::make(var, value, s);
272+
273+
for (const auto &[var, val] : deferred) {
274+
s = LetStmt::make(var, val, s);
246275
}
247276
return s;
248277
}

src/CodeGen_ARM.cpp

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1499,10 +1499,11 @@ void CodeGen_ARM::visit(const Store *op) {
14991499
intrin_type = t;
15001500
Type elt = t.element_of();
15011501
int vec_bits = t.bits() * t.lanes();
1502-
if (elt == Float(32) || elt == Float(64) ||
1503-
is_float16_and_has_feature(elt) ||
1504-
elt == Int(8) || elt == Int(16) || elt == Int(32) || elt == Int(64) ||
1505-
elt == UInt(8) || elt == UInt(16) || elt == UInt(32) || elt == UInt(64)) {
1502+
if (t.bits() <= target.bits &&
1503+
(elt == Float(32) || elt == Float(64) ||
1504+
is_float16_and_has_feature(elt) ||
1505+
elt == Int(8) || elt == Int(16) || elt == Int(32) || elt == Int(64) ||
1506+
elt == UInt(8) || elt == UInt(16) || elt == UInt(32) || elt == UInt(64))) {
15061507
const int target_vector_bits = native_vector_bits();
15071508
if (vec_bits % 128 == 0) {
15081509
type_ok_for_vst = true;
@@ -1978,6 +1979,7 @@ void CodeGen_ARM::visit(const Shuffle *op) {
19781979
if (target.os != Target::IOS && target.os != Target::OSX &&
19791980
load &&
19801981
op->vectors.size() == 1 &&
1982+
op->is_slice() &&
19811983
2 <= stride && stride <= 4 &&
19821984
op->slice_begin() < stride &&
19831985
load->type.lanes() == stride * op->type.lanes()) {

src/CodeGen_Hexagon.cpp

Lines changed: 7 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -95,6 +95,7 @@ class CodeGen_Hexagon : public CodeGen_CPU {
9595
llvm::Value *interleave_vectors(const std::vector<llvm::Value *> &v) override;
9696
llvm::Value *shuffle_vectors(llvm::Value *a, llvm::Value *b,
9797
const std::vector<int> &indices) override;
98+
llvm::Value *optimization_fence(llvm::Value *v) override;
9899
using CodeGen_CPU::shuffle_vectors;
99100
///@}
100101

@@ -1301,6 +1302,12 @@ Value *CodeGen_Hexagon::shuffle_vectors(Value *a, Value *b,
13011302
return vdelta(concat_vectors({a, b}), indices);
13021303
}
13031304

1305+
Value *CodeGen_Hexagon::optimization_fence(Value *v) {
1306+
// As of llvm 21, the base class version seems to trip up LLVM's hexagon
1307+
// backend, possibly because it relies on a floating point type.
1308+
return v;
1309+
}
1310+
13041311
Value *CodeGen_Hexagon::vlut256(Value *lut, Value *idx, int min_index,
13051312
int max_index) {
13061313
llvm::Type *lut_ty = lut->getType();
@@ -1409,10 +1416,6 @@ Value *CodeGen_Hexagon::vlut256(Value *lut, Value *idx, int min_index,
14091416
return slice_vector(concat_vectors(result), 0, idx_elements);
14101417
}
14111418

1412-
bool is_power_of_two(int x) {
1413-
return (x & (x - 1)) == 0;
1414-
}
1415-
14161419
// vdelta and vrdelta are instructions that take an input vector and
14171420
// pass it through a network made up of levels. Each element x at each
14181421
// level i can either take the element from the previous level at the

0 commit comments

Comments
 (0)