Skip to content

Commit 4cf212c

Browse files
authored
Merge pull request #784 from hvdijk/vecz-barrier-id
[vecz] Track barriers by ID.
2 parents 124cd80 + e0d0de9 commit 4cf212c

4 files changed

Lines changed: 136 additions & 49 deletions

File tree

modules/compiler/compiler_pipeline/include/compiler/utils/barrier_regions.h

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@
3131
#include <llvm/Transforms/Utils/ValueMapper.h>
3232
#include <multi_llvm/llvm_version.h>
3333

34+
#include <map>
35+
3436
#include "pass_functions.h"
3537

3638
namespace llvm {
@@ -82,10 +84,11 @@ struct BarrierRegion {
8284
BarrierSchedule schedule = BarrierSchedule::Unordered;
8385
};
8486

85-
using BarrierGraph = llvm::SmallVector<BarrierRegion, 8>;
86-
8787
class Barrier {
8888
public:
89+
/// @brief Type for ids of new kernel functions
90+
using kernel_id_map_t = std::map<unsigned, llvm::Function *>;
91+
8992
Barrier(llvm::Module &m, llvm::Function &f, bool IsDebug)
9093
: live_var_mem_ty_(nullptr),
9194
size_t_bytes(compiler::utils::getSizeTypeBytes(m)),
@@ -106,6 +109,9 @@ class Barrier {
106109
/// @brief returns the maximum alignment of the barrier struct
107110
unsigned getLiveVarMaxAlignment() const { return max_live_var_alignment; }
108111

112+
/// @brief gets the split subkernels
113+
const kernel_id_map_t &getSubkernels() const { return kernel_id_map_; }
114+
109115
/// @brief gets the split subkernel for the given barrier id
110116
llvm::Function *getSubkernel(unsigned id) const {
111117
return kernel_id_map_.find(id)->second;
@@ -116,7 +122,7 @@ class Barrier {
116122

117123
llvm::CallInst *getBarrierCall(unsigned id) const {
118124
return llvm::dyn_cast_or_null<llvm::CallInst>(
119-
barrier_graph[id - kBarrier_FirstID].barrier_inst);
125+
barrier_region_id_map_.find(id)->second.barrier_inst);
120126
}
121127

122128
/// @brief gets the size of the fixed sized part of the barrier struct
@@ -135,12 +141,12 @@ class Barrier {
135141

136142
/// @brief gets the barrier IDs of the successors of the given barrier region
137143
const llvm::SmallVectorImpl<unsigned> &getSuccessorIds(unsigned id) const {
138-
return barrier_graph[id - kBarrier_FirstID].successor_ids;
144+
return barrier_region_id_map_.find(id)->second.successor_ids;
139145
}
140146

141147
/// @brief gets the schedule of the given barrier region
142148
BarrierSchedule getSchedule(unsigned id) const {
143-
return barrier_graph[id - kBarrier_FirstID].schedule;
149+
return barrier_region_id_map_.find(id)->second.schedule;
144150
}
145151

146152
/// @brief replaces a subkernel with a given function
@@ -223,8 +229,8 @@ class Barrier {
223229
using live_variable_scalables_map_t = live_variable_index_map_t;
224230
/// @brief Type for ids of barriers
225231
using barrier_id_map_t = llvm::DenseMap<llvm::BasicBlock *, unsigned>;
226-
/// @brief Type for ids of new kernel functions
227-
using kernel_id_map_t = llvm::DenseMap<unsigned, llvm::Function *>;
232+
/// @brief Type for ids of barrier regions
233+
using barrier_region_id_map_t = std::map<unsigned, BarrierRegion>;
228234
/// @brief Type for map from ids to fence instructions
229235
using fence_id_map_t = llvm::DenseMap<unsigned, llvm::FenceInst *>;
230236
/// @brief Type between block and instruction for barrier.
@@ -245,6 +251,8 @@ class Barrier {
245251
live_variable_scalables_map_t live_variable_scalables_map_;
246252
/// @brief Keep ids of barriers.
247253
barrier_id_map_t barrier_id_map_;
254+
/// @brief Look up a barrier region by its id.
255+
barrier_region_id_map_t barrier_region_id_map_;
248256
/// @brief Keep ids of new kernel functions.
249257
kernel_id_map_t kernel_id_map_;
250258
/// @brief Keep struct types for live variables' memory layout.
@@ -269,8 +277,6 @@ class Barrier {
269277

270278
size_t size_t_bytes;
271279

272-
BarrierGraph barrier_graph;
273-
274280
llvm::Module &module_;
275281
llvm::Function &func_;
276282

modules/compiler/compiler_pipeline/source/barrier_regions.cpp

Lines changed: 18 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -437,11 +437,12 @@ void compiler::utils::Barrier::Run(llvm::ModuleAnalysisManager &mam) {
437437
bi_ = &mam.getResult<BuiltinInfoAnalysis>(module_);
438438
FindBarriers();
439439

440+
kernel_id_map_[kBarrier_EndID] = nullptr;
441+
440442
if (barriers_.empty()) {
441443
// If there are no barriers, we can use the original function as the
442444
// single barrier region.
443-
barrier_graph.emplace_back();
444-
auto &node = barrier_graph.back();
445+
auto &node = barrier_region_id_map_[kBarrier_FirstID];
445446
node.entry = &func_.getEntryBlock();
446447
node.id = kBarrier_FirstID;
447448
node.successor_ids.push_back(kBarrier_EndID);
@@ -513,11 +514,9 @@ void compiler::utils::Barrier::FindBarriers() {
513514
if (callee != nullptr) {
514515
const auto B = bi_->analyzeBuiltin(*callee);
515516
if (BuiltinInfo::isMuxBuiltinWithWGBarrierID(B.ID)) {
516-
unsigned id = ~0u;
517517
auto *const id_param = call_inst->getOperand(0);
518-
if (auto *const id_param_c = dyn_cast<ConstantInt>(id_param)) {
519-
id = id_param_c->getZExtValue();
520-
}
518+
auto *const id_param_c = cast<ConstantInt>(id_param);
519+
const auto id = id_param_c->getZExtValue();
521520
orderedBarriers.emplace_back(id, call_inst);
522521
}
523522
}
@@ -548,13 +547,15 @@ void compiler::utils::Barrier::SplitBlockwithBarrier() {
548547
exit_stub = MakeStubFunction("__barrier_exit", module_, stub_cc);
549548
}
550549

551-
barrier_graph.emplace_back();
552-
auto &node = barrier_graph.back();
550+
auto &node = barrier_region_id_map_[kBarrier_FirstID];
553551
node.entry = &func_.getEntryBlock();
554552
node.id = kBarrier_FirstID;
555553

556-
unsigned barrier_id = kBarrier_StartNewID;
557554
for (CallInst *split_point : barriers_) {
555+
// ID identifying which barrier invoked the stub; used as the argument to the call.
556+
auto *id = cast<ConstantInt>(split_point->getOperand(0));
557+
const auto barrier_id = kBarrier_StartNewID + id->getZExtValue();
558+
558559
if (is_debug_) {
559560
assert(entry_stub != nullptr); // Guaranteed as is_debug_ is const.
560561
assert(exit_stub != nullptr); // Guaranteed as is_debug_ is const.
@@ -564,10 +565,6 @@ void compiler::utils::Barrier::SplitBlockwithBarrier() {
564565
// them at a point where live variables have already been loaded. This
565566
// info won't be available till later.
566567

567-
// ID identifying which barrier invoked stub used as argument to call.
568-
// This number monotonically increases from 0 for each barrier.
569-
auto id = ConstantInt::get(Type::getInt32Ty(module_.getContext()),
570-
barrier_id - kBarrier_StartNewID);
571568
// Call invoking entry stub
572569
auto entry_caller = CallInst::Create(entry_stub, id);
573570
entry_caller->setDebugLoc(split_point->getDebugLoc());
@@ -583,10 +580,9 @@ void compiler::utils::Barrier::SplitBlockwithBarrier() {
583580
std::make_pair(entry_caller, exit_caller);
584581
}
585582

586-
barrier_graph.emplace_back();
587-
auto &node = barrier_graph.back();
583+
auto &node = barrier_region_id_map_[barrier_id];
588584
node.barrier_inst = split_point;
589-
node.id = barrier_id++;
585+
node.id = barrier_id;
590586
node.schedule = getBarrierSchedule(*split_point);
591587

592588
// Our scan implementation requires a linear work-item ordering, to loop
@@ -603,7 +599,7 @@ void compiler::utils::Barrier::SplitBlockwithBarrier() {
603599
// We have to gather the basic block data after splitting, because we
604600
// might not be processing barriers in program order, and things can get
605601
// awfully confused.
606-
for (auto &node : barrier_graph) {
602+
for (auto &[i, node] : barrier_region_id_map_) {
607603
if (node.barrier_inst) {
608604
auto *const bb = node.barrier_inst->getParent();
609605
barrier_id_map_[bb] = node.id;
@@ -770,7 +766,7 @@ void compiler::utils::Barrier::FindLiveVariables() {
770766
}
771767
}
772768

773-
for (auto &region : barrier_graph) {
769+
for (auto &[i, region] : barrier_region_id_map_) {
774770
GatherBarrierRegionBlocks(region);
775771
GatherBarrierRegionUses(region, func_args);
776772
whole_live_variables_set_.set_union(region.uses_int);
@@ -1150,9 +1146,9 @@ Function *compiler::utils::Barrier::GenerateNewKernel(BarrierRegion &region) {
11501146
} else if (ReturnInst *ret =
11511147
dyn_cast<ReturnInst>(cloned_bb->getTerminator())) {
11521148
// Change return instruction with end barrier number.
1153-
ConstantInt *cst_zero =
1149+
ConstantInt *cst_endid =
11541150
ConstantInt::get(Type::getInt32Ty(context), kBarrier_EndID);
1155-
ReturnInst *new_ret = ReturnInst::Create(context, cst_zero);
1151+
ReturnInst *new_ret = ReturnInst::Create(context, cst_endid);
11561152
new_ret->insertBefore(ret->getIterator());
11571153
ret->replaceAllUsesWith(new_ret);
11581154
ret->eraseFromParent();
@@ -1450,7 +1446,7 @@ BasicBlock *compiler::utils::Barrier::CloneBasicBlock(
14501446
void compiler::utils::Barrier::SeperateKernelWithBarrier() {
14511447
if (barriers_.empty()) return;
14521448

1453-
for (auto &region : barrier_graph) {
1449+
for (auto &[i, region] : barrier_region_id_map_) {
14541450
kernel_id_map_[region.id] = GenerateNewKernel(region);
14551451
}
14561452

@@ -1467,15 +1463,10 @@ void compiler::utils::Barrier::SeperateKernelWithBarrier() {
14671463

14681464
LLVM_DEBUG({
14691465
for (const auto &Kid : kernel_id_map_) {
1470-
dbgs() << "1. kernel_id[" << Kid.first << "] = " << Kid.second->getName()
1466+
dbgs() << "kernel_id[" << Kid.first << "] = " << Kid.second->getName()
14711467
<< "\n";
14721468
}
14731469

1474-
for (unsigned I = kBarrier_FirstID;
1475-
I < kernel_id_map_.size() + kBarrier_FirstID; I++) {
1476-
dbgs() << "2. kernel_id[" << I << "] = " << kernel_id_map_[I]->getName()
1477-
<< "\n";
1478-
}
14791470
dbgs() << "\n\n" << module_ << "\n\n";
14801471
});
14811472
}

modules/compiler/compiler_pipeline/source/work_item_loops_pass.cpp

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1552,13 +1552,14 @@ Function *compiler::utils::WorkItemLoopsPass::makeWrapperFunction(
15521552
AllocaInst *nextID =
15531553
entryIR.CreateAlloca(index_type, nullptr, "next_barrier_id");
15541554

1555-
SmallVector<BasicBlock *, 8> bbs;
1556-
const unsigned num_blocks = barrierMain.getNumSubkernels();
1557-
assert(!emitTail || barrierTail->getNumSubkernels() == num_blocks);
1558-
1559-
for (unsigned i = kBarrier_EndID; i <= num_blocks; i++) {
1560-
BasicBlock *bb = BasicBlock::Create(context, "sw.bb", new_wrapper);
1561-
bbs.push_back(bb);
1555+
std::map<unsigned, BasicBlock *> bbs;
1556+
// The vectorized kernel has been further optimized and may have removed
1557+
// unreachable barriers that are still present in the scalar kernel. But if
1558+
// they are unreachable, we know they must also be unreachable in the scalar
1559+
// kernel even if we have not yet detected that.
1560+
1561+
for (auto &[i, subkernel] : barrierMain.getSubkernels()) {
1562+
bbs[i] = BasicBlock::Create(context, "sw.bb", new_wrapper);
15621563
}
15631564

15641565
ScheduleGenerator schedule(M, barrierMain, barrierTail, BI);
@@ -1584,7 +1585,9 @@ Function *compiler::utils::WorkItemLoopsPass::makeWrapperFunction(
15841585
// Branch directly into the first basic block.
15851586
entryIR.CreateBr(bbs[kBarrier_FirstID]);
15861587

1587-
for (unsigned i = kBarrier_EndID; i <= num_blocks; i++) {
1588+
for (auto &[i_, subkernel_] : barrierMain.getSubkernels()) {
1589+
auto i = i_;
1590+
15881591
// Keep it linear
15891592
BasicBlock *const block = bbs[i];
15901593
block->moveAfter(&new_wrapper->back());
@@ -1663,7 +1666,7 @@ Function *compiler::utils::WorkItemLoopsPass::makeWrapperFunction(
16631666

16641667
if (num_succ == 1) {
16651668
// If there is only one successor, we can branch directly to it
1666-
exitIR.CreateBr(bbs[successors.front()]);
1669+
exitIR.CreateBr(bbs.find(successors.front())->second);
16671670
} else if (num_succ == 2) {
16681671
// If there are exactly two successors, we can use a conditional branch
16691672
auto *const bb_id = ConstantInt::get(index_type, successors[0]);
@@ -1673,8 +1676,8 @@ Function *compiler::utils::WorkItemLoopsPass::makeWrapperFunction(
16731676
auto *const cmp_id =
16741677
CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, ld_next_id,
16751678
bb_id, "", br_block);
1676-
BranchInst::Create(bbs[successors[0]], bbs[successors[1]], cmp_id,
1677-
br_block);
1679+
BranchInst::Create(bbs.find(successors[0])->second,
1680+
bbs.find(successors[1])->second, cmp_id, br_block);
16781681

16791682
exitIR.CreateBr(br_block);
16801683
} else if (num_succ == 0) {
@@ -1700,9 +1703,9 @@ Function *compiler::utils::WorkItemLoopsPass::makeWrapperFunction(
17001703
LoadInst *const ld_next_id =
17011704
new LoadInst(index_type, nextID, "", switch_body);
17021705
SwitchInst *const sw = SwitchInst::Create(
1703-
ld_next_id, bbs[successors[0]], num_succ, switch_body);
1706+
ld_next_id, bbs.find(successors[0])->second, num_succ, switch_body);
17041707
for (const auto i : successors) {
1705-
sw->addCase(ConstantInt::get(index_type, i), bbs[i]);
1708+
sw->addCase(ConstantInt::get(index_type, i), bbs.find(i)->second);
17061709
}
17071710
exitIR.CreateBr(switch_body);
17081711
}
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
; Copyright (C) Codeplay Software Limited
2+
;
3+
; Licensed under the Apache License, Version 2.0 (the "License") with LLVM
4+
; Exceptions; you may not use this file except in compliance with the License.
5+
; You may obtain a copy of the License at
6+
;
7+
; https://github.com/codeplaysoftware/oneapi-construction-kit/blob/main/LICENSE.txt
8+
;
9+
; Unless required by applicable law or agreed to in writing, software
10+
; distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11+
; WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12+
; License for the specific language governing permissions and limitations
13+
; under the License.
14+
;
15+
; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
16+
17+
; RUN: muxc --passes work-item-loops,verify -S %s | FileCheck %s
18+
19+
; This test checks the validity of a set of main/tail loops when the
20+
; barriers between vector and scalar kernels do not match up.
21+
22+
target triple = "spir64-unknown-unknown"
23+
target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
24+
25+
define spir_kernel void @foo(ptr addrspace(1) %a) !codeplay_ca_vecz.base !0 {
26+
entry:
27+
store i1 false, ptr addrspace(1) %a, align 4
28+
call void @__mux_work_group_barrier(i32 0, i32 1, i32 272) #8
29+
%load = load i1, ptr addrspace(1) %a, align 4
30+
call void @__mux_work_group_barrier(i32 1, i32 1, i32 272) #8
31+
br i1 %load, label %bb.1, label %bb.2
32+
33+
bb.1:
34+
store i1 poison, ptr addrspace(1) poison
35+
call void @__mux_work_group_barrier(i32 2, i32 1, i32 272) #8
36+
br label %bb.3
37+
38+
bb.2:
39+
store i1 true, ptr addrspace(1) %a
40+
call void @__mux_work_group_barrier(i32 3, i32 1, i32 272) #8
41+
br label %bb.3
42+
43+
bb.3:
44+
call void @__mux_work_group_barrier(i32 4, i32 1, i32 272) #8
45+
ret void
46+
}
47+
48+
define spir_kernel void @__vecz_v2_foo(ptr addrspace(1) %a) #0 !codeplay_ca_vecz.derived !2 {
49+
entry:
50+
store i1 false, ptr addrspace(1) %a, align 4
51+
call void @__mux_work_group_barrier(i32 0, i32 1, i32 272)
52+
%load = load i1, ptr addrspace(1) %a, align 4
53+
call void @__mux_work_group_barrier(i32 1, i32 1, i32 272)
54+
%0 = xor i1 %load, true
55+
call void @llvm.assume(i1 %0)
56+
store i1 true, ptr addrspace(1) %a, align 1
57+
call void @__mux_work_group_barrier(i32 3, i32 1, i32 272)
58+
call void @__mux_work_group_barrier(i32 4, i32 1, i32 272)
59+
ret void
60+
}
61+
62+
declare void @__mux_work_group_barrier(i32, i32, i32)
63+
64+
attributes #0 = { norecurse nounwind "mux-kernel"="entry-point" "mux-base-fn-name"="foo"}
65+
attributes #1 = { nounwind "mux-barrier-schedule"="linear" }
66+
67+
; The block which has conditional undefined behavior in the scalar kernel.
68+
;
69+
; CHECK-LABEL: define internal spir_func i32 @foo.mux-barrier-region.6(ptr addrspace(1) %0, ptr %1)
70+
; CHECK: bb.1:
71+
; CHECK-NEXT: store i1 poison, ptr addrspace(1) poison, align 1
72+
;
73+
; The block which only follows after undefined behavior in the scalar kernel.
74+
;
75+
; CHECK-LABEL: define internal spir_func i32 @foo.mux-barrier-region.7(ptr addrspace(1) %0, ptr %1)
76+
; CHECK-NEXT: barrier2:
77+
;
78+
; Check that we do not call the unreachable region.
79+
;
80+
; CHECK-LABEL: define spir_kernel void @foo.mux-barrier-wrapper(ptr addrspace(1) %a)
81+
; CHECK: call spir_func i32 @foo.mux-barrier-region.6(
82+
; CHECK-NOT: call spir_func i32 @foo.mux-barrier-region.7(
83+
84+
; Vectorized by 2
85+
!0 = !{!1, ptr @__vecz_v2_foo}
86+
!1 = !{i32 2, i32 0, i32 0, i32 0}
87+
!2 = !{!1, ptr @foo}

0 commit comments

Comments
 (0)