|
28 | 28 | #include "llvm/IR/ReplaceConstant.h" |
29 | 29 | #include "llvm/Support/Format.h" |
30 | 30 | #include "llvm/Support/VirtualFileSystem.h" |
31 | | -#include "llvm/Transforms/Utils/ModuleUtils.h" |
32 | 31 |
|
33 | 32 | using namespace clang; |
34 | 33 | using namespace CodeGen; |
@@ -73,11 +72,6 @@ class CGNVCUDARuntime : public CGCUDARuntime { |
73 | 72 | /// ModuleCtorFunction() and used to create corresponding cleanup calls in |
74 | 73 | /// ModuleDtorFunction() |
75 | 74 | llvm::GlobalVariable *GpuBinaryHandle = nullptr; |
76 | | - /// Host-side shadow for the per-TU __llvm_profile_sections_<CUID> global, |
77 | | - /// emitted only for HIP host compiles when PGO is on. Registered via |
78 | | - /// __hipRegisterVar (non-RDC) or an offloading entry (RDC) so the runtime |
79 | | - /// can locate the device-side table by name. |
80 | | - llvm::GlobalVariable *OffloadProfShadow = nullptr; |
81 | 75 | /// Whether we generate relocatable device code. |
82 | 76 | bool RelocatableDeviceCode; |
83 | 77 | /// Mangle context for device. |
@@ -182,13 +176,6 @@ class CGNVCUDARuntime : public CGCUDARuntime { |
182 | 176 | void transformManagedVars(); |
183 | 177 | /// Create offloading entries to register globals in RDC mode. |
184 | 178 | void createOffloadingEntries(); |
185 | | - /// For HIP+PGO, emit the per-TU __llvm_profile_sections_<CUID> global. |
186 | | - /// On the device side it is the populated 7-pointer section-bounds table. |
187 | | - /// On the host side it is a placeholder void* shadow stored in |
188 | | - /// OffloadProfShadow, registered later by makeRegisterGlobalsFn (non-RDC) |
189 | | - /// or createOffloadingEntries (RDC) so the runtime can locate the |
190 | | - /// device-side table by name. |
191 | | - void emitOffloadProfilingSections(); |
192 | 179 |
|
193 | 180 | public: |
194 | 181 | CGNVCUDARuntime(CodeGenModule &CGM); |
@@ -748,32 +735,6 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { |
748 | 735 | } |
749 | 736 | } |
750 | 737 |
|
751 | | - // Register the per-TU offload-profiling shadow so the host runtime can |
752 | | - // locate the matching device-side __llvm_profile_sections_<CUID>. We |
753 | | - // emit both __hipRegisterVar (so the HIP runtime can map the host |
754 | | - // shadow to the device symbol) and |
755 | | - // __llvm_profile_offload_register_shadow_variable (so the profile |
756 | | - // runtime adds the shadow to its drain list). |
757 | | - if (OffloadProfShadow) { |
758 | | - llvm::Constant *Name = |
759 | | - makeConstantString(std::string(OffloadProfShadow->getName())); |
760 | | - llvm::Value *RegisterVarArgs[] = { |
761 | | - &GpuBinaryHandlePtr, |
762 | | - OffloadProfShadow, |
763 | | - Name, |
764 | | - Name, |
765 | | - llvm::ConstantInt::get(IntTy, /*Extern=*/0), |
766 | | - llvm::ConstantInt::get(VarSizeTy, CGM.getDataLayout().getPointerSize()), |
767 | | - llvm::ConstantInt::get(IntTy, /*Constant=*/0), |
768 | | - llvm::ConstantInt::get(IntTy, 0)}; |
769 | | - Builder.CreateCall(RegisterVar, RegisterVarArgs); |
770 | | - |
771 | | - llvm::FunctionCallee RegisterShadow = CGM.CreateRuntimeFunction( |
772 | | - llvm::FunctionType::get(VoidTy, {PtrTy}, false), |
773 | | - "__llvm_profile_offload_register_shadow_variable"); |
774 | | - Builder.CreateCall(RegisterShadow, {OffloadProfShadow}); |
775 | | - } |
776 | | - |
777 | 738 | Builder.CreateRetVoid(); |
778 | 739 | return RegisterKernelsFunc; |
779 | 740 | } |
@@ -1295,124 +1256,11 @@ void CGNVCUDARuntime::createOffloadingEntries() { |
1295 | 1256 | I.Flags.getSurfTexType()); |
1296 | 1257 | } |
1297 | 1258 | } |
1298 | | - |
1299 | | - // Register the per-TU offload-profiling shadow. The offloading entry |
1300 | | - // makes the linker-wrapper emit the host __hipRegisterVar call in the |
1301 | | - // combined ctor. Separately emit a per-TU ctor that registers the |
1302 | | - // shadow with the profile runtime's drain list. |
1303 | | - if (OffloadProfShadow) { |
1304 | | - llvm::offloading::emitOffloadingEntry( |
1305 | | - M, Kind, OffloadProfShadow, OffloadProfShadow->getName(), |
1306 | | - CGM.getDataLayout().getPointerSize(), |
1307 | | - llvm::offloading::OffloadGlobalEntry, /*Data=*/0); |
1308 | | - |
1309 | | - llvm::LLVMContext &Ctx = M.getContext(); |
1310 | | - auto *PtrTy = llvm::PointerType::getUnqual(Ctx); |
1311 | | - llvm::FunctionCallee RegisterShadow = CGM.CreateRuntimeFunction( |
1312 | | - llvm::FunctionType::get(VoidTy, {PtrTy}, false), |
1313 | | - "__llvm_profile_offload_register_shadow_variable"); |
1314 | | - auto *CtorFn = llvm::Function::Create( |
1315 | | - llvm::FunctionType::get(VoidTy, false), |
1316 | | - llvm::GlobalValue::InternalLinkage, |
1317 | | - "__llvm_profile_register_shadow." + CGM.getContext().getCUIDHash(), &M); |
1318 | | - auto *Entry = llvm::BasicBlock::Create(Ctx, "entry", CtorFn); |
1319 | | - llvm::IRBuilder<> B(Entry); |
1320 | | - B.CreateCall(RegisterShadow, {OffloadProfShadow}); |
1321 | | - B.CreateRetVoid(); |
1322 | | - llvm::appendToGlobalCtors(M, CtorFn, /*Priority=*/65535); |
1323 | | - } |
1324 | | -} |
1325 | | - |
1326 | | -// For HIP host+device compiles with PGO enabled, emit the per-TU global |
1327 | | -// __llvm_profile_sections_<CUID>. Device side: a 7-pointer struct holding |
1328 | | -// section start/stop bounds for the names/counters/data sections plus the |
1329 | | -// raw-version variable. Host side: an opaque void* shadow whose only |
1330 | | -// purpose is to give the host-runtime a registered symbol name to look up |
1331 | | -// via hipGetSymbolAddress; the actual device-side data lives in the |
1332 | | -// matching device-side global. |
1333 | | -void CGNVCUDARuntime::emitOffloadProfilingSections() { |
1334 | | - if (!CGM.getLangOpts().HIP) |
1335 | | - return; |
1336 | | - if (!CGM.getCodeGenOpts().hasProfileInstr()) |
1337 | | - return; |
1338 | | - |
1339 | | - StringRef CUIDHash = CGM.getContext().getCUIDHash(); |
1340 | | - if (CUIDHash.empty()) |
1341 | | - return; |
1342 | | - |
1343 | | - llvm::Module &M = CGM.getModule(); |
1344 | | - llvm::LLVMContext &Ctx = M.getContext(); |
1345 | | - std::string Name = ("__llvm_profile_sections_" + CUIDHash).str(); |
1346 | | - |
1347 | | - // If the global already exists (e.g. another TU was merged in), don't |
1348 | | - // duplicate it. |
1349 | | - if (M.getNamedValue(Name)) |
1350 | | - return; |
1351 | | - |
1352 | | - if (CGM.getLangOpts().CUDAIsDevice) { |
1353 | | - // Device side: emit the populated struct. Section start/stop symbols |
1354 | | - // are linker-defined (ELF auto-generates __start_/__stop_ for any |
1355 | | - // section whose name is a valid C identifier; AMDGPU is ELF). |
1356 | | - unsigned GlobalAS = M.getDataLayout().getDefaultGlobalsAddressSpace(); |
1357 | | - auto *PtrTy = llvm::PointerType::get(Ctx, GlobalAS); |
1358 | | - auto getOrDeclare = [&](StringRef SymName) { |
1359 | | - if (auto *GV = M.getNamedGlobal(SymName)) |
1360 | | - return GV; |
1361 | | - auto *GV = new llvm::GlobalVariable( |
1362 | | - M, llvm::Type::getInt8Ty(Ctx), /*isConstant=*/false, |
1363 | | - llvm::GlobalValue::ExternalLinkage, /*Initializer=*/nullptr, SymName, |
1364 | | - /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal, |
1365 | | - GlobalAS); |
1366 | | - GV->setVisibility(llvm::GlobalValue::HiddenVisibility); |
1367 | | - return GV; |
1368 | | - }; |
1369 | | - auto *VersionGV = M.getNamedGlobal("__llvm_profile_raw_version"); |
1370 | | - if (!VersionGV) { |
1371 | | - VersionGV = new llvm::GlobalVariable( |
1372 | | - M, llvm::Type::getInt64Ty(Ctx), /*isConstant=*/true, |
1373 | | - llvm::GlobalValue::ExternalLinkage, /*Initializer=*/nullptr, |
1374 | | - "__llvm_profile_raw_version", |
1375 | | - /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal, |
1376 | | - GlobalAS); |
1377 | | - } |
1378 | | - |
1379 | | - auto *StructTy = llvm::StructType::get( |
1380 | | - Ctx, {PtrTy, PtrTy, PtrTy, PtrTy, PtrTy, PtrTy, PtrTy}); |
1381 | | - llvm::Constant *Fields[] = { |
1382 | | - getOrDeclare("__start___llvm_prf_names"), |
1383 | | - getOrDeclare("__stop___llvm_prf_names"), |
1384 | | - getOrDeclare("__start___llvm_prf_cnts"), |
1385 | | - getOrDeclare("__stop___llvm_prf_cnts"), |
1386 | | - getOrDeclare("__start___llvm_prf_data"), |
1387 | | - getOrDeclare("__stop___llvm_prf_data"), |
1388 | | - VersionGV, |
1389 | | - }; |
1390 | | - auto *Init = llvm::ConstantStruct::get(StructTy, Fields); |
1391 | | - auto *GV = new llvm::GlobalVariable( |
1392 | | - M, StructTy, /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, |
1393 | | - Init, Name, /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal, |
1394 | | - GlobalAS); |
1395 | | - GV->setVisibility(llvm::GlobalValue::ProtectedVisibility); |
1396 | | - CGM.addCompilerUsedGlobal(GV); |
1397 | | - return; |
1398 | | - } |
1399 | | - |
1400 | | - // Host side: emit an opaque void* shadow. Layout doesn't matter — the |
1401 | | - // runtime locates it by name via hipGetSymbolAddress and treats it as |
1402 | | - // the address of the device-side struct. Registration with the HIP |
1403 | | - // runtime is added by makeRegisterGlobalsFn (non-RDC) or |
1404 | | - // createOffloadingEntries (RDC). |
1405 | | - auto *PtrTy = llvm::PointerType::getUnqual(Ctx); |
1406 | | - OffloadProfShadow = new llvm::GlobalVariable( |
1407 | | - M, PtrTy, /*isConstant=*/false, llvm::GlobalValue::ExternalLinkage, |
1408 | | - llvm::ConstantPointerNull::get(PtrTy), Name); |
1409 | | - CGM.addCompilerUsedGlobal(OffloadProfShadow); |
1410 | 1259 | } |
1411 | 1260 |
|
1412 | 1261 | // Returns module constructor to be added. |
1413 | 1262 | llvm::Function *CGNVCUDARuntime::finalizeModule() { |
1414 | 1263 | transformManagedVars(); |
1415 | | - emitOffloadProfilingSections(); |
1416 | 1264 | if (CGM.getLangOpts().CUDAIsDevice) { |
1417 | 1265 | // Mark ODR-used device variables as compiler used to prevent it from being |
1418 | 1266 | // eliminated by optimization. This is necessary for device variables |
|
0 commit comments