Skip to content

Commit 6cfa1a0

Browse files
authored
Revert "[PGO][AMDGPU] Add basic HIP offload PGO support (#177665)" (#201416)
This broke profiling builds on Windows by switching the profile library to link against the dynamic CRT; see the discussion on the PR. A number of issues had already been reported and fixed after this PR landed. Rather than piling on more fixes (and this one may need some work), revert to green for now to let the project recover. This reverts commit 5db1364. Additionally, this reverts the follow-up PRs in 635e120, 2766733, 4c33844, and 5eca8b6: "[PGO][HIP] Stop pulling ROCm.o into every PGO host link (#200101)", "[compiler-rt][profile] Add COMPILER_RT_BUILD_PROFILE_ROCM option (#200127)", "[PGO][HIP] Skip ROCm interceptor in profile-only compiler-rt builds (#200111)", and "[PGO][HIP] Fix profile-only Windows link by gating ROCm interceptor macro (#200859)".
1 parent 1c88bd7 commit 6cfa1a0

14 files changed

Lines changed: 32 additions & 1407 deletions

File tree

clang/lib/CodeGen/CGCUDANV.cpp

Lines changed: 0 additions & 152 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@
2828
#include "llvm/IR/ReplaceConstant.h"
2929
#include "llvm/Support/Format.h"
3030
#include "llvm/Support/VirtualFileSystem.h"
31-
#include "llvm/Transforms/Utils/ModuleUtils.h"
3231

3332
using namespace clang;
3433
using namespace CodeGen;
@@ -73,11 +72,6 @@ class CGNVCUDARuntime : public CGCUDARuntime {
7372
/// ModuleCtorFunction() and used to create corresponding cleanup calls in
7473
/// ModuleDtorFunction()
7574
llvm::GlobalVariable *GpuBinaryHandle = nullptr;
76-
/// Host-side shadow for the per-TU __llvm_profile_sections_<CUID> global,
77-
/// emitted only for HIP host compiles when PGO is on. Registered via
78-
/// __hipRegisterVar (non-RDC) or an offloading entry (RDC) so the runtime
79-
/// can locate the device-side table by name.
80-
llvm::GlobalVariable *OffloadProfShadow = nullptr;
8175
/// Whether we generate relocatable device code.
8276
bool RelocatableDeviceCode;
8377
/// Mangle context for device.
@@ -182,13 +176,6 @@ class CGNVCUDARuntime : public CGCUDARuntime {
182176
void transformManagedVars();
183177
/// Create offloading entries to register globals in RDC mode.
184178
void createOffloadingEntries();
185-
/// For HIP+PGO, emit the per-TU __llvm_profile_sections_<CUID> global.
186-
/// On the device side it is the populated 7-pointer section-bounds table.
187-
/// On the host side it is a placeholder void* shadow stored in
188-
/// OffloadProfShadow, registered later by makeRegisterGlobalsFn (non-RDC)
189-
/// or createOffloadingEntries (RDC) so the runtime can locate the
190-
/// device-side table by name.
191-
void emitOffloadProfilingSections();
192179

193180
public:
194181
CGNVCUDARuntime(CodeGenModule &CGM);
@@ -748,32 +735,6 @@ llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
748735
}
749736
}
750737

751-
// Register the per-TU offload-profiling shadow so the host runtime can
752-
// locate the matching device-side __llvm_profile_sections_<CUID>. We
753-
// emit both __hipRegisterVar (so the HIP runtime can map the host
754-
// shadow to the device symbol) and
755-
// __llvm_profile_offload_register_shadow_variable (so the profile
756-
// runtime adds the shadow to its drain list).
757-
if (OffloadProfShadow) {
758-
llvm::Constant *Name =
759-
makeConstantString(std::string(OffloadProfShadow->getName()));
760-
llvm::Value *RegisterVarArgs[] = {
761-
&GpuBinaryHandlePtr,
762-
OffloadProfShadow,
763-
Name,
764-
Name,
765-
llvm::ConstantInt::get(IntTy, /*Extern=*/0),
766-
llvm::ConstantInt::get(VarSizeTy, CGM.getDataLayout().getPointerSize()),
767-
llvm::ConstantInt::get(IntTy, /*Constant=*/0),
768-
llvm::ConstantInt::get(IntTy, 0)};
769-
Builder.CreateCall(RegisterVar, RegisterVarArgs);
770-
771-
llvm::FunctionCallee RegisterShadow = CGM.CreateRuntimeFunction(
772-
llvm::FunctionType::get(VoidTy, {PtrTy}, false),
773-
"__llvm_profile_offload_register_shadow_variable");
774-
Builder.CreateCall(RegisterShadow, {OffloadProfShadow});
775-
}
776-
777738
Builder.CreateRetVoid();
778739
return RegisterKernelsFunc;
779740
}
@@ -1295,124 +1256,11 @@ void CGNVCUDARuntime::createOffloadingEntries() {
12951256
I.Flags.getSurfTexType());
12961257
}
12971258
}
1298-
1299-
// Register the per-TU offload-profiling shadow. The offloading entry
1300-
// makes the linker-wrapper emit the host __hipRegisterVar call in the
1301-
// combined ctor. Separately emit a per-TU ctor that registers the
1302-
// shadow with the profile runtime's drain list.
1303-
if (OffloadProfShadow) {
1304-
llvm::offloading::emitOffloadingEntry(
1305-
M, Kind, OffloadProfShadow, OffloadProfShadow->getName(),
1306-
CGM.getDataLayout().getPointerSize(),
1307-
llvm::offloading::OffloadGlobalEntry, /*Data=*/0);
1308-
1309-
llvm::LLVMContext &Ctx = M.getContext();
1310-
auto *PtrTy = llvm::PointerType::getUnqual(Ctx);
1311-
llvm::FunctionCallee RegisterShadow = CGM.CreateRuntimeFunction(
1312-
llvm::FunctionType::get(VoidTy, {PtrTy}, false),
1313-
"__llvm_profile_offload_register_shadow_variable");
1314-
auto *CtorFn = llvm::Function::Create(
1315-
llvm::FunctionType::get(VoidTy, false),
1316-
llvm::GlobalValue::InternalLinkage,
1317-
"__llvm_profile_register_shadow." + CGM.getContext().getCUIDHash(), &M);
1318-
auto *Entry = llvm::BasicBlock::Create(Ctx, "entry", CtorFn);
1319-
llvm::IRBuilder<> B(Entry);
1320-
B.CreateCall(RegisterShadow, {OffloadProfShadow});
1321-
B.CreateRetVoid();
1322-
llvm::appendToGlobalCtors(M, CtorFn, /*Priority=*/65535);
1323-
}
1324-
}
1325-
1326-
// For HIP host+device compiles with PGO enabled, emit the per-TU global
1327-
// __llvm_profile_sections_<CUID>. Device side: a 7-pointer struct holding
1328-
// section start/stop bounds for the names/counters/data sections plus the
1329-
// raw-version variable. Host side: an opaque void* shadow whose only
1330-
// purpose is to give the host-runtime a registered symbol name to look up
1331-
// via hipGetSymbolAddress; the actual device-side data lives in the
1332-
// matching device-side global.
1333-
void CGNVCUDARuntime::emitOffloadProfilingSections() {
1334-
if (!CGM.getLangOpts().HIP)
1335-
return;
1336-
if (!CGM.getCodeGenOpts().hasProfileInstr())
1337-
return;
1338-
1339-
StringRef CUIDHash = CGM.getContext().getCUIDHash();
1340-
if (CUIDHash.empty())
1341-
return;
1342-
1343-
llvm::Module &M = CGM.getModule();
1344-
llvm::LLVMContext &Ctx = M.getContext();
1345-
std::string Name = ("__llvm_profile_sections_" + CUIDHash).str();
1346-
1347-
// If the global already exists (e.g. another TU was merged in), don't
1348-
// duplicate it.
1349-
if (M.getNamedValue(Name))
1350-
return;
1351-
1352-
if (CGM.getLangOpts().CUDAIsDevice) {
1353-
// Device side: emit the populated struct. Section start/stop symbols
1354-
// are linker-defined (ELF auto-generates __start_/__stop_ for any
1355-
// section whose name is a valid C identifier; AMDGPU is ELF).
1356-
unsigned GlobalAS = M.getDataLayout().getDefaultGlobalsAddressSpace();
1357-
auto *PtrTy = llvm::PointerType::get(Ctx, GlobalAS);
1358-
auto getOrDeclare = [&](StringRef SymName) {
1359-
if (auto *GV = M.getNamedGlobal(SymName))
1360-
return GV;
1361-
auto *GV = new llvm::GlobalVariable(
1362-
M, llvm::Type::getInt8Ty(Ctx), /*isConstant=*/false,
1363-
llvm::GlobalValue::ExternalLinkage, /*Initializer=*/nullptr, SymName,
1364-
/*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
1365-
GlobalAS);
1366-
GV->setVisibility(llvm::GlobalValue::HiddenVisibility);
1367-
return GV;
1368-
};
1369-
auto *VersionGV = M.getNamedGlobal("__llvm_profile_raw_version");
1370-
if (!VersionGV) {
1371-
VersionGV = new llvm::GlobalVariable(
1372-
M, llvm::Type::getInt64Ty(Ctx), /*isConstant=*/true,
1373-
llvm::GlobalValue::ExternalLinkage, /*Initializer=*/nullptr,
1374-
"__llvm_profile_raw_version",
1375-
/*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
1376-
GlobalAS);
1377-
}
1378-
1379-
auto *StructTy = llvm::StructType::get(
1380-
Ctx, {PtrTy, PtrTy, PtrTy, PtrTy, PtrTy, PtrTy, PtrTy});
1381-
llvm::Constant *Fields[] = {
1382-
getOrDeclare("__start___llvm_prf_names"),
1383-
getOrDeclare("__stop___llvm_prf_names"),
1384-
getOrDeclare("__start___llvm_prf_cnts"),
1385-
getOrDeclare("__stop___llvm_prf_cnts"),
1386-
getOrDeclare("__start___llvm_prf_data"),
1387-
getOrDeclare("__stop___llvm_prf_data"),
1388-
VersionGV,
1389-
};
1390-
auto *Init = llvm::ConstantStruct::get(StructTy, Fields);
1391-
auto *GV = new llvm::GlobalVariable(
1392-
M, StructTy, /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage,
1393-
Init, Name, /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
1394-
GlobalAS);
1395-
GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
1396-
CGM.addCompilerUsedGlobal(GV);
1397-
return;
1398-
}
1399-
1400-
// Host side: emit an opaque void* shadow. Layout doesn't matter — the
1401-
// runtime locates it by name via hipGetSymbolAddress and treats it as
1402-
// the address of the device-side struct. Registration with the HIP
1403-
// runtime is added by makeRegisterGlobalsFn (non-RDC) or
1404-
// createOffloadingEntries (RDC).
1405-
auto *PtrTy = llvm::PointerType::getUnqual(Ctx);
1406-
OffloadProfShadow = new llvm::GlobalVariable(
1407-
M, PtrTy, /*isConstant=*/false, llvm::GlobalValue::ExternalLinkage,
1408-
llvm::ConstantPointerNull::get(PtrTy), Name);
1409-
CGM.addCompilerUsedGlobal(OffloadProfShadow);
14101259
}
14111260

14121261
// Returns module constructor to be added.
14131262
llvm::Function *CGNVCUDARuntime::finalizeModule() {
14141263
transformManagedVars();
1415-
emitOffloadProfilingSections();
14161264
if (CGM.getLangOpts().CUDAIsDevice) {
14171265
// Mark ODR-used device variables as compiler used to prevent them from being
14181266
// eliminated by optimization. This is necessary for device variables

clang/test/CodeGenHIP/offload-pgo-sections.hip

Lines changed: 0 additions & 50 deletions
This file was deleted.

compiler-rt/CMakeLists.txt

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -322,15 +322,6 @@ option(COMPILER_RT_USE_ATOMIC_LIBRARY "Use compiler-rt atomic instead of libatom
322322

323323
option(COMPILER_RT_PROFILE_BAREMETAL "Build minimal baremetal profile library" OFF)
324324

325-
set(DEFAULT_COMPILER_RT_BUILD_PROFILE_ROCM ON)
326-
if(APPLE)
327-
set(DEFAULT_COMPILER_RT_BUILD_PROFILE_ROCM OFF)
328-
endif()
329-
option(COMPILER_RT_BUILD_PROFILE_ROCM
330-
"Build the host-side ROCm/HIP device profile collection runtime"
331-
${DEFAULT_COMPILER_RT_BUILD_PROFILE_ROCM})
332-
mark_as_advanced(COMPILER_RT_BUILD_PROFILE_ROCM)
333-
334325
include(config-ix)
335326

336327
#================================

compiler-rt/lib/profile/CMakeLists.txt

Lines changed: 3 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -93,9 +93,6 @@ if (NOT COMPILER_RT_PROFILE_BAREMETAL)
9393
InstrProfilingUtil.c
9494
InstrProfilingValue.c
9595
)
96-
if(COMPILER_RT_BUILD_PROFILE_ROCM)
97-
list(APPEND PROFILE_SOURCES InstrProfilingPlatformROCm.cpp)
98-
endif()
9996
endif()
10097

10198
set(PROFILE_HEADERS
@@ -158,43 +155,6 @@ if(COMPILER_RT_PROFILE_BAREMETAL)
158155
-DCOMPILER_RT_PROFILE_BAREMETAL=1)
159156
endif()
160157

161-
# The HIP host interceptor in InstrProfilingPlatformROCm.cpp pulls in
162-
# RTInterception + sanitizer_common object libs. Those targets are only created
163-
# when COMPILER_RT_BUILD_SANITIZERS / _MEMPROF / _XRAY / _CTX_PROFILE is enabled
164-
# (see lib/CMakeLists.txt). In a profile-only build the targets do not exist;
165-
# skip both the object-lib merge and the ROCm source file so the static archive
166-
# remains self-contained.
167-
set(PROFILE_OBJECT_LIBS)
168-
set(PROFILE_HAS_HIP_INTERCEPTOR FALSE)
169-
if(COMPILER_RT_HAS_INTERCEPTION AND NOT COMPILER_RT_PROFILE_BAREMETAL
170-
AND TARGET RTInterception.${COMPILER_RT_DEFAULT_TARGET_ARCH}
171-
AND TARGET RTSanitizerCommon.${COMPILER_RT_DEFAULT_TARGET_ARCH}
172-
AND TARGET RTSanitizerCommonLibc.${COMPILER_RT_DEFAULT_TARGET_ARCH})
173-
# RTInterception references __sanitizer_internal_{memcpy,memset,memmove} and other
174-
# sanitizer_common symbols; merge the same object libs as clang_rt.cfi (without
175-
# coverage/symbolizer) so -fprofile-instr-generate links stay self-contained.
176-
list(APPEND PROFILE_OBJECT_LIBS
177-
RTInterception
178-
RTSanitizerCommon
179-
RTSanitizerCommonLibc)
180-
set(PROFILE_HAS_HIP_INTERCEPTOR TRUE)
181-
endif()
182-
183-
if(NOT PROFILE_HAS_HIP_INTERCEPTOR)
184-
list(REMOVE_ITEM PROFILE_SOURCES InstrProfilingPlatformROCm.cpp)
185-
endif()
186-
187-
# Only advertise the ROCm interceptor to InstrProfilingFile.c when its
188-
# definition (InstrProfilingPlatformROCm.cpp) is actually compiled into the
189-
# archive. Otherwise InstrProfilingFile.c references
190-
# __llvm_profile_hip_collect_device_data with no definition; on COFF/Windows
191-
# there is no weak-undefined fallback, so the link fails (see PR #200111).
192-
if(COMPILER_RT_BUILD_PROFILE_ROCM AND PROFILE_HAS_HIP_INTERCEPTOR)
193-
set(EXTRA_FLAGS
194-
${EXTRA_FLAGS}
195-
-DCOMPILER_RT_BUILD_PROFILE_ROCM=1)
196-
endif()
197-
198158
if("${COMPILER_RT_DEFAULT_TARGET_ARCH}" MATCHES "amdgcn|nvptx")
199159
append_list_if(COMPILER_RT_HAS_FFREESTANDING_FLAG -ffreestanding EXTRA_FLAGS)
200160
append_list_if(COMPILER_RT_HAS_NOGPULIB_FLAG -nogpulib EXTRA_FLAGS)
@@ -208,24 +168,13 @@ if("${COMPILER_RT_DEFAULT_TARGET_ARCH}" MATCHES "amdgcn|nvptx")
208168
endif()
209169

210170
if(MSVC)
211-
# profile historically used the static CRT (/MT). When we merge RTInterception and
212-
# RTSanitizerCommon (same object libs as clang_rt.cfi on ELF), those targets are
213-
# built with MultiThreadedDLL (/MD) — see interception/CMakeLists.txt and
214-
# sanitizer_common/CMakeLists.txt. Mixing /MD objects into a /MT libclang_rt.profile
215-
# yields LNK2019 (__imp__stricmp from interception_win.cpp) and LNK4098 in Profile-*.
216-
if(PROFILE_HAS_HIP_INTERCEPTOR)
217-
set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreadedDLL)
218-
else()
219-
set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreaded)
220-
endif()
171+
# profile historically has only been supported with the static runtime
172+
# on windows
173+
set(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreaded)
221174
endif()
222175

223176
# We don't use the C++ Standard Library here, so avoid including it by mistake.
224177
append_list_if(COMPILER_RT_HAS_NOSTDINCXX_FLAG -nostdinc++ EXTRA_FLAGS)
225-
# C++ profile sources (e.g. InstrProfilingPlatformROCm.cpp) must not emit exception
226-
# personality symbols: host libclang_rt.profile.a is linked from C code and from C++
227-
# tests that do not pull in __gxx_personality_v0 (Profile-* / premerge).
228-
append_list_if(COMPILER_RT_HAS_FNO_EXCEPTIONS_FLAG -fno-exceptions EXTRA_FLAGS)
229178
# XRay uses C++ standard library headers.
230179
string(REGEX REPLACE "-?-stdlib=[a-zA-Z+]*" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
231180

@@ -251,7 +200,6 @@ if(APPLE)
251200
STATIC
252201
OS ${PROFILE_SUPPORTED_OS}
253202
ARCHS ${PROFILE_SUPPORTED_ARCH}
254-
OBJECT_LIBS ${PROFILE_OBJECT_LIBS}
255203
CFLAGS ${EXTRA_FLAGS}
256204
SOURCES ${PROFILE_SOURCES}
257205
ADDITIONAL_HEADERS ${PROFILE_HEADERS}
@@ -261,7 +209,6 @@ else()
261209
add_compiler_rt_runtime(clang_rt.profile
262210
STATIC
263211
ARCHS ${PROFILE_SUPPORTED_ARCH}
264-
OBJECT_LIBS ${PROFILE_OBJECT_LIBS}
265212
CFLAGS ${EXTRA_FLAGS}
266213
SOURCES ${PROFILE_SOURCES}
267214
ADDITIONAL_HEADERS ${PROFILE_HEADERS}

0 commit comments

Comments
 (0)