Skip to content

Commit f8bc368

Browse files
zhaomaosuRossBrunton
authored andcommitted
Support device thread sanitizer for device globals (#17548)
1. Add a global '__TsanDeviceGlobalMetadata' to record information about device globals. 2. Read the global metadata once the program build/link is done, and then poison the related shadow memory.
1 parent 3cc524f commit f8bc368

4 files changed

Lines changed: 203 additions & 14 deletions

File tree

source/loader/layers/sanitizer/tsan/tsan_ddi.cpp

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,95 @@ ur_result_t urContextRelease(
130130
return UR_RESULT_SUCCESS;
131131
}
132132

133+
///////////////////////////////////////////////////////////////////////////////
134+
/// @brief Intercept function for urProgramBuild
135+
ur_result_t urProgramBuild(
136+
/// [in] handle of the context object
137+
ur_context_handle_t hContext,
138+
/// [in] handle of the program object
139+
ur_program_handle_t hProgram,
140+
/// [in] string of build options
141+
const char *pOptions) {
142+
getContext()->logger.debug("==== urProgramBuild");
143+
144+
UR_CALL(
145+
getContext()->urDdiTable.Program.pfnBuild(hContext, hProgram, pOptions));
146+
147+
UR_CALL(getTsanInterceptor()->registerProgram(hProgram));
148+
149+
return UR_RESULT_SUCCESS;
150+
}
151+
152+
///////////////////////////////////////////////////////////////////////////////
153+
/// @brief Intercept function for urProgramBuildExp
154+
ur_result_t urProgramBuildExp(
155+
/// [in] Handle of the program to build.
156+
ur_program_handle_t hProgram,
157+
/// [in] number of devices
158+
uint32_t numDevices,
159+
/// [in][range(0, numDevices)] pointer to array of device handles
160+
ur_device_handle_t *phDevices,
161+
/// [in][optional] pointer to build options null-terminated string.
162+
const char *pOptions) {
163+
getContext()->logger.debug("==== urProgramBuildExp");
164+
165+
UR_CALL(getContext()->urDdiTable.ProgramExp.pfnBuildExp(hProgram, numDevices,
166+
phDevices, pOptions));
167+
UR_CALL(getTsanInterceptor()->registerProgram(hProgram));
168+
169+
return UR_RESULT_SUCCESS;
170+
}
171+
172+
///////////////////////////////////////////////////////////////////////////////
173+
/// @brief Intercept function for urProgramLink
174+
ur_result_t urProgramLink(
175+
/// [in] handle of the context instance.
176+
ur_context_handle_t hContext,
177+
/// [in] number of program handles in `phPrograms`.
178+
uint32_t count,
179+
/// [in][range(0, count)] pointer to array of program handles.
180+
const ur_program_handle_t *phPrograms,
181+
/// [in][optional] pointer to linker options null-terminated string.
182+
const char *pOptions,
183+
/// [out] pointer to handle of program object created.
184+
ur_program_handle_t *phProgram) {
185+
getContext()->logger.debug("==== urProgramLink");
186+
187+
UR_CALL(getContext()->urDdiTable.Program.pfnLink(hContext, count, phPrograms,
188+
pOptions, phProgram));
189+
190+
UR_CALL(getTsanInterceptor()->registerProgram(*phProgram));
191+
192+
return UR_RESULT_SUCCESS;
193+
}
194+
195+
///////////////////////////////////////////////////////////////////////////////
196+
/// @brief Intercept function for urProgramLinkExp
197+
ur_result_t urProgramLinkExp(
198+
/// [in] handle of the context instance.
199+
ur_context_handle_t hContext,
200+
/// [in] number of devices
201+
uint32_t numDevices,
202+
/// [in][range(0, numDevices)] pointer to array of device handles
203+
ur_device_handle_t *phDevices,
204+
/// [in] number of program handles in `phPrograms`.
205+
uint32_t count,
206+
/// [in][range(0, count)] pointer to array of program handles.
207+
const ur_program_handle_t *phPrograms,
208+
/// [in][optional] pointer to linker options null-terminated string.
209+
const char *pOptions,
210+
/// [out] pointer to handle of program object created.
211+
ur_program_handle_t *phProgram) {
212+
getContext()->logger.debug("==== urProgramLinkExp");
213+
214+
UR_CALL(getContext()->urDdiTable.ProgramExp.pfnLinkExp(
215+
hContext, numDevices, phDevices, count, phPrograms, pOptions, phProgram));
216+
217+
UR_CALL(getTsanInterceptor()->registerProgram(*phProgram));
218+
219+
return UR_RESULT_SUCCESS;
220+
}
221+
133222
///////////////////////////////////////////////////////////////////////////////
134223
/// @brief Intercept function for urUSMDeviceAlloc
135224
__urdlllocal ur_result_t UR_APICALL urUSMDeviceAlloc(
@@ -283,6 +372,39 @@ __urdlllocal ur_result_t UR_APICALL urGetContextProcAddrTable(
283372
return result;
284373
}
285374

375+
///////////////////////////////////////////////////////////////////////////////
376+
/// @brief Exported function for filling application's Program table
377+
/// with current process' addresses
378+
///
379+
/// @returns
380+
/// - ::UR_RESULT_SUCCESS
381+
/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
382+
ur_result_t urGetProgramProcAddrTable(
383+
/// [in,out] pointer to table of DDI function pointers
384+
ur_program_dditable_t *pDdiTable) {
385+
pDdiTable->pfnBuild = ur_sanitizer_layer::tsan::urProgramBuild;
386+
pDdiTable->pfnLink = ur_sanitizer_layer::tsan::urProgramLink;
387+
388+
return UR_RESULT_SUCCESS;
389+
}
390+
391+
/// @brief Exported function for filling application's ProgramExp table
392+
/// with current process' addresses
393+
///
394+
/// @returns
395+
/// - ::UR_RESULT_SUCCESS
396+
/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
397+
ur_result_t urGetProgramExpProcAddrTable(
398+
/// [in,out] pointer to table of DDI function pointers
399+
ur_program_exp_dditable_t *pDdiTable) {
400+
ur_result_t result = UR_RESULT_SUCCESS;
401+
402+
pDdiTable->pfnBuildExp = ur_sanitizer_layer::tsan::urProgramBuildExp;
403+
pDdiTable->pfnLinkExp = ur_sanitizer_layer::tsan::urProgramLinkExp;
404+
405+
return result;
406+
}
407+
286408
///////////////////////////////////////////////////////////////////////////////
287409
/// @brief Exported function for filling application's USM table
288410
/// with current process' addresses
@@ -363,6 +485,16 @@ ur_result_t initTsanDDITable(ur_dditable_t *dditable) {
363485
UR_API_VERSION_CURRENT, &dditable->Context);
364486
}
365487

488+
if (UR_RESULT_SUCCESS == result) {
489+
result =
490+
ur_sanitizer_layer::tsan::urGetProgramProcAddrTable(&dditable->Program);
491+
}
492+
493+
if (UR_RESULT_SUCCESS == result) {
494+
result = ur_sanitizer_layer::tsan::urGetProgramExpProcAddrTable(
495+
&dditable->ProgramExp);
496+
}
497+
366498
if (UR_RESULT_SUCCESS == result) {
367499
result = ur_sanitizer_layer::tsan::urGetUSMProcAddrTable(
368500
UR_API_VERSION_CURRENT, &dditable->USM);

source/loader/layers/sanitizer/tsan/tsan_interceptor.cpp

Lines changed: 56 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -69,15 +69,14 @@ ur_result_t DeviceInfo::allocShadowMemory() {
6969
return UR_RESULT_SUCCESS;
7070
}
7171

72-
void ContextInfo::insertAllocInfo(ur_device_handle_t Device,
73-
std::shared_ptr<TsanAllocInfo> &AI) {
72+
/// @brief Record an allocation so its shadow memory can be updated later.
///
/// @param Device Device the allocation belongs to. When null, the allocation
///               is recorded for every device in DeviceList.
/// @param AI     Allocation description, taken by value (sink parameter).
void ContextInfo::insertAllocInfo(ur_device_handle_t Device, TsanAllocInfo AI) {
  // Take the lock once so a broadcast insert is seen atomically by readers of
  // AllocInfosMap instead of re-acquiring it for every device.
  std::scoped_lock<ur_shared_mutex> Guard(AllocInfosMapMutex);

  if (Device) {
    AllocInfosMap[Device].emplace_back(std::move(AI));
    return;
  }

  // Broadcast case: every device needs its own copy, so AI must not be moved
  // from inside the loop (it would be a moved-from object on the second pass).
  for (auto TargetDevice : DeviceList) {
    AllocInfosMap[TargetDevice].emplace_back(AI);
  }
}
@@ -103,16 +102,62 @@ ur_result_t TsanInterceptor::allocateMemory(ur_context_handle_t Context,
103102
Context, Device, Properties, Pool, Size, &Allocated));
104103
}
105104

106-
auto AI = std::make_shared<TsanAllocInfo>(
107-
TsanAllocInfo{reinterpret_cast<uptr>(Allocated), Size});
108-
105+
auto AI = TsanAllocInfo{reinterpret_cast<uptr>(Allocated), Size};
109106
// For updating shadow memory
110-
CI->insertAllocInfo(Device, AI);
107+
CI->insertAllocInfo(Device, std::move(AI));
111108

112109
*ResultPtr = Allocated;
113110
return UR_RESULT_SUCCESS;
114111
}
115112

113+
/// @brief Register a built or linked program with the interceptor.
///
/// Currently this only logs and delegates to registerDeviceGlobals().
/// NOTE(review): UR_CALL is defined outside this view; it is presumed to
/// return early from this function on a non-success result.
///
/// @param Program Program handle to register.
/// @return UR_RESULT_SUCCESS when the delegated call succeeded.
ur_result_t TsanInterceptor::registerProgram(ur_program_handle_t Program) {
  getContext()->logger.info("registerDeviceGlobals");

  UR_CALL(registerDeviceGlobals(Program));

  return UR_RESULT_SUCCESS;
}
118+
119+
/// @brief Record the device globals of a program as allocations.
///
/// For every device of the program, looks up the global variable named by
/// kSPIR_TsanDeviceGlobalMetadata, copies it to the host as an array of
/// DeviceGlobalInfo and inserts one TsanAllocInfo per entry into the context's
/// allocation map (ContextInfo::insertAllocInfo).
///
/// A device for which the metadata lookup fails, or for which the metadata is
/// empty, is skipped; this is not treated as an error.
///
/// @param Program Program whose device globals are registered.
/// @return UR_RESULT_SUCCESS, or the result of the USM memcpy if reading the
///         metadata from a device fails.
ur_result_t TsanInterceptor::registerDeviceGlobals(ur_program_handle_t Program) {
  std::vector<ur_device_handle_t> Devices = GetDevices(Program);
  assert(Devices.size() != 0 && "No devices in registerDeviceGlobals");
  auto Context = GetContext(Program);
  // Named CI rather than ContextInfo so the local does not shadow the type.
  auto CI = getContextInfo(Context);

  for (auto Device : Devices) {
    // Initialised so a failing lookup can never leave garbage behind.
    size_t MetadataSize = 0;
    void *MetadataPtr = nullptr;
    auto Result = getContext()->urDdiTable.Program.pfnGetGlobalVariablePointer(
        Device, Program, kSPIR_TsanDeviceGlobalMetadata, &MetadataSize,
        &MetadataPtr);
    if (Result != UR_RESULT_SUCCESS) {
      getContext()->logger.info("No device globals");
      continue;
    }

    assert((MetadataSize % sizeof(DeviceGlobalInfo) == 0) &&
           "DeviceGlobal metadata size is not correct");
    const size_t NumOfDeviceGlobal = MetadataSize / sizeof(DeviceGlobalInfo);
    // Nothing to copy: bail out before touching an empty vector's storage.
    if (NumOfDeviceGlobal == 0 || MetadataPtr == nullptr) {
      getContext()->logger.info("No device globals");
      continue;
    }

    // The queue is only needed for the copy, so create it once we know there
    // is metadata to read on this device.
    ManagedQueue Queue(Context, Device);

    std::vector<DeviceGlobalInfo> GVInfos(NumOfDeviceGlobal);
    // Blocking copy (second argument is true): GVInfos is read right after.
    Result = getContext()->urDdiTable.Enqueue.pfnUSMMemcpy(
        Queue, true, GVInfos.data(), MetadataPtr,
        sizeof(DeviceGlobalInfo) * NumOfDeviceGlobal, 0, nullptr, nullptr);
    if (Result != UR_RESULT_SUCCESS) {
      getContext()->logger.error("Device Global[{}] Read Failed: {}",
                                 kSPIR_TsanDeviceGlobalMetadata, Result);
      return Result;
    }

    for (const auto &GVInfo : GVInfos) {
      CI->insertAllocInfo(Device, TsanAllocInfo{GVInfo.Addr, GVInfo.Size});
    }
  }

  return UR_RESULT_SUCCESS;
}
160+
116161
ur_result_t TsanInterceptor::insertContext(ur_context_handle_t Context,
117162
std::shared_ptr<ContextInfo> &CI) {
118163
std::scoped_lock<ur_shared_mutex> Guard(m_ContextMapMutex);
@@ -225,9 +270,10 @@ TsanInterceptor::updateShadowMemory(std::shared_ptr<ContextInfo> &CI,
225270
ur_queue_handle_t Queue) {
226271
std::scoped_lock<ur_shared_mutex> Guard(CI->AllocInfosMapMutex);
227272
for (auto &AllocInfo : CI->AllocInfosMap[DI->Handle]) {
228-
UR_CALL(DI->Shadow->CleanShadow(Queue, AllocInfo->AllocBegin,
229-
AllocInfo->AllocSize));
273+
UR_CALL(DI->Shadow->CleanShadow(Queue, AllocInfo.AllocBegin,
274+
AllocInfo.AllocSize));
230275
}
276+
CI->AllocInfosMap[DI->Handle].clear();
231277
return UR_RESULT_SUCCESS;
232278
}
233279

source/loader/layers/sanitizer/tsan/tsan_interceptor.hpp

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,7 @@ struct ContextInfo {
4848
std::vector<ur_device_handle_t> DeviceList;
4949

5050
ur_shared_mutex AllocInfosMapMutex;
51-
std::unordered_map<ur_device_handle_t,
52-
std::vector<std::shared_ptr<TsanAllocInfo>>>
51+
std::unordered_map<ur_device_handle_t, std::vector<TsanAllocInfo>>
5352
AllocInfosMap;
5453

5554
explicit ContextInfo(ur_context_handle_t Context) : Handle(Context) {
@@ -68,8 +67,12 @@ struct ContextInfo {
6867

6968
ContextInfo &operator=(const ContextInfo &) = delete;
7069

71-
void insertAllocInfo(ur_device_handle_t Device,
72-
std::shared_ptr<TsanAllocInfo> &AI);
70+
void insertAllocInfo(ur_device_handle_t Device, TsanAllocInfo AI);
71+
};
72+
73+
struct DeviceGlobalInfo {
74+
uptr Size;
75+
uptr Addr;
7376
};
7477

7578
struct TsanRuntimeDataWrapper {
@@ -132,6 +135,8 @@ class TsanInterceptor {
132135
ur_usm_pool_handle_t Pool, size_t Size,
133136
AllocType Type, void **ResultPtr);
134137

138+
ur_result_t registerProgram(ur_program_handle_t Program);
139+
135140
ur_result_t insertContext(ur_context_handle_t Context,
136141
std::shared_ptr<ContextInfo> &CI);
137142

@@ -168,6 +173,8 @@ class TsanInterceptor {
168173
ur_queue_handle_t Queue, ur_kernel_handle_t Kernel,
169174
LaunchInfo &LaunchInfo);
170175

176+
ur_result_t registerDeviceGlobals(ur_program_handle_t Program);
177+
171178
private:
172179
std::unordered_map<ur_context_handle_t, std::shared_ptr<ContextInfo>>
173180
m_ContextMap;

source/loader/layers/sanitizer/tsan/tsan_libdevice.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,10 @@ struct TsanRuntimeData {
9494
TsanErrorReport Report[TSAN_MAX_NUM_REPORTS];
9595
};
9696

97+
// Name of the global variable that TsanInterceptor::registerDeviceGlobals
// looks up to read the device-global metadata array.
constexpr const char *kSPIR_TsanDeviceGlobalMetadata =
    "__TsanDeviceGlobalMetadata";
98+
99+
// Symbol-name string for the kernel metadata global.
// NOTE(review): no use of this constant is visible in this change; confirm
// the consumer before renaming or removing it.
constexpr const char *kSPIR_TsanSpirKernelMetadata = "__TsanKernelMetadata";
100+
97101
#if !defined(__SPIR__) && !defined(__SPIRV__)
98102
} // namespace ur_sanitizer_layer
99103
#endif // !__SPIR__ && !__SPIRV__

0 commit comments

Comments
 (0)