Skip to content

Commit 3ac54eb

Browse files
committed
Add fast path for teardown check with a shared thread
Signed-off-by: Neil R. Spruit <neil.r.spruit@intel.com>
1 parent 74997ed commit 3ac54eb

2 files changed

Lines changed: 191 additions & 72 deletions

File tree

samples/zello_world/zello_world.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,9 +196,13 @@ int main( int argc, char *argv[] )
196196
zeEventHostSynchronize(event, UINT64_MAX );
197197
std::cout << "Congratulations, the device completed execution!\n";
198198

199+
zelCheckIsLoaderInTearDown();
199200
zeContextDestroy(context);
201+
zelCheckIsLoaderInTearDown();
200202
zeCommandListDestroy(command_list);
203+
zelCheckIsLoaderInTearDown();
201204
zeEventDestroy(event);
205+
zelCheckIsLoaderInTearDown();
202206
zeEventPoolDestroy(event_pool);
203207

204208
if (tracing_enabled) {

source/lib/ze_lib.cpp

Lines changed: 187 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,141 @@ namespace ze_lib
2727
}
2828
}
2929
bool delayContextDestruction = false;
30+
// Result codes published by stabilityCheck() through ze_lib::sharedResult and
// read back by zelCheckIsLoaderInTearDown().
#define ZEL_STABILITY_CHECK_RESULT_SUCCESS 0
31+
#define ZEL_STABILITY_CHECK_RESULT_DRIVER_GET_NULL 1 // loaderDriverGet pointer was null
32+
#define ZEL_STABILITY_CHECK_RESULT_DRIVER_GET_FAILED 2 // loaderDriverGet failed or reported zero drivers
33+
#define ZEL_STABILITY_CHECK_RESULT_EXCEPTION 3 // also used by the caller for timeout / crashed thread
34+
// Signal value sent by ~StabilityThreadClass() to make stabilityCheck() return.
#define ZEL_STABILITY_THREAD_EXIT 4
35+
// Passed to std::chrono::milliseconds() when zelCheckIsLoaderInTearDown() waits for a result.
#define ZEL_STABILITY_THREAD_TIMEOUT 5
36+
// Signal value sent by zelCheckIsLoaderInTearDown() to request one stability check.
// It shares the numeric value 1 with ZEL_STABILITY_CHECK_RESULT_DRIVER_GET_NULL, but the
// two travel over different promises (sharedSignal vs. sharedResult).
#define ZEL_STABILITY_THREAD_SIGNAL 1
37+
// Request channel: StabilityThreadClass::signal() sets a value on this promise.
std::promise<int> *sharedSignal = nullptr;
38+
// Future of sharedSignal; stabilityCheck() blocks on it until a request arrives.
std::shared_future<int> *sharedFuture = nullptr;
39+
// Response channel: stabilityCheck() sets one of the result codes above on this promise.
std::promise<int> *sharedResult = nullptr;
40+
// Future of sharedResult; zelCheckIsLoaderInTearDown() waits on it with a timeout.
std::shared_future<int> *sharedResultFuture = nullptr;
41+
42+
// Owns the long-lived stability-checker thread and tracks whether the thread
// function (or any operation on the thread) ended with an exception.
class StabilityThreadClass {
43+
public:
44+
// Starts the thread immediately; `func` runs on the new thread.
// NOTE(review): stabilityThread is declared before crashed_, so the thread is
// started before crashed_'s initializer runs (members are initialized in
// declaration order) — confirm this ordering is safe.
StabilityThreadClass(std::function<void()> func) :
45+
stabilityThread([this, func]() {
46+
try {
47+
func();
48+
crashed_.store(false); // func returned normally
49+
} catch (...) {
50+
crashed_.store(true); // func threw; never let the exception escape the thread
51+
}
52+
}) {}
53+
54+
// Asks the thread to exit (ZEL_STABILITY_THREAD_EXIT) and joins it.
// NOTE(review): when has_crashed() is true the join is skipped; if the thread
// is still joinable at that point, std::thread's destructor calls
// std::terminate — confirm this is intended.
~StabilityThreadClass() {
55+
try {
56+
if (stabilityThread.joinable() && !has_crashed()) {
57+
signal(ZEL_STABILITY_THREAD_EXIT);
58+
stabilityThread.join();
59+
}
60+
} catch (...) {
61+
crashed_.store(true); // destructors must not throw
62+
}
63+
}
64+
65+
// Returns true once any exception has been recorded for this thread object.
bool has_crashed() const {
66+
return crashed_.load();
67+
}
68+
69+
// Publishes `signalValue` on ze_lib::sharedSignal, but only while the thread
// is joinable and no crash has been recorded. Any exception from set_value
// is swallowed and recorded as a crash.
// NOTE(review): presumably this includes std::future_error when the promise
// already holds a value — confirm.
void signal(int signalValue) {
70+
try {
71+
if (stabilityThread.joinable() && !has_crashed()) {
72+
ze_lib::sharedSignal->set_value(signalValue);
73+
}
74+
} catch (...) {
75+
crashed_.store(true);
76+
}
77+
}
78+
79+
// Joins the thread if it is joinable; exceptions are recorded as a crash.
void join() {
80+
try {
81+
if (stabilityThread.joinable()) {
82+
stabilityThread.join();
83+
}
84+
} catch (...) {
85+
crashed_.store(true);
86+
}
87+
}
88+
89+
private:
90+
std::thread stabilityThread; // runs the function passed to the constructor
91+
std::atomic<bool> crashed_{false}; // set when the thread function or a thread operation throws
92+
};
93+
// Process-wide checker thread; created with stabilityCheck during context
// initialization and deleted in the context destructor.
StabilityThreadClass *l0StabilityThread = nullptr;
94+
95+
/**
96+
* @brief Performs a stability check for the Level Zero loader.
97+
*
98+
* This function checks the stability of the Level Zero loader by verifying
99+
* the presence of the loader module, the validity of the `zeDriverGet` function
100+
* pointer, and the ability to retrieve driver information. The result of the
101+
* stability check is communicated through the shared `ze_lib::sharedResult` promise.
102+
*
103+
* @note Takes no parameters: `ze_lib::sharedResult` is the promise used to communicate the result of
104+
* the stability check. The promise is set with one of
105+
* the following values:
106+
* - ZEL_STABILITY_CHECK_RESULT_DRIVER_GET_NULL: The
107+
* `zeDriverGet` function pointer is invalid.
108+
* - ZEL_STABILITY_CHECK_RESULT_DRIVER_GET_FAILED: The
109+
* loader failed to retrieve driver information.
110+
* - ZEL_STABILITY_CHECK_RESULT_EXCEPTION: An
111+
* exception occurred during the stability check.
112+
* - ZEL_STABILITY_CHECK_RESULT_SUCCESS: The stability
113+
* check was successful.
114+
*
115+
* @note If debug tracing is enabled, debug messages are logged for each failure
116+
* scenario.
117+
* @note If the Loader is completely torn down, this thread is expected to be killed
118+
* due to invalid memory access and the stability check will determine a failure.
119+
*
120+
* @exception This function catches all exceptions internally and does not throw.
121+
*/
122+
void stabilityCheck() {
123+
// Wait for the user to set a value (signal) before proceeding with the stability check
// Each loop iteration serves one request received on ze_lib::sharedFuture and
// answers it on ze_lib::sharedResult.
124+
while (true) {
125+
if (!ze_lib::context) {
126+
return; // no context object; leave without publishing a result
127+
}
128+
try {
129+
ze_lib::sharedFuture->wait(); // block until signal() sets a value
130+
auto signalValue = ze_lib::sharedFuture->get();
131+
if (signalValue == ZEL_STABILITY_THREAD_EXIT) {
132+
// Exit request: answer with SUCCESS and end the thread function.
ze_lib::sharedResult->set_value(ZEL_STABILITY_CHECK_RESULT_SUCCESS);
133+
return;
134+
}
135+
136+
if (!ze_lib::context->loaderDriverGet) {
137+
if (ze_lib::context->debugTraceEnabled) {
138+
std::string message = "LoaderDriverGet is a bad pointer. Exiting stability checker thread.";
139+
ze_lib::context->debug_trace_message(message, "");
140+
}
141+
ze_lib::sharedResult->set_value(ZEL_STABILITY_CHECK_RESULT_DRIVER_GET_NULL);
142+
return;
143+
}
144+
145+
uint32_t driverCount = 0;
146+
ze_result_t result = ZE_RESULT_ERROR_UNINITIALIZED;
147+
// The driver array argument is null, so presumably only the driver count
// is requested — confirm against the zeDriverGet specification.
result = ze_lib::context->loaderDriverGet(&driverCount, nullptr);
148+
if (result != ZE_RESULT_SUCCESS || driverCount == 0) {
149+
if (ze_lib::context->debugTraceEnabled) {
150+
std::string message = "Loader stability check failed. Exiting stability checker thread.";
151+
ze_lib::context->debug_trace_message(message, "");
152+
}
153+
ze_lib::sharedResult->set_value(ZEL_STABILITY_CHECK_RESULT_DRIVER_GET_FAILED);
154+
return;
155+
}
156+
ze_lib::sharedResult->set_value(ZEL_STABILITY_CHECK_RESULT_SUCCESS);
157+
// Re-arm the request channel with a fresh promise/future for the next check.
// NOTE(review): the result is published before the re-arm, so a caller that
// sees the result and signals again right away may touch sharedSignal while
// it is being reassigned here — confirm whether this can race.
*ze_lib::sharedSignal = std::promise<int>();
158+
*ze_lib::sharedFuture = ze_lib::sharedSignal->get_future().share();
159+
} catch (...) {
160+
// Any exception while serving a request is reported as EXCEPTION and ends the thread function.
ze_lib::sharedResult->set_value(ZEL_STABILITY_CHECK_RESULT_EXCEPTION);
161+
return;
162+
}
163+
}
164+
}
30165
#endif
31166
bool destruction = false;
32167

@@ -43,6 +178,24 @@ namespace ze_lib
43178
if (loader) {
44179
FREE_DRIVER_LIBRARY( loader );
45180
}
181+
delete l0StabilityThread;
182+
l0StabilityThread = nullptr;
183+
if (sharedSignal) {
184+
delete sharedSignal;
185+
sharedSignal = nullptr;
186+
}
187+
if (sharedFuture) {
188+
delete sharedFuture;
189+
sharedFuture = nullptr;
190+
}
191+
if (sharedResultFuture) {
192+
delete sharedResultFuture;
193+
sharedResultFuture = nullptr;
194+
}
195+
if (sharedResult) {
196+
delete sharedResult;
197+
sharedResult = nullptr;
198+
}
46199
#endif
47200
ze_lib::destruction = true;
48201
};
@@ -149,6 +302,13 @@ namespace ze_lib
149302
std::string version_message = "Loader API Version to be requested is v" + std::to_string(ZE_MAJOR_VERSION(version)) + "." + std::to_string(ZE_MINOR_VERSION(version));
150303
debug_trace_message(version_message, "");
151304
loaderDriverGet = reinterpret_cast<ze_pfnDriverGet_t>(GET_FUNCTION_PTR(loader, "zeDriverGet"));
305+
ze_lib::sharedSignal = new std::promise<int>();
306+
ze_lib::sharedFuture = new std::shared_future<int>();
307+
*ze_lib::sharedFuture = ze_lib::sharedSignal->get_future().share();
308+
ze_lib::sharedResult = new std::promise<int>();
309+
ze_lib::sharedResultFuture = new std::shared_future<int>();
310+
*ze_lib::sharedResultFuture = ze_lib::sharedResult->get_future().share();
311+
ze_lib::l0StabilityThread = new StabilityThreadClass(stabilityCheck);
152312
#else
153313
result = zeLoaderInit();
154314
if( ZE_RESULT_SUCCESS == result ) {
@@ -405,70 +565,6 @@ zelSetDelayLoaderContextTeardown()
405565
#endif
406566
}
407567

408-
#ifdef DYNAMIC_LOAD_LOADER
409-
#define ZEL_STABILITY_CHECK_RESULT_SUCCESS 0
410-
#define ZEL_STABILITY_CHECK_RESULT_DRIVER_GET_NULL 1
411-
#define ZEL_STABILITY_CHECK_RESULT_DRIVER_GET_FAILED 2
412-
#define ZEL_STABILITY_CHECK_RESULT_EXCEPTION 3
413-
414-
/**
415-
* @brief Performs a stability check for the Level Zero loader.
416-
*
417-
* This function checks the stability of the Level Zero loader by verifying
418-
* the presence of the loader module, the validity of the `zeDriverGet` function
419-
* pointer, and the ability to retrieve driver information. The result of the
420-
* stability check is communicated through the provided promise.
421-
*
422-
* @param stabilityPromise A promise object used to communicate the result of
423-
* the stability check. The promise is set with one of
424-
* the following values:
425-
* - ZEL_STABILITY_CHECK_RESULT_DRIVER_GET_NULL: The
426-
* `zeDriverGet` function pointer is invalid.
427-
* - ZEL_STABILITY_CHECK_RESULT_DRIVER_GET_FAILED: The
428-
* loader failed to retrieve driver information.
429-
* - ZEL_STABILITY_CHECK_RESULT_EXCEPTION: An
430-
* exception occurred during the stability check.
431-
* - ZEL_STABILITY_CHECK_RESULT_SUCCESS: The stability
432-
* check was successful.
433-
*
434-
* @note If debug tracing is enabled, debug messages are logged for each failure
435-
* scenario.
436-
* @note If the Loader is completely torn down, this thread is expected to be killed
437-
* due to invalid memory access and the stability check will determine a failure.
438-
*
439-
* @exception This function catches all exceptions internally and does not throw.
440-
*/
441-
// One-shot variant: runs a single check and reports the outcome on the
// promise it was handed, then returns.
void stabilityCheck(std::promise<int> stabilityPromise) {
442-
try {
443-
if (!ze_lib::context->loaderDriverGet) {
444-
if (ze_lib::context->debugTraceEnabled) {
445-
std::string message = "LoaderDriverGet is a bad pointer. Exiting stability checker thread.";
446-
ze_lib::context->debug_trace_message(message, "");
447-
}
448-
stabilityPromise.set_value(ZEL_STABILITY_CHECK_RESULT_DRIVER_GET_NULL);
449-
return;
450-
}
451-
452-
uint32_t driverCount = 0;
453-
ze_result_t result = ZE_RESULT_ERROR_UNINITIALIZED;
454-
// Null driver array: presumably a count-only query — confirm against the zeDriverGet specification.
result = ze_lib::context->loaderDriverGet(&driverCount, nullptr);
455-
if (result != ZE_RESULT_SUCCESS || driverCount == 0) {
456-
if (ze_lib::context->debugTraceEnabled) {
457-
std::string message = "Loader stability check failed. Exiting stability checker thread.";
458-
ze_lib::context->debug_trace_message(message, "");
459-
}
460-
stabilityPromise.set_value(ZEL_STABILITY_CHECK_RESULT_DRIVER_GET_FAILED);
461-
return;
462-
}
463-
stabilityPromise.set_value(ZEL_STABILITY_CHECK_RESULT_SUCCESS);
464-
return;
465-
} catch (...) {
466-
// Any exception raised inside the try block is reported as EXCEPTION.
stabilityPromise.set_value(ZEL_STABILITY_CHECK_RESULT_EXCEPTION);
467-
return;
468-
}
469-
}
470-
#endif
471-
472568
/**
473569
* @brief Checks if the loader is in the process of tearing down.
474570
*
@@ -490,18 +586,36 @@ zelCheckIsLoaderInTearDown() {
490586
return true;
491587
}
492588
#ifdef DYNAMIC_LOAD_LOADER
493-
std::promise<int> stabilityPromise;
494-
std::future<int> resultFuture = stabilityPromise.get_future();
495589
int result = -1;
496-
try {
497-
// Launch the stability checker thread
498-
std::thread stabilityThread(stabilityCheck, std::move(stabilityPromise));
499-
result = resultFuture.get(); // Blocks until the result is available
590+
static bool failure = false;
591+
if (failure) {
500592
if (ze_lib::context->debugTraceEnabled) {
501-
std::string message = "Stability checker thread completed with result: " + std::to_string(result);
593+
std::string message = "Stability checker thread failed already.";
502594
ze_lib::context->debug_trace_message(message, "");
503595
}
504-
stabilityThread.join();
596+
return true;
597+
}
598+
try {
599+
ze_lib::l0StabilityThread->signal(ZEL_STABILITY_THREAD_SIGNAL);
600+
if (ze_lib::sharedResultFuture->wait_for(std::chrono::milliseconds(ZEL_STABILITY_THREAD_TIMEOUT)) == std::future_status::timeout) {
601+
if (ze_lib::context->debugTraceEnabled) {
602+
std::string message = "Stability checker thread timeout.";
603+
ze_lib::context->debug_trace_message(message, "");
604+
}
605+
result = ZEL_STABILITY_CHECK_RESULT_EXCEPTION;
606+
} else {
607+
if (!ze_lib::l0StabilityThread->has_crashed()) {
608+
result = ze_lib::sharedResultFuture->get();
609+
*ze_lib::sharedResult = std::promise<int>();
610+
*ze_lib::sharedResultFuture = ze_lib::sharedResult->get_future().share();
611+
} else {
612+
if (ze_lib::context->debugTraceEnabled) {
613+
std::string message = "Stability checker thread crashed.";
614+
ze_lib::context->debug_trace_message(message, "");
615+
}
616+
result = ZEL_STABILITY_CHECK_RESULT_EXCEPTION;
617+
}
618+
}
505619
} catch (const std::exception& e) {
506620
if (ze_lib::context->debugTraceEnabled) {
507621
std::string message = "Exception caught in parent thread: " + std::string(e.what());
@@ -518,6 +632,7 @@ zelCheckIsLoaderInTearDown() {
518632
std::string message = "Loader stability check failed with result: " + std::to_string(result);
519633
ze_lib::context->debug_trace_message(message, "");
520634
}
635+
failure = true;
521636
return true;
522637
}
523638
#endif

0 commit comments

Comments
 (0)