|
44 | 44 | #include "gutil/sysinfo.h" |
45 | 45 | #include "gutil/walltime.h" |
46 | 46 | #include "testutil/assert.h" |
| 47 | +#include "testutil/sync_point.h" |
47 | 48 | #include "util/await.h" |
48 | 49 | #include "util/countdown_latch.h" |
49 | 50 | #include "util/metrics.h" |
@@ -987,4 +988,151 @@ TEST_F(ThreadPoolTest, TestLIFOThreadWakeUps) { |
987 | 988 | } |
988 | 989 | */ |
989 | 990 |
|
| 991 | +// Regression test for a use-after-free bug in do_submit(). |
| 992 | +// |
| 993 | +// When the pool has min_threads=0 and all threads have exited, a new submit |
| 994 | +// must create a thread. The old code enqueued the task first, then attempted |
| 995 | +// thread creation. If creation failed (e.g. EAGAIN), it returned an error |
| 996 | +// WITHOUT removing the task from the queue. The caller, seeing the error, |
| 997 | +// would perform its own cleanup (e.g. counting down a latch). A later |
| 998 | +// successful submit could then create a thread that picks up the orphaned |
| 999 | +// task, executing it against already-destroyed state (use-after-free). |
| 1000 | +// |
| 1001 | +// The fix creates the thread BEFORE enqueuing the task when the pool has |
| 1002 | +// zero threads. If thread creation fails, the task is never enqueued. |
| 1003 | +TEST_F(ThreadPoolTest, TestSubmitFailsCleanlyWhenNoThreadsExist) { |
| 1004 | + // Pool with min_threads=0 so all threads can idle-exit. |
| 1005 | + ASSERT_TRUE(rebuild_pool_with_builder(ThreadPoolBuilder(kDefaultPoolName) |
| 1006 | + .set_min_threads(0) |
| 1007 | + .set_max_threads(4) |
| 1008 | + .set_idle_timeout(MonoDelta::FromMilliseconds(1))) |
| 1009 | + .ok()); |
| 1010 | + |
| 1011 | + // Wait for threads to exit (min_threads=0, short idle timeout). |
| 1012 | + ASSERT_EQ(0, _pool->num_threads()); |
| 1013 | + |
| 1014 | + // Inject create_thread failure to simulate EAGAIN (resource exhaustion). |
| 1015 | + SyncPoint::GetInstance()->SetCallBack("ThreadPool::create_thread", [](void* arg) { |
| 1016 | + auto* status = static_cast<Status*>(arg); |
| 1017 | + *status = Status::RuntimeError("Could not create thread: Resource temporarily unavailable"); |
| 1018 | + }); |
| 1019 | + SyncPoint::GetInstance()->EnableProcessing(); |
| 1020 | + SCOPED_CLEANUP({ |
| 1021 | + SyncPoint::GetInstance()->ClearCallBack("ThreadPool::create_thread"); |
| 1022 | + SyncPoint::GetInstance()->DisableProcessing(); |
| 1023 | + }); |
| 1024 | + |
| 1025 | + std::atomic<int> run_count{0}; |
| 1026 | + Status s = _pool->submit_func([&]() { run_count++; }); |
| 1027 | + // Submit should fail because thread creation failed. |
| 1028 | + ASSERT_FALSE(s.ok()); |
| 1029 | + // The task must NOT have been enqueued. |
| 1030 | + ASSERT_EQ(0, _pool->_total_queued_tasks); |
| 1031 | + ASSERT_EQ(0, run_count); |
| 1032 | + |
| 1033 | + // Now disable the failure injection and verify normal operation resumes. |
| 1034 | + SyncPoint::GetInstance()->ClearCallBack("ThreadPool::create_thread"); |
| 1035 | + SyncPoint::GetInstance()->DisableProcessing(); |
| 1036 | + |
| 1037 | + CountDownLatch latch(1); |
| 1038 | + s = _pool->submit_func([&]() { |
| 1039 | + run_count++; |
| 1040 | + latch.count_down(); |
| 1041 | + }); |
| 1042 | + ASSERT_TRUE(s.ok()); |
| 1043 | + latch.wait(); |
| 1044 | + ASSERT_EQ(1, run_count); |
| 1045 | + _pool->wait(); |
| 1046 | + _pool->shutdown(); |
| 1047 | +} |
| 1048 | + |
| 1049 | +// Verify that when thread creation fails but other threads exist in the pool, |
| 1050 | +// the task is still enqueued and eventually processed (no error returned). |
| 1051 | +TEST_F(ThreadPoolTest, TestSubmitSucceedsWhenThreadCreationFailsButThreadsExist) { |
| 1052 | + // Pool with min_threads=1 so at least one thread always exists. |
| 1053 | + ASSERT_TRUE(rebuild_pool_with_builder(ThreadPoolBuilder(kDefaultPoolName) |
| 1054 | + .set_min_threads(1) |
| 1055 | + .set_max_threads(4) |
| 1056 | + .set_idle_timeout(MonoDelta::FromMilliseconds(kThreadIdleTimeoutMs))) |
| 1057 | + .ok()); |
| 1058 | + ASSERT_EQ(1, _pool->num_threads()); |
| 1059 | + |
| 1060 | + // Block the existing thread so a new submit requires a new thread. |
| 1061 | + CountDownLatch block_latch(1); |
| 1062 | + ASSERT_TRUE(_pool->submit(SlowTask::new_slow_task(&block_latch)).ok()); |
| 1063 | + |
| 1064 | + // Now inject create_thread failure. |
| 1065 | + SyncPoint::GetInstance()->SetCallBack("ThreadPool::create_thread", [](void* arg) { |
| 1066 | + auto* status = static_cast<Status*>(arg); |
| 1067 | + *status = Status::RuntimeError("Could not create thread: Resource temporarily unavailable"); |
| 1068 | + }); |
| 1069 | + SyncPoint::GetInstance()->EnableProcessing(); |
| 1070 | + |
| 1071 | + // Submit should succeed because existing thread can process the task. |
| 1072 | + std::atomic<int> run_count{0}; |
| 1073 | + Status s = _pool->submit_func([&]() { run_count++; }); |
| 1074 | + ASSERT_TRUE(s.ok()); |
| 1075 | + |
| 1076 | + // Disable failure injection and unblock. |
| 1077 | + SyncPoint::GetInstance()->ClearCallBack("ThreadPool::create_thread"); |
| 1078 | + SyncPoint::GetInstance()->DisableProcessing(); |
| 1079 | + block_latch.count_down(); |
| 1080 | + |
| 1081 | + _pool->wait(); |
| 1082 | + ASSERT_EQ(1, run_count); |
| 1083 | + _pool->shutdown(); |
| 1084 | +} |
| 1085 | + |
| 1086 | +// Verify the original crash scenario: a stack-allocated latch captured by |
| 1087 | +// reference in a task. If the task were orphaned in the queue after a failed |
| 1088 | +// submit, a later thread would execute it and count_down a destroyed latch. |
| 1089 | +// With the fix, the task is never enqueued, so the latch is safe. |
| 1090 | +TEST_F(ThreadPoolTest, TestSubmitFailureDoesNotCauseUseAfterFree) { |
| 1091 | + ASSERT_TRUE(rebuild_pool_with_builder(ThreadPoolBuilder(kDefaultPoolName) |
| 1092 | + .set_min_threads(0) |
| 1093 | + .set_max_threads(4) |
| 1094 | + .set_idle_timeout(MonoDelta::FromMilliseconds(1))) |
| 1095 | + .ok()); |
| 1096 | + ASSERT_EQ(0, _pool->num_threads()); |
| 1097 | + |
| 1098 | + // First call: fail. Subsequent calls: succeed. |
| 1099 | + std::atomic<int> create_call_count{0}; |
| 1100 | + SyncPoint::GetInstance()->SetCallBack("ThreadPool::create_thread", [&](void* arg) { |
| 1101 | + if (create_call_count.fetch_add(1) == 0) { |
| 1102 | + auto* status = static_cast<Status*>(arg); |
| 1103 | + *status = Status::RuntimeError("Could not create thread: Resource temporarily unavailable"); |
| 1104 | + } |
| 1105 | + }); |
| 1106 | + SyncPoint::GetInstance()->EnableProcessing(); |
| 1107 | + SCOPED_CLEANUP({ |
| 1108 | + SyncPoint::GetInstance()->ClearCallBack("ThreadPool::create_thread"); |
| 1109 | + SyncPoint::GetInstance()->DisableProcessing(); |
| 1110 | + }); |
| 1111 | + |
| 1112 | + // Simulate the crash scenario: stack-allocated latch, task captured by ref. |
| 1113 | + { |
| 1114 | + const int kNumTasks = 5; |
| 1115 | + CountDownLatch latch(kNumTasks); |
| 1116 | + int submit_failures = 0; |
| 1117 | + |
| 1118 | + for (int i = 0; i < kNumTasks; i++) { |
| 1119 | + Status s = _pool->submit_func([&latch]() { latch.count_down(); }); |
| 1120 | + if (!s.ok()) { |
| 1121 | + // Caller counts down on failure (like the original bug scenario). |
| 1122 | + latch.count_down(); |
| 1123 | + submit_failures++; |
| 1124 | + } |
| 1125 | + } |
| 1126 | + // First submit should fail (injected), rest should succeed. |
| 1127 | + ASSERT_EQ(1, submit_failures); |
| 1128 | + // Wait for all count_downs (from both successful tasks and caller cleanup). |
| 1129 | + latch.wait(); |
| 1130 | + } |
| 1131 | + // If the orphaned task bug existed, this point would crash (use-after-free |
| 1132 | + // on the destroyed latch). With the fix, we reach here safely. |
| 1133 | + |
| 1134 | + _pool->wait(); |
| 1135 | + _pool->shutdown(); |
| 1136 | +} |
| 1137 | + |
990 | 1138 | } // namespace starrocks |
0 commit comments