|
41 | 41 | #include "base/metrics.h" |
42 | 42 | #include "base/random/random.h" |
43 | 43 | #include "base/testutil/assert.h" |
| 44 | +#include "base/testutil/sync_point.h" |
44 | 45 | #include "base/time/monotime.h" |
45 | 46 | #include "base/utility/scoped_cleanup.h" |
46 | 47 | #include "common/logging.h" |
@@ -990,4 +991,151 @@ TEST_F(ThreadPoolTest, TestLIFOThreadWakeUps) { |
990 | 991 | } |
991 | 992 | */ |
992 | 993 |
|
| 994 | +// Regression test for a use-after-free bug in do_submit(). |
| 995 | +// |
| 996 | +// When the pool has min_threads=0 and all threads have exited, a new submit |
| 997 | +// must create a thread. The old code enqueued the task first, then attempted |
| 998 | +// thread creation. If creation failed (e.g. EAGAIN), it returned an error |
| 999 | +// WITHOUT removing the task from the queue. The caller, seeing the error, |
| 1000 | +// would perform its own cleanup (e.g. counting down a latch). A later |
| 1001 | +// successful submit could then create a thread that picks up the orphaned |
| 1002 | +// task, executing it against already-destroyed state (use-after-free). |
| 1003 | +// |
| 1004 | +// The fix creates the thread BEFORE enqueuing the task when the pool has |
| 1005 | +// zero threads. If thread creation fails, the task is never enqueued. |
| 1006 | +TEST_F(ThreadPoolTest, TestSubmitFailsCleanlyWhenNoThreadsExist) { |
| 1007 | + // Pool with min_threads=0 so all threads can idle-exit. |
| 1008 | + ASSERT_TRUE(rebuild_pool_with_builder(ThreadPoolBuilder(kDefaultPoolName) |
| 1009 | + .set_min_threads(0) |
| 1010 | + .set_max_threads(4) |
| 1011 | + .set_idle_timeout(MonoDelta::FromMilliseconds(1))) |
| 1012 | + .ok()); |
| 1013 | + |
| 1014 | + // Wait for threads to exit (min_threads=0, short idle timeout). |
| 1015 | + ASSERT_EQ(0, _pool->num_threads()); |
| 1016 | + |
| 1017 | + // Inject create_thread failure to simulate EAGAIN (resource exhaustion). |
| 1018 | + SyncPoint::GetInstance()->SetCallBack("ThreadPool::create_thread", [](void* arg) { |
| 1019 | + auto* status = static_cast<Status*>(arg); |
| 1020 | + *status = Status::RuntimeError("Could not create thread: Resource temporarily unavailable"); |
| 1021 | + }); |
| 1022 | + SyncPoint::GetInstance()->EnableProcessing(); |
| 1023 | + SCOPED_CLEANUP({ |
| 1024 | + SyncPoint::GetInstance()->ClearCallBack("ThreadPool::create_thread"); |
| 1025 | + SyncPoint::GetInstance()->DisableProcessing(); |
| 1026 | + }); |
| 1027 | + |
| 1028 | + std::atomic<int> run_count{0}; |
| 1029 | + Status s = _pool->submit_func([&]() { run_count++; }); |
| 1030 | + // Submit should fail because thread creation failed. |
| 1031 | + ASSERT_FALSE(s.ok()); |
| 1032 | + // The task must NOT have been enqueued. |
| 1033 | + ASSERT_EQ(0, _pool->_total_queued_tasks); |
| 1034 | + ASSERT_EQ(0, run_count); |
| 1035 | + |
| 1036 | + // Now disable the failure injection and verify normal operation resumes. |
| 1037 | + SyncPoint::GetInstance()->ClearCallBack("ThreadPool::create_thread"); |
| 1038 | + SyncPoint::GetInstance()->DisableProcessing(); |
| 1039 | + |
| 1040 | + CountDownLatch latch(1); |
| 1041 | + s = _pool->submit_func([&]() { |
| 1042 | + run_count++; |
| 1043 | + latch.count_down(); |
| 1044 | + }); |
| 1045 | + ASSERT_TRUE(s.ok()); |
| 1046 | + latch.wait(); |
| 1047 | + ASSERT_EQ(1, run_count); |
| 1048 | + _pool->wait(); |
| 1049 | + _pool->shutdown(); |
| 1050 | +} |
| 1051 | + |
| 1052 | +// Verify that when thread creation fails but other threads exist in the pool, |
| 1053 | +// the task is still enqueued and eventually processed (no error returned). |
| 1054 | +TEST_F(ThreadPoolTest, TestSubmitSucceedsWhenThreadCreationFailsButThreadsExist) { |
| 1055 | + // Pool with min_threads=1 so at least one thread always exists. |
| 1056 | + ASSERT_TRUE(rebuild_pool_with_builder(ThreadPoolBuilder(kDefaultPoolName) |
| 1057 | + .set_min_threads(1) |
| 1058 | + .set_max_threads(4) |
| 1059 | + .set_idle_timeout(MonoDelta::FromMilliseconds(kThreadIdleTimeoutMs))) |
| 1060 | + .ok()); |
| 1061 | + ASSERT_EQ(1, _pool->num_threads()); |
| 1062 | + |
| 1063 | + // Block the existing thread so a new submit requires a new thread. |
| 1064 | + CountDownLatch block_latch(1); |
| 1065 | + ASSERT_TRUE(_pool->submit(SlowTask::new_slow_task(&block_latch)).ok()); |
| 1066 | + |
| 1067 | + // Now inject create_thread failure. |
| 1068 | + SyncPoint::GetInstance()->SetCallBack("ThreadPool::create_thread", [](void* arg) { |
| 1069 | + auto* status = static_cast<Status*>(arg); |
| 1070 | + *status = Status::RuntimeError("Could not create thread: Resource temporarily unavailable"); |
| 1071 | + }); |
| 1072 | + SyncPoint::GetInstance()->EnableProcessing(); |
| 1073 | + |
| 1074 | + // Submit should succeed because existing thread can process the task. |
| 1075 | + std::atomic<int> run_count{0}; |
| 1076 | + Status s = _pool->submit_func([&]() { run_count++; }); |
| 1077 | + ASSERT_TRUE(s.ok()); |
| 1078 | + |
| 1079 | + // Disable failure injection and unblock. |
| 1080 | + SyncPoint::GetInstance()->ClearCallBack("ThreadPool::create_thread"); |
| 1081 | + SyncPoint::GetInstance()->DisableProcessing(); |
| 1082 | + block_latch.count_down(); |
| 1083 | + |
| 1084 | + _pool->wait(); |
| 1085 | + ASSERT_EQ(1, run_count); |
| 1086 | + _pool->shutdown(); |
| 1087 | +} |
| 1088 | + |
| 1089 | +// Verify the original crash scenario: a stack-allocated latch captured by |
| 1090 | +// reference in a task. If the task were orphaned in the queue after a failed |
| 1091 | +// submit, a later thread would execute it and count_down a destroyed latch. |
| 1092 | +// With the fix, the task is never enqueued, so the latch is safe. |
| 1093 | +TEST_F(ThreadPoolTest, TestSubmitFailureDoesNotCauseUseAfterFree) { |
| 1094 | + ASSERT_TRUE(rebuild_pool_with_builder(ThreadPoolBuilder(kDefaultPoolName) |
| 1095 | + .set_min_threads(0) |
| 1096 | + .set_max_threads(4) |
| 1097 | + .set_idle_timeout(MonoDelta::FromMilliseconds(1))) |
| 1098 | + .ok()); |
| 1099 | + ASSERT_EQ(0, _pool->num_threads()); |
| 1100 | + |
| 1101 | + // First call: fail. Subsequent calls: succeed. |
| 1102 | + std::atomic<int> create_call_count{0}; |
| 1103 | + SyncPoint::GetInstance()->SetCallBack("ThreadPool::create_thread", [&](void* arg) { |
| 1104 | + if (create_call_count.fetch_add(1) == 0) { |
| 1105 | + auto* status = static_cast<Status*>(arg); |
| 1106 | + *status = Status::RuntimeError("Could not create thread: Resource temporarily unavailable"); |
| 1107 | + } |
| 1108 | + }); |
| 1109 | + SyncPoint::GetInstance()->EnableProcessing(); |
| 1110 | + SCOPED_CLEANUP({ |
| 1111 | + SyncPoint::GetInstance()->ClearCallBack("ThreadPool::create_thread"); |
| 1112 | + SyncPoint::GetInstance()->DisableProcessing(); |
| 1113 | + }); |
| 1114 | + |
| 1115 | + // Simulate the crash scenario: stack-allocated latch, task captured by ref. |
| 1116 | + { |
| 1117 | + const int kNumTasks = 5; |
| 1118 | + CountDownLatch latch(kNumTasks); |
| 1119 | + int submit_failures = 0; |
| 1120 | + |
| 1121 | + for (int i = 0; i < kNumTasks; i++) { |
| 1122 | + Status s = _pool->submit_func([&latch]() { latch.count_down(); }); |
| 1123 | + if (!s.ok()) { |
| 1124 | + // Caller counts down on failure (like the original bug scenario). |
| 1125 | + latch.count_down(); |
| 1126 | + submit_failures++; |
| 1127 | + } |
| 1128 | + } |
| 1129 | + // First submit should fail (injected), rest should succeed. |
| 1130 | + ASSERT_EQ(1, submit_failures); |
| 1131 | + // Wait for all count_downs (from both successful tasks and caller cleanup). |
| 1132 | + latch.wait(); |
| 1133 | + } |
| 1134 | + // If the orphaned task bug existed, this point would crash (use-after-free |
| 1135 | + // on the destroyed latch). With the fix, we reach here safely. |
| 1136 | + |
| 1137 | + _pool->wait(); |
| 1138 | + _pool->shutdown(); |
| 1139 | +} |
| 1140 | + |
993 | 1141 | } // namespace starrocks |
0 commit comments