Skip to content

Commit a1adfcc

Browse files
authored
test: clean up gRPC server process groups (#1492)
Makes the gRPC integration test's ServerProcess helper clean up reliably. Each spawned cuopt_grpc_server is now put in its own process group (setpgid), and teardown signals the whole group, so the GPU worker child can no longer be orphaned when a test ends or a server is restarted. stop() now returns success/failure and is idempotent, the process group is verified before it's ever used as a kill target, and teardown paths assert the group is gone. Adds ServerProcessLifecycleTests covering idle stop, idempotent stop, and crash-during-solve cleanup. Test-only change; no library or API changes. The change is motivated by CI failures like https://github.com/NVIDIA/cuopt/actions/runs/28380812204/job/84090251116?pr=1481. There Codex-diagnosed issue here is: 1. ErrorRecoveryTests.ClientHandlesServerCrashDuringSolve starts a 120-second MIP in a forked gRPC worker. 2. `ServerProcess::stop()` signals only the server parent. 3. After five seconds it SIGKILLs that parent, but not the worker forked in `spawn_worker()`. 4. The test reports success while the orphan worker continues using the GPU. 5. INCUMBENT_CALLBACK_TEST starts next and competes with that worker for VRAM Authors: - Miles Lubin (https://github.com/mlubin) Approvers: - Trevor McKay (https://github.com/tmckayus) - Alice Boucher (https://github.com/aliceb-nv) - Akif ÇÖRDÜK (https://github.com/akifcorduk) URL: #1492
1 parent 2ca56e7 commit a1adfcc

1 file changed

Lines changed: 96 additions & 38 deletions

File tree

cpp/tests/linear_programming/grpc/grpc_integration_test.cpp

Lines changed: 96 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include <gtest/gtest.h>
3030

3131
#include <atomic>
32+
#include <cerrno>
3233
#include <cmath>
3334
#include <filesystem>
3435
#include <fstream>
@@ -54,6 +55,7 @@
5455

5556
#include <fcntl.h>
5657
#include <signal.h>
58+
#include <sys/prctl.h>
5759
#include <sys/types.h>
5860
#include <sys/wait.h>
5961
#include <unistd.h>
@@ -71,14 +73,26 @@ using cuopt::mathematical_optimization::testing::GrpcTestLogCapture;
7173

7274
namespace {
7375

74-
// =============================================================================
75-
// Server Process Manager
76-
// =============================================================================
76+
// GRPC_INTEGRATION_TEST
77+
// `-- cuopt_grpc_server parent (pid_, leads process group pgid_ == pid_)
78+
// `-- GPU worker (inherits pgid_)
79+
//
80+
// The server is forked into its own process group so stop() can signal the whole
81+
// group. Killing the server parent orphans its GPU worker, so this test process
82+
// also marks itself a subreaper (PR_SET_CHILD_SUBREAPER): the orphan reparents
83+
// here and stop() reaps the entire group, even inside a container whose PID 1
84+
// does not reap. Polling kill(-pgid_, 0) instead is not enough -- it counts an
85+
// unreaped zombie as a live member and can never observe a grandchild's death.
7786

7887
class ServerProcess {
7988
public:
80-
ServerProcess() : pid_(-1), port_(0) {}
81-
~ServerProcess() { stop(); }
89+
ServerProcess() : pid_(-1), pgid_(-1), port_(0) {}
90+
ServerProcess(const ServerProcess&) = delete;
91+
ServerProcess& operator=(const ServerProcess&) = delete;
92+
~ServerProcess()
93+
{
94+
if (!stop()) { std::cerr << "Failed to clean up test server process group\n"; }
95+
}
8296

8397
void set_tls_config(const std::string& root_certs,
8498
const std::string& client_cert = "",
@@ -91,21 +105,31 @@ class ServerProcess {
91105

92106
bool start(int port, const std::vector<std::string>& extra_args = {})
93107
{
94-
port_ = port;
108+
if (pid_ > 0) {
109+
std::cerr << "Cannot reuse a ServerProcess while it still owns a process lifecycle\n";
110+
return false;
111+
}
95112

96113
std::string server_path = find_server_binary();
97114
if (server_path.empty()) {
98115
std::cerr << "Could not find cuopt_grpc_server binary\n";
99116
return false;
100117
}
101118

102-
pid_ = fork();
119+
// Reparent any process orphaned by killing the server (i.e. the GPU worker)
120+
// onto this test process so stop() can reap it without relying on init.
121+
prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0);
122+
123+
port_ = port;
124+
pid_ = fork();
103125
if (pid_ < 0) {
104126
std::cerr << "fork() failed\n";
105127
return false;
106128
}
107129

108130
if (pid_ == 0) {
131+
setpgid(0, 0); // child leads its own group; parent mirrors this below
132+
109133
std::vector<const char*> args;
110134
args.push_back(server_path.c_str());
111135
args.push_back("--port");
@@ -131,30 +155,37 @@ class ServerProcess {
131155
_exit(127);
132156
}
133157

134-
return wait_for_ready(15000);
158+
// Mirror the child's setpgid() so the group is established regardless of which
159+
// process runs first; the child is a fresh fork, so its group id is its pid.
160+
setpgid(pid_, pid_);
161+
pgid_ = pid_;
162+
163+
if (!wait_for_ready(15000)) {
164+
if (!stop()) { std::cerr << "Failed to clean up server after readiness failure\n"; }
165+
return false;
166+
}
167+
168+
return true;
135169
}
136170

137-
void stop()
171+
bool stop()
138172
{
139-
if (pid_ > 0) {
140-
kill(pid_, SIGTERM);
141-
142-
int status;
143-
int wait_ms = 0;
144-
while (wait_ms < 5000) {
145-
int ret = waitpid(pid_, &status, WNOHANG);
146-
if (ret != 0) break;
147-
std::this_thread::sleep_for(std::chrono::milliseconds(100));
148-
wait_ms += 100;
149-
}
150-
151-
if (waitpid(pid_, &status, WNOHANG) == 0) {
152-
kill(pid_, SIGKILL);
153-
waitpid(pid_, &status, 0);
173+
if (pid_ <= 0) return true;
174+
175+
// Ask the whole group (server + GPU worker) to exit gracefully, then reap
176+
// every member. Killing the server parent can orphan a mid-solve worker;
177+
// because we are a subreaper it reparents here and reap_group() collects it.
178+
kill(-pgid_, SIGTERM);
179+
if (!reap_group(std::chrono::seconds(15))) {
180+
kill(-pgid_, SIGKILL);
181+
if (!reap_group(std::chrono::seconds(15))) {
182+
std::cerr << "Server process group " << pgid_ << " did not exit\n";
183+
return false;
154184
}
155-
156-
pid_ = -1;
157185
}
186+
187+
clear_lifecycle_state();
188+
return true;
158189
}
159190

160191
int port() const { return port_; }
@@ -172,6 +203,32 @@ class ServerProcess {
172203
}
173204

174205
private:
206+
void clear_lifecycle_state()
207+
{
208+
pid_ = -1;
209+
pgid_ = -1;
210+
port_ = 0;
211+
}
212+
213+
// Reap every member of the server's process group, returning true once the
214+
// group is fully drained. start() makes this process a subreaper, so a worker
215+
// orphaned when the server parent dies reparents here and is reaped too --
216+
// first the parent, then (once its death reparents the worker) the worker.
217+
bool reap_group(std::chrono::milliseconds timeout)
218+
{
219+
auto deadline = std::chrono::steady_clock::now() + timeout;
220+
while (true) {
221+
int status = 0;
222+
pid_t ret = waitpid(-pgid_, &status, WNOHANG);
223+
if (ret > 0) continue; // reaped a member; keep draining
224+
if (ret < 0 && errno == EINTR) continue;
225+
if (ret < 0 && errno == ECHILD) return true; // no members left
226+
// ret == 0: members remain but none have exited yet.
227+
if (std::chrono::steady_clock::now() >= deadline) return false;
228+
std::this_thread::sleep_for(std::chrono::milliseconds(10));
229+
}
230+
}
231+
175232
std::string find_in_path(const std::string& name)
176233
{
177234
const char* path_env = std::getenv("PATH");
@@ -246,8 +303,8 @@ class ServerProcess {
246303
grpc_client_t client(config);
247304
if (client.connect()) { return true; }
248305

249-
int status;
250-
if (waitpid(pid_, &status, WNOHANG) != 0) {
306+
int status = 0;
307+
if (waitpid(pid_, &status, WNOHANG) == pid_) {
251308
std::cerr << "Server process died during startup\n";
252309
return false;
253310
}
@@ -257,6 +314,7 @@ class ServerProcess {
257314
}
258315

259316
pid_t pid_;
317+
pid_t pgid_;
260318
int port_;
261319
std::string tls_root_certs_;
262320
std::string tls_client_cert_;
@@ -516,7 +574,7 @@ class DefaultServerTests : public GrpcIntegrationTestBase {
516574

517575
static void TearDownTestSuite()
518576
{
519-
if (s_server_) s_server_->stop();
577+
if (s_server_) EXPECT_TRUE(s_server_->stop());
520578
s_server_.reset();
521579
}
522580

@@ -1083,7 +1141,7 @@ class ChunkedUploadTests : public GrpcIntegrationTestBase {
10831141

10841142
static void TearDownTestSuite()
10851143
{
1086-
if (s_server_) s_server_->stop();
1144+
if (s_server_) EXPECT_TRUE(s_server_->stop());
10871145
s_server_.reset();
10881146
}
10891147

@@ -1399,7 +1457,7 @@ class PathSelectionTests : public GrpcIntegrationTestBase {
13991457

14001458
static void TearDownTestSuite()
14011459
{
1402-
if (s_server_) s_server_->stop();
1460+
if (s_server_) EXPECT_TRUE(s_server_->stop());
14031461
s_server_.reset();
14041462
}
14051463

@@ -1583,7 +1641,7 @@ TEST_F(PathSelectionTests, UnaryUploadMIPWithPathLogging)
15831641
class ErrorRecoveryTests : public GrpcIntegrationTestBase {
15841642
protected:
15851643
void SetUp() override { port_ = get_test_port(); }
1586-
void TearDown() override { server_.stop(); }
1644+
void TearDown() override { EXPECT_TRUE(server_.stop()); }
15871645

15881646
bool start_server(const std::vector<std::string>& extra_args = {})
15891647
{
@@ -1602,7 +1660,7 @@ TEST_F(ErrorRecoveryTests, ClientReconnectsAfterServerRestart)
16021660
auto status_before = client->check_status("test-job");
16031661
EXPECT_TRUE(status_before.success);
16041662

1605-
server_.stop();
1663+
ASSERT_TRUE(server_.stop());
16061664
EXPECT_FALSE(server_.is_running());
16071665

16081666
auto status_down = client->check_status("test-job");
@@ -1630,7 +1688,7 @@ TEST_F(ErrorRecoveryTests, ClientHandlesServerCrashDuringSolve)
16301688
ASSERT_TRUE(submit_result.success);
16311689

16321690
std::this_thread::sleep_for(std::chrono::milliseconds(500));
1633-
server_.stop();
1691+
EXPECT_TRUE(server_.stop());
16341692

16351693
auto status_result = client->check_status(submit_result.job_id);
16361694
EXPECT_FALSE(status_result.success);
@@ -1694,7 +1752,7 @@ TEST_F(ErrorRecoveryTests, ChunkedUploadAfterServerRestart)
16941752
auto result1 = client->solve_mip(problem, settings, false);
16951753
EXPECT_TRUE(result1.success) << result1.error_message;
16961754

1697-
server_.stop();
1755+
ASSERT_TRUE(server_.stop());
16981756
ASSERT_TRUE(start_server({"--max-message-mb", "256"}));
16991757

17001758
auto client2 = create_client(config);
@@ -1745,7 +1803,7 @@ class TlsServerTests : public GrpcIntegrationTestBase {
17451803

17461804
static void TearDownTestSuite()
17471805
{
1748-
if (s_server_) s_server_->stop();
1806+
if (s_server_) EXPECT_TRUE(s_server_->stop());
17491807
s_server_.reset();
17501808
}
17511809

@@ -1846,7 +1904,7 @@ class MtlsServerTests : public GrpcIntegrationTestBase {
18461904

18471905
static void TearDownTestSuite()
18481906
{
1849-
if (s_server_) s_server_->stop();
1907+
if (s_server_) EXPECT_TRUE(s_server_->stop());
18501908
s_server_.reset();
18511909
}
18521910

@@ -1917,7 +1975,7 @@ class ChunkValidationTests : public GrpcIntegrationTestBase {
19171975

19181976
static void TearDownTestSuite()
19191977
{
1920-
if (s_server_) s_server_->stop();
1978+
if (s_server_) EXPECT_TRUE(s_server_->stop());
19211979
s_server_.reset();
19221980
}
19231981

0 commit comments

Comments
 (0)