2929#include < gtest/gtest.h>
3030
3131#include < atomic>
32+ #include < cerrno>
3233#include < cmath>
3334#include < filesystem>
3435#include < fstream>
5455
5556#include < fcntl.h>
5657#include < signal.h>
58+ #include < sys/prctl.h>
5759#include < sys/types.h>
5860#include < sys/wait.h>
5961#include < unistd.h>
@@ -71,14 +73,26 @@ using cuopt::mathematical_optimization::testing::GrpcTestLogCapture;
7173
7274namespace {
7375
74- // =============================================================================
75- // Server Process Manager
76- // =============================================================================
76+ // GRPC_INTEGRATION_TEST
77+ // `-- cuopt_grpc_server parent (pid_, leads process group pgid_ == pid_)
78+ // `-- GPU worker (inherits pgid_)
79+ //
80+ // The server is forked into its own process group so stop() can signal the whole
81+ // group. Killing the server parent orphans its GPU worker, so this test process
82+ // also marks itself a subreaper (PR_SET_CHILD_SUBREAPER): the orphan reparents
83+ // here and stop() reaps the entire group, even inside a container whose PID 1
84+ // does not reap. Polling kill(-pgid_, 0) instead is not enough -- it counts an
85+ // unreaped zombie as a live member and can never observe a grandchild's death.
7786
7887class ServerProcess {
7988 public:
80- ServerProcess () : pid_(-1 ), port_(0 ) {}
81- ~ServerProcess () { stop (); }
89+ ServerProcess () : pid_(-1 ), pgid_(-1 ), port_(0 ) {}
90+ ServerProcess (const ServerProcess&) = delete ;
91+ ServerProcess& operator =(const ServerProcess&) = delete ;
92+ ~ServerProcess ()
93+ {
94+ if (!stop ()) { std::cerr << " Failed to clean up test server process group\n " ; }
95+ }
8296
8397 void set_tls_config (const std::string& root_certs,
8498 const std::string& client_cert = " " ,
@@ -91,21 +105,31 @@ class ServerProcess {
91105
92106 bool start (int port, const std::vector<std::string>& extra_args = {})
93107 {
94- port_ = port;
108+ if (pid_ > 0 ) {
109+ std::cerr << " Cannot reuse a ServerProcess while it still owns a process lifecycle\n " ;
110+ return false ;
111+ }
95112
96113 std::string server_path = find_server_binary ();
97114 if (server_path.empty ()) {
98115 std::cerr << " Could not find cuopt_grpc_server binary\n " ;
99116 return false ;
100117 }
101118
102- pid_ = fork ();
119+ // Reparent any process orphaned by killing the server (i.e. the GPU worker)
120+ // onto this test process so stop() can reap it without relying on init.
121+ prctl (PR_SET_CHILD_SUBREAPER , 1 , 0 , 0 , 0 );
122+
123+ port_ = port;
124+ pid_ = fork ();
103125 if (pid_ < 0 ) {
104126 std::cerr << " fork() failed\n " ;
105127 return false ;
106128 }
107129
108130 if (pid_ == 0 ) {
131+ setpgid (0 , 0 ); // child leads its own group; parent mirrors this below
132+
109133 std::vector<const char *> args;
110134 args.push_back (server_path.c_str ());
111135 args.push_back (" --port" );
@@ -131,30 +155,37 @@ class ServerProcess {
131155 _exit (127 );
132156 }
133157
134- return wait_for_ready (15000 );
158+ // Mirror the child's setpgid() so the group is established regardless of which
159+ // process runs first; the child is a fresh fork, so its group id is its pid.
160+ setpgid (pid_, pid_);
161+ pgid_ = pid_;
162+
163+ if (!wait_for_ready (15000 )) {
164+ if (!stop ()) { std::cerr << " Failed to clean up server after readiness failure\n " ; }
165+ return false ;
166+ }
167+
168+ return true ;
135169 }
136170
137- void stop ()
171+ bool stop ()
138172 {
139- if (pid_ > 0 ) {
140- kill (pid_, SIGTERM );
141-
142- int status;
143- int wait_ms = 0 ;
144- while (wait_ms < 5000 ) {
145- int ret = waitpid (pid_, &status, WNOHANG );
146- if (ret != 0 ) break ;
147- std::this_thread::sleep_for (std::chrono::milliseconds (100 ));
148- wait_ms += 100 ;
149- }
150-
151- if (waitpid (pid_, &status, WNOHANG ) == 0 ) {
152- kill (pid_, SIGKILL );
153- waitpid (pid_, &status, 0 );
173+ if (pid_ <= 0 ) return true ;
174+
175+ // Ask the whole group (server + GPU worker) to exit gracefully, then reap
176+ // every member. Killing the server parent can orphan a mid-solve worker;
177+ // because we are a subreaper it reparents here and reap_group() collects it.
178+ kill (-pgid_, SIGTERM );
179+ if (!reap_group (std::chrono::seconds (15 ))) {
180+ kill (-pgid_, SIGKILL );
181+ if (!reap_group (std::chrono::seconds (15 ))) {
182+ std::cerr << " Server process group " << pgid_ << " did not exit\n " ;
183+ return false ;
154184 }
155-
156- pid_ = -1 ;
157185 }
186+
187+ clear_lifecycle_state ();
188+ return true ;
158189 }
159190
160191 int port () const { return port_; }
@@ -172,6 +203,32 @@ class ServerProcess {
172203 }
173204
174205 private:
206+ void clear_lifecycle_state ()
207+ {
208+ pid_ = -1 ;
209+ pgid_ = -1 ;
210+ port_ = 0 ;
211+ }
212+
213+ // Reap every member of the server's process group, returning true once the
214+ // group is fully drained. start() makes this process a subreaper, so a worker
215+ // orphaned when the server parent dies reparents here and is reaped too --
216+ // first the parent, then (once its death reparents the worker) the worker.
217+ bool reap_group (std::chrono::milliseconds timeout)
218+ {
219+ auto deadline = std::chrono::steady_clock::now () + timeout;
220+ while (true ) {
221+ int status = 0 ;
222+ pid_t ret = waitpid (-pgid_, &status, WNOHANG );
223+ if (ret > 0 ) continue ; // reaped a member; keep draining
224+ if (ret < 0 && errno == EINTR ) continue ;
225+ if (ret < 0 && errno == ECHILD ) return true ; // no members left
226+ // ret == 0: members remain but none have exited yet.
227+ if (std::chrono::steady_clock::now () >= deadline) return false ;
228+ std::this_thread::sleep_for (std::chrono::milliseconds (10 ));
229+ }
230+ }
231+
175232 std::string find_in_path (const std::string& name)
176233 {
177234 const char * path_env = std::getenv (" PATH" );
@@ -246,8 +303,8 @@ class ServerProcess {
246303 grpc_client_t client (config);
247304 if (client.connect ()) { return true ; }
248305
249- int status;
250- if (waitpid (pid_, &status, WNOHANG ) != 0 ) {
306+ int status = 0 ;
307+ if (waitpid (pid_, &status, WNOHANG ) == pid_ ) {
251308 std::cerr << " Server process died during startup\n " ;
252309 return false ;
253310 }
@@ -257,6 +314,7 @@ class ServerProcess {
257314 }
258315
259316 pid_t pid_;
317+ pid_t pgid_;
260318 int port_;
261319 std::string tls_root_certs_;
262320 std::string tls_client_cert_;
@@ -516,7 +574,7 @@ class DefaultServerTests : public GrpcIntegrationTestBase {
516574
517575 static void TearDownTestSuite ()
518576 {
519- if (s_server_) s_server_->stop ();
577+ if (s_server_) EXPECT_TRUE ( s_server_->stop () );
520578 s_server_.reset ();
521579 }
522580
@@ -1083,7 +1141,7 @@ class ChunkedUploadTests : public GrpcIntegrationTestBase {
10831141
10841142 static void TearDownTestSuite ()
10851143 {
1086- if (s_server_) s_server_->stop ();
1144+ if (s_server_) EXPECT_TRUE ( s_server_->stop () );
10871145 s_server_.reset ();
10881146 }
10891147
@@ -1399,7 +1457,7 @@ class PathSelectionTests : public GrpcIntegrationTestBase {
13991457
14001458 static void TearDownTestSuite ()
14011459 {
1402- if (s_server_) s_server_->stop ();
1460+ if (s_server_) EXPECT_TRUE ( s_server_->stop () );
14031461 s_server_.reset ();
14041462 }
14051463
@@ -1583,7 +1641,7 @@ TEST_F(PathSelectionTests, UnaryUploadMIPWithPathLogging)
15831641class ErrorRecoveryTests : public GrpcIntegrationTestBase {
15841642 protected:
15851643 void SetUp () override { port_ = get_test_port (); }
1586- void TearDown () override { server_.stop (); }
1644+ void TearDown () override { EXPECT_TRUE ( server_.stop () ); }
15871645
15881646 bool start_server (const std::vector<std::string>& extra_args = {})
15891647 {
@@ -1602,7 +1660,7 @@ TEST_F(ErrorRecoveryTests, ClientReconnectsAfterServerRestart)
16021660 auto status_before = client->check_status (" test-job" );
16031661 EXPECT_TRUE (status_before.success );
16041662
1605- server_.stop ();
1663+ ASSERT_TRUE ( server_.stop () );
16061664 EXPECT_FALSE (server_.is_running ());
16071665
16081666 auto status_down = client->check_status (" test-job" );
@@ -1630,7 +1688,7 @@ TEST_F(ErrorRecoveryTests, ClientHandlesServerCrashDuringSolve)
16301688 ASSERT_TRUE (submit_result.success );
16311689
16321690 std::this_thread::sleep_for (std::chrono::milliseconds (500 ));
1633- server_.stop ();
1691+ EXPECT_TRUE ( server_.stop () );
16341692
16351693 auto status_result = client->check_status (submit_result.job_id );
16361694 EXPECT_FALSE (status_result.success );
@@ -1694,7 +1752,7 @@ TEST_F(ErrorRecoveryTests, ChunkedUploadAfterServerRestart)
16941752 auto result1 = client->solve_mip (problem, settings, false );
16951753 EXPECT_TRUE (result1.success ) << result1.error_message ;
16961754
1697- server_.stop ();
1755+ ASSERT_TRUE ( server_.stop () );
16981756 ASSERT_TRUE (start_server ({" --max-message-mb" , " 256" }));
16991757
17001758 auto client2 = create_client (config);
@@ -1745,7 +1803,7 @@ class TlsServerTests : public GrpcIntegrationTestBase {
17451803
17461804 static void TearDownTestSuite ()
17471805 {
1748- if (s_server_) s_server_->stop ();
1806+ if (s_server_) EXPECT_TRUE ( s_server_->stop () );
17491807 s_server_.reset ();
17501808 }
17511809
@@ -1846,7 +1904,7 @@ class MtlsServerTests : public GrpcIntegrationTestBase {
18461904
18471905 static void TearDownTestSuite ()
18481906 {
1849- if (s_server_) s_server_->stop ();
1907+ if (s_server_) EXPECT_TRUE ( s_server_->stop () );
18501908 s_server_.reset ();
18511909 }
18521910
@@ -1917,7 +1975,7 @@ class ChunkValidationTests : public GrpcIntegrationTestBase {
19171975
19181976 static void TearDownTestSuite ()
19191977 {
1920- if (s_server_) s_server_->stop ();
1978+ if (s_server_) EXPECT_TRUE ( s_server_->stop () );
19211979 s_server_.reset ();
19221980 }
19231981
0 commit comments