Skip to content

Commit 3009b24

Browse files
authored
Merge branch 'pg--cat-1551-update-components-semantic-tokens' into pg--cat-1659-update-semantic-tokens-shared-folder
2 parents e9edb95 + 210684d commit 3009b24

66 files changed

Lines changed: 4019 additions & 574 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.
Lines changed: 229 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,229 @@
1+
package config;
2+
3+
import akka.Done;
4+
import akka.actor.CoordinatedShutdown;
5+
import com.google.inject.AbstractModule;
6+
import com.google.inject.Inject;
7+
import com.typesafe.config.Config;
8+
import java.io.IOException;
9+
import java.util.concurrent.CompletableFuture;
10+
import java.util.concurrent.atomic.AtomicBoolean;
11+
import javax.inject.Provider;
12+
import javax.inject.Singleton;
13+
import org.apache.http.impl.client.CloseableHttpClient;
14+
import org.slf4j.Logger;
15+
import org.slf4j.LoggerFactory;
16+
17+
/**
18+
* Guice module managing graceful shutdown for the DataHub Frontend pod.
19+
*
20+
* <p>When enabled via FRONTEND_GRACEFUL_SHUTDOWN_ENABLED, this module coordinates with Akka's
21+
* CoordinatedShutdown to gracefully handle Kubernetes pod termination:
22+
*
23+
* <ol>
24+
* <li>AWS Spot/K8s sends SIGTERM to the pod
25+
* <li>Akka CoordinatedShutdown triggers, calling registered phase tasks
26+
* <li>In before-service-unbind phase: FrontendShutdownHook flips isShuttingDown flag to true
27+
* <li>The health check endpoint reads this flag and returns 503, signaling to the LB that the pod is no longer ready
28+
* <li>Load balancer stops routing new requests and drains existing connections
29+
* <li>Remaining phases (service-requests-done: 65s, service-stop: 15s) allow graceful cleanup
30+
* <li>SIGKILL arrives after terminationGracePeriodSeconds (120s) if not already terminated
31+
* </ol>
32+
*
33+
* <p>Thread-safe via AtomicBoolean to prevent race conditions between shutdown hook and concurrent
34+
* request handling.
35+
*
36+
* <p>Disabled by default (FRONTEND_GRACEFUL_SHUTDOWN_ENABLED=false) for backward compatibility.
37+
* When disabled, a no-op FrontendShutdownHook is instantiated, ensuring graceful shutdown can be
38+
* toggled without code changes.
39+
*/
40+
@Singleton
41+
public class GracefulShutdownModule extends AbstractModule {
42+
43+
/**
44+
* Flag indicating if the service is shutting down.
45+
*
46+
* <p>Set to true by FrontendShutdownHook during the before-service-unbind phase of Akka
47+
* CoordinatedShutdown. Read by HealthCheckController to return 503 responses, signaling
48+
* Kubernetes that the pod should stop receiving new requests.
49+
*
50+
* <p>Uses AtomicBoolean for thread-safe reads/writes from concurrent request handlers and the
51+
* shutdown hook.
52+
*/
53+
private final AtomicBoolean isShuttingDown = new AtomicBoolean(false);
54+
55+
@Override
56+
protected void configure() {
57+
bind(GracefulShutdownModule.class).toInstance(this);
58+
bind(FrontendShutdownHook.class)
59+
.toProvider(FrontendShutdownHookProvider.class)
60+
.asEagerSingleton();
61+
}
62+
63+
/**
64+
* Checks if the service is currently shutting down.
65+
*
66+
* <p>This is called by HealthCheckController to determine if the readiness probe should return
67+
* 503 (Service Unavailable). Non-blocking atomic read.
68+
*
69+
* @return true if shutdown has been initiated, false otherwise
70+
*/
71+
public boolean isShuttingDown() {
72+
return isShuttingDown.get();
73+
}
74+
75+
/**
76+
* Marks the service as shutting down (one-way operation for production use).
77+
*
78+
* <p>Called by FrontendShutdownHook during the before-service-unbind phase of Akka
79+
* CoordinatedShutdown to signal that the service is shutting down and should not accept new
80+
* requests.
81+
*
82+
* <p>This is a one-way latch that only transitions from false→true, matching the GMS handler
83+
* pattern (GracefulShutdownHandler.onApplicationClosed). This prevents external code from
84+
* reversing a shutdown state.
85+
*/
86+
public void markShuttingDown() {
87+
isShuttingDown.set(true);
88+
}
89+
90+
/**
91+
* Sets the shutdown flag to an arbitrary value.
92+
*
93+
* <p><b>@VisibleForTesting</b> — Used by tests to reset state between test runs. This method
94+
* exists only for test cleanup, allowing tests to verify behavior in both shutdown and running
95+
* states.
96+
*
97+
* <p>Production code should use {@link #markShuttingDown()} instead, which enforces one-way
98+
* semantics and prevents accidental state reversals.
99+
*
100+
* @param value true to mark service as shutting down, false to reset state (tests only)
101+
*/
102+
public void setShuttingDown(boolean value) {
103+
isShuttingDown.set(value);
104+
}
105+
106+
/**
107+
* Provider factory for FrontendShutdownHook that conditionally enables graceful shutdown.
108+
*
109+
* <p>This indirection allows graceful shutdown to be toggled via configuration (
110+
* FRONTEND_GRACEFUL_SHUTDOWN_ENABLED) without code changes or complex conditional bean
111+
* definitions. Returns either an active hook (when enabled) or a no-op hook (when disabled).
112+
*
113+
* <p><b>Design Pattern (No-Op Construction)</b>: When disabled, a FrontendShutdownHook is still
114+
* instantiated but with null arguments. The constructor's null-check (line 171) detects this and
115+
* skips task registration, creating a safe no-op. This avoids the complexity of conditional bean
116+
* definitions while maintaining clear intent through documentation.
117+
*/
118+
@Singleton
119+
public static class FrontendShutdownHookProvider implements Provider<FrontendShutdownHook> {
120+
121+
private final Config config;
122+
private final CoordinatedShutdown coordinatedShutdown;
123+
private final CloseableHttpClient httpClient;
124+
private final GracefulShutdownModule module;
125+
126+
@Inject
127+
public FrontendShutdownHookProvider(
128+
Config config,
129+
CoordinatedShutdown coordinatedShutdown,
130+
CloseableHttpClient httpClient,
131+
GracefulShutdownModule module) {
132+
this.config = config;
133+
this.coordinatedShutdown = coordinatedShutdown;
134+
this.httpClient = httpClient;
135+
this.module = module;
136+
}
137+
138+
@Override
139+
public FrontendShutdownHook get() {
140+
if (config.getBoolean("frontend.graceful_shutdown_enabled")) {
141+
return new FrontendShutdownHook(coordinatedShutdown, httpClient, module);
142+
}
143+
// Return a no-op hook if graceful shutdown is disabled.
144+
// Passing null for coordinatedShutdown signals the constructor (see line 171)
145+
// to skip all task registration, making this hook a no-op.
146+
// This is the no-op construction pattern — allows feature toggle without code changes.
147+
return new FrontendShutdownHook(null, null, module);
148+
}
149+
}
150+
151+
/**
152+
* Registers Akka CoordinatedShutdown phase tasks to gracefully shut down the frontend.
153+
*
154+
* <p>Akka CoordinatedShutdown provides a multi-phase shutdown mechanism that coordinates resource
155+
* cleanup with the termination signal. The phases execute in order:
156+
*
157+
* <ol>
158+
* <li><b>before-service-unbind (10s timeout)</b>: Signal intent to clients (e.g., via status
159+
* pages, server header). Sets isShuttingDown=true so HealthCheckController returns 503.
160+
* <li><b>service-requests-done (65s timeout)</b>: Wait for in-flight HTTP requests to complete.
161+
* Play Framework drains the HTTP server during this phase.
162+
* <li><b>service-stop (15s timeout)</b>: Cleanup remaining resources (WebSocket clients, HTTP
163+
* caches, etc.).
164+
* </ol>
165+
*
166+
* <p><b>Timing Budget Analysis</b>: Akka phases total 10 + 65 + 15 = 90 seconds. Kubernetes
167+
* terminationGracePeriodSeconds is set to 120s. The preStop hook (70s sleep) and Akka shutdown
168+
* overlap after SIGTERM, so the effective maximum is max(70, 90) = 90 seconds, safely within the
169+
* 120s K8s budget. AWS Spot instances provide 120 seconds before SIGKILL.
170+
*
171+
* <p><b>No-op behavior</b>: If coordinatedShutdown is null (feature disabled), no tasks are
172+
* registered, and the hook becomes a no-op. This allows graceful shutdown to be toggled via
173+
* configuration alone.
174+
*/
175+
@Singleton
176+
public static class FrontendShutdownHook {
177+
178+
private static final Logger log = LoggerFactory.getLogger(FrontendShutdownHook.class);
179+
private final CoordinatedShutdown coordinatedShutdown;
180+
private final CloseableHttpClient httpClient;
181+
private final GracefulShutdownModule module;
182+
183+
public FrontendShutdownHook(
184+
CoordinatedShutdown coordinatedShutdown,
185+
CloseableHttpClient httpClient,
186+
GracefulShutdownModule module) {
187+
this.coordinatedShutdown = coordinatedShutdown;
188+
this.httpClient = httpClient;
189+
this.module = module;
190+
191+
if (coordinatedShutdown != null) {
192+
// Phase 1: before-service-unbind (10s default timeout)
193+
// Signal clients that shutdown is starting and flip the readiness flag.
194+
// HealthCheckController reads isShuttingDown and returns 503 (Service Unavailable),
195+
// which Kubernetes interprets as readiness failure. The load balancer stops routing
196+
// new traffic while existing connections are allowed to drain.
197+
coordinatedShutdown.addTask(
198+
CoordinatedShutdown.PhaseBeforeServiceUnbind(),
199+
"mark-unhealthy",
200+
() ->
201+
CompletableFuture.runAsync(
202+
() -> {
203+
log.info("Frontend shutdown initiated - stopping new connections soon");
204+
module.markShuttingDown();
205+
})
206+
.thenApply(v -> Done.done()));
207+
208+
// Phase 3: service-stop (15s default timeout)
209+
// After in-flight requests drain, close long-lived connections (HTTP client for GMS calls).
210+
coordinatedShutdown.addTask(
211+
CoordinatedShutdown.PhaseServiceStop(),
212+
"close-http-clients",
213+
() ->
214+
CompletableFuture.runAsync(
215+
() -> {
216+
try {
217+
log.info("Frontend shutdown initiated - shutting down open resources");
218+
if (httpClient != null) {
219+
httpClient.close();
220+
}
221+
} catch (IOException e) {
222+
log.error("Error closing CloseableHttpClient during shutdown", e);
223+
}
224+
})
225+
.thenApply(v -> Done.done()));
226+
}
227+
}
228+
}
229+
}

datahub-frontend/app/controllers/Application.java

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import com.linkedin.metadata.utils.BasePathUtils;
1313
import com.linkedin.util.Pair;
1414
import com.typesafe.config.Config;
15+
import config.GracefulShutdownModule;
1516
import java.io.InputStream;
1617
import java.net.URI;
1718
import java.net.http.HttpClient;
@@ -50,16 +51,22 @@ public class Application extends Controller {
5051

5152
private final Config config;
5253
private final Environment environment;
54+
private final GracefulShutdownModule shutdownModule;
5355

5456
private final String basePath;
5557
private final String gaTrackingId;
5658
private final List<String> streamingPathPrefixes;
5759

5860
@Inject
59-
public Application(HttpClient httpClient, Environment environment, @Nonnull Config config) {
61+
public Application(
62+
HttpClient httpClient,
63+
Environment environment,
64+
@Nonnull Config config,
65+
GracefulShutdownModule shutdownModule) {
6066
this.httpClient = httpClient;
6167
this.config = config;
6268
this.environment = environment;
69+
this.shutdownModule = shutdownModule;
6370
this.basePath = config.getString("datahub.basePath");
6471
this.gaTrackingId =
6572
config.hasPath("analytics.google.tracking.id")
@@ -122,6 +129,9 @@ private Result serveAsset(@Nullable String path) {
122129

123130
@Nonnull
124131
public Result healthcheck() {
132+
if (shutdownModule.isShuttingDown()) {
133+
return status(SERVICE_UNAVAILABLE, "Shutting down");
134+
}
125135
return ok("GOOD");
126136
}
127137

datahub-frontend/conf/application.conf

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ play.modules.disabled += "play.api.mvc.CookiesModule"
2727
play.modules.enabled += "auth.cookie.CustomCookiesModule"
2828
play.modules.enabled += "auth.AuthModule"
2929
play.modules.enabled += "modules.StartupModule"
30+
play.modules.enabled += "config.GracefulShutdownModule"
3031

3132
# Debug configuration dumping, if set to true might log secrets too.
3233
# Do not enable Akka log-config-on-start (CVE-2023-45865: env vars could be logged).
@@ -321,3 +322,37 @@ play.server.https.keyStore {
321322
type = ${?PLAY_HTTPS_KEYSTORE_TYPE}
322323
password = ${?PLAY_HTTPS_KEYSTORE_PASSWORD}
323324
}
325+
326+
# Graceful shutdown settings
327+
# Enable/disable graceful shutdown hook for marking service as unhealthy during shutdown
328+
# When enabled, the frontend will flip its readiness flag on SIGTERM, allowing Kubernetes
329+
# to drain existing connections while rejecting new requests.
330+
# Default: false (backward compatible, graceful shutdown must be explicitly enabled)
331+
frontend.graceful_shutdown_enabled = false
332+
frontend.graceful_shutdown_enabled = ${?FRONTEND_GRACEFUL_SHUTDOWN_ENABLED}
333+
334+
akka.coordinated-shutdown.phases {
335+
# Akka CoordinatedShutdown phases execute sequentially: before-service-unbind → service-requests-done → service-stop
336+
# Total duration: 10s + 65s + 15s = 90 seconds
337+
#
338+
# Timing budget analysis:
339+
# - NOTE(review): Kubernetes runs the preStop hook (70s sleep) to completion BEFORE sending SIGTERM, so it does not overlap with Akka shutdown (90s total)
340+
# - The grace period countdown starts before preStop runs: worst case is 70s + 90s = 160s, which exceeds K8s terminationGracePeriodSeconds (120s) - confirm
341+
# - AWS Spot instance termination window: 120s before SIGKILL
342+
# - Buffer: after a 70s preStop only ~50s of the 120s budget remain for Akka; phases still running at 120s are cut off by SIGKILL
343+
344+
# Phase 1: before-service-unbind (signal clients, flip readiness flag)
345+
# time to signal clients before connections close
346+
before-service-unbind.timeout = 10s
347+
before-service-unbind.timeout = ${?FRONTEND_BEFORE_SERVICE_UNBIND_TIMEOUT}
348+
349+
# Phase 2: service-requests-done (drain in-flight HTTP requests)
350+
# time for in-flight HTTP requests to drain — equivalent to Spring's timeout-per-shutdown-phase
351+
service-requests-done.timeout = 65s
352+
service-requests-done.timeout = ${?FRONTEND_SERVICE_REQUESTS_DONE_TIMEOUT}
353+
354+
# Phase 3: service-stop (cleanup resources)
355+
# time for cleanup tasks (WebSocket clients, HTTP caches, etc.)
356+
service-stop.timeout = 15s
357+
service-stop.timeout = ${?FRONTEND_SERVICE_STOP_TIMEOUT}
358+
}

datahub-frontend/test/app/ApplicationTest.java

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import static org.mockito.Mockito.*;
88
import static play.mvc.Http.Status.MOVED_PERMANENTLY;
99
import static play.mvc.Http.Status.OK;
10+
import static play.mvc.Http.Status.SERVICE_UNAVAILABLE;
1011
import static play.test.Helpers.fakeRequest;
1112
import static play.test.Helpers.route;
1213

@@ -24,6 +25,7 @@
2425
import com.nimbusds.jwt.JWT;
2526
import com.nimbusds.jwt.JWTClaimsSet;
2627
import com.nimbusds.jwt.JWTParser;
28+
import config.GracefulShutdownModule;
2729
import controllers.routes;
2830
import java.io.IOException;
2931
import java.net.HttpURLConnection;
@@ -1466,6 +1468,25 @@ public void testIndexWhenResourceNotFound() {
14661468
assertEquals("no-cache", result.headers().get("Cache-Control"));
14671469
}
14681470

1471+
@Test
1472+
public void testHealthCheckReturns503WhenShuttingDown() {
1473+
// Get the singleton GracefulShutdownModule instance from the app injector
1474+
GracefulShutdownModule shutdownModule = app.injector().instanceOf(GracefulShutdownModule.class);
1475+
// Set shutdown flag via instance method
1476+
shutdownModule.setShuttingDown(true);
1477+
1478+
try {
1479+
Http.RequestBuilder request = fakeRequest(Helpers.GET, "/health");
1480+
Result result = route(app, request);
1481+
assertEquals(SERVICE_UNAVAILABLE, result.status());
1482+
String content = Helpers.contentAsString(result);
1483+
assertEquals("Shutting down", content);
1484+
} finally {
1485+
// Reset the shutdown flag after test
1486+
shutdownModule.setShuttingDown(false);
1487+
}
1488+
}
1489+
14691490
/**
14701491
* Test module that provides a mock Application controller that simulates resource loading failure
14711492
*/
@@ -1486,7 +1507,10 @@ protected controllers.Application provideFailingApplicationController(
14861507
Environment mockEnvironment = mock(Environment.class);
14871508
when(mockEnvironment.resourceAsStream("public/index.html")).thenReturn(null);
14881509

1489-
return new controllers.Application(mockHttpClient, mockEnvironment, config);
1510+
// Mock GracefulShutdownModule for the test
1511+
GracefulShutdownModule mockShutdownModule = mock(GracefulShutdownModule.class);
1512+
return new controllers.Application(
1513+
mockHttpClient, mockEnvironment, config, mockShutdownModule);
14901514
}
14911515

14921516
@Provides

0 commit comments

Comments
 (0)