fix(api): restore HTTP GET /health endpoint (#25234)

pront · claude · web-flow · commit 1c70988b5415 · 2026-04-21T17:00:33.000Z
* refactor(api): serve gRPC via hyper + axum router Convert tonic's Server to an axum Router via into_router(), then serve over the same TcpListener via hyper::Server. Enables HTTP/1.1 acceptance so additional HTTP routes can be added alongside gRPC on the same port. Behavior-preserving. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> * fix(api): restore HTTP GET /health endpoint Re-expose the HTTP health endpoint that was removed as part of the GraphQL-to-gRPC migration (#24364). The endpoint matches the pre-0.55 response shape: 200 with body {"ok": true} while serving and 503 with body {"ok": false} after set_not_serving() is called during drain. HEAD is also handled. gRPC clients continue to use grpc.health.v1.Health/Check; both probes now share the same serving state so they agree during shutdown. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> * test(api): cover HTTP GET/HEAD /health endpoint Adds two integration tests hitting the restored HTTP health endpoint via reqwest: - GET returns 200 with body {"ok":true} - HEAD returns 200 Exposes harness.api_port() so tests can reach the API port directly. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> * docs(api): document HTTP /health endpoint Document the HTTP GET/HEAD /health endpoint served alongside the gRPC API, framed as compatibility with Vector 0.54.0 and earlier. Updates the reference endpoints schema to allow HEAD, adds HEAD/GET entries for /health in api.cue with 200/503 responses, and adds a curl example to the API reference page. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> * style(api): apply cargo fmt Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/src/api/grpc_server.rs b/src/api/grpc_server.rs
@@ -1,17 +1,36 @@
-use std::{error::Error as StdError, net::SocketAddr};
+use std::{
+    error::Error as StdError,
+    net::SocketAddr,
+    sync::{
+        Arc,
+        atomic::{AtomicBool, Ordering},
+    },
+};
+
+use axum::{
+    Router,
+    extract::State,
+    http::{StatusCode, header},
+    response::IntoResponse,
+    routing::get,
+};
 use tokio::sync::oneshot;
-use tokio_stream::wrappers::TcpListenerStream;
 use tonic::transport::Server as TonicServer;
 use tonic_health::server::{HealthReporter, health_reporter};
 use vector_lib::tap::topology::WatchRx;
 
 use super::grpc::ObservabilityService;
 use crate::{config::Config, proto::observability::Server as ObservabilityServer};
 
+/// Shared flag backing the HTTP `/health` endpoint. Mirrors the gRPC
+/// `HealthReporter` serving status so HTTP and gRPC probes agree.
+type ServingState = Arc<AtomicBool>;
+
 /// gRPC API server for Vector observability.
 pub struct GrpcServer {
     _shutdown: oneshot::Sender<()>,
     health_reporter: HealthReporter,
+    serving: ServingState,
     addr: SocketAddr,
 }
 
@@ -46,12 +65,22 @@ impl GrpcServer {
         // The empty service ("") is registered as SERVING by default.
         let (health_reporter, health_service) = health_reporter();
 
+        let serving: ServingState = Arc::new(AtomicBool::new(true));
+
         let (_shutdown, rx) = oneshot::channel();
 
+        // Convert the tokio TcpListener into a std listener for hyper's Server.
+        let std_listener = listener
+            .into_std()
+            .map_err(|e| crate::Error::from(format!("Failed to convert TCP listener: {}", e)))?;
+        std_listener.set_nonblocking(true).map_err(|e| {
+            crate::Error::from(format!("Failed to set TCP listener non-blocking: {}", e))
+        })?;
+
+        let router_serving = Arc::clone(&serving);
+
         // Spawn the server with the already-bound listener
         tokio::spawn(async move {
-            let incoming = TcpListenerStream::new(listener);
-
             // Build reflection service for tools like grpcurl
             let reflection_service = tonic_reflection::server::Builder::configure()
                 .register_encoded_file_descriptor_set(
@@ -61,11 +90,21 @@ impl GrpcServer {
                 .build()
                 .expect("Failed to build reflection service");
 
-            let result = TonicServer::builder()
+            // Build the tonic router (gRPC services) and merge with the HTTP router
+            // so both protocols share the same port. `accept_http1(true)` lets plain
+            // HTTP/1.1 requests reach the merged axum routes.
+            let router = TonicServer::builder()
+                .accept_http1(true)
                 .add_service(health_service)
                 .add_service(ObservabilityServer::new(service))
                 .add_service(reflection_service)
-                .serve_with_incoming_shutdown(incoming, async {
+                .into_router()
+                .merge(http_router(router_serving));
+
+            let result = hyper::Server::from_tcp(std_listener)
+                .expect("Failed to build HTTP server from TCP listener")
+                .serve(router.into_make_service())
+                .with_graceful_shutdown(async {
                     rx.await.ok();
                     info!("GRPC API server shutting down.");
                 })
@@ -86,16 +125,18 @@ impl GrpcServer {
         Ok(Self {
             _shutdown,
             health_reporter,
+            serving,
             addr: actual_addr,
         })
     }
 
     /// Signal that the server is no longer serving.
     ///
     /// Call this **before** draining the topology so that Kubernetes gRPC
-    /// readiness probes fail early and the pod is removed from endpoints
-    /// before the process exits.
+    /// readiness probes and HTTP `/health` probes fail early and the pod is
+    /// removed from endpoints before the process exits.
     pub async fn set_not_serving(&mut self) {
+        self.serving.store(false, Ordering::Relaxed);
         self.health_reporter
             .set_service_status("", tonic_health::ServingStatus::NotServing)
             .await;
@@ -106,3 +147,31 @@ impl GrpcServer {
         self.addr
     }
 }
+
+/// Builds the plain-HTTP side of the API: a router with a single `/health`
+/// route answering both `GET` and `HEAD` through [`health_handler`].
+///
+/// The handler replies `200 {"ok":true}` while the shared flag is set and
+/// `503 {"ok":false}` after [`GrpcServer::set_not_serving`] clears it, the
+/// same response shape HTTP probes relied on before the gRPC migration.
+fn http_router(state: ServingState) -> Router {
+    // One method router serves both verbs with the same handler.
+    let health = get(health_handler).head(health_handler);
+    Router::new().route("/health", health).with_state(state)
+}
+
+/// Handler behind `GET`/`HEAD /health`.
+///
+/// Loads the shared serving flag and answers `200 {"ok":true}` when it is
+/// set, `503 {"ok":false}` otherwise. Both replies carry
+/// `Content-Type: application/json`.
+async fn health_handler(State(state): State<ServingState>) -> impl IntoResponse {
+    // Only the status code and the body depend on the flag.
+    let (status, body) = if state.load(Ordering::Relaxed) {
+        (StatusCode::OK, r#"{"ok":true}"#)
+    } else {
+        (StatusCode::SERVICE_UNAVAILABLE, r#"{"ok":false}"#)
+    };
+
+    (status, [(header::CONTENT_TYPE, "application/json")], body)
+}
diff --git a/tests/vector_api/harness.rs b/tests/vector_api/harness.rs
@@ -133,6 +133,11 @@ impl TestHarness {
         &mut self.api_client
     }
 
+    /// Returns the TCP port the API server is bound to (the harness's `api_port` field).
+    pub fn api_port(&self) -> u16 {
+        self.api_port
+    }
+
     /// Reloads Vector configuration by sending SIGHUP or using watch mode
     ///
     /// Polls Vector to detect crashes early and succeed fast when reload completes.
diff --git a/tests/vector_api/health.rs b/tests/vector_api/health.rs
@@ -1,4 +1,7 @@
-//! Integration tests for the standard gRPC health check on the observability API.
+//! Integration tests for the health endpoints on the observability API:
+//!
+//! * the standard gRPC health check (`grpc.health.v1.Health/Check`)
+//! * the HTTP `/health` endpoint served on the same port
 
 use super::{common::*, harness::*};
 
@@ -23,3 +26,42 @@ async fn health_check_reports_serving() {
 
     assert!(harness.check_running(), "Vector should still be running");
 }
+
+/// Checks that HTTP `GET /health` on a running Vector instance answers with
+/// status 200 and the exact body `{"ok":true}`. The request is sent to the
+/// port reported by `harness.api_port()`.
+#[tokio::test]
+async fn http_health_endpoint_returns_200_when_serving() {
+    let harness = TestHarness::new(&single_source_config("demo", 1.0, None))
+        .await
+        .expect("Vector should start");
+    let port = harness.api_port();
+
+    let health_url = format!("http://127.0.0.1:{port}/health");
+    let reply = reqwest::get(&health_url)
+        .await
+        .expect("GET /health should succeed");
+
+    assert_eq!(reply.status(), reqwest::StatusCode::OK);
+    let text = reply.text().await.expect("body should be readable");
+    assert_eq!(text, r#"{"ok":true}"#);
+}
+
+/// Verifies HTTP `HEAD /health` returns 200 without a body. ALB/ELB style
+/// probes that prefer HEAD should work the same as GET.
+#[tokio::test]
+async fn http_health_endpoint_supports_head() {
+    let config = single_source_config("demo", 1.0, None);
+    let harness = TestHarness::new(&config)
+        .await
+        .expect("Vector should start");
+
+    let url = format!("http://127.0.0.1:{}/health", harness.api_port());
+    let client = reqwest::Client::new();
+    let response = client.head(&url).send().await;
+    let response = response.expect("HEAD /health should succeed");
+
+    assert_eq!(response.status(), reqwest::StatusCode::OK);
+    let body = response.bytes().await.expect("body should be readable");
+    assert!(body.is_empty(), "HEAD response must not carry a body");
+}
diff --git a/website/content/en/docs/reference/api.md b/website/content/en/docs/reference/api.md
@@ -21,6 +21,12 @@ instance. This page covers how to configure and enable Vector's API.
 The API exposes a gRPC service defined in [`proto/vector/observability.proto`](https://github.com/vectordotdev/vector/blob/master/proto/vector/observability.proto).
 You can interact with it using any standard gRPC tooling.
 
+For compatibility with Vector 0.54.0 and earlier, the HTTP `GET /health`
+endpoint continues to be served on the same port as the gRPC API, so
+existing HTTP probes (for example AWS ALB health checks and Kubernetes
+HTTP liveness/readiness probes) keep working without changes. See the
+[Endpoints](#endpoints) section above for details.
+
 ### Example using grpcurl
 
 ```bash
@@ -35,3 +41,10 @@ grpcurl -plaintext \
   -d '{"outputs_patterns": ["*"], "limit": 100, "interval_ms": 500}' \
   localhost:8686 vector.observability.v1.ObservabilityService/StreamOutputEvents
 ```
+
+### Example using curl (HTTP health)
+
+```bash
+# 200 with body {"ok":true} while serving, 503 {"ok":false} during drain
+curl -i http://localhost:8686/health
+```
diff --git a/website/cue/reference.cue b/website/cue/reference.cue
@@ -68,6 +68,7 @@ _values: {
 #Endpoints: [Path=string]: {
 	DELETE?: #Endpoint
 	GET?:    #Endpoint
+	HEAD?:   #Endpoint
 	POST?:   #Endpoint
 	PUT?:    #Endpoint
 }
diff --git a/website/cue/reference/api.cue b/website/cue/reference/api.cue
@@ -24,13 +24,34 @@ api: {
 		"/health": {
 			GET: {
 				description: """
-					Healthcheck endpoint. Useful to verify that
-					Vector is up and running.
+					HTTP healthcheck endpoint served on the same port as the
+					gRPC API, preserved for compatibility with Vector 0.54.0
+					and earlier so existing HTTP probes (for example AWS ALB
+					and Kubernetes HTTP probes) keep working unchanged.
+					The response body is `{"ok":true}` while Vector is
+					serving and `{"ok":false}` once Vector begins draining.
 					"""
 				responses: {
 					"200": {
 						description: "Vector is initialized and running."
 					}
+					"503": {
+						description: "Vector is draining or shutting down and should be removed from the load balancer."
+					}
+				}
+			}
+			HEAD: {
+				description: """
+					Same semantics as `GET /health` but returns no body.
+					Intended for load balancer probes that prefer `HEAD`.
+					"""
+				responses: {
+					"200": {
+						description: "Vector is initialized and running."
+					}
+					"503": {
+						description: "Vector is draining or shutting down and should be removed from the load balancer."
+					}
 				}
 			}
 		}

Original file line number	Diff line number	Diff line change
`@@ -68,6 +68,7 @@ _values: {`
`68`	`68`	`#Endpoints: [Path=string]: {`
`69`	`69`	`DELETE?: #Endpoint`
`70`	`70`	`GET?: #Endpoint`
	`71`	`+ HEAD?: #Endpoint`
`71`	`72`	`POST?: #Endpoint`
`72`	`73`	`PUT?: #Endpoint`
`73`	`74`	`}`