Skip to content

Commit 1c70988

Browse files
pront and claude authored
fix(api): restore HTTP GET /health endpoint (#25234)
* refactor(api): serve gRPC via hyper + axum router

  Convert tonic's Server to an axum Router via into_router(), then serve over the same TcpListener via hyper::Server. Enables HTTP/1.1 acceptance so additional HTTP routes can be added alongside gRPC on the same port. Behavior-preserving.

  Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* fix(api): restore HTTP GET /health endpoint

  Re-expose the HTTP health endpoint that was removed as part of the GraphQL-to-gRPC migration (#24364). The endpoint matches the pre-0.55 response shape: 200 with body {"ok": true} while serving and 503 with body {"ok": false} after set_not_serving() is called during drain. HEAD is also handled. gRPC clients continue to use grpc.health.v1.Health/Check; both probes now share the same serving state so they agree during shutdown.

  Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* test(api): cover HTTP GET/HEAD /health endpoint

  Adds two integration tests hitting the restored HTTP health endpoint via reqwest:
  - GET returns 200 with body {"ok":true}
  - HEAD returns 200

  Exposes harness.api_port() so tests can reach the API port directly.

  Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* docs(api): document HTTP /health endpoint

  Document the HTTP GET/HEAD /health endpoint served alongside the gRPC API, framed as compatibility with Vector 0.54.0 and earlier. Updates the reference endpoints schema to allow HEAD, adds HEAD/GET entries for /health in api.cue with 200/503 responses, and adds a curl example to the API reference page.

  Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* style(api): apply cargo fmt

  Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 3ca3e61 commit 1c70988

6 files changed

Lines changed: 162 additions & 11 deletions

File tree

src/api/grpc_server.rs

Lines changed: 77 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,36 @@
1-
use std::{error::Error as StdError, net::SocketAddr};
1+
use std::{
2+
error::Error as StdError,
3+
net::SocketAddr,
4+
sync::{
5+
Arc,
6+
atomic::{AtomicBool, Ordering},
7+
},
8+
};
9+
10+
use axum::{
11+
Router,
12+
extract::State,
13+
http::{StatusCode, header},
14+
response::IntoResponse,
15+
routing::get,
16+
};
217
use tokio::sync::oneshot;
3-
use tokio_stream::wrappers::TcpListenerStream;
418
use tonic::transport::Server as TonicServer;
519
use tonic_health::server::{HealthReporter, health_reporter};
620
use vector_lib::tap::topology::WatchRx;
721

822
use super::grpc::ObservabilityService;
923
use crate::{config::Config, proto::observability::Server as ObservabilityServer};
1024

25+
/// Shared flag backing the HTTP `/health` endpoint. Mirrors the gRPC
26+
/// `HealthReporter` serving status so HTTP and gRPC probes agree.
///
/// The flag starts out `true` (serving); `GrpcServer::set_not_serving`
/// stores `false`, after which the HTTP handler answers `503`.
27+
type ServingState = Arc<AtomicBool>;
28+
1129
/// gRPC API server for Vector observability.
///
/// The same listener also serves the plain-HTTP `GET`/`HEAD /health` route,
/// so gRPC and HTTP health probes target one port.
1230
pub struct GrpcServer {
1331
/// Sending half of the shutdown channel. The spawned server's
/// graceful-shutdown future awaits the receiving half.
_shutdown: oneshot::Sender<()>,
1432
/// Handle used by `set_not_serving` to mark the empty service name (`""`)
/// as `NotServing` for gRPC health checks.
health_reporter: HealthReporter,
33+
/// Serving flag shared with the HTTP `/health` handler; cleared by
/// `set_not_serving`.
serving: ServingState,
1534
/// Address recorded when the server was constructed.
// NOTE(review): presumably the address the listener is actually bound to — confirm against the constructor.
addr: SocketAddr,
1635
}
1736

@@ -46,12 +65,22 @@ impl GrpcServer {
4665
// The empty service ("") is registered as SERVING by default.
4766
let (health_reporter, health_service) = health_reporter();
4867

68+
let serving: ServingState = Arc::new(AtomicBool::new(true));
69+
4970
let (_shutdown, rx) = oneshot::channel();
5071

72+
// Convert the tokio TcpListener into a std listener for hyper's Server.
73+
let std_listener = listener
74+
.into_std()
75+
.map_err(|e| crate::Error::from(format!("Failed to convert TCP listener: {}", e)))?;
76+
std_listener.set_nonblocking(true).map_err(|e| {
77+
crate::Error::from(format!("Failed to set TCP listener non-blocking: {}", e))
78+
})?;
79+
80+
let router_serving = Arc::clone(&serving);
81+
5182
// Spawn the server with the already-bound listener
5283
tokio::spawn(async move {
53-
let incoming = TcpListenerStream::new(listener);
54-
5584
// Build reflection service for tools like grpcurl
5685
let reflection_service = tonic_reflection::server::Builder::configure()
5786
.register_encoded_file_descriptor_set(
@@ -61,11 +90,21 @@ impl GrpcServer {
6190
.build()
6291
.expect("Failed to build reflection service");
6392

64-
let result = TonicServer::builder()
93+
// Build the tonic router (gRPC services) and merge with the HTTP router
94+
// so both protocols share the same port. `accept_http1(true)` lets plain
95+
// HTTP/1.1 requests reach the merged axum routes.
96+
let router = TonicServer::builder()
97+
.accept_http1(true)
6598
.add_service(health_service)
6699
.add_service(ObservabilityServer::new(service))
67100
.add_service(reflection_service)
68-
.serve_with_incoming_shutdown(incoming, async {
101+
.into_router()
102+
.merge(http_router(router_serving));
103+
104+
let result = hyper::Server::from_tcp(std_listener)
105+
.expect("Failed to build HTTP server from TCP listener")
106+
.serve(router.into_make_service())
107+
.with_graceful_shutdown(async {
69108
rx.await.ok();
70109
info!("GRPC API server shutting down.");
71110
})
@@ -86,16 +125,18 @@ impl GrpcServer {
86125
Ok(Self {
87126
_shutdown,
88127
health_reporter,
128+
serving,
89129
addr: actual_addr,
90130
})
91131
}
92132

93133
/// Signal that the server is no longer serving.
94134
///
95135
/// Call this **before** draining the topology so that Kubernetes gRPC
96-
/// readiness probes fail early and the pod is removed from endpoints
97-
/// before the process exits.
136+
/// readiness probes and HTTP `/health` probes fail early and the pod is
137+
/// removed from endpoints before the process exits.
98138
pub async fn set_not_serving(&mut self) {
139+
self.serving.store(false, Ordering::Relaxed);
99140
self.health_reporter
100141
.set_service_status("", tonic_health::ServingStatus::NotServing)
101142
.await;
@@ -106,3 +147,31 @@ impl GrpcServer {
106147
self.addr
107148
}
108149
}
150+
151+
/// Axum router exposing `GET`/`HEAD /health`.
152+
///
153+
/// Returns `200 {"ok":true}` while the server is serving and
154+
/// `503 {"ok":false}` once [`GrpcServer::set_not_serving`] has been called.
155+
/// Matches the response shape of the pre-gRPC GraphQL-era endpoint so
156+
/// existing HTTP health probes (Kubernetes, load balancers) keep working.
157+
fn http_router(state: ServingState) -> Router {
158+
Router::new()
159+
.route("/health", get(health_handler).head(health_handler))
160+
.with_state(state)
161+
}
162+
163+
async fn health_handler(State(state): State<ServingState>) -> impl IntoResponse {
164+
if state.load(Ordering::Relaxed) {
165+
(
166+
StatusCode::OK,
167+
[(header::CONTENT_TYPE, "application/json")],
168+
r#"{"ok":true}"#,
169+
)
170+
} else {
171+
(
172+
StatusCode::SERVICE_UNAVAILABLE,
173+
[(header::CONTENT_TYPE, "application/json")],
174+
r#"{"ok":false}"#,
175+
)
176+
}
177+
}

tests/vector_api/harness.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,11 @@ impl TestHarness {
133133
&mut self.api_client
134134
}
135135

136+
/// Returns the TCP port the API server is bound to, so tests can send
/// requests (for example HTTP `GET /health`) to the API port directly.
137+
pub fn api_port(&self) -> u16 {
138+
self.api_port
139+
}
140+
136141
/// Reloads Vector configuration by sending SIGHUP or using watch mode
137142
///
138143
/// Polls Vector to detect crashes early and succeed fast when reload completes.

tests/vector_api/health.rs

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1-
//! Integration tests for the standard gRPC health check on the observability API.
1+
//! Integration tests for the health endpoints on the observability API:
2+
//!
3+
//! * the standard gRPC health check (`grpc.health.v1.Health/Check`)
4+
//! * the HTTP `/health` endpoint served on the same port
25
36
use super::{common::*, harness::*};
47

@@ -23,3 +26,42 @@ async fn health_check_reports_serving() {
2326

2427
assert!(harness.check_running(), "Vector should still be running");
2528
}
29+
30+
/// Verifies the HTTP `GET /health` endpoint returns 200 with `{"ok":true}` on a
31+
/// running Vector instance. This endpoint is load-balancer friendly and is
32+
/// shared with gRPC clients on the same API port.
33+
#[tokio::test]
34+
async fn http_health_endpoint_returns_200_when_serving() {
35+
let config = single_source_config("demo", 1.0, None);
36+
let harness = TestHarness::new(&config)
37+
.await
38+
.expect("Vector should start");
39+
40+
let url = format!("http://127.0.0.1:{}/health", harness.api_port());
41+
let response = reqwest::get(&url)
42+
.await
43+
.expect("GET /health should succeed");
44+
45+
assert_eq!(response.status(), reqwest::StatusCode::OK);
46+
let body = response.text().await.expect("body should be readable");
47+
assert_eq!(body, r#"{"ok":true}"#);
48+
}
49+
50+
/// Verifies HTTP `HEAD /health` returns 200 without a body. ALB/ELB style
51+
/// probes that prefer HEAD should work the same as GET.
52+
#[tokio::test]
53+
async fn http_health_endpoint_supports_head() {
54+
let config = single_source_config("demo", 1.0, None);
55+
let harness = TestHarness::new(&config)
56+
.await
57+
.expect("Vector should start");
58+
59+
let url = format!("http://127.0.0.1:{}/health", harness.api_port());
60+
let response = reqwest::Client::new()
61+
.head(&url)
62+
.send()
63+
.await
64+
.expect("HEAD /health should succeed");
65+
66+
assert_eq!(response.status(), reqwest::StatusCode::OK);
67+
}

website/content/en/docs/reference/api.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,12 @@ instance. This page covers how to configure and enable Vector's API.
2121
The API exposes a gRPC service defined in [`proto/vector/observability.proto`](https://github.com/vectordotdev/vector/blob/master/proto/vector/observability.proto).
2222
You can interact with it using any standard gRPC tooling.
2323

24+
For compatibility with Vector 0.54.0 and earlier, the HTTP `GET /health`
25+
endpoint continues to be served on the same port as the gRPC API, so
26+
existing HTTP probes (for example AWS ALB health checks and Kubernetes
27+
HTTP liveness/readiness probes) keep working without changes. See the
28+
[Endpoints](#endpoints) section above for details.
29+
2430
### Example using grpcurl
2531

2632
```bash
@@ -35,3 +41,10 @@ grpcurl -plaintext \
3541
-d '{"outputs_patterns": ["*"], "limit": 100, "interval_ms": 500}' \
3642
localhost:8686 vector.observability.v1.ObservabilityService/StreamOutputEvents
3743
```
44+
45+
### Example using curl (HTTP health)
46+
47+
```bash
48+
# 200 with body {"ok":true} while serving, 503 {"ok":false} during drain
49+
curl -i http://localhost:8686/health
50+
```

website/cue/reference.cue

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ _values: {
6868
#Endpoints: [Path=string]: {
6969
DELETE?: #Endpoint
7070
GET?: #Endpoint
71+
HEAD?: #Endpoint
7172
POST?: #Endpoint
7273
PUT?: #Endpoint
7374
}

website/cue/reference/api.cue

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,34 @@ api: {
2424
"/health": {
2525
GET: {
2626
description: """
27-
Healthcheck endpoint. Useful to verify that
28-
Vector is up and running.
27+
HTTP healthcheck endpoint served on the same port as the
28+
gRPC API, preserved for compatibility with Vector 0.54.0
29+
and earlier so existing HTTP probes (for example AWS ALB
30+
and Kubernetes HTTP probes) keep working unchanged.
31+
The response body is `{"ok": true}` while Vector is
32+
serving and `{"ok": false}` once Vector begins draining.
2933
"""
3034
responses: {
3135
"200": {
3236
description: "Vector is initialized and running."
3337
}
38+
"503": {
39+
description: "Vector is draining or shutting down and should be removed from the load balancer."
40+
}
41+
}
42+
}
43+
HEAD: {
44+
description: """
45+
Same semantics as `GET /health` but returns no body.
46+
Intended for load balancer probes that prefer `HEAD`.
47+
"""
48+
responses: {
49+
"200": {
50+
description: "Vector is initialized and running."
51+
}
52+
"503": {
53+
description: "Vector is draining or shutting down and should be removed from the load balancer."
54+
}
3455
}
3556
}
3657
}

0 commit comments

Comments
 (0)