Skip to content

Commit 2407f28

Browse files
authored
Merge pull request #117 from fystack/health-check-endpoint
Support exposing a health check endpoint /health for cloud deployment
2 parents 335f284 + dcc569a commit 2407f28

4 files changed

Lines changed: 183 additions & 0 deletions

File tree

cmd/mpcium/main.go

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515
"github.com/fystack/mpcium/pkg/constant"
1616
"github.com/fystack/mpcium/pkg/event"
1717
"github.com/fystack/mpcium/pkg/eventconsumer"
18+
"github.com/fystack/mpcium/pkg/healthcheck"
1819
"github.com/fystack/mpcium/pkg/identity"
1920
"github.com/fystack/mpcium/pkg/infra"
2021
"github.com/fystack/mpcium/pkg/keyinfo"
@@ -241,6 +242,21 @@ func runNode(ctx context.Context, c *cli.Command) error {
241242
}
242243
logger.Info("[READY] Node is ready", "nodeID", nodeID)
243244

245+
// Start health check server (disabled by default)
246+
var healthServer *healthcheck.Server
247+
if viper.GetBool("healthcheck.enabled") {
248+
healthAddr := viper.GetString("healthcheck.address")
249+
if healthAddr == "" {
250+
healthAddr = ":8080"
251+
}
252+
healthServer = healthcheck.NewServer(healthAddr, peerRegistry, natsConn, consulClient)
253+
go func() {
254+
if err := healthServer.Start(); err != nil {
255+
logger.Error("Health check server error", err)
256+
}
257+
}()
258+
}
259+
244260
logger.Info("Starting consumers", "nodeID", nodeID)
245261
appContext, cancel := context.WithCancel(context.Background())
246262
//Setup signal handling to cancel context on termination signals.
@@ -251,6 +267,15 @@ func runNode(ctx context.Context, c *cli.Command) error {
251267
logger.Warn("Shutdown signal received, canceling context...")
252268
cancel()
253269

270+
// Shutdown health check server if it was started
271+
if healthServer != nil {
272+
shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 5*time.Second)
273+
defer shutdownCancel()
274+
if err := healthServer.Shutdown(shutdownCtx); err != nil {
275+
logger.Error("Failed to shutdown health check server", err)
276+
}
277+
}
278+
254279
// Resign from peer registry first (before closing NATS)
255280
if err := peerRegistry.Resign(); err != nil {
256281
logger.Error("Failed to resign from peer registry", err)

config.prod.yaml.template

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,3 +23,7 @@ backup_dir: backups
2323
max_concurrent_keygen: 2
2424
max_concurrent_signing: 10
2525
session_warm_up_delay_ms: 100
26+
27+
healthcheck:
28+
enabled: false # disabled by default, set to true for cloud deployment
29+
address: "127.0.0.1:8080"

config.yaml.template

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@ backup_dir: backups
1515
max_concurrent_keygen: 2
1616
max_concurrent_signing: 10
1717
session_warm_up_delay_ms: 100
18+
healthcheck:
19+
enabled: true # enabled in this template; set to false to turn the /health endpoint off
20+
address: "0.0.0.0:8080"
21+
1822

1923
# Authorization (optional)
2024
# authorization:

pkg/healthcheck/server.go

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
package healthcheck
2+
3+
import (
4+
"context"
5+
"encoding/json"
6+
"fmt"
7+
"net"
8+
"net/http"
9+
"time"
10+
11+
"github.com/fystack/mpcium/pkg/logger"
12+
"github.com/fystack/mpcium/pkg/mpc"
13+
"github.com/hashicorp/consul/api"
14+
"github.com/nats-io/nats.go"
15+
)
16+
17+
// Server provides HTTP health check endpoints for Kubernetes probes.
// It exposes /health, which reports the NATS connection, Consul
// availability, and MPC peer-registry readiness in one JSON payload.
type Server struct {
	httpServer   *http.Server     // serves /health; configured in NewServer
	peerRegistry mpc.PeerRegistry // source of peer readiness counts
	natsConn     *nats.Conn       // checked for connectivity on each request
	consulClient *api.Client      // checked for a cluster leader on each request
}
24+
25+
// HealthResponse represents the JSON response for health check endpoints.
type HealthResponse struct {
	Status  string         `json:"status"`            // "ready" or "not_ready"
	Live    bool           `json:"live"`              // liveness: true whenever the process can respond
	Ready   bool           `json:"ready"`             // readiness: all dependency checks passed
	Details map[string]any `json:"details,omitempty"` // per-dependency check results
}
32+
33+
// NewServer creates a new health check HTTP server
34+
func NewServer(addr string, peerRegistry mpc.PeerRegistry, natsConn *nats.Conn, consulClient *api.Client) *Server {
35+
s := &Server{
36+
peerRegistry: peerRegistry,
37+
natsConn: natsConn,
38+
consulClient: consulClient,
39+
}
40+
41+
mux := http.NewServeMux()
42+
mux.HandleFunc("/health", s.healthHandler)
43+
44+
s.httpServer = &http.Server{
45+
Addr: addr,
46+
Handler: mux,
47+
ReadTimeout: 5 * time.Second,
48+
WriteTimeout: 5 * time.Second,
49+
IdleTimeout: 15 * time.Second,
50+
}
51+
52+
return s
53+
}
54+
55+
// Start begins serving health check endpoints
56+
func (s *Server) Start() error {
57+
addr := s.httpServer.Addr
58+
59+
// Parse host and port from address
60+
host, port, err := net.SplitHostPort(addr)
61+
if err != nil {
62+
// If parsing fails, just use the address as-is
63+
logger.Info("Starting health check server", "address", addr)
64+
} else {
65+
// Replace empty host or 0.0.0.0 with localhost for display
66+
if host == "" || host == "0.0.0.0" {
67+
host = "localhost"
68+
}
69+
endpoint := fmt.Sprintf("http://%s:%s/health", host, port)
70+
logger.Info("Starting health check server", "endpoint", endpoint)
71+
}
72+
73+
if err := s.httpServer.ListenAndServe(); err != nil && err != http.ErrServerClosed {
74+
return fmt.Errorf("health check server failed: %w", err)
75+
}
76+
return nil
77+
}
78+
79+
// Shutdown gracefully stops the health check server
80+
func (s *Server) Shutdown(ctx context.Context) error {
81+
logger.Info("Shutting down health check server")
82+
return s.httpServer.Shutdown(ctx)
83+
}
84+
85+
// healthHandler responds to health check requests
86+
// This endpoint checks both liveness and readiness in a single response
87+
func (s *Server) healthHandler(w http.ResponseWriter, r *http.Request) {
88+
details := make(map[string]any)
89+
ready := true
90+
live := true // Service is always live if it can respond
91+
92+
// Check NATS connection
93+
natsConnected := s.natsConn != nil && s.natsConn.IsConnected()
94+
details["nats_connected"] = natsConnected
95+
if !natsConnected {
96+
ready = false
97+
}
98+
99+
// Check Consul connection
100+
consulConnected := false
101+
if s.consulClient != nil {
102+
if leader, err := s.consulClient.Status().Leader(); err == nil && leader != "" {
103+
consulConnected = true
104+
}
105+
}
106+
details["consul_connected"] = consulConnected
107+
if !consulConnected {
108+
ready = false
109+
}
110+
111+
// Check peer registry readiness (includes ECDH completion)
112+
if s.peerRegistry != nil {
113+
peersReady := s.peerRegistry.ArePeersReady()
114+
majorityReady := s.peerRegistry.AreMajorityReady()
115+
readyCount := s.peerRegistry.GetReadyPeersCount()
116+
totalCount := s.peerRegistry.GetTotalPeersCount()
117+
118+
details["peers_ready_count"] = fmt.Sprintf("%d/%d", readyCount, totalCount)
119+
details["all_peers_ready"] = peersReady
120+
details["majority_ready"] = majorityReady
121+
122+
// Node is ready if majority of peers are ready (allows for some fault tolerance)
123+
if !majorityReady {
124+
ready = false
125+
}
126+
} else {
127+
details["peers_available"] = false
128+
ready = false
129+
}
130+
131+
response := HealthResponse{
132+
Live: live,
133+
Ready: ready,
134+
Details: details,
135+
}
136+
137+
w.Header().Set("Content-Type", "application/json")
138+
139+
if ready {
140+
response.Status = "ready"
141+
w.WriteHeader(http.StatusOK)
142+
} else {
143+
response.Status = "not_ready"
144+
w.WriteHeader(http.StatusServiceUnavailable)
145+
}
146+
147+
if err := json.NewEncoder(w).Encode(response); err != nil {
148+
logger.Error("Failed to encode health check response", err)
149+
}
150+
}

0 commit comments

Comments
 (0)