|
| 1 | +package healthcheck |
| 2 | + |
| 3 | +import ( |
| 4 | + "context" |
| 5 | + "encoding/json" |
| 6 | + "fmt" |
| 7 | + "net" |
| 8 | + "net/http" |
| 9 | + "time" |
| 10 | + |
| 11 | + "github.com/fystack/mpcium/pkg/logger" |
| 12 | + "github.com/fystack/mpcium/pkg/mpc" |
| 13 | + "github.com/hashicorp/consul/api" |
| 14 | + "github.com/nats-io/nats.go" |
| 15 | +) |
| 16 | + |
| 17 | +// Server provides HTTP health check endpoints for Kubernetes probes |
// Server provides HTTP health check endpoints for Kubernetes probes.
// The zero value is not usable; construct with NewServer.
type Server struct {
	httpServer   *http.Server     // configured in NewServer with read/write/idle timeouts
	peerRegistry mpc.PeerRegistry // MPC peer readiness source; nil is reported as not ready
	natsConn     *nats.Conn       // NATS connection probed via IsConnected; nil counts as disconnected
	consulClient *api.Client      // Consul client queried for a cluster leader; nil counts as disconnected
}
| 24 | + |
// HealthResponse represents the JSON response for health check endpoints.
type HealthResponse struct {
	Status  string         `json:"status"`            // "ready" or "not_ready"
	Live    bool           `json:"live"`              // liveness: true whenever the process can serve the request
	Ready   bool           `json:"ready"`             // readiness: true only if all dependency checks pass
	Details map[string]any `json:"details,omitempty"` // per-dependency check results (nats/consul/peers)
}
| 32 | + |
| 33 | +// NewServer creates a new health check HTTP server |
| 34 | +func NewServer(addr string, peerRegistry mpc.PeerRegistry, natsConn *nats.Conn, consulClient *api.Client) *Server { |
| 35 | + s := &Server{ |
| 36 | + peerRegistry: peerRegistry, |
| 37 | + natsConn: natsConn, |
| 38 | + consulClient: consulClient, |
| 39 | + } |
| 40 | + |
| 41 | + mux := http.NewServeMux() |
| 42 | + mux.HandleFunc("/health", s.healthHandler) |
| 43 | + |
| 44 | + s.httpServer = &http.Server{ |
| 45 | + Addr: addr, |
| 46 | + Handler: mux, |
| 47 | + ReadTimeout: 5 * time.Second, |
| 48 | + WriteTimeout: 5 * time.Second, |
| 49 | + IdleTimeout: 15 * time.Second, |
| 50 | + } |
| 51 | + |
| 52 | + return s |
| 53 | +} |
| 54 | + |
| 55 | +// Start begins serving health check endpoints |
| 56 | +func (s *Server) Start() error { |
| 57 | + addr := s.httpServer.Addr |
| 58 | + |
| 59 | + // Parse host and port from address |
| 60 | + host, port, err := net.SplitHostPort(addr) |
| 61 | + if err != nil { |
| 62 | + // If parsing fails, just use the address as-is |
| 63 | + logger.Info("Starting health check server", "address", addr) |
| 64 | + } else { |
| 65 | + // Replace empty host or 0.0.0.0 with localhost for display |
| 66 | + if host == "" || host == "0.0.0.0" { |
| 67 | + host = "localhost" |
| 68 | + } |
| 69 | + endpoint := fmt.Sprintf("http://%s:%s/health", host, port) |
| 70 | + logger.Info("Starting health check server", "endpoint", endpoint) |
| 71 | + } |
| 72 | + |
| 73 | + if err := s.httpServer.ListenAndServe(); err != nil && err != http.ErrServerClosed { |
| 74 | + return fmt.Errorf("health check server failed: %w", err) |
| 75 | + } |
| 76 | + return nil |
| 77 | +} |
| 78 | + |
// Shutdown gracefully stops the health check server, draining in-flight
// requests until ctx's deadline or cancellation, and returns any error
// reported by the underlying http.Server.
func (s *Server) Shutdown(ctx context.Context) error {
	logger.Info("Shutting down health check server")
	return s.httpServer.Shutdown(ctx)
}
| 84 | + |
| 85 | +// healthHandler responds to health check requests |
| 86 | +// This endpoint checks both liveness and readiness in a single response |
| 87 | +func (s *Server) healthHandler(w http.ResponseWriter, r *http.Request) { |
| 88 | + details := make(map[string]any) |
| 89 | + ready := true |
| 90 | + live := true // Service is always live if it can respond |
| 91 | + |
| 92 | + // Check NATS connection |
| 93 | + natsConnected := s.natsConn != nil && s.natsConn.IsConnected() |
| 94 | + details["nats_connected"] = natsConnected |
| 95 | + if !natsConnected { |
| 96 | + ready = false |
| 97 | + } |
| 98 | + |
| 99 | + // Check Consul connection |
| 100 | + consulConnected := false |
| 101 | + if s.consulClient != nil { |
| 102 | + if leader, err := s.consulClient.Status().Leader(); err == nil && leader != "" { |
| 103 | + consulConnected = true |
| 104 | + } |
| 105 | + } |
| 106 | + details["consul_connected"] = consulConnected |
| 107 | + if !consulConnected { |
| 108 | + ready = false |
| 109 | + } |
| 110 | + |
| 111 | + // Check peer registry readiness (includes ECDH completion) |
| 112 | + if s.peerRegistry != nil { |
| 113 | + peersReady := s.peerRegistry.ArePeersReady() |
| 114 | + majorityReady := s.peerRegistry.AreMajorityReady() |
| 115 | + readyCount := s.peerRegistry.GetReadyPeersCount() |
| 116 | + totalCount := s.peerRegistry.GetTotalPeersCount() |
| 117 | + |
| 118 | + details["peers_ready_count"] = fmt.Sprintf("%d/%d", readyCount, totalCount) |
| 119 | + details["all_peers_ready"] = peersReady |
| 120 | + details["majority_ready"] = majorityReady |
| 121 | + |
| 122 | + // Node is ready if majority of peers are ready (allows for some fault tolerance) |
| 123 | + if !majorityReady { |
| 124 | + ready = false |
| 125 | + } |
| 126 | + } else { |
| 127 | + details["peers_available"] = false |
| 128 | + ready = false |
| 129 | + } |
| 130 | + |
| 131 | + response := HealthResponse{ |
| 132 | + Live: live, |
| 133 | + Ready: ready, |
| 134 | + Details: details, |
| 135 | + } |
| 136 | + |
| 137 | + if ready { |
| 138 | + response.Status = "ready" |
| 139 | + w.Header().Set("Content-Type", "application/json") |
| 140 | + w.WriteHeader(http.StatusOK) |
| 141 | + } else { |
| 142 | + response.Status = "not_ready" |
| 143 | + w.Header().Set("Content-Type", "application/json") |
| 144 | + w.WriteHeader(http.StatusServiceUnavailable) |
| 145 | + } |
| 146 | + |
| 147 | + json.NewEncoder(w).Encode(response) |
| 148 | +} |
0 commit comments