|
| 1 | +// Package prober TCP-probes route upstreams on a fixed interval and exposes |
| 2 | +// per-host health snapshots for the dashboard and runtime status reporting. |
| 3 | +package prober |
| 4 | + |
import (
	"context"
	"errors"
	"net"
	"net/url"
	"strings"
	"sync"
	"time"
	"unicode/utf8"

	"github.com/venkatkrishna07/mkdev/internal/store"
)
| 16 | + |
// Status classifies the outcome of probing a single route's upstream.
type Status int

// Verdicts surfaced through Health and Snapshot.
const (
	// StatusOff is the zero value; it is also what is recorded for disabled routes.
	StatusOff Status = iota
	// StatusUp: the latest dial to the upstream succeeded.
	StatusUp
	// StatusDown: the latest dial failed; the reason is kept in LastErr.
	StatusDown
)

// String renders the status as "up", "down" or "off". Every value other than
// StatusUp and StatusDown (including StatusOff) maps to "off".
func (s Status) String() string {
	if s == StatusUp {
		return "up"
	}
	if s == StatusDown {
		return "down"
	}
	return "off"
}
| 40 | + |
| 41 | +// HealthState is the most recent probe outcome for one host. |
| 42 | +type HealthState struct { |
| 43 | + Status Status |
| 44 | + LastErr string |
| 45 | + LastProbe time.Time |
| 46 | +} |
| 47 | + |
// probePoolSize is the number of worker goroutines started for each probing
// round.
const probePoolSize = 8

// errMaxLen caps the byte length of the error text kept in
// HealthState.LastErr.
const errMaxLen = 80
| 52 | + |
// dialer performs the actual reachability check. It is a package-level
// variable so the network call can be swapped out without touching the
// Prober itself. The default opens a TCP connection to target, bounded by
// both ctx and timeout, and closes it again straight away.
var dialer = func(ctx context.Context, target string, timeout time.Duration) error {
	var nd net.Dialer
	nd.Timeout = timeout

	c, dialErr := nd.DialContext(ctx, "tcp", target)
	if dialErr == nil {
		// Only reachability matters here, so a failure while closing the
		// throw-away connection is deliberately ignored.
		_ = c.Close()
	}
	return dialErr
}
| 64 | + |
| 65 | +// Prober periodically TCP-dials every enabled route's upstream. |
| 66 | +type Prober struct { |
| 67 | + interval time.Duration |
| 68 | + timeout time.Duration |
| 69 | + routes func() ([]store.Route, error) |
| 70 | + states sync.Map // host (lowercased) -> HealthState |
| 71 | +} |
| 72 | + |
| 73 | +// New returns a Prober that pulls routes from the given function and probes |
| 74 | +// each enabled upstream every interval with per-dial timeout. |
| 75 | +func New(routes func() ([]store.Route, error), interval, timeout time.Duration) *Prober { |
| 76 | + return &Prober{ |
| 77 | + interval: interval, |
| 78 | + timeout: timeout, |
| 79 | + routes: routes, |
| 80 | + } |
| 81 | +} |
| 82 | + |
| 83 | +// Health returns the last known state for host, or the zero value (StatusOff) |
| 84 | +// if the host has never been probed. |
| 85 | +func (p *Prober) Health(host string) HealthState { |
| 86 | + v, ok := p.states.Load(strings.ToLower(host)) |
| 87 | + if !ok { |
| 88 | + return HealthState{} |
| 89 | + } |
| 90 | + return v.(HealthState) |
| 91 | +} |
| 92 | + |
| 93 | +// Snapshot returns a copy of every host's current health state. |
| 94 | +func (p *Prober) Snapshot() map[string]HealthState { |
| 95 | + out := map[string]HealthState{} |
| 96 | + p.states.Range(func(k, v any) bool { |
| 97 | + out[k.(string)] = v.(HealthState) |
| 98 | + return true |
| 99 | + }) |
| 100 | + return out |
| 101 | +} |
| 102 | + |
| 103 | +// Run probes immediately, then on every interval tick, until ctx is cancelled. |
| 104 | +func (p *Prober) Run(ctx context.Context) { |
| 105 | + p.tick(ctx) |
| 106 | + t := time.NewTicker(p.interval) |
| 107 | + defer t.Stop() |
| 108 | + for { |
| 109 | + select { |
| 110 | + case <-ctx.Done(): |
| 111 | + return |
| 112 | + case <-t.C: |
| 113 | + p.tick(ctx) |
| 114 | + } |
| 115 | + } |
| 116 | +} |
| 117 | + |
| 118 | +func (p *Prober) tick(ctx context.Context) { |
| 119 | + routes, err := p.routes() |
| 120 | + if err != nil { |
| 121 | + return |
| 122 | + } |
| 123 | + |
| 124 | + live := make(map[string]struct{}, len(routes)) |
| 125 | + jobs := make(chan store.Route) |
| 126 | + var wg sync.WaitGroup |
| 127 | + |
| 128 | + for range probePoolSize { |
| 129 | + wg.Go(func() { |
| 130 | + for r := range jobs { |
| 131 | + p.probe(ctx, r) |
| 132 | + } |
| 133 | + }) |
| 134 | + } |
| 135 | + |
| 136 | + for _, r := range routes { |
| 137 | + host := strings.ToLower(r.Domain) |
| 138 | + live[host] = struct{}{} |
| 139 | + if !r.Enabled { |
| 140 | + p.states.Store(host, HealthState{Status: StatusOff, LastProbe: time.Now()}) |
| 141 | + continue |
| 142 | + } |
| 143 | + select { |
| 144 | + case jobs <- r: |
| 145 | + case <-ctx.Done(): |
| 146 | + close(jobs) |
| 147 | + wg.Wait() |
| 148 | + return |
| 149 | + } |
| 150 | + } |
| 151 | + close(jobs) |
| 152 | + wg.Wait() |
| 153 | + |
| 154 | + p.states.Range(func(k, _ any) bool { |
| 155 | + if _, ok := live[k.(string)]; !ok { |
| 156 | + p.states.Delete(k) |
| 157 | + } |
| 158 | + return true |
| 159 | + }) |
| 160 | +} |
| 161 | + |
| 162 | +func (p *Prober) probe(ctx context.Context, r store.Route) { |
| 163 | + host := strings.ToLower(r.Domain) |
| 164 | + st := HealthState{Status: StatusUp, LastProbe: time.Now()} |
| 165 | + target := strings.TrimSpace(r.Target) |
| 166 | + switch target { |
| 167 | + case "": |
| 168 | + st.Status, st.LastErr = StatusDown, "bad upstream" |
| 169 | + default: |
| 170 | + addr, err := dialAddress(target) |
| 171 | + if err == nil { |
| 172 | + dctx, cancel := context.WithTimeout(ctx, p.timeout) |
| 173 | + err = dialer(dctx, addr, p.timeout) |
| 174 | + cancel() |
| 175 | + } |
| 176 | + if err != nil { |
| 177 | + st.Status, st.LastErr = StatusDown, truncErr(err.Error()) |
| 178 | + } |
| 179 | + } |
| 180 | + p.states.Store(host, st) |
| 181 | +} |
| 182 | + |
// dialAddress normalises a route Target (bare host[:port] or full URL) into a
// host:port string suitable for net.Dialer.
//
// A target that does not start with a URL scheme is treated as if it were
// prefixed with "http://". Scheme detection is case-insensitive and not
// limited to http/https, so "HTTPS://host" or "ws://host:9000" resolve to the
// real host instead of the scheme name being mistaken for the hostname.
// When the target has no explicit port, "https" and "wss" default to 443 and
// everything else to 80.
//
// An error is returned when the target cannot be parsed or has no host.
func dialAddress(target string) (string, error) {
	s := target
	if !hasURLScheme(s) {
		s = "http://" + s
	}
	u, err := url.Parse(s)
	if err != nil {
		return "", err
	}
	host := u.Hostname()
	if host == "" {
		return "", errors.New("no host in target")
	}
	port := u.Port()
	if port == "" {
		// url.Parse lowercases the scheme, so a plain comparison is enough.
		switch u.Scheme {
		case "https", "wss":
			port = "443"
		default:
			port = "80"
		}
	}
	return net.JoinHostPort(host, port), nil
}

// hasURLScheme reports whether s begins with "<scheme>://", where scheme is a
// letter followed by letters, digits, '+', '-' or '.'.
func hasURLScheme(s string) bool {
	end := strings.Index(s, "://")
	if end <= 0 {
		return false
	}
	for i := 0; i < end; i++ {
		c := s[i]
		switch {
		case 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z':
		case i > 0 && ('0' <= c && c <= '9' || c == '+' || c == '-' || c == '.'):
		default:
			return false
		}
	}
	return true
}
| 208 | + |
| 209 | +func truncErr(s string) string { |
| 210 | + if len(s) <= errMaxLen { |
| 211 | + return s |
| 212 | + } |
| 213 | + return s[:errMaxLen-3] + "..." |
| 214 | +} |
0 commit comments