Skip to content

Commit db82aef

Browse files
JAORMX and claude
authored
Add workload upgrade detection package (#5407)
* Add upgrade detection for registry workloads

  CLI and API users have no way to discover when a newer version of a registry-sourced MCP server is available; only Studio implements drift detection, in its frontend. Introduce a backend package that all clients can consume.

  Add pkg/workloads/upgrade with a Checker that compares a running workload's image tag against its registry entry (semver-aware, with a string fallback) and reports environment-variable and configuration (transport / permission-profile / network-isolation) drift. Comparison degrades safely to "unknown" for :latest, digest refs, repository changes, and non-registry-sourced workloads, so only a strictly-newer tag on the same repository yields "upgrade-available".

  This is the read-only detection core (RFC THV-0068, phase A); the apply path, API endpoints, and CLI follow in later changes.

  Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

* Address review feedback on upgrade detection

  - Lowercase an uppercase "V" tag prefix so semver comparison works; "V1.2.0" vs "V1.3.0" no longer falls through to undecidable and hides a real upgrade.
  - Drop the raw provider error from CheckResult.Reason (it is serialized into the API response and can leak internal addressing); log it at DEBUG and return a fixed string. Same for the CheckAll path.
  - Add a defensive default to the comparison switch so an unexpected value yields StatusUnknown rather than the least-safe StatusUpToDate.
  - Stop reporting network-isolation drift: the registry has no network-isolation field, so it fired for every isolated workload regardless of the candidate version. Remove the ConfigDrift field and the now-unused BoolChange type.

  Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 05ca226 commit db82aef

7 files changed

Lines changed: 1162 additions & 0 deletions

File tree

pkg/workloads/upgrade/checker.go

Lines changed: 223 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,223 @@
1+
// SPDX-FileCopyrightText: Copyright 2026 Stacklok, Inc.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package upgrade
5+
6+
import (
7+
"context"
8+
"errors"
9+
"fmt"
10+
"log/slog"
11+
12+
regtypes "github.com/stacklok/toolhive-core/registry/types"
13+
"github.com/stacklok/toolhive/pkg/registry"
14+
"github.com/stacklok/toolhive/pkg/runner"
15+
"github.com/stacklok/toolhive/pkg/secrets"
16+
)
17+
18+
// Checker determines whether registry-sourced workloads have an available
19+
// upgrade by comparing their current image and configuration against the
20+
// metadata the injected registry provider reports.
21+
type Checker struct {
22+
provider registry.Provider
23+
}
24+
25+
// NewChecker creates a Checker backed by the given registry provider.
26+
//
27+
// The provider is the source of truth for candidate image metadata; callers
28+
// typically pass the shared singleton from registry.GetDefaultProvider so the
29+
// provider's response cache is reused across checks. It returns an error if the
30+
// provider is nil.
31+
func NewChecker(provider registry.Provider) (*Checker, error) {
32+
if provider == nil {
33+
return nil, fmt.Errorf("registry provider must not be nil")
34+
}
35+
return &Checker{provider: provider}, nil
36+
}
37+
38+
// Check evaluates a single workload's RunConfig against the registry and
39+
// returns the upgrade status. It never mutates the supplied config. Per-item
40+
// problems (missing server, unparsable tags, non-image entries) are encoded in
41+
// the returned CheckResult's Status/Reason rather than returned as an error;
42+
// an error is returned only for an invalid call (nil config).
43+
func (c *Checker) Check(_ context.Context, cfg *runner.RunConfig) (*CheckResult, error) {
44+
if cfg == nil {
45+
return nil, fmt.Errorf("run config must not be nil")
46+
}
47+
48+
result := &CheckResult{
49+
WorkloadName: cfg.Name,
50+
RegistryServer: cfg.RegistryServerName,
51+
CurrentImage: cfg.Image,
52+
}
53+
54+
if cfg.RegistryServerName == "" {
55+
result.Status = StatusNotRegistrySourced
56+
return result, nil
57+
}
58+
59+
server, err := c.provider.GetServer(cfg.RegistryServerName)
60+
if err != nil {
61+
if errors.Is(err, registry.ErrServerNotFound) {
62+
result.Status = StatusServerNotFound
63+
return result, nil
64+
}
65+
// Keep the detailed provider error out of the result: Reason is
66+
// serialized into the HTTP response, and for an unreachable or
67+
// misconfigured registry the raw error can carry internal addressing
68+
// (e.g. "dial tcp 10.x.x.x:443: ..."). Log it for operators instead.
69+
slog.Debug("registry lookup failed", "server", cfg.RegistryServerName, "error", err)
70+
result.Status = StatusUnknown
71+
result.Reason = "registry lookup failed"
72+
return result, nil
73+
}
74+
75+
imgMeta, ok := server.(*regtypes.ImageMetadata)
76+
if !ok {
77+
result.Status = StatusUnknown
78+
result.Reason = fmt.Sprintf("registry entry %q is not a container image (cannot determine upgrade)", cfg.RegistryServerName)
79+
return result, nil
80+
}
81+
82+
result.CandidateImage = imgMeta.Image
83+
84+
comparison, reason := compareImageTags(cfg.Image, imgMeta.Image)
85+
switch comparison {
86+
case comparisonNewer:
87+
result.Status = StatusUpgradeAvailable
88+
result.EnvVarDrift = computeEnvDrift(cfg, imgMeta)
89+
result.ConfigDrift = computeConfigDrift(cfg, imgMeta)
90+
case comparisonSameOrOlder:
91+
result.Status = StatusUpToDate
92+
case comparisonUndecidable:
93+
result.Status = StatusUnknown
94+
result.Reason = reason
95+
default:
96+
// Defensive: a future tagComparison value (or an unset zero value) must
97+
// not fall through to the least-safe StatusUpToDate. Treat anything
98+
// unexpected as unknown.
99+
result.Status = StatusUnknown
100+
}
101+
102+
return result, nil
103+
}
104+
105+
// CheckAll evaluates a batch of workloads. It never returns an error: each
106+
// workload's outcome (including per-item failures) is encoded in its own
107+
// CheckResult. The returned slice preserves the input order. Nil entries in the
108+
// input are skipped.
109+
func (c *Checker) CheckAll(ctx context.Context, configs []*runner.RunConfig) []*CheckResult {
110+
results := make([]*CheckResult, 0, len(configs))
111+
for _, cfg := range configs {
112+
if cfg == nil {
113+
continue
114+
}
115+
// Check only errors on a nil config, which we already guarded against,
116+
// so the error here is unreachable; encode defensively rather than drop.
117+
res, err := c.Check(ctx, cfg)
118+
if err != nil {
119+
slog.Debug("upgrade check failed", "workload", cfg.Name, "error", err)
120+
res = &CheckResult{
121+
WorkloadName: cfg.Name,
122+
Status: StatusUnknown,
123+
Reason: "check failed",
124+
}
125+
}
126+
results = append(results, res)
127+
}
128+
return results
129+
}
130+
131+
// computeEnvDrift reports the candidate environment variables the workload does
132+
// not currently satisfy. A variable is considered satisfied if it appears as a
133+
// plain env var key in the config, or as the target of one of the config's
134+
// secret parameters. Removed is left unpopulated (best-effort, forward-compat).
135+
//
136+
// It treats the config as read-only. Returns nil when there is no drift.
137+
func computeEnvDrift(cfg *runner.RunConfig, imgMeta *regtypes.ImageMetadata) *EnvVarDrift {
138+
satisfied := make(map[string]struct{}, len(cfg.EnvVars)+len(cfg.Secrets))
139+
for k := range cfg.EnvVars {
140+
satisfied[k] = struct{}{}
141+
}
142+
for _, s := range cfg.Secrets {
143+
parsed, err := secrets.ParseSecretParameter(s)
144+
if err != nil {
145+
// Malformed secret parameters can't satisfy a variable; skip them.
146+
continue
147+
}
148+
if parsed.Target != "" {
149+
satisfied[parsed.Target] = struct{}{}
150+
}
151+
}
152+
153+
var added []EnvVarInfo
154+
for _, ev := range imgMeta.EnvVars {
155+
if ev == nil {
156+
continue
157+
}
158+
if _, ok := satisfied[ev.Name]; ok {
159+
continue
160+
}
161+
added = append(added, toEnvVarInfo(ev))
162+
}
163+
164+
if len(added) == 0 {
165+
return nil
166+
}
167+
return &EnvVarDrift{Added: added}
168+
}
169+
170+
// computeConfigDrift reports posture differences between the workload's current
171+
// configuration and the candidate registry entry. Each field is nil when that
172+
// aspect did not drift or could not be compared.
173+
//
174+
// The permission profile is compared against imgMeta.Permissions.Name (a
175+
// *permissions.Profile, not a string). Comparison degrades gracefully: when the
176+
// candidate has no profile, or the workload's profile is a custom name/path
177+
// that has no registry analogue, that dimension is not reported as drift unless
178+
// both sides are known and differ. It treats the config as read-only.
179+
func computeConfigDrift(cfg *runner.RunConfig, imgMeta *regtypes.ImageMetadata) *ConfigDrift {
180+
drift := &ConfigDrift{}
181+
182+
// Transport: compare the workload's transport string against the registry
183+
// entry's transport. GetTransport() may return an empty string when the
184+
// registry entry does not declare one; only report drift when both are set.
185+
currentTransport := cfg.Transport.String()
186+
candidateTransport := imgMeta.GetTransport()
187+
if candidateTransport != "" && currentTransport != "" && currentTransport != candidateTransport {
188+
drift.Transport = &StringChange{From: currentTransport, To: candidateTransport}
189+
}
190+
191+
// Permission profile: compare names. The candidate name is only known when
192+
// the registry entry carries a profile with a non-empty Name.
193+
candidateProfile := ""
194+
if imgMeta.Permissions != nil {
195+
candidateProfile = imgMeta.Permissions.Name
196+
}
197+
currentProfile := cfg.PermissionProfileNameOrPath
198+
if candidateProfile != "" && currentProfile != "" && currentProfile != candidateProfile {
199+
drift.PermissionProfile = &StringChange{From: currentProfile, To: candidateProfile}
200+
}
201+
202+
if drift.Transport == nil && drift.PermissionProfile == nil {
203+
return nil
204+
}
205+
return drift
206+
}
207+
208+
// toEnvVarInfo converts a registry EnvVar into the drift-report shape, clearing
209+
// the Default value when the variable is a secret to avoid leaking sensitive
210+
// data into reports that may be logged or returned over the API.
211+
func toEnvVarInfo(ev *regtypes.EnvVar) EnvVarInfo {
212+
info := EnvVarInfo{
213+
Name: ev.Name,
214+
Description: ev.Description,
215+
Required: ev.Required,
216+
Secret: ev.Secret,
217+
Default: ev.Default,
218+
}
219+
if info.Secret {
220+
info.Default = ""
221+
}
222+
return info
223+
}

0 commit comments

Comments
 (0)