Skip to content

Commit 2d6cac1

Browse files
bdehamerCopilotCopilot
authored
Cache 404 responses for unknown artifacts (#66)
* implement cache for unknown artifacts Signed-off-by: Brian DeHamer <bdehamer@github.com> * Make `NoArtifactError.Error()` nil-safe (#67) * Initial plan * Make NoArtifactError.Error() nil-safe Co-authored-by: bdehamer <398027+bdehamer@users.noreply.github.com> Agent-Logs-Url: https://github.com/github/deployment-tracker/sessions/3eb7150d-f2e1-4ef9-8452-686a5ddb22f0 --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: bdehamer <398027+bdehamer@users.noreply.github.com> * Add info log when digest is cached as unknown Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --------- Signed-off-by: Brian DeHamer <bdehamer@github.com> Co-authored-by: Copilot <198982749+Copilot@users.noreply.github.com> Co-authored-by: bdehamer <398027+bdehamer@users.noreply.github.com> Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
1 parent 6285f23 commit 2d6cac1

File tree

4 files changed

+215
-1
lines changed

4 files changed

+215
-1
lines changed

internal/controller/controller.go

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,11 @@ const (
3131
EventCreated = "CREATED"
3232
// EventDeleted indicates that a pod has been deleted.
3333
EventDeleted = "DELETED"
34+
35+
// unknownArtifactTTL is the TTL for cached 404 responses from the
36+
// deployment record API. Once an artifact is known to be missing,
37+
// we suppress further API calls for this duration.
38+
unknownArtifactTTL = 1 * time.Hour
3439
)
3540

3641
type ttlCache interface {
@@ -66,6 +71,9 @@ type Controller struct {
6671
// post requests are idempotent, so if this cache fails due to
6772
// restarts or other events, nothing will break.
6873
observedDeployments ttlCache
74+
// best effort cache to suppress API calls for artifacts that
75+
// returned a 404 (no artifact found). Keyed by image digest.
76+
unknownArtifacts ttlCache
6977
}
7078

7179
// New creates a new deployment tracker controller.
@@ -113,6 +121,7 @@ func New(clientset kubernetes.Interface, metadataAggregator podMetadataAggregato
113121
apiClient: apiClient,
114122
cfg: cfg,
115123
observedDeployments: amcache.NewExpiring(),
124+
unknownArtifacts: amcache.NewExpiring(),
116125
}
117126

118127
// Add event handlers to the informer
@@ -442,6 +451,16 @@ func (c *Controller) recordContainer(ctx context.Context, pod *corev1.Pod, conta
442451
return fmt.Errorf("invalid status: %s", status)
443452
}
444453

454+
// Check if this artifact was previously unknown (404 from the API)
455+
if _, exists := c.unknownArtifacts.Get(digest); exists {
456+
dtmetrics.PostDeploymentRecordUnknownArtifactCacheHit.Inc()
457+
slog.Debug("Artifact previously returned 404, skipping post",
458+
"deployment_name", dn,
459+
"digest", digest,
460+
)
461+
return nil
462+
}
463+
445464
// Extract image name and tag
446465
imageName, version := ociutil.ExtractName(container.Image)
447466

@@ -471,9 +490,14 @@ func (c *Controller) recordContainer(ctx context.Context, pod *corev1.Pod, conta
471490
)
472491

473492
if err := c.apiClient.PostOne(ctx, record); err != nil {
474-
// Return if no artifact is found
493+
// Return if no artifact is found and cache the digest
475494
var noArtifactErr *deploymentrecord.NoArtifactError
476495
if errors.As(err, &noArtifactErr) {
496+
c.unknownArtifacts.Set(digest, true, unknownArtifactTTL)
497+
slog.Info("No artifact found, digest cached as unknown",
498+
"deployment_name", dn,
499+
"digest", digest,
500+
)
477501
return nil
478502
}
479503

Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
package controller
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"sync"
7+
"testing"
8+
"time"
9+
10+
"github.com/github/deployment-tracker/pkg/deploymentrecord"
11+
"github.com/stretchr/testify/assert"
12+
"github.com/stretchr/testify/require"
13+
corev1 "k8s.io/api/core/v1"
14+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
15+
amcache "k8s.io/apimachinery/pkg/util/cache"
16+
)
17+
18+
// mockPoster records all PostOne calls and returns a configurable error.
19+
type mockPoster struct {
20+
mu sync.Mutex
21+
calls int
22+
lastErr error
23+
}
24+
25+
func (m *mockPoster) PostOne(_ context.Context, _ *deploymentrecord.DeploymentRecord) error {
26+
m.mu.Lock()
27+
defer m.mu.Unlock()
28+
m.calls++
29+
return m.lastErr
30+
}
31+
32+
func (m *mockPoster) getCalls() int {
33+
m.mu.Lock()
34+
defer m.mu.Unlock()
35+
return m.calls
36+
}
37+
38+
// newTestController creates a minimal Controller suitable for unit-testing
39+
// recordContainer without a real Kubernetes cluster.
40+
func newTestController(poster *mockPoster) *Controller {
41+
return &Controller{
42+
apiClient: poster,
43+
cfg: &Config{
44+
Template: "{{namespace}}/{{deploymentName}}/{{containerName}}",
45+
LogicalEnvironment: "test",
46+
PhysicalEnvironment: "test",
47+
Cluster: "test",
48+
},
49+
observedDeployments: amcache.NewExpiring(),
50+
unknownArtifacts: amcache.NewExpiring(),
51+
}
52+
}
53+
54+
// testPod returns a pod with a single container and a known digest.
55+
func testPod(digest string) (*corev1.Pod, corev1.Container) {
56+
container := corev1.Container{
57+
Name: "app",
58+
Image: "nginx:latest",
59+
}
60+
pod := &corev1.Pod{
61+
ObjectMeta: metav1.ObjectMeta{
62+
Name: "test-pod",
63+
Namespace: "default",
64+
OwnerReferences: []metav1.OwnerReference{{
65+
APIVersion: "apps/v1",
66+
Kind: "ReplicaSet",
67+
Name: "test-deployment-abc123",
68+
}},
69+
},
70+
Status: corev1.PodStatus{
71+
ContainerStatuses: []corev1.ContainerStatus{{
72+
Name: "app",
73+
ImageID: fmt.Sprintf("docker-pullable://nginx@%s", digest),
74+
}},
75+
},
76+
}
77+
return pod, container
78+
}
79+
80+
func TestRecordContainer_UnknownArtifactCachePopulatedOn404(t *testing.T) {
81+
t.Parallel()
82+
digest := "sha256:unknown404digest"
83+
poster := &mockPoster{
84+
lastErr: &deploymentrecord.NoArtifactError{},
85+
}
86+
ctrl := newTestController(poster)
87+
pod, container := testPod(digest)
88+
89+
// First call should hit the API and get a 404
90+
err := ctrl.recordContainer(context.Background(), pod, container, deploymentrecord.StatusDeployed, EventCreated, nil)
91+
require.NoError(t, err)
92+
assert.Equal(t, 1, poster.getCalls())
93+
94+
// Digest should now be in the unknown artifacts cache
95+
_, exists := ctrl.unknownArtifacts.Get(digest)
96+
assert.True(t, exists, "digest should be cached after 404")
97+
}
98+
99+
func TestRecordContainer_UnknownArtifactCacheSkipsAPICall(t *testing.T) {
100+
t.Parallel()
101+
digest := "sha256:cacheddigest"
102+
poster := &mockPoster{
103+
lastErr: &deploymentrecord.NoArtifactError{},
104+
}
105+
ctrl := newTestController(poster)
106+
pod, container := testPod(digest)
107+
108+
// First call — API returns 404, populates cache
109+
err := ctrl.recordContainer(context.Background(), pod, container, deploymentrecord.StatusDeployed, EventCreated, nil)
110+
require.NoError(t, err)
111+
assert.Equal(t, 1, poster.getCalls())
112+
113+
// Second call — should be served from cache, no API call
114+
err = ctrl.recordContainer(context.Background(), pod, container, deploymentrecord.StatusDeployed, EventCreated, nil)
115+
require.NoError(t, err)
116+
assert.Equal(t, 1, poster.getCalls(), "API should not be called for cached unknown artifact")
117+
}
118+
119+
func TestRecordContainer_UnknownArtifactCacheAppliesToDecommission(t *testing.T) {
120+
t.Parallel()
121+
digest := "sha256:decommission404"
122+
poster := &mockPoster{
123+
lastErr: &deploymentrecord.NoArtifactError{},
124+
}
125+
ctrl := newTestController(poster)
126+
pod, container := testPod(digest)
127+
128+
// Deploy call — 404, populates cache
129+
err := ctrl.recordContainer(context.Background(), pod, container, deploymentrecord.StatusDeployed, EventCreated, nil)
130+
require.NoError(t, err)
131+
assert.Equal(t, 1, poster.getCalls())
132+
133+
// Decommission call for same digest — should skip API
134+
err = ctrl.recordContainer(context.Background(), pod, container, deploymentrecord.StatusDecommissioned, EventDeleted, nil)
135+
require.NoError(t, err)
136+
assert.Equal(t, 1, poster.getCalls(), "decommission should also be skipped for cached unknown artifact")
137+
}
138+
139+
func TestRecordContainer_UnknownArtifactCacheExpires(t *testing.T) {
140+
t.Parallel()
141+
digest := "sha256:expiringdigest"
142+
poster := &mockPoster{
143+
lastErr: &deploymentrecord.NoArtifactError{},
144+
}
145+
ctrl := newTestController(poster)
146+
pod, container := testPod(digest)
147+
148+
// Seed the cache with a very short TTL to test expiry
149+
ctrl.unknownArtifacts.Set(digest, true, 50*time.Millisecond)
150+
151+
// Immediately — should be cached
152+
err := ctrl.recordContainer(context.Background(), pod, container, deploymentrecord.StatusDeployed, EventCreated, nil)
153+
require.NoError(t, err)
154+
assert.Equal(t, 0, poster.getCalls(), "should skip API while cached")
155+
156+
// Wait for expiry
157+
time.Sleep(100 * time.Millisecond)
158+
159+
// After expiry — should call API again
160+
err = ctrl.recordContainer(context.Background(), pod, container, deploymentrecord.StatusDeployed, EventCreated, nil)
161+
require.NoError(t, err)
162+
assert.Equal(t, 1, poster.getCalls(), "should call API after cache expiry")
163+
}
164+
165+
func TestRecordContainer_SuccessfulPostDoesNotPopulateUnknownCache(t *testing.T) {
166+
t.Parallel()
167+
digest := "sha256:knowndigest"
168+
poster := &mockPoster{lastErr: nil} // success
169+
ctrl := newTestController(poster)
170+
pod, container := testPod(digest)
171+
172+
err := ctrl.recordContainer(context.Background(), pod, container, deploymentrecord.StatusDeployed, EventCreated, nil)
173+
require.NoError(t, err)
174+
assert.Equal(t, 1, poster.getCalls())
175+
176+
// Digest should NOT be in the unknown artifacts cache
177+
_, exists := ctrl.unknownArtifacts.Get(digest)
178+
assert.False(t, exists, "successful post should not cache digest as unknown")
179+
}

pkg/deploymentrecord/client.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,9 @@ type NoArtifactError struct {
170170
}
171171

172172
func (n *NoArtifactError) Error() string {
173+
if n == nil || n.err == nil {
174+
return "no artifact found"
175+
}
173176
return fmt.Sprintf("no artifact found: %s", n.err.Error())
174177
}
175178

pkg/dtmetrics/prom.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,4 +88,12 @@ var (
8888
Help: "The total number of non-retryable client failures",
8989
},
9090
)
91+
92+
//nolint: revive
93+
PostDeploymentRecordUnknownArtifactCacheHit = promauto.NewCounter(
94+
prometheus.CounterOpts{
95+
Name: "deptracker_post_record_unknown_artifact_cache_hit",
96+
Help: "The total number of API calls avoided due to a cached 404 for an unknown artifact digest",
97+
},
98+
)
9199
)

0 commit comments

Comments
 (0)