Skip to content

Commit b193056

Browse files
committed
fix: address PR comments
1 parent d23cc05 commit b193056

3 files changed

Lines changed: 40 additions & 1 deletion

File tree

internal/controller/reporting.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,16 @@ func (c *Controller) processSyncEvents(ctx context.Context, syncClusterPods []an
183183
"errors", len(jobStatus.Errors),
184184
)
185185

186+
// If the job failed, don't populate observedDeployments — records may not
187+
// have been created, and suppressing re-posts would delay self-healing.
188+
// Only populate unknownArtifacts from errors so we avoid redundant 404s.
189+
if jobStatus.Status == "failed" {
190+
slog.Warn("Cluster job failed, skipping observedDeployments cache fill",
191+
"job_id", jobResp.JobID,
192+
)
193+
return nil
194+
}
195+
186196
c.fillCachesFromJobResult(syncRecords, jobResp, jobStatus)
187197
return nil
188198
}

internal/controller/reporting_test.go

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,35 @@ func TestProcessSyncEvents_JobWaitFailed(t *testing.T) {
331331
assert.True(t, exists, "observedDeployments should be populated from submitted records on wait failure")
332332
}
333333

334+
func TestProcessSyncEvents_JobStatusFailed(t *testing.T) {
335+
t.Parallel()
336+
digest := "sha256:abc123"
337+
poster := &mockPoster{
338+
jobResp: &deploymentrecord.JobResponse{JobID: 42},
339+
jobStatus: &deploymentrecord.JobStatus{
340+
JobID: 42,
341+
Status: "failed",
342+
Errors: []deploymentrecord.JobError{
343+
{Name: "nginx", Cause: "error"},
344+
},
345+
},
346+
}
347+
ctrl := newTestController(poster)
348+
ctrl.workloadResolver = &mockResolver{name: "test-deploy"}
349+
pod := makeTestPod("app", "test-deploy-abc123", digest, "ReplicaSet")
350+
351+
err := ctrl.processSyncEvents(context.Background(), []any{pod})
352+
require.NoError(t, err, "failed job should not block startup")
353+
assert.Equal(t, 1, poster.getCreateClusterJobCalls())
354+
assert.Equal(t, 1, poster.getWaitForClusterJobCalls())
355+
356+
// observedDeployments should NOT be populated — records may not have been
357+
// created, and suppressing re-posts would delay self-healing.
358+
cacheKey := getCacheKey(EventCreated, "default/test-deploy/app", digest)
359+
_, exists := ctrl.observedDeployments.Get(cacheKey)
360+
assert.False(t, exists, "observedDeployments should not be populated when job failed")
361+
}
362+
334363
func TestMakeSyncRecords_TerminalJobPodIncluded(t *testing.T) {
335364
t.Parallel()
336365
digest := "sha256:terminal-job-digest"

pkg/deploymentrecord/client.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -504,7 +504,7 @@ func (c *Client) doWithRetry(ctx context.Context, method, targetURL string, body
504504
"status_code", resp.StatusCode,
505505
"resp_msg", string(respBody),
506506
)
507-
return nil, resp.StatusCode, &ClientError{err: fmt.Errorf("unexpected client err with status code %d", resp.StatusCode)}
507+
return respBody, resp.StatusCode, &ClientError{err: fmt.Errorf("unexpected client err with status code %d", resp.StatusCode)}
508508
default:
509509
// Retry with backoff
510510
dtmetrics.PostDeploymentRecordSoftFail.Inc()

0 commit comments

Comments
 (0)