@@ -2,7 +2,6 @@ package controller
22
33import (
44 "context"
5- "encoding/json"
65 "errors"
76 "fmt"
87 "log/slog"
@@ -118,42 +117,73 @@ func (c *Controller) processEvent(ctx context.Context, event PodEvent) error {
118117}
119118
// processSyncEvents, as it reads after this change (lines marked "+" plus the
// unmarked context lines; lines marked "-" are the implementation being removed):
//  1. Returns nil without doing anything when c.cfg.BulkClusterSync is false.
//  2. Builds records with makeSyncRecords and returns nil if there are none.
//  3. Submits them with apiClient.CreateClusterJob. A ClusterJobConflictError or
//     ClusterNoRepositoriesError is logged and swallowed (nil return); the
//     conflict case also calls fillCachesFromSubmitted. Any other error is
//     logged and returned wrapped with %w.
//  4. Waits for the job under a 5-minute context timeout. If the wait returns an
//     error, it is logged, fillCachesFromSubmitted runs, and nil is returned.
//  5. Otherwise logs the reported job status and calls fillCachesFromJobResult.
120119func (c * Controller ) processSyncEvents (ctx context.Context , syncClusterPods []any ) error {
120+ if ! c .cfg .BulkClusterSync {
121+ slog .Info ("Async cluster sync disabled, skipping startup sync" )
122+ return nil
123+ }
124+
121125 syncRecords := c .makeSyncRecords (ctx , syncClusterPods )
122126 if len (syncRecords ) == 0 {
123127 slog .Info ("No sync records to post" )
124128 return nil
125129 }
126130
127- respBody , err := c .apiClient .PostCluster (ctx , syncRecords , c .cfg .Cluster )
128- var clusterNoRepositoriesError * deploymentrecord.ClusterNoRepositoriesError
131+ jobResp , err := c .apiClient .CreateClusterJob (ctx , syncRecords , c .cfg .Cluster )
129132 if err != nil {
130- if errors .As (err , & clusterNoRepositoriesError ) {
131- slog .Info ("Cluster sync found no creatable records" ,
133+ var conflictErr * deploymentrecord.ClusterJobConflictError
134+ var noReposErr * deploymentrecord.ClusterNoRepositoriesError
135+
136+ switch {
137+ case errors .As (err , & conflictErr ):
138+ slog .Warn ("Cluster job already in progress, skipping startup sync" ,
132139 "org" , c .cfg .Organization ,
133140 )
141+ c .fillCachesFromSubmitted (syncRecords )
134142 return nil
143+
// NOTE(review): the removed code logged "Cluster sync found no creatable
// records" for this same error type; confirm the new wording below matches
// what ClusterNoRepositoriesError actually signals.
144+ case errors .As (err , & noReposErr ):
145+ slog .Info ("Async cluster endpoint not available, skipping startup sync" ,
146+ "org" , c .cfg .Organization ,
147+ )
148+ return nil
149+
// NOTE(review): this branch both logs the error and returns it, so a caller
// that also logs returned errors would report it twice.
150+ default :
151+ slog .Error ("Failed to create cluster job" ,
152+ "error" , err ,
153+ "record_count" , len (syncRecords ),
154+ )
155+ return fmt .Errorf ("failed to create cluster job: %w" , err )
135156 }
136- slog .Error ("Failed to post sync cluster records" ,
137- "error" , err ,
138- "record_count" , len (syncRecords ),
157+ }
158+
// NOTE(review): jobResp is dereferenced from here on without a nil check;
// this assumes CreateClusterJob returns a non-nil response whenever err is
// nil — confirm against the client.
159+ if len (jobResp .Errors ) > 0 {
160+ slog .Warn ("Some deployments rejected from job submission" ,
161+ "job_id" , jobResp .JobID ,
162+ "rejected_count" , len (jobResp .Errors ),
139163 )
140- return fmt .Errorf ("failed to post sync cluster records: %w" , err )
141164 }
142- var deploymentRecords deploymentrecord.RecordsClusterResp
143- err = json .Unmarshal (respBody , & deploymentRecords )
165+
166+ // Wait for job completion with a timeout to prevent indefinite startup delay.
167+ jobCtx , cancel := context .WithTimeout (ctx , 5 * time .Minute )
168+ defer cancel ()
169+ jobStatus , err := c .apiClient .WaitForClusterJob (jobCtx , c .cfg .Cluster , jobResp .JobID )
144170 if err != nil {
145- slog .Error ("Failed to unmarshall response" ,
171+ slog .Error ("Failed waiting for cluster job, filling caches from submitted records" ,
172+ "job_id" , jobResp .JobID ,
146173 "error" , err ,
147- "record_count" , len (syncRecords ),
148174 )
175+ c .fillCachesFromSubmitted (syncRecords )
149176 return nil
150177 }
151- slog .Info ("Successfully posted sync cluster records" ,
152- "created" , len (deploymentRecords .DeploymentRecords ),
153- "errors" , len (deploymentRecords .Errors ),
178+
// NOTE(review): jobStatus.Status is only logged; nothing in this function
// branches on it, so the caches are filled the same way whatever status is
// reported. Confirm that WaitForClusterJob surfaces failed jobs as errors.
179+ slog .Info ("Cluster job completed" ,
180+ "job_id" , jobResp .JobID ,
181+ "status" , jobStatus .Status ,
182+ "total_count" , jobStatus .TotalCount ,
183+ "errors" , len (jobStatus .Errors ),
184+ )
155185
156- c .fillCaches ( deploymentRecords )
186+ c .fillCachesFromJobResult ( syncRecords , jobResp , jobStatus )
157187 return nil
158188}
159189
@@ -224,20 +254,51 @@ func (c *Controller) makeSyncRecords(ctx context.Context, syncClusterPods []any)
224254 return syncRecords
225255}
226256
227- func (c * Controller ) fillCaches (deploymentRecords deploymentrecord.RecordsClusterResp ) {
228- slog .Info ("Filling caches after posting sync cluster records" )
229- // Fill observedDeployments cache with successful digests
230- for _ , r := range deploymentRecords .DeploymentRecords {
257+ // fillCachesFromSubmitted populates the observedDeployments cache from the
258+ // records we submitted, without waiting for a response. Used when we can't
259+ // get a response (409 conflict, wait timeout, job failure).
//
// Each record is stored under getCacheKey(EventCreated, r.DeploymentName,
// r.Digest) with the value true; 2*time.Minute is passed as the third
// argument to Set (presumably the entry's TTL — confirm against the cache type).
//
// NOTE(review): the call sites visible in processSyncEvents are the
// ClusterJobConflictError branch and the WaitForClusterJob error branch.
// Whether a failed job reaches this function depends on WaitForClusterJob
// returning an error for it — confirm.
260+ func (c * Controller ) fillCachesFromSubmitted (records []* deploymentrecord.Record ) {
261+ slog .Info ("Filling observedDeployments cache from submitted records" ,
262+ "count" , len (records ),
263+ )
264+ for _ , r := range records {
231265 cacheKey := getCacheKey (EventCreated , r .DeploymentName , r .Digest )
232266 c .observedDeployments .Set (cacheKey , true , 2 * time .Minute )
233267 }
268+ }
269+
270+ // fillCachesFromJobResult populates both caches after an async job completes.
271+ // observedDeployments is filled from submitted records, and unknownArtifacts
272+ // is filled from error responses with cause "not_found".
//
// Every submitted record is written to observedDeployments, whether or not
// its name appears in jobResp.Errors or jobStatus.Errors.
// NOTE(review): confirm that rejected records should still be marked observed.
// NOTE(review): jobResp and jobStatus are dereferenced without nil checks;
// this assumes the caller always passes non-nil values.
273+ func (c * Controller ) fillCachesFromJobResult (records []* deploymentrecord.Record , jobResp * deploymentrecord.JobResponse , jobStatus * deploymentrecord.JobStatus ) {
274+ slog .Info ("Filling caches after cluster job completion" ,
275+ "record_count" , len (records ),
276+ )
234277
235- // Fill unknownArtifacts cache with unknown digests
236- for _ , r := range deploymentRecords .Errors {
237- if r .Cause == "not_found" {
238- c .unknownArtifacts .Set (r .Digest , true , unknownArtifactTTL )
278+ // Build a name→digests lookup from submitted records so we can
279+ // key unknownArtifacts by digest (which is how recordContainer looks them up).
280+ // Multiple records can share the same image name with different digests,
281+ // so we only cache when the mapping is unambiguous (exactly one digest per name).
// NOTE(review): digests are appended without de-duplication, so two records
// with the same Name and the same Digest (for example under different
// DeploymentNames) produce len(digests) == 2 and the digest is not cached
// even though the mapping is unambiguous.
282+ nameToDigests := make (map [string ][]string , len (records ))
// NOTE(review): the first two statements of this loop repeat the body of
// fillCachesFromSubmitted's loop.
283+ for _ , r := range records {
284+ cacheKey := getCacheKey (EventCreated , r .DeploymentName , r .Digest )
285+ c .observedDeployments .Set (cacheKey , true , 2 * time .Minute )
286+ nameToDigests [r .Name ] = append (nameToDigests [r .Name ], r .Digest )
287+ }
288+
// NOTE(review): the parameter name "errors" shadows the imported errors
// package inside this closure. Harmless here because the closure never
// calls the package, but a name such as errs would avoid the shadowing.
289+ cacheUnknownDigests := func (errors []deploymentrecord.JobError ) {
290+ for _ , e := range errors {
291+ if e .Cause == "not_found" {
292+ if digests , ok := nameToDigests [e .Name ]; ok && len (digests ) == 1 {
293+ c .unknownArtifacts .Set (digests [0 ], true , unknownArtifactTTL )
294+ }
295+ }
239296 }
240297 }
298+
299+ // Fill unknownArtifacts from job submission and completion errors
300+ cacheUnknownDigests (jobResp .Errors )
301+ cacheUnknownDigests (jobStatus .Errors )
241302}
242303
243304// recordContainer records a single container's deployment info.
0 commit comments