@@ -99,6 +99,10 @@ type MonitorConfig struct {
9999// NewMonitor creates a new Monitor.
100100func NewMonitor (cfg MonitorConfig ) * Monitor {
101101 rl := workqueue .DefaultTypedControllerRateLimiter [string ]()
102+ // We use a rate-limiting queue to:
103+ // 1. Deduplicate requests from the daemon server and the periodic monitor loop.
104+ // 2. Decouple the daemon server from processing inline, allowing it to just enqueue items.
105+ // 3. Benefit from automatic exponential backoff for retries on failure.
102106 queue := workqueue .NewTypedRateLimitingQueueWithConfig (rl , workqueue.TypedRateLimitingQueueConfig [string ]{
103107 Name : "metis-daemon-monitor" ,
104108 })
@@ -121,18 +125,17 @@ func NewMonitor(cfg MonitorConfig) *Monitor {
121125 }
122126
123127 return & Monitor {
124- queue : queue ,
125- nncClient : cfg .NNCClient ,
126- nodeName : cfg .NodeName ,
127- store : cfg .Store ,
128- logger : cfg .Logger ,
129- lowUtilizationTimers : make (map [string ]time.Time ),
130- GetPendingRequestsCount : cfg .GetPendingRequestsCount ,
131- cooldownPushbackInterval : cooldownPushbackInterval ,
132- drainingExpiration : drainingExpiration ,
133- monitorInterval : monitorInterval ,
134- lowUtilizationThreshold : DefaultLowUtilizationThreshold ,
135-
128+ queue : queue ,
129+ nncClient : cfg .NNCClient ,
130+ nodeName : cfg .NodeName ,
131+ store : cfg .Store ,
132+ logger : cfg .Logger ,
133+ lowUtilizationTimers : make (map [string ]time.Time ),
134+ GetPendingRequestsCount : cfg .GetPendingRequestsCount ,
135+ cooldownPushbackInterval : cooldownPushbackInterval ,
136+ drainingExpiration : drainingExpiration ,
137+ monitorInterval : monitorInterval ,
138+ lowUtilizationThreshold : DefaultLowUtilizationThreshold ,
136139 targetUtilizationAfterScaleUp : DefaultTargetUtilizationAfterScaleUp ,
137140 cooldownPushbackThreshold : DefaultCooldownPushbackThreshold ,
138141 sustainedLowUtilizationDuration : sustainedLowUtilizationDuration ,
@@ -143,8 +146,8 @@ func NewMonitor(cfg MonitorConfig) *Monitor {
143146func (m * Monitor ) Run (ctx context.Context , workers int ) {
144147 defer m .queue .ShutDown ()
145148
146- m .logger .Info ("Starting IPAM monitor " , "workers" , workers )
147- defer m .logger .Info ("Stopping IPAM monitor " )
149+ m .logger .Info ("Starting Metis Daemon Monitor " , "node" , m . nodeName , " workers" , workers , "interval" , m . monitorInterval )
150+ defer m .logger .Info ("Stopping Metis Daemon Monitor " )
148151
149152 // Periodic enqueuer
150153 go wait .UntilWithContext (ctx , func (ctx context.Context ) {
@@ -178,13 +181,16 @@ func (m *Monitor) processExpiredDrainingBlocks(ctx context.Context) {
178181 m .logger .Error (err , "failed to get networks for expired draining blocks check" )
179182 return
180183 }
184+
185+ nnc , err := m .getNodeNetworkConfig (ctx )
186+ if err != nil {
187+ m .logger .Error (err , "failed to get NodeNetworkConfig" )
188+ return
189+ }
190+ nncCopy := nnc .DeepCopy ()
191+ anyUpdated := false
192+
181193 for _ , network := range networks {
182- nnc , err := m .getNodeNetworkConfig (ctx )
183- if err != nil {
184- m .logger .Error (err , "failed to get NodeNetworkConfig" , "network" , network )
185- continue
186- }
187- nncCopy := nnc .DeepCopy ()
188194 var currentAllocation * nncv1.Allocation
189195 for i := range nncCopy .Spec .Allocations {
190196 if nncCopy .Spec .Allocations [i ].Network == network {
@@ -193,15 +199,19 @@ func (m *Monitor) processExpiredDrainingBlocks(ctx context.Context) {
193199 }
194200 }
195201
196- updated , err := m .handleExpiredDrainingBlocks (ctx , network , nncCopy , currentAllocation )
202+ updated , err := m .handleExpiredDrainingBlocksPerNetwork (ctx , network , nncCopy , currentAllocation )
197203 if err != nil {
198204 m .logger .Error (err , "failed to handle expired draining blocks" , "network" , network )
199205 continue
200206 }
201207 if updated {
202- if err := m .patchNNC (ctx , nncCopy ); err != nil {
203- m .logger .Error (err , "failed to patch NNC for expired draining blocks" , "network" , network )
204- }
208+ anyUpdated = true
209+ }
210+ }
211+
212+ if anyUpdated {
213+ if err := m .patchNNC (ctx , nncCopy ); err != nil {
214+ m .logger .Error (err , "failed to patch NNC for expired draining blocks" )
205215 }
206216 }
207217}
@@ -493,7 +503,12 @@ func (m *Monitor) maybeDrainExcessive(ctx context.Context, network string, info
493503 return false
494504}
495505
496- func (m * Monitor ) handleExpiredDrainingBlocks (ctx context.Context , network string , nncCopy * nncv1.NodeNetworkConfig , currentAllocation * nncv1.Allocation ) (bool , error ) {
506+ func (m * Monitor ) handleExpiredDrainingBlocksPerNetwork (ctx context.Context , network string , nncCopy * nncv1.NodeNetworkConfig , currentAllocation * nncv1.Allocation ) (bool , error ) {
507+ deletingBlocks , err := m .store .GetDeletingCIDRBlocks (ctx , network )
508+ if err != nil {
509+ return false , fmt .Errorf ("failed to query deleting cidr blocks: %w" , err )
510+ }
511+
497512 expiredBlocks , err := m .store .FindAndMarkExpiredDrainingCIDRBlocks (ctx , network , m .drainingExpiration )
498513 if err != nil {
499514 return false , fmt .Errorf ("failed to query and mark draining cidr blocks: %w" , err )
@@ -502,11 +517,6 @@ func (m *Monitor) handleExpiredDrainingBlocks(ctx context.Context, network strin
502517 var reducePods int
503518 updated := false
504519
505- deletingBlocks , err := m .store .GetDeletingCIDRBlocks (ctx )
506- if err != nil {
507- return false , fmt .Errorf ("failed to query deleting cidr blocks: %w" , err )
508- }
509-
510520 statusMap := make (map [string ]nncv1.PodCIDR )
511521 for _ , podCIDR := range nncCopy .Status .PodCIDRs {
512522 if podCIDR .Network == network {
@@ -539,6 +549,7 @@ func (m *Monitor) handleExpiredDrainingBlocks(ctx context.Context, network strin
539549
540550 // Reconcile blocks that are in Deleting state in the local DB but failed to be added
541551 // to the CRD's Spec.ReleasableCIDRs in a previous iteration.
552+ // Removing an entry from the status and from the releasable set is a single atomic step, and it is performed by other controllers.
542553 for _ , block := range deletingBlocks {
543554 podCIDR , inStatus := statusMap [block .CIDR ]
544555 if inStatus && ! releasableSet [block .CIDR ] {
0 commit comments