Skip to content

Commit b2fa2e0

Browse files
committed
ctrl: kubeletConfig: handle paused MCPs
Having a paused MCP should prevent updating the corresponding config map for the specified node group. So far, the code did not consider the case of paused MCPs, which led to creating/updating the config map with the newest kubeletconfig CR updates; this caused a mismatch between the configuration in the config map and the one reflected on the NRTs. In this commit, we modify the kubeletconfig controller to handle paused MCPs such that it skips updating existing RTE config maps; for new node groups whose MCP is paused, the controller fetches the old machineConfig (from before the pause) and creates the RTE config map based on the kubeletconfig data decoded from it. Signed-off-by: Shereen Haj <shajmakh@redhat.com>
1 parent 693a488 commit b2fa2e0

5 files changed

Lines changed: 486 additions & 37 deletions

File tree

internal/controller/kubeletconfig_controller.go

Lines changed: 95 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,9 @@ import (
4848
"github.com/k8stopologyawareschedwg/deployer/pkg/deployer/platform"
4949

5050
nropv1 "github.com/openshift-kni/numaresources-operator/api/v1"
51+
intkubeletconfig "github.com/openshift-kni/numaresources-operator/internal/kubeletconfig"
5152
"github.com/openshift-kni/numaresources-operator/internal/machineconfigpools"
53+
intreconcile "github.com/openshift-kni/numaresources-operator/internal/reconcile"
5254
"github.com/openshift-kni/numaresources-operator/pkg/apply"
5355
"github.com/openshift-kni/numaresources-operator/pkg/kubeletconfig"
5456
"github.com/openshift-kni/numaresources-operator/pkg/objectnames"
@@ -57,7 +59,8 @@ import (
5759
)
5860

5961
const (
60-
kubeletConfigRetryPeriod = 30 * time.Second
62+
kubeletConfigRetryPeriod = 30 * time.Second
63+
MachineConfigPoolPausedRetryPeriod = 2 * time.Minute
6164
)
6265

6366
const (
@@ -116,22 +119,20 @@ func (r *KubeletConfigReconciler) Reconcile(ctx context.Context, req ctrl.Reques
116119

117120
// KubeletConfig changes are expected to be sporadic, yet are important enough
118121
// to be made visible at kubernetes level. So we generate events to handle them
119-
cm, err := r.reconcileConfigMap(ctx, instance, req.NamespacedName)
120-
if err != nil {
121-
var klErr *InvalidKubeletConfig
122-
if errors.As(err, &klErr) {
123-
r.Recorder.Event(instance, "Normal", "ProcessSkip", "ignored kubelet config "+klErr.ObjectName)
124-
return ctrl.Result{}, nil
125-
}
126-
127-
klog.ErrorS(err, "failed to reconcile configmap", "controller", "kubeletconfig")
122+
cm, step := r.reconcileConfigMap(ctx, instance, req.NamespacedName)
123+
if step.Error != nil && step.ConditionInfo.Reason != intreconcile.EventProcessSkip {
124+
klog.ErrorS(step.Error, "failed to reconcile configmap", "controller", "kubeletconfig")
125+
r.Recorder.Event(instance, step.ConditionInfo.Type, step.ConditionInfo.Reason, step.ConditionInfo.Message)
126+
return step.Result, step.Error
127+
}
128128

129-
r.Recorder.Event(instance, "Warning", "ProcessFailed", "Failed to update RTE config from kubelet config "+req.NamespacedName.String())
130-
return ctrl.Result{}, err
129+
if step.ConditionInfo.Reason == intreconcile.EventProcessSuccess {
130+
step = step.WithMessage(fmt.Sprintf("Updated RTE config %s/%s from kubelet config %s", cm.Namespace, cm.Name, req.NamespacedName.String()))
131131
}
132132

133-
r.Recorder.Event(instance, "Normal", "ProcessOK", fmt.Sprintf("Updated RTE config %s/%s from kubelet config %s", cm.Namespace, cm.Name, req.NamespacedName.String()))
134-
return ctrl.Result{}, nil
133+
r.Recorder.Event(instance, step.ConditionInfo.Type, step.ConditionInfo.Reason, step.ConditionInfo.Message)
134+
135+
return step.Result, nil
135136
}
136137

137138
func (r *KubeletConfigReconciler) SetupWithManager(mgr ctrl.Manager) error {
@@ -197,25 +198,29 @@ func (e *InvalidKubeletConfig) Unwrap() error {
197198
return e.Err
198199
}
199200

200-
func (r *KubeletConfigReconciler) reconcileConfigMap(ctx context.Context, instance *nropv1.NUMAResourcesOperator, kcKey client.ObjectKey) (*corev1.ConfigMap, error) {
201+
func (r *KubeletConfigReconciler) reconcileConfigMap(ctx context.Context, instance *nropv1.NUMAResourcesOperator, kcKey client.ObjectKey) (*corev1.ConfigMap, intreconcile.Step) {
201202
// first check if the ConfigMap should be deleted
202203
// to save all the additional work related for create/update
203204
cm, deleted, err := r.deleteConfigMap(ctx, instance, kcKey)
204205
if deleted {
205-
return cm, err
206+
return cm, intreconcile.StepWarning(fmt.Errorf("Failed to update RTE config from kubelet config %s: %v", kcKey.Name, err))
206207
}
207208

208-
kcHandler, err := r.makeKCHandlerForPlatform(ctx, instance, kcKey)
209-
if err != nil {
210-
return nil, err
209+
kcHandler, step := r.makeKCHandlerForPlatform(ctx, instance, kcKey)
210+
if step.Error != nil {
211+
return nil, step
211212
}
213+
212214
kubeletConfig, err := kubeletconfig.MCOKubeletConfToKubeletConf(kcHandler.mcoKc)
213215
if err != nil {
214216
klog.ErrorS(err, "cannot extract KubeletConfiguration from MCO KubeletConfig", "name", kcKey.Name)
215-
return nil, err
217+
return nil, FailedConfigMapUpdateStep(kcKey.Name, err)
216218
}
217-
218-
return r.syncConfigMap(ctx, kubeletConfig, instance, kcHandler)
219+
cm, err = r.syncConfigMap(ctx, kubeletConfig, instance, kcHandler)
220+
if err != nil {
221+
return cm, FailedConfigMapUpdateStep(kcKey.Name, err)
222+
}
223+
return cm, step
219224
}
220225

221226
func (r *KubeletConfigReconciler) syncConfigMap(ctx context.Context, kubeletConfig *kubeletconfigv1beta1.KubeletConfiguration, instance *nropv1.NUMAResourcesOperator, kcHandler *kubeletConfigHandler) (*corev1.ConfigMap, error) {
@@ -244,63 +249,106 @@ func (r *KubeletConfigReconciler) syncConfigMap(ctx context.Context, kubeletConf
244249
return rendered, nil
245250
}
246251

247-
func (r *KubeletConfigReconciler) makeKCHandlerForPlatform(ctx context.Context, instance *nropv1.NUMAResourcesOperator, kcKey client.ObjectKey) (*kubeletConfigHandler, error) {
252+
func (r *KubeletConfigReconciler) makeKCHandlerForPlatform(ctx context.Context, instance *nropv1.NUMAResourcesOperator, kcKey client.ObjectKey) (*kubeletConfigHandler, intreconcile.Step) {
248253
switch r.Platform {
249254
case platform.OpenShift:
250255
mcoKc := &mcov1.KubeletConfig{}
251256
if err := r.Client.Get(ctx, kcKey, mcoKc); err != nil {
252-
return nil, err
257+
return nil, FailedConfigMapUpdateStep(kcKey.Name, err)
253258
}
254259

255260
mcps, err := machineconfigpools.GetListByNodeGroupsV1(ctx, r.Client, instance.Spec.NodeGroups)
256261
if err != nil {
257-
return nil, err
262+
return nil, FailedConfigMapUpdateStep(kcKey.Name, err)
258263
}
259264

260265
mcp, err := machineconfigpools.FindBySelector(mcps, mcoKc.Spec.MachineConfigPoolSelector)
261266
if err != nil {
262267
klog.ErrorS(err, "cannot find a matching mcp for MCO KubeletConfig", "name", kcKey.Name)
263268
var notFound *machineconfigpools.NotFound
264269
if errors.As(err, &notFound) {
265-
return nil, &InvalidKubeletConfig{
266-
ObjectName: kcKey.Name,
267-
Err: notFound,
268-
}
270+
return nil, intreconcile.StepNormalSkip(fmt.Errorf("%s: %v", kcKey, notFound))
269271
}
270-
return nil, err
272+
return nil, FailedConfigMapUpdateStep(kcKey.Name, err)
271273
}
272274

273275
klog.V(3).InfoS("matched MCP to MCO KubeletConfig", "kubeletconfig name", kcKey.Name, "MCP name", mcp.Name)
274276

275277
// nothing we care about, and we can't do much anyway
276278
if mcoKc.Spec.KubeletConfig == nil {
277279
klog.InfoS("detected KubeletConfig with empty payload, ignoring", "name", kcKey.Name)
278-
return nil, &InvalidKubeletConfig{ObjectName: kcKey.Name}
280+
return nil, intreconcile.StepNormalSkip(fmt.Errorf("Invalid KubeletConfig %s", kcKey.Name))
281+
}
282+
283+
if mcp.Spec.Paused {
284+
klog.InfoS("detected paused MCP", "name", mcp.Name)
285+
//if the CM exists -> just skip;
286+
//if the CM does not exist -> create it based on the current active machineConfig
287+
288+
expectedCMName := objectnames.GetComponentName(instance.Name, mcp.Name)
289+
existingCM := &corev1.ConfigMap{}
290+
if err := r.Client.Get(ctx, client.ObjectKey{Namespace: r.Namespace, Name: expectedCMName}, existingCM); err != nil {
291+
if apierrors.IsNotFound(err) {
292+
currentConfigName := mcp.Status.Configuration.Name
293+
currentConfigObj := &mcov1.MachineConfig{}
294+
if err := r.Client.Get(ctx, client.ObjectKey{Name: currentConfigName}, currentConfigObj); err != nil {
295+
klog.ErrorS(err, "cannot find the current machineConfig", "name", currentConfigName)
296+
return nil, stepNormalSkipForPausedupdates("failed to find the current machineConfig %s: %v", currentConfigName, err)
297+
}
298+
299+
// use local version of github.com/openshift/machine-config-operator/pkg/controller/common.ParseAndConvertConfig
300+
_, dataInBytes, err := intkubeletconfig.ParseKubeletConfigRawData(currentConfigObj.Spec.Config.Raw)
301+
if err != nil {
302+
klog.ErrorS(err, "cannot parse the current machineConfig", "name", currentConfigName)
303+
return nil, stepNormalSkipForPausedupdates("failed to parse the current machineConfig %s: %v", currentConfigName, err)
304+
}
305+
306+
decodeKc, err := intkubeletconfig.DecodeKubeletConfigurationFromData(dataInBytes)
307+
if err != nil {
308+
klog.ErrorS(err, "cannot decode the current KubeletConfig data from MachineConfig", "name", currentConfigName)
309+
return nil, stepNormalSkipForPausedupdates("failed to decode the current KubeletConfig data from MachineConfig %s: %v", currentConfigName, err)
310+
}
311+
312+
successStepWithRetry := intreconcile.StepNormalSucess(fmt.Sprintf("Created ConfigMap based on the current machineConfig for paused MCP %s", mcp.Name))
313+
successStepWithRetry.Result = ctrl.Result{Requeue: true, RequeueAfter: MachineConfigPoolPausedRetryPeriod}
314+
return &kubeletConfigHandler{
315+
ownerObject: existingCM,
316+
mcoKc: decodeKc,
317+
poolName: mcp.Name,
318+
setCtrlRef: controllerutil.SetControllerReference,
319+
}, successStepWithRetry
320+
}
321+
}
322+
323+
step := intreconcile.StepNormalSkip(fmt.Errorf("MachineConfigPool of KubeletConfig %s is paused", kcKey.Name))
324+
step.Result = ctrl.Result{Requeue: true, RequeueAfter: MachineConfigPoolPausedRetryPeriod}
325+
return nil, step
279326
}
327+
280328
return &kubeletConfigHandler{
281329
ownerObject: mcoKc,
282330
mcoKc: mcoKc,
283331
poolName: mcp.Name,
284332
setCtrlRef: controllerutil.SetControllerReference,
285-
}, nil
333+
}, intreconcile.StepNormalSucess("")
286334

287335
case platform.HyperShift:
288336
cmKc := &corev1.ConfigMap{}
289337
if err := r.Client.Get(ctx, kcKey, cmKc); err != nil {
290-
return nil, err
338+
return nil, FailedConfigMapUpdateStep(kcKey.Name, err)
291339
}
292340

293341
nodePoolName := cmKc.Labels[HyperShiftNodePoolLabel]
294342
kcData := cmKc.Data[HyperShiftConfigMapConfigKey]
295343
mcoKc, err := kubeletconfig.DecodeFromData([]byte(kcData), r.Scheme)
296344
if err != nil {
297-
return nil, err
345+
return nil, FailedConfigMapUpdateStep(kcKey.Name, err)
298346
}
299347

300348
// nothing we care about, and we can't do much anyway
301349
if mcoKc.Spec.KubeletConfig == nil {
302350
klog.InfoS("detected KubeletConfig with empty payload, ignoring", "name", kcKey.Name)
303-
return nil, &InvalidKubeletConfig{ObjectName: kcKey.Name}
351+
return nil, intreconcile.StepNormalSkip(fmt.Errorf("Invalid KubeletConfig %s", kcKey.Name))
304352
}
305353
return &kubeletConfigHandler{
306354
ownerObject: cmKc,
@@ -312,9 +360,19 @@ func (r *KubeletConfigReconciler) makeKCHandlerForPlatform(ctx context.Context,
312360
setCtrlRef: func(owner, controlled metav1.Object, scheme *runtime.Scheme, opts ...controllerutil.OwnerReferenceOption) error {
313361
return nil
314362
},
315-
}, nil
363+
}, intreconcile.StepNormalSucess("")
316364
}
317-
return nil, fmt.Errorf("unsupported platform: %s", r.Platform)
365+
return nil, FailedConfigMapUpdateStep(kcKey.Name, fmt.Errorf("unsupported platform: %s", r.Platform))
366+
}
367+
368+
func stepNormalSkipForPausedupdates(s string, args ...any) intreconcile.Step {
369+
step := intreconcile.StepNormalSkip(fmt.Errorf(s, args...))
370+
step.Result = ctrl.Result{Requeue: true, RequeueAfter: MachineConfigPoolPausedRetryPeriod}
371+
return step
372+
}
373+
374+
func FailedConfigMapUpdateStep(objName string, err error) intreconcile.Step {
375+
return intreconcile.StepWarning(fmt.Errorf("Failed to update RTE config from kubelet config %s: %v", objName, err))
318376
}
319377

320378
func (r *KubeletConfigReconciler) deleteConfigMap(ctx context.Context, instance *nropv1.NUMAResourcesOperator, kcKey client.ObjectKey) (*corev1.ConfigMap, bool, error) {

0 commit comments

Comments
 (0)