Skip to content

Commit 9460ffe

Browse files
committed
fix: do not process any kata manager related information, consider it enabled=false
Signed-off-by: Rajat Chopra <rajatc@nvidia.com>
1 parent 7d27a74 commit 9460ffe

2 files changed

Lines changed: 2 additions & 183 deletions

File tree

controllers/object_controls.go

Lines changed: 2 additions & 182 deletions
Original file line numberDiff line numberDiff line change
@@ -714,7 +714,6 @@ func preProcessDaemonSet(obj *appsv1.DaemonSet, n ClusterPolicyController) error
714714
"nvidia-mig-manager": TransformMIGManager,
715715
"nvidia-operator-validator": TransformValidator,
716716
"nvidia-sandbox-validator": TransformSandboxValidator,
717-
"nvidia-kata-manager": TransformKataManager,
718717
"nvidia-cc-manager": TransformCCManager,
719718
}
720719

@@ -2023,83 +2022,6 @@ func TransformMIGManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec,
20232022
return nil
20242023
}
20252024

2026-
// TransformKataManager transforms Kata Manager daemonset with required config as per ClusterPolicy
2027-
func TransformKataManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
2028-
// update image
2029-
image, err := gpuv1.ImagePath(&config.KataManager)
2030-
if err != nil {
2031-
return err
2032-
}
2033-
obj.Spec.Template.Spec.Containers[0].Image = image
2034-
2035-
// update image pull policy
2036-
obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = gpuv1.ImagePullPolicy(config.KataManager.ImagePullPolicy)
2037-
2038-
// set image pull secrets
2039-
if len(config.KataManager.ImagePullSecrets) > 0 {
2040-
addPullSecrets(&obj.Spec.Template.Spec, config.KataManager.ImagePullSecrets)
2041-
}
2042-
2043-
// set resource limits
2044-
if config.KataManager.Resources != nil {
2045-
// apply resource limits to all containers
2046-
for i := range obj.Spec.Template.Spec.Containers {
2047-
obj.Spec.Template.Spec.Containers[i].Resources.Requests = config.KataManager.Resources.Requests
2048-
obj.Spec.Template.Spec.Containers[i].Resources.Limits = config.KataManager.Resources.Limits
2049-
}
2050-
}
2051-
2052-
// set arguments if specified for kata-manager container
2053-
if len(config.KataManager.Args) > 0 {
2054-
obj.Spec.Template.Spec.Containers[0].Args = config.KataManager.Args
2055-
}
2056-
2057-
// mount artifactsDir
2058-
artifactsDir := DefaultKataArtifactsDir
2059-
if config.KataManager.Config.ArtifactsDir != "" {
2060-
artifactsDir = config.KataManager.Config.ArtifactsDir
2061-
}
2062-
2063-
// set env used by readinessProbe to determine path to kata-manager pid file.
2064-
setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), "KATA_ARTIFACTS_DIR", artifactsDir)
2065-
2066-
artifactsVolMount := corev1.VolumeMount{Name: "kata-artifacts", MountPath: artifactsDir}
2067-
obj.Spec.Template.Spec.Containers[0].VolumeMounts = append(obj.Spec.Template.Spec.Containers[0].VolumeMounts, artifactsVolMount)
2068-
2069-
artifactsVol := corev1.Volume{Name: "kata-artifacts", VolumeSource: corev1.VolumeSource{HostPath: &corev1.HostPathVolumeSource{Path: artifactsDir, Type: ptr.To(corev1.HostPathDirectoryOrCreate)}}}
2070-
obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, artifactsVol)
2071-
2072-
// Compute hash of kata manager config and add an annotation with the value.
2073-
// If the kata config changes, a new revision of the daemonset will be
2074-
// created and thus the kata-manager pods will restart with the updated config.
2075-
hash := utils.GetObjectHash(config.KataManager.Config)
2076-
2077-
if obj.Spec.Template.Annotations == nil {
2078-
obj.Spec.Template.Annotations = make(map[string]string)
2079-
}
2080-
obj.Spec.Template.Annotations[KataManagerAnnotationHashKey] = hash
2081-
2082-
if len(config.KataManager.Env) > 0 {
2083-
for _, env := range config.KataManager.Env {
2084-
setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), env.Name, env.Value)
2085-
}
2086-
}
2087-
2088-
// mount containerd config and socket
2089-
// setup mounts for runtime config file
2090-
runtime := n.runtime.String()
2091-
// kata manager is the only container in this daemonset
2092-
err = transformForRuntime(obj, config, runtime, &obj.Spec.Template.Spec.Containers[0])
2093-
if err != nil {
2094-
return fmt.Errorf("error transforming kata-manager daemonset : %w", err)
2095-
}
2096-
2097-
// set hostNetwork for kata-manager if specified
2098-
applyHostNetworkConfig(&obj.Spec.Template.Spec, config.KataManager.HostNetwork)
2099-
2100-
return nil
2101-
}
2102-
21032025
// TransformVFIOManager transforms VFIO-PCI Manager daemonset with required config as per ClusterPolicy
21042026
func TransformVFIOManager(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error {
21052027
// update k8s-driver-manager initContainer
@@ -5156,115 +5078,13 @@ func transformRuntimeClass(n ClusterPolicyController, spec nodev1.RuntimeClass)
51565078
return gpuv1.Ready, nil
51575079
}
51585080

5159-
func transformKataRuntimeClasses(n ClusterPolicyController) (gpuv1.State, error) {
5160-
ctx := n.ctx
5161-
state := n.idx
5162-
config := n.singleton.Spec
5163-
5164-
// Get all existing Kata RuntimeClasses
5165-
opts := []client.ListOption{&client.MatchingLabels{"nvidia.com/kata-runtime-class": "true"}}
5166-
list := &nodev1.RuntimeClassList{}
5167-
err := n.client.List(ctx, list, opts...)
5168-
if err != nil {
5169-
n.logger.Info("Could not get Kata RuntimeClassList", err)
5170-
return gpuv1.NotReady, fmt.Errorf("error getting kata RuntimeClassList: %v", err)
5171-
}
5172-
n.logger.V(1).Info("Kata RuntimeClasses", "Number", len(list.Items))
5173-
5174-
if !config.KataManager.IsEnabled() {
5175-
// Delete all Kata RuntimeClasses
5176-
n.logger.Info("Kata Manager disabled, deleting all Kata RuntimeClasses")
5177-
for _, rc := range list.Items {
5178-
rc := rc
5179-
n.logger.V(1).Info("Deleting Kata RuntimeClass", "Name", rc.Name)
5180-
err := n.client.Delete(ctx, &rc)
5181-
if err != nil {
5182-
return gpuv1.NotReady, fmt.Errorf("error deleting kata RuntimeClass '%s': %v", rc.Name, err)
5183-
}
5184-
}
5185-
return gpuv1.Ready, nil
5186-
}
5187-
5188-
// Get names of desired kata RuntimeClasses
5189-
rcNames := make(map[string]struct{})
5190-
for _, rc := range config.KataManager.Config.RuntimeClasses {
5191-
rcNames[rc.Name] = struct{}{}
5192-
}
5193-
5194-
// Delete any existing Kata RuntimeClasses that are no longer specified in KataManager configuration
5195-
for _, rc := range list.Items {
5196-
if _, ok := rcNames[rc.Name]; !ok {
5197-
rc := rc
5198-
n.logger.Info("Deleting Kata RuntimeClass", "Name", rc.Name)
5199-
err := n.client.Delete(ctx, &rc)
5200-
if err != nil {
5201-
return gpuv1.NotReady, fmt.Errorf("error deleting kata RuntimeClass '%s': %v", rc.Name, err)
5202-
}
5203-
}
5204-
}
5205-
5206-
// Using kata RuntimeClass template, create / update RuntimeClass objects specified in KataManager configuration
5207-
template := n.resources[state].RuntimeClasses[0]
5208-
for _, rc := range config.KataManager.Config.RuntimeClasses {
5209-
logger := n.logger.WithValues("RuntimeClass", rc.Name)
5210-
5211-
if rc.Name == config.Operator.RuntimeClass {
5212-
return gpuv1.NotReady, fmt.Errorf("error creating kata runtimeclass '%s' as it conflicts with the runtimeclass used for the gpu-operator operand pods itself", rc.Name)
5213-
}
5214-
5215-
obj := nodev1.RuntimeClass{}
5216-
obj.Name = rc.Name
5217-
obj.Handler = rc.Name
5218-
obj.Labels = template.Labels
5219-
obj.Scheduling = &nodev1.Scheduling{}
5220-
nodeSelector := make(map[string]string)
5221-
for k, v := range template.Scheduling.NodeSelector {
5222-
nodeSelector[k] = v
5223-
}
5224-
if rc.NodeSelector != nil {
5225-
// append user provided selectors to default nodeSelector
5226-
for k, v := range rc.NodeSelector {
5227-
nodeSelector[k] = v
5228-
}
5229-
}
5230-
obj.Scheduling.NodeSelector = nodeSelector
5231-
5232-
if err := controllerutil.SetControllerReference(n.singleton, &obj, n.scheme); err != nil {
5233-
return gpuv1.NotReady, err
5234-
}
5235-
5236-
found := &nodev1.RuntimeClass{}
5237-
err := n.client.Get(ctx, types.NamespacedName{Namespace: "", Name: obj.Name}, found)
5238-
if err != nil && apierrors.IsNotFound(err) {
5239-
logger.Info("Not found, creating...")
5240-
err = n.client.Create(ctx, &obj)
5241-
if err != nil {
5242-
logger.Info("Couldn't create", "Error", err)
5243-
return gpuv1.NotReady, err
5244-
}
5245-
continue
5246-
} else if err != nil {
5247-
return gpuv1.NotReady, err
5248-
}
5249-
5250-
logger.Info("Found Resource, updating...")
5251-
obj.ResourceVersion = found.ResourceVersion
5252-
5253-
err = n.client.Update(ctx, &obj)
5254-
if err != nil {
5255-
logger.Info("Couldn't update", "Error", err)
5256-
return gpuv1.NotReady, err
5257-
}
5258-
}
5259-
return gpuv1.Ready, nil
5260-
}
5261-
52625081
func RuntimeClasses(n ClusterPolicyController) (gpuv1.State, error) {
52635082
status := gpuv1.Ready
52645083
state := n.idx
52655084

52665085
if n.stateNames[state] == "state-kata-manager" {
5267-
return transformKataRuntimeClasses(n)
5086+
// Kata Manager is deprecated, no need to process anything
5087+
return gpuv1.Ready, nil
52685088
}
52695089

52705090
nvidiaRuntimeClasses := n.resources[state].RuntimeClasses

controllers/state_manager.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -905,7 +905,6 @@ func (n *ClusterPolicyController) init(ctx context.Context, reconciler *ClusterP
905905
addState(n, "/opt/gpu-operator/state-vfio-manager")
906906
addState(n, "/opt/gpu-operator/state-sandbox-device-plugin")
907907
addState(n, "/opt/gpu-operator/state-kata-device-plugin")
908-
addState(n, "/opt/gpu-operator/state-kata-manager")
909908
addState(n, "/opt/gpu-operator/state-cc-manager")
910909
}
911910

0 commit comments

Comments
 (0)