diff --git a/README.md b/README.md index 27a6f93..dcd5a48 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ Slurm is a full featured HPC workload manager. To highlight a few features: ## Limitations -- Exclusive, whole node allocations are made for each pod. +- Exclusive, whole node allocations are made for each pod when using group workloads (PodGroups, LeaderWorkerSet). - Only supports the following DRA drivers: - [DRA Driver CPU][dra-driver-cpu] for CPUs. - [DRA Example Driver][dra-example-driver] for GPUs. diff --git a/docs/scheduler.md b/docs/scheduler.md index 47081ff..0a6d7c9 100644 --- a/docs/scheduler.md +++ b/docs/scheduler.md @@ -95,11 +95,14 @@ see the [annotations.go] source. | Annotation | Description | Example | | -------------------------------------- | --------------------------------- | ------------ | -| slurmjob.slinky.slurm.net/gres | Overrides the default gres. | "GPU:V100:2" | -| slurmjob.slinky.slurm.net/job-name | Sets the job name. | "research" | -| slurmjob.slinky.slurm.net/max-nodes | Sets the maximum number of nodes. | "3" | -| slurmjob.slinky.slurm.net/mem-per-node | Sets the amount of memory. | "8Gi" | -| slurmjob.slinky.slurm.net/partition | Overrides the default partition. | "debug" | +| Annotation | Description | Example | +| -------------------------------------- | -------------------------------------------------------------------- | ------------ | +| slurmjob.slinky.slurm.net/gres | Overrides the default gres. | "GPU:V100:2" | +| slurmjob.slinky.slurm.net/job-name | Sets the job name. | "research" | +| slurmjob.slinky.slurm.net/max-nodes | Sets the maximum number of nodes. | "3" | +| slurmjob.slinky.slurm.net/mem-per-node | Sets the amount of memory. | "8Gi" | +| slurmjob.slinky.slurm.net/partition | Overrides the default partition. | "debug" | +| slurmjob.slinky.slurm.net/shared | Sets the shared policy. Allowed: "none", "user". Only supported on single-pod workloads. | "user" | An example of the annotations in use: diff --git a/internal/admission/admission.go b/internal/admission/admission.go index 528d255..7c6d448 100644 --- a/internal/admission/admission.go +++ b/internal/admission/admission.go @@ -18,6 +18,8 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/webhook/admission" + lwsv1 "sigs.k8s.io/lws/api/leaderworkerset/v1" + sched "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1" ) type PodAdmission struct { @@ -88,6 +90,9 @@ func (r *PodAdmission) ValidateCreate(ctx context.Context, pod *corev1.Pod) (adm if pod.Spec.ResourceClaims != nil { return nil, fmt.Errorf("can't schedule a pod with a resourceclaim, use the annotation %s to request devices instead", wellknown.AnnotationGres) } + if err := validateSharedAnnotation(pod); err != nil { + return nil, err + } return nil, nil } @@ -113,6 +118,16 @@ func (r *PodAdmission) ValidateUpdate(ctx context.Context, oldPod *corev1.Pod, n return nil, fmt.Errorf("can't update a running pod's external node annotation") } } + // Once the Slurm external job is running, the shared annotation should not be modified. + if newPod.Labels[wellknown.LabelExternalJobId] != "" && + newPod.Annotations[wellknown.AnnotationExternalJobNode] != "" { + if oldPod.Annotations[wellknown.AnnotationShared] != newPod.Annotations[wellknown.AnnotationShared] { + return nil, fmt.Errorf("can't change shared annotation when the Slurm external job is already running") + } + } + if err := validateSharedAnnotation(newPod); err != nil { + return nil, err + } return nil, nil } @@ -121,6 +136,25 @@ func (r *PodAdmission) ValidateDelete(ctx context.Context, pod *corev1.Pod) (adm return nil, nil } +// validateSharedAnnotation validates the shared annotation value and rejects +// group workloads (PodGroup, LeaderWorkerSet). +func validateSharedAnnotation(pod *corev1.Pod) error { + value, ok := pod.Annotations[wellknown.AnnotationShared] + if !ok { + return nil + } + if err := wellknown.ValidateSharedValue(value); err != nil { + return err + } + if pod.Labels[sched.PodGroupLabel] != "" { + return fmt.Errorf("shared annotation is not allowed on PodGroup pods") + } + if pod.Labels[lwsv1.GroupUniqueHashLabelKey] != "" { + return fmt.Errorf("shared annotation is not allowed on LeaderWorkerSet pods") + } + return nil +} + func (r *PodAdmission) isManagedNamespace(ctx context.Context, namespace string) (bool, error) { if r.ManagedNamespaceSelector != nil { selector, err := metav1.LabelSelectorAsSelector(r.ManagedNamespaceSelector) diff --git a/internal/admission/admission_test.go b/internal/admission/admission_test.go index e55d53a..1c9b938 100644 --- a/internal/admission/admission_test.go +++ b/internal/admission/admission_test.go @@ -17,6 +17,8 @@ import ( ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client/fake" "sigs.k8s.io/controller-runtime/pkg/webhook/admission" + lwsv1 "sigs.k8s.io/lws/api/leaderworkerset/v1" + sched "sigs.k8s.io/scheduler-plugins/apis/scheduling/v1alpha1" ) const ( @@ -434,6 +436,88 @@ func TestPodAdmission_ValidateCreate(t *testing.T) { want: nil, wantErr: false, }, + { + name: "PodWithSharedUser", + fields: fields{ + ManagedNamespaces: []string{namespace}, + }, + args: args{ + ctx: context.TODO(), + pod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Annotations: map[string]string{ + wellknown.AnnotationShared: "user", + }, + }, + }, + }, + want: nil, + wantErr: false, + }, + { + name: "PodWithSharedInvalid", + fields: fields{ + ManagedNamespaces: []string{namespace}, + }, + args: args{ + ctx: context.TODO(), + pod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Annotations: map[string]string{ + wellknown.AnnotationShared: "invalid", + }, + }, + }, + }, + want: nil, + wantErr: true, + }, + { + name: "PodWithSharedAndPodGroupLabel", + fields: fields{ + ManagedNamespaces: []string{namespace}, + }, + args: args{ + ctx: context.TODO(), + pod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Annotations: map[string]string{ + wellknown.AnnotationShared: "user", + }, + Labels: map[string]string{ + sched.PodGroupLabel: "pg", + }, + }, + }, + }, + want: nil, + wantErr: true, + }, + { + name: "PodWithSharedAndLWSLabel", + fields: fields{ + ManagedNamespaces: []string{namespace}, + }, + args: args{ + ctx: context.TODO(), + pod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Annotations: map[string]string{ + wellknown.AnnotationShared: "user", + }, + Labels: map[string]string{ + lwsv1.GroupUniqueHashLabelKey: "lws", + }, + }, + }, + }, + want: nil, + wantErr: true, + }, { name: "PodWithSchedulerNameInUnmanagedNamespace", fields: fields{ @@ -669,6 +753,177 @@ func TestPodAdmission_ValidateUpdate(t *testing.T) { want: nil, wantErr: true, }, + { + name: "UpdatePodWithSharedUser", + fields: fields{ + ManagedNamespaces: []string{namespace}, + }, + args: args{ + ctx: context.TODO(), + oldPod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + }, + }, + newPod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Annotations: map[string]string{ + wellknown.AnnotationShared: "user", + }, + }, + }, + }, + want: nil, + wantErr: false, + }, + { + name: "UpdatePodWithSharedInvalid", + fields: fields{ + ManagedNamespaces: []string{namespace}, + }, + args: args{ + ctx: context.TODO(), + oldPod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + }, + }, + newPod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Annotations: map[string]string{ + wellknown.AnnotationShared: "invalid", + }, + }, + }, + }, + want: nil, + wantErr: true, + }, + { + name: "UpdatePodWithSharedAndPodGroupLabel", + fields: fields{ + ManagedNamespaces: []string{namespace}, + }, + args: args{ + ctx: context.TODO(), + oldPod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + }, + }, + newPod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Annotations: map[string]string{ + wellknown.AnnotationShared: "user", + }, + Labels: map[string]string{ + sched.PodGroupLabel: "pg", + }, + }, + }, + }, + want: nil, + wantErr: true, + }, + { + name: "UpdatePodWithSharedAndLWSLabel", + fields: fields{ + ManagedNamespaces: []string{namespace}, + }, + args: args{ + ctx: context.TODO(), + oldPod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + }, + }, + newPod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Annotations: map[string]string{ + wellknown.AnnotationShared: "user", + }, + Labels: map[string]string{ + lwsv1.GroupUniqueHashLabelKey: "lws", + }, + }, + }, + }, + want: nil, + wantErr: true, + }, + { + name: "AddSharedAnnotationWhenPlaceholderJobRunning", + fields: fields{ + ManagedNamespaces: []string{namespace}, + }, + args: args{ + ctx: context.TODO(), + oldPod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Labels: map[string]string{ + wellknown.LabelExternalJobId: "1", + }, + Annotations: map[string]string{ + wellknown.AnnotationExternalJobNode: "node1", + }, + }, + }, + newPod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Labels: map[string]string{ + wellknown.LabelExternalJobId: "1", + }, + Annotations: map[string]string{ + wellknown.AnnotationExternalJobNode: "node1", + wellknown.AnnotationShared: "user", + }, + }, + }, + }, + want: nil, + wantErr: true, + }, + { + name: "ChangeSharedAnnotationValueWhenPlaceholderJobRunning", + fields: fields{ + ManagedNamespaces: []string{namespace}, + }, + args: args{ + ctx: context.TODO(), + oldPod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Labels: map[string]string{ + wellknown.LabelExternalJobId: "1", + }, + Annotations: map[string]string{ + wellknown.AnnotationExternalJobNode: "node1", + wellknown.AnnotationShared: "user", + }, + }, + }, + newPod: &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: namespace, + Labels: map[string]string{ + wellknown.LabelExternalJobId: "1", + }, + Annotations: map[string]string{ + wellknown.AnnotationExternalJobNode: "node1", + wellknown.AnnotationShared: "none", + }, + }, + }, + }, + want: nil, + wantErr: true, + }, { name: "RunningPodWithSchedulerNameCantChangeJobIDInUnmanagedNamespace", fields: fields{ diff --git a/internal/scheduler/plugins/slurmbridge/slurmcontrol/slurmcontrol.go b/internal/scheduler/plugins/slurmbridge/slurmcontrol/slurmcontrol.go index fb3f967..35074b8 100644 --- a/internal/scheduler/plugins/slurmbridge/slurmcontrol/slurmcontrol.go +++ b/internal/scheduler/plugins/slurmbridge/slurmcontrol/slurmcontrol.go @@ -61,15 +61,20 @@ type GresLayout struct { Type string } -func sharedFromExclusiveAnnotation(slurmJobIR *slurmjobir.SlurmJobIR) *[]api.V0044JobDescMsgShared { - exclusive := true - if slurmJobIR != nil && slurmJobIR.JobInfo.Exclusive != nil { - exclusive = *slurmJobIR.JobInfo.Exclusive - } - if exclusive { +// sharedForJob returns the shared policy for the job based on the shared annotation. +// Defaults to SharedNone (exclusive) when not set or for group workloads. +func sharedForJob(slurmJobIR *slurmjobir.SlurmJobIR) *[]api.V0044JobDescMsgShared { + if len(slurmJobIR.Pods.Items) != 1 || slurmJobIR.JobInfo.Shared == nil { return &[]api.V0044JobDescMsgShared{api.V0044JobDescMsgSharedNone} } - return &[]api.V0044JobDescMsgShared{} + shared, ok := map[string]api.V0044JobDescMsgShared{ + "none": api.V0044JobDescMsgSharedNone, + "user": api.V0044JobDescMsgSharedUser, + }[*slurmJobIR.JobInfo.Shared] + if !ok { + shared = api.V0044JobDescMsgSharedNone + } + return &[]api.V0044JobDescMsgShared{shared} } // DeleteSlurmJob will delete an external job @@ -201,7 +206,7 @@ func (r *realSlurmControl) submitJob(ctx context.Context, pod *corev1.Pod, slurm }(), Qos: slurmJobIR.JobInfo.QOS, Reservation: slurmJobIR.JobInfo.Reservation, - Shared: sharedFromExclusiveAnnotation(slurmJobIR), + Shared: sharedForJob(slurmJobIR), TasksPerNode: slurmJobIR.JobInfo.TasksPerNode, TimeLimit: func() *api.V0044Uint32NoValStruct { if slurmJobIR.JobInfo.TimeLimit != nil { diff --git a/internal/scheduler/plugins/slurmbridge/slurmcontrol/slurmcontrol_test.go b/internal/scheduler/plugins/slurmbridge/slurmcontrol/slurmcontrol_test.go index 780d66a..0c31f9c 100644 --- a/internal/scheduler/plugins/slurmbridge/slurmcontrol/slurmcontrol_test.go +++ b/internal/scheduler/plugins/slurmbridge/slurmcontrol/slurmcontrol_test.go @@ -27,48 +27,45 @@ import ( "k8s.io/utils/ptr" ) -func Test_sharedFromExclusiveAnnotation(t *testing.T) { +func Test_sharedForJob(t *testing.T) { + onePod := corev1.PodList{Items: []corev1.Pod{{}}} tests := []struct { - name string - slurmJobIR *slurmjobir.SlurmJobIR - wantExclusive bool + name string + slurmJobIR *slurmjobir.SlurmJobIR + wantShared api.V0044JobDescMsgShared }{ { - name: "nil slurmJobIR defaults to exclusive", - slurmJobIR: nil, - wantExclusive: true, + name: "no Shared annotation defaults to SharedNone", + slurmJobIR: &slurmjobir.SlurmJobIR{Pods: onePod}, + wantShared: api.V0044JobDescMsgSharedNone, }, { - name: "slurmJobIR with Exclusive nil defaults to exclusive", - slurmJobIR: &slurmjobir.SlurmJobIR{}, - wantExclusive: true, + name: "Shared=none yields SharedNone", + slurmJobIR: &slurmjobir.SlurmJobIR{Pods: onePod, JobInfo: slurmjobir.SlurmJobIRJobInfo{Shared: ptr.To("none")}}, + wantShared: api.V0044JobDescMsgSharedNone, }, { - name: "slurmJobIR.Exclusive true", - slurmJobIR: &slurmjobir.SlurmJobIR{JobInfo: slurmjobir.SlurmJobIRJobInfo{Exclusive: ptr.To(true)}}, - wantExclusive: true, + name: "Shared=user yields SharedUser", + slurmJobIR: &slurmjobir.SlurmJobIR{Pods: onePod, JobInfo: slurmjobir.SlurmJobIRJobInfo{Shared: ptr.To("user")}}, + wantShared: api.V0044JobDescMsgSharedUser, }, { - name: "slurmJobIR.Exclusive false yields non-exclusive (empty Shared)", - slurmJobIR: &slurmjobir.SlurmJobIR{JobInfo: slurmjobir.SlurmJobIRJobInfo{Exclusive: ptr.To(false)}}, - wantExclusive: false, + name: "group workload (multiple pods) always yields SharedNone", + slurmJobIR: &slurmjobir.SlurmJobIR{ + Pods: corev1.PodList{Items: []corev1.Pod{{}, {}}}, + JobInfo: slurmjobir.SlurmJobIRJobInfo{Shared: ptr.To("user")}, + }, + wantShared: api.V0044JobDescMsgSharedNone, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - got := sharedFromExclusiveAnnotation(tt.slurmJobIR) - if got == nil { - t.Fatalf("sharedFromExclusiveAnnotation() = nil") + got := sharedForJob(tt.slurmJobIR) + if got == nil || len(*got) != 1 { + t.Fatalf("sharedForJob() = %v, want single element", got) } - if tt.wantExclusive { - if len(*got) != 1 { - t.Fatalf("sharedFromExclusiveAnnotation() = %v, want single element (exclusive)", got) - } - if (*got)[0] != api.V0044JobDescMsgSharedNone { - t.Errorf("sharedFromExclusiveAnnotation() Shared = %v, want SharedNone", (*got)[0]) - } - } else if len(*got) != 0 { - t.Errorf("sharedFromExclusiveAnnotation() = %v, want empty (non-exclusive)", got) + if (*got)[0] != tt.wantShared { + t.Errorf("sharedForJob() = %v, want %v", (*got)[0], tt.wantShared) } }) } diff --git a/internal/utils/slurmjobir/slurmjobir.go b/internal/utils/slurmjobir/slurmjobir.go index b07c733..bd7f2ae 100644 --- a/internal/utils/slurmjobir/slurmjobir.go +++ b/internal/utils/slurmjobir/slurmjobir.go @@ -42,6 +42,7 @@ type SlurmJobIRJobInfo struct { Partition *string QOS *string Reservation *string + Shared *string // one of wellknown.SharedAllowedValues TasksPerNode *int32 TimeLimit *int32 UserId *string @@ -229,6 +230,11 @@ func parseAnnotations(slurmJobIR *SlurmJobIR, anno map[string]string) error { slurmJobIR.JobInfo.QOS = &value case wellknown.AnnotationReservation: slurmJobIR.JobInfo.Reservation = &value + case wellknown.AnnotationShared: + if err := wellknown.ValidateSharedValue(value); err != nil { + return err + } + slurmJobIR.JobInfo.Shared = &value case wellknown.AnnotationTimeLimit: num, err := ConvStrTo32(value) if err != nil { diff --git a/internal/utils/slurmjobir/slurmjobir_test.go b/internal/utils/slurmjobir/slurmjobir_test.go index c894f2d..e43e64a 100644 --- a/internal/utils/slurmjobir/slurmjobir_test.go +++ b/internal/utils/slurmjobir/slurmjobir_test.go @@ -487,6 +487,36 @@ func Test_parseAnnotations(t *testing.T) { }, wantErr: true, }, + { + name: "SharedSetToUser", + args: args{ + slurmJobIR: &SlurmJobIR{}, + anno: map[string]string{ + wellknown.AnnotationShared: "user", + }, + }, + wantErr: false, + wantRes: SlurmJobIR{ + JobInfo: SlurmJobIRJobInfo{ + Shared: ptr.To("user"), + }, + }, + }, + { + name: "SharedSetToNone", + args: args{ + slurmJobIR: &SlurmJobIR{}, + anno: map[string]string{ + wellknown.AnnotationShared: "none", + }, + }, + wantErr: false, + wantRes: SlurmJobIR{ + JobInfo: SlurmJobIRJobInfo{ + Shared: ptr.To("none"), + }, + }, + }, { name: "Exclusive annotation false", args: args{ diff --git a/internal/wellknown/annotations.go b/internal/wellknown/annotations.go index cb5afc2..0bfd7be 100644 --- a/internal/wellknown/annotations.go +++ b/internal/wellknown/annotations.go @@ -3,6 +3,11 @@ package wellknown +import ( + "fmt" + "slices" +) + const ( // AnnotationExternalJobNode indicates the Node which corresponds to the // the pod's external job. @@ -69,4 +74,19 @@ const ( // AnnotationWckey sets the Wckey // for the Slurm external job. AnnotationWckey = SlurmJobPrefix + "wckey" + // AnnotationShared sets the shared policy + // for the Slurm external job. + AnnotationShared = SlurmJobPrefix + "shared" ) + +// SharedAllowedValues are the allowed values for the shared annotation +// (V0044JobDescMsgShared in slurm-client). +var SharedAllowedValues = []string{"none", "user"} + +// ValidateSharedValue returns an error if v is not one of SharedAllowedValues. +func ValidateSharedValue(v string) error { + if slices.Contains(SharedAllowedValues, v) { + return nil + } + return fmt.Errorf("shared annotation value must be one of: none, user") +}