diff --git a/README.md b/README.md
index fca3571..96277de 100644
--- a/README.md
+++ b/README.md
@@ -74,9 +74,9 @@ Slurm is a full featured HPC workload manager. To highlight a few features:
 | Kubernetes | [v1.34](https://kubernetes.io/blog/2025/08/27/kubernetes-v1-34-release/) |
 | Slurm | [25.05](https://www.schedmd.com/slurm-version-25-05-0-is-now-available/) |
 
-## Limitations
+## Slurm Configuration
 
-- Exclusive, whole node allocations are made for each pod.
+See [Slurm Configuration](docs/slurm-configuration.md) for best practices on configuring Slurm for use with slurm-bridge.
 
 ## Installation
 
diff --git a/docs/index.rst b/docs/index.rst
index fabd691..bbafac5 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -106,7 +106,7 @@ Compatibility
 Limitations
 -----------
 
-- Exclusive, whole node allocations are made for each pod.
+- To allow multiple pods per multi-GPU node, the Slurm partition used by slurm-bridge must be configured for node sharing (see the "Bin Pack Multiple Pods on a Node" section of the Slurm Configuration documentation).
 
 Installation
 ------------
diff --git a/docs/slurm-configuration.md b/docs/slurm-configuration.md
new file mode 100644
index 0000000..6d2a4d6
--- /dev/null
+++ b/docs/slurm-configuration.md
@@ -0,0 +1,28 @@
+# Slurm Configuration
+
+## Bin Pack Multiple Pods on a Node
+
+By default, Slurm will reserve a full node for each pod. To allow bin packing, adjust `slurm.conf`:
+
+- **OverSubscribe** — Set to `YES` or `FORCE` on the partition so multiple jobs (pods) can share nodes.
+
+- **SchedulerParameters** — Set:
+  - `bf_busy_nodes` — Backfill scheduler prefers nodes that are already busy, packing jobs onto fewer nodes and leaving others idle for whole-node jobs.
+  - `pack_serial_at_end` — Schedules serial jobs at the end of the backfill window to reduce fragmentation and improve packing.
+
+If using Slinky, both settings can be applied through its `values.yaml`:
+
+```yaml
+controller:
+  extraConf: |
+    SchedulerParameters=pack_serial_at_end,bf_busy_nodes
+nodesets:
+  slinky: # or other nodeset name
+    partition:
+      config: |
+        OverSubscribe=YES
+```
+
+For more details, see:
+- [cons_tres resource sharing](https://slurm.schedmd.com/cons_tres_share.html)
+- [SchedulerParameters](https://slurm.schedmd.com/slurm.conf.html#OPT_SchedulerParameters)
\ No newline at end of file
diff --git a/internal/scheduler/plugins/slurmbridge/slurmbridge.go b/internal/scheduler/plugins/slurmbridge/slurmbridge.go
index 2559ae5..91bdfe5 100644
--- a/internal/scheduler/plugins/slurmbridge/slurmbridge.go
+++ b/internal/scheduler/plugins/slurmbridge/slurmbridge.go
@@ -254,7 +254,7 @@ func (sb *SlurmBridge) PreFilter(ctx context.Context, state fwk.CycleState, pod
 	if err != nil {
 		return nil, fwk.NewStatus(fwk.Error, err.Error())
 	}
-	err = sb.annotatePodsWithNodes(ctx, placeholderJob.JobId, kubeNodes.Clone(), &s.slurmJobIR.Pods)
+	err = sb.annotatePodsWithNodes(ctx, placeholderJob.JobId, kubeNodes, &s.slurmJobIR.Pods)
 	if err != nil {
 		return nil, fwk.NewStatus(fwk.Error, err.Error())
 	}
@@ -266,7 +266,7 @@ func (sb *SlurmBridge) PreFilter(ctx context.Context, state fwk.CycleState, pod
 		// By passing the list of nodes in the placeholder job as PreFilterResult,
 		// Filter plugins will only run for nodes in the Slurm job. This is the final
 		// PreFilter step that must occur before pods are allowed to run.
-		return &framework.PreFilterResult{NodeNames: kubeNodes}, fwk.NewStatus(fwk.Success, "")
+		return &framework.PreFilterResult{NodeNames: sets.New(kubeNodes...)}, fwk.NewStatus(fwk.Success, "")
 	}
 }
 
@@ -314,9 +314,15 @@ func (sb *SlurmBridge) PostFilter(ctx context.Context, state fwk.CycleState, pod
 		}
 	}
 
+	// One job per group (PodGroup, JobSet, LWS): require feasible nodes >= len(pods).
+	// One job per pod (standalone Pod, Job): require at least one feasible node.
+	minNodesRequired := len(s.slurmJobIR.Pods.Items)
+	if !slurmjobir.IsOneJobPerGroupWorkload(s.slurmJobIR) {
+		minNodesRequired = 1
+	}
 	// If this situation occurs, the best we can do is trigger another
 	// scheduling cycle.
-	if len(s.slurmJobIR.JobInfo.Nodes) < len(s.slurmJobIR.Pods.Items) {
+	if len(s.slurmJobIR.JobInfo.Nodes) < minNodesRequired {
 		return nil, fwk.NewStatus(fwk.Success)
 	}
 
@@ -431,14 +437,24 @@ func (sb *SlurmBridge) labelPodsWithJobId(ctx context.Context, jobid int32, slur
 	}
 }
 // annotatePodsWithNodes will annotate a node assignment to pods
-func (sb *SlurmBridge) annotatePodsWithNodes(ctx context.Context, jobid int32, kubeNodes sets.Set[string], pods *corev1.PodList) error {
+func (sb *SlurmBridge) annotatePodsWithNodes(ctx context.Context, jobid int32, kubeNodes []string, pods *corev1.PodList) error {
 	logger := klog.FromContext(ctx)
-	for _, p := range pods.Items {
-		// Return if there are no nodes left
-		if kubeNodes.Len() == 0 {
-			logger.V(5).Info("no nodes left to annotate")
-			break
+	// Sort pods in deterministic order (namespace, name) so that the
+	// i-th node in the Slurm nodelist is assigned to the i-th pod in that order.
+	indices := make([]int, len(pods.Items))
+	for i := range indices {
+		indices[i] = i
+	}
+	slices.SortFunc(indices, func(i, j int) int {
+		a, b := pods.Items[i], pods.Items[j]
+		if a.Namespace != b.Namespace {
+			return strings.Compare(a.Namespace, b.Namespace)
 		}
+		return strings.Compare(a.Name, b.Name)
+	})
+	nodeIndex := 0
+	for _, idx := range indices {
+		p := pods.Items[idx]
 		// If this pod doesn't have a JobId that matches, it should be skipped as
 		// it didn't exist when the placeholder job was created
 		podJobID := slurmjobir.ParseSlurmJobId(p.Labels[wellknown.LabelPlaceholderJobId])
@@ -446,14 +462,15 @@ func (sb *SlurmBridge) annotatePodsWithNodes(ctx context.Context, jobid int32, k
 			logger.V(5).Info("pod JobID does not match placeholder JobID")
 			continue
 		}
+		if nodeIndex >= len(kubeNodes) {
+			logger.V(5).Info("no nodes left to annotate")
+			return ErrorNoKubeNode
+		}
 		if p.Annotations == nil {
 			p.Annotations = make(map[string]string)
 		}
-		node, ok := kubeNodes.PopAny()
-		if !ok {
-			logger.V(4).Info("could not get a node to assign")
-			return ErrorNoKubeNode
-		}
+		node := kubeNodes[nodeIndex]
+		nodeIndex++
 		toUpdate := p.DeepCopy()
 		toUpdate.Annotations[wellknown.AnnotationPlaceholderNode] = node
 		if err := sb.Patch(ctx, toUpdate, client.StrategicMergeFrom(&p)); err != nil {
@@ -465,7 +482,8 @@ func (sb *SlurmBridge) annotatePodsWithNodes(ctx context.Context, jobid int32, k
 }
 
 // slurmToKubeNodes will translate slurm node names to kubernetes node names
-func (sb *SlurmBridge) slurmToKubeNodes(ctx context.Context, slurmNodes []string) (sets.Set[string], error) {
+// preserving order so pod-to-node assignment matches Slurm task layout.
+func (sb *SlurmBridge) slurmToKubeNodes(ctx context.Context, slurmNodes []string) ([]string, error) {
 	logger := klog.FromContext(ctx)
 
 	nodeList := &corev1.NodeList{}
@@ -474,7 +492,7 @@ func (sb *SlurmBridge) slurmToKubeNodes(ctx context.Context, slurmNodes []string
 		return nil, err
 	}
 
-	kubeNodes := make(sets.Set[string])
+	kubeNodes := make([]string, 0, len(slurmNodes))
 	nodeNameMap := nodecontrollerutils.MakeNodeNameMap(ctx, nodeList)
 	for _, slurmNode := range slurmNodes {
 		kubeNode, ok := nodeNameMap[slurmNode]
@@ -492,9 +510,8 @@ func (sb *SlurmBridge) slurmToKubeNodes(ctx context.Context, slurmNodes []string
 			} else {
 				return nil, ErrorNoKubeNodeMatch
 			}
-		}
-		kubeNodes.Insert(kubeNode)
+		kubeNodes = append(kubeNodes, kubeNode)
 	}
 
 	return kubeNodes, nil
diff --git a/internal/scheduler/plugins/slurmbridge/slurmcontrol/slurmcontrol.go b/internal/scheduler/plugins/slurmbridge/slurmcontrol/slurmcontrol.go
index f2508a7..cfa9dea 100644
--- a/internal/scheduler/plugins/slurmbridge/slurmcontrol/slurmcontrol.go
+++ b/internal/scheduler/plugins/slurmbridge/slurmcontrol/slurmcontrol.go
@@ -190,8 +190,8 @@ func (r *realSlurmControl) submitJob(ctx context.Context, pod *corev1.Pod, slurm
 		}(),
 		Qos: slurmJobIR.JobInfo.QOS,
 		Reservation: slurmJobIR.JobInfo.Reservation,
-		// SharedNone is effectively Exclusive
-		Shared: &[]v0044.V0044JobDescMsgShared{v0044.V0044JobDescMsgSharedNone},
+		// SharedUser allows multiple pods to share a node
+		Shared: &[]v0044.V0044JobDescMsgShared{v0044.V0044JobDescMsgSharedUser},
 		TasksPerNode: slurmJobIR.JobInfo.TasksPerNode,
 		TimeLimit: func() *v0044.V0044Uint32NoValStruct {
 			if slurmJobIR.JobInfo.TimeLimit != nil {
diff --git a/internal/utils/slurmjobir/job.go b/internal/utils/slurmjobir/job.go
index 099a5db..3a34e8d 100644
--- a/internal/utils/slurmjobir/job.go
+++ b/internal/utils/slurmjobir/job.go
@@ -25,7 +25,10 @@ func (t *translator) fromJob(pod *corev1.Pod, rootPOM *metav1.PartialObjectMetad
 	slurmJobIR := &SlurmJobIR{}
 	slurmJobIR.Pods.Items = append(slurmJobIR.Pods.Items, *pod)
 
-	slurmJobIR.JobInfo.MinNodes = ptr.To(int32(1))
+	tasks := int32(1)
+	slurmJobIR.JobInfo.MinNodes = ptr.To(tasks)
+	slurmJobIR.JobInfo.MaxNodes = ptr.To(tasks)
+	slurmJobIR.JobInfo.TasksPerNode = ptr.To(tasks)
 	if job.Spec.Template.Spec.Resources != nil {
 		slurmJobIR.JobInfo.CpuPerTask = ptr.To(int32(job.Spec.Template.Spec.Resources.Limits.Cpu().Value())) //nolint:gosec // disable G115
 		slurmJobIR.JobInfo.MemPerNode = ptr.To(int64(GetMemoryFromQuantity(job.Spec.Template.Spec.Resources.Limits.Memory())))
diff --git a/internal/utils/slurmjobir/job_test.go b/internal/utils/slurmjobir/job_test.go
index 90c048d..49cb070 100644
--- a/internal/utils/slurmjobir/job_test.go
+++ b/internal/utils/slurmjobir/job_test.go
@@ -124,9 +124,11 @@ func Test_translator_fromJob(t *testing.T) {
 			},
 			want: &SlurmJobIR{
 				JobInfo: SlurmJobIRJobInfo{
-					MinNodes:   ptr.To(int32(1)),
-					CpuPerTask: ptr.To(int32(22)),
-					MemPerNode: ptr.To(int64(1)),
+					MinNodes:     ptr.To(int32(1)),
+					MaxNodes:     ptr.To(int32(1)),
+					TasksPerNode: ptr.To(int32(1)),
+					CpuPerTask:   ptr.To(int32(22)),
+					MemPerNode:   ptr.To(int64(1)),
 				},
 				Pods: corev1.PodList{
 					Items: []corev1.Pod{{
diff --git a/internal/utils/slurmjobir/jobset_test.go b/internal/utils/slurmjobir/jobset_test.go
index a642f9c..f52e2fe 100644
--- a/internal/utils/slurmjobir/jobset_test.go
+++ b/internal/utils/slurmjobir/jobset_test.go
@@ -127,9 +127,11 @@ func Test_translator_fromJobSet(t *testing.T) {
 			},
 			want: &SlurmJobIR{
 				JobInfo: SlurmJobIRJobInfo{
-					MinNodes:   ptr.To(int32(1)),
-					CpuPerTask: ptr.To(int32(22)),
-					MemPerNode: ptr.To(int64(1)),
+					MinNodes:     ptr.To(int32(1)),
+					MaxNodes:     ptr.To(int32(1)),
+					TasksPerNode: ptr.To(int32(1)),
+					CpuPerTask:   ptr.To(int32(22)),
+					MemPerNode:   ptr.To(int64(1)),
 				},
 				Pods: corev1.PodList{
 					Items: []corev1.Pod{
diff --git a/internal/utils/slurmjobir/pod.go b/internal/utils/slurmjobir/pod.go
index 3d94456..e4d65df 100644
--- a/internal/utils/slurmjobir/pod.go
+++ b/internal/utils/slurmjobir/pod.go
@@ -19,5 +19,6 @@ func (t *translator) fromPod(pod *corev1.Pod) (*SlurmJobIR, error) {
 	tasks := int32(1)
 	slurmJobIR.JobInfo.TasksPerNode = &tasks
 	slurmJobIR.JobInfo.MaxNodes = &tasks
+	slurmJobIR.JobInfo.MinNodes = &tasks
 	return slurmJobIR, nil
 }
diff --git a/internal/utils/slurmjobir/slurmjobir.go b/internal/utils/slurmjobir/slurmjobir.go
index 945cd3b..a979e4e 100644
--- a/internal/utils/slurmjobir/slurmjobir.go
+++ b/internal/utils/slurmjobir/slurmjobir.go
@@ -59,6 +59,21 @@ type translator struct {
 	ctx context.Context
 }
 
+// IsOneJobPerGroupWorkload returns true when the workload uses one Slurm placeholder job per group
+// (PodGroup, JobSet, LWS). When false, the workload uses one job per pod (standalone Pod or Job).
+// Returns false if ir is nil.
+func IsOneJobPerGroupWorkload(ir *SlurmJobIR) bool {
+	if ir == nil {
+		return false
+	}
+	switch ir.RootPOM.TypeMeta {
+	case podGroup_v1alpha1, jobSet_v1alpha2, lws_v1:
+		return true
+	default:
+		return false
+	}
+}
+
 func PreFilter(c client.Client, ctx context.Context, pod *corev1.Pod, slurmJobIR *SlurmJobIR) *fwk.Status {
 	t := translator{Reader: c, ctx: ctx}
 	switch slurmJobIR.RootPOM.TypeMeta {
diff --git a/internal/utils/slurmjobir/slurmjobir_test.go b/internal/utils/slurmjobir/slurmjobir_test.go
index c6aa63d..794a0b0 100644
--- a/internal/utils/slurmjobir/slurmjobir_test.go
+++ b/internal/utils/slurmjobir/slurmjobir_test.go
@@ -110,6 +110,10 @@ func TestTranslateToSlurmJobIR(t *testing.T) {
 			JobInfo: SlurmJobIRJobInfo{
 				Account: ptr.To("test1"),
 				GroupId: ptr.To("1000"),
+				MinNodes: func() *int32 {
+					minNodes := int32(1)
+					return &minNodes
+				}(),
 				MaxNodes: func() *int32 {
 					maxNodes := int32(1)
 					return &maxNodes
 				}(),
@@ -146,14 +150,9 @@
 				Items: []corev1.Pod{*podWithBadAnnotation.DeepCopy()},
 			},
 			JobInfo: SlurmJobIRJobInfo{
-				MaxNodes: func() *int32 {
-					maxNodes := int32(1)
-					return &maxNodes
-				}(),
-				TasksPerNode: func() *int32 {
-					tasksPerNode := int32(1)
-					return &tasksPerNode
-				}(),
+				MinNodes:     ptr.To(int32(1)),
+				MaxNodes:     ptr.To(int32(1)),
+				TasksPerNode: ptr.To(int32(1)),
 			},
 		},
 		wantErr: true,
@@ -501,3 +500,49 @@ func Test_parseAnnotations(t *testing.T) {
 		})
 	}
 }
+
+func TestIsOneJobPerGroupWorkload(t *testing.T) {
+	tests := []struct {
+		name string
+		ir   *SlurmJobIR
+		want bool
+	}{
+		{
+			name: "PodGroup is one job per group",
+			ir:   &SlurmJobIR{RootPOM: metav1.PartialObjectMetadata{TypeMeta: podGroup_v1alpha1}},
+			want: true,
+		},
+		{
+			name: "JobSet is one job per group",
+			ir:   &SlurmJobIR{RootPOM: metav1.PartialObjectMetadata{TypeMeta: jobSet_v1alpha2}},
+			want: true,
+		},
+		{
+			name: "LWS is one job per group",
+			ir:   &SlurmJobIR{RootPOM: metav1.PartialObjectMetadata{TypeMeta: lws_v1}},
+			want: true,
+		},
+		{
+			name: "standalone Pod is one job per pod",
+			ir:   &SlurmJobIR{RootPOM: metav1.PartialObjectMetadata{TypeMeta: pod_v1}},
+			want: false,
+		},
+		{
+			name: "Job is one job per pod",
+			ir:   &SlurmJobIR{RootPOM: metav1.PartialObjectMetadata{TypeMeta: job_v1}},
+			want: false,
+		},
+		{
+			name: "nil IR returns false and does not panic",
+			ir:   nil,
+			want: false,
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := IsOneJobPerGroupWorkload(tt.ir); got != tt.want {
+				t.Errorf("IsOneJobPerGroupWorkload() = %v, want %v", got, tt.want)
+			}
+		})
+	}
+}
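As a standalone illustration (not part of the patch): a minimal sketch of the deterministic pod-to-node pairing that `annotatePodsWithNodes` now performs, assuming the Kubernetes node names arrive already ordered to match the Slurm nodelist. `podRef`, `pairPodsWithNodes`, and the pod/node names below are made up for this sketch and are not identifiers from the slurm-bridge codebase.

```go
package main

import (
	"fmt"
	"slices"
	"strings"
)

// podRef stands in for the (namespace, name) identity of a pending pod.
type podRef struct{ Namespace, Name string }

// pairPodsWithNodes sorts pods by (namespace, name) and assigns the i-th node
// in the Slurm-ordered node list to the i-th pod, mirroring the sort-then-index
// logic the patch adds to annotatePodsWithNodes.
func pairPodsWithNodes(pods []podRef, nodes []string) (map[podRef]string, error) {
	sorted := slices.Clone(pods)
	slices.SortFunc(sorted, func(a, b podRef) int {
		if c := strings.Compare(a.Namespace, b.Namespace); c != 0 {
			return c
		}
		return strings.Compare(a.Name, b.Name)
	})
	assignment := make(map[podRef]string, len(sorted))
	for i, p := range sorted {
		// Analogous to the patch returning ErrorNoKubeNode when the placeholder
		// job has fewer nodes than pods left to place.
		if i >= len(nodes) {
			return nil, fmt.Errorf("no node left for pod %s/%s", p.Namespace, p.Name)
		}
		assignment[p] = nodes[i]
	}
	return assignment, nil
}

func main() {
	pods := []podRef{
		{Namespace: "default", Name: "worker-1"},
		{Namespace: "default", Name: "worker-0"},
	}
	// Kubernetes node names translated from the Slurm nodelist, order preserved.
	nodes := []string{"node-a", "node-b"}
	assignment, err := pairPodsWithNodes(pods, nodes)
	if err != nil {
		panic(err)
	}
	fmt.Println(assignment) // map[{default worker-0}:node-a {default worker-1}:node-b]
}
```

Because the pod order is sorted and the node order follows the Slurm nodelist, repeated scheduling cycles for the same placeholder job produce the same pod-to-node assignment.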