From 3e4fdbcdc8024cffd38792ef48726b70b800e122 Mon Sep 17 00:00:00 2001 From: Charlie Getzen Date: Thu, 29 Jan 2026 15:30:41 -0800 Subject: [PATCH 1/8] feat: multiple pods per node --- README.md | 2 +- docs/index.rst | 2 +- docs/scheduler.md | 22 +++++ .../plugins/slurmbridge/slurmbridge.go | 66 ++++++++------- .../slurmbridge/slurmcontrol/slurmcontrol.go | 4 +- internal/utils/slurmjobir/slurmjobir.go | 40 +++++++++ internal/utils/slurmjobir/slurmjobir_test.go | 83 +++++++++++++++++++ 7 files changed, 186 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index fca3571..fb8397f 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ Slurm is a full featured HPC workload manager. To highlight a few features: ## Limitations -- Exclusive, whole node allocations are made for each pod. +- To allow multiple pods per multi-GPU node, the Slurm partition used by slurm-bridge must be configured for node sharing (see [Slurm configuration](docs/scheduler.md#slurm-configuration) in the Scheduler documentation). ## Installation diff --git a/docs/index.rst b/docs/index.rst index fabd691..a8603e6 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -106,7 +106,7 @@ Compatibility Limitations ----------- -- Exclusive, whole node allocations are made for each pod. +- To allow multiple pods per multi-GPU node, the Slurm partition used by slurm-bridge must be configured for node sharing (see the Scheduler documentation, "Slurm configuration" section). Installation ------------ diff --git a/docs/scheduler.md b/docs/scheduler.md index b1db806..1a473e8 100644 --- a/docs/scheduler.md +++ b/docs/scheduler.md @@ -37,6 +37,28 @@ represented Pods. This scheduler defers scheduling decisions to Slurm, hence certain assumptions about the environment must be met for this to function correctly. +### Slurm configuration + +To allow multiple pods to share a multi-GPU node (e.g. one pod per GPU on an +8-GPU node), the following Slurm configuration is required: + +1. **Partition**: The partition used by slurm-bridge must allow multiple jobs (or + tasks) per node. In `slurm.conf`, set **`OverSubscribe=YES`** or + **`OverSubscribe=FORCE`** on the partition. With `OverSubscribe=YES`, jobs + that request sharing (see below) can be packed on the same node. See + [Slurm partition configuration](https://slurm.schedmd.com/slurm.conf.html). + +2. **Job-level**: Placeholder jobs are submitted with **Shared=User** so that + Slurm may schedule them on nodes that allow sharing. No additional Slurm + configuration is required beyond the partition setting above. + +3. **GRES**: Slurm tracks GRES per allocation; each placeholder job requests + that pod's GPU count (e.g. 1 GPU). No change to existing GRES configuration + is required. + +For more detail on resource sharing in Slurm, see +[cons_tres resource sharing](https://slurm.schedmd.com/cons_tres_share.html). + ### Sequence Diagram ```mermaid diff --git a/internal/scheduler/plugins/slurmbridge/slurmbridge.go b/internal/scheduler/plugins/slurmbridge/slurmbridge.go index 2559ae5..9683ba8 100644 --- a/internal/scheduler/plugins/slurmbridge/slurmbridge.go +++ b/internal/scheduler/plugins/slurmbridge/slurmbridge.go @@ -248,13 +248,15 @@ func (sb *SlurmBridge) PreFilter(ctx context.Context, state fwk.CycleState, pod logger.V(4).Info("placeholder job exists but no nodes have been allocated") return nil, fwk.NewStatus(fwk.Pending, ErrorNoNodesAssigned.Error()) } - // The placeholder job is running. Assign nodes to pods. + // The placeholder job is running. Assign nodes to pods in nodelist order + // so pod-to-node mapping matches Slurm task layout (important when multiple + // pods share a node via Shared=User). slurmNodes, _ := hostlist.Expand(placeholderJob.Nodes) - kubeNodes, err := sb.slurmToKubeNodes(ctx, slurmNodes) + kubeNodesOrdered, err := sb.slurmToKubeNodesOrdered(ctx, slurmNodes) if err != nil { return nil, fwk.NewStatus(fwk.Error, err.Error()) } - err = sb.annotatePodsWithNodes(ctx, placeholderJob.JobId, kubeNodes.Clone(), &s.slurmJobIR.Pods) + err = sb.annotatePodsWithNodes(ctx, placeholderJob.JobId, kubeNodesOrdered, &s.slurmJobIR.Pods) if err != nil { return nil, fwk.NewStatus(fwk.Error, err.Error()) } @@ -266,7 +268,7 @@ func (sb *SlurmBridge) PreFilter(ctx context.Context, state fwk.CycleState, pod // By passing the list of nodes in the placeholder job as PreFilterResult, // Filter plugins will only run for nodes in the Slurm job. This is the final // PreFilter step that must occur before pods are allowed to run. - return &framework.PreFilterResult{NodeNames: kubeNodes}, fwk.NewStatus(fwk.Success, "") + return &framework.PreFilterResult{NodeNames: sets.New(kubeNodesOrdered...)}, fwk.NewStatus(fwk.Success, "") } } @@ -314,16 +316,24 @@ func (sb *SlurmBridge) PostFilter(ctx context.Context, state fwk.CycleState, pod } } - // If this situation occurs, the best we can do is trigger another - // scheduling cycle. - if len(s.slurmJobIR.JobInfo.Nodes) < len(s.slurmJobIR.Pods.Items) { + // One job per group (PodGroup, JobSet, LWS): require feasible nodes >= len(pods). + // One job per pod (standalone Pod, Job): require at least one feasible node. + submitIR := s.slurmJobIR + minNodesRequired := len(s.slurmJobIR.Pods.Items) + if !slurmjobir.IsOneJobPerGroupWorkload(s.slurmJobIR) { + singlePodIR := slurmjobir.BuildSinglePodIR(s.slurmJobIR, pod) + singlePodIR.JobInfo.Nodes = s.slurmJobIR.JobInfo.Nodes + submitIR = singlePodIR + minNodesRequired = 1 + } + if len(s.slurmJobIR.JobInfo.Nodes) < minNodesRequired { return nil, fwk.NewStatus(fwk.Success) } // If no placeholder job exists, we should create one with the list // of nodes that passed Filter plugins. if placeholderJob.JobId == 0 { - jobid, err := sb.slurmControl.SubmitJob(ctx, pod, s.slurmJobIR) + jobid, err := sb.slurmControl.SubmitJob(ctx, pod, submitIR) if err != nil { aggErrors := func() utilerrors.Aggregate { var target utilerrors.Aggregate @@ -340,7 +350,7 @@ func (sb *SlurmBridge) PostFilter(ctx context.Context, state fwk.CycleState, pod return nil, fwk.NewStatus(fwk.Error, err.Error()) } logger.V(5).Info("submitted placeholder to slurm", klog.KObj(pod)) - err = sb.labelPodsWithJobId(ctx, jobid, s.slurmJobIR) + err = sb.labelPodsWithJobId(ctx, jobid, submitIR) if err != nil { return nil, fwk.NewStatus(fwk.Error, err.Error()) } @@ -352,14 +362,14 @@ func (sb *SlurmBridge) PostFilter(ctx context.Context, state fwk.CycleState, pod logger.V(4).Info("placeholder job exists but no nodes have been allocated") // As the placeholder job is not yet running, update to the job // to include any changes from slurmJobIR. - jobid, err := sb.slurmControl.UpdateJob(ctx, pod, s.slurmJobIR) + jobid, err := sb.slurmControl.UpdateJob(ctx, pod, submitIR) if err != nil { logger.Error(err, "error updating Slurm job") return nil, fwk.NewStatus(fwk.Error, err.Error()) } // Update the pods with the jobId label in case there // are new pods included in slurmJobIR after the update. - err = sb.labelPodsWithJobId(ctx, jobid, s.slurmJobIR) + err = sb.labelPodsWithJobId(ctx, jobid, submitIR) if err != nil { logger.Error(err, "error labeling pods after update") return nil, fwk.NewStatus(fwk.Error, err.Error()) @@ -430,15 +440,12 @@ func (sb *SlurmBridge) labelPodsWithJobId(ctx context.Context, jobid int32, slur return nil } -// annotatePodsWithNodes will annotate a node assignment to pods -func (sb *SlurmBridge) annotatePodsWithNodes(ctx context.Context, jobid int32, kubeNodes sets.Set[string], pods *corev1.PodList) error { +// annotatePodsWithNodes assigns nodes to pods in order so pod-to-node mapping +// matches Slurm task layout (e.g. when Shared=User and nodelist has repeated nodes). +func (sb *SlurmBridge) annotatePodsWithNodes(ctx context.Context, jobid int32, kubeNodesOrdered []string, pods *corev1.PodList) error { logger := klog.FromContext(ctx) + nodeIndex := 0 for _, p := range pods.Items { - // Return if there are no nodes left - if kubeNodes.Len() == 0 { - logger.V(5).Info("no nodes left to annotate") - break - } // If this pod doesn't have a JobId that matches, it should be skipped as // it didn't exist when the placeholder job was created podJobID := slurmjobir.ParseSlurmJobId(p.Labels[wellknown.LabelPlaceholderJobId]) @@ -446,14 +453,15 @@ func (sb *SlurmBridge) annotatePodsWithNodes(ctx context.Context, jobid int32, k logger.V(5).Info("pod JobID does not match placeholder JobID") continue } + if nodeIndex >= len(kubeNodesOrdered) { + logger.V(5).Info("no nodes left to annotate") + return ErrorNoKubeNode + } if p.Annotations == nil { p.Annotations = make(map[string]string) } - node, ok := kubeNodes.PopAny() - if !ok { - logger.V(4).Info("could not get a node to assign") - return ErrorNoKubeNode - } + node := kubeNodesOrdered[nodeIndex] + nodeIndex++ toUpdate := p.DeepCopy() toUpdate.Annotations[wellknown.AnnotationPlaceholderNode] = node if err := sb.Patch(ctx, toUpdate, client.StrategicMergeFrom(&p)); err != nil { @@ -464,8 +472,9 @@ func (sb *SlurmBridge) annotatePodsWithNodes(ctx context.Context, jobid int32, k return nil } -// slurmToKubeNodes will translate slurm node names to kubernetes node names -func (sb *SlurmBridge) slurmToKubeNodes(ctx context.Context, slurmNodes []string) (sets.Set[string], error) { +// slurmToKubeNodesOrdered translates Slurm node names to Kubernetes node names +// preserving order so pod-to-node assignment matches Slurm task layout. +func (sb *SlurmBridge) slurmToKubeNodesOrdered(ctx context.Context, slurmNodes []string) ([]string, error) { logger := klog.FromContext(ctx) nodeList := &corev1.NodeList{} @@ -474,7 +483,7 @@ func (sb *SlurmBridge) slurmToKubeNodes(ctx context.Context, slurmNodes []string return nil, err } - kubeNodes := make(sets.Set[string]) + kubeNodesOrdered := make([]string, 0, len(slurmNodes)) nodeNameMap := nodecontrollerutils.MakeNodeNameMap(ctx, nodeList) for _, slurmNode := range slurmNodes { kubeNode, ok := nodeNameMap[slurmNode] @@ -492,12 +501,11 @@ func (sb *SlurmBridge) slurmToKubeNodes(ctx context.Context, slurmNodes []string } else { return nil, ErrorNoKubeNodeMatch } - } - kubeNodes.Insert(kubeNode) + kubeNodesOrdered = append(kubeNodesOrdered, kubeNode) } - return kubeNodes, nil + return kubeNodesOrdered, nil } // revertPlaceholderJob will delete the placeholder job associate with the pod diff --git a/internal/scheduler/plugins/slurmbridge/slurmcontrol/slurmcontrol.go b/internal/scheduler/plugins/slurmbridge/slurmcontrol/slurmcontrol.go index f2508a7..454fa0e 100644 --- a/internal/scheduler/plugins/slurmbridge/slurmcontrol/slurmcontrol.go +++ b/internal/scheduler/plugins/slurmbridge/slurmcontrol/slurmcontrol.go @@ -190,8 +190,8 @@ func (r *realSlurmControl) submitJob(ctx context.Context, pod *corev1.Pod, slurm }(), Qos: slurmJobIR.JobInfo.QOS, Reservation: slurmJobIR.JobInfo.Reservation, - // SharedNone is effectively Exclusive - Shared: &[]v0044.V0044JobDescMsgShared{v0044.V0044JobDescMsgSharedNone}, + // SharedUser allows multiple jobs (or tasks) to share a node (e.g. multiple pods per multi-GPU node). + Shared: &[]v0044.V0044JobDescMsgShared{v0044.V0044JobDescMsgSharedUser}, TasksPerNode: slurmJobIR.JobInfo.TasksPerNode, TimeLimit: func() *v0044.V0044Uint32NoValStruct { if slurmJobIR.JobInfo.TimeLimit != nil { diff --git a/internal/utils/slurmjobir/slurmjobir.go b/internal/utils/slurmjobir/slurmjobir.go index 945cd3b..5b2a4d1 100644 --- a/internal/utils/slurmjobir/slurmjobir.go +++ b/internal/utils/slurmjobir/slurmjobir.go @@ -59,6 +59,17 @@ type translator struct { ctx context.Context } +// IsOneJobPerGroupWorkload returns true when the workload uses one Slurm placeholder job per group +// (PodGroup, JobSet, LWS). When false, the workload uses one job per pod (standalone Pod or Job). +func IsOneJobPerGroupWorkload(ir *SlurmJobIR) bool { + switch ir.RootPOM.TypeMeta { + case podGroup_v1alpha1, jobSet_v1alpha2, lws_v1: + return true + default: + return false + } +} + func PreFilter(c client.Client, ctx context.Context, pod *corev1.Pod, slurmJobIR *SlurmJobIR) *fwk.Status { t := translator{Reader: c, ctx: ctx} switch slurmJobIR.RootPOM.TypeMeta { @@ -114,6 +125,35 @@ func TranslateToSlurmJobIR(c client.Client, ctx context.Context, pod *corev1.Pod return slurmJobIR, err } +// BuildSinglePodIR builds a SlurmJobIR for a single pod (one job per pod). It copies RootPOM and +// annotation-derived JobInfo from the group IR, then derives CpuPerTask, MemPerNode, and Gres from +// the given pod only, and sets MinNodes=1, MaxNodes=1, TasksPerNode=1. +func BuildSinglePodIR(ir *SlurmJobIR, pod *corev1.Pod) *SlurmJobIR { + single := &SlurmJobIR{ + RootPOM: ir.RootPOM, + Pods: corev1.PodList{Items: []corev1.Pod{*pod}}, + JobInfo: SlurmJobIRJobInfo{ + Account: ir.JobInfo.Account, + Constraints: ir.JobInfo.Constraints, + GroupId: ir.JobInfo.GroupId, + JobName: ir.JobInfo.JobName, + Licenses: ir.JobInfo.Licenses, + Partition: ir.JobInfo.Partition, + QOS: ir.JobInfo.QOS, + Reservation: ir.JobInfo.Reservation, + TimeLimit: ir.JobInfo.TimeLimit, + UserId: ir.JobInfo.UserId, + Wckey: ir.JobInfo.Wckey, + MinNodes: ptr.To(int32(1)), + MaxNodes: ptr.To(int32(1)), + TasksPerNode: ptr.To(int32(1)), + }, + } + parsePodsCpuAndMemory(single) + parseGPUDevicePlugin(single) + return single +} + /* Set CPU and Memory for the placeholder job based on the maximum Pod CPU and Memory (including overhead) */ func parsePodsCpuAndMemory(slurmJobIR *SlurmJobIR) { var cpuMax resource.Quantity diff --git a/internal/utils/slurmjobir/slurmjobir_test.go b/internal/utils/slurmjobir/slurmjobir_test.go index c6aa63d..974ef6b 100644 --- a/internal/utils/slurmjobir/slurmjobir_test.go +++ b/internal/utils/slurmjobir/slurmjobir_test.go @@ -501,3 +501,86 @@ func Test_parseAnnotations(t *testing.T) { }) } } + +func TestIsOneJobPerGroupWorkload(t *testing.T) { + tests := []struct { + name string + ir *SlurmJobIR + want bool + }{ + { + name: "PodGroup is one job per group", + ir: &SlurmJobIR{RootPOM: metav1.PartialObjectMetadata{TypeMeta: podGroup_v1alpha1}}, + want: true, + }, + { + name: "JobSet is one job per group", + ir: &SlurmJobIR{RootPOM: metav1.PartialObjectMetadata{TypeMeta: jobSet_v1alpha2}}, + want: true, + }, + { + name: "LWS is one job per group", + ir: &SlurmJobIR{RootPOM: metav1.PartialObjectMetadata{TypeMeta: lws_v1}}, + want: true, + }, + { + name: "standalone Pod is one job per pod", + ir: &SlurmJobIR{RootPOM: metav1.PartialObjectMetadata{TypeMeta: pod_v1}}, + want: false, + }, + { + name: "Job is one job per pod", + ir: &SlurmJobIR{RootPOM: metav1.PartialObjectMetadata{TypeMeta: job_v1}}, + want: false, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := IsOneJobPerGroupWorkload(tt.ir); got != tt.want { + t.Errorf("IsOneJobPerGroupWorkload() = %v, want %v", got, tt.want) + } + }) + } +} + +func TestBuildSinglePodIR(t *testing.T) { + pod := podWithResources("2", "256Mi", "2", "512Mi") + pod.Namespace = "default" + pod.Name = "test-pod" + ir := &SlurmJobIR{ + RootPOM: metav1.PartialObjectMetadata{TypeMeta: pod_v1, ObjectMeta: metav1.ObjectMeta{Name: "test-pod", Namespace: "default"}}, + Pods: corev1.PodList{Items: []corev1.Pod{pod, podWithResources("1", "128Mi", "1", "256Mi")}}, + JobInfo: SlurmJobIRJobInfo{ + Account: ptr.To("test"), + Partition: ptr.To("debug"), + MinNodes: ptr.To(int32(2)), + MaxNodes: ptr.To(int32(2)), + }, + } + parsePodsCpuAndMemory(ir) + parseGPUDevicePlugin(ir) + + single := BuildSinglePodIR(ir, &pod) + if single == nil { + t.Fatal("BuildSinglePodIR() returned nil") + } + if len(single.Pods.Items) != 1 { + t.Errorf("BuildSinglePodIR() Pods.Items length = %d, want 1", len(single.Pods.Items)) + } + if single.Pods.Items[0].Name != pod.Name { + t.Errorf("BuildSinglePodIR() pod name = %s, want %s", single.Pods.Items[0].Name, pod.Name) + } + if !apiequality.Semantic.DeepEqual(single.RootPOM, ir.RootPOM) { + t.Error("BuildSinglePodIR() RootPOM should match original") + } + if ptr.Deref(single.JobInfo.MinNodes, 0) != 1 || ptr.Deref(single.JobInfo.MaxNodes, 0) != 1 { + t.Errorf("BuildSinglePodIR() MinNodes=%v MaxNodes=%v, want 1, 1", single.JobInfo.MinNodes, single.JobInfo.MaxNodes) + } + if ptr.Deref(single.JobInfo.Account, "") != "test" || ptr.Deref(single.JobInfo.Partition, "") != "debug" { + t.Errorf("BuildSinglePodIR() Account/Partition should be copied from original") + } + // Single pod has 2 CPU, 512Mi from the pod we passed + if ptr.Deref(single.JobInfo.CpuPerTask, 0) != 2 { + t.Errorf("BuildSinglePodIR() CpuPerTask = %v, want 2", single.JobInfo.CpuPerTask) + } +} From 8fd4127b6dc5a0581b3f6b73710540493339dc7a Mon Sep 17 00:00:00 2001 From: Charlie Getzen Date: Fri, 30 Jan 2026 11:03:16 -0800 Subject: [PATCH 2/8] fixes --- docs/scheduler.md | 17 +------- .../plugins/slurmbridge/slurmbridge.go | 43 ++++++++++++++----- internal/utils/slurmjobir/slurmjobir.go | 5 +++ internal/utils/slurmjobir/slurmjobir_test.go | 5 +++ 4 files changed, 44 insertions(+), 26 deletions(-) diff --git a/docs/scheduler.md b/docs/scheduler.md index 1a473e8..f372820 100644 --- a/docs/scheduler.md +++ b/docs/scheduler.md @@ -39,22 +39,7 @@ correctly. ### Slurm configuration -To allow multiple pods to share a multi-GPU node (e.g. one pod per GPU on an -8-GPU node), the following Slurm configuration is required: - -1. **Partition**: The partition used by slurm-bridge must allow multiple jobs (or - tasks) per node. In `slurm.conf`, set **`OverSubscribe=YES`** or - **`OverSubscribe=FORCE`** on the partition. With `OverSubscribe=YES`, jobs - that request sharing (see below) can be packed on the same node. See - [Slurm partition configuration](https://slurm.schedmd.com/slurm.conf.html). - -2. **Job-level**: Placeholder jobs are submitted with **Shared=User** so that - Slurm may schedule them on nodes that allow sharing. No additional Slurm - configuration is required beyond the partition setting above. - -3. **GRES**: Slurm tracks GRES per allocation; each placeholder job requests - that pod's GPU count (e.g. 1 GPU). No change to existing GRES configuration - is required. +To allow multiple pods to be scheduled on a single node, the `OverSubscribe` configuration must be set to `YES` or `FORCE` on the partition in slurm.conf. For more detail on resource sharing in Slurm, see [cons_tres resource sharing](https://slurm.schedmd.com/cons_tres_share.html). diff --git a/internal/scheduler/plugins/slurmbridge/slurmbridge.go b/internal/scheduler/plugins/slurmbridge/slurmbridge.go index 9683ba8..5246bdf 100644 --- a/internal/scheduler/plugins/slurmbridge/slurmbridge.go +++ b/internal/scheduler/plugins/slurmbridge/slurmbridge.go @@ -251,7 +251,11 @@ func (sb *SlurmBridge) PreFilter(ctx context.Context, state fwk.CycleState, pod // The placeholder job is running. Assign nodes to pods in nodelist order // so pod-to-node mapping matches Slurm task layout (important when multiple // pods share a node via Shared=User). - slurmNodes, _ := hostlist.Expand(placeholderJob.Nodes) + slurmNodes, err := hostlist.Expand(placeholderJob.Nodes) + if err != nil { + logger.Error(err, "failed to expand Slurm nodelist", "nodelist", placeholderJob.Nodes) + return nil, fwk.NewStatus(fwk.Error, err.Error()) + } kubeNodesOrdered, err := sb.slurmToKubeNodesOrdered(ctx, slurmNodes) if err != nil { return nil, fwk.NewStatus(fwk.Error, err.Error()) @@ -442,10 +446,25 @@ func (sb *SlurmBridge) labelPodsWithJobId(ctx context.Context, jobid int32, slur // annotatePodsWithNodes assigns nodes to pods in order so pod-to-node mapping // matches Slurm task layout (e.g. when Shared=User and nodelist has repeated nodes). +// Pods are processed in deterministic order (namespace, then name) so that the +// i-th node in the Slurm nodelist is assigned to the i-th pod in that order. func (sb *SlurmBridge) annotatePodsWithNodes(ctx context.Context, jobid int32, kubeNodesOrdered []string, pods *corev1.PodList) error { logger := klog.FromContext(ctx) + // Build sorted indices so we iterate in deterministic order without mutating the caller's slice. + indices := make([]int, len(pods.Items)) + for i := range indices { + indices[i] = i + } + slices.SortFunc(indices, func(i, j int) int { + a, b := pods.Items[i], pods.Items[j] + if a.Namespace != b.Namespace { + return strings.Compare(a.Namespace, b.Namespace) + } + return strings.Compare(a.Name, b.Name) + }) nodeIndex := 0 - for _, p := range pods.Items { + for _, idx := range indices { + p := &pods.Items[idx] // If this pod doesn't have a JobId that matches, it should be skipped as // it didn't exist when the placeholder job was created podJobID := slurmjobir.ParseSlurmJobId(p.Labels[wellknown.LabelPlaceholderJobId]) @@ -464,7 +483,7 @@ func (sb *SlurmBridge) annotatePodsWithNodes(ctx context.Context, jobid int32, k nodeIndex++ toUpdate := p.DeepCopy() toUpdate.Annotations[wellknown.AnnotationPlaceholderNode] = node - if err := sb.Patch(ctx, toUpdate, client.StrategicMergeFrom(&p)); err != nil { + if err := sb.Patch(ctx, toUpdate, client.StrategicMergeFrom(p)); err != nil { logger.Error(err, "failed to update pod with slurm job id") return ErrorPodUpdateFailed } @@ -581,13 +600,17 @@ func (sb *SlurmBridge) validatePodToJob(ctx context.Context, pod *corev1.Pod) er toUpdate.Labels[wellknown.LabelPlaceholderJobId] = strconv.Itoa(int(val.JobId)) } // If the pod has a Node set, validate it against podToJob - nodes, _ := hostlist.Expand(val.Nodes) - if pod.Annotations[wellknown.AnnotationPlaceholderNode] != "" && - !slices.Contains(nodes, pod.Annotations[wellknown.AnnotationPlaceholderNode]) { - logger.V(3).Info("Pod node annotation does not match Slurm nodes", "pod", klog.KObj(pod), - "node annotation", pod.Annotations[wellknown.AnnotationPlaceholderNode], - "slurm job", val) - toUpdate.Annotations[wellknown.AnnotationPlaceholderNode] = "" + nodes, err := hostlist.Expand(val.Nodes) + if err != nil { + logger.Error(err, "failed to expand Slurm nodelist for validation", "nodelist", val.Nodes) + } else { + if pod.Annotations[wellknown.AnnotationPlaceholderNode] != "" && + !slices.Contains(nodes, pod.Annotations[wellknown.AnnotationPlaceholderNode]) { + logger.V(3).Info("Pod node annotation does not match Slurm nodes", "pod", klog.KObj(pod), + "node annotation", pod.Annotations[wellknown.AnnotationPlaceholderNode], + "slurm job", val) + toUpdate.Annotations[wellknown.AnnotationPlaceholderNode] = "" + } } if !reflect.DeepEqual(pod, toUpdate) { if err := sb.Patch(ctx, toUpdate, client.StrategicMergeFrom(pod)); err != nil { diff --git a/internal/utils/slurmjobir/slurmjobir.go b/internal/utils/slurmjobir/slurmjobir.go index 5b2a4d1..adf0e24 100644 --- a/internal/utils/slurmjobir/slurmjobir.go +++ b/internal/utils/slurmjobir/slurmjobir.go @@ -61,7 +61,11 @@ type translator struct { // IsOneJobPerGroupWorkload returns true when the workload uses one Slurm placeholder job per group // (PodGroup, JobSet, LWS). When false, the workload uses one job per pod (standalone Pod or Job). +// Returns false if ir is nil. func IsOneJobPerGroupWorkload(ir *SlurmJobIR) bool { + if ir == nil { + return false + } switch ir.RootPOM.TypeMeta { case podGroup_v1alpha1, jobSet_v1alpha2, lws_v1: return true @@ -128,6 +132,7 @@ func TranslateToSlurmJobIR(c client.Client, ctx context.Context, pod *corev1.Pod // BuildSinglePodIR builds a SlurmJobIR for a single pod (one job per pod). It copies RootPOM and // annotation-derived JobInfo from the group IR, then derives CpuPerTask, MemPerNode, and Gres from // the given pod only, and sets MinNodes=1, MaxNodes=1, TasksPerNode=1. +// The caller must set JobInfo.Nodes when using the result for submission (e.g. for RequiredNodes). func BuildSinglePodIR(ir *SlurmJobIR, pod *corev1.Pod) *SlurmJobIR { single := &SlurmJobIR{ RootPOM: ir.RootPOM, diff --git a/internal/utils/slurmjobir/slurmjobir_test.go b/internal/utils/slurmjobir/slurmjobir_test.go index 974ef6b..b9a71d7 100644 --- a/internal/utils/slurmjobir/slurmjobir_test.go +++ b/internal/utils/slurmjobir/slurmjobir_test.go @@ -533,6 +533,11 @@ func TestIsOneJobPerGroupWorkload(t *testing.T) { ir: &SlurmJobIR{RootPOM: metav1.PartialObjectMetadata{TypeMeta: job_v1}}, want: false, }, + { + name: "nil IR returns false and does not panic", + ir: nil, + want: false, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { From 6e5af8c57153ec7c9c2a89251cafcc6c8d8446a7 Mon Sep 17 00:00:00 2001 From: Charlie Getzen Date: Fri, 30 Jan 2026 12:51:47 -0800 Subject: [PATCH 3/8] docs --- README.md | 4 ++-- docs/index.rst | 2 +- docs/scheduler.md | 7 ------- docs/slurm-configuration.md | 28 ++++++++++++++++++++++++++++ 4 files changed, 31 insertions(+), 10 deletions(-) create mode 100644 docs/slurm-configuration.md diff --git a/README.md b/README.md index fb8397f..96277de 100644 --- a/README.md +++ b/README.md @@ -74,9 +74,9 @@ Slurm is a full featured HPC workload manager. To highlight a few features: | Kubernetes | [v1.34](https://kubernetes.io/blog/2025/08/27/kubernetes-v1-34-release/) | | Slurm | [25.05](https://www.schedmd.com/slurm-version-25-05-0-is-now-available/) | -## Limitations +## Slurm Configuration -- To allow multiple pods per multi-GPU node, the Slurm partition used by slurm-bridge must be configured for node sharing (see [Slurm configuration](docs/scheduler.md#slurm-configuration) in the Scheduler documentation). +See [Slurm Configuration](docs/slurm-configuration.md) for best practices on configuring Slurm for use with slurm-bridge. ## Installation diff --git a/docs/index.rst b/docs/index.rst index a8603e6..bbafac5 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -106,7 +106,7 @@ Compatibility Limitations ----------- -- To allow multiple pods per multi-GPU node, the Slurm partition used by slurm-bridge must be configured for node sharing (see the Scheduler documentation, "Slurm configuration" section). +- To allow multiple pods per multi-GPU node, the Slurm partition used by slurm-bridge must be configured for node sharing (see the Scheduler documentation, "Bin-Packing" section). Installation ------------ diff --git a/docs/scheduler.md b/docs/scheduler.md index f372820..b1db806 100644 --- a/docs/scheduler.md +++ b/docs/scheduler.md @@ -37,13 +37,6 @@ represented Pods. This scheduler defers scheduling decisions to Slurm, hence certain assumptions about the environment must be met for this to function correctly. -### Slurm configuration - -To allow multiple pods to be scheduled on a single node, the `OverSubscribe` configuration must be set to `YES` or `FORCE` on the partition in slurm.conf. - -For more detail on resource sharing in Slurm, see -[cons_tres resource sharing](https://slurm.schedmd.com/cons_tres_share.html). - ### Sequence Diagram ```mermaid diff --git a/docs/slurm-configuration.md b/docs/slurm-configuration.md new file mode 100644 index 0000000..6d2a4d6 --- /dev/null +++ b/docs/slurm-configuration.md @@ -0,0 +1,28 @@ +# Slurm Configuration + +## Bin Pack Multiple Pods on a Node + +By default, Slurm will reserve a full node for each pod. To allow bin packing, adjust slurm.conf: + +- **OverSubscribe** — Set to `YES` or `FORCE` on the partition so multiple jobs (pods) can share nodes. + +- **SchedulerParameters** - Set: + - `bf_busy_nodes` — Backfill scheduler prefers nodes that are already busy, packing jobs onto fewer nodes and leaving others idle for whole-node jobs. + - `pack_serial_at_end` — Schedules serial jobs at the end of the backfill window to reduce fragmentation and improve packing. + +If using slinky, this can be set by adjusting its `values.yaml`: + +```yaml +controller: + extraConf: | + SchedulerParameters=pack_serial_at_end,bf_busy_nodes +nodesets: + slinky: # or other nodeset name + partition: + config: | + OverSubscribe=YES +``` + +For more details, see: +- [cons_tres resource sharing](https://slurm.schedmd.com/cons_tres_share.html). +- [Scheduler Params](https://slurm.schedmd.com/slurm.conf.html#OPT_SchedulerParameters) \ No newline at end of file From 0a26ba5f7ece8c2b1af7dfbaef307d42372509f5 Mon Sep 17 00:00:00 2001 From: Charlie Getzen Date: Fri, 30 Jan 2026 13:18:31 -0800 Subject: [PATCH 4/8] cleanup --- .../plugins/slurmbridge/slurmbridge.go | 42 ++++++++----------- .../slurmbridge/slurmcontrol/slurmcontrol.go | 2 +- 2 files changed, 19 insertions(+), 25 deletions(-) diff --git a/internal/scheduler/plugins/slurmbridge/slurmbridge.go b/internal/scheduler/plugins/slurmbridge/slurmbridge.go index 5246bdf..ed6d9d9 100644 --- a/internal/scheduler/plugins/slurmbridge/slurmbridge.go +++ b/internal/scheduler/plugins/slurmbridge/slurmbridge.go @@ -248,19 +248,13 @@ func (sb *SlurmBridge) PreFilter(ctx context.Context, state fwk.CycleState, pod logger.V(4).Info("placeholder job exists but no nodes have been allocated") return nil, fwk.NewStatus(fwk.Pending, ErrorNoNodesAssigned.Error()) } - // The placeholder job is running. Assign nodes to pods in nodelist order - // so pod-to-node mapping matches Slurm task layout (important when multiple - // pods share a node via Shared=User). - slurmNodes, err := hostlist.Expand(placeholderJob.Nodes) + // The placeholder job is running. Assign nodes to pods. + slurmNodes, _ := hostlist.Expand(placeholderJob.Nodes) + kubeNodes, err := sb.slurmToKubeNodes(ctx, slurmNodes) if err != nil { - logger.Error(err, "failed to expand Slurm nodelist", "nodelist", placeholderJob.Nodes) return nil, fwk.NewStatus(fwk.Error, err.Error()) } - kubeNodesOrdered, err := sb.slurmToKubeNodesOrdered(ctx, slurmNodes) - if err != nil { - return nil, fwk.NewStatus(fwk.Error, err.Error()) - } - err = sb.annotatePodsWithNodes(ctx, placeholderJob.JobId, kubeNodesOrdered, &s.slurmJobIR.Pods) + err = sb.annotatePodsWithNodes(ctx, placeholderJob.JobId, kubeNodes, &s.slurmJobIR.Pods) if err != nil { return nil, fwk.NewStatus(fwk.Error, err.Error()) } @@ -272,7 +266,7 @@ func (sb *SlurmBridge) PreFilter(ctx context.Context, state fwk.CycleState, pod // By passing the list of nodes in the placeholder job as PreFilterResult, // Filter plugins will only run for nodes in the Slurm job. This is the final // PreFilter step that must occur before pods are allowed to run. - return &framework.PreFilterResult{NodeNames: sets.New(kubeNodesOrdered...)}, fwk.NewStatus(fwk.Success, "") + return &framework.PreFilterResult{NodeNames: sets.New(kubeNodes...)}, fwk.NewStatus(fwk.Success, "") } } @@ -330,6 +324,8 @@ func (sb *SlurmBridge) PostFilter(ctx context.Context, state fwk.CycleState, pod submitIR = singlePodIR minNodesRequired = 1 } + // If this situation occurs, the best we can do is trigger another + // scheduling cycle. if len(s.slurmJobIR.JobInfo.Nodes) < minNodesRequired { return nil, fwk.NewStatus(fwk.Success) } @@ -444,13 +440,11 @@ func (sb *SlurmBridge) labelPodsWithJobId(ctx context.Context, jobid int32, slur return nil } -// annotatePodsWithNodes assigns nodes to pods in order so pod-to-node mapping -// matches Slurm task layout (e.g. when Shared=User and nodelist has repeated nodes). -// Pods are processed in deterministic order (namespace, then name) so that the -// i-th node in the Slurm nodelist is assigned to the i-th pod in that order. -func (sb *SlurmBridge) annotatePodsWithNodes(ctx context.Context, jobid int32, kubeNodesOrdered []string, pods *corev1.PodList) error { +// annotatePodsWithNodes will annotate a node assignment to pods +func (sb *SlurmBridge) annotatePodsWithNodes(ctx context.Context, jobid int32, kubeNodes []string, pods *corev1.PodList) error { logger := klog.FromContext(ctx) - // Build sorted indices so we iterate in deterministic order without mutating the caller's slice. + // Sort pods in deterministic order (namespace, name) so that the + // i-th node in the Slurm nodelist is assigned to the i-th pod in that order. indices := make([]int, len(pods.Items)) for i := range indices { indices[i] = i @@ -472,14 +466,14 @@ func (sb *SlurmBridge) annotatePodsWithNodes(ctx context.Context, jobid int32, k logger.V(5).Info("pod JobID does not match placeholder JobID") continue } - if nodeIndex >= len(kubeNodesOrdered) { + if nodeIndex >= len(kubeNodes) { logger.V(5).Info("no nodes left to annotate") return ErrorNoKubeNode } if p.Annotations == nil { p.Annotations = make(map[string]string) } - node := kubeNodesOrdered[nodeIndex] + node := kubeNodes[nodeIndex] nodeIndex++ toUpdate := p.DeepCopy() toUpdate.Annotations[wellknown.AnnotationPlaceholderNode] = node @@ -491,9 +485,9 @@ func (sb *SlurmBridge) annotatePodsWithNodes(ctx context.Context, jobid int32, k return nil } -// slurmToKubeNodesOrdered translates Slurm node names to Kubernetes node names +// slurmToKubeNodes will translate slurm node names to kubernetes node names // preserving order so pod-to-node assignment matches Slurm task layout. -func (sb *SlurmBridge) slurmToKubeNodesOrdered(ctx context.Context, slurmNodes []string) ([]string, error) { +func (sb *SlurmBridge) slurmToKubeNodes(ctx context.Context, slurmNodes []string) ([]string, error) { logger := klog.FromContext(ctx) nodeList := &corev1.NodeList{} @@ -502,7 +496,7 @@ func (sb *SlurmBridge) slurmToKubeNodesOrdered(ctx context.Context, slurmNodes [ return nil, err } - kubeNodesOrdered := make([]string, 0, len(slurmNodes)) + kubeNodes := make([]string, 0, len(slurmNodes)) nodeNameMap := nodecontrollerutils.MakeNodeNameMap(ctx, nodeList) for _, slurmNode := range slurmNodes { kubeNode, ok := nodeNameMap[slurmNode] @@ -521,10 +515,10 @@ func (sb *SlurmBridge) slurmToKubeNodesOrdered(ctx context.Context, slurmNodes [ return nil, ErrorNoKubeNodeMatch } } - kubeNodesOrdered = append(kubeNodesOrdered, kubeNode) + kubeNodes = append(kubeNodes, kubeNode) } - return kubeNodesOrdered, nil + return kubeNodes, nil } // revertPlaceholderJob will delete the placeholder job associate with the pod diff --git a/internal/scheduler/plugins/slurmbridge/slurmcontrol/slurmcontrol.go b/internal/scheduler/plugins/slurmbridge/slurmcontrol/slurmcontrol.go index 454fa0e..cfa9dea 100644 --- a/internal/scheduler/plugins/slurmbridge/slurmcontrol/slurmcontrol.go +++ b/internal/scheduler/plugins/slurmbridge/slurmcontrol/slurmcontrol.go @@ -190,7 +190,7 @@ func (r *realSlurmControl) submitJob(ctx context.Context, pod *corev1.Pod, slurm }(), Qos: slurmJobIR.JobInfo.QOS, Reservation: slurmJobIR.JobInfo.Reservation, - // SharedUser allows multiple jobs (or tasks) to share a node (e.g. multiple pods per multi-GPU node). + // SharedUser allows multiple pods to share a node Shared: &[]v0044.V0044JobDescMsgShared{v0044.V0044JobDescMsgSharedUser}, TasksPerNode: slurmJobIR.JobInfo.TasksPerNode, TimeLimit: func() *v0044.V0044Uint32NoValStruct { From 315a94211fb5cfac375b1b100ed6f32296bed01e Mon Sep 17 00:00:00 2001 From: Charlie Getzen Date: Fri, 30 Jan 2026 13:41:02 -0800 Subject: [PATCH 5/8] revert-small-change --- .../plugins/slurmbridge/slurmbridge.go | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/internal/scheduler/plugins/slurmbridge/slurmbridge.go b/internal/scheduler/plugins/slurmbridge/slurmbridge.go index ed6d9d9..27cf512 100644 --- a/internal/scheduler/plugins/slurmbridge/slurmbridge.go +++ b/internal/scheduler/plugins/slurmbridge/slurmbridge.go @@ -594,17 +594,14 @@ func (sb *SlurmBridge) validatePodToJob(ctx context.Context, pod *corev1.Pod) er toUpdate.Labels[wellknown.LabelPlaceholderJobId] = strconv.Itoa(int(val.JobId)) } // If the pod has a Node set, validate it against podToJob - nodes, err := hostlist.Expand(val.Nodes) - if err != nil { - logger.Error(err, "failed to expand Slurm nodelist for validation", "nodelist", val.Nodes) - } else { - if pod.Annotations[wellknown.AnnotationPlaceholderNode] != "" && - !slices.Contains(nodes, pod.Annotations[wellknown.AnnotationPlaceholderNode]) { - logger.V(3).Info("Pod node annotation does not match Slurm nodes", "pod", klog.KObj(pod), - "node annotation", pod.Annotations[wellknown.AnnotationPlaceholderNode], - "slurm job", val) - toUpdate.Annotations[wellknown.AnnotationPlaceholderNode] = "" - } + + nodes, _ := hostlist.Expand(val.Nodes) + if pod.Annotations[wellknown.AnnotationPlaceholderNode] != "" && + !slices.Contains(nodes, pod.Annotations[wellknown.AnnotationPlaceholderNode]) { + logger.V(3).Info("Pod node annotation does not match Slurm nodes", "pod", klog.KObj(pod), + "node annotation", pod.Annotations[wellknown.AnnotationPlaceholderNode], + "slurm job", val) + toUpdate.Annotations[wellknown.AnnotationPlaceholderNode] = "" } if !reflect.DeepEqual(pod, toUpdate) { if err := sb.Patch(ctx, toUpdate, client.StrategicMergeFrom(pod)); err != nil { From 6a5e1e936a2ef5cf377f92a33e676195119cb09d Mon Sep 17 00:00:00 2001 From: Charlie Getzen Date: Fri, 30 Jan 2026 13:57:44 -0800 Subject: [PATCH 6/8] slight-updates --- internal/scheduler/plugins/slurmbridge/slurmbridge.go | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/internal/scheduler/plugins/slurmbridge/slurmbridge.go b/internal/scheduler/plugins/slurmbridge/slurmbridge.go index 27cf512..e03abfd 100644 --- a/internal/scheduler/plugins/slurmbridge/slurmbridge.go +++ b/internal/scheduler/plugins/slurmbridge/slurmbridge.go @@ -254,7 +254,7 @@ func (sb *SlurmBridge) PreFilter(ctx context.Context, state fwk.CycleState, pod if err != nil { return nil, fwk.NewStatus(fwk.Error, err.Error()) } - err = sb.annotatePodsWithNodes(ctx, placeholderJob.JobId, kubeNodes, &s.slurmJobIR.Pods) + err = sb.annotatePodsWithNodes(ctx, placeholderJob.JobId, slices.Clone(kubeNodes), &s.slurmJobIR.Pods) if err != nil { return nil, fwk.NewStatus(fwk.Error, err.Error()) } @@ -458,7 +458,7 @@ func (sb *SlurmBridge) annotatePodsWithNodes(ctx context.Context, jobid int32, k }) nodeIndex := 0 for _, idx := range indices { - p := &pods.Items[idx] + p := pods.Items[idx] // If this pod doesn't have a JobId that matches, it should be skipped as // it didn't exist when the placeholder job was created podJobID := slurmjobir.ParseSlurmJobId(p.Labels[wellknown.LabelPlaceholderJobId]) @@ -477,7 +477,7 @@ func (sb *SlurmBridge) annotatePodsWithNodes(ctx context.Context, jobid int32, k nodeIndex++ toUpdate := p.DeepCopy() toUpdate.Annotations[wellknown.AnnotationPlaceholderNode] = node - if err := sb.Patch(ctx, toUpdate, client.StrategicMergeFrom(p)); err != nil { + if err := sb.Patch(ctx, toUpdate, client.StrategicMergeFrom(&p)); err != nil { logger.Error(err, "failed to update pod with slurm job id") return ErrorPodUpdateFailed } @@ -594,7 +594,6 @@ func (sb *SlurmBridge) validatePodToJob(ctx context.Context, pod *corev1.Pod) er toUpdate.Labels[wellknown.LabelPlaceholderJobId] = strconv.Itoa(int(val.JobId)) } // If the pod has a Node set, validate it against podToJob - nodes, _ := hostlist.Expand(val.Nodes) if pod.Annotations[wellknown.AnnotationPlaceholderNode] != "" && !slices.Contains(nodes, pod.Annotations[wellknown.AnnotationPlaceholderNode]) { From 49058d8ccadcc0d439fde19658315e5b84c2cd29 Mon Sep 17 00:00:00 2001 From: Charlie Getzen Date: Fri, 30 Jan 2026 14:39:50 -0800 Subject: [PATCH 7/8] simplify --- .../plugins/slurmbridge/slurmbridge.go | 12 ++-- internal/utils/slurmjobir/job.go | 5 +- internal/utils/slurmjobir/job_test.go | 8 ++- internal/utils/slurmjobir/jobset_test.go | 8 ++- internal/utils/slurmjobir/pod.go | 1 + internal/utils/slurmjobir/slurmjobir.go | 30 ---------- internal/utils/slurmjobir/slurmjobir_test.go | 57 +++---------------- 7 files changed, 26 insertions(+), 95 deletions(-) diff --git a/internal/scheduler/plugins/slurmbridge/slurmbridge.go b/internal/scheduler/plugins/slurmbridge/slurmbridge.go index e03abfd..662813a 100644 --- a/internal/scheduler/plugins/slurmbridge/slurmbridge.go +++ b/internal/scheduler/plugins/slurmbridge/slurmbridge.go @@ -316,12 +316,8 @@ func (sb *SlurmBridge) PostFilter(ctx context.Context, state fwk.CycleState, pod // One job per group (PodGroup, JobSet, LWS): require feasible nodes >= len(pods). // One job per pod (standalone Pod, Job): require at least one feasible node. - submitIR := s.slurmJobIR minNodesRequired := len(s.slurmJobIR.Pods.Items) if !slurmjobir.IsOneJobPerGroupWorkload(s.slurmJobIR) { - singlePodIR := slurmjobir.BuildSinglePodIR(s.slurmJobIR, pod) - singlePodIR.JobInfo.Nodes = s.slurmJobIR.JobInfo.Nodes - submitIR = singlePodIR minNodesRequired = 1 } // If this situation occurs, the best we can do is trigger another @@ -333,7 +329,7 @@ func (sb *SlurmBridge) PostFilter(ctx context.Context, state fwk.CycleState, pod // If no placeholder job exists, we should create one with the list // of nodes that passed Filter plugins. if placeholderJob.JobId == 0 { - jobid, err := sb.slurmControl.SubmitJob(ctx, pod, submitIR) + jobid, err := sb.slurmControl.SubmitJob(ctx, pod, s.slurmJobIR) if err != nil { aggErrors := func() utilerrors.Aggregate { var target utilerrors.Aggregate @@ -350,7 +346,7 @@ func (sb *SlurmBridge) PostFilter(ctx context.Context, state fwk.CycleState, pod return nil, fwk.NewStatus(fwk.Error, err.Error()) } logger.V(5).Info("submitted placeholder to slurm", klog.KObj(pod)) - err = sb.labelPodsWithJobId(ctx, jobid, submitIR) + err = sb.labelPodsWithJobId(ctx, jobid, s.slurmJobIR) if err != nil { return nil, fwk.NewStatus(fwk.Error, err.Error()) } @@ -362,14 +358,14 @@ func (sb *SlurmBridge) PostFilter(ctx context.Context, state fwk.CycleState, pod logger.V(4).Info("placeholder job exists but no nodes have been allocated") // As the placeholder job is not yet running, update to the job // to include any changes from slurmJobIR. - jobid, err := sb.slurmControl.UpdateJob(ctx, pod, submitIR) + jobid, err := sb.slurmControl.UpdateJob(ctx, pod, s.slurmJobIR) if err != nil { logger.Error(err, "error updating Slurm job") return nil, fwk.NewStatus(fwk.Error, err.Error()) } // Update the pods with the jobId label in case there // are new pods included in slurmJobIR after the update. - err = sb.labelPodsWithJobId(ctx, jobid, submitIR) + err = sb.labelPodsWithJobId(ctx, jobid, s.slurmJobIR) if err != nil { logger.Error(err, "error labeling pods after update") return nil, fwk.NewStatus(fwk.Error, err.Error()) diff --git a/internal/utils/slurmjobir/job.go b/internal/utils/slurmjobir/job.go index 099a5db..3a34e8d 100644 --- a/internal/utils/slurmjobir/job.go +++ b/internal/utils/slurmjobir/job.go @@ -25,7 +25,10 @@ func (t *translator) fromJob(pod *corev1.Pod, rootPOM *metav1.PartialObjectMetad slurmJobIR := &SlurmJobIR{} slurmJobIR.Pods.Items = append(slurmJobIR.Pods.Items, *pod) - slurmJobIR.JobInfo.MinNodes = ptr.To(int32(1)) + tasks := int32(1) + slurmJobIR.JobInfo.MinNodes = ptr.To(tasks) + slurmJobIR.JobInfo.MaxNodes = ptr.To(tasks) + slurmJobIR.JobInfo.TasksPerNode = ptr.To(tasks) if job.Spec.Template.Spec.Resources != nil { slurmJobIR.JobInfo.CpuPerTask = ptr.To(int32(job.Spec.Template.Spec.Resources.Limits.Cpu().Value())) //nolint:gosec // disable G115 slurmJobIR.JobInfo.MemPerNode = ptr.To(int64(GetMemoryFromQuantity(job.Spec.Template.Spec.Resources.Limits.Memory()))) diff --git a/internal/utils/slurmjobir/job_test.go b/internal/utils/slurmjobir/job_test.go index 90c048d..49cb070 100644 --- a/internal/utils/slurmjobir/job_test.go +++ b/internal/utils/slurmjobir/job_test.go @@ -124,9 +124,11 @@ func Test_translator_fromJob(t *testing.T) { }, want: &SlurmJobIR{ JobInfo: SlurmJobIRJobInfo{ - MinNodes: ptr.To(int32(1)), - CpuPerTask: ptr.To(int32(22)), - MemPerNode: ptr.To(int64(1)), + MinNodes: ptr.To(int32(1)), + MaxNodes: ptr.To(int32(1)), + TasksPerNode: ptr.To(int32(1)), + CpuPerTask: ptr.To(int32(22)), + MemPerNode: ptr.To(int64(1)), }, Pods: corev1.PodList{ Items: []corev1.Pod{{ diff --git a/internal/utils/slurmjobir/jobset_test.go b/internal/utils/slurmjobir/jobset_test.go index a642f9c..f52e2fe 100644 --- a/internal/utils/slurmjobir/jobset_test.go +++ b/internal/utils/slurmjobir/jobset_test.go @@ -127,9 +127,11 @@ func Test_translator_fromJobSet(t *testing.T) { }, want: &SlurmJobIR{ JobInfo: SlurmJobIRJobInfo{ - MinNodes: ptr.To(int32(1)), - CpuPerTask: ptr.To(int32(22)), - MemPerNode: ptr.To(int64(1)), + MinNodes: ptr.To(int32(1)), + MaxNodes: ptr.To(int32(1)), + TasksPerNode: ptr.To(int32(1)), + CpuPerTask: ptr.To(int32(22)), + MemPerNode: ptr.To(int64(1)), }, Pods: corev1.PodList{ Items: []corev1.Pod{ diff --git a/internal/utils/slurmjobir/pod.go b/internal/utils/slurmjobir/pod.go index 3d94456..e4d65df 100644 --- a/internal/utils/slurmjobir/pod.go +++ b/internal/utils/slurmjobir/pod.go @@ -19,5 +19,6 @@ func (t *translator) fromPod(pod *corev1.Pod) (*SlurmJobIR, error) { tasks := int32(1) slurmJobIR.JobInfo.TasksPerNode = &tasks slurmJobIR.JobInfo.MaxNodes = &tasks + slurmJobIR.JobInfo.MinNodes = &tasks return slurmJobIR, nil } diff --git a/internal/utils/slurmjobir/slurmjobir.go b/internal/utils/slurmjobir/slurmjobir.go index adf0e24..a979e4e 100644 --- a/internal/utils/slurmjobir/slurmjobir.go +++ b/internal/utils/slurmjobir/slurmjobir.go @@ -129,36 +129,6 @@ func TranslateToSlurmJobIR(c client.Client, ctx context.Context, pod *corev1.Pod return slurmJobIR, err } -// BuildSinglePodIR builds a SlurmJobIR for a single pod (one job per pod). It copies RootPOM and -// annotation-derived JobInfo from the group IR, then derives CpuPerTask, MemPerNode, and Gres from -// the given pod only, and sets MinNodes=1, MaxNodes=1, TasksPerNode=1. -// The caller must set JobInfo.Nodes when using the result for submission (e.g. for RequiredNodes). -func BuildSinglePodIR(ir *SlurmJobIR, pod *corev1.Pod) *SlurmJobIR { - single := &SlurmJobIR{ - RootPOM: ir.RootPOM, - Pods: corev1.PodList{Items: []corev1.Pod{*pod}}, - JobInfo: SlurmJobIRJobInfo{ - Account: ir.JobInfo.Account, - Constraints: ir.JobInfo.Constraints, - GroupId: ir.JobInfo.GroupId, - JobName: ir.JobInfo.JobName, - Licenses: ir.JobInfo.Licenses, - Partition: ir.JobInfo.Partition, - QOS: ir.JobInfo.QOS, - Reservation: ir.JobInfo.Reservation, - TimeLimit: ir.JobInfo.TimeLimit, - UserId: ir.JobInfo.UserId, - Wckey: ir.JobInfo.Wckey, - MinNodes: ptr.To(int32(1)), - MaxNodes: ptr.To(int32(1)), - TasksPerNode: ptr.To(int32(1)), - }, - } - parsePodsCpuAndMemory(single) - parseGPUDevicePlugin(single) - return single -} - /* Set CPU and Memory for the placeholder job based on the maximum Pod CPU and Memory (including overhead) */ func parsePodsCpuAndMemory(slurmJobIR *SlurmJobIR) { var cpuMax resource.Quantity diff --git a/internal/utils/slurmjobir/slurmjobir_test.go b/internal/utils/slurmjobir/slurmjobir_test.go index b9a71d7..794a0b0 100644 --- a/internal/utils/slurmjobir/slurmjobir_test.go +++ b/internal/utils/slurmjobir/slurmjobir_test.go @@ -110,6 +110,10 @@ func TestTranslateToSlurmJobIR(t *testing.T) { JobInfo: SlurmJobIRJobInfo{ Account: ptr.To("test1"), GroupId: ptr.To("1000"), + MinNodes: func() *int32 { + minNodes := int32(1) + return &minNodes + }(), MaxNodes: func() *int32 { maxNodes := int32(1) return &maxNodes @@ -146,14 +150,9 @@ func TestTranslateToSlurmJobIR(t *testing.T) { Items: []corev1.Pod{*podWithBadAnnotation.DeepCopy()}, }, JobInfo: SlurmJobIRJobInfo{ - MaxNodes: func() *int32 { - maxNodes := int32(1) - return &maxNodes - }(), - TasksPerNode: func() *int32 { - tasksPerNode := int32(1) - return &tasksPerNode - }(), + MinNodes: ptr.To(int32(1)), + MaxNodes: ptr.To(int32(1)), + TasksPerNode: ptr.To(int32(1)), }, }, wantErr: true, @@ -547,45 +546,3 @@ func TestIsOneJobPerGroupWorkload(t *testing.T) { }) } } - -func TestBuildSinglePodIR(t *testing.T) { - pod := podWithResources("2", "256Mi", "2", "512Mi") - pod.Namespace = "default" - pod.Name = "test-pod" - ir := &SlurmJobIR{ - RootPOM: metav1.PartialObjectMetadata{TypeMeta: pod_v1, ObjectMeta: metav1.ObjectMeta{Name: "test-pod", Namespace: "default"}}, - Pods: corev1.PodList{Items: []corev1.Pod{pod, podWithResources("1", "128Mi", "1", "256Mi")}}, - JobInfo: SlurmJobIRJobInfo{ - Account: ptr.To("test"), - Partition: ptr.To("debug"), - MinNodes: ptr.To(int32(2)), - MaxNodes: ptr.To(int32(2)), - }, - } - parsePodsCpuAndMemory(ir) - parseGPUDevicePlugin(ir) - - single := BuildSinglePodIR(ir, &pod) - if single == nil { - t.Fatal("BuildSinglePodIR() returned nil") - } - if len(single.Pods.Items) != 1 { - t.Errorf("BuildSinglePodIR() Pods.Items length = %d, want 1", len(single.Pods.Items)) - } - if single.Pods.Items[0].Name != pod.Name { - t.Errorf("BuildSinglePodIR() pod name = %s, want %s", single.Pods.Items[0].Name, pod.Name) - } - if !apiequality.Semantic.DeepEqual(single.RootPOM, ir.RootPOM) { - t.Error("BuildSinglePodIR() RootPOM should match original") - } - if ptr.Deref(single.JobInfo.MinNodes, 0) != 1 || ptr.Deref(single.JobInfo.MaxNodes, 0) != 1 { - t.Errorf("BuildSinglePodIR() MinNodes=%v MaxNodes=%v, want 1, 1", single.JobInfo.MinNodes, single.JobInfo.MaxNodes) - } - if ptr.Deref(single.JobInfo.Account, "") != "test" || ptr.Deref(single.JobInfo.Partition, "") != "debug" { - t.Errorf("BuildSinglePodIR() Account/Partition should be copied from original") - } - // Single pod has 2 CPU, 512Mi from the pod we passed - if ptr.Deref(single.JobInfo.CpuPerTask, 0) != 2 { - t.Errorf("BuildSinglePodIR() CpuPerTask = %v, want 2", single.JobInfo.CpuPerTask) - } -} From 0fd9a79a88505aa330c275e3a501893640f57dba Mon Sep 17 00:00:00 2001 From: Charlie Getzen Date: Fri, 30 Jan 2026 14:52:07 -0800 Subject: [PATCH 8/8] simplify --- internal/scheduler/plugins/slurmbridge/slurmbridge.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/scheduler/plugins/slurmbridge/slurmbridge.go b/internal/scheduler/plugins/slurmbridge/slurmbridge.go index 662813a..91bdfe5 100644 --- a/internal/scheduler/plugins/slurmbridge/slurmbridge.go +++ b/internal/scheduler/plugins/slurmbridge/slurmbridge.go @@ -254,7 +254,7 @@ func (sb *SlurmBridge) PreFilter(ctx context.Context, state fwk.CycleState, pod if err != nil { return nil, fwk.NewStatus(fwk.Error, err.Error()) } - err = sb.annotatePodsWithNodes(ctx, placeholderJob.JobId, slices.Clone(kubeNodes), &s.slurmJobIR.Pods) + err = sb.annotatePodsWithNodes(ctx, placeholderJob.JobId, kubeNodes, &s.slurmJobIR.Pods) if err != nil { return nil, fwk.NewStatus(fwk.Error, err.Error()) }