4 changes: 2 additions & 2 deletions README.md
@@ -74,9 +74,9 @@ Slurm is a full featured HPC workload manager. To highlight a few features:
| Kubernetes | [v1.34](https://kubernetes.io/blog/2025/08/27/kubernetes-v1-34-release/) |
| Slurm | [25.05](https://www.schedmd.com/slurm-version-25-05-0-is-now-available/) |

## Limitations
## Slurm Configuration

- Exclusive, whole node allocations are made for each pod.
See [Slurm Configuration](docs/slurm-configuration.md) for best practices on configuring Slurm for use with slurm-bridge.

## Installation

2 changes: 1 addition & 1 deletion docs/index.rst


28 changes: 28 additions & 0 deletions docs/slurm-configuration.md
@@ -0,0 +1,28 @@
# Slurm Configuration

## Bin Pack Multiple Pods on a Node

By default, Slurm reserves a full node for each pod. To allow bin packing, adjust the following `slurm.conf` settings (a sketch follows this list):

- **OverSubscribe** — Set to `YES` or `FORCE` on the partition so multiple jobs (pods) can share nodes.

- **SchedulerParameters** — Set:
- `bf_busy_nodes` — Backfill scheduler prefers nodes that are already busy, packing jobs onto fewer nodes and leaving others idle for whole-node jobs.
- `pack_serial_at_end` — Schedules serial jobs at the end of the backfill window to reduce fragmentation and improve packing.
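
Taken together, a minimal `slurm.conf` sketch of these settings might look like the following; the partition name and node list are placeholders for your own cluster, not values from the upstream docs:

```conf
# Backfill scheduler prefers nodes that are already busy and defers serial jobs.
SchedulerParameters=bf_busy_nodes,pack_serial_at_end
# Allow multiple jobs (pods) to share the nodes in this partition.
PartitionName=slinky Nodes=ALL OverSubscribe=YES State=UP
```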

If using Slinky, these settings can be configured in its `values.yaml`:

```yaml
controller:
extraConf: |
SchedulerParameters=pack_serial_at_end,bf_busy_nodes
nodesets:
slinky: # or other nodeset name
partition:
config: |
OverSubscribe=YES
```
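
After changing `values.yaml`, redeploy the chart (for example, with `helm upgrade`) so the new settings are rendered into the generated `slurm.conf`.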

For more details, see:
- [cons_tres resource sharing](https://slurm.schedmd.com/cons_tres_share.html)
- [SchedulerParameters](https://slurm.schedmd.com/slurm.conf.html#OPT_SchedulerParameters)
53 changes: 35 additions & 18 deletions internal/scheduler/plugins/slurmbridge/slurmbridge.go
@@ -254,7 +254,7 @@ func (sb *SlurmBridge) PreFilter(ctx context.Context, state fwk.CycleState, pod
if err != nil {
return nil, fwk.NewStatus(fwk.Error, err.Error())
}
err = sb.annotatePodsWithNodes(ctx, placeholderJob.JobId, kubeNodes.Clone(), &s.slurmJobIR.Pods)
err = sb.annotatePodsWithNodes(ctx, placeholderJob.JobId, kubeNodes, &s.slurmJobIR.Pods)
if err != nil {
return nil, fwk.NewStatus(fwk.Error, err.Error())
}
@@ -266,7 +266,7 @@ func (sb *SlurmBridge) PreFilter(ctx context.Context, state fwk.CycleState, pod
// By passing the list of nodes in the placeholder job as PreFilterResult,
// Filter plugins will only run for nodes in the Slurm job. This is the final
// PreFilter step that must occur before pods are allowed to run.
return &framework.PreFilterResult{NodeNames: kubeNodes}, fwk.NewStatus(fwk.Success, "")
return &framework.PreFilterResult{NodeNames: sets.New(kubeNodes...)}, fwk.NewStatus(fwk.Success, "")
}
}

@@ -314,9 +314,15 @@ func (sb *SlurmBridge) PostFilter(ctx context.Context, state fwk.CycleState, pod
}
}

// One job per group (PodGroup, JobSet, LWS): require feasible nodes >= len(pods).
// One job per pod (standalone Pod, Job): require at least one feasible node.
minNodesRequired := len(s.slurmJobIR.Pods.Items)
if !slurmjobir.IsOneJobPerGroupWorkload(s.slurmJobIR) {
minNodesRequired = 1
}
// If this situation occurs, the best we can do is trigger another
// scheduling cycle.
if len(s.slurmJobIR.JobInfo.Nodes) < len(s.slurmJobIR.Pods.Items) {
if len(s.slurmJobIR.JobInfo.Nodes) < minNodesRequired {
return nil, fwk.NewStatus(fwk.Success)
}

@@ -431,29 +437,40 @@ func (sb *SlurmBridge) labelPodsWithJobId(ctx context.Context, jobid int32, slur
}

// annotatePodsWithNodes will annotate a node assignment to pods
func (sb *SlurmBridge) annotatePodsWithNodes(ctx context.Context, jobid int32, kubeNodes sets.Set[string], pods *corev1.PodList) error {
func (sb *SlurmBridge) annotatePodsWithNodes(ctx context.Context, jobid int32, kubeNodes []string, pods *corev1.PodList) error {
logger := klog.FromContext(ctx)
for _, p := range pods.Items {
// Return if there are no nodes left
if kubeNodes.Len() == 0 {
logger.V(5).Info("no nodes left to annotate")
break
// Sort pods in deterministic order (namespace, name) so that the
// i-th node in the Slurm nodelist is assigned to the i-th pod in that order.
indices := make([]int, len(pods.Items))
for i := range indices {
indices[i] = i
}
slices.SortFunc(indices, func(i, j int) int {
a, b := pods.Items[i], pods.Items[j]
if a.Namespace != b.Namespace {
return strings.Compare(a.Namespace, b.Namespace)
}
return strings.Compare(a.Name, b.Name)
})
nodeIndex := 0
for _, idx := range indices {
p := pods.Items[idx]
// If this pod doesn't have a JobId that matches, it should be skipped as
// it didn't exist when the placeholder job was created
podJobID := slurmjobir.ParseSlurmJobId(p.Labels[wellknown.LabelPlaceholderJobId])
if jobid != podJobID {
logger.V(5).Info("pod JobID does not match placeholder JobID")
continue
}
if nodeIndex >= len(kubeNodes) {
logger.V(5).Info("no nodes left to annotate")
return ErrorNoKubeNode
}
if p.Annotations == nil {
p.Annotations = make(map[string]string)
}
node, ok := kubeNodes.PopAny()
if !ok {
logger.V(4).Info("could not get a node to assign")
return ErrorNoKubeNode
}
node := kubeNodes[nodeIndex]
nodeIndex++
toUpdate := p.DeepCopy()
toUpdate.Annotations[wellknown.AnnotationPlaceholderNode] = node
if err := sb.Patch(ctx, toUpdate, client.StrategicMergeFrom(&p)); err != nil {
@@ -465,7 +482,8 @@ func (sb *SlurmBridge) annotatePodsWithNodes(ctx context.Context, jobid int32, k
}

// slurmToKubeNodes will translate slurm node names to kubernetes node names
func (sb *SlurmBridge) slurmToKubeNodes(ctx context.Context, slurmNodes []string) (sets.Set[string], error) {
// preserving order so pod-to-node assignment matches Slurm task layout.
func (sb *SlurmBridge) slurmToKubeNodes(ctx context.Context, slurmNodes []string) ([]string, error) {
logger := klog.FromContext(ctx)

nodeList := &corev1.NodeList{}
@@ -474,7 +492,7 @@ func (sb *SlurmBridge) slurmToKubeNodes(ctx context.Context, slurmNodes []string
return nil, err
}

kubeNodes := make(sets.Set[string])
kubeNodes := make([]string, 0, len(slurmNodes))
nodeNameMap := nodecontrollerutils.MakeNodeNameMap(ctx, nodeList)
for _, slurmNode := range slurmNodes {
kubeNode, ok := nodeNameMap[slurmNode]
@@ -492,9 +510,8 @@ func (sb *SlurmBridge) slurmToKubeNodes(ctx context.Context, slurmNodes []string
} else {
return nil, ErrorNoKubeNodeMatch
}

}
kubeNodes.Insert(kubeNode)
kubeNodes = append(kubeNodes, kubeNode)
}

return kubeNodes, nil
@@ -190,8 +190,8 @@ func (r *realSlurmControl) submitJob(ctx context.Context, pod *corev1.Pod, slurm
}(),
Qos: slurmJobIR.JobInfo.QOS,
Reservation: slurmJobIR.JobInfo.Reservation,
// SharedNone is effectively Exclusive
Shared: &[]v0044.V0044JobDescMsgShared{v0044.V0044JobDescMsgSharedNone},
// SharedUser allows multiple pods to share a node
Shared: &[]v0044.V0044JobDescMsgShared{v0044.V0044JobDescMsgSharedUser},
TasksPerNode: slurmJobIR.JobInfo.TasksPerNode,
TimeLimit: func() *v0044.V0044Uint32NoValStruct {
if slurmJobIR.JobInfo.TimeLimit != nil {
5 changes: 4 additions & 1 deletion internal/utils/slurmjobir/job.go
@@ -25,7 +25,10 @@ func (t *translator) fromJob(pod *corev1.Pod, rootPOM *metav1.PartialObjectMetad

slurmJobIR := &SlurmJobIR{}
slurmJobIR.Pods.Items = append(slurmJobIR.Pods.Items, *pod)
slurmJobIR.JobInfo.MinNodes = ptr.To(int32(1))
tasks := int32(1)
slurmJobIR.JobInfo.MinNodes = ptr.To(tasks)
slurmJobIR.JobInfo.MaxNodes = ptr.To(tasks)
slurmJobIR.JobInfo.TasksPerNode = ptr.To(tasks)
if job.Spec.Template.Spec.Resources != nil {
slurmJobIR.JobInfo.CpuPerTask = ptr.To(int32(job.Spec.Template.Spec.Resources.Limits.Cpu().Value())) //nolint:gosec // disable G115
slurmJobIR.JobInfo.MemPerNode = ptr.To(int64(GetMemoryFromQuantity(job.Spec.Template.Spec.Resources.Limits.Memory())))
8 changes: 5 additions & 3 deletions internal/utils/slurmjobir/job_test.go
@@ -124,9 +124,11 @@ func Test_translator_fromJob(t *testing.T) {
},
want: &SlurmJobIR{
JobInfo: SlurmJobIRJobInfo{
MinNodes: ptr.To(int32(1)),
CpuPerTask: ptr.To(int32(22)),
MemPerNode: ptr.To(int64(1)),
MinNodes: ptr.To(int32(1)),
MaxNodes: ptr.To(int32(1)),
TasksPerNode: ptr.To(int32(1)),
CpuPerTask: ptr.To(int32(22)),
MemPerNode: ptr.To(int64(1)),
},
Pods: corev1.PodList{
Items: []corev1.Pod{{
8 changes: 5 additions & 3 deletions internal/utils/slurmjobir/jobset_test.go
@@ -127,9 +127,11 @@ func Test_translator_fromJobSet(t *testing.T) {
},
want: &SlurmJobIR{
JobInfo: SlurmJobIRJobInfo{
MinNodes: ptr.To(int32(1)),
CpuPerTask: ptr.To(int32(22)),
MemPerNode: ptr.To(int64(1)),
MinNodes: ptr.To(int32(1)),
MaxNodes: ptr.To(int32(1)),
TasksPerNode: ptr.To(int32(1)),
CpuPerTask: ptr.To(int32(22)),
MemPerNode: ptr.To(int64(1)),
},
Pods: corev1.PodList{
Items: []corev1.Pod{
1 change: 1 addition & 0 deletions internal/utils/slurmjobir/pod.go
@@ -19,5 +19,6 @@ func (t *translator) fromPod(pod *corev1.Pod) (*SlurmJobIR, error) {
tasks := int32(1)
slurmJobIR.JobInfo.TasksPerNode = &tasks
slurmJobIR.JobInfo.MaxNodes = &tasks
slurmJobIR.JobInfo.MinNodes = &tasks
return slurmJobIR, nil
}
15 changes: 15 additions & 0 deletions internal/utils/slurmjobir/slurmjobir.go
@@ -59,6 +59,21 @@ type translator struct {
ctx context.Context
}

// IsOneJobPerGroupWorkload returns true when the workload uses one Slurm placeholder job per group
// (PodGroup, JobSet, LWS). When false, the workload uses one job per pod (standalone Pod or Job).
// Returns false if ir is nil.
func IsOneJobPerGroupWorkload(ir *SlurmJobIR) bool {
if ir == nil {
return false
}
switch ir.RootPOM.TypeMeta {
case podGroup_v1alpha1, jobSet_v1alpha2, lws_v1:
return true
default:
return false
}
}

func PreFilter(c client.Client, ctx context.Context, pod *corev1.Pod, slurmJobIR *SlurmJobIR) *fwk.Status {
t := translator{Reader: c, ctx: ctx}
switch slurmJobIR.RootPOM.TypeMeta {
61 changes: 53 additions & 8 deletions internal/utils/slurmjobir/slurmjobir_test.go
@@ -110,6 +110,10 @@ func TestTranslateToSlurmJobIR(t *testing.T) {
JobInfo: SlurmJobIRJobInfo{
Account: ptr.To("test1"),
GroupId: ptr.To("1000"),
MinNodes: func() *int32 {
minNodes := int32(1)
return &minNodes
}(),
MaxNodes: func() *int32 {
maxNodes := int32(1)
return &maxNodes
@@ -146,14 +150,9 @@
Items: []corev1.Pod{*podWithBadAnnotation.DeepCopy()},
},
JobInfo: SlurmJobIRJobInfo{
MaxNodes: func() *int32 {
maxNodes := int32(1)
return &maxNodes
}(),
TasksPerNode: func() *int32 {
tasksPerNode := int32(1)
return &tasksPerNode
}(),
MinNodes: ptr.To(int32(1)),
MaxNodes: ptr.To(int32(1)),
TasksPerNode: ptr.To(int32(1)),
},
},
wantErr: true,
@@ -501,3 +500,49 @@ func Test_parseAnnotations(t *testing.T) {
})
}
}

func TestIsOneJobPerGroupWorkload(t *testing.T) {
tests := []struct {
name string
ir *SlurmJobIR
want bool
}{
{
name: "PodGroup is one job per group",
ir: &SlurmJobIR{RootPOM: metav1.PartialObjectMetadata{TypeMeta: podGroup_v1alpha1}},
want: true,
},
{
name: "JobSet is one job per group",
ir: &SlurmJobIR{RootPOM: metav1.PartialObjectMetadata{TypeMeta: jobSet_v1alpha2}},
want: true,
},
{
name: "LWS is one job per group",
ir: &SlurmJobIR{RootPOM: metav1.PartialObjectMetadata{TypeMeta: lws_v1}},
want: true,
},
{
name: "standalone Pod is one job per pod",
ir: &SlurmJobIR{RootPOM: metav1.PartialObjectMetadata{TypeMeta: pod_v1}},
want: false,
},
{
name: "Job is one job per pod",
ir: &SlurmJobIR{RootPOM: metav1.PartialObjectMetadata{TypeMeta: job_v1}},
want: false,
},
{
name: "nil IR returns false and does not panic",
ir: nil,
want: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if got := IsOneJobPerGroupWorkload(tt.ir); got != tt.want {
t.Errorf("IsOneJobPerGroupWorkload() = %v, want %v", got, tt.want)
}
})
}
}