Skip to content

Commit 27d8ca4

Browse files
committed
make size simpler - it is just the number of worker nodes
Signed-off-by: vsoch <vsoch@users.noreply.github.com>
1 parent b797793 commit 27d8ca4

7 files changed

Lines changed: 46 additions & 19 deletions

File tree

api/v1alpha1/slurm_types.go

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ type SlurmSpec struct {
6363
// +optional
6464
SlurmVersion string `json:"slurmVersion,omitempty"`
6565

66-
// Size of the slurm (1 server + (N-1) nodes)
66+
// Size is the number of worker nodes
6767
Size int32 `json:"size"`
6868

6969
// Interactive mode keeps the cluster running
@@ -230,8 +230,8 @@ func (s *Slurm) SelectorName() string {
230230

231231
// Validate the slurm
232232
func (s *Slurm) Validate() bool {
233-
if s.WorkerNodes() < 1 {
234-
fmt.Printf("😥️ Slurm cluster must have at least one worker node, Size >= 2.\n")
233+
if s.Spec.Size < 1 {
234+
fmt.Printf("😥️ Slurm cluster must have 1 or more worker nodes.\n")
235235
return false
236236
}
237237
// Ensure we have the default image set
@@ -258,12 +258,6 @@ func (s *Slurm) Validate() bool {
258258
return true
259259
}
260260

261-
// WorkerNodes returns the number of worker nodes
262-
// At this point we've already validated size is >= 1
263-
func (s *Slurm) WorkerNodes() int32 {
264-
return s.Spec.Size - 1
265-
}
266-
267261
// WorkerNode returns the worker node (if defined) or falls back to the server
268262
func (s *Slurm) WorkerNode() Node {
269263

config/crd/bases/flux-framework.org_slurms.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -244,7 +244,7 @@ spec:
244244
description: Resources include limits and requests
245245
type: object
246246
size:
247-
description: Size of the slurm (1 server + (N-1) nodes)
247+
description: Size is the number of worker nodes
248248
format: int32
249249
type: integer
250250
slurmVersion:

controllers/slurm/jobset.go

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -93,9 +93,8 @@ func (r *SlurmReconciler) newJobSet(
9393

9494
}
9595

96-
// Create a cluster (JobSet) with workers (required)
97-
workerNodes := cluster.WorkerNodes()
98-
workerJob, err := r.getJob(cluster, cluster.WorkerNode(), workerNodes, "w", true)
96+
// Create a cluster (JobSet) with workers
97+
workerJob, err := r.getJob(cluster, cluster.WorkerNode(), cluster.Spec.Size, "w", true)
9998
if err != nil {
10099
r.Log.Error(err, "There was an error getting the worker ReplicatedJob")
101100
return &jobs, err

controllers/slurm/templates.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ func generateHostlist(cluster *api.Slurm) string {
9595
hosts := ""
9696

9797
serviceName := cluster.ServiceName()
98-
for i := 0; i < int(cluster.WorkerNodes()); i++ {
98+
for i := 0; i < int(cluster.Spec.Size); i++ {
9999
if hosts == "" {
100100
hosts = fmt.Sprintf("%s-w-0-%d.%s.%s.svc.cluster.local", cluster.Name, i, serviceName, cluster.Namespace)
101101
} else {

docker/Dockerfile

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,12 +39,22 @@ RUN set -ex \
3939
bash-completion \
4040
vim-enhanced \
4141
http-parser-devel \
42+
hwloc-devel \
43+
libevent-devel \
4244
json-c-devel \
4345
&& yum clean all \
4446
&& rm -rf /var/cache/yum
4547

4648
RUN pip3 install Cython nose
4749

50+
RUN set -x \
51+
&& wget https://github.com/openpmix/openpmix/releases/download/v3.2.3/pmix-3.2.3.tar.gz \
52+
&& tar -xvf pmix-3.2.3.tar.gz \
53+
&& cd pmix-3.2.3 \
54+
&& ./configure --prefix=/opt/pmix --disable-static \
55+
&& make -j$(nproc) install \
56+
&& cd .. && rm -rf pmix-3.2.3*
57+
4858
RUN set -ex \
4959
&& wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-${ARCH}" \
5060
&& wget -O /usr/local/bin/gosu.asc "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-${ARCH}.asc" \
@@ -58,8 +68,15 @@ RUN set -ex \
5868
RUN set -x \
5969
&& git clone -b ${SLURM_TAG} --single-branch --depth=1 https://github.com/SchedMD/slurm.git \
6070
&& pushd slurm \
71+
&& export CFLAGS="-I/usr/include" \
72+
&& export LDFLAGS="-L/usr/lib/$(uname -m)-linux-gnu" \
6173
&& ./configure --enable-debug --prefix=/usr --sysconfdir=/etc/slurm \
62-
--with-mysql_config=/usr/bin --libdir=/usr/lib64 \
74+
--with-mysql_config=/usr/bin \
75+
--libdir=/usr/lib \
76+
--with-pmix=/opt/pmix \
77+
--with-hwloc=/usr \
78+
--without-hdf5 \
79+
--with-libevent=/usr \
6380
&& make install \
6481
&& install -D -m644 etc/cgroup.conf.example /etc/slurm/cgroup.conf.example \
6582
&& install -D -m644 etc/slurm.conf.example /etc/slurm/slurm.conf.example \

docker/Dockerfile.ubuntu

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,18 +40,35 @@ RUN set -ex \
4040
build-essential \
4141
pkg-config \
4242
gosu \
43+
libhwloc-dev \
44+
libevent-dev \
4345
&& apt-get clean \
4446
&& rm -rf /var/lib/apt/lists/*
4547

4648
# Note: Ubuntu 24.04 uses PEP 668 (externally managed environments).
4749
# If pip install fails, use --break-system-packages or install in a venv.
4850
RUN pip3 install --break-system-packages Cython nose
4951

52+
RUN set -x \
53+
&& wget https://github.com/openpmix/openpmix/releases/download/v3.2.3/pmix-3.2.3.tar.gz \
54+
&& tar -xvf pmix-3.2.3.tar.gz \
55+
&& cd pmix-3.2.3 \
56+
&& ./configure --prefix=/opt/pmix --disable-static \
57+
&& make -j$(nproc) install \
58+
&& cd .. && rm -rf pmix-3.2.3*
59+
5060
RUN set -x \
5161
&& git clone -b ${SLURM_TAG} --single-branch --depth=1 https://github.com/SchedMD/slurm.git \
52-
&& cd slurm \
62+
&& export CFLAGS="-I/usr/include" \
63+
&& export LDFLAGS="-L/usr/lib/$(uname -m)-linux-gnu" \
64+
&& cd ./slurm \
5365
&& ./configure --enable-debug --prefix=/usr --sysconfdir=/etc/slurm \
54-
--with-mysql_config=/usr/bin --libdir=/usr/lib \
66+
--with-mysql_config=/usr/bin \
67+
--libdir=/usr/lib \
68+
--with-pmix=/opt/pmix \
69+
--without-hdf5 \
70+
--with-hwloc=/usr \
71+
--with-libevent=/usr \
5572
&& make install \
5673
&& install -D -m644 etc/cgroup.conf.example /etc/slurm/cgroup.conf.example \
5774
&& install -D -m644 etc/slurm.conf.example /etc/slurm/slurm.conf.example \
@@ -96,4 +113,4 @@ EXPOSE 6717
96113
EXPOSE 6718
97114
EXPOSE 6719
98115

99-
CMD ["slurmdbd"]
116+
CMD ["slurmdbd"]

examples/dist/slurm-operator.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -256,7 +256,7 @@ spec:
256256
description: Resources include limits and requests
257257
type: object
258258
size:
259-
description: Size of the slurm (1 server + (N-1) nodes)
259+
description: Size is the number of worker nodes
260260
format: int32
261261
type: integer
262262
slurmVersion:

0 commit comments

Comments
 (0)