Skip to content

Commit 6bfb246

Browse files
authored
Merge pull request #327 from laraPPr/test_submit_to_all
support for template values in `scontrol` command
2 parents 7c85854 + 16a4e0f commit 6bfb246

3 files changed

Lines changed: 33 additions & 12 deletions

File tree

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -891,6 +891,7 @@ scontrol_command = /usr/bin/scontrol
891891
```
892892

893893
`scontrol_command` is the full path to the Slurm command used for manipulating existing jobs. You may want to verify if `scontrol` is provided at that path or determine its actual location (via `which scontrol`).
894+
It is also possible to add placeholders to `scontrol_command`. Each placeholder is replaced with the corresponding value that the bot obtained for a job from the `squeue` command it runs internally. An example where this may be useful is a setup where multiple clusters are managed by the same Slurm instance, and the `scontrol_command` for that instance needs to be passed the correct cluster name. This can be achieved by defining `scontrol_command = /usr/bin/scontrol --clusters=%%(cluster)s`. Valid placeholder names are currently: `jobid`, `cluster`, `partition`, `state`, and `reason`.
894895

895896
#### `[submitted_job_comments]` section
896897

app.cfg.example

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -384,6 +384,11 @@ poll_command = /usr/bin/squeue
384384
poll_interval = 60
385385

386386
# full path to the command for manipulating existing jobs
387+
# It is also possible to add placeholders to the `scontrol_command` value.
388+
# An example where this may be useful is in a setup where multiple clusters are managed by the same SLURM instance,
389+
# and the `scontrol_command` for that instance needs to be passed the correct cluster name.
390+
# This can be achieved by defining `scontrol_command = /usr/bin/scontrol --clusters=%%(cluster)s`.
391+
# Valid placeholder names are currently: `jobid`, `cluster`, `partition`, `state`, and `reason`.
387392
scontrol_command = /usr/bin/scontrol
388393

389394

eessi_bot_job_manager.py

Lines changed: 27 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,8 @@ def get_current_jobs(self):
118118
squeue_cmd = "%s --long --noheader --user=%s" % (self.poll_command, username)
119119
if self.job_name:
120120
squeue_cmd += " --name='%s'" % self.job_name
121+
# Format the output of SLURM
122+
squeue_cmd += " --Format JobId:100@,Cluster:100@,Partition:100@,State:100@,Reason:100"
121123
squeue_output, squeue_err, squeue_exitcode = run_cmd(
122124
squeue_cmd,
123125
"get_current_jobs(): squeue command",
@@ -138,18 +140,23 @@ def get_current_jobs(self):
138140
# get job info, logging any Slurm issues
139141
# Note, all output lines of squeue are processed because we run it with
140142
# --noheader.
141-
for line in lines:
142-
job = line.rstrip().split()
143-
if len(job) >= 9:
144-
job_id = job[0]
145-
state = job[4]
146-
current_jobs[job_id] = {
147-
"jobid": job_id,
148-
"state": state,
149-
"reason": job[8],
150-
}
151-
if state in bad_state_messages:
152-
log("Job {} in state {}: {}".format(job_id, state, bad_state_messages[state]))
143+
if lines != ['']:
144+
for line in lines:
145+
job = [x.rstrip() for x in line.rstrip().split('@')]
146+
if len(job) == 5:
147+
job_id = job[0]
148+
state = job[3]
149+
current_jobs[job_id] = {
150+
"jobid": job_id,
151+
"cluster": job[1],
152+
"partition": job[2],
153+
"state": state,
154+
"reason": job[4],
155+
}
156+
if state in bad_state_messages:
157+
log("Job {} in state {}: {}".format(job_id, state, bad_state_messages[state]))
158+
else:
159+
raise Exception(f"The output of {squeue_cmd} does not have 5 job parameters")
153160

154161
return current_jobs
155162

@@ -296,6 +303,14 @@ def process_new_job(self, new_job):
296303
"""
297304
job_id = new_job["jobid"]
298305

306+
# processing placeholders in scontrol command which is defined in the bot's app.cfg (setting `scontrol_command`)
307+
try:
308+
self.scontrol_command = self.scontrol_command % new_job
309+
except KeyError:
310+
log(f"Failed to process {self.scontrol_command}.")
311+
log(f"Information on placeholder is not collected in new_job: {new_job}.")
312+
raise
313+
299314
scontrol_cmd = "%s --oneliner show jobid %s" % (
300315
self.scontrol_command,
301316
job_id,

0 commit comments

Comments
 (0)