Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -891,6 +891,7 @@ scontrol_command = /usr/bin/scontrol
```

`scontrol_command` is the full path to the Slurm command used for manipulating existing jobs. You may want to verify if `scontrol` is provided at that path or determine its actual location (via `which scontrol`).
The `scontrol_command` may also contain placeholders. For each job, a placeholder is replaced with the corresponding value that the bot obtained for that job from the `squeue` command it runs internally, and the resulting string is used as the `scontrol` command for that job. An example where this is useful is a setup in which multiple clusters are managed by the same Slurm instance, and `scontrol` needs to be passed the correct cluster name. This can be achieved by defining `scontrol_command = /usr/bin/scontrol --clusters=%%(cluster)s` (note that the `%` has to be written as `%%` in the configuration file). Valid placeholder names are currently: `jobid`, `cluster`, `partition`, `state`, and `reason`.

#### `[submitted_job_comments]` section

Expand Down
5 changes: 5 additions & 0 deletions app.cfg.example
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,11 @@ poll_command = /usr/bin/squeue
poll_interval = 60

# full path to the command for manipulating existing jobs
# The scontrol_command may also contain placeholders. For each job, a placeholder is
# replaced with the value the bot obtained for that job from the squeue command.
# An example where this is useful is a setup in which multiple clusters are managed by the same Slurm instance,
# and `scontrol` needs to be passed the correct cluster name.
# This can be achieved by defining `scontrol_command = /usr/bin/scontrol --clusters=%%(cluster)s`
# (note that the `%` has to be written as `%%` in this file).
# Valid placeholder names are currently: `jobid`, `cluster`, `partition`, `state`, and `reason`.
scontrol_command = /usr/bin/scontrol


Expand Down
39 changes: 27 additions & 12 deletions eessi_bot_job_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,8 @@ def get_current_jobs(self):
squeue_cmd = "%s --long --noheader --user=%s" % (self.poll_command, username)
if self.job_name:
squeue_cmd += " --name='%s'" % self.job_name
# Format the output of SLURM
squeue_cmd += " --Format JobId:100@,Cluster:100@,Partition:100@,State:100@,Reason:100"
squeue_output, squeue_err, squeue_exitcode = run_cmd(
squeue_cmd,
"get_current_jobs(): squeue command",
Expand All @@ -138,18 +140,23 @@ def get_current_jobs(self):
# get job info, logging any Slurm issues
# Note, all output lines of squeue are processed because we run it with
# --noheader.
for line in lines:
job = line.rstrip().split()
if len(job) >= 9:
job_id = job[0]
state = job[4]
current_jobs[job_id] = {
"jobid": job_id,
"state": state,
"reason": job[8],
}
if state in bad_state_messages:
log("Job {} in state {}: {}".format(job_id, state, bad_state_messages[state]))
if lines != ['']:
for line in lines:
job = [x.rstrip() for x in line.rstrip().split('@')]
if len(job) == 5:
job_id = job[0]
state = job[3]
current_jobs[job_id] = {
"jobid": job_id,
"cluster": job[1],
"partition": job[2],
"state": state,
"reason": job[4],
}
if state in bad_state_messages:
log("Job {} in state {}: {}".format(job_id, state, bad_state_messages[state]))
else:
raise Exception(f"The output of {squeue_cmd} does not have 5 job parameters")

return current_jobs

Expand Down Expand Up @@ -296,6 +303,14 @@ def process_new_job(self, new_job):
"""
job_id = new_job["jobid"]

# processing placeholders in scontrol command which is defined in the bot's app.cfg (setting `scontrol_command`)
Comment thread
laraPPr marked this conversation as resolved.
try:
self.scontrol_command = self.scontrol_command % new_job
except KeyError:
log(f"Failed to process {self.scontrol_command}.")
log(f"Information on placeholder is not collected in new_job: {new_job}.")
raise

scontrol_cmd = "%s --oneliner show jobid %s" % (
self.scontrol_command,
job_id,
Expand Down