Skip to content
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -891,6 +891,8 @@ scontrol_command = /usr/bin/scontrol
```

`scontrol_command` is the full path to the Slurm command used for manipulating existing jobs. You may want to verify if `scontrol` is provided at that path or determine its actual location (via `which scontrol`).
It is also possible to add placeholder values to `scontrol_command`. This might be necessary on systems where the name of the cluster needs to be passed in order to manage the jobs. For example: `/usr/bin/scontrol --clusters=%%(cluster)s`.
Only placeholders defined in `current_jobs` can be included in `scontrol_command`; currently these are `jobid`, `cluster`, `partition`, `state` and `reason`.
Comment thread
laraPPr marked this conversation as resolved.
Outdated

#### `[submitted_job_comments]` section

Expand Down
39 changes: 27 additions & 12 deletions eessi_bot_job_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,8 @@ def get_current_jobs(self):
squeue_cmd = "%s --long --noheader --user=%s" % (self.poll_command, username)
if self.job_name:
squeue_cmd += " --name='%s'" % self.job_name
# Format the output of SLURM
squeue_cmd += " --Format JobId:100@,Cluster:100@,Partition:100@,State:100@,Reason:100"
squeue_output, squeue_err, squeue_exitcode = run_cmd(
squeue_cmd,
"get_current_jobs(): squeue command",
Expand All @@ -138,18 +140,23 @@ def get_current_jobs(self):
# get job info, logging any Slurm issues
# Note, all output lines of squeue are processed because we run it with
# --noheader.
for line in lines:
job = line.rstrip().split()
if len(job) >= 9:
job_id = job[0]
state = job[4]
current_jobs[job_id] = {
"jobid": job_id,
"state": state,
"reason": job[8],
}
if state in bad_state_messages:
log("Job {} in state {}: {}".format(job_id, state, bad_state_messages[state]))
if lines != ['']:
for line in lines:
job = [x.rstrip() for x in line.rstrip().split('@')]
if len(job) == 5:
job_id = job[0]
state = job[3]
current_jobs[job_id] = {
"jobid": job_id,
"cluster": job[1],
"partition": job[2],
"state": state,
"reason": job[4],
}
if state in bad_state_messages:
log("Job {} in state {}: {}".format(job_id, state, bad_state_messages[state]))
else:
raise Exception(f"The output of {squeue_cmd} does not have 5 job parameters")

return current_jobs

Expand Down Expand Up @@ -296,6 +303,14 @@ def process_new_job(self, new_job):
"""
job_id = new_job["jobid"]

# processing placeholders in scontrol command which is defined in the bot's app.cfg (setting `scontrol_command`)
Comment thread
laraPPr marked this conversation as resolved.
try:
self.scontrol_command = self.scontrol_command % new_job
except KeyError:
log(f"Failed to process {self.scontrol_command}.")
log(f"Information on placeholder is not collected in new_job: {new_job}.")
raise

scontrol_cmd = "%s --oneliner show jobid %s" % (
self.scontrol_command,
job_id,
Expand Down