diff --git a/README.md b/README.md index c139f1d9..dd6b3234 100644 --- a/README.md +++ b/README.md @@ -891,6 +891,7 @@ scontrol_command = /usr/bin/scontrol ``` `scontrol_command` is the full path to the Slurm command used for manipulating existing jobs. You may want to verify if `scontrol` is provided at that path or determine its actual location (via `which scontrol`). +It is also possible to use placeholders in `scontrol_command`. Each placeholder is replaced with the corresponding value taken from the output of the `squeue` command that the bot runs internally, before `scontrol_command` is run for that job. This is useful, for example, in a setup where multiple clusters are managed by the same Slurm instance and `scontrol_command` needs to be passed the correct cluster name. This can be achieved by defining `scontrol_command = /usr/bin/scontrol --clusters=%%(cluster)s`. Valid placeholder names are currently: `jobid`, `cluster`, `partition`, `state`, and `reason`. #### `[submitted_job_comments]` section diff --git a/app.cfg.example b/app.cfg.example index 62caa332..0b393a4c 100644 --- a/app.cfg.example +++ b/app.cfg.example @@ -384,6 +384,11 @@ poll_command = /usr/bin/squeue poll_interval = 60 # full path to the command for manipulating existing jobs +# It is also possible to use placeholders in `scontrol_command`. +# This is useful, for example, in a setup where multiple clusters are managed by the same Slurm instance, +# and `scontrol_command` needs to be passed the correct cluster name. +# This can be achieved by defining `scontrol_command = /usr/bin/scontrol --clusters=%%(cluster)s`. +# Valid placeholder names are currently: `jobid`, `cluster`, `partition`, `state`, and `reason`. 
scontrol_command = /usr/bin/scontrol diff --git a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py index 1efb9d85..d0423923 100644 --- a/eessi_bot_job_manager.py +++ b/eessi_bot_job_manager.py @@ -118,6 +118,8 @@ def get_current_jobs(self): squeue_cmd = "%s --long --noheader --user=%s" % (self.poll_command, username) if self.job_name: squeue_cmd += " --name='%s'" % self.job_name + # Format the output of SLURM + squeue_cmd += " --Format JobId:100@,Cluster:100@,Partition:100@,State:100@,Reason:100" squeue_output, squeue_err, squeue_exitcode = run_cmd( squeue_cmd, "get_current_jobs(): squeue command", @@ -138,18 +140,23 @@ def get_current_jobs(self): # get job info, logging any Slurm issues # Note, all output lines of squeue are processed because we run it with # --noheader. - for line in lines: - job = line.rstrip().split() - if len(job) >= 9: - job_id = job[0] - state = job[4] - current_jobs[job_id] = { - "jobid": job_id, - "state": state, - "reason": job[8], - } - if state in bad_state_messages: - log("Job {} in state {}: {}".format(job_id, state, bad_state_messages[state])) + if lines != ['']: + for line in lines: + job = [x.rstrip() for x in line.rstrip().split('@')] + if len(job) == 5: + job_id = job[0] + state = job[3] + current_jobs[job_id] = { + "jobid": job_id, + "cluster": job[1], + "partition": job[2], + "state": state, + "reason": job[4], + } + if state in bad_state_messages: + log("Job {} in state {}: {}".format(job_id, state, bad_state_messages[state])) + else: + raise Exception(f"The output of {squeue_cmd} does not have 5 job parameters") return current_jobs @@ -296,6 +303,14 @@ def process_new_job(self, new_job): """ job_id = new_job["jobid"] + # processing placeholders in scontrol command which is defined in the bot's app.cfg (setting `scontrol_command`) + try: + self.scontrol_command = self.scontrol_command % new_job + except KeyError: + log(f"Failed to process {self.scontrol_command}.") + log(f"Information on placeholder is not collected in 
new_job: {new_job}.") + raise + scontrol_cmd = "%s --oneliner show jobid %s" % ( self.scontrol_command, job_id,