From 624109eef10549c6cc7e48ec703fc17596c43cb2 Mon Sep 17 00:00:00 2001 From: laraPPr Date: Thu, 17 Jul 2025 15:13:29 +0200 Subject: [PATCH 01/13] update the scontrol command to work on system with multiple clusters Signed-off-by: laraPPr --- eessi_bot_job_manager.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py index 4fcf9af3..23c32f3b 100644 --- a/eessi_bot_job_manager.py +++ b/eessi_bot_job_manager.py @@ -118,6 +118,8 @@ def get_current_jobs(self): squeue_cmd = "%s --long --noheader --user=%s" % (self.poll_command, username) if self.job_name: squeue_cmd += " --name='%s'" % self.job_name + # Format the output of SLURM + squeue_cmd += " --Format JobId:100@,Cluster:100@,Partition:100@,State:100@,Reason:100" squeue_output, squeue_err, squeue_exitcode = run_cmd( squeue_cmd, "get_current_jobs(): squeue command", @@ -139,14 +141,18 @@ def get_current_jobs(self): # Note, all output lines of squeue are processed because we run it with # --noheader. for line in lines: - job = line.rstrip().split() - if len(job) >= 9: - job_id = job[0] - state = job[4] + job = line.rstrip().split('@') + print(job) + if len(job) == 5: + print(job) + job_id = job[0].rstrip() + state = job[3].rstrip() current_jobs[job_id] = { "jobid": job_id, + "cluster": job[1].rstrip(), + "partition": job[2].rstrip(), "state": state, - "reason": job[8], + "reason": job[4].rstrip(), } if state in bad_state_messages: log("Job {} in state {}: {}".format(job_id, state, bad_state_messages[state])) @@ -296,6 +302,14 @@ def process_new_job(self, new_job): """ job_id = new_job["jobid"] + # check if their is a placeholder value in the scontrol_command + if bool(re.search(r'%\([^)]+\)s', self.scontrol_command)): + placeholders = re.findall(r'%\(([^)]+)\)s', self.scontrol_command) + for placeholder in placeholders: + if placeholder == 'new_job["cluster"]': + self.scontrol_command = self.scontrol_command % {placeholder: new_job["cluster"]} + print(new_job['cluster']) + print(self.scontrol_command) scontrol_cmd = "%s --oneliner show jobid %s" % ( self.scontrol_command, job_id, From b14043ec6966804cd305f646cf0d7c9a76e3666c Mon Sep 17 00:00:00 2001 From: Lara Ramona Peeters <49882639+laraPPr@users.noreply.github.com> Date: Thu, 17 Jul 2025 15:16:59 +0200 Subject: [PATCH 02/13] clean up --- eessi_bot_job_manager.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py index 23c32f3b..d6c59854 100644 --- a/eessi_bot_job_manager.py +++ b/eessi_bot_job_manager.py @@ -142,9 +142,7 @@ def get_current_jobs(self): # --noheader. for line in lines: job = line.rstrip().split('@') - print(job) if len(job) == 5: - print(job) job_id = job[0].rstrip() state = job[3].rstrip() current_jobs[job_id] = { @@ -308,8 +306,6 @@ def process_new_job(self, new_job): for placeholder in placeholders: if placeholder == 'new_job["cluster"]': self.scontrol_command = self.scontrol_command % {placeholder: new_job["cluster"]} - print(new_job['cluster']) - print(self.scontrol_command) scontrol_cmd = "%s --oneliner show jobid %s" % ( self.scontrol_command, job_id, From 03e720d6966be1b895c8756d94b3000bc000c7d4 Mon Sep 17 00:00:00 2001 From: laraPPr Date: Mon, 4 Aug 2025 11:29:04 +0200 Subject: [PATCH 03/13] add exception and simply using placeholder in scontrol command Signed-off-by: laraPPr --- eessi_bot_job_manager.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py index 23c32f3b..6860eafd 100644 --- a/eessi_bot_job_manager.py +++ b/eessi_bot_job_manager.py @@ -156,6 +156,8 @@ def get_current_jobs(self): } if state in bad_state_messages: log("Job {} in state {}: {}".format(job_id, state, bad_state_messages[state])) + else: + raise Exception(f"The output of {squeue_cmd} does not have 5 job parameters") return current_jobs @@ -302,14 +304,10 @@ def process_new_job(self, new_job): """ job_id = new_job["jobid"] - # check if their is a placeholder value in the scontrol_command - if bool(re.search(r'%\([^)]+\)s', self.scontrol_command)): - placeholders = re.findall(r'%\(([^)]+)\)s', self.scontrol_command) - for placeholder in placeholders: - if placeholder == 'new_job["cluster"]': - self.scontrol_command = self.scontrol_command % {placeholder: new_job["cluster"]} - print(new_job['cluster']) - print(self.scontrol_command) + # if placeholder "cluster" is used in scontrol command + self.scontrol_command = self.scontrol_command % { + 'new_job["cluster"]': new_job["cluster"] + } scontrol_cmd = "%s --oneliner show jobid %s" % ( self.scontrol_command, job_id, From 53b032f3a692342d673a4681b2979d2edbc6d958 Mon Sep 17 00:00:00 2001 From: laraPPr Date: Mon, 4 Aug 2025 11:35:28 +0200 Subject: [PATCH 04/13] make stylecheck happy Signed-off-by: laraPPr --- eessi_bot_job_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py index 62d66f2b..cd7e3cc3 100644 --- a/eessi_bot_job_manager.py +++ b/eessi_bot_job_manager.py @@ -303,9 +303,9 @@ def process_new_job(self, new_job): job_id = new_job["jobid"] # if placeholder "cluster" is used in scontrol command - self.scontrol_command = self.scontrol_command % { + self.scontrol_command = self.scontrol_command % { 'new_job["cluster"]': new_job["cluster"] - } + } scontrol_cmd = "%s --oneliner show jobid %s" % ( self.scontrol_command, job_id, From 519428f39796cfef81815068d13e5880fd6c20b9 Mon Sep 17 00:00:00 2001 From: laraPPr Date: Mon, 4 Aug 2025 11:43:58 +0200 Subject: [PATCH 05/13] some more error logging for the scontrol command Signed-off-by: laraPPr --- eessi_bot_job_manager.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py index cd7e3cc3..ff2684ac 100644 --- a/eessi_bot_job_manager.py +++ b/eessi_bot_job_manager.py @@ -303,9 +303,15 @@ def process_new_job(self, new_job): job_id = new_job["jobid"] # if placeholder "cluster" is used in scontrol command - self.scontrol_command = self.scontrol_command % { - 'new_job["cluster"]': new_job["cluster"] - } + try: + placeholder = 'new_job["cluster"]' + self.scontrol_command = self.scontrol_command % { + placeholder: new_job["cluster"] + } + except KeyError: + log(f"Failed to process placeholder in scontrol_command. Expected {placeholder} or nothing.") + raise + scontrol_cmd = "%s --oneliner show jobid %s" % ( self.scontrol_command, job_id, From 72c39a44306736edd3146b9c186077b8d877ee5f Mon Sep 17 00:00:00 2001 From: laraPPr Date: Wed, 6 Aug 2025 11:33:17 +0200 Subject: [PATCH 06/13] simplify using placeholder for scontrol command Signed-off-by: laraPPr --- eessi_bot_job_manager.py | 45 ++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py index 0ea474b2..baa01859 100644 --- a/eessi_bot_job_manager.py +++ b/eessi_bot_job_manager.py @@ -117,7 +117,7 @@ def get_current_jobs(self): squeue_cmd = "%s --long --noheader --user=%s" % (self.poll_command, username) if self.job_name: - squeue_cmd += " --name='%s'" % self.job_name + squeue_cmd += " --name=%s" % self.job_name # Format the output of SLURM squeue_cmd += " --Format JobId:100@,Cluster:100@,Partition:100@,State:100@,Reason:100" squeue_output, squeue_err, squeue_exitcode = run_cmd( @@ -140,22 +140,23 @@ def get_current_jobs(self): # get job info, logging any Slurm issues # Note, all output lines of squeue are processed because we run it with # --noheader. - for line in lines: - job = line.rstrip().split('@') - if len(job) == 5: - job_id = job[0].rstrip() - state = job[3].rstrip() - current_jobs[job_id] = { - "jobid": job_id, - "cluster": job[1].rstrip(), - "partition": job[2].rstrip(), - "state": state, - "reason": job[4].rstrip(), - } - if state in bad_state_messages: - log("Job {} in state {}: {}".format(job_id, state, bad_state_messages[state])) - else: - raise Exception(f"The output of {squeue_cmd} does not have 5 job parameters") + if lines != ['']: + for line in lines: + job = line.rstrip().split('@') + if len(job) == 5: + job_id = job[0].rstrip() + state = job[3].rstrip() + current_jobs[job_id] = { + "jobid": job_id, + "cluster": job[1].rstrip(), + "partition": job[2].rstrip(), + "state": state, + "reason": job[4].rstrip(), + } + if state in bad_state_messages: + log("Job {} in state {}: {}".format(job_id, state, bad_state_messages[state])) + else: + raise Exception(f"The output of {squeue_cmd} does not have 5 job parameters") return current_jobs @@ -302,14 +303,12 @@ def process_new_job(self, new_job): """ job_id = new_job["jobid"] - # if placeholder "cluster" is used in scontrol command + # if placeholder is used in scontrol command try: - placeholder = 'new_job["cluster"]' - self.scontrol_command = self.scontrol_command % { - placeholder: new_job["cluster"] - } + self.scontrol_command = self.scontrol_command % new_job except KeyError: - log(f"Failed to process placeholder in scontrol_command. Expected {placeholder} or nothing.") + log(f"Failed to process {self.scontrol_command}.") + log(f"Information on placeholder is not collected in new_job: {new_job}.") raise scontrol_cmd = "%s --oneliner show jobid %s" % ( From 19c28f5fcef4942184082aece72050c38733421b Mon Sep 17 00:00:00 2001 From: laraPPr Date: Wed, 6 Aug 2025 11:36:42 +0200 Subject: [PATCH 07/13] remove uneccesary change Signed-off-by: laraPPr --- eessi_bot_job_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py index baa01859..14c10abb 100644 --- a/eessi_bot_job_manager.py +++ b/eessi_bot_job_manager.py @@ -117,7 +117,7 @@ def get_current_jobs(self): squeue_cmd = "%s --long --noheader --user=%s" % (self.poll_command, username) if self.job_name: - squeue_cmd += " --name=%s" % self.job_name + squeue_cmd += " --name='%s'" % self.job_name # Format the output of SLURM squeue_cmd += " --Format JobId:100@,Cluster:100@,Partition:100@,State:100@,Reason:100" squeue_output, squeue_err, squeue_exitcode = run_cmd( From 9433bea664a06c7d244c5aa97a4d6fb8b7c970fa Mon Sep 17 00:00:00 2001 From: laraPPr Date: Wed, 6 Aug 2025 11:44:56 +0200 Subject: [PATCH 08/13] use rstrip in a list comprehension Signed-off-by: laraPPr --- eessi_bot_job_manager.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py index 14c10abb..e9fbf4b7 100644 --- a/eessi_bot_job_manager.py +++ b/eessi_bot_job_manager.py @@ -142,16 +142,16 @@ def get_current_jobs(self): # --noheader. if lines != ['']: for line in lines: - job = line.rstrip().split('@') + job = [x.rstrip() for x in line.rstrip().split('@')] if len(job) == 5: - job_id = job[0].rstrip() - state = job[3].rstrip() + job_id = job[0] + state = job[3] current_jobs[job_id] = { "jobid": job_id, - "cluster": job[1].rstrip(), - "partition": job[2].rstrip(), + "cluster": job[1], + "partition": job[2], "state": state, - "reason": job[4].rstrip(), + "reason": job[4], } if state in bad_state_messages: log("Job {} in state {}: {}".format(job_id, state, bad_state_messages[state])) From 09a4842388078b39b8ceeded0973d6c1acbb08e3 Mon Sep 17 00:00:00 2001 From: Lara Ramona Peeters <49882639+laraPPr@users.noreply.github.com> Date: Wed, 6 Aug 2025 15:08:53 +0200 Subject: [PATCH 09/13] Update eessi_bot_job_manager.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Thomas Röblitz --- eessi_bot_job_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py index e9fbf4b7..d0423923 100644 --- a/eessi_bot_job_manager.py +++ b/eessi_bot_job_manager.py @@ -303,7 +303,7 @@ def process_new_job(self, new_job): """ job_id = new_job["jobid"] - # if placeholder is used in scontrol command + # processing placeholders in scontrol command which is defined in the bot's app.cfg (setting `scontrol_command`) try: self.scontrol_command = self.scontrol_command % new_job except KeyError: From a2e52ddc347053a77e442783a73e0059dfd3b239 Mon Sep 17 00:00:00 2001 From: laraPPr Date: Thu, 7 Aug 2025 10:05:27 +0200 Subject: [PATCH 10/13] Update README Signed-off-by: laraPPr --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index c139f1d9..3f87cf2f 100644 --- a/README.md +++ b/README.md @@ -891,6 +891,8 @@ scontrol_command = /usr/bin/scontrol ``` `scontrol_command` is the full path to the Slurm command used for manipulating existing jobs. You may want to verify if `scontrol` is provided at that path or determine its actual location (via `which scontrol`). +It is also possible to add placeholder values to the scontrol_command. This might be necessary on systems where the name of the clusters needs to be passed in order to manage the jobs. For example: `/usr/bin/scontrol --clusters=%%(cluster)s`. +Only placeholders defined in `current_jobs` can be included in the scontrol_command this is currently jobid, cluster, partition, state and reason. #### `[submitted_job_comments]` section From 7a55d304f38c5c9405c9ff47840b250290e3dfc3 Mon Sep 17 00:00:00 2001 From: Lara Ramona Peeters <49882639+laraPPr@users.noreply.github.com> Date: Tue, 19 Aug 2025 14:27:17 +0200 Subject: [PATCH 11/13] Update README.md Co-authored-by: Caspar van Leeuwen <33718780+casparvl@users.noreply.github.com> --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 3f87cf2f..b565756a 100644 --- a/README.md +++ b/README.md @@ -891,8 +891,7 @@ scontrol_command = /usr/bin/scontrol ``` `scontrol_command` is the full path to the Slurm command used for manipulating existing jobs. You may want to verify if `scontrol` is provided at that path or determine its actual location (via `which scontrol`). -It is also possible to add placeholder values to the scontrol_command. This might be necessary on systems where the name of the clusters needs to be passed in order to manage the jobs. For example: `/usr/bin/scontrol --clusters=%%(cluster)s`. -Only placeholders defined in `current_jobs` can be included in the scontrol_command this is currently jobid, cluster, partition, state and reason. +It is also possible to add placeholder values to the scontrol_command. These placeholders can capture output from the `squeue` command that the bot runs internally, and pass it back to the `scontrol_command`. An example where this is in a setup where multiple clusters are managed by the same SLURM instance, and the `scontrol_command` for that instance needs to get the correct cluster name passed. This can be achieved by defining `scontrol_command = /usr/bin/scontrol --clusters=%%(cluster)s`. Valid placeholder names are currently: `jobid`, `cluster`, `partition`, `state`, and `reason`. #### `[submitted_job_comments]` section From e34925748ab0d71fe0458c9ca40516ca9ab77958 Mon Sep 17 00:00:00 2001 From: Lara Ramona Peeters <49882639+laraPPr@users.noreply.github.com> Date: Tue, 19 Aug 2025 14:37:08 +0200 Subject: [PATCH 12/13] Update README.md Co-authored-by: Caspar van Leeuwen <33718780+casparvl@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b565756a..dd6b3234 100644 --- a/README.md +++ b/README.md @@ -891,7 +891,7 @@ scontrol_command = /usr/bin/scontrol ``` `scontrol_command` is the full path to the Slurm command used for manipulating existing jobs. You may want to verify if `scontrol` is provided at that path or determine its actual location (via `which scontrol`). -It is also possible to add placeholder values to the scontrol_command. These placeholders can capture output from the `squeue` command that the bot runs internally, and pass it back to the `scontrol_command`. An example where this is in a setup where multiple clusters are managed by the same SLURM instance, and the `scontrol_command` for that instance needs to get the correct cluster name passed. This can be achieved by defining `scontrol_command = /usr/bin/scontrol --clusters=%%(cluster)s`. Valid placeholder names are currently: `jobid`, `cluster`, `partition`, `state`, and `reason`. +It is also possible to add placeholder values to the scontrol_command. These placeholders can capture output from the `squeue` command that the bot runs internally, and pass it back to the `scontrol_command`. An example where this may be useful is in a setup where multiple clusters are managed by the same SLURM instance, and the `scontrol_command` for that instance needs to get the correct cluster name passed. This can be achieved by defining `scontrol_command = /usr/bin/scontrol --clusters=%%(cluster)s`. Valid placeholder names are currently: `jobid`, `cluster`, `partition`, `state`, and `reason`. #### `[submitted_job_comments]` section From 16a4e0f8c905bb2f1b6424637bcc50d072f0254a Mon Sep 17 00:00:00 2001 From: laraPPr Date: Tue, 19 Aug 2025 16:03:29 +0200 Subject: [PATCH 13/13] add description to app.cfg.example Signed-off-by: laraPPr --- app.cfg.example | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/app.cfg.example b/app.cfg.example index 62caa332..0b393a4c 100644 --- a/app.cfg.example +++ b/app.cfg.example @@ -384,6 +384,11 @@ poll_command = /usr/bin/squeue poll_interval = 60 # full path to the command for manipulating existing jobs +# It is also possible to add placeholder values to the scontrol_command. +# An example where this may be useful is in a setup where multiple clusters are managed by the same SLURM instance, +# and the `scontrol_command` for that instance needs to get the correct cluster name passed. +# This can be achieved by defining `scontrol_command = /usr/bin/scontrol --clusters=%%(cluster)s`. +# Valid placeholder names are currently: `jobid`, `cluster`, `partition`, `state`, and `reason`. scontrol_command = /usr/bin/scontrol