Skip to content

Commit 6b00963

Browse files
authored
Merge pull request #359 from sondrebr/add_bot_cancel
Add new `bot: cancel` command
2 parents c7d85e0 + 2785cd6 commit 6b00963

9 files changed

Lines changed: 226 additions & 23 deletions

File tree

README.md

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -554,6 +554,12 @@ submit_command = /usr/bin/sbatch
554554

555555
`submit_command` is the full path to the Slurm job submission command used for submitting batch jobs. You may want to verify if `sbatch` is provided at that path or determine its actual location (using `which sbatch`).
556556

557+
```ini
558+
cancel_command = /usr/bin/scancel
559+
```
560+
561+
`cancel_command` is the full path to the Slurm command used for cancelling batch jobs. You may want to verify if `scancel` is provided at that path or determine its actual location (using `which scancel`).
562+
557563
```ini
558564
build_permission = -NOT_ALLOWED_GH_ACCOUNT_NAME- [...]
559565
```
@@ -566,11 +572,11 @@ name on GitHub. Thus, one could not - by accident - give build permissions to an
566572
unknown account.
567573

568574
```ini
569-
no_build_permission_comment = The `bot: build ...` command has been used by user `{build_labeler}`, but this person does not have permission to trigger builds.
575+
no_build_permission_comment = GH account `{build_labeler}` is not authorized to trigger or cancel build jobs.
570576
```
571577

572578
`no_build_permission_comment` defines a comment (template) that is used when
573-
the account trying to trigger build jobs has no permission to do so.
579+
the account trying to trigger or cancel build jobs has no permission to do so.
574580

575581
```ini
576582
allow_update_submit_opts = false

app.cfg.example

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,13 +158,16 @@ slurm_params = --hold
158158
# full path to the job submission command
159159
submit_command = /usr/bin/sbatch
160160

161+
# full path to the job cancellation command
162+
cancel_command = /usr/bin/scancel
163+
161164
# defines which GitHub accounts have the permission to trigger
162165
# build jobs, i.e., for which accounts the bot acts on `bot: build ...`
163166
# commands. If the value is left empty, everyone can trigger build jobs.
164167
build_permission = -NOT_ALLOWED_GH_ACCOUNT_NAME-
165168

166169
# template for comment when a GitHub account has no permission to trigger or cancel build jobs
167-
no_build_permission_comment = Label `bot:build` has been set by user `{build_labeler}`, but this person does not have permission to trigger builds
170+
no_build_permission_comment = GH account `{build_labeler}` is not authorized to trigger or cancel build jobs.
168171

169172
# whether or not to allow updating the submit options via custom module det_submit_opts
170173
# Should only be enabled (true) with care because this will result in code from the target

eessi_bot_event_handler.py

Lines changed: 60 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,8 @@
2929

3030
# Local application imports (anything from EESSI/eessi-bot-software-layer)
3131
from connections import github
32-
from tasks.build import check_build_permission, get_node_types, request_bot_build_issue_comments, \
33-
submit_build_jobs
32+
from tasks.build import cancel_jobs, check_build_permission, get_job_ids, get_node_types, \
33+
get_work_dirs, request_bot_build_issue_comments, submit_build_jobs
3434
from tasks.deploy import deploy_built_artefacts, determine_job_dirs
3535
from tasks.clean_up import move_to_trash_bin
3636
from tools import config
@@ -53,6 +53,7 @@
5353
config.BUILDENV_SETTING_BUILD_JOB_SCRIPT, # required
5454
config.BUILDENV_SETTING_BUILD_LOGS_DIR, # optional+recommended
5555
config.BUILDENV_SETTING_BUILD_PERMISSION, # optional+recommended
56+
config.BUILDENV_SETTING_CANCEL_COMMAND, # required
5657
config.BUILDENV_SETTING_CONTAINER_CACHEDIR, # optional+recommended
5758
# config.BUILDENV_SETTING_CLONE_GIT_REPO_VIA, # optional
5859
# config.BUILDENV_SETTING_CVMFS_CUSTOMIZATIONS, # optional
@@ -103,6 +104,7 @@
103104
# the poll interval setting is required for the alternative job handover
104105
# protocol (delayed_begin)
105106
config.SECTION_JOB_MANAGER: [
107+
config.JOB_MANAGER_SETTING_POLL_COMMAND, # required
106108
config.JOB_MANAGER_SETTING_POLL_INTERVAL], # required
107109
config.SECTION_REPO_TARGETS: [
108110
config.REPO_TARGETS_SETTING_REPOS_CFG_DIR], # required
@@ -508,7 +510,7 @@ def handle_bot_command_help(self, event_info, bot_command):
508510
help_msg += "\n - Commands must be sent with a **new** comment (edits of existing comments are ignored)."
509511
help_msg += "\n - A comment may contain multiple commands, one per line."
510512
help_msg += "\n - Every command begins at the start of a line and has the syntax `bot: COMMAND [ARGUMENTS]*`"
511-
help_msg += "\n - Currently supported COMMANDs are: `help`, `build`, `show_config`, `status`"
513+
help_msg += "\n - Currently supported COMMANDs are: `help`, `build`, `show_config`, `status`, `cancel`"
512514
help_msg += "\n"
513515
help_msg += "\n For more information, see https://www.eessi.io/docs/bot"
514516
return help_msg
@@ -680,6 +682,61 @@ def handle_bot_command_status(self, event_info, bot_command):
680682
else:
681683
return "\n - failed to create status comment"
682684

685+
def handle_bot_command_cancel(self, event_info, bot_command):
    """
    Handles bot command 'cancel' by parsing 'jobid:' arguments and
    cancelling the jobs.

    The command is rejected up front if check_build_permission() fails for
    the event. The login of the commenting account is passed on to
    cancel_jobs(), which decides per job whether it gets cancelled.

    Args:
        event_info (dict): event received by event_handler
        bot_command (EESSIBotCommand): command to be handled

    Returns:
        comment (string): one line per cancelled job, or a single line
            explaining why nothing was cancelled
    """
    self.log("processing bot command 'cancel'")

    request_body = event_info["raw_request_body"]
    repo_name = request_body["repository"]["full_name"]
    pr_number = request_body["issue"]["number"]
    user = request_body["comment"]["user"]["login"]

    gh = github.get_instance()
    pr = gh.get_repo(repo_name).get_pull(pr_number)

    # Jobs can only be cancelled by the user who submitted the job
    # -> No need to proceed if user cannot submit jobs
    if not check_build_permission(pr, event_info):
        self.log(f"User '{user}' does not have build permission - skipping cancellation.")
        return f"\n - User `{user}` cannot submit or cancel build jobs."

    # Get valid 'jobid:' arguments
    job_ids = get_job_ids(bot_command.action_filters)
    if not job_ids:
        self.log("Got no valid job IDs")
        return "\n - No valid job IDs were given."

    # Get working directories of jobs (dict: job_id -> work_dir)
    work_dirs = get_work_dirs(job_ids, self.cfg)
    if not work_dirs:
        self.log("None of the given jobs are cancellable")
        return "\n - No cancellable jobs were given."

    # Log requested jobs for which no working directory was found; use
    # self.log like the rest of this method so the message cannot fail on
    # a module-level name that may not be in scope
    for job_id in job_ids:
        if job_id not in work_dirs:
            self.log(f"Skipping job {job_id} - not found")

    # Cancel jobs
    cancelled_jobs = cancel_jobs(work_dirs, user, pr, self.cfg)
    if not cancelled_jobs:
        return "\n - No jobs were cancelled."

    return "".join(f"\n - cancelled job `{job_id}`" for job_id in cancelled_jobs)
739+
683740
def start(self, app, port=3000):
684741
"""
685742
Logs startup information to shell and log file and starts the app using

tasks/build.py

Lines changed: 131 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@
3737
from connections import github
3838
from tools import config, cvmfs_repository, job_metadata, pr_comments, run_cmd
3939
import tools.filter as tools_filter
40-
from tools.pr_comments import ChatLevels, create_comment
40+
from tools.pr_comments import ChatLevels, create_comment, update_comment
4141
from tools.build_params import BUILD_PARAM_ARCH, BUILD_PARAM_ACCEL
4242

4343
# defaults (used if not specified via, eg, 'app.cfg')
@@ -54,7 +54,9 @@
5454
# other constants
5555
EXPORT_VARS_FILE = 'export_vars.sh'
5656

57-
Job = namedtuple('Job', ('working_dir', 'arch_target', 'repo_id', 'slurm_opts', 'year_month', 'pr_id', 'accelerator'))
57+
58+
Job = namedtuple('Job',
59+
('working_dir', 'arch_target', 'repo_id', 'slurm_opts', 'year_month', 'pr_id', 'accelerator', 'owner'))
5860

5961
# global repo_cfg
6062
repo_cfg = {}
@@ -111,6 +113,10 @@ def get_build_env_cfg(cfg):
111113
log(f"{fn}(): submit_command '{submit_command}'")
112114
config_data[config.BUILDENV_SETTING_SUBMIT_COMMAND] = submit_command
113115

116+
cancel_command = buildenv.get(config.BUILDENV_SETTING_CANCEL_COMMAND)
117+
log(f"{fn}(): cancel_command '{cancel_command}'")
118+
config_data[config.BUILDENV_SETTING_CANCEL_COMMAND] = cancel_command
119+
114120
job_handover_protocol = buildenv.get(config.BUILDENV_SETTING_JOB_HANDOVER_PROTOCOL)
115121
slurm_params = buildenv.get(config.BUILDENV_SETTING_SLURM_PARAMS)
116122
if job_handover_protocol == config.JOB_HANDOVER_PROTOCOL_HOLD_RELEASE:
@@ -585,6 +591,8 @@ def prepare_jobs(pr, cfg, event_info, action_filter, build_params):
585591
base_branch_name = pr.base.ref
586592
log(f"{fn}(): pr.base.repo.ref '{base_branch_name}'")
587593

594+
job_owner = event_info['raw_request_body']['sender']['login']
595+
588596
# create run dir (base directory for potentially several jobs)
589597
# TODO may still be too early (before we get to any actual job being
590598
# prepared below when calling 'download_pr')
@@ -692,7 +700,7 @@ def prepare_jobs(pr, cfg, event_info, action_filter, build_params):
692700

693701
# enlist jobs to proceed
694702
job = Job(job_dir, partition_info['cpu_subdir'], repo_id, partition_info['slurm_params'], year_month,
695-
pr_id, accelerator)
703+
pr_id, accelerator, job_owner)
696704
jobs.append(job)
697705

698706
log(f"{fn}(): {len(jobs)} jobs to proceed after applying white list")
@@ -1391,3 +1399,123 @@ def request_bot_build_issue_comments(repo_name, pr_number):
13911399
status_table['result'].append(result)
13921400

13931401
return status_table
1402+
1403+
1404+
def get_job_ids(action_filter):
    """
    Gets and validates 'jobid:' arguments.

    A job ID is accepted only if it consists solely of the ASCII digits 0-9
    and is not zero. Checking with int() alone is too lenient here: int()
    also accepts surrounding whitespace, a sign, '_' digit separators and
    non-ASCII digits, and the accepted value is later inserted verbatim
    into command lines and compared against the job manager's output.

    Args:
        action_filter (EESSIBotActionFilter): Instance containing 'jobid:' arguments

    Returns:
        job_ids (list): valid 'jobid:' arguments (unchanged, in the order given)
    """
    fn = sys._getframe().f_code.co_name

    # Get 'jobid:' arguments
    job_filter = action_filter.get_filter_by_component(tools_filter.FILTER_COMPONENT_JOBID)
    if not job_filter:
        log(f"{fn}(): 'bot: cancel' command needs at least one 'jobid:' argument.")
        return []

    # Validate job IDs
    ascii_digits = "0123456789"
    job_ids = []
    for job_id in job_filter:
        candidate = str(job_id)
        only_digits = bool(candidate) and all(ch in ascii_digits for ch in candidate)
        # a positive number has at least one non-zero digit; testing it this
        # way avoids converting arbitrarily long user input with int()
        if only_digits and any(ch != "0" for ch in candidate):
            job_ids.append(job_id)
        else:
            log(f"{fn}(): Invalid job ID: '{job_id}'")

    return job_ids
1434+
1435+
1436+
def get_work_dirs(job_ids, cfg):
    """
    Gets working directories of build jobs.

    Runs the configured poll command restricted to the given job IDs and
    parses its output. Job IDs that do not appear in the output are simply
    absent from the returned dict.

    Args:
        job_ids (list): list of job_ids to check.
        cfg (ConfigParser): Instance containing full configuration from app.cfg

    Returns:
        work_dirs (dict): dict mapping each job_id to its work_dir
    """
    poll_command = cfg[config.SECTION_JOB_MANAGER][config.JOB_MANAGER_SETTING_POLL_COMMAND]

    # squeue only the given job IDs
    cs_jobs = ",".join(job_ids)
    command_line = f"{poll_command} --noheader --Format=JobId:0@,WorkDir:0 --job={cs_jobs}"
    out, _, _ = run_cmd(command_line, "Get WorkDirs of jobs")

    # All output lines are formatted as '{job_id}@{work_dir}'
    work_dirs = {}
    for line in out.split("\n"):
        # split on the first '@' only: the job ID never contains one, but
        # the working directory path may, and must not be dropped for it
        job_id, separator, work_dir = line.partition("@")
        if not separator:
            # blank line or unexpected output without the delimiter
            continue
        work_dirs[job_id.strip()] = work_dir.strip()

    return work_dirs
1463+
1464+
1465+
def cancel_jobs(jobs, user, pr, cfg):
    """
    Cancels the given build jobs.

    For each job the owner and the ID of the PR comment are read from the
    job's metadata file in its working directory. A job is only cancelled
    if the recorded owner equals 'user'. After a successful cancellation a
    row is appended to the job's PR comment if its ID is known.

    Args:
        jobs (dict): dictionary mapping each job_id to cancel to its work_dir
        user (str): The user who sent the 'bot: cancel' command
        pr (github.PullRequest.PullRequest): instance representing the pull request
        cfg (ConfigParser): Instance containing full configuration from app.cfg

    Returns:
        cancelled_jobs (list): job_ids of successfully cancelled jobs
    """
    fn = sys._getframe().f_code.co_name

    buildenv = get_build_env_cfg(cfg)
    cancel_command = buildenv[config.BUILDENV_SETTING_CANCEL_COMMAND]

    cancelled_jobs = []
    for job_id, work_dir in jobs.items():
        # Get job owner and PR comment ID from metadata
        metadata_path = os.path.join(work_dir, f"_bot_job{job_id}.metadata")
        metadata = job_metadata.get_section_from_file(
            filepath=metadata_path,
            section=job_metadata.JOB_PR_SECTION,
        )
        if not metadata:
            # NOTE(review): assumes a missing file/section yields None or an
            # empty mapping - confirm against tools.job_metadata. Without an
            # owner on record the job must not be cancelled.
            log(f"{fn}(): No metadata found for job {job_id} in '{metadata_path}' - skipping cancellation")
            continue
        job_owner = metadata.get(job_metadata.JOB_PR_JOB_OWNER)
        pr_comment_id = metadata.get(job_metadata.JOB_PR_PR_COMMENT_ID)

        # Only the job owner should be able to cancel a job
        if job_owner != user:
            log(f"{fn}(): User {user} did not start job {job_id} - skipping cancellation")
            continue
        log(f"{fn}(): Job {job_id} was started by user {user} - cancelling job")

        # Cancel job
        command_line = f"{cancel_command} --verbose {job_id}"
        out, err, exit_code = run_cmd(command_line, f"cancel job {job_id}", raise_on_error=False)

        # Check if command was successful
        if exit_code != 0:
            log(f"{fn}(): scancel resulted in a non-zero exit code for job {job_id}.")
            continue
        # errors may be reported on stderr even when the exit code is zero
        if any(line.startswith("scancel: error: ") for line in err.split("\n")):
            log(f"{fn}(): Unable to cancel job {job_id}.")
            continue

        log(f"{fn}(): Cancelled job {job_id}")

        # Update job status table; the job is already cancelled at this
        # point, so an unusable comment ID must not keep it from being
        # reported or stop the remaining jobs from being processed
        try:
            comment_id = int(pr_comment_id)
        except (TypeError, ValueError):
            log(f"{fn}(): No valid PR comment ID for job {job_id} - status table not updated")
        else:
            dt = datetime.now(timezone.utc)
            update = f"\n|{dt.strftime('%b %d %X %Z %Y')}|finished|job id `{job_id}` was cancelled|"
            update_comment(comment_id, pr, update)

        cancelled_jobs.append(job_id)

    return cancelled_jobs

tests/test_bot_job123.metadata

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,5 @@
22
repo = test_repo
33
pr_number = 999
44
pr_comment_id = 77
5+
job_owner = user01
56

0 commit comments

Comments
 (0)