diff --git a/README.md b/README.md index 8aa1b931..58bb369f 100644 --- a/README.md +++ b/README.md @@ -548,6 +548,12 @@ submit_command = /usr/bin/sbatch `submit_command` is the full path to the Slurm job submission command used for submitting batch jobs. You may want to verify if `sbatch` is provided at that path or determine its actual location (using `which sbatch`). +```ini +cancel_command = /usr/bin/scancel +``` + +`cancel_command` is the full path to the Slurm command used for cancelling batch jobs. You may want to verify if `scancel` is provided at that path or determine its actual location (using `which scancel`). + ```ini build_permission = -NOT_ALLOWED_GH_ACCOUNT_NAME- [...] ``` @@ -560,11 +566,11 @@ name on GitHub. Thus, one could not - by accident - give build permissions to an unknown account. ```ini -no_build_permission_comment = The `bot: build ...` command has been used by user `{build_labeler}`, but this person does not have permission to trigger builds. +no_build_permission_comment = GH account `{build_labeler}` is not authorized to trigger or cancel build jobs. ``` `no_build_permission_comment` defines a comment (template) that is used when -the account trying to trigger build jobs has no permission to do so. +the account trying to trigger or cancel build jobs has no permission to do so. ```ini allow_update_submit_opts = false diff --git a/app.cfg.example b/app.cfg.example index 0b393a4c..f4981d6a 100644 --- a/app.cfg.example +++ b/app.cfg.example @@ -155,13 +155,16 @@ slurm_params = --hold # full path to the job submission command submit_command = /usr/bin/sbatch +# full path to the job cancellation command +cancel_command = /usr/bin/scancel + # defines which GitHub accounts have the permission to trigger # build jobs, i.e., for which accounts the bot acts on `bot: build ...` # commands. If the value is left empty, everyone can trigger build jobs. 
build_permission = -NOT_ALLOWED_GH_ACCOUNT_NAME- # template for comment when user who set a label has no permission to trigger build jobs -no_build_permission_comment = Label `bot:build` has been set by user `{build_labeler}`, but this person does not have permission to trigger builds +no_build_permission_comment = GH account `{build_labeler}` is not authorized to trigger or cancel build jobs. # whether or not to allow updating the submit options via custom module det_submit_opts # Should only be enabled (true) with care because this will result in code from the target diff --git a/eessi_bot_event_handler.py b/eessi_bot_event_handler.py index 04529a7e..31ba82ae 100644 --- a/eessi_bot_event_handler.py +++ b/eessi_bot_event_handler.py @@ -29,8 +29,8 @@ # Local application imports (anything from EESSI/eessi-bot-software-layer) from connections import github -from tasks.build import check_build_permission, get_node_types, request_bot_build_issue_comments, \ - submit_build_jobs +from tasks.build import cancel_jobs, check_build_permission, get_job_ids, get_node_types, \ + get_work_dirs, request_bot_build_issue_comments, submit_build_jobs from tasks.deploy import deploy_built_artefacts, determine_job_dirs from tasks.clean_up import move_to_trash_bin from tools import config @@ -53,6 +53,7 @@ config.BUILDENV_SETTING_BUILD_JOB_SCRIPT, # required config.BUILDENV_SETTING_BUILD_LOGS_DIR, # optional+recommended config.BUILDENV_SETTING_BUILD_PERMISSION, # optional+recommended + config.BUILDENV_SETTING_CANCEL_COMMAND, # required config.BUILDENV_SETTING_CONTAINER_CACHEDIR, # optional+recommended # config.BUILDENV_SETTING_CLONE_GIT_REPO_VIA, # optional # config.BUILDENV_SETTING_CVMFS_CUSTOMIZATIONS, # optional @@ -102,6 +103,7 @@ # the poll interval setting is required for the alternative job handover # protocol (delayed_begin) config.SECTION_JOB_MANAGER: [ + config.JOB_MANAGER_SETTING_POLL_COMMAND, # required config.JOB_MANAGER_SETTING_POLL_INTERVAL], # required 
def handle_bot_command_cancel(self, event_info, bot_command):
    """
    Handles bot command 'cancel' by parsing 'jobid:' arguments and
    cancelling the jobs.

    The command is processed in four steps, each of which may end the
    processing early with an explanatory message:
      1. check that the sender has build permission,
      2. collect the valid 'jobid:' arguments,
      3. look up the working directories of these jobs,
      4. cancel the jobs that were found.

    Args:
        event_info (dict): event received by event_handler
        bot_command (EESSIBotCommand): command to be handled

    Returns:
        comment (string): list of cancelled jobs if any, error message if not
    """
    self.log("processing bot command 'cancel'")

    request_body = event_info["raw_request_body"]
    repo_name = request_body["repository"]["full_name"]
    pr_number = request_body["issue"]["number"]
    # NOTE(review): the account is taken from the comment author here, while
    # the job owner is recorded from the event's 'sender' when jobs are
    # prepared - presumably both are the same account; confirm.
    user = request_body["comment"]["user"]["login"]

    gh = github.get_instance()
    pr = gh.get_repo(repo_name).get_pull(pr_number)

    # Jobs can only be cancelled by the user who submitted the job
    # -> No need to proceed if user cannot submit jobs
    if not check_build_permission(pr, event_info):
        self.log(f"User '{user}' does not have build permission - skipping cancellation.")
        return f"\n - User `{user}` cannot submit or cancel build jobs."

    # Get valid 'jobid:' arguments
    job_ids = get_job_ids(bot_command.action_filters)
    if not job_ids:
        self.log("Got no valid job IDs")
        return "\n - No valid job IDs were given."

    # Get working directories of jobs
    work_dirs = get_work_dirs(job_ids, self.cfg)

    # Log skipped jobs before any early return, so that every requested job
    # that could not be found leaves a trace in the log
    for job_id in job_ids:
        if job_id not in work_dirs:
            self.log(f"Skipping job {job_id} - not found")

    if not work_dirs:
        self.log("None of the given jobs are cancellable")
        return "\n - No cancellable jobs were given."

    # Cancel jobs
    cancelled_jobs = cancel_jobs(work_dirs, user, pr, self.cfg)
    if not cancelled_jobs:
        return "\n - No jobs were cancelled."

    return "".join(f"\n - cancelled job `{job_id}`" for job_id in cancelled_jobs)
def get_job_ids(action_filter):
    """
    Gets and validates 'jobid:' arguments.

    A job ID is only accepted if it consists solely of the ASCII digits 0-9
    and denotes a positive number. Values that 'int()' would tolerate (signs,
    surrounding whitespace, underscores, non-ASCII digits) are rejected
    because the accepted strings are later inserted verbatim into command
    lines and compared with the job IDs printed by the poll command.

    Args:
        action_filter (EESSIBotActionFilter): Instance containing 'jobid:' arguments

    Returns:
        job_ids (list): valid 'jobid:' arguments
    """
    fn = sys._getframe().f_code.co_name

    # Get 'jobid:' arguments
    job_filter = action_filter.get_filter_by_component(tools_filter.FILTER_COMPONENT_JOBID)
    if not job_filter:
        log(f"{fn}(): 'bot: cancel' command needs at least one 'jobid:' argument.")
        return []

    # Validate job IDs
    ascii_digits = set("0123456789")
    job_ids = []
    for job_id in job_filter:
        is_plain_number = isinstance(job_id, str) and job_id != "" and set(job_id) <= ascii_digits
        if is_plain_number and int(job_id) > 0:
            job_ids.append(job_id)
        else:
            log(f"{fn}(): Invalid job ID: '{job_id}'")

    return job_ids


def get_work_dirs(job_ids, cfg):
    """
    Gets working directories of build jobs.

    Runs the configured poll command for the given job IDs only and parses
    its output. Jobs that are not listed by the poll command are missing
    from the result.

    Args:
        job_ids (list): list of job_ids to check.
        cfg (ConfigParser): Instance containing full configuration from app.cfg

    Returns:
        work_dirs (dict): dict mapping each job_id to its work_dir; empty if
            no job IDs were given or the poll command failed
    """
    fn = sys._getframe().f_code.co_name

    # nothing to look up; also avoids running the poll command with an empty '--job='
    if not job_ids:
        return {}

    poll_command = cfg[config.SECTION_JOB_MANAGER][config.JOB_MANAGER_SETTING_POLL_COMMAND]

    # squeue only the given job IDs
    cs_jobs = ",".join(job_ids)
    command_line = f"{poll_command} --noheader --Format=JobId:0@,WorkDir:0 --job={cs_jobs}"
    # do not raise on failure: a failing poll command is treated as
    # 'no cancellable jobs' instead of aborting the handling of the command
    out, err, exit_code = run_cmd(command_line, "Get WorkDirs of jobs", raise_on_error=False)
    if exit_code != 0:
        log(f"{fn}(): poll command resulted in a non-zero exit code ({exit_code}): {err}")
        return {}

    # All output lines are formatted as '{job_id}@{work_dir}'
    work_dirs = {}
    for line in out.split("\n"):
        # split at the first '@' only, so a work_dir containing '@' is kept intact
        job_id, separator, work_dir = line.partition("@")
        job_id = job_id.strip()
        work_dir = work_dir.strip()
        if not separator or not job_id or not work_dir:
            continue
        work_dirs[job_id] = work_dir

    return work_dirs


def cancel_jobs(jobs, user, pr, cfg):
    """
    Cancels the given build jobs.

    A job is only cancelled if the job owner recorded in the job's metadata
    file ('_bot_job<job_id>.metadata' in the job's working directory) is the
    user who sent the command. For each cancelled job a row is appended to
    the job's status table in the PR comment, if the comment ID is known.

    Args:
        jobs (dict): dictionary mapping each job_id to cancel to its work_dir
        user (str): The user who sent the 'bot: cancel' command
        pr (github.PullRequest.PullRequest): instance representing the pull request
        cfg (ConfigParser): Instance containing full configuration from app.cfg

    Returns:
        cancelled_jobs (list): job_ids of successfully cancelled jobs
    """
    fn = sys._getframe().f_code.co_name

    buildenv = get_build_env_cfg(cfg)
    cancel_command = buildenv[config.BUILDENV_SETTING_CANCEL_COMMAND]

    cancelled_jobs = []
    for job_id, work_dir in jobs.items():
        # Get job owner and PR comment ID from metadata
        metadata_path = os.path.join(work_dir, f"_bot_job{job_id}.metadata")
        metadata = job_metadata.get_section_from_file(
            filepath=metadata_path,
            section=job_metadata.JOB_PR_SECTION,
        )
        # NOTE(review): presumably nothing usable is returned if the file or
        # the section is missing - without an owner the job cannot be cancelled
        if not metadata:
            log(f"{fn}(): No metadata found for job {job_id} in '{metadata_path}' - skipping cancellation")
            continue
        job_owner = metadata.get(job_metadata.JOB_PR_JOB_OWNER)
        pr_comment_id = metadata.get(job_metadata.JOB_PR_PR_COMMENT_ID)

        # Only the job owner should be able to cancel a job
        if job_owner != user:
            log(f"{fn}(): User {user} did not start job {job_id} - skipping cancellation")
            continue
        log(f"{fn}(): Job {job_id} was started by user {user} - cancelling job")

        # Cancel job
        command_line = f"{cancel_command} --verbose {job_id}"
        out, err, exit_code = run_cmd(command_line, f"cancel job {job_id}", raise_on_error=False)

        # Check if command was successful
        if exit_code != 0:
            log(f"{fn}(): scancel resulted in a non-zero exit code for job {job_id}.")
            continue
        if any(line.startswith("scancel: error: ") for line in err.split("\n")):
            log(f"{fn}(): Unable to cancel job {job_id}.")
            continue

        log(f"{fn}(): Cancelled job {job_id}")
        # the job is cancelled at this point, so report it even if the
        # status table cannot be updated below
        cancelled_jobs.append(job_id)

        # Update job status table
        try:
            comment_id = int(pr_comment_id)
        except (TypeError, ValueError):
            log(f"{fn}(): No valid PR comment ID for job {job_id} - not updating job status table")
            continue
        dt = datetime.now(timezone.utc)
        update = f"\n|{dt.strftime('%b %d %X %Z %Y')}|finished|job id `{job_id}` was cancelled|"
        update_comment(comment_id, pr, update)

    return cancelled_jobs
b/tests/test_bot_job123.metadata index 29f8965d..62010c4d 100644 --- a/tests/test_bot_job123.metadata +++ b/tests/test_bot_job123.metadata @@ -2,4 +2,5 @@ repo = test_repo pr_number = 999 pr_comment_id = 77 +job_owner = user01 diff --git a/tests/test_task_build.py b/tests/test_task_build.py index af49ac9b..fcb9f428 100644 --- a/tests/test_task_build.py +++ b/tests/test_task_build.py @@ -287,7 +287,7 @@ def test_create_pr_comment_succeeds(monkeypatch, mocked_github, tmpdir): print("CREATING PR COMMENT") ym = datetime.today().strftime('%Y.%m') pr_number = 1 - job = Job(tmpdir, "test/architecture", "EESSI", "--speed-up", ym, pr_number, "fpga/magic") + job = Job(tmpdir, "test/architecture", "EESSI", "--speed-up", ym, pr_number, "fpga/magic", "user01") build_params = EESSIBotBuildParams("arch=amd/zen4,accel=nvidia/cc90") job_id = "123" @@ -318,7 +318,7 @@ def test_create_pr_comment_succeeds_none(monkeypatch, mocked_github, tmpdir): print("CREATING PR COMMENT") ym = datetime.today().strftime('%Y.%m') pr_number = 1 - job = Job(tmpdir, "test/architecture", "EESSI", "--speed-up", ym, pr_number, "fpga/magic") + job = Job(tmpdir, "test/architecture", "EESSI", "--speed-up", ym, pr_number, "fpga/magic", "user01") build_params = EESSIBotBuildParams("arch=amd/zen4,accel=nvidia/cc90") job_id = "123" @@ -345,7 +345,7 @@ def test_create_pr_comment_raises_once_then_succeeds(monkeypatch, mocked_github, print("CREATING PR COMMENT") ym = datetime.today().strftime('%Y.%m') pr_number = 1 - job = Job(tmpdir, "test/architecture", "EESSI", "--speed-up", ym, pr_number, "fpga/magic") + job = Job(tmpdir, "test/architecture", "EESSI", "--speed-up", ym, pr_number, "fpga/magic", "user01") build_params = EESSIBotBuildParams("arch=amd/zen4,accel=nvidia/cc90") job_id = "123" @@ -372,7 +372,7 @@ def test_create_pr_comment_always_raises(monkeypatch, mocked_github, tmpdir): print("CREATING PR COMMENT") ym = datetime.today().strftime('%Y.%m') pr_number = 1 - job = Job(tmpdir, "test/architecture", 
"EESSI", "--speed-up", ym, pr_number, "fpga/magic") + job = Job(tmpdir, "test/architecture", "EESSI", "--speed-up", ym, pr_number, "fpga/magic", "user01") build_params = EESSIBotBuildParams("arch=amd/zen4,accel=nvidia/cc90") job_id = "123" @@ -400,7 +400,7 @@ def test_create_pr_comment_three_raises(monkeypatch, mocked_github, tmpdir): print("CREATING PR COMMENT") ym = datetime.today().strftime('%Y.%m') pr_number = 1 - job = Job(tmpdir, "test/architecture", "EESSI", "--speed-up", ym, pr_number, "fpga/magic") + job = Job(tmpdir, "test/architecture", "EESSI", "--speed-up", ym, pr_number, "fpga/magic", "user01") build_params = EESSIBotBuildParams("arch=amd/zen4,accel=nvidia/cc90") job_id = "123" @@ -423,7 +423,7 @@ def test_create_read_metadata_file(mocked_github, tmpdir): # create some test data ym = datetime.today().strftime('%Y.%m') pr_number = 999 - job = Job(tmpdir, "test/architecture", "EESSI", "--speed_up_job", ym, pr_number, "fpga/magic") + job = Job(tmpdir, "test/architecture", "EESSI", "--speed_up_job", ym, pr_number, "fpga/magic", "user01") job_id = "123" @@ -441,6 +441,7 @@ def test_create_read_metadata_file(mocked_github, tmpdir): # repo = test_repo # pr_number = 999 # pr_comment_id = 77 + # job_owner = user01 test_file = "tests/test_bot_job123.metadata" assert filecmp.cmp(expected_file_path, test_file, shallow=False) @@ -450,18 +451,21 @@ def test_create_read_metadata_file(mocked_github, tmpdir): assert metadata["PR"]["repo"] == "test_repo" assert metadata["PR"]["pr_number"] == "999" assert metadata["PR"]["pr_comment_id"] == "77" - assert sorted(metadata["PR"].keys()) == ["pr_comment_id", "pr_number", "repo"] + assert metadata["PR"]["job_owner"] == "user01" + assert sorted(metadata["PR"].keys()) == ["job_owner", "pr_comment_id", "pr_number", "repo"] # use directory that does not exist dir_does_not_exist = os.path.join(tmpdir, "dir_does_not_exist") - job2 = Job(dir_does_not_exist, "test/architecture", "EESSI", "--speed_up_job", ym, pr_number, "fpga/magic") 
+ job2 = Job(dir_does_not_exist, "test/architecture", "EESSI", "--speed_up_job", ym, pr_number, "fpga/magic", + "user01") job_id2 = "222" with pytest.raises(FileNotFoundError): create_metadata_file(job2, job_id2, pr_comment) # use directory without write permission dir_without_write_perm = os.path.join("/") - job3 = Job(dir_without_write_perm, "test/architecture", "EESSI", "--speed_up_job", ym, pr_number, "fpga/magic") + job3 = Job(dir_without_write_perm, "test/architecture", "EESSI", "--speed_up_job", ym, pr_number, "fpga/magic", + "user01") job_id3 = "333" with pytest.raises(OSError): create_metadata_file(job3, job_id3, pr_comment) @@ -471,7 +475,7 @@ def test_create_read_metadata_file(mocked_github, tmpdir): # use undefined values for parameters # job_id = None - job4 = Job(tmpdir, "test/architecture", "EESSI", "--speed_up_job", ym, pr_number, "fpga/magic") + job4 = Job(tmpdir, "test/architecture", "EESSI", "--speed_up_job", ym, pr_number, "fpga/magic", "user01") job_id4 = None create_metadata_file(job4, job_id4, pr_comment) @@ -486,7 +490,7 @@ def test_create_read_metadata_file(mocked_github, tmpdir): # use undefined values for parameters # job.working_dir = None - job5 = Job(None, "test/architecture", "EESSI", "--speed_up_job", ym, pr_number, "fpga/magic") + job5 = Job(None, "test/architecture", "EESSI", "--speed_up_job", ym, pr_number, "fpga/magic", "user01") job_id5 = "555" with pytest.raises(TypeError): create_metadata_file(job5, job_id5, pr_comment) diff --git a/tools/config.py b/tools/config.py index 7f814ea4..70dfb472 100644 --- a/tools/config.py +++ b/tools/config.py @@ -44,6 +44,7 @@ BUILDENV_SETTING_BUILD_JOB_SCRIPT = 'build_job_script' BUILDENV_SETTING_BUILD_LOGS_DIR = 'build_logs_dir' BUILDENV_SETTING_BUILD_PERMISSION = 'build_permission' +BUILDENV_SETTING_CANCEL_COMMAND = 'cancel_command' BUILDENV_SETTING_CLONE_GIT_REPO_VIA = 'clone_git_repo_via' BUILDENV_SETTING_CONTAINER_CACHEDIR = 'container_cachedir' BUILDENV_SETTING_CVMFS_CUSTOMIZATIONS = 
'cvmfs_customizations' diff --git a/tools/filter.py b/tools/filter.py index ddc58352..54e0f5e3 100644 --- a/tools/filter.py +++ b/tools/filter.py @@ -27,13 +27,13 @@ FILTER_COMPONENT_ARCH = 'architecture' FILTER_COMPONENT_EXPORT = 'exportvariable' FILTER_COMPONENT_INST = 'instance' -FILTER_COMPONENT_JOB = 'job' +FILTER_COMPONENT_JOBID = 'jobid' FILTER_COMPONENT_REPO = 'repository' FILTER_COMPONENTS = [FILTER_COMPONENT_ACCEL, FILTER_COMPONENT_ARCH, FILTER_COMPONENT_EXPORT, FILTER_COMPONENT_INST, - FILTER_COMPONENT_JOB, + FILTER_COMPONENT_JOBID, FILTER_COMPONENT_REPO ] diff --git a/tools/job_metadata.py b/tools/job_metadata.py index f5ee21ce..e4031faf 100644 --- a/tools/job_metadata.py +++ b/tools/job_metadata.py @@ -63,6 +63,7 @@ JOB_PR_REPO = "repo" JOB_PR_PR_NUMBER = "pr_number" JOB_PR_PR_COMMENT_ID = "pr_comment_id" +JOB_PR_JOB_OWNER = "job_owner" # JWD/_bot_jobJOBID.result JOB_RESULT_SECTION = "RESULT" @@ -99,12 +100,14 @@ def create_metadata_file(job, job_id, pr_comment): repo_name = pr_comment.repo_name pr_number = pr_comment.pr_number pr_comment_id = pr_comment.pr_comment_id + job_owner = job.owner # create _bot_job.metadata file in the job's working directory bot_jobfile = configparser.ConfigParser() bot_jobfile[JOB_PR_SECTION] = {'repo': repo_name, 'pr_number': pr_number, - 'pr_comment_id': pr_comment_id} + 'pr_comment_id': pr_comment_id, + 'job_owner': job_owner} bot_jobfile_path = os.path.join(job.working_dir, f'_bot_job{job_id}.metadata') with open(bot_jobfile_path, 'w') as bjf: bot_jobfile.write(bjf)