diff --git a/README.md b/README.md index 8aa1b931..07759d52 100644 --- a/README.md +++ b/README.md @@ -342,6 +342,12 @@ The example file (`app.cfg.example`) includes notes on what you have to adjust t The section `[github]` contains information for connecting to GitHub: +```ini +api_timeout = 10 +``` + +Time limit for requests to GitHub's REST API. + ```ini app_id = 123456 ``` @@ -548,6 +554,12 @@ submit_command = /usr/bin/sbatch `submit_command` is the full path to the Slurm job submission command used for submitting batch jobs. You may want to verify if `sbatch` is provided at that path or determine its actual location (using `which sbatch`). +```ini +cancel_command = /usr/bin/scancel +``` + +`cancel_command` is the full path to the Slurm command used for cancelling batch jobs. You may want to verify if `scancel` is provided at that path or determine its actual location (using `which scancel`). + ```ini build_permission = -NOT_ALLOWED_GH_ACCOUNT_NAME- [...] ``` @@ -560,11 +572,11 @@ name on GitHub. Thus, one could not - by accident - give build permissions to an unknown account. ```ini -no_build_permission_comment = The `bot: build ...` command has been used by user `{build_labeler}`, but this person does not have permission to trigger builds. +no_build_permission_comment = GH account `{build_labeler}` is not authorized to trigger or cancel build jobs. ``` `no_build_permission_comment` defines a comment (template) that is used when -the account trying to trigger build jobs has no permission to do so. +the account trying to trigger or cancel build jobs has no permission to do so. ```ini allow_update_submit_opts = false diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 3a764054..832a5798 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -1,6 +1,28 @@ This file contains a description of the major changes to the EESSI build-and-deploy bot. For more detailed information, please see the git log. 
+v0.11.0 (28 January 2026) +-------------------------- + +This is a minor release of the EESSI build-and-deploy bot. + +Bug fixes: +* consider all builds for `bot: status [last_build]` command (#357) + * this also replaces running `curl` by using the `requests` library for one `curl` call + +Improvements: +* adds support for new command `bot: cancel jobid:[JOBID] ...` (#359) + * only the owner of a job can cancel it + * multiple jobs can be cancelled by specifying multiple `jobid:[JOBID]` + arguments separated by space + +Changes to 'app.cfg' settings (see README.md and app.cfg.example for details): +* CHANGED (required) 'no_build_permission_comment' in section '[buildenv]' + Note! sites using the old value may see misleading comments added by the bot, + but the bot will work without the change. +* NEW (required) 'cancel_command' in section '[buildenv]' + + v0.10.0 (13 November 2025) -------------------------- diff --git a/app.cfg.example b/app.cfg.example index 0b393a4c..be8e2198 100644 --- a/app.cfg.example +++ b/app.cfg.example @@ -18,6 +18,9 @@ # Also see documentation at https://github.com/EESSI/eessi-bot-software-layer/blob/main/README.md#step5.5 [github] +# API timeout, time limit for requests to GitHub's REST API +api_timeout = 10 + # replace '123456' with the ID of your GitHub App; see https://github.com/settings/apps app_id = 123456 @@ -155,13 +158,16 @@ slurm_params = --hold # full path to the job submission command submit_command = /usr/bin/sbatch +# full path to the job cancellation command +cancel_command = /usr/bin/scancel + # defines which GitHub accounts have the permission to trigger # build jobs, i.e., for which accounts the bot acts on `bot: build ...` # commands. If the value is left empty, everyone can trigger build jobs. 
build_permission = -NOT_ALLOWED_GH_ACCOUNT_NAME- # template for comment when user who set a label has no permission to trigger build jobs -no_build_permission_comment = Label `bot:build` has been set by user `{build_labeler}`, but this person does not have permission to trigger builds +no_build_permission_comment = GH account `{build_labeler}` is not authorized to trigger or cancel build jobs. # whether or not to allow updating the submit options via custom module det_submit_opts # Should only be enabled (true) with care because this will result in code from the target diff --git a/containers/Dockerfile.smee-client b/containers/Dockerfile.smee-client index 28c5d21a..7b226467 100644 --- a/containers/Dockerfile.smee-client +++ b/containers/Dockerfile.smee-client @@ -1,12 +1,18 @@ -ARG smee_client_version=4.4.1 -# ARG smee_client_version_commit=b837fa85fd05853731160e21356ffd30c8c3e791 # v4.4.1 - -# pinning base image to specific hash (corresponding to lts-alpine) +# pin base image to specific hash (corresponding to lts-alpine) FROM node@sha256:f36fed0b2129a8492535e2853c64fbdbd2d29dc1219ee3217023ca48aebd3787 -ARG smee_client_version -# ARG smee_client_version_commit -# Then install -RUN npm install --global smee-client@${smee_client_version} +# create app dir for locked installation +WORKDIR /app + +# copy lockfile and manifest +COPY containers/package.json containers/package-lock.json ./ + +# install exactly what's in the lockfile (change version in package.json and update +# lockfile via 'npm install --package-lock-only') +RUN npm ci --omit=dev + +# expose CLI by symlinking +RUN ln -sf /app/node_modules/.bin/smee /usr/local/bin/smee + ENTRYPOINT ["smee"] CMD ["--help"] diff --git a/containers/package-lock.json b/containers/package-lock.json new file mode 100644 index 00000000..22aa5a14 --- /dev/null +++ b/containers/package-lock.json @@ -0,0 +1,62 @@ +{ + "name": "smee-wrapper", + "version": "1.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { 
+ "name": "smee-wrapper", + "version": "1.0.0", + "license": "GPL-2.0-only", + "dependencies": { + "smee-client": "4.4.1" + } + }, + "node_modules/eventsource": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/eventsource/-/eventsource-4.0.0.tgz", + "integrity": "sha512-fvIkb9qZzdMxgZrEQDyll+9oJsyaVvY92I2Re+qK0qEJ+w5s0X3dtz+M0VAPOjP1gtU3iqWyjQ0G3nvd5CLZ2g==", + "license": "MIT", + "dependencies": { + "eventsource-parser": "^3.0.1" + }, + "engines": { + "node": ">=20.0.0" + } + }, + "node_modules/eventsource-parser": { + "version": "3.0.6", + "resolved": "https://registry.npmjs.org/eventsource-parser/-/eventsource-parser-3.0.6.tgz", + "integrity": "sha512-Vo1ab+QXPzZ4tCa8SwIHJFaSzy4R6SHf7BY79rFBDf0idraZWAkYrDjDj8uWaSm3S2TK+hJ7/t1CEmZ7jXw+pg==", + "license": "MIT", + "engines": { + "node": ">=18.0.0" + } + }, + "node_modules/smee-client": { + "version": "4.4.1", + "resolved": "https://registry.npmjs.org/smee-client/-/smee-client-4.4.1.tgz", + "integrity": "sha512-o2px88YVTmKWpaF3sg0Qpuk5MEVpM7BrImjNy/PYf3ENiD51dnPWtXnYHAM6h5/+06ug7z7BBu3werlIEwVkdw==", + "license": "ISC", + "dependencies": { + "eventsource": "^4.0.0", + "undici": "^7.0.0" + }, + "bin": { + "smee": "bin/smee.js" + }, + "engines": { + "node": "^20.18 || >= 22" + } + }, + "node_modules/undici": { + "version": "7.16.0", + "resolved": "https://registry.npmjs.org/undici/-/undici-7.16.0.tgz", + "integrity": "sha512-QEg3HPMll0o3t2ourKwOeUAZ159Kn9mx5pnzHRQO8+Wixmh88YdZRiIwat0iNzNNXn0yoEtXJqFpyW7eM8BV7g==", + "license": "MIT", + "engines": { + "node": ">=20.18.1" + } + } + } +} diff --git a/containers/package.json b/containers/package.json new file mode 100644 index 00000000..8b33566e --- /dev/null +++ b/containers/package.json @@ -0,0 +1,9 @@ +{ + "name": "smee-wrapper", + "private": true, + "version": "1.0.0", + "license": "GPL-2.0-only", + "dependencies": { + "smee-client": "4.4.1" + } +} diff --git a/eessi_bot_event_handler.py b/eessi_bot_event_handler.py index 04529a7e..41787e1b 100644 
--- a/eessi_bot_event_handler.py +++ b/eessi_bot_event_handler.py @@ -29,8 +29,8 @@ # Local application imports (anything from EESSI/eessi-bot-software-layer) from connections import github -from tasks.build import check_build_permission, get_node_types, request_bot_build_issue_comments, \ - submit_build_jobs +from tasks.build import cancel_jobs, check_build_permission, get_job_ids, get_node_types, \ + get_work_dirs, request_bot_build_issue_comments, submit_build_jobs from tasks.deploy import deploy_built_artefacts, determine_job_dirs from tasks.clean_up import move_to_trash_bin from tools import config @@ -53,6 +53,7 @@ config.BUILDENV_SETTING_BUILD_JOB_SCRIPT, # required config.BUILDENV_SETTING_BUILD_LOGS_DIR, # optional+recommended config.BUILDENV_SETTING_BUILD_PERMISSION, # optional+recommended + config.BUILDENV_SETTING_CANCEL_COMMAND, # required config.BUILDENV_SETTING_CONTAINER_CACHEDIR, # optional+recommended # config.BUILDENV_SETTING_CLONE_GIT_REPO_VIA, # optional # config.BUILDENV_SETTING_CVMFS_CUSTOMIZATIONS, # optional @@ -95,6 +96,7 @@ config.SECTION_EVENT_HANDLER: [ config.EVENT_HANDLER_SETTING_LOG_PATH], # required config.SECTION_GITHUB: [ + config.GITHUB_SETTING_API_TIMEOUT, # required config.GITHUB_SETTING_APP_ID, # required config.GITHUB_SETTING_APP_NAME, # required config.GITHUB_SETTING_INSTALLATION_ID, # required @@ -102,6 +104,7 @@ # the poll interval setting is required for the alternative job handover # protocol (delayed_begin) config.SECTION_JOB_MANAGER: [ + config.JOB_MANAGER_SETTING_POLL_COMMAND, # required config.JOB_MANAGER_SETTING_POLL_INTERVAL], # required config.SECTION_REPO_TARGETS: [ config.REPO_TARGETS_SETTING_REPOS_CFG_DIR], # required @@ -507,7 +510,7 @@ def handle_bot_command_help(self, event_info, bot_command): help_msg += "\n - Commands must be sent with a **new** comment (edits of existing comments are ignored)." help_msg += "\n - A comment may contain multiple commands, one per line." 
help_msg += "\n - Every command begins at the start of a line and has the syntax `bot: COMMAND [ARGUMENTS]*`" - help_msg += "\n - Currently supported COMMANDs are: `help`, `build`, `show_config`, `status`" + help_msg += "\n - Currently supported COMMANDs are: `help`, `build`, `show_config`, `status`, `cancel`" help_msg += "\n" help_msg += "\n For more information, see https://www.eessi.io/docs/bot" return help_msg @@ -679,6 +682,61 @@ def handle_bot_command_status(self, event_info, bot_command): else: return "\n - failed to create status comment" + def handle_bot_command_cancel(self, event_info, bot_command): + """ + Handles bot command 'cancel' by parsing 'jobid:' arguments and + cancelling the jobs. + + Args: + event_info (dict): event received by event_handler + bot_command (EESSIBotCommand): command to be handled + + Returns: + comment (string): list of cancelled jobs if any, error message if not + """ + self.log("processing bot command 'cancel'") + + request_body = event_info["raw_request_body"] + repo_name = request_body["repository"]["full_name"] + pr_number = request_body["issue"]["number"] + user = request_body["comment"]["user"]["login"] + + gh = github.get_instance() + pr = gh.get_repo(repo_name).get_pull(pr_number) + + # Jobs can only be cancelled by the user who submitted the job + # -> No need to proceed if user cannot submit jobs + if not check_build_permission(pr, event_info): + self.log(f"User '{user}' does not have build permission - skipping cancellation.") + return f"\n - User `{user}` cannot submit or cancel build jobs." + + # Get valid 'jobid:' arguments + job_ids = get_job_ids(bot_command.action_filters) + if len(job_ids) == 0: + self.log("Got no valid job IDs") + return "\n - No valid job IDs were given." + + # Get working directories of jobs + work_dirs = get_work_dirs(job_ids, self.cfg) + if len(work_dirs) == 0: + self.log("None of the given jobs are cancellable") + return "\n - No cancellable jobs were given." 
+
+        # Log skipped jobs
+        for job_id in job_ids:
+            if job_id not in work_dirs.keys():
+                self.log(f"Skipping job {job_id} - not found")
+
+        # Cancel jobs
+        cancelled_jobs = cancel_jobs(work_dirs, user, pr, self.cfg)
+        if len(cancelled_jobs) == 0:
+            return "\n - No jobs were cancelled."
+        else:
+            comment = ""
+            for job_id in cancelled_jobs:
+                comment += f"\n - cancelled job `{job_id}`"
+            return comment
+
     def start(self, app, port=3000):
         """
         Logs startup information to shell and log file and starts the app using
diff --git a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py
index fd67b913..85fba369 100644
--- a/eessi_bot_job_manager.py
+++ b/eessi_bot_job_manager.py
@@ -57,6 +57,7 @@
         config.FINISHED_JOB_COMMENTS_SETTING_JOB_RESULT_UNKNOWN_FMT,  # required
         config.FINISHED_JOB_COMMENTS_SETTING_JOB_TEST_UNKNOWN_FMT],  # required
     config.SECTION_GITHUB: [
+        # config.GITHUB_SETTING_API_TIMEOUT,  # unused
         config.GITHUB_SETTING_APP_ID,  # required
         # config.GITHUB_SETTING_APP_NAME,  # unused
         config.GITHUB_SETTING_INSTALLATION_ID,  # required
diff --git a/tasks/build.py b/tasks/build.py
index 165ab544..6c191013 100644
--- a/tasks/build.py
+++ b/tasks/build.py
@@ -24,17 +24,20 @@
 import json
 import os
 import re
+import requests
 import shutil
 import string
 import sys
+import time
 
 # Third party imports (anything installed into the local Python environment)
 from pyghee.utils import error, log
 
 # Local application imports (anything from EESSI/eessi-bot-software-layer)
+from connections import github
 from tools import config, cvmfs_repository, job_metadata, pr_comments, run_cmd
 import tools.filter as tools_filter
-from tools.pr_comments import ChatLevels, create_comment
+from tools.pr_comments import ChatLevels, create_comment, update_comment
 from tools.build_params import BUILD_PARAM_ARCH, BUILD_PARAM_ACCEL
 
 # defaults (used if not specified via, eg, 'app.cfg')
@@ -51,7 +54,9 @@
 # other constants
 EXPORT_VARS_FILE = 'export_vars.sh'
 
-Job = namedtuple('Job', ('working_dir', 'arch_target', 
'repo_id', 'slurm_opts', 'year_month', 'pr_id', 'accelerator')) + +Job = namedtuple('Job', + ('working_dir', 'arch_target', 'repo_id', 'slurm_opts', 'year_month', 'pr_id', 'accelerator', 'owner')) # global repo_cfg repo_cfg = {} @@ -108,6 +113,10 @@ def get_build_env_cfg(cfg): log(f"{fn}(): submit_command '{submit_command}'") config_data[config.BUILDENV_SETTING_SUBMIT_COMMAND] = submit_command + cancel_command = buildenv.get(config.BUILDENV_SETTING_CANCEL_COMMAND) + log(f"{fn}(): cancel_command '{cancel_command}'") + config_data[config.BUILDENV_SETTING_CANCEL_COMMAND] = cancel_command + job_handover_protocol = buildenv.get(config.BUILDENV_SETTING_JOB_HANDOVER_PROTOCOL) slurm_params = buildenv.get(config.BUILDENV_SETTING_SLURM_PARAMS) if job_handover_protocol == config.JOB_HANDOVER_PROTOCOL_HOLD_RELEASE: @@ -582,6 +591,8 @@ def prepare_jobs(pr, cfg, event_info, action_filter, build_params): base_branch_name = pr.base.ref log(f"{fn}(): pr.base.repo.ref '{base_branch_name}'") + job_owner = event_info['raw_request_body']['sender']['login'] + # create run dir (base directory for potentially several jobs) # TODO may still be too early (before we get to any actual job being # prepared below when calling 'download_pr') @@ -689,7 +700,7 @@ def prepare_jobs(pr, cfg, event_info, action_filter, build_params): # enlist jobs to proceed job = Job(job_dir, partition_info['cpu_subdir'], repo_id, partition_info['slurm_params'], year_month, - pr_id, accelerator) + pr_id, accelerator, job_owner) jobs.append(job) log(f"{fn}(): {len(jobs)} jobs to proceed after applying white list") @@ -1203,158 +1214,308 @@ def request_bot_build_issue_comments(repo_name, pr_number): status_table = {'on arch': [], 'for arch': [], 'for repo': [], 'date': [], 'status': [], 'url': [], 'result': []} cfg = config.read_config() + github_section = cfg[config.SECTION_GITHUB] + api_timeout = int(github_section.get(config.GITHUB_SETTING_API_TIMEOUT, 10)) # for loop because github has max 100 items per request. 
# if the pr has more than 100 comments we need to use per_page # argument at the moment the for loop is for a max of 400 comments could bump this up - for x in range(1, 5): - curl_cmd = f'curl -L https://api.github.com/repos/{repo_name}/issues/{pr_number}/comments?per_page=100&page={x}' - curl_output, curl_error, curl_exit_code = run_cmd(curl_cmd, "fetch all comments") - - comments = json.loads(curl_output) - - for comment in comments: - # iterate through the comments to find the one where the status of the build was in - submitted_job_comments_section = cfg[config.SECTION_SUBMITTED_JOB_COMMENTS] - accelerator_fmt = submitted_job_comments_section[config.SUBMITTED_JOB_COMMENTS_SETTING_WITH_ACCELERATOR] - instance_repo_fmt = submitted_job_comments_section[config.SUBMITTED_JOB_COMMENTS_SETTING_INSTANCE_REPO] - instance_repo_re = template_to_regex(instance_repo_fmt) - comment_body = comment['body'].split('\n') - instance_repo_match = re.match(instance_repo_re, comment_body[0]) - # Check if this body starts with an initial comment from the bot (first item is always the instance + repo - # it is building for) - # Then, check that it has at least 4 lines so that we can safely index up to that number - if instance_repo_match and len(comment_body) >= 4: - # Set some defaults - repo_id = "" - on_arch = "" - for_arch = "" - date = "" - status = "" - url = "" - result = "" - - log(f"{fn}(): found bot build response in issue, processing...") - - # First, extract the repo_id - log(f"{fn}(): found build for repository: {instance_repo_match.group('repo_id')}") - repo_id = instance_repo_match.group('repo_id') - - # Then, try to match the architecture we build on. 
- # First try this including accelerator, to see if one was defined - on_arch_fmt = submitted_job_comments_section[config.SUBMITTED_JOB_COMMENTS_SETTING_BUILD_ON_ARCH] - on_arch_fmt_with_accel = on_arch_fmt.format_map(PartialFormatDict(on_accelerator=accelerator_fmt)) - on_arch_re_with_accel = template_to_regex(on_arch_fmt_with_accel) - on_arch_match = re.match(on_arch_re_with_accel, comment_body[1]) + + url = f'https://api.github.com/repos/{repo_name}/issues/{pr_number}/comments' + all_comments = [] + + # call get_instance() to obtain a (new) token (accessible via github.token().token) + # get_instance ensures that the token is renewed if the current one is no + # longer valid or valid for less than 30 minutes + _ = github.get_instance() + try: + while url: + headers = { + 'Authorization': f'Bearer {github.token().token}', + 'Accept': 'application/vnd.github+json', + 'X-GitHub-Api-Version': '2022-11-28' + } + + response = requests.get(url, headers=headers, params={'per_page': 100}, timeout=api_timeout) + response.raise_for_status() + + all_comments.extend(response.json()) + # get next URL from Link header in response (we are done if that is empty) + url = response.links.get('next', {}).get('url') + log(f"{fn}(): more comments? 
{url!r}") + reset_time = int(response.headers.get('X-RateLimit-Reset')) + utc_time = datetime.fromtimestamp(reset_time, tz=timezone.utc) + time_left = int(reset_time - time.time()) + log(f"{fn}(): limits with token '{github.token().token[:4]}...':\n" + f" rate limit.: {response.headers.get('X-RateLimit-Limit')}\n" + f" remaining..: {response.headers.get('X-RateLimit-Remaining')}\n" + f" reset limit: {utc_time.strftime('%b %d %I:%M:%S %p UTC %Y')} (in {time_left} seconds)\n" + ) + + except Exception as err: + log(f"{fn}(): obtaining comments for PR {pr_number} in repo {repo_name!r} failed: {err}") + return status_table + + for comment in all_comments: + # iterate through the comments to find the one where the status of the build was in + submitted_job_comments_section = cfg[config.SECTION_SUBMITTED_JOB_COMMENTS] + accelerator_fmt = submitted_job_comments_section[config.SUBMITTED_JOB_COMMENTS_SETTING_WITH_ACCELERATOR] + instance_repo_fmt = submitted_job_comments_section[config.SUBMITTED_JOB_COMMENTS_SETTING_INSTANCE_REPO] + instance_repo_re = template_to_regex(instance_repo_fmt) + comment_body = comment['body'].split('\n') + instance_repo_match = re.match(instance_repo_re, comment_body[0]) + # Check if this body starts with an initial comment from the bot (first item is always the instance + repo + # it is building for) + # Then, check that it has at least 4 lines so that we can safely index up to that number + if instance_repo_match and len(comment_body) >= 4: + # Set some defaults + repo_id = "" + on_arch = "" + for_arch = "" + date = "" + status = "" + url = "" + result = "" + + log(f"{fn}(): found bot build response in issue, processing...") + + # First, extract the repo_id + log(f"{fn}(): found build for repository: {instance_repo_match.group('repo_id')}") + repo_id = instance_repo_match.group('repo_id') + + # Then, try to match the architecture we build on. 
+ # First try this including accelerator, to see if one was defined + on_arch_fmt = submitted_job_comments_section[config.SUBMITTED_JOB_COMMENTS_SETTING_BUILD_ON_ARCH] + on_arch_fmt_with_accel = on_arch_fmt.format_map(PartialFormatDict(on_accelerator=accelerator_fmt)) + on_arch_re_with_accel = template_to_regex(on_arch_fmt_with_accel) + on_arch_match = re.match(on_arch_re_with_accel, comment_body[1]) + if on_arch_match: + # Pattern with accelerator matched, append to status_table + log(f"{fn}(): found build on architecture: {on_arch_match.group('on_arch')}, " + f"with accelerator {on_arch_match.group('accelerator')}") + on_arch = f"`{on_arch_match.group('on_arch')}`, `{on_arch_match.group('accelerator')}`" + else: + # Pattern with accelerator did not match, retry without accelerator + on_arch_re = template_to_regex(on_arch_fmt) + on_arch_match = re.match(on_arch_re, comment_body[1]) if on_arch_match: - # Pattern with accelerator matched, append to status_table - log(f"{fn}(): found build on architecture: {on_arch_match.group('on_arch')}, " - f"with accelerator {on_arch_match.group('accelerator')}") - on_arch = f"`{on_arch_match.group('on_arch')}`, `{on_arch_match.group('accelerator')}`" + # Pattern without accelerator matched, append to status_table + log(f"{fn}(): found build on architecture: {on_arch_match.group('on_arch')}") + on_arch = f"`{on_arch_match.group('on_arch')}`" else: - # Pattern with accelerator did not match, retry without accelerator - on_arch_re = template_to_regex(on_arch_fmt) - on_arch_match = re.match(on_arch_re, comment_body[1]) - if on_arch_match: - # Pattern without accelerator matched, append to status_table - log(f"{fn}(): found build on architecture: {on_arch_match.group('on_arch')}") - on_arch = f"`{on_arch_match.group('on_arch')}`" - else: - # This shouldn't happen: we had an instance_repo_match, but no match for the 'on architecture' - msg = "Could not match regular expression for extracting the architecture to build on.\n" - msg += 
"String to be matched:\n" - msg += f"{comment_body[1]}\n" - msg += "First regex attempted:\n" - msg += f"{on_arch_re_with_accel.pattern}\n" - msg += "Second regex attempted:\n" - msg += f"{on_arch_re.pattern}\n" - raise ValueError(msg) - - # Now, do the same for the architecture we build for. I.e. first, try to match including accelerator - for_arch_fmt = submitted_job_comments_section[config.SUBMITTED_JOB_COMMENTS_SETTING_BUILD_FOR_ARCH] - for_arch_fmt_with_accel = for_arch_fmt.format_map(PartialFormatDict(for_accelerator=accelerator_fmt)) - for_arch_re_with_accel = template_to_regex(for_arch_fmt_with_accel) - for_arch_match = re.match(for_arch_re_with_accel, comment_body[2]) + # This shouldn't happen: we had an instance_repo_match, but no match for the 'on architecture' + msg = "Could not match regular expression for extracting the architecture to build on.\n" + msg += "String to be matched:\n" + msg += f"{comment_body[1]}\n" + msg += "First regex attempted:\n" + msg += f"{on_arch_re_with_accel.pattern}\n" + msg += "Second regex attempted:\n" + msg += f"{on_arch_re.pattern}\n" + raise ValueError(msg) + + # Now, do the same for the architecture we build for. I.e. 
first, try to match including accelerator + for_arch_fmt = submitted_job_comments_section[config.SUBMITTED_JOB_COMMENTS_SETTING_BUILD_FOR_ARCH] + for_arch_fmt_with_accel = for_arch_fmt.format_map(PartialFormatDict(for_accelerator=accelerator_fmt)) + for_arch_re_with_accel = template_to_regex(for_arch_fmt_with_accel) + for_arch_match = re.match(for_arch_re_with_accel, comment_body[2]) + if for_arch_match: + # Pattern with accelerator matched, append to status_table + log(f"{fn}(): found build for architecture: {for_arch_match.group('for_arch')}, " + f"with accelerator {for_arch_match.group('accelerator')}") + for_arch = f"`{for_arch_match.group('for_arch')}`, `{for_arch_match.group('accelerator')}`" + else: + # Pattern with accelerator did not match, retry without accelerator + for_arch_re = template_to_regex(for_arch_fmt) + for_arch_match = re.match(for_arch_re, comment_body[2]) if for_arch_match: - # Pattern with accelerator matched, append to status_table - log(f"{fn}(): found build for architecture: {for_arch_match.group('for_arch')}, " - f"with accelerator {for_arch_match.group('accelerator')}") - for_arch = f"`{for_arch_match.group('for_arch')}`, `{for_arch_match.group('accelerator')}`" + # Pattern without accelerator matched, append to status_table + log(f"{fn}(): found build for architecture: {for_arch_match.group('for_arch')}") + for_arch = f"`{for_arch_match.group('for_arch')}`" else: - # Pattern with accelerator did not match, retry without accelerator - for_arch_re = template_to_regex(for_arch_fmt) - for_arch_match = re.match(for_arch_re, comment_body[2]) - if for_arch_match: - # Pattern without accelerator matched, append to status_table - log(f"{fn}(): found build for architecture: {for_arch_match.group('for_arch')}") - for_arch = f"`{for_arch_match.group('for_arch')}`" - else: - # This shouldn't happen: we had an instance_repo_match, but no match for the 'on architecture' - msg = "Could not match regular expression for extracting the architecture to 
build for.\n" - msg += "String to be matched:\n" - msg += f"{comment_body[2]}\n" - msg += "First regex attempted:\n" - msg += f"{for_arch_re_with_accel.pattern}\n" - msg += "Second regex attempted:\n" - msg += f"{for_arch_re.pattern}\n" - raise ValueError(msg) - - # get date, status, url and result from the markdown table - comment_table = comment['body'][comment['body'].find('|'):comment['body'].rfind('|')+1] - - # Convert markdown table to a dictionary - lines = comment_table.split('\n') - rows = [] - keys = [] - for i, row in enumerate(lines): - values = {} - if i == 0: - for key in row.split('|'): - keys.append(key.strip()) - elif i == 1: - continue + # This shouldn't happen: we had an instance_repo_match, but no match for the 'on architecture' + msg = "Could not match regular expression for extracting the architecture to build for.\n" + msg += "String to be matched:\n" + msg += f"{comment_body[2]}\n" + msg += "First regex attempted:\n" + msg += f"{for_arch_re_with_accel.pattern}\n" + msg += "Second regex attempted:\n" + msg += f"{for_arch_re.pattern}\n" + raise ValueError(msg) + + # get date, status, url and result from the markdown table + comment_table = comment['body'][comment['body'].find('|'):comment['body'].rfind('|')+1] + + # Convert markdown table to a dictionary + lines = comment_table.split('\n') + rows = [] + keys = [] + for i, row in enumerate(lines): + values = {} + if i == 0: + for key in row.split('|'): + keys.append(key.strip()) + elif i == 1: + continue + else: + for j, value in enumerate(row.split('|')): + if j > 0 and j < len(keys) - 1: + values[keys[j]] = value.strip() + rows.append(values) + + # add date, status, url to status_table if + for row in rows: + if row['job status'] == 'finished': + date = row['date'] + status = row['job status'] + url = comment['html_url'] + if 'FAILURE' in row['comment']: + result = ':cry: FAILURE' + elif 'SUCCESS' in row['comment']: + result = ':grin: SUCCESS' + elif 'UNKNOWN' in row['comment']: + result = 
':shrug: UNKNOWN' else: - for j, value in enumerate(row.split('|')): - if j > 0 and j < len(keys) - 1: - values[keys[j]] = value.strip() - rows.append(values) - - # add date, status, url to status_table if - for row in rows: - if row['job status'] == 'finished': - date = row['date'] - status = row['job status'] - url = comment['html_url'] - if 'FAILURE' in row['comment']: - result = ':cry: FAILURE' - elif 'SUCCESS' in row['comment']: - result = ':grin: SUCCESS' - elif 'UNKNOWN' in row['comment']: - result = ':shrug: UNKNOWN' - else: - result = row['comment'] - elif row['job status'] in ['submitted', 'received', 'running']: - # Make sure that if the job is not finished yet, we also put something useful in these fields - # It is useful to know a job is submitted, running, etc - date = row['date'] - status = row['job status'] - url = comment['html_url'] result = row['comment'] - else: - # Don't do anything for the test line for now - we might add an extra entry to the status - # table later to reflect the test result - continue - - # Add all entries to status_table. We do this at the end of this loop so that the operation is - # more or less 'atomic', i.e. 
all vectors in the status_table dict have the same length - status_table['for repo'].append(repo_id) - status_table['on arch'].append(on_arch) - status_table['for arch'].append(for_arch) - status_table['date'].append(date) - status_table['status'].append(status) - status_table['url'].append(url) - status_table['result'].append(result) - - if len(comments) != 100: - break + elif row['job status'] in ['submitted', 'received', 'running']: + # Make sure that if the job is not finished yet, we also put something useful in these fields + # It is useful to know a job is submitted, running, etc + date = row['date'] + status = row['job status'] + url = comment['html_url'] + result = row['comment'] + else: + # Don't do anything for the test line for now - we might add an extra entry to the status + # table later to reflect the test result + continue + + # Add all entries to status_table. We do this at the end of this loop so that the operation is + # more or less 'atomic', i.e. all vectors in the status_table dict have the same length + status_table['for repo'].append(repo_id) + status_table['on arch'].append(on_arch) + status_table['for arch'].append(for_arch) + status_table['date'].append(date) + status_table['status'].append(status) + status_table['url'].append(url) + status_table['result'].append(result) + return status_table + + +def get_job_ids(action_filter): + """ + Gets and validates 'jobid:' arguments. 
+ + Args: + action_filter (EESSIBotActionFilter): Instance containing 'jobid:' arguments + + Returns: + job_ids (list): valid 'jobid:' arguments + """ + fn = sys._getframe().f_code.co_name + + # Get 'jobid:' arguments + job_filter = action_filter.get_filter_by_component(tools_filter.FILTER_COMPONENT_JOBID) + if not job_filter: + log(f"{fn}(): 'bot: cancel' command needs at least one 'jobid:' argument.") + return [] + + # Validate job IDs + job_ids = [] + for job_id in job_filter: + try: + if int(job_id) > 0: + job_ids.append(job_id) + else: + log(f"{fn}(): Invalid job ID: '{job_id}'") + except Exception as err: + log(f"{fn}(): Invalid job ID: {err}") + + return job_ids + + +def get_work_dirs(job_ids, cfg): + """ + Gets working directories of build jobs. + + Args: + job_ids (list): list of job_ids to check. + cfg (ConfigParser): Instance containing full configuration from app.cfg + + Returns: + work_dirs (dict): dict mapping each job_id to its work_dir + """ + poll_command = cfg[config.SECTION_JOB_MANAGER][config.JOB_MANAGER_SETTING_POLL_COMMAND] + + # squeue only the given job IDs + cs_jobs = ",".join(job_ids) + command_line = f"{poll_command} --noheader --Format=JobId:0@,WorkDir:0 --job={cs_jobs}" + out, err, exit_code = run_cmd(command_line, "Get WorkDirs of jobs") + + # All output lines are formatted as '{job_id}@{work_dir}' + work_dirs = {} + for line in out.split("\n"): + job = [field.strip() for field in line.split("@")] + if len(job) != 2: + continue + work_dirs[job[0]] = job[1] + + return work_dirs + + +def cancel_jobs(jobs, user, pr, cfg): + """ + Cancels the given build jobs. 
+
+    Args:
+        jobs (dict): dictionary mapping each job_id to cancel to its work_dir
+        user (str): The user who sent the 'bot: cancel' command
+        pr (github.PullRequest.PullRequest): instance representing the pull request
+        cfg (ConfigParser): Instance containing full configuration from app.cfg
+
+    Returns:
+        cancelled_jobs (list): job_ids of successfully cancelled jobs
+    """
+    fn = sys._getframe().f_code.co_name
+
+    buildenv = get_build_env_cfg(cfg)
+    cancel_command = buildenv[config.BUILDENV_SETTING_CANCEL_COMMAND]
+
+    cancelled_jobs = []
+    for job_id, work_dir in jobs.items():
+        # Get job owner and PR comment ID from metadata
+        metadata_path = os.path.join(work_dir, f"_bot_job{job_id}.metadata")
+        metadata = job_metadata.get_section_from_file(
+            filepath=metadata_path,
+            section=job_metadata.JOB_PR_SECTION,
+        ) or {}
+        job_owner = metadata.get(job_metadata.JOB_PR_JOB_OWNER)
+        pr_comment_id = metadata.get(job_metadata.JOB_PR_PR_COMMENT_ID)
+
+        # Only the job owner should be able to cancel a job
+        if job_owner != user:
+            log(f"{fn}(): User {user} did not start job {job_id} - skipping cancellation")
+            continue
+        log(f"{fn}(): Job {job_id} was started by user {user} - cancelling job")
+
+        # Cancel job
+        command_line = f"{cancel_command} --verbose {job_id}"
+        out, err, exit_code = run_cmd(command_line, f"cancel job {job_id}", raise_on_error=False)
+
+        # Check if command was successful
+        if exit_code != 0:
+            log(f"{fn}(): scancel resulted in a non-zero exit code for job {job_id}.")
+            continue
+        if any([line.startswith("scancel: error: ") for line in err.split("\n")]):
+            log(f"{fn}(): Unable to cancel job {job_id}.")
+            continue
+
+        log(f"{fn}(): Cancelled job {job_id}")
+
+        # Update job status table
+        dt = datetime.now(timezone.utc)
+        update = f"\n|{dt.strftime('%b %d %X %Z %Y')}|finished|job id `{job_id}` was cancelled|"
+        update_comment(int(pr_comment_id), pr, update)
+
+        cancelled_jobs.append(job_id)
+
+    return cancelled_jobs
diff --git a/tests/test_bot_job123.metadata 
b/tests/test_bot_job123.metadata index 29f8965d..62010c4d 100644 --- a/tests/test_bot_job123.metadata +++ b/tests/test_bot_job123.metadata @@ -2,4 +2,5 @@ repo = test_repo pr_number = 999 pr_comment_id = 77 +job_owner = user01 diff --git a/tests/test_task_build.py b/tests/test_task_build.py index af49ac9b..fcb9f428 100644 --- a/tests/test_task_build.py +++ b/tests/test_task_build.py @@ -287,7 +287,7 @@ def test_create_pr_comment_succeeds(monkeypatch, mocked_github, tmpdir): print("CREATING PR COMMENT") ym = datetime.today().strftime('%Y.%m') pr_number = 1 - job = Job(tmpdir, "test/architecture", "EESSI", "--speed-up", ym, pr_number, "fpga/magic") + job = Job(tmpdir, "test/architecture", "EESSI", "--speed-up", ym, pr_number, "fpga/magic", "user01") build_params = EESSIBotBuildParams("arch=amd/zen4,accel=nvidia/cc90") job_id = "123" @@ -318,7 +318,7 @@ def test_create_pr_comment_succeeds_none(monkeypatch, mocked_github, tmpdir): print("CREATING PR COMMENT") ym = datetime.today().strftime('%Y.%m') pr_number = 1 - job = Job(tmpdir, "test/architecture", "EESSI", "--speed-up", ym, pr_number, "fpga/magic") + job = Job(tmpdir, "test/architecture", "EESSI", "--speed-up", ym, pr_number, "fpga/magic", "user01") build_params = EESSIBotBuildParams("arch=amd/zen4,accel=nvidia/cc90") job_id = "123" @@ -345,7 +345,7 @@ def test_create_pr_comment_raises_once_then_succeeds(monkeypatch, mocked_github, print("CREATING PR COMMENT") ym = datetime.today().strftime('%Y.%m') pr_number = 1 - job = Job(tmpdir, "test/architecture", "EESSI", "--speed-up", ym, pr_number, "fpga/magic") + job = Job(tmpdir, "test/architecture", "EESSI", "--speed-up", ym, pr_number, "fpga/magic", "user01") build_params = EESSIBotBuildParams("arch=amd/zen4,accel=nvidia/cc90") job_id = "123" @@ -372,7 +372,7 @@ def test_create_pr_comment_always_raises(monkeypatch, mocked_github, tmpdir): print("CREATING PR COMMENT") ym = datetime.today().strftime('%Y.%m') pr_number = 1 - job = Job(tmpdir, "test/architecture", 
"EESSI", "--speed-up", ym, pr_number, "fpga/magic") + job = Job(tmpdir, "test/architecture", "EESSI", "--speed-up", ym, pr_number, "fpga/magic", "user01") build_params = EESSIBotBuildParams("arch=amd/zen4,accel=nvidia/cc90") job_id = "123" @@ -400,7 +400,7 @@ def test_create_pr_comment_three_raises(monkeypatch, mocked_github, tmpdir): print("CREATING PR COMMENT") ym = datetime.today().strftime('%Y.%m') pr_number = 1 - job = Job(tmpdir, "test/architecture", "EESSI", "--speed-up", ym, pr_number, "fpga/magic") + job = Job(tmpdir, "test/architecture", "EESSI", "--speed-up", ym, pr_number, "fpga/magic", "user01") build_params = EESSIBotBuildParams("arch=amd/zen4,accel=nvidia/cc90") job_id = "123" @@ -423,7 +423,7 @@ def test_create_read_metadata_file(mocked_github, tmpdir): # create some test data ym = datetime.today().strftime('%Y.%m') pr_number = 999 - job = Job(tmpdir, "test/architecture", "EESSI", "--speed_up_job", ym, pr_number, "fpga/magic") + job = Job(tmpdir, "test/architecture", "EESSI", "--speed_up_job", ym, pr_number, "fpga/magic", "user01") job_id = "123" @@ -441,6 +441,7 @@ def test_create_read_metadata_file(mocked_github, tmpdir): # repo = test_repo # pr_number = 999 # pr_comment_id = 77 + # job_owner = user01 test_file = "tests/test_bot_job123.metadata" assert filecmp.cmp(expected_file_path, test_file, shallow=False) @@ -450,18 +451,21 @@ def test_create_read_metadata_file(mocked_github, tmpdir): assert metadata["PR"]["repo"] == "test_repo" assert metadata["PR"]["pr_number"] == "999" assert metadata["PR"]["pr_comment_id"] == "77" - assert sorted(metadata["PR"].keys()) == ["pr_comment_id", "pr_number", "repo"] + assert metadata["PR"]["job_owner"] == "user01" + assert sorted(metadata["PR"].keys()) == ["job_owner", "pr_comment_id", "pr_number", "repo"] # use directory that does not exist dir_does_not_exist = os.path.join(tmpdir, "dir_does_not_exist") - job2 = Job(dir_does_not_exist, "test/architecture", "EESSI", "--speed_up_job", ym, pr_number, "fpga/magic") 
+ job2 = Job(dir_does_not_exist, "test/architecture", "EESSI", "--speed_up_job", ym, pr_number, "fpga/magic", + "user01") job_id2 = "222" with pytest.raises(FileNotFoundError): create_metadata_file(job2, job_id2, pr_comment) # use directory without write permission dir_without_write_perm = os.path.join("/") - job3 = Job(dir_without_write_perm, "test/architecture", "EESSI", "--speed_up_job", ym, pr_number, "fpga/magic") + job3 = Job(dir_without_write_perm, "test/architecture", "EESSI", "--speed_up_job", ym, pr_number, "fpga/magic", + "user01") job_id3 = "333" with pytest.raises(OSError): create_metadata_file(job3, job_id3, pr_comment) @@ -471,7 +475,7 @@ def test_create_read_metadata_file(mocked_github, tmpdir): # use undefined values for parameters # job_id = None - job4 = Job(tmpdir, "test/architecture", "EESSI", "--speed_up_job", ym, pr_number, "fpga/magic") + job4 = Job(tmpdir, "test/architecture", "EESSI", "--speed_up_job", ym, pr_number, "fpga/magic", "user01") job_id4 = None create_metadata_file(job4, job_id4, pr_comment) @@ -486,7 +490,7 @@ def test_create_read_metadata_file(mocked_github, tmpdir): # use undefined values for parameters # job.working_dir = None - job5 = Job(None, "test/architecture", "EESSI", "--speed_up_job", ym, pr_number, "fpga/magic") + job5 = Job(None, "test/architecture", "EESSI", "--speed_up_job", ym, pr_number, "fpga/magic", "user01") job_id5 = "555" with pytest.raises(TypeError): create_metadata_file(job5, job_id5, pr_comment) diff --git a/tools/config.py b/tools/config.py index 7f814ea4..10a7590d 100644 --- a/tools/config.py +++ b/tools/config.py @@ -44,6 +44,7 @@ BUILDENV_SETTING_BUILD_JOB_SCRIPT = 'build_job_script' BUILDENV_SETTING_BUILD_LOGS_DIR = 'build_logs_dir' BUILDENV_SETTING_BUILD_PERMISSION = 'build_permission' +BUILDENV_SETTING_CANCEL_COMMAND = 'cancel_command' BUILDENV_SETTING_CLONE_GIT_REPO_VIA = 'clone_git_repo_via' BUILDENV_SETTING_CONTAINER_CACHEDIR = 'container_cachedir' BUILDENV_SETTING_CVMFS_CUSTOMIZATIONS = 
'cvmfs_customizations' @@ -95,6 +96,7 @@ FINISHED_JOB_COMMENTS_SETTING_JOB_TEST_UNKNOWN_FMT = 'job_test_unknown_fmt' SECTION_GITHUB = 'github' +GITHUB_SETTING_API_TIMEOUT = 'api_timeout' GITHUB_SETTING_APP_ID = 'app_id' GITHUB_SETTING_APP_NAME = 'app_name' GITHUB_SETTING_INSTALLATION_ID = 'installation_id' diff --git a/tools/filter.py b/tools/filter.py index ddc58352..54e0f5e3 100644 --- a/tools/filter.py +++ b/tools/filter.py @@ -27,13 +27,13 @@ FILTER_COMPONENT_ARCH = 'architecture' FILTER_COMPONENT_EXPORT = 'exportvariable' FILTER_COMPONENT_INST = 'instance' -FILTER_COMPONENT_JOB = 'job' +FILTER_COMPONENT_JOBID = 'jobid' FILTER_COMPONENT_REPO = 'repository' FILTER_COMPONENTS = [FILTER_COMPONENT_ACCEL, FILTER_COMPONENT_ARCH, FILTER_COMPONENT_EXPORT, FILTER_COMPONENT_INST, - FILTER_COMPONENT_JOB, + FILTER_COMPONENT_JOBID, FILTER_COMPONENT_REPO ] diff --git a/tools/job_metadata.py b/tools/job_metadata.py index f5ee21ce..e4031faf 100644 --- a/tools/job_metadata.py +++ b/tools/job_metadata.py @@ -63,6 +63,7 @@ JOB_PR_REPO = "repo" JOB_PR_PR_NUMBER = "pr_number" JOB_PR_PR_COMMENT_ID = "pr_comment_id" +JOB_PR_JOB_OWNER = "job_owner" # JWD/_bot_jobJOBID.result JOB_RESULT_SECTION = "RESULT" @@ -99,12 +100,14 @@ def create_metadata_file(job, job_id, pr_comment): repo_name = pr_comment.repo_name pr_number = pr_comment.pr_number pr_comment_id = pr_comment.pr_comment_id + job_owner = job.owner # create _bot_job.metadata file in the job's working directory bot_jobfile = configparser.ConfigParser() bot_jobfile[JOB_PR_SECTION] = {'repo': repo_name, 'pr_number': pr_number, - 'pr_comment_id': pr_comment_id} + 'pr_comment_id': pr_comment_id, + 'job_owner': job_owner} bot_jobfile_path = os.path.join(job.working_dir, f'_bot_job{job_id}.metadata') with open(bot_jobfile_path, 'w') as bjf: bot_jobfile.write(bjf)