Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -548,6 +548,12 @@ submit_command = /usr/bin/sbatch

`submit_command` is the full path to the Slurm command used for submitting batch jobs. Verify that `sbatch` exists at that path, or determine its actual location with `which sbatch`.

```ini
cancel_command = /usr/bin/scancel
```

`cancel_command` is the full path to the Slurm command used for cancelling batch jobs. Verify that `scancel` exists at that path, or determine its actual location with `which scancel`.

```ini
build_permission = -NOT_ALLOWED_GH_ACCOUNT_NAME- [...]
```
Expand Down
3 changes: 3 additions & 0 deletions app.cfg.example
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,9 @@ slurm_params = --hold
# full path to the job submission command
submit_command = /usr/bin/sbatch

# full path to the job cancellation command
cancel_command = /usr/bin/scancel

# defines which GitHub accounts have the permission to trigger
# build jobs, i.e., for which accounts the bot acts on `bot: build ...`
# commands. If the value is left empty, everyone can trigger build jobs.
Expand Down
63 changes: 60 additions & 3 deletions eessi_bot_event_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@

# Local application imports (anything from EESSI/eessi-bot-software-layer)
from connections import github
from tasks.build import check_build_permission, get_node_types, request_bot_build_issue_comments, \
submit_build_jobs
from tasks.build import cancel_jobs, check_build_permission, get_job_ids, get_node_types, \
get_work_dirs, request_bot_build_issue_comments, submit_build_jobs
from tasks.deploy import deploy_built_artefacts, determine_job_dirs
from tasks.clean_up import move_to_trash_bin
from tools import config
Expand All @@ -53,6 +53,7 @@
config.BUILDENV_SETTING_BUILD_JOB_SCRIPT, # required
config.BUILDENV_SETTING_BUILD_LOGS_DIR, # optional+recommended
config.BUILDENV_SETTING_BUILD_PERMISSION, # optional+recommended
config.BUILDENV_SETTING_CANCEL_COMMAND, # required
config.BUILDENV_SETTING_CONTAINER_CACHEDIR, # optional+recommended
# config.BUILDENV_SETTING_CLONE_GIT_REPO_VIA, # optional
# config.BUILDENV_SETTING_CVMFS_CUSTOMIZATIONS, # optional
Expand Down Expand Up @@ -102,6 +103,7 @@
# the poll interval setting is required for the alternative job handover
# protocol (delayed_begin)
config.SECTION_JOB_MANAGER: [
config.JOB_MANAGER_SETTING_POLL_COMMAND, # required
config.JOB_MANAGER_SETTING_POLL_INTERVAL], # required
config.SECTION_REPO_TARGETS: [
config.REPO_TARGETS_SETTING_REPOS_CFG_DIR], # required
Expand Down Expand Up @@ -507,7 +509,7 @@ def handle_bot_command_help(self, event_info, bot_command):
help_msg += "\n - Commands must be sent with a **new** comment (edits of existing comments are ignored)."
help_msg += "\n - A comment may contain multiple commands, one per line."
help_msg += "\n - Every command begins at the start of a line and has the syntax `bot: COMMAND [ARGUMENTS]*`"
help_msg += "\n - Currently supported COMMANDs are: `help`, `build`, `show_config`, `status`"
help_msg += "\n - Currently supported COMMANDs are: `help`, `build`, `show_config`, `status`, `cancel`"
Comment thread
trz42 marked this conversation as resolved.
help_msg += "\n"
help_msg += "\n For more information, see https://www.eessi.io/docs/bot"
return help_msg
Expand Down Expand Up @@ -679,6 +681,61 @@ def handle_bot_command_status(self, event_info, bot_command):
else:
return "\n - failed to create status comment"

def handle_bot_command_cancel(self, event_info, bot_command):
    """
    Handles bot command 'cancel' by parsing 'jobid:' arguments and
    cancelling the jobs.

    Processing stops early (returning an explanatory message) if the
    commenting user has no build permission, if no valid job IDs were
    given, or if no working directory could be determined for any of the
    given job IDs.

    Args:
        event_info (dict): event received by event_handler
        bot_command (EESSIBotCommand): command to be handled

    Returns:
        comment (string): list of cancelled jobs if any, error message if not
    """
    self.log("processing bot command 'cancel'")

    request_body = event_info["raw_request_body"]
    repo_name = request_body["repository"]["full_name"]
    pr_number = request_body["issue"]["number"]
    user = request_body["comment"]["user"]["login"]

    gh = github.get_instance()
    pr = gh.get_repo(repo_name).get_pull(pr_number)

    # Jobs can only be cancelled by the user who submitted the job
    # -> No need to proceed if user cannot submit jobs
    if not check_build_permission(pr, event_info):
        self.log(f"User '{user}' does not have build permission - skipping cancellation.")
        return f"\n - User '{user}' cannot submit build jobs."

    # Get valid 'jobid:' arguments
    job_ids = get_job_ids(bot_command.action_filters)
    if not job_ids:
        self.log("Got no valid job IDs")
        return "\n - No valid job IDs were given."

    # Get working directories of jobs
    work_dirs = get_work_dirs(job_ids, self.cfg)
    if not work_dirs:
        self.log("None of the given jobs are cancellable")
        return "\n - No cancellable jobs were given."

    # Log skipped jobs (use the same logger as the rest of this method)
    for job_id in job_ids:
        if job_id not in work_dirs:
            self.log(f"Skipping job {job_id} - not found")

    # Cancel jobs; only IDs for which a working directory was found are passed on
    cancelled_jobs = cancel_jobs(work_dirs, user, pr, self.cfg)
    if not cancelled_jobs:
        return "\n - No jobs were cancelled."

    # One bullet per cancelled job, in the order returned by cancel_jobs
    return "".join(f"\n - cancelled job `{job_id}`" for job_id in cancelled_jobs)

def start(self, app, port=3000):
"""
Logs startup information to shell and log file and starts the app using
Expand Down
134 changes: 131 additions & 3 deletions tasks/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
# Local application imports (anything from EESSI/eessi-bot-software-layer)
from tools import config, cvmfs_repository, job_metadata, pr_comments, run_cmd
import tools.filter as tools_filter
from tools.pr_comments import ChatLevels, create_comment
from tools.pr_comments import ChatLevels, create_comment, update_comment
from tools.build_params import BUILD_PARAM_ARCH, BUILD_PARAM_ACCEL

# defaults (used if not specified via, eg, 'app.cfg')
Expand All @@ -51,7 +51,9 @@
# other constants
EXPORT_VARS_FILE = 'export_vars.sh'

Job = namedtuple('Job', ('working_dir', 'arch_target', 'repo_id', 'slurm_opts', 'year_month', 'pr_id', 'accelerator'))

Job = namedtuple('Job',
('working_dir', 'arch_target', 'repo_id', 'slurm_opts', 'year_month', 'pr_id', 'accelerator', 'owner'))

# global repo_cfg
repo_cfg = {}
Expand Down Expand Up @@ -108,6 +110,10 @@ def get_build_env_cfg(cfg):
log(f"{fn}(): submit_command '{submit_command}'")
config_data[config.BUILDENV_SETTING_SUBMIT_COMMAND] = submit_command

cancel_command = buildenv.get(config.BUILDENV_SETTING_CANCEL_COMMAND)
log(f"{fn}(): cancel_command '{cancel_command}'")
config_data[config.BUILDENV_SETTING_CANCEL_COMMAND] = cancel_command

job_handover_protocol = buildenv.get(config.BUILDENV_SETTING_JOB_HANDOVER_PROTOCOL)
slurm_params = buildenv.get(config.BUILDENV_SETTING_SLURM_PARAMS)
if job_handover_protocol == config.JOB_HANDOVER_PROTOCOL_HOLD_RELEASE:
Expand Down Expand Up @@ -582,6 +588,8 @@ def prepare_jobs(pr, cfg, event_info, action_filter, build_params):
base_branch_name = pr.base.ref
log(f"{fn}(): pr.base.repo.ref '{base_branch_name}'")

job_owner = event_info['raw_request_body']['sender']['login']

# create run dir (base directory for potentially several jobs)
# TODO may still be too early (before we get to any actual job being
# prepared below when calling 'download_pr')
Expand Down Expand Up @@ -689,7 +697,7 @@ def prepare_jobs(pr, cfg, event_info, action_filter, build_params):

# enlist jobs to proceed
job = Job(job_dir, partition_info['cpu_subdir'], repo_id, partition_info['slurm_params'], year_month,
pr_id, accelerator)
pr_id, accelerator, job_owner)
jobs.append(job)

log(f"{fn}(): {len(jobs)} jobs to proceed after applying white list")
Expand Down Expand Up @@ -1358,3 +1366,123 @@ def request_bot_build_issue_comments(repo_name, pr_number):
if len(comments) != 100:
break
return status_table


def get_job_ids(action_filter):
    """
    Gets and validates 'jobid:' arguments.

    A job ID is considered valid if it consists solely of the ASCII digits
    0-9 and denotes a positive integer. Anything else is logged and
    dropped. Note that 'int()' alone is too permissive for this check: it
    also accepts surrounding whitespace, a leading sign, underscores
    between digits and non-ASCII digits, none of which should be passed on
    to a command line.

    Args:
        action_filter (EESSIBotActionFilter): Instance containing 'jobid:' arguments

    Returns:
        job_ids (list): valid 'jobid:' arguments as strings in canonical
            decimal form (no leading zeros), in the order they were given
    """
    fn = sys._getframe().f_code.co_name

    # Get 'jobid:' arguments
    job_filter = action_filter.get_filter_by_component(tools_filter.FILTER_COMPONENT_JOBID)
    if not job_filter:
        log(f"{fn}(): 'bot: cancel' command needs at least one 'jobid:' argument.")
        return []

    # Validate job IDs
    job_ids = []
    for job_id in job_filter:
        candidate = str(job_id)
        only_ascii_digits = bool(candidate) and all(ch in "0123456789" for ch in candidate)
        if only_ascii_digits and int(candidate) > 0:
            # canonical form so the ID compares equal to what the scheduler reports
            job_ids.append(str(int(candidate)))
        else:
            log(f"{fn}(): Invalid job ID: '{job_id}'")

    return job_ids


def get_work_dirs(job_ids, cfg):
    """
    Gets working directories of build jobs.

    Runs the configured poll command once for all given job IDs and parses
    its output, which is requested in the form '{job_id}@{work_dir}' (one
    job per line). Job IDs for which no such line is printed are simply
    absent from the result.

    Args:
        job_ids (list): list of job_ids (strings) to check.
        cfg (ConfigParser): Instance containing full configuration from app.cfg

    Returns:
        work_dirs (dict): dict mapping each job_id to its work_dir
    """
    fn = sys._getframe().f_code.co_name

    # nothing to look up; avoids running the poll command with an empty '--job=' value
    if not job_ids:
        return {}

    poll_command = cfg[config.SECTION_JOB_MANAGER][config.JOB_MANAGER_SETTING_POLL_COMMAND]

    # squeue only the given job IDs
    cs_jobs = ",".join(job_ids)
    command_line = f"{poll_command} --noheader --Format=JobId:0@,WorkDir:0 --job={cs_jobs}"
    # NOTE(review): assumes run_cmd raises on a non-zero exit code unless told
    # otherwise (cancel_jobs passes raise_on_error=False for that reason);
    # a failing poll command is reported as "no working directories found"
    # so the caller can answer with a regular message
    out, err, exit_code = run_cmd(command_line, "Get WorkDirs of jobs", raise_on_error=False)
    if exit_code != 0:
        log(f"{fn}(): '{command_line}' exited with code {exit_code}: {err}")

    # All output lines are formatted as '{job_id}@{work_dir}'
    work_dirs = {}
    for line in (out or "").splitlines():
        # split at the first '@' only: the working directory itself may contain '@'
        job_id, separator, work_dir = line.partition("@")
        job_id = job_id.strip()
        work_dir = work_dir.strip()
        if not separator or not job_id or not work_dir:
            continue
        work_dirs[job_id] = work_dir

    return work_dirs


def cancel_jobs(jobs, user, pr, cfg):
    """
    Cancels the given build jobs.

    For each job, the owner and the ID of the PR comment holding the job's
    status table are read from the job's metadata file
    ('_bot_job<job_id>.metadata' in the job's working directory). A job is
    only cancelled if the recorded owner equals 'user'. After a successful
    cancellation a row is appended to the job's status table.

    Args:
        jobs (dict): dictionary mapping each job_id to cancel to its work_dir
        user (str): The user who sent the 'bot: cancel' command
        pr (github.PullRequest.PullRequest): instance representing the pull request
        cfg (ConfigParser): Instance containing full configuration from app.cfg

    Returns:
        cancelled_jobs (list): job_ids of successfully cancelled jobs
    """
    fn = sys._getframe().f_code.co_name

    buildenv = get_build_env_cfg(cfg)
    cancel_command = buildenv[config.BUILDENV_SETTING_CANCEL_COMMAND]

    cancelled_jobs = []
    for job_id, work_dir in jobs.items():
        # Get job owner and PR comment ID from metadata
        metadata_path = os.path.join(work_dir, f"_bot_job{job_id}.metadata")
        metadata = job_metadata.get_section_from_file(
            filepath=metadata_path,
            section=job_metadata.JOB_PR_SECTION,
        )
        # without metadata the owner cannot be verified, hence the job is left alone
        if not metadata:
            log(f"{fn}(): No PR metadata found in '{metadata_path}' for job {job_id} - skipping cancellation")
            continue
        job_owner = metadata.get(job_metadata.JOB_PR_JOB_OWNER)
        pr_comment_id = metadata.get(job_metadata.JOB_PR_PR_COMMENT_ID)

        # Only the job owner should be able to cancel a job
        # (a job without a recorded owner is never cancelled)
        if job_owner != user:
            log(f"{fn}(): User {user} did not start job {job_id} - skipping cancellation")
            continue
        log(f"{fn}(): Job {job_id} was started by user {user} - cancelling job")

        # Cancel job
        command_line = f"{cancel_command} --verbose {job_id}"
        out, err, exit_code = run_cmd(command_line, f"cancel job {job_id}", raise_on_error=False)

        # Check if command was successful
        if exit_code != 0:
            log(f"{fn}(): scancel resulted in a non-zero exit code for job {job_id}.")
            continue
        # a zero exit code alone is not trusted: error lines on stderr also count as failure
        if any(line.startswith("scancel: error: ") for line in (err or "").split("\n")):
            log(f"{fn}(): Unable to cancel job {job_id}.")
            continue

        log(f"{fn}(): Cancelled job {job_id}")
        # the job is gone at this point, so record it before touching the PR comment
        cancelled_jobs.append(job_id)

        # Update job status table (needs a usable comment ID from the metadata)
        try:
            comment_id = int(pr_comment_id)
        except (TypeError, ValueError):
            log(f"{fn}(): No valid PR comment ID for job {job_id} ('{pr_comment_id}') - status table not updated")
            continue
        dt = datetime.now(timezone.utc)
        update = f"\n|{dt.strftime('%b %d %X %Z %Y')}|finished|job id `{job_id}` was cancelled|"
        update_comment(comment_id, pr, update)

    return cancelled_jobs
1 change: 1 addition & 0 deletions tests/test_bot_job123.metadata
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@
repo = test_repo
pr_number = 999
pr_comment_id = 77
job_owner = user01

Loading