From a444bccdb9ba70f4462232cf0ca47d94e7ac0192 Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Thu, 6 Feb 2025 14:54:39 +0100 Subject: [PATCH 001/132] add support for cloning Git repository via SSH rather than HTTPS --- README.md | 9 +++++++++ tasks/build.py | 28 +++++++++++++++++++++++++--- tools/config.py | 1 + 3 files changed, 35 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 157e5e7f..66f579f9 100644 --- a/README.md +++ b/README.md @@ -443,6 +443,15 @@ variables) that are allowed to be specified in a PR command with the be exported into the build environment before running the bot/build.sh script. +``` +clone_git_repo_via = 'https' +``` + +The `clone_git_repo_via` setting specifies via which mechanism the Git repository +should be cloned. This can be either: +* '`https`' (default): clone repository via HTTPS with `git clone https://github.com//` +* '`ssh`': clone repository via SSH with `git clone git@github.com:/.git` + #### `[bot_control]` section The `[bot_control]` section contains settings for configuring the feature to diff --git a/tasks/build.py b/tasks/build.py index 0ddcf61f..9a1f9a82 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -44,6 +44,7 @@ _ERROR_GIT_APPLY = "git apply" _ERROR_GIT_CHECKOUT = "git checkout" _ERROR_GIT_CLONE = "curl" +_ERROR_GIT_DIFF = "git diff" _ERROR_NONE = "none" # other constants @@ -355,7 +356,7 @@ def clone_git_repo(repo, path): return (clone_output, clone_error, clone_exit_code) -def download_pr(repo_name, branch_name, pr, arch_job_dir): +def download_pr(repo_name, branch_name, pr, arch_job_dir, clone_via=None): """ Download pull request to job working directory @@ -364,6 +365,7 @@ def download_pr(repo_name, branch_name, pr, arch_job_dir): branch_name (string): name of the base branch of the pull request pr (github.PullRequest.PullRequest): instance representing the pull request arch_job_dir (string): working directory of the job to be submitted + clone_via (string): mechanism to clone Git 
repository, should be 'https' (default) or 'ssh' Returns: None (implicitly), in case an error is caught in the git clone, git checkout, curl, @@ -376,7 +378,15 @@ def download_pr(repo_name, branch_name, pr, arch_job_dir): # - 'git checkout' base branch of pull request # - 'curl' diff for pull request # - 'git apply' diff file - clone_output, clone_error, clone_exit_code = clone_git_repo(f'https://github.com/{repo_name}', arch_job_dir) + if clone_via in (None, 'https'): + repo_url = f'https://github.com/{repo_name}' + elif clone_via == 'ssh': + repo_url = f'git@github.com:{repo_name}.git' + else: + error_stage = _ERROR_GIT_CLONE + return '', f"Unknown mechanism to clone Git repo: {clone_via}", 1, error_stage + + clone_output, clone_error, clone_exit_code = clone_git_repo(repo_url, arch_job_dir) if clone_exit_code != 0: error_stage = _ERROR_GIT_CLONE return clone_output, clone_error, clone_exit_code, error_stage @@ -407,6 +417,17 @@ def download_pr(repo_name, branch_name, pr, arch_job_dir): error_stage = _ERROR_CURL return curl_output, curl_error, curl_exit_code, error_stage + git_diff_cmd = ' '.join([ + f"git fetch origin pull/{pr.number}/head:{pr.number}", + f"git diff HEAD pr{pr.number} > {pr.number}.diff", + ]) + git_diff_output, git_diff_error, git_diff_exit_code = run_cmd( + git_diff_cmd, "Obtain patch", arch_job_dir, raise_on_error=False + ) + if git_diff_exit_code != 0: + error_stage = _ERROR_GIT_DIFF + return git_diff_output, git_diff_error, git_diff_exit_code + git_apply_cmd = f'git apply {pr.number}.diff' log(f'git apply with command {git_apply_cmd}') git_apply_output, git_apply_error, git_apply_exit_code = run_cmd( @@ -615,8 +636,9 @@ def prepare_jobs(pr, cfg, event_info, action_filter): log(f"{fn}(): job_dir '{job_dir}'") # TODO optimisation? download once, copy and cleanup initial copy? 
+ clone_git_repo_via = build_env_cfg.get(BUILDENV_SETTING_CLONE_GIT_REPO_VIA) download_pr_output, download_pr_error, download_pr_exit_code, error_stage = download_pr( - base_repo_name, base_branch_name, pr, job_dir + base_repo_name, base_branch_name, pr, job_dir, clone_via=clone_git_repo_via, ) comment_download_pr(base_repo_name, pr, download_pr_exit_code, download_pr_error, error_stage) # prepare job configuration file 'job.cfg' in directory /cfg diff --git a/tools/config.py b/tools/config.py index 60554be0..578d38ef 100644 --- a/tools/config.py +++ b/tools/config.py @@ -42,6 +42,7 @@ BUILDENV_SETTING_BUILD_JOB_SCRIPT = 'build_job_script' BUILDENV_SETTING_BUILD_LOGS_DIR = 'build_logs_dir' BUILDENV_SETTING_BUILD_PERMISSION = 'build_permission' +BUILDENV_SETTING_CLONE_GIT_REPO_VIA = 'clone_git_repo_via' BUILDENV_SETTING_CONTAINER_CACHEDIR = 'container_cachedir' BUILDENV_SETTING_CVMFS_CUSTOMIZATIONS = 'cvmfs_customizations' BUILDENV_SETTING_HTTPS_PROXY = 'https_proxy' From 6dcdd8006039804ced279375ab0981a92264a055 Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Thu, 6 Feb 2025 15:02:44 +0100 Subject: [PATCH 002/132] fix use of config.BUILDENV_SETTING_CLONE_GIT_REPO_VIA --- tasks/build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/build.py b/tasks/build.py index 9a1f9a82..6e68ad26 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -636,7 +636,7 @@ def prepare_jobs(pr, cfg, event_info, action_filter): log(f"{fn}(): job_dir '{job_dir}'") # TODO optimisation? download once, copy and cleanup initial copy? 
- clone_git_repo_via = build_env_cfg.get(BUILDENV_SETTING_CLONE_GIT_REPO_VIA) + clone_git_repo_via = build_env_cfg.get(config.BUILDENV_SETTING_CLONE_GIT_REPO_VIA) download_pr_output, download_pr_error, download_pr_exit_code, error_stage = download_pr( base_repo_name, base_branch_name, pr, job_dir, clone_via=clone_git_repo_via, ) From 3f2cfcba436d0356eb54af4966db1cad8b2f8ecb Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Thu, 6 Feb 2025 15:05:29 +0100 Subject: [PATCH 003/132] also use clone_git_repo_via in app.cfg.example --- README.md | 6 +++--- app.cfg.example | 4 ++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 66f579f9..5ed8cd6e 100644 --- a/README.md +++ b/README.md @@ -444,13 +444,13 @@ be exported into the build environment before running the bot/build.sh script. ``` -clone_git_repo_via = 'https' +clone_git_repo_via = https ``` The `clone_git_repo_via` setting specifies via which mechanism the Git repository should be cloned. This can be either: -* '`https`' (default): clone repository via HTTPS with `git clone https://github.com//` -* '`ssh`': clone repository via SSH with `git clone git@github.com:/.git` +* `https` (default): clone repository via HTTPS with `git clone https://github.com//` +* `ssh`: clone repository via SSH with `git clone git@github.com:/.git` #### `[bot_control]` section diff --git a/app.cfg.example b/app.cfg.example index 19581219..aa8eaaba 100644 --- a/app.cfg.example +++ b/app.cfg.example @@ -136,6 +136,10 @@ allow_update_submit_opts = false # exported into the build environment via `exportvariable` filters allowed_exportvars = ["NAME1=value_1a", "NAME1=value_1b", "NAME2=value_2"] +# mechanisn to use to clone Git repository +# 'https' to clone via HTTPS (git clone https://github.com//) +# 'ssh' to clone via SSH (git clone git@github.com:/.git) +clone_git_repo_via = https [deploycfg] # script for uploading built software packages From 37c06af97816668032d29630684611d1078b0ed8 Mon Sep 17 
00:00:00 2001 From: Kenneth Hoste Date: Thu, 6 Feb 2025 15:13:04 +0100 Subject: [PATCH 004/132] mark clone_git_repo_via as optional + add logging --- eessi_bot_event_handler.py | 1 + tasks/build.py | 1 + 2 files changed, 2 insertions(+) diff --git a/eessi_bot_event_handler.py b/eessi_bot_event_handler.py index f5b05d10..c398542a 100644 --- a/eessi_bot_event_handler.py +++ b/eessi_bot_event_handler.py @@ -52,6 +52,7 @@ config.BUILDENV_SETTING_BUILD_JOB_SCRIPT, # required config.BUILDENV_SETTING_BUILD_LOGS_DIR, # optional+recommended config.BUILDENV_SETTING_BUILD_PERMISSION, # optional+recommended + config.BUILDENV_SETTING_CLONE_GIT_REPO_VIA, # optional config.BUILDENV_SETTING_CONTAINER_CACHEDIR, # optional+recommended # config.BUILDENV_SETTING_CVMFS_CUSTOMIZATIONS, # optional # config.BUILDENV_SETTING_HTTPS_PROXY, # optional diff --git a/tasks/build.py b/tasks/build.py index 6e68ad26..96980dbf 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -637,6 +637,7 @@ def prepare_jobs(pr, cfg, event_info, action_filter): # TODO optimisation? download once, copy and cleanup initial copy? 
clone_git_repo_via = build_env_cfg.get(config.BUILDENV_SETTING_CLONE_GIT_REPO_VIA) + log(f"Cloning Git repo via: {clone_git_repo_via}") download_pr_output, download_pr_error, download_pr_exit_code, error_stage = download_pr( base_repo_name, base_branch_name, pr, job_dir, clone_via=clone_git_repo_via, ) From 4b046f68eeb4696783988c20cfbb496c30a88d43 Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Thu, 6 Feb 2025 15:17:09 +0100 Subject: [PATCH 005/132] fix get_build_env_cfg to get clone_git_repo_via value from buildenv in configuration --- tasks/build.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tasks/build.py b/tasks/build.py index 96980dbf..4821ab0f 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -148,6 +148,10 @@ def get_build_env_cfg(cfg): log(f"{fn}(): load_modules '{load_modules}'") config_data[config.BUILDENV_SETTING_LOAD_MODULES] = load_modules + clone_git_repo_via = buildenv.get(config.BUILDENV_SETTING_CLONE_GIT_REPO_VIA, None) + log(f"{fn}(): clone_git_repo_via '{clone_git_repo_via}'") + config_data[config.BUILDENV_SETTING_CLONE_GIT_REPO_VIA] = clone_git_repo_via + return config_data @@ -378,6 +382,7 @@ def download_pr(repo_name, branch_name, pr, arch_job_dir, clone_via=None): # - 'git checkout' base branch of pull request # - 'curl' diff for pull request # - 'git apply' diff file + log(f"Cloning Git repo via: {clone_via}") if clone_via in (None, 'https'): repo_url = f'https://github.com/{repo_name}' elif clone_via == 'ssh': @@ -637,7 +642,6 @@ def prepare_jobs(pr, cfg, event_info, action_filter): # TODO optimisation? download once, copy and cleanup initial copy? 
clone_git_repo_via = build_env_cfg.get(config.BUILDENV_SETTING_CLONE_GIT_REPO_VIA) - log(f"Cloning Git repo via: {clone_git_repo_via}") download_pr_output, download_pr_error, download_pr_exit_code, error_stage = download_pr( base_repo_name, base_branch_name, pr, job_dir, clone_via=clone_git_repo_via, ) From 45bd5568b45899f5e6d6229f45b55e6a5226cea1 Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Thu, 6 Feb 2025 16:02:00 +0100 Subject: [PATCH 006/132] remove curl command to obtain PR diff + fix _ERROR_GIT_CLONE constant --- tasks/build.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/tasks/build.py b/tasks/build.py index 4821ab0f..062ba139 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -43,7 +43,7 @@ _ERROR_CURL = "curl" _ERROR_GIT_APPLY = "git apply" _ERROR_GIT_CHECKOUT = "git checkout" -_ERROR_GIT_CLONE = "curl" +_ERROR_GIT_CLONE = "git clone" _ERROR_GIT_DIFF = "git diff" _ERROR_NONE = "none" @@ -408,20 +408,6 @@ def download_pr(repo_name, branch_name, pr, arch_job_dir, clone_via=None): error_stage = _ERROR_GIT_CHECKOUT return checkout_output, checkout_err, checkout_exit_code, error_stage - curl_cmd = ' '.join([ - 'curl -L', - '-H "Accept: application/vnd.github.diff"', - '-H "X-GitHub-Api-Version: 2022-11-28"', - f'https://api.github.com/repos/{repo_name}/pulls/{pr.number} > {pr.number}.diff', - ]) - log(f'curl with command {curl_cmd}') - curl_output, curl_error, curl_exit_code = run_cmd( - curl_cmd, "Obtain patch", arch_job_dir, raise_on_error=False - ) - if curl_exit_code != 0: - error_stage = _ERROR_CURL - return curl_output, curl_error, curl_exit_code, error_stage - git_diff_cmd = ' '.join([ f"git fetch origin pull/{pr.number}/head:{pr.number}", f"git diff HEAD pr{pr.number} > {pr.number}.diff", From b19fe23cb1537d22f9f170c62d3672f70294dd85 Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Thu, 6 Feb 2025 16:04:33 +0100 Subject: [PATCH 007/132] fix return value for 'git diff' command in download_pr function --- 
tasks/build.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tasks/build.py b/tasks/build.py index 062ba139..3a440a9c 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -388,8 +388,11 @@ def download_pr(repo_name, branch_name, pr, arch_job_dir, clone_via=None): elif clone_via == 'ssh': repo_url = f'git@github.com:{repo_name}.git' else: + clone_output = '' + clone_error = f"Unknown mechanism to clone Git repo: {clone_via}" + clone_exit_code = 1 error_stage = _ERROR_GIT_CLONE - return '', f"Unknown mechanism to clone Git repo: {clone_via}", 1, error_stage + return clone_output, clone_error, clone_exit_code, error_stage clone_output, clone_error, clone_exit_code = clone_git_repo(repo_url, arch_job_dir) if clone_exit_code != 0: @@ -417,7 +420,7 @@ def download_pr(repo_name, branch_name, pr, arch_job_dir, clone_via=None): ) if git_diff_exit_code != 0: error_stage = _ERROR_GIT_DIFF - return git_diff_output, git_diff_error, git_diff_exit_code + return git_diff_output, git_diff_error, git_diff_exit_code, error_stage git_apply_cmd = f'git apply {pr.number}.diff' log(f'git apply with command {git_apply_cmd}') From 5c51665078dc0b26379512393281b18713f79e7d Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Thu, 6 Feb 2025 16:13:07 +0100 Subject: [PATCH 008/132] make sure that download_comment is defined in comment_download_pr --- tasks/build.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tasks/build.py b/tasks/build.py index 3a440a9c..d0473104 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -472,6 +472,8 @@ def comment_download_pr(base_repo_name, pr, download_pr_exit_code, download_pr_e download_comment = (f"```{download_pr_error}```\n" f"{download_pr_comments_cfg[config.DOWNLOAD_PR_COMMENTS_SETTING_GIT_APPLY_FAILURE]}" f"\n{download_pr_comments_cfg[config.DOWNLOAD_PR_COMMENTS_SETTING_GIT_APPLY_TIP]}") + else: + download_comment = f"```{download_pr_error}```" download_comment = pr_comments.create_comment( repo_name=base_repo_name, 
pr_number=pr.number, comment=download_comment From a1c26353fcb4a9ee3d8d5e87b313428f25e3898b Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Thu, 6 Feb 2025 16:13:53 +0100 Subject: [PATCH 009/132] fix 'git diff' command in download_repo --- tasks/build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/build.py b/tasks/build.py index d0473104..5613c52a 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -411,7 +411,7 @@ def download_pr(repo_name, branch_name, pr, arch_job_dir, clone_via=None): error_stage = _ERROR_GIT_CHECKOUT return checkout_output, checkout_err, checkout_exit_code, error_stage - git_diff_cmd = ' '.join([ + git_diff_cmd = ' && '.join([ f"git fetch origin pull/{pr.number}/head:{pr.number}", f"git diff HEAD pr{pr.number} > {pr.number}.diff", ]) From 02f31257fef744b14931f2f83be38fa72c76a994 Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Thu, 6 Feb 2025 16:18:53 +0100 Subject: [PATCH 010/132] fix branch name used by 'git diff' command --- tasks/build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/build.py b/tasks/build.py index 5613c52a..20dccf82 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -412,7 +412,7 @@ def download_pr(repo_name, branch_name, pr, arch_job_dir, clone_via=None): return checkout_output, checkout_err, checkout_exit_code, error_stage git_diff_cmd = ' && '.join([ - f"git fetch origin pull/{pr.number}/head:{pr.number}", + f"git fetch origin pull/{pr.number}/head:pr{pr.number}", f"git diff HEAD pr{pr.number} > {pr.number}.diff", ]) git_diff_output, git_diff_error, git_diff_exit_code = run_cmd( From c7a98f2541b91a8fef16815513c91045fa13a912 Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Fri, 21 Feb 2025 14:13:12 +0100 Subject: [PATCH 011/132] comment out clone_git_repo_via setting in REQUIRED_CONFIG to avoid making it required MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Thomas Röblitz --- 
eessi_bot_event_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eessi_bot_event_handler.py b/eessi_bot_event_handler.py index c398542a..883aed9d 100644 --- a/eessi_bot_event_handler.py +++ b/eessi_bot_event_handler.py @@ -52,7 +52,7 @@ config.BUILDENV_SETTING_BUILD_JOB_SCRIPT, # required config.BUILDENV_SETTING_BUILD_LOGS_DIR, # optional+recommended config.BUILDENV_SETTING_BUILD_PERMISSION, # optional+recommended - config.BUILDENV_SETTING_CLONE_GIT_REPO_VIA, # optional + # config.BUILDENV_SETTING_CLONE_GIT_REPO_VIA, # optional config.BUILDENV_SETTING_CONTAINER_CACHEDIR, # optional+recommended # config.BUILDENV_SETTING_CVMFS_CUSTOMIZATIONS, # optional # config.BUILDENV_SETTING_HTTPS_PROXY, # optional From 7da317db383ed43faa8804d0a088f995bd82e8cb Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Fri, 21 Feb 2025 14:15:29 +0100 Subject: [PATCH 012/132] fix order in REQUIRED_CONFIG (recommended settings before optional ones) --- eessi_bot_event_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eessi_bot_event_handler.py b/eessi_bot_event_handler.py index 883aed9d..93bfa978 100644 --- a/eessi_bot_event_handler.py +++ b/eessi_bot_event_handler.py @@ -52,8 +52,8 @@ config.BUILDENV_SETTING_BUILD_JOB_SCRIPT, # required config.BUILDENV_SETTING_BUILD_LOGS_DIR, # optional+recommended config.BUILDENV_SETTING_BUILD_PERMISSION, # optional+recommended - # config.BUILDENV_SETTING_CLONE_GIT_REPO_VIA, # optional config.BUILDENV_SETTING_CONTAINER_CACHEDIR, # optional+recommended + # config.BUILDENV_SETTING_CLONE_GIT_REPO_VIA, # optional # config.BUILDENV_SETTING_CVMFS_CUSTOMIZATIONS, # optional # config.BUILDENV_SETTING_HTTPS_PROXY, # optional # config.BUILDENV_SETTING_HTTP_PROXY, # optional From 293ce3c5e3cff7c9f463aa4062b5576e5c1faab5 Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Fri, 21 Feb 2025 14:16:37 +0100 Subject: [PATCH 013/132] add more info on use case of clone_git_repo_via set to 'ssh' MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Thomas Röblitz --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index 5ed8cd6e..81e85a85 100644 --- a/README.md +++ b/README.md @@ -451,6 +451,18 @@ The `clone_git_repo_via` setting specifies via which mechanism the Git repositor should be cloned. This can be either: * `https` (default): clone repository via HTTPS with `git clone https://github.com//` * `ssh`: clone repository via SSH with `git clone git@github.com:/.git` +In case of using 'ssh', one may need additional steps to ensure that the bot uses the right ssh key and does not ask for a passphrase (if the key used is protected with one). Here are a few things to consider: +- if the ssh key to be used does not have a standard name (e.g., `id_rsa`), add the following entry to `~/.ssh/config` in the bot's account + ``` + Host github.com + User git + IdentityFile ~/.ssh/NAME_OF_PRIVATE_KEY_FILE + ``` +- if the key is protected by a passphrase (**highly recommended**), run an SSH agent and add the key to it + ``` + eval $(ssh-agent -s) + ssh-add ~/.ssh/NAME_OF_PRIVATE_KEY_FILE + ``` #### `[bot_control]` section From 668915c2b4e004cf6473ad9a30cb45a8a85ce6da Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Fri, 21 Feb 2025 14:17:44 +0100 Subject: [PATCH 014/132] also add more info on use case of clone_git_repo_via set to 'ssh' to app.cfg.example MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Thomas Röblitz --- app.cfg.example | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/app.cfg.example b/app.cfg.example index aa8eaaba..6ad77f49 100644 --- a/app.cfg.example +++ b/app.cfg.example @@ -138,7 +138,22 @@ allowed_exportvars = ["NAME1=value_1a", "NAME1=value_1b", "NAME2=value_2"] # mechanisn to use to clone Git repository # 'https' to clone via HTTPS (git clone https://github.com//) -# 
'ssh' to clone via SSH (git clone git@github.com:/.git) +# In case of using 'ssh', one may need additional steps to ensure that the bot +# uses the right ssh key and does not ask for a passphrase (if the key used is +# protected with one). Here are a few things to consider: +# - if the ssh key to be used does not have a standard name (e.g., 'id_rsa'), +# add the following entry to '~/.ssh/config' in the bot's account +# +# Host github.com +# User git +# IdentityFile ~/.ssh/NAME_OF_PRIVATE_KEY_FILE +# +# - if the key is protected by a passphrase (**highly recommended**), run an +# SSH agent and add the key to it (with the following two commands) +# +# eval $(ssh-agent -s) +# ssh-add ~/.ssh/NAME_OF_PRIVATE_KEY_FILE + clone_git_repo_via = https [deploycfg] From a99ae6015e18a32964a832baceb6b47d12f30d15 Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Fri, 21 Feb 2025 15:04:26 +0100 Subject: [PATCH 015/132] only use 'git diff' to obtain PR diff when 'ssh' is used to clone repository --- app.cfg.example | 2 ++ tasks/build.py | 33 ++++++++++++++++++++++----------- tools/config.py | 2 ++ 3 files changed, 26 insertions(+), 11 deletions(-) diff --git a/app.cfg.example b/app.cfg.example index 6ad77f49..e69dd2e1 100644 --- a/app.cfg.example +++ b/app.cfg.example @@ -291,6 +291,8 @@ curl_failure = Unable to download the `.diff` file. curl_tip = _Tip: This could be a connection failure. Try again and if the issue remains check if the address is correct_ git_apply_failure = Unable to download or merge changes between the source branch and the destination branch. git_apply_tip = _Tip: This can usually be resolved by syncing your branch and resolving any merge conflicts._ +pr_diff_failure = Unable to obtain PR diff. 
+pr_diff_tip = _Tip: This could be a problem with SSH access to the repository._ [clean_up] trash_bin_dir = $HOME/trash_bin diff --git a/tasks/build.py b/tasks/build.py index 20dccf82..28ee5cec 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -44,7 +44,7 @@ _ERROR_GIT_APPLY = "git apply" _ERROR_GIT_CHECKOUT = "git checkout" _ERROR_GIT_CLONE = "git clone" -_ERROR_GIT_DIFF = "git diff" +_ERROR_PR_DIFF = "pr_diff" _ERROR_NONE = "none" # other constants @@ -385,8 +385,18 @@ def download_pr(repo_name, branch_name, pr, arch_job_dir, clone_via=None): log(f"Cloning Git repo via: {clone_via}") if clone_via in (None, 'https'): repo_url = f'https://github.com/{repo_name}' + pr_diff_cmd = ' '.join([ + 'curl -L', + '-H "Accept: application/vnd.github.diff"', + '-H "X-GitHub-Api-Version: 2022-11-28"', + f'https://api.github.com/repos/{repo_name}/pulls/{pr.number} > {pr.number}.diff', + ]) elif clone_via == 'ssh': repo_url = f'git@github.com:{repo_name}.git' + pr_diff_cmd = ' && '.join([ + f"git fetch origin pull/{pr.number}/head:pr{pr.number}", + f"git diff HEAD pr{pr.number} > {pr.number}.diff", + ]) else: clone_output = '' clone_error = f"Unknown mechanism to clone Git repo: {clone_via}" @@ -411,21 +421,18 @@ def download_pr(repo_name, branch_name, pr, arch_job_dir, clone_via=None): error_stage = _ERROR_GIT_CHECKOUT return checkout_output, checkout_err, checkout_exit_code, error_stage - git_diff_cmd = ' && '.join([ - f"git fetch origin pull/{pr.number}/head:pr{pr.number}", - f"git diff HEAD pr{pr.number} > {pr.number}.diff", - ]) - git_diff_output, git_diff_error, git_diff_exit_code = run_cmd( - git_diff_cmd, "Obtain patch", arch_job_dir, raise_on_error=False + log(f'obtaining PR diff with command {pr_diff_cmd}') + pr_diff_output, pr_diff_error, pr_diff_exit_code = run_cmd( + pr_diff_cmd, "obtain PR diff", arch_job_dir, raise_on_error=False ) - if git_diff_exit_code != 0: - error_stage = _ERROR_GIT_DIFF - return git_diff_output, git_diff_error, git_diff_exit_code, 
error_stage + if pr_diff_exit_code != 0: + error_stage = _ERROR_PR_DIFF + return pr_diff_output, pr_diff_error, pr_diff_exit_code, error_stage git_apply_cmd = f'git apply {pr.number}.diff' log(f'git apply with command {git_apply_cmd}') git_apply_output, git_apply_error, git_apply_exit_code = run_cmd( - git_apply_cmd, "Apply patch", arch_job_dir, raise_on_error=False + git_apply_cmd, "apply patch", arch_job_dir, raise_on_error=False ) if git_apply_exit_code != 0: error_stage = _ERROR_GIT_APPLY @@ -472,6 +479,10 @@ def comment_download_pr(base_repo_name, pr, download_pr_exit_code, download_pr_e download_comment = (f"```{download_pr_error}```\n" f"{download_pr_comments_cfg[config.DOWNLOAD_PR_COMMENTS_SETTING_GIT_APPLY_FAILURE]}" f"\n{download_pr_comments_cfg[config.DOWNLOAD_PR_COMMENTS_SETTING_GIT_APPLY_TIP]}") + elif error_stage == _ERROR_PR_DIFF: + download_comment = (f"```{download_pr_error}```\n" + f"{download_pr_comments_cfg[config.DOWNLOAD_PR_COMMENTS_SETTING_PR_DIFF_FAILURE]}" + f"\n{download_pr_comments_cfg[config.DOWNLOAD_PR_COMMENTS_SETTING_PR_DIFF_TIP]}") else: download_comment = f"```{download_pr_error}```" diff --git a/tools/config.py b/tools/config.py index 578d38ef..6ed98bf3 100644 --- a/tools/config.py +++ b/tools/config.py @@ -75,6 +75,8 @@ DOWNLOAD_PR_COMMENTS_SETTING_GIT_CHECKOUT_TIP = 'git_checkout_tip' DOWNLOAD_PR_COMMENTS_SETTING_GIT_CLONE_FAILURE = 'git_clone_failure' DOWNLOAD_PR_COMMENTS_SETTING_GIT_CLONE_TIP = 'git_clone_tip' +DOWNLOAD_PR_COMMENTS_SETTING_PR_DIFF_FAILURE = 'pr_diff_failure' +DOWNLOAD_PR_COMMENTS_SETTING_PR_DIFF_TIP = 'pr_diff_tip' SECTION_EVENT_HANDLER = 'event_handler' EVENT_HANDLER_SETTING_LOG_PATH = 'log_path' From 9b582ab1160eb3a070d4abae9f8ab9e75e9d9e44 Mon Sep 17 00:00:00 2001 From: sam Date: Fri, 4 Apr 2025 16:32:41 +0200 Subject: [PATCH 016/132] git diff from merge-base instead of from HEAD --- tasks/build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/build.py b/tasks/build.py index 
28ee5cec..2257148b 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -395,7 +395,7 @@ def download_pr(repo_name, branch_name, pr, arch_job_dir, clone_via=None): repo_url = f'git@github.com:{repo_name}.git' pr_diff_cmd = ' && '.join([ f"git fetch origin pull/{pr.number}/head:pr{pr.number}", - f"git diff HEAD pr{pr.number} > {pr.number}.diff", + f"git diff $(git merge-base pr{pr.number} HEAD) pr{pr.number} > {pr.number}.diff", ]) else: clone_output = '' From 4415afc3ac65aba3df5cb9f3cebc9ef72dc227ef Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 17 Apr 2025 16:42:10 +0200 Subject: [PATCH 017/132] Make a first attempt to change the expected arch_target_map. Make sure keys don't have meaning, and all meaning is in the values --- tasks/build.py | 70 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 54 insertions(+), 16 deletions(-) diff --git a/tasks/build.py b/tasks/build.py index 7a0b1c83..bba65e76 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -600,13 +600,35 @@ def prepare_jobs(pr, cfg, event_info, action_filter): return [] jobs = [] - for arch, slurm_opt in arch_map.items(): - arch_dir = arch.replace('/', '_') + # This loop assumes the following structure for arch_target_map: + # arch_target_map = { + # 'virtual_partition_name': { + # 'os': 'linux', + # 'cpu_subdir': 'x86_64/amd/zen4', + # 'accel': ['nvidia/cc90'], # Make this a list, so that we can easily cross compile for a large list with one defined virtual partition + # 'slurm_params': '-p genoa ', + # 'repo_targets': ["eessi.io-2023.06-compat","eessi.io-2023.06-software"], + # }, + # 'virtual_partition_name2': { + # ... etc + for virtual_partition_name, partition_info in arch_map.items(): + # Unpack for convenience + arch_dir = partition_info['cpu_subdir'] + if partition_info['accel']: + # Use the accelerator as defined by the action_filter. 
We check if this is valid for the current + # virtual partition later + arch_dir += accelerator + arch_dir.replace('/', '_') + # check if repo_targets is defined for this virtual partition + if not 'repo_targets' in partition_info: + log(f"{fn}(): skipping arch {virtual_partition_name}, " + "because no repo_targets were defined for this (virtual) partition") + continue # check if repo_target_map contains an entry for {arch} if arch not in repocfg[config.REPO_TARGETS_SETTING_REPO_TARGET_MAP]: log(f"{fn}(): skipping arch {arch} because repo target map does not define repositories to build for") continue - for repo_id in repocfg[config.REPO_TARGETS_SETTING_REPO_TARGET_MAP][arch]: + for repo_id in partition_info['repo_targets']: # ensure repocfg contains information about the repository repo_id if repo_id != EESSI # Note, EESSI is a bad/misleading name, it should be more like AS_IN_CONTAINER if (repo_id != "EESSI" and repo_id != "EESSI-pilot") and repo_id not in repocfg: @@ -619,13 +641,30 @@ def prepare_jobs(pr, cfg, event_info, action_filter): # false --> log & continue to next iteration of for loop if action_filter: log(f"{fn}(): checking filter {action_filter.to_string()}") - context = {"architecture": arch, "repository": repo_id, "instance": app_name} - log(f"{fn}(): context is '{json.dumps(context, indent=4)}'") - if not action_filter.check_filters(context): - log(f"{fn}(): context does NOT satisfy filter(s), skipping") - continue + context = { + "architecture": partition_info['cpu_subdir'], + "repository": repo_id, + "instance": app_name + } + # Optionally add accelerator to the context + check = False + if partition_info['accel']: + # Create a context for each accelerator, check if _any_ of them is valid + # (one is enough to continue) + for accel in partition_info['accel']: + context['accelerator'] = accel + log(f"{fn}(): context is '{json.dumps(context, indent=4)}'") + check = check | action_filter.check_filters(context) + if not check: + log(f"{fn}(): none 
of the contexts satisfy filter(s), skipping") + continue else: - log(f"{fn}(): context DOES satisfy filter(s), going on with job") + log(f"{fn}(): context is '{json.dumps(context, indent=4)}'") + if not action_filter.check_filters(context): + log(f"{fn}(): context does NOT satisfy filter(s), skipping") + continue + else: + log(f"{fn}(): context DOES satisfy filter(s), going on with job") # we reached this point when the filter matched (otherwise we # 'continue' with the next repository) # for each match of the filter we create a specific job directory @@ -645,19 +684,18 @@ def prepare_jobs(pr, cfg, event_info, action_filter): ) comment_download_pr(base_repo_name, pr, download_pr_exit_code, download_pr_error, error_stage) # prepare job configuration file 'job.cfg' in directory /cfg - cpu_target = '/'.join(arch.split('/')[1:]) - os_type = arch.split('/')[0] - - log(f"{fn}(): arch = '{arch}' => cpu_target = '{cpu_target}' , os_type = '{os_type}'" - f", accelerator = '{accelerator}'") + log(f"{fn}(): arch = '{arch}' => cpu_target = '{partition_info['cpu_subdir']}' , " + f"os_type = '{partition_info['os']}', accelerator = '{accelerator}'") - prepare_job_cfg(job_dir, build_env_cfg, repocfg, repo_id, cpu_target, os_type, accelerator) + prepare_job_cfg(job_dir, build_env_cfg, repocfg, repo_id, partition_info['cpu_subdir'], + partition_info['os'], accelerator) if exportvars: prepare_export_vars_file(job_dir, exportvars) # enlist jobs to proceed - job = Job(job_dir, arch, repo_id, slurm_opt, year_month, pr_id, accelerator) + job = Job(job_dir, partition_info['cpu_subdir'], repo_id, partition_info['slurm_params'], year_month, + pr_id, accelerator) jobs.append(job) log(f"{fn}(): {len(jobs)} jobs to proceed after applying white list") From d3240e4738fcfe4884b32c1c41d22350447a3680 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 17 Apr 2025 16:45:07 +0200 Subject: [PATCH 018/132] Remove old code that was replaced --- tasks/build.py | 4 ---- 1 file changed, 4 
deletions(-) diff --git a/tasks/build.py b/tasks/build.py index bba65e76..e4e9dfbc 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -624,10 +624,6 @@ def prepare_jobs(pr, cfg, event_info, action_filter): log(f"{fn}(): skipping arch {virtual_partition_name}, " "because no repo_targets were defined for this (virtual) partition") continue - # check if repo_target_map contains an entry for {arch} - if arch not in repocfg[config.REPO_TARGETS_SETTING_REPO_TARGET_MAP]: - log(f"{fn}(): skipping arch {arch} because repo target map does not define repositories to build for") - continue for repo_id in partition_info['repo_targets']: # ensure repocfg contains information about the repository repo_id if repo_id != EESSI # Note, EESSI is a bad/misleading name, it should be more like AS_IN_CONTAINER From f4c0940ce79b71dd0940e382196a69b2549d8527 Mon Sep 17 00:00:00 2001 From: sam Date: Thu, 22 May 2025 13:52:52 +0200 Subject: [PATCH 019/132] update README --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ddcbd8e9..031884e3 100644 --- a/README.md +++ b/README.md @@ -493,7 +493,7 @@ The `clone_git_repo_via` setting specifies via which mechanism the Git repositor should be cloned. This can be either: * `https` (default): clone repository via HTTPS with `git clone https://github.com//` * `ssh`: clone repository via SSH with `git clone git@github.com:/.git` -In case of using 'ssh', one may need additional steps to ensure that the bot uses the right ssh key and does not ask for a passphrase (if the key used is protected with one). Here are a few things to consider: +In case of using 'ssh', one may need additional steps to ensure that the bot uses the right SSH key and does not ask for a passphrase (if the key used is protected with one). 
Here are a few things to consider: - if the ssh key to be used does not have a standard name (e.g., `id_rsa`), add the following entry to `~/.ssh/config` in the bot's account ``` Host github.com @@ -506,6 +506,8 @@ In case of using 'ssh', one may need additional steps to ensure that the bot use ssh-add ~/.ssh/NAME_OF_PRIVATE_KEY_FILE ``` +Note that the `bot: status` command doesn't work with SSH keys; you'll still need a Github token for that to work. + #### `[bot_control]` section The `[bot_control]` section contains settings for configuring the feature to From bc5c15439b71e8335eadba92bfe077064d68c966 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bob=20Dr=C3=B6ge?= Date: Thu, 12 Jun 2025 20:13:23 +0200 Subject: [PATCH 020/132] workflow for building a smee-client container image --- .../build_smee_client_container_image.yaml | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 .github/workflows/build_smee_client_container_image.yaml diff --git a/.github/workflows/build_smee_client_container_image.yaml b/.github/workflows/build_smee_client_container_image.yaml new file mode 100644 index 00000000..a8fce7d8 --- /dev/null +++ b/.github/workflows/build_smee_client_container_image.yaml @@ -0,0 +1,58 @@ +name: Build and publish a Smee client container image + +on: + push: + paths: + - containers/Dockerfile.smee-client + - .github/workflows/build_smee_client_container_image.yaml + +# Declare default permissions as read only. 
+permissions: read-all + +jobs: + docker_build_smee_client: + runs-on: ubuntu-latest + steps: + - name: Check out the repo + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Login to GitHub Container Registry + if: github.event_name != 'pull_request' + uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Convert and store repository owner in lowercase, replace colon in tag names by hyphen + run: | + echo REPOSITORY_OWNER=$(echo ${{ github.repository_owner }} | tr '[:upper:]' '[:lower:]') >> $GITHUB_ENV + + - name: Set up QEMU + uses: docker/setup-qemu-action@29109295f81e9208d7d86ff1c6c12d2833863392 # v3.6.0 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0 + + - name: Cache Docker layers + uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 + with: + path: ${{ runner.temp }}/.buildx-cache + key: ${{ runner.os }}-buildx-${{ github.sha }} + restore-keys: | + ${{ runner.os }}-buildx- + + - name: Build and push + uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # v6.18.0 + with: + tags: ghcr.io/${{ env.REPOSITORY_OWNER }}/smee-client + platforms: linux/amd64,linux/arm64 + push: ${{ github.event_name != 'pull_request' }} # don't publish if this is part of an open PR + file: containers/Dockerfile.smee + cache-from: type=local,src=${{ runner.temp }}/.buildx-cache + cache-to: type=local,dest=${{ runner.temp }}/.buildx-cache-new,mode=max + + - name: Move cache + run: | + rm -rf ${{ runner.temp }}/.buildx-cache + mv ${{ runner.temp }}/.buildx-cache-new ${{ runner.temp }}/.buildx-cache From 57fb5edaa38fb67a2d3156297f491d255fc33e88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bob=20Dr=C3=B6ge?= Date: Thu, 12 Jun 2025 20:14:18 +0200 Subject: [PATCH 021/132] dockerfile for smee-client container 
--- containers/Dockerfile.smee-client | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 containers/Dockerfile.smee-client diff --git a/containers/Dockerfile.smee-client b/containers/Dockerfile.smee-client new file mode 100644 index 00000000..51376e43 --- /dev/null +++ b/containers/Dockerfile.smee-client @@ -0,0 +1,7 @@ +ARG smee_client_version=4.2.1 + +FROM node:lts-alpine +ARG smee_client_version +RUN npm install --global smee-client@${smee_client_version} +ENTRYPOINT ["smee"] +CMD ["--help"] From 7ac0dd1fccc776cbe155d86bf108c0b0e88216db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bob=20Dr=C3=B6ge?= Date: Thu, 12 Jun 2025 20:15:29 +0200 Subject: [PATCH 022/132] also trigger for PRs --- .github/workflows/build_smee_client_container_image.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/build_smee_client_container_image.yaml b/.github/workflows/build_smee_client_container_image.yaml index a8fce7d8..007fc601 100644 --- a/.github/workflows/build_smee_client_container_image.yaml +++ b/.github/workflows/build_smee_client_container_image.yaml @@ -5,6 +5,10 @@ on: paths: - containers/Dockerfile.smee-client - .github/workflows/build_smee_client_container_image.yaml + pull_request: + paths: + - containers/Dockerfile.smee-client + - .github/workflows/build_smee_client_container_image.yaml # Declare default permissions as read only. 
permissions: read-all From a3e5c31e2ce365e062b34fddd7ca0fc7b255b7e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bob=20Dr=C3=B6ge?= Date: Thu, 12 Jun 2025 20:18:45 +0200 Subject: [PATCH 023/132] only for pushes --- .github/workflows/build_smee_client_container_image.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/build_smee_client_container_image.yaml b/.github/workflows/build_smee_client_container_image.yaml index 007fc601..a8fce7d8 100644 --- a/.github/workflows/build_smee_client_container_image.yaml +++ b/.github/workflows/build_smee_client_container_image.yaml @@ -5,10 +5,6 @@ on: paths: - containers/Dockerfile.smee-client - .github/workflows/build_smee_client_container_image.yaml - pull_request: - paths: - - containers/Dockerfile.smee-client - - .github/workflows/build_smee_client_container_image.yaml # Declare default permissions as read only. permissions: read-all From d12c2dc233104231ff6ee500d1ead4d72ff12a59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bob=20Dr=C3=B6ge?= Date: Thu, 12 Jun 2025 20:19:18 +0200 Subject: [PATCH 024/132] also for PRs --- .github/workflows/build_smee_client_container_image.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/build_smee_client_container_image.yaml b/.github/workflows/build_smee_client_container_image.yaml index a8fce7d8..007fc601 100644 --- a/.github/workflows/build_smee_client_container_image.yaml +++ b/.github/workflows/build_smee_client_container_image.yaml @@ -5,6 +5,10 @@ on: paths: - containers/Dockerfile.smee-client - .github/workflows/build_smee_client_container_image.yaml + pull_request: + paths: + - containers/Dockerfile.smee-client + - .github/workflows/build_smee_client_container_image.yaml # Declare default permissions as read only. 
permissions: read-all From 9dc32cac5fece5aa51c79ef5636e26c64e6b045f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bob=20Dr=C3=B6ge?= Date: Thu, 12 Jun 2025 20:20:20 +0200 Subject: [PATCH 025/132] use correct dockerfile filename --- .github/workflows/build_smee_client_container_image.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_smee_client_container_image.yaml b/.github/workflows/build_smee_client_container_image.yaml index 007fc601..ba00aa09 100644 --- a/.github/workflows/build_smee_client_container_image.yaml +++ b/.github/workflows/build_smee_client_container_image.yaml @@ -52,7 +52,7 @@ jobs: tags: ghcr.io/${{ env.REPOSITORY_OWNER }}/smee-client platforms: linux/amd64,linux/arm64 push: ${{ github.event_name != 'pull_request' }} # don't publish if this is part of an open PR - file: containers/Dockerfile.smee + file: containers/Dockerfile.smee-client cache-from: type=local,src=${{ runner.temp }}/.buildx-cache cache-to: type=local,dest=${{ runner.temp }}/.buildx-cache-new,mode=max From 6c599f4d149801dc2488c15ea5424cca9fd6fb0c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bob=20Dr=C3=B6ge?= Date: Thu, 12 Jun 2025 21:03:10 +0200 Subject: [PATCH 026/132] Add package write permission to container build job --- .github/workflows/build_smee_client_container_image.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build_smee_client_container_image.yaml b/.github/workflows/build_smee_client_container_image.yaml index ba00aa09..a42d1e83 100644 --- a/.github/workflows/build_smee_client_container_image.yaml +++ b/.github/workflows/build_smee_client_container_image.yaml @@ -16,6 +16,8 @@ permissions: read-all jobs: docker_build_smee_client: runs-on: ubuntu-latest + permissions: + packages: write steps: - name: Check out the repo uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 From 7f29d4d2ca5c178d9718c89d1408f455d48f6a6b Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Thu, 19 Jun 2025
10:19:30 +0200 Subject: [PATCH 027/132] make space before bot command optional `bot:build ...` or `bot: build` should both work just fine, no reason to require a hard space in between --- tools/commands.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/commands.py b/tools/commands.py index 5db8f7f7..1909c0bf 100644 --- a/tools/commands.py +++ b/tools/commands.py @@ -46,7 +46,7 @@ def get_bot_command(line): fn = sys._getframe().f_code.co_name log(f"{fn}(): searching for bot command in '{line}'") - match = re.search('^bot: (.*)$', line) + match = re.search('^bot:( )?(.*)$', line) # TODO add log messages for both cases if match: return match.group(1).rstrip() From 66005da564cb1aba73d1716c51b5c2fa202a70d2 Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Thu, 19 Jun 2025 12:04:24 +0200 Subject: [PATCH 028/132] fix regex for bot build command + add log messages --- tools/commands.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/commands.py b/tools/commands.py index 1909c0bf..0982df12 100644 --- a/tools/commands.py +++ b/tools/commands.py @@ -46,11 +46,13 @@ def get_bot_command(line): fn = sys._getframe().f_code.co_name log(f"{fn}(): searching for bot command in '{line}'") - match = re.search('^bot:( )?(.*)$', line) - # TODO add log messages for both cases + match = re.search('^bot:[ ]?(.*)$', line) if match: - return match.group(1).rstrip() + cmd = match.group(1).rstrip() + log(f"Bot command found in '{line}': {cmd}") + return cmd else: + log(f"No bot command found using pattern '{match.pattern}' in: {line}") return None From 02c24659416407a7c3499c85306eaadee7f1addb Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Thu, 19 Jun 2025 19:05:22 +0200 Subject: [PATCH 029/132] fix logging with regex pattern in get_bot_command --- tools/commands.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/commands.py b/tools/commands.py index 0982df12..5fb411c4 100644 --- a/tools/commands.py +++ 
b/tools/commands.py @@ -46,13 +46,14 @@ def get_bot_command(line): fn = sys._getframe().f_code.co_name log(f"{fn}(): searching for bot command in '{line}'") - match = re.search('^bot:[ ]?(.*)$', line) + regex = re.compile('^bot:[ ]?(.*)$') + match = regex.search(line) if match: cmd = match.group(1).rstrip() log(f"Bot command found in '{line}': {cmd}") return cmd else: - log(f"No bot command found using pattern '{match.pattern}' in: {line}") + log(f"No bot command found using pattern '{regex.pattern}' in: {line}") return None From 8eec372bd03a0bd568fd0d2c1a45b927eb1ad5e5 Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Thu, 19 Jun 2025 19:06:47 +0200 Subject: [PATCH 030/132] mention function name in log message in get_bot_command --- tools/commands.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/commands.py b/tools/commands.py index 5fb411c4..b842cc19 100644 --- a/tools/commands.py +++ b/tools/commands.py @@ -50,10 +50,10 @@ def get_bot_command(line): match = regex.search(line) if match: cmd = match.group(1).rstrip() - log(f"Bot command found in '{line}': {cmd}") + log(f"{fn}(): Bot command found in '{line}': {cmd}") return cmd else: - log(f"No bot command found using pattern '{regex.pattern}' in: {line}") + log(f"{fn}(): No bot command found using pattern '{regex.pattern}' in: {line}") return None From 750b8a4b2ad83d075fcd8c05214ad07b967e36f2 Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Fri, 20 Jun 2025 14:19:57 +0200 Subject: [PATCH 031/132] add pr_diff_failure and pr_diff_tip to required configuration settings --- eessi_bot_event_handler.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/eessi_bot_event_handler.py b/eessi_bot_event_handler.py index 66aced9c..b1d4123a 100644 --- a/eessi_bot_event_handler.py +++ b/eessi_bot_event_handler.py @@ -89,7 +89,9 @@ config.DOWNLOAD_PR_COMMENTS_SETTING_GIT_CHECKOUT_FAILURE, # required config.DOWNLOAD_PR_COMMENTS_SETTING_GIT_CHECKOUT_TIP, # required 
config.DOWNLOAD_PR_COMMENTS_SETTING_GIT_CLONE_FAILURE, # required - config.DOWNLOAD_PR_COMMENTS_SETTING_GIT_CLONE_TIP], # required + config.DOWNLOAD_PR_COMMENTS_SETTING_GIT_CLONE_TIP, # required + config.DOWNLOAD_PR_COMMENTS_SETTING_PR_DIFF_FAILURE, # required + config.DOWNLOAD_PR_COMMENTS_SETTING_PR_DIFF_TIP], # required config.SECTION_EVENT_HANDLER: [ config.EVENT_HANDLER_SETTING_LOG_PATH], # required config.SECTION_GITHUB: [ From fd31cd110fd3df3170db363fd82e4ea593cde044 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Mon, 23 Jun 2025 13:21:38 +0200 Subject: [PATCH 032/132] update prerequisites and section 1 --- README.md | 55 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 31 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index 031884e3..d2734471 100644 --- a/README.md +++ b/README.md @@ -21,59 +21,66 @@ The bot consists of two main components provided in this repository: - GitHub account(s) (two needed for a development scenario), referring to them as `YOU_1` and `YOU_2` below - A fork, say `YOU_1/software-layer`, of - [EESSI/software-layer](https://github.com/EESSI/software-layer) and a fork, - say `YOU_2/software-layer` of your first fork if you want to emulate the - bot's behaviour but not change EESSI's repository. The EESSI bot will act on - events triggered for the target repository (in this context, either - `EESSI/software-layer` or `YOU_1/software-layer`). + [EESSI/software-layer](https://github.com/EESSI/software-layer). The EESSI bot will act on + events triggered for a repository its corresponding GitHub App was installed into. + To install the GitHub App into a repository, the GitHub App needs to be + configured such that it can be installed into any repository or all + repositories belonging to an account/organisation and the installer + (account/person who performs the "installation") has permissions to perform the + installation. 
- Access to a frontend/login node/service node of a Slurm cluster where the EESSI bot components will run. For the sake of brevity, we call this node simply `bot machine`. - `singularity` with version 3.6 or newer _OR_ `apptainer` with version 1.0 or newer on the compute nodes of the Slurm cluster. -- On the cluster frontend (or where the bot components run), different tools - may be needed to run the Smee client. For `x86_64`, `singularity` or - `apptainer` are sufficient. For `aarch64`, the package manager `npm` is - needed. +- On the `bot machine`, different tools may be needed to run the Smee client. + The Smee client is available via a docker container and can be run with + `singularity` or `apptainer`. Alternatively, the package manager `npm` may be + used to install the Smee client. Running via the EESSI-built container is + preferred. - The EESSI bot components and the (build) jobs will frequently access the Internet. Hence, worker nodes and the `bot machine` of the Slurm cluster need access to the Internet (either directly or via an HTTP proxy). -## Step 1: Smee.io channel and smee client +## Step 1: Relaying events via Smee -We use [smee.io](https://smee.io) as a service to relay events from GitHub +### Step 1a: Create a Smee channel for your own/test scenario +_EESSI uses specific Smee channels. Access to them is restricted for +EESSI-internal use._ +For development and testing purposes, one can use [smee.io](https://smee.io) as a service to relay events from GitHub to the EESSI bot. To do so, create a new channel via https://smee.io and note the URL, e.g., `https://smee.io/CHANNEL-ID`. -On the `bot machine` we need a tool which receives events relayed from -`https://smee.io/CHANNEL-ID` and forwards it to the EESSI bot. We use the Smee -client for this. 
+### Step 1b: Install Smee client on `bot machine` +On the `bot machine` we need a tool (the Smee client) which receives events relayed from +`https://smee.io/CHANNEL-ID` and forwards it to the EESSI bot event handler. -On machines with `x86_64` architecture, the Smee client can be run via a -container as follows +NOTE, both options below rely on software (the Smee client) that is provided by +3rd parties. Use any of these options at your own risk! + +#### EESSI-built container for Smee client (PREFERRED OPTION) +The Smee client can be run via a container as follows ``` -singularity pull docker://deltaprojects/smee-client -singularity run smee-client_latest.sif --url https://smee.io/CHANNEL-ID +apptainer run docker://ghcr.io/eessi/smee-client:latest --url https://smee.io/CHANNEL-ID ``` or ``` -singularity pull docker://deltaprojects/smee-client -singularity run smee-client_latest.sif --port 3030 --url https://smee.io/CHANNEL-ID +apptainer run docker://ghcr.io/eessi/smee-client:latest --url https://smee.io/CHANNEL-ID --port 3030 ``` for specifying a different port than the default (3000). 
-On machines with `aarch64` architecture, we can install the the smee client via -the `npm` package manager as follows +#### Use Node.js-based Smee client (alternative option) +The Smee client can be installed via the package manager `npm` as follows ``` npm install smee-client ``` -and then running it with the default port (3000) +and then running it with ``` node_modules/smee-client/bin/smee.js --url https://smee.io/CHANNEL-ID @@ -82,7 +89,7 @@ node_modules/smee-client/bin/smee.js --url https://smee.io/CHANNEL-ID Another port can be used by adding the `--port PORT` argument, for example, ``` -node_modules/smee-client/bin/smee.js --port 3030 --url https://smee.io/CHANNEL-ID +node_modules/smee-client/bin/smee.js --url https://smee.io/CHANNEL-ID --port 3030 ``` ## Step 2: Registering GitHub App From 1a8211cb39f033923b2c06eaa69209b4a07873de Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Mon, 23 Jun 2025 14:15:08 +0200 Subject: [PATCH 033/132] revise sections 2 and 3 --- README.md | 69 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 42 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index d2734471..363fb98e 100644 --- a/README.md +++ b/README.md @@ -18,9 +18,8 @@ The bot consists of two main components provided in this repository: ## Prerequisites -- GitHub account(s) (two needed for a development scenario), referring to them - as `YOU_1` and `YOU_2` below -- A fork, say `YOU_1/software-layer`, of +- GitHub account, say `GH_ACCOUNT` +- A fork, say `GH_ACCOUNT/software-layer`, of [EESSI/software-layer](https://github.com/EESSI/software-layer). The EESSI bot will act on events triggered for a repository its corresponding GitHub App was installed into. 
To install the GitHub App into a repository, the GitHub App needs to be @@ -92,47 +91,63 @@ Another port can be used by adding the `--port PORT` argument, for example, node_modules/smee-client/bin/smee.js --url https://smee.io/CHANNEL-ID --port 3030 ``` -## Step 2: Registering GitHub App +## Step 2: Registering a GitHub App We need to: -* register a GitHub App; -* link it to the `smee.io` channel; -* set a secret token to verify the webhook sender; -* set some permissions for the GitHub app; -* subscribe the GitHub app to selected events; -* define that this GitHub app should only be installed in your GitHub account (or organisation). +* register a GitHub App +* link it to the `smee.io` channel +* set a secret token used by GitHub to sign webhooks and used by the EESSI bot to + verify that a received event originates from GitHub +* set some permissions for the GitHub app +* subscribe the GitHub app to selected events +* generate a private key (via GitHub GUI) At the [app settings page](https://github.com/settings/apps) click "`New GitHub App`" and fill in the page, in particular the following fields: -- GitHub App name: give the app a name of you choice -- Homepage URL: use the Smee.io channel (`https://smee.io/CHANNEL-ID`) created in [Step 1](#step1) -- Webhook URL: use the Smee.io channel (`https://smee.io/CHANNEL-ID`) created in [Step 1](#step1) -- Webhook secret: create a secret token which is used to verify the webhook sender, for example using: +- **GitHub App name**: give the app a name of your choice +- **Homepage URL**: can use the Smee.io channel (`https://smee.io/CHANNEL-ID`) created in [Step 1](#step1) +- **Webhook URL**: MUST use the Smee.io channel (`https://smee.io/CHANNEL-ID`) created in [Step 1](#step1) +- **Secret**: create a secret token which is used to verify the webhook sender, for example using: ```shell python3 -c 'import secrets; print(secrets.token_hex(64))' ``` -- Permissions: assign the required permissions to the app (e.g., read access 
to commits, issues, pull requests); - - Make sure to assign read and write access to the Pull requests and Issues in "Repository permissions" section; these permisions can be changed later on; - - Make sure to accept the new permissions from the "Install App" section that you can reach via the menu on the left hand side. - - Then select the wheel right next to your installed app, or use the link `https://github.com/settings/installations/INSTALLATION_ID` - - Once the page is open you will be able to accept the new permissions there. - - Some permissions (e.g., metadata) will be selected automatically because of others you have chosen. +- **Permissions**: assign the required permissions to the app + - Under "Repository permissions" assign "Read and write" for both "Issues" and + "Pull requests" -- Events: subscribe the app to events it shall react on (e.g., related to pull requests and comments) -- Select that the app can only be installed by this (your) GitHub account or organisation. + NOTE, "Read and write" permissions to "Pull requests" gives the bot powerful + means to _mess_ with your pull requests. Unfortunately, there is currently no way + around this or the bot could not create comments in pull requests. -Click on "`Create GitHub App`" to complete this step. +- **Subscribe to events**: subscribe the app to events it shall react on + - Select "Issue comment" and "Pull request" (Note, they may only selectable + after needed Permissions have been chosen above.) +- **Where can this GitHub App be installed?** + - Select "Only on this account" + +Click on "Create GitHub App" to create the app, then generate a private key +(see below). + +### Generate private key +After clicking "Create GitHub App" you will be informed with a banner +to generate a private key. 
You can follow the link in the banner or simply +scroll down to the section "Private keys" + +Generate the private key, which downloads it and note the SHA256 string (to +more easily identify the key later on). ## Step 3: Installing GitHub App _Note, this will trigger the first event (`installation`). While the EESSI bot is not running yet, you can inspect this via the webpage for your Smee channel. Just open `https://smee.io/CHANNEL-ID` in a browser, and browse through the information included in the event. Naturally, some of the information will be different for other types of events._ -You also need to *install* the GitHub App -- essentially telling GitHub to link the app to an account and one, several, or all repositories on whose events the app then should act upon. +You also need to *install* the GitHub App -- essentially telling GitHub for which +repositories it should send events. -Go to https://github.com/settings/apps and select the app you want to install by clicking on the icon left to the app's name or on the "`Edit`" button right next to the name of the app. +Go to https://github.com/settings/apps/**APP_NAME** and select the menu item +**Install App** on the left-hand side. -On the next page you should see the menu item "`Install App`" on the left-hand side. When you click on this you should see a page with a list of accounts and organisations you can install the app on. Choose one and click on the "`Install`" button next to it. +On the next page you should see a page with a list of accounts and organisations you can install the app on. Choose one and click on the "`Install`" button next to it. -This leads to a page where you can select the repositories on whose the app should react to. Here, for the sake of simplicity, choose just `YOU_1/software-layer` as described in the [prerequisites](#prerequisites). Select one, multiple, or all and click on the "`Install`" button. 
+This leads to a page where you can select the repositories on whose the app should react to. Here, for the sake of simplicity, choose just `GH_ACCOUNT/software-layer` as described in the [prerequisites](#prerequisites). Select one, multiple, or all and click on the "`Install`" button. ## Step 4: Installing the EESSI bot on a `bot machine` From 85cb96faead3e3a8f17327d357a1a7cdb623fd36 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Wed, 25 Jun 2025 11:22:49 +0200 Subject: [PATCH 034/132] reformatting note --- README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 363fb98e..31d8957b 100644 --- a/README.md +++ b/README.md @@ -114,9 +114,10 @@ At the [app settings page](https://github.com/settings/apps) click "`New GitHub - Under "Repository permissions" assign "Read and write" for both "Issues" and "Pull requests" - NOTE, "Read and write" permissions to "Pull requests" gives the bot powerful - means to _mess_ with your pull requests. Unfortunately, there is currently no way - around this or the bot could not create comments in pull requests. + > [!NOTE] + > "Read and write" permissions to "Pull requests" gives the bot powerful + > means to _mess_ with your pull requests. Unfortunately, there is currently no way + > around this or the bot could not create comments in pull requests. 
- **Subscribe to events**: subscribe the app to events it shall react on - Select "Issue comment" and "Pull request" (Note, they may only selectable From 9d422b2d0d0e1df6ecf268dd6bedf8b4c3529e1d Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Wed, 25 Jun 2025 12:11:13 +0200 Subject: [PATCH 035/132] add CI for markdown linting and fix all issues in README.md --- .github/workflows/markdown-lint.yml | 33 +++ .markdownlint.json | 10 + README.md | 434 ++++++++++++++++++---------- 3 files changed, 324 insertions(+), 153 deletions(-) create mode 100644 .github/workflows/markdown-lint.yml create mode 100644 .markdownlint.json diff --git a/.github/workflows/markdown-lint.yml b/.github/workflows/markdown-lint.yml new file mode 100644 index 00000000..c9b204b3 --- /dev/null +++ b/.github/workflows/markdown-lint.yml @@ -0,0 +1,33 @@ +# This file is part of the EESSI build-and-deploy bot, +# see https://github.com/EESSI/eessi-bot-software-layer +# +# The bot helps with requests to add software installations to the +# EESSI software layer, see https://github.com/EESSI/software-layer +# +# author: Thomas Roeblitz (@trz42) +# +# license: GPLv2 +# + +name: Markdown Lint +on: [push, pull_request] +# Declare default permissions as read only. 
+permissions: read-all + +jobs: + markdown-lint: + runs-on: ubuntu-24.04 + steps: + - name: Checkout + uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8 # v3.1.0 + + - name: Setup Node.js + uses: actions/setup-node@60edb5dd545a775178f52524783378180af0d1f8 # v4.0.0 + with: + node-version: '18' + + - name: Install markdownlint-cli + run: npm install -g markdownlint-cli + + - name: Run markdownlint + run: markdownlint "**/*.md" --ignore .git \ No newline at end of file diff --git a/.markdownlint.json b/.markdownlint.json new file mode 100644 index 00000000..79ce1120 --- /dev/null +++ b/.markdownlint.json @@ -0,0 +1,10 @@ +{ + "default": true, + "MD013": false, + "MD033": false, + "MD041": false, + "MD024": false, + "MD026": { + "punctuation": ".,;:!" + } +} \ No newline at end of file diff --git a/README.md b/README.md index 31d8957b..29eb41f7 100644 --- a/README.md +++ b/README.md @@ -44,13 +44,15 @@ The bot consists of two main components provided in this repository: ## Step 1: Relaying events via Smee ### Step 1a: Create a Smee channel for your own/test scenario + _EESSI uses specific Smee channels. Access to them is restricted for EESSI-internal use._ For development and testing purposes, one can use [smee.io](https://smee.io) as a service to relay events from GitHub -to the EESSI bot. To do so, create a new channel via https://smee.io and note +to the EESSI bot. To do so, create a new channel via [smee.io](https://smee.io) and note the URL, e.g., `https://smee.io/CHANNEL-ID`. ### Step 1b: Install Smee client on `bot machine` + On the `bot machine` we need a tool (the Smee client) which receives events relayed from `https://smee.io/CHANNEL-ID` and forwards it to the EESSI bot event handler. @@ -58,58 +60,64 @@ NOTE, both options below rely on software (the Smee client) that is provided by 3rd parties. Use any of these options at your own risk! 
#### EESSI-built container for Smee client (PREFERRED OPTION) + The Smee client can be run via a container as follows -``` +```bash apptainer run docker://ghcr.io/eessi/smee-client:latest --url https://smee.io/CHANNEL-ID ``` or -``` +```bash apptainer run docker://ghcr.io/eessi/smee-client:latest --url https://smee.io/CHANNEL-ID --port 3030 ``` for specifying a different port than the default (3000). #### Use Node.js-based Smee client (alternative option) + The Smee client can be installed via the package manager `npm` as follows -``` +```bash npm install smee-client ``` and then running it with -``` +```bash node_modules/smee-client/bin/smee.js --url https://smee.io/CHANNEL-ID ``` Another port can be used by adding the `--port PORT` argument, for example, -``` +```bash node_modules/smee-client/bin/smee.js --url https://smee.io/CHANNEL-ID --port 3030 ``` ## Step 2: Registering a GitHub App We need to: -* register a GitHub App -* link it to the `smee.io` channel -* set a secret token used by GitHub to sign webhooks and used by the EESSI bot to + +- register a GitHub App +- link it to the `smee.io` channel +- set a secret token used by GitHub to sign webhooks and used by the EESSI bot to verify that a received event originates from GitHub -* set some permissions for the GitHub app -* subscribe the GitHub app to selected events -* generate a private key (via GitHub GUI) +- set some permissions for the GitHub app +- subscribe the GitHub app to selected events +- generate a private key (via GitHub GUI) At the [app settings page](https://github.com/settings/apps) click "`New GitHub App`" and fill in the page, in particular the following fields: + - **GitHub App name**: give the app a name of your choice - **Homepage URL**: can use the Smee.io channel (`https://smee.io/CHANNEL-ID`) created in [Step 1](#step1) - **Webhook URL**: MUST use the Smee.io channel (`https://smee.io/CHANNEL-ID`) created in [Step 1](#step1) - **Secret**: create a secret token which is used to 
verify the webhook sender, for example using: + ```shell python3 -c 'import secrets; print(secrets.token_hex(64))' ``` + - **Permissions**: assign the required permissions to the app - Under "Repository permissions" assign "Read and write" for both "Issues" and "Pull requests" @@ -129,6 +137,7 @@ Click on "Create GitHub App" to create the app, then generate a private key (see below). ### Generate private key + After clicking "Create GitHub App" you will be informed with a banner to generate a private key. You can follow the link in the banner or simply scroll down to the section "Private keys" @@ -140,10 +149,10 @@ more easily identify the key later on). _Note, this will trigger the first event (`installation`). While the EESSI bot is not running yet, you can inspect this via the webpage for your Smee channel. Just open `https://smee.io/CHANNEL-ID` in a browser, and browse through the information included in the event. Naturally, some of the information will be different for other types of events._ -You also need to *install* the GitHub App -- essentially telling GitHub for which +You also need to _install_ the GitHub App -- essentially telling GitHub for which repositories it should send events. -Go to https://github.com/settings/apps/**APP_NAME** and select the menu item +Go to [https://github.com/settings/apps/**APP_NAME**](https://github.com/settings/apps/**APP_NAME**) and select the menu item **Install App** on the left-hand side. On the next page you should see a page with a list of accounts and organisations you can install the app on. Choose one and click on the "`Install`" button next to it. 
@@ -156,14 +165,17 @@ The EESSI bot for the software layer is available from [EESSI/eessi-bot-software Get the EESSI bot _installed_ onto the `bot machine` by running something like -``` +```bash git clone https://github.com/EESSI/eessi-bot-software-layer.git ``` + Determine the full path to bot directory: -``` + +```bash cd eessi-bot-software-layer pwd ``` + Note the output of `pwd`. This will be used to replace `PATH_TO_EESSI_BOT` in the configuration file `app.cfg` (see [Step 5.4](#step5.4)). In the remainder of this page we will refer to this directory as `PATH_TO_EESSI_BOT`. @@ -171,7 +183,8 @@ page we will refer to this directory as `PATH_TO_EESSI_BOT`. If you want to develop the EESSI bot, it is recommended that you fork the [EESSI/eessi-bot-software-layer](https://github.com/EESSI/eessi-bot-software-layer) repository and use the fork on the `bot machine`. If you want to work with a specific pull request for the bot, say number 42, you can obtain the corresponding code with the following commands: -``` + +```bash git clone https://github.com/EESSI/eessi-bot-software-layer.git cd eessi-bot-software-layer pwd @@ -180,7 +193,8 @@ git checkout PR42 ``` The EESSI bot requires some Python packages to be installed, which are specified in the [`requirements.txt`](https://github.com/EESSI/eessi-bot-software-layer/tree/main/requirements.txt) file. It is recommended to install these in a virtual environment based on Python 3.7 or newer. See the commands below for an example on how to set up the virtual environment, activate it, and install the requirements for the EESSI bot. These commands assume that you are in the `eessi-bot-software-layer` directory: -``` + +```bash # assumption here is that you start from *within* the eessi-bot-software-layer directory cd .. python3.7 -m venv venv_eessi_bot_p37 @@ -203,8 +217,9 @@ The script uploads an artefact and an associated metadata file to an S3 bucket. 
It needs two tools for this: -* the `aws` command to actually upload the files; -* the `jq` command to create the metadata file. + +- the `aws` command to actually upload the files; +- the `jq` command to create the metadata file. This section describes how these tools are installed and configured on the `bot machine`. @@ -212,7 +227,7 @@ This section describes how these tools are installed and configured on the `bot Create a new directory, say `PATH_TO_EESSI_BOT/tools` and change into it. -``` +```bash mkdir PATH_TO_EESSI_BOT/tools cd PATH_TO_EESSI_BOT/tools ``` @@ -240,19 +255,24 @@ Next, install the tool `jq` into the same directory into which `aws` was installed in (for example `PATH_TO_EESSI_BOT/tools`). Download `jq` from `https://github.com/stedolan/jq/releases` into that directory by running, for example, -``` + +```bash cd PATH_TO_EESSI_BOT/tools curl https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64 -o jq-linux64 ``` + You may check if there are newer releases and choose a different package depending on your operating system. Update the permissions of the downloaded tool (`jq-linux64` for the above `curl` example) with -``` + +```bash chmod +x jq-linux64 ``` + Finally, create a symbolic link for `jq` by running -``` + +```bash ln -s jq-linux64 jq ``` @@ -261,21 +281,23 @@ Check that the `jq` command works by running `jq --version`. ## Step 5: Configuring the EESSI bot on the `bot machine` For the event handler, you need to set up two environment variables: -* `$GITHUB_TOKEN` (see [Step 5.1](#step5.1)) -* `$GITHUB_APP_SECRET_TOKEN` (see [Step 5.2](#step5.2)). + +- `$GITHUB_TOKEN` (see [Step 5.1](#step5.1)) +- `$GITHUB_APP_SECRET_TOKEN` (see [Step 5.2](#step5.2)). For both the event handler and the job manager you need a private key (see [Step 5.3](#step5.3)). 
### Step 5.1: GitHub Personal Access Token (PAT) -Create a Personal Access Token (PAT) for your GitHub account via the page https://github.com/settings/tokens where you find a button "`Generate new token`". +Create a Personal Access Token (PAT) for your GitHub account via the page [https://github.com/settings/tokens](https://github.com/settings/tokens) where you find a button "`Generate new token`". Give it a meaningful name (field titled "`Note`"), and set the expiration date. Then select the scopes this PAT will be used for. Then click "`Generate token`". On the result page, take note/copy the resulting token string -- it will only be shown once. On the `bot machine` set the environment variable `$GITHUB_TOKEN`: -``` + +```bash export GITHUB_TOKEN='THE_TOKEN_STRING' ``` @@ -286,79 +308,89 @@ in which you replace `THE_TOKEN_STRING` with the actual token. The GitHub App Secret Token is used to verify the webhook sender. You should have created one already when registering a new GitHub App in [Step 2](#step2). On the `bot machine` set the environment variable `$GITHUB_APP_SECRET_TOKEN`: -``` + +```bash export GITHUB_APP_SECRET_TOKEN='THE_SECRET_TOKEN_STRING' ``` in which you replace `THE_SECRET_TOKEN_STRING` with the actual token. -Note that depending on the characters used in the string you will likely have to use *single quotes* (`'...'`) when setting the value of the environment variable. +Note that depending on the characters used in the string you will likely have to use _single quotes_ (`'...'`) when setting the value of the environment variable. ### Step 5.3: Create a private key and store it on the `bot machine` The private key is needed to let the app authenticate when updating information at the repository such as commenting on PRs, adding labels, etc. You can create the key at the page of the GitHub App you have registered in [Step 2](#step2).
-Open the page https://github.com/settings/apps and then click on the icon left to the name of the GitHub App for the EESSI bot or the "`Edit`" button for the app. +Open the page [https://github.com/settings/apps](https://github.com/settings/apps) and then click on the icon left to the name of the GitHub App for the EESSI bot or the "`Edit`" button for the app. Near the end of the page you will find a section "`Private keys`" where you can create a private key by clicking on the button "`Generate a private key`". The private key should be automatically downloaded to your system. Copy it to the `bot machine` and note the full path to it (`PATH_TO_PRIVATE_KEY`). For example: the private key is on your LOCAL computer. To transfer it to the -`bot machine` use the `scp` command for example: -``` +`bot machine` use the `scp` command for example: + +```bash scp PATH_TO_PRIVATE_KEY_FILE_LOCAL_COMPUTER REMOTE_USERNAME@TARGET_HOST:TARGET/PATH ``` + The location to where the private key is copied on the bot machine (`TARGET/PATH`) should be noted for `PATH_TO_PRIVATE_KEY`. ### Step 5.4: Create the configuration file `app.cfg` If there is no `app.cfg` in the directory `PATH_TO_EESSI_BOT` yet, create an initial version from `app.cfg.example`. -``` +```bash cp -i app.cfg.example app.cfg ``` The example file (`app.cfg.example`) includes notes on what you have to adjust to run the bot in your environment. - #### `[github]` section The section `[github]` contains information for connecting to GitHub: -``` + +```ini app_id = 123456 ``` + Replace '`123456`' with the id of your GitHub App. You can find the id of your GitHub App via the page [GitHub Apps](https://github.com/settings/apps). On this page, select the app you have registered in [Step 2](#step2). On the opened page you will find the `app_id` in the section headed "`About`" listed as "`App ID`". -``` + +```ini app_name = 'MY-bot' ``` + The `app_name` specifies a short name for your bot. 
It will appear in comments to a pull request. For example, it could include the name of the cluster where the bot runs and a label representing the user that runs the bot, like `hal9000-bot`. -*Note: avoid putting an actual username here as it will be visible on potentially publicly accessible GitHub pages.* +_Note: avoid putting an actual username here as it will be visible on potentially publicly accessible GitHub pages._ -``` +```ini installation_id = 12345678 ``` -Replace '`12345678`' with the id of the *installation* of your GitHub App (see [Step 3](#step3)). + +Replace '`12345678`' with the id of the _installation_ of your GitHub App (see [Step 3](#step3)). You find the installation id of your GitHub App via the page [GitHub Apps](https://github.com/settings/apps). On this page, select the app you have registered in [Step 2](#step2). For determining the `installation_id` select "`Install App`" in the menu on the left-hand side. Then click on the gearwheel button of the installation (to the right of the "`Installed`" label). The URL of the resulting page contains the `installation_id` -- the number after the last "/". The `installation_id` is also provided in the payload of every event within the top-level record named "`installation`". You can see the events and their payload on the webpage of your Smee.io channel (`https://smee.io/CHANNEL-ID`). Alternatively, you can see the events in the "`Advanced`" section of your GitHub App: open the [GitHub Apps](https://github.com/settings/apps) page, select the app you have registered in [Step 2](#step2), and choose "`Advanced`" in the menu on the left-hand side. -``` + +```ini private_key = PATH_TO_PRIVATE_KEY ``` -Replace `PATH_TO_PRIVATE_KEY` with the path you have noted in [Step 5.3](#step5.3). +Replace `PATH_TO_PRIVATE_KEY` with the path you have noted in [Step 5.3](#step5.3). #### `[buildenv]` section The `[buildenv]` section contains information about the build environment. 
-``` + +```ini build_job_script = PATH_TO_EESSI_BOT/scripts/bot-build.slurm ``` + `build_job_script` points to the job script which will be submitted by the bot event handler. -``` +```ini shared_fs_path = PATH_TO_SHARED_DIRECTORY ``` @@ -366,7 +398,7 @@ Via `shared_fs_path` the path to a directory on a shared filesystem (NFS, etc.) which can be leveraged by the `bot/build.sh` script to store files that should be available across build jobs (software source tarballs, for example). -``` +```ini build_logs_dir = PATH_TO_BUILD_LOGS_DIR ``` @@ -374,33 +406,37 @@ If build logs should be copied to a particular (shared) directory under certain for example when a build failed, the `build_logs_dir` can be set to the path to which logs should be copied by the `bot/build.sh` script. -``` +```ini container_cachedir = PATH_TO_SHARED_DIRECTORY ``` + `container_cachedir` may be used to reuse downloaded container image files across jobs, so jobs can launch containers more quickly. -``` +```ini cvmfs_customizations = { "/etc/cvmfs/default.local": "CVMFS_HTTP_PROXY=\"http://PROXY_DNS_NAME:3128|http://PROXY_IP_ADDRESS:3128\"" } ``` + It may happen that we need to customize the [CernVM-FS](https://cernvm.cern.ch/fs/) configuration for the build job. The value of `cvmfs_customizations` is a dictionary which maps a file name to an entry that needs to be appended to that file. In the example line above, the configuration of `CVMFS_HTTP_PROXY` is appended to the file `/etc/cvmfs/default.local`. The CernVM-FS configuration can be commented out, unless there is a need to customize the CernVM-FS configuration. -``` +```ini http_proxy = http://PROXY_DNS:3128/ https_proxy = http://PROXY_DNS:3128/ ``` + If compute nodes have no direct internet connection, we need to set `http(s)_proxy` or commands such as `pip3` and `eb` (EasyBuild) cannot download software from package repositories. Typically these settings are set in the prologue of a Slurm job. 
However, when entering the [EESSI compatibility layer](https://www.eessi.io/docs/compatibility_layer), most environment settings are cleared. Hence, they need to be set again at a later stage. -``` +```ini job_delay_begin_factor = 2 ``` + The `job_delay_begin_factor` setting defines how many times the `poll_interval` a job's begin (EligibleTime) from now should be delayed if the handover protocol is set to `delayed_begin` (see setting `job_handover_protocol`). That is, if @@ -408,42 +444,48 @@ the `job_delay_begin_factor` is set to five (5) the delay time is calculated as 5 * `poll_interval`. The event manager would use 2 as default value when submitting jobs. -``` +```ini job_handover_protocol = hold_release ``` + The `job_handover_protocol` setting defines which method is used to handover a job from the event handler to the job manager. Values are - - `hold_release` (job is submitted with `--hold`, job manager removes the hold - with `scontrol release`) - - `delayed_begin` (job is submitted with `--begin=now+(5 * poll_interval)` and - any `--hold` is removed from the submission parameters); see setting - `poll_interval` further below; this is useful if the - bot account cannot run `scontrol release` to remove the hold of the job; - also, the status update in the PR comment of the job is extended by noting - the `EligibleTime` -``` +- `hold_release` (job is submitted with `--hold`, job manager removes the hold + with `scontrol release`) +- `delayed_begin` (job is submitted with `--begin=now+(5 * poll_interval)` and + any `--hold` is removed from the submission parameters); see setting + `poll_interval` further below; this is useful if the + bot account cannot run `scontrol release` to remove the hold of the job; + also, the status update in the PR comment of the job is extended by noting + the `EligibleTime` + +```ini job_name = JOB_NAME ``` + Replace `JOB_NAME` with a string of at least 3 characters that is used as job name when a job is submitted. 
This is used to filter jobs, e.g., should be used to make sure that multiple bot instances can run in the same Slurm environment. -``` +```ini jobs_base_dir = PATH_TO_JOBS_BASE_DIR ``` + Replace `PATH_TO_JOBS_BASE_DIR` with an absolute filepath like `/home/YOUR_USER_NAME/jobs` (or another path of your choice). Per job the directory structure under `jobs_base_dir` is `YYYY.MM/pr_PR_NUMBER/event_EVENT_ID/run_RUN_NUMBER/OS+SUBDIR`. The base directory will contain symlinks using the job ids pointing to the job's working directory `YYYY.MM/...`. -``` +```ini load_modules = MODULE1/VERSION1,MODULE2/VERSION2,... ``` + `load_modules` provides a means to load modules in the `build_job_script`. None to several modules can be provided in a comma-separated list. It is read by the bot and handed over to `build_job_script` via the `--load-modules` option. -``` +```ini local_tmp = /tmp/$USER/EESSI ``` + `local_tmp` specifies the path to a temporary directory on the node building the software, i.e., on a compute/worker node. You may have to change this if temporary storage under `/tmp` does not exist or is too small. This setting will be used for the @@ -451,9 +493,10 @@ environment variable `$EESSI_TMPDIR`. The value is expanded only inside a runnin job. Thus, typical job environment variables (like `$USER` or `$SLURM_JOB_ID`) may be used to isolate jobs running simultaneously on the same compute node. -``` +```ini site_config_script = /path/to/script/if/any ``` + `site_config_script` specifies the path to a script that - if it exists - is sourced in the build job before any `bot/*` script is run. This allows to customize the build environment due to specifics of the build site/cluster. @@ -461,39 +504,45 @@ Note, such customizations could also be performed by putting them into a module file and use the setting `load_modules` (see above). However, the setting `site_config_script` provides a low threshold for achieving this, too. 
-``` +```ini slurm_params = "--hold" ``` `slurm_params` defines additional parameters for submitting batch jobs. `"--hold"` should be kept or the bot might not work as intended (the release step done by the job manager component of the bot would be circumvented). Additional parameters, for example, to specify an account, a partition, or any other parameters supported by the [`sbatch` command](https://slurm.schedmd.com/sbatch.html), may be added to customize the job submission. -``` + +```ini submit_command = /usr/bin/sbatch ``` + `submit_command` is the full path to the Slurm job submission command used for submitting batch jobs. You may want to verify if `sbatch` is provided at that path or determine its actual location (using `which sbatch`). -``` +```ini build_permission = GH_ACCOUNT_1 GH_ACCOUNT_2 ... ``` + `build_permission` defines which GitHub accounts have the permission to trigger build jobs, i.e., for which accounts the bot acts on `bot: build ...` commands. If the value is left empty, everyone can trigger build jobs. -``` +```ini no_build_permission_comment = The `bot: build ...` command has been used by user `{build_labeler}`, but this person does not have permission to trigger builds. ``` + `no_build_permission_comment` defines a comment (template) that is used when the account trying to trigger build jobs has no permission to do so. -``` +```ini allow_update_submit_opts = false ``` + `allow_update_submit_opts` determines whether or not to allow updating the submit options via custom module `det_submit_opts` provided by the pull request being processed. -``` +```ini allowed_exportvars = ["NAME1=value_1a", "NAME1=value_1b", "NAME2=value_2"] ``` + `allowed_exportvars` defines a list of name-value pairs (environment variables) that are allowed to be specified in a PR command with the `exportvariable` filter. 
To specify multiple environment variables, multiple @@ -503,28 +552,32 @@ be exported into the build environment before running the bot/build.sh script. The bot build script makes use of the variable `SKIP_TESTS` to determine if ReFrame tests shall be skipped or not. Default is not to skip them. To allow the use of the variable the setting could look like -``` + +```ini allowed_exportvars = ["SKIP_TESTS=yes", "SKIP_TESTS=no"] ``` - -``` +```ini clone_git_repo_via = https ``` The `clone_git_repo_via` setting specifies via which mechanism the Git repository should be cloned. This can be either: -* `https` (default): clone repository via HTTPS with `git clone https://github.com/<owner>/<repo>` -* `ssh`: clone repository via SSH with `git clone git@github.com:<owner>/<repo>.git` -In case of using 'ssh', one may need additional steps to ensure that the bot uses the right SSH key and does not ask for a passphrase (if the key used is protected with one). Here are a few things to consider: + +- `https` (default): clone repository via HTTPS with `git clone https://github.com/<owner>/<repo>` +- `ssh`: clone repository via SSH with `git clone git@github.com:<owner>/<repo>.git` + In case of using 'ssh', one may need additional steps to ensure that the bot uses the right SSH key and does not ask for a passphrase (if the key used is protected with one). Here are a few things to consider: - if the ssh key to be used does not have a standard name (e.g., `id_rsa`), add the following entry to `~/.ssh/config` in the bot's account - ``` + + ```bash Host github.com User git IdentityFile ~/.ssh/NAME_OF_PRIVATE_KEY_FILE ``` + - if the key is protected by a passphrase (**highly recommended**), run an SSH agent and add the key to it - ``` + + ```bash eval $(ssh-agent -s) ssh-add ~/.ssh/NAME_OF_PRIVATE_KEY_FILE ``` @@ -535,16 +588,19 @@ Note that the `bot: status` command doesn't work with SSH keys; you'll still nee The `[bot_control]` section contains settings for configuring the feature to send commands to the bot.
-``` + +```ini command_permission = GH_ACCOUNT_1 GH_ACCOUNT_2 ... ``` + The `command_permission` setting defines which GitHub accounts can send commands -to the bot (via new PR comments). If the value is empty *no* GitHub account can send +to the bot (via new PR comments). If the value is empty _no_ GitHub account can send commands. -``` +```ini command_response_fmt = FORMAT_MARKDOWN_AND_HTML ``` + `command_response_fmt` allows to customize the format of the comments about the handling of bot commands. The format needs to include `{app_name}`, `{comment_response}` and `{comment_result}`. `{app_name}` is replaced with the name of the bot instance. @@ -552,16 +608,17 @@ commands. The format needs to include `{app_name}`, `{comment_response}` and for commands before any command is run. `{comment_result}` is replaced with information about the result of the command that was run (can be empty). - #### `[deploycfg]` section The `[deploycfg]` section defines settings for uploading built artefacts (tarballs). -``` + +```ini artefact_upload_script = PATH_TO_EESSI_BOT/scripts/eessi-upload-to-staging ``` + `artefact_upload_script` provides the location for the script used for uploading built software packages to an S3 bucket. -``` +```ini signing = { REPO_ID: { @@ -571,6 +628,7 @@ signing = }, ... } ``` + `signing` provides a setting for signing artefacts. The value uses a JSON-like format with `REPO_ID` being the repository ID. Repository IDs are defined in a file `repos.cfg` (see setting `repos_cfg_dir`), `script` provides the location of the @@ -578,8 +636,9 @@ script that is used to sign a file. If the location is a relative path, the scri must reside in the checked out pull request of the target repository (e.g., EESSI/software-layer). `key` points to the file of the key being used for signing. The bot calls the script with the two arguments: - 1. private key (as provided by the attribute 'key') - 2. 
path to the file to be signed (the upload script will determine that) + +1. private key (as provided by the attribute 'key') +2. path to the file to be signed (the upload script will determine that) NOTE (on `container_runtime`), signing requires a recent installation of OpenSSH (8.2 or newer). If the frontend where the event handler runs does not have that version installed, you can specify a container runtime via the `container_runtime` @@ -590,19 +649,20 @@ Note (on json format), make sure no trailing commas are used after any elements or parsing/loading the json will likely fail. Also, the whole value should start at a new line and be indented as shown above. -``` +```ini endpoint_url = URL_TO_S3_SERVER ``` + `endpoint_url` provides an endpoint (URL) to a server hosting an S3 bucket. The server could be hosted by a commercial cloud provider like AWS or Azure, or running in a private environment, for example, using Minio. The bot uploads artefacts to the bucket which will be periodically scanned by the ingestion procedure at the Stratum 0 server. - ```ini # example: same bucket for all target repos bucket_name = "eessi-staging" ``` + ```ini # example: bucket to use depends on target repo bucket_name = { @@ -616,8 +676,7 @@ The bucket must be available on the default server (`https://${bucket_name}.s3.a `bucket_name` can be specified as a string value to use the same bucket for all target repos, or it can be mapping from target repo id to bucket name. - -``` +```ini upload_policy = once ``` @@ -630,22 +689,23 @@ The `upload_policy` defines what policy is used for uploading built artefacts to |`once`|Only once upload any built artefact for the build target.| |`none`|Do not upload any built artefacts.| -``` +```ini deploy_permission = GH_ACCOUNT_1 GH_ACCOUNT_2 ... ``` + The `deploy_permission` setting defines which GitHub accounts can trigger the -deployment procedure. The value can be empty (*no* GitHub account can trigger the +deployment procedure. 
The value can be empty (_no_ GitHub account can trigger the deployment), or a space delimited list of GitHub accounts. -``` +```ini no_deploy_permission_comment = Label `bot:deploy` has been set by user `{deploy_labeler}`, but this person does not have permission to trigger deployments ``` + This defines a message that is added to the status table in a PR comment corresponding to a job whose artefact should have been uploaded (e.g., after setting the `bot:deploy` label). - -``` +```ini metadata_prefix = LOCATION_WHERE_METADATA_FILE_GETS_DEPOSITED artefact_prefix = LOCATION_WHERE_TARBALL_GETS_DEPOSITED ``` @@ -658,13 +718,14 @@ repository id (see also `repo_target_map` below) to a prefix. The prefix itself can use some (environment) variables that are set within the upload script (see `artefact_upload_script` above). Currently those are: - * `'${github_repository}'` (which would be expanded to the full name of the GitHub - repository, e.g., `EESSI/software-layer`), - * `'${legacy_aws_path}'` (which expands to the legacy/old prefix being used for - storing artefacts/metadata files, the old prefix is - `EESSI_VERSION/TARBALL_TYPE/OS_TYPE/CPU_ARCHITECTURE/TIMESTAMP/`), _and_ - * `'${pull_request_number}'` (which would be expanded to the number of the pull - request from which the artefact originates). + +- `'${github_repository}'` (which would be expanded to the full name of the GitHub + repository, e.g., `EESSI/software-layer`), +- `'${legacy_aws_path}'` (which expands to the legacy/old prefix being used for + storing artefacts/metadata files, the old prefix is + `EESSI_VERSION/TARBALL_TYPE/OS_TYPE/CPU_ARCHITECTURE/TIMESTAMP/`), _and_ +- `'${pull_request_number}'` (which would be expanded to the number of the pull + request from which the artefact originates). Note, it's important to single-quote (`'`) the variables as shown above, because they may likely not be defined when the bot calls the upload script. 
@@ -672,21 +733,25 @@ The list of supported variables can be shown by running `scripts/eessi-upload-to-staging --list-variables`. **Examples:** -``` + +```ini metadata_prefix = {"eessi.io-2023.06": "new/${github_repository}/${pull_request_number}"} artefact_prefix = { "eessi-pilot-2023.06": "", "eessi.io-2023.06": "new/${github_repository}/${pull_request_number}" } ``` + If left empty, the old/legacy prefix is being used. #### `[architecturetargets]` section The section `[architecturetargets]` defines for which targets (OS/SUBDIR), (for example `linux/x86_64/amd/zen2`) the EESSI bot should submit jobs, and which additional `sbatch` parameters will be used for requesting a compute node with the CPU microarchitecture needed to build the software stack. -``` + +```ini arch_target_map = { "linux/x86_64/generic" : "--constraint shape=c4.2xlarge", "linux/x86_64/amd/zen2" : "--constraint shape=c5a.2xlarge" } ``` + The map has one-to-many entries of the format `OS/SUBDIR : ADDITIONAL_SBATCH_PARAMETERS`. For your cluster, you will have to figure out which microarchitectures (`SUBDIR`) are available (as `OS` only `linux` is @@ -694,7 +759,8 @@ currently supported) and how to instruct Slurm to allocate nodes with that architecture to a job (`ADDITIONAL_SBATCH_PARAMETERS`). Note, if you do not have to specify additional parameters to `sbatch` to request a compute node with a specific microarchitecture, you can just write something like: -``` + +```ini arch_target_map = { "linux/x86_64/generic" : "" } ``` @@ -704,20 +770,24 @@ The `[repo_targets]` section defines for which repositories and architectures th Repositories are referenced by IDs (or `repo_id`). Architectures are identified by `OS/SUBDIR` which correspond to settings in the `arch_target_map`. -``` +```ini repo_target_map = { "OS_SUBDIR_1" : ["REPO_ID_1_1","REPO_ID_1_2"], "OS_SUBDIR_2" : ["REPO_ID_2_1","REPO_ID_2_2"] } ``` + For each `OS/SUBDIR` combination a list of available repository IDs can be provided. 
The repository IDs are defined in a separate file, say `repos.cfg` which is stored in the directory defined via `repos_cfg_dir`: -``` + +```ini repos_cfg_dir = PATH_TO_SHARED_DIRECTORY/cfg_bundles ``` + The `repos.cfg` file also uses the `ini` format as follows + ```ini [eessi-2023.06] repo_name = software.eessi.io @@ -726,6 +796,7 @@ config_bundle = eessi.io-cfg_files.tgz config_map = {"eessi.io/eessi.io.pub":"/etc/cvmfs/keys/eessi.io/eessi.io.pub", "default.local":"/etc/cvmfs/default.local", "eessi.io.conf":"/etc/cvmfs/domain.d/eessi.io.conf"} container = docker://ghcr.io/eessi/build-node:debian11 ``` + The repository id is given in brackets (`[eessi-2023.06]`). Then the name of the repository (`repo_name`) and the version (`repo_version`) are defined. Next, a tarball containing configuration files for CernVM-FS is specified (`config_bundle`). The `config_map` setting maps entries of that tarball to locations inside @@ -737,35 +808,45 @@ The `repos.cfg` file may contain multiple definitions of repositories. #### `[event_handler]` section The `[event_handler]` section contains information required by the bot event handler component. -``` + +```ini log_path = /path/to/eessi_bot_event_handler.log ``` -`log_path` specifies the path to the event handler log. + +`log_path` specifies the path to the event handler log. #### `[job_manager]` section The `[job_manager]` section contains information needed by the job manager. -``` +```ini log_path = /path/to/eessi_bot_job_manager.log ``` -`log_path` specifies the path to the job manager log. -``` +`log_path` specifies the path to the job manager log. + +```ini job_ids_dir = /home/USER/jobs/ids ``` + `job_ids_dir` specifies where the job manager should store information about jobs being tracked. Under this directory it will store information about submitted/running jobs under a subdirectory named '`submitted`', and about finished jobs under a subdirectory named '`finished`'. 
-``` + +```ini poll_command = /usr/bin/squeue ``` + `poll_command` is the full path to the Slurm command that can be used for checking which jobs exist. You may want to verify if `squeue` is provided at that path or determine its actual location (via `which squeue`). -``` + +```ini poll_interval = 60 ``` + `poll_interval` defines how often the job manager checks the status of the jobs. The unit of the value is seconds. -``` + +```ini scontrol_command = /usr/bin/scontrol ``` + `scontrol_command` is the full path to the Slurm command used for manipulating existing jobs. You may want to verify if `scontrol` is provided at that path or determine its actual location (via `which scontrol`). #### `[submitted_job_comments]` section @@ -773,134 +854,161 @@ scontrol_command = /usr/bin/scontrol The `[submitted_job_comments]` section specifies templates for messages about newly submitted jobs. DEPRECATED setting (use `awaits_release_delayed_begin_msg` and/or `awaits_release_hold_release_msg`) -``` + +```ini awaits_release = job id `{job_id}` awaits release by job manager ``` + `awaits_release` is used to provide a status update of a job (shown as a row in the job's status table). -``` +```ini awaits_release_delayed_begin_msg = job id `{job_id}` will be eligible to start in about {delay_seconds} seconds ``` + `awaits_release_delayed_begin_msg` is used when the `job_handover_protocol` is set to `delayed_begin`. Note, both `{job_id}` and `{delay_seconds}` need to be present in the value or the event handler will throw an exception when formatting the update of the PR comment corresponding to the job. -``` +```ini awaits_release_hold_release_msg = job id `{job_id}` awaits release by job manager ``` + `awaits_release_hold_release_msg` is used when the `job_handover_protocol` is set to `hold_release`. Note, `{job_id}` needs to be present in the value or the event handler will throw an exception when formatting the update of the PR comment corresponding to the job. 
-``` +```ini initial_comment = New job on instance `{app_name}` for architecture `{arch_name}`{accelerator_spec} for repository `{repo_id}` in job dir `{symlink}` ``` + `initial_comment` is used to create a comment to a PR when a new job has been created. Note, the part '{accelerator_spec}' is only filled-in by the bot if the argument 'accelerator' to the `bot: build` command has been used. -``` + +```ini with_accelerator =  and accelerator `{accelerator}` ``` + `with_accelerator` is used to provide information about the accelerator the job should build for if and only if the argument `accelerator:X/Y` has been provided. #### `[new_job_comments]` section The `[new_job_comments]` section sets templates for messages about jobs whose `hold` flag was released. -``` + +```ini awaits_launch = job awaits launch by Slurm scheduler ``` + `awaits_launch` specifies the status update that is used when the `hold` flag of a job has been removed. #### `[running_job_comments]` section The `[running_job_comments]` section sets templates for messages about jobs that are running. -``` + +```ini running_job = job `{job_id}` is running ``` + `running_job` specifies the status update for a job that started running. #### `[finished_job_comments]` section The `[finished_job_comments]` section sets templates for messages about finished jobs. -``` + +```ini job_result_unknown_fmt =
:shrug: UNKNOWN _(click triangle for details)_
  • Job results file `{filename}` does not exist in job directory, or parsing it failed.
  • No artefacts were found/reported.
``` + `job_result_unknown_fmt` is used in case no result file (produced by `bot/check-build.sh` provided by target repository) was found. -``` +```ini job_test_unknown_fmt =
:shrug: UNKNOWN _(click triangle for details)_
  • Job test file `{filename}` does not exist in job directory, or parsing it failed.
``` + `job_test_unknown_fmt` is used in case no test file (produced by `bot/check-test.sh` provided by target repository) was found. - #### `[download_pr_comments]` section The `[download_pr_comments]` section sets templates for messages related to downloading the contents of a pull request. -``` + +```ini git_clone_failure = Unable to clone the target repository. ``` + `git_clone_failure` is shown when `git clone` failed. -``` +```ini git_clone_tip = _Tip: This could be a connection failure. Try again and if the issue remains check if the address is correct_. ``` + `git_clone_tip` should contain some hint on how to deal with the issue. It is shown when `git clone` failed. -``` +```ini git_checkout_failure = Unable to checkout to the correct branch. ``` + `git_checkout_failure` is shown when `git checkout` failed. -``` +```ini git_checkout_tip = _Tip: Ensure that the branch name is correct and the target branch is available._ ``` + `git_checkout_tip` should contain some hint on how to deal with the failure. It is shown when `git checkout` failed. -``` +```ini curl_failure = Unable to download the `.diff` file. ``` + `curl_failure` is shown when downloading the `PR_NUMBER.diff` -``` + +```ini curl_tip = _Tip: This could be a connection failure. Try again and if the issue remains check if the address is correct_ ``` + `curl_tip` should help in how to deal with failing downloads of the `.diff` file. -``` +```ini git_apply_failure = Unable to download or merge changes between the source branch and the destination branch. ``` + `git_apply_failure` is shown when applying the `.diff` file with `git apply` failed. -``` +```ini git_apply_tip = _Tip: This can usually be resolved by syncing your branch and resolving any merge conflicts._ ``` + `git_apply_tip` should guide the contributor/maintainer about resolving the cause of `git apply` failing. #### `[clean_up]` section The `[clean_up]` section includes settings related to cleaning up disk used by merged (and closed) PRs. 
-``` + +```ini trash_bin_dir = PATH/TO/TRASH_BIN_DIRECTORY ``` + Ideally this is on the same filesystem used by `jobs_base_dir` and `job_ids_dir` to efficiently move data into the trash bin. If it resides on a different filesystem, the data will be copied. -``` +```ini moved_job_dirs_comment = PR merged! Moved `{job_dirs}` to `{trash_bin_dir}` ``` + Template that is used by the bot to add a comment to a PR noting down which directories have been moved and where. # Step 6: Creating a ReFrame configuration file for the test step (only needed when building for the [EESSI software layer](https://github.com/EESSI/software-layer)) + Part of the test step of the EESSI software layer is running the EESSI test suite. This requires putting a ReFrame configuration file in place that describes the partitions in the `arch_target_map` of the bot config. You can find general documentation on how to write a ReFrame config file in the [EESSI documentation](https://www.eessi.io/docs/test-suite/ReFrame-configuration-file/). However, some specifics apply when setting things up for the test step: @@ -911,19 +1019,22 @@ You can find general documentation on how to write a ReFrame config file in the - The `scheduler` should be `local`, as the bot already schedules the job (ReFrame should just locally spawn the tests in the allocation created by the bot). - The `access` field should not be used by ReFrame if the local scheduler is defined, you can simply omit this keyword. -To configure the number of GPUs and CPUs, we have two options: +To configure the number of GPUs and CPUs, we have two options: + 1. We describe the physical node in the ReFrame configuration file and set the `REFRAME_SCALE_TAG` environment variable to match the size of the allocation that you specify in your bot config. E.g. if your bot config allocates 1/4th of a node, one would set `REFRAME_SCALE_TAG=1_4_node` in the environment of the job submitted by the bot. 2. 
We describe a virtual node configuration that matches the size of the allcation created by the bot (and we use the default `REFRAME_SCALE_TAG=1_node`, you don't have to set this explicitely). The first approach is the easiest, and thus recommended, since you can use CPU autodetection by ReFrame. The second approach allows for more flexibility. ## Approach 1 (recommended): describing the physical node and setting the `REFRAME_SCALE_TAG` to match the bot config's allocation size + In this approach, we describe the physical node configuration. That means: the amount of physical CPUs and GPUs present in the node. For the CPU part, we can rely on ReFrame's CPU autodetection: if `remote_detect` is set to `True` in the general section of the config, and no CPU topology information is provided in the ReFrame configuration file, ReFrame will automatically detect the [CPU topology](https://reframe-hpc.readthedocs.io/en/stable/config_reference.html#config.systems.partitions.processor). For the GPU part, we need to configure the vendor and the amount of GPUs. E.g. for a partition with 4 Nvidia GPUs per node: -``` + +```json 'partition': { ... 'extras': { @@ -945,8 +1056,10 @@ Note that if you had e.g. a node with 6 GPUs per node, and you were building on Note that if for _some_ partitions you use e.g. quarter nodes, and for some full nodes, you'll have to set the `REFRAME_SCALE_TAG` conditionally based on the node architecture. You could e.g. do this in a `.bashrc` that has some conditional logic to determine the node type and set the corresponding scale. Alternatively, you could use Approach 2. 
### Complete example config + In this example, we assume a node with 4 A100 GPUs (compute capability `cc80`) and 72 CPU cores (Intel Skylake) and 512 GB of memory (of which 491520 MiB is useable by SLURM jobs; on this system the rest is reserved for the OS): -``` + +```python from eessi.testsuite.common_config import common_logging_config from eessi.testsuite.constants import * # noqa: F403 @@ -1010,10 +1123,12 @@ site_configuration = { ``` ## Approach 2: describing a virtual node -In this approach, we describe a virtual node configuration for which the size matches exactly what is allocated by the bot (through the `slurm_params` and `arch_target_map`). In this example, we'll assume that this node has 4 GPUs and 72 cores, distributed over 2 sockets each consisting of 1 NUMA domain. We also assume our bot is configured with `slurm_params = --hold --nodes=1 --export=None --time=0:30:0` and `arch_target_map = {"linux/x86_64/intel/skylake_avx512" : "--partition=gpu --cpus-per-task=18 --gpus-per-node 1"}`, i.e. it effectively allocates a quarter node. We describe a virtual partition for ReFrame as if this quarter node is a full node, i.e. we pretend it is a partition with 18 cores and 1 GPU per node, with 1 socket. + +In this approach, we describe a virtual node configuration for which the size matches exactly what is allocated by the bot (through the `slurm_params` and `arch_target_map`). In this example, we'll assume that this node has 4 GPUs and 72 cores, distributed over 2 sockets each consisting of 1 NUMA domain. We also assume our bot is configured with `slurm_params = --hold --nodes=1 --export=None --time=0:30:0` and `arch_target_map = {"linux/x86_64/intel/skylake_avx512" : "--partition=gpu --cpus-per-task=18 --gpus-per-node 1"}`, i.e. it effectively allocates a quarter node. We describe a virtual partition for ReFrame as if this quarter node is a full node, i.e. we pretend it is a partition with 18 cores and 1 GPU per node, with 1 socket. 
We would first have to hardcode the CPU configuration. -``` + +```json 'partition': { ... 'processor': { @@ -1032,17 +1147,19 @@ We would first have to hardcode the CPU configuration. ``` Note that if instead, this node would have had 8 NUMA domains (4 per socket), the 18 cores would correspond to 2 NUMA domains and we would have had to define: -``` + +```json "numa_nodes": [ "0x001ff", # a bit mask of 000000000111111111, i.e. cores 0-8 are on this NUMA domain "0x3fe00", # a bit mask of 111111111000000000, i.e. cores 9-17 are on this NUMA domain ] ``` -Note that the `topology` dictionary in a ReFrame configuration file can contain more information, such as the bitmasks for the CPU sockets and cores, as well as information on the caches (see [here](https://reframe-hpc.readthedocs.io/en/stable/config_reference.html#config.systems.partitions.processor.topology)). Currently, that information is not needed by the EESSI test suite, but that may change if tests are added that utilize such information to execute efficiently. +Note that the `topology` dictionary in a ReFrame configuration file can contain more information, such as the bitmasks for the CPU sockets and cores, as well as information on the caches (see [ReFrame docs](https://reframe-hpc.readthedocs.io/en/stable/config_reference.html#config.systems.partitions.processor.topology)). Currently, that information is not needed by the EESSI test suite, but that may change if tests are added that utilize such information to execute efficiently. For the GPU configuration, we simply put: -``` + +```json 'partition': { ... 'extras': { @@ -1056,11 +1173,14 @@ For the GPU configuration, we simply put: ] } ``` + To match the fact that we allocate 1 GPU in the `arch_target_map`. 
### Complete example config + In this example, we assume a node with 4 A100 GPUs (compute capability `cc80`) and 72 CPU cores (Intel Skylake) and 512 GB of memory (of which 491520 MiB is useable by SLURM jobs; on this system the rest is reserved for the OS). We also assume the bot configuration is such for this partition that 1/4th of these nodes gets allocated for a build job: -``` + +```python site_configuration = { 'systems': [ { @@ -1133,13 +1253,15 @@ site_configuration = { # Step 7: Instructions to run the bot components The bot consists of three components: -* the Smee client; -* the event handler; -* the job manager. + +- the Smee client; +- the event handler; +- the job manager. Running the Smee client was explained in [Step 1](#step1). ## Step 7.1: Running the event handler + As the event handler may run for a long time, it is advised to run it in a `screen` or `tmux` session. The event handler is provided by the [`eessi_bot_event_handler.py`](https://github.com/EESSI/eessi-bot-software-layer/blob/main/eessi_bot_event_handler.py) Python script. @@ -1148,13 +1270,17 @@ Change directory to `eessi-bot-software-layer` (which was created by cloning the repository in [Step 4](#step4) - either the original one from EESSI, or your fork). Then, simply run the event handler script: -``` + +```bash ./event_handler.sh ``` + If multiple instances on the `bot machine` are being executed, you may need to run the event handler and the Smee client with a different port (default is 3000). The event handler can receive events on a different port by adding the parameter `--port PORTNUMBER`, for example, -``` + +```bash ./event_handler.sh --port 3030 ``` + See [Step 1](#step1) for telling the Smee client on which port the event handler receives events. 
The event handler writes log information to the files `pyghee.log` and @@ -1163,17 +1289,19 @@ The event handler writes log information to the files `pyghee.log` and Note, if you run the bot on a frontend of a cluster with multiple frontends make sure that both the Smee client and the event handler run on the same system! ## Step 7.2: Running the job manager + As the job manager may run for a long time, it is advised to run it in a `screen` or `tmux` session. The job manager is provided by the [`eessi_bot_job_manager_layer.py`](https://github.com/EESSI/eessi-bot-software-layer/blob/main/eessi_bot_job_manager.py) Python script. You can run the job manager from the directory `eessi-bot-software-layer` simply by: -``` +```bash ./job_manager.sh ``` It will run in an infinite loop monitoring jobs and acting on their state changes. If you want to limit the execution of the job manager, you can use thes options: + |Option|Argument| |------|--------| |`-i` / `--max-manager-iterations`|Any number _z_: _z_ < 0 - run the main loop indefinitely, _z_ == 0 - don't run the main loop, _z_ > 0 - run the main loop _z_ times| @@ -1181,9 +1309,10 @@ If you want to limit the execution of the job manager, you can use thes options: An example command would be -``` +```bash ./job_manager.sh -i 1 -j 1234 ``` + to run the main loop exactly once for the job with ID `1234`. The job manager writes log information to the file `eessi_bot_job_manager.log`. @@ -1201,7 +1330,7 @@ Both Git and Curl need to have access to the target repo. 
A convenient way to access a private repo via a Github token is by adding the following lines to your `~/.netrc` and `~/.curlrc` files: -``` +```bash # ~/.netrc machine github.com login oauth @@ -1212,8 +1341,7 @@ login oauth password ``` -``` +```bash # ~/.curlrc --netrc ``` - From a24ac74cb5339783247a525597e06d7e8c8b7105 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Wed, 25 Jun 2025 13:10:45 +0200 Subject: [PATCH 036/132] layout tweaking --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 29eb41f7..386290dd 100644 --- a/README.md +++ b/README.md @@ -107,7 +107,7 @@ We need to: - subscribe the GitHub app to selected events - generate a private key (via GitHub GUI) -At the [app settings page](https://github.com/settings/apps) click "`New GitHub App`" and fill in the page, in particular the following fields: +At the [app settings page](https://github.com/settings/apps) click New GitHub App and fill in the page, in particular the following fields: - **GitHub App name**: give the app a name of your choice - **Homepage URL**: can use the Smee.io channel (`https://smee.io/CHANNEL-ID`) created in [Step 1](#step1) @@ -128,17 +128,17 @@ At the [app settings page](https://github.com/settings/apps) click "`New GitHub > around this or the bot could not create comments in pull requests. - **Subscribe to events**: subscribe the app to events it shall react on - - Select "Issue comment" and "Pull request" (Note, they may only selectable - after needed Permissions have been chosen above.) + - Select "Issue comment" and "Pull request" (Note, they may only be selectable + after the required _Permissions_ have been chosen above.) - **Where can this GitHub App be installed?** - Select "Only on this account" -Click on "Create GitHub App" to create the app, then generate a private key +Click on Create GitHub App to create the app, then generate a private key (see below). 
### Generate private key -After clicking "Create GitHub App" you will be informed with a banner +After clicking Create GitHub App you will be informed with a banner to generate a private key. You can follow the link in the banner or simply scroll down to the section "Private keys" From 47dd306c3c2166af97a6942d2cace5a9959f172f Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Wed, 25 Jun 2025 13:28:57 +0200 Subject: [PATCH 037/132] refine section 4 --- README.md | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 386290dd..507c7ef1 100644 --- a/README.md +++ b/README.md @@ -145,9 +145,10 @@ scroll down to the section "Private keys" Generate the private key, which downloads it and note the SHA256 string (to more easily identify the key later on). -## Step 3: Installing GitHub App +## Step 3: Installing the GitHub App into a repository -_Note, this will trigger the first event (`installation`). While the EESSI bot is not running yet, you can inspect this via the webpage for your Smee channel. Just open `https://smee.io/CHANNEL-ID` in a browser, and browse through the information included in the event. Naturally, some of the information will be different for other types of events._ +> [!NOTE] +> This will trigger the first event (`installation`). While the EESSI bot is not running yet, you can inspect this via the webpage for your Smee channel. Just open `https://smee.io/CHANNEL-ID` in a browser, and browse through the information included in the event. Naturally, some of the information will be different for other types of events. You also need to _install_ the GitHub App -- essentially telling GitHub for which repositories it should send events. @@ -155,9 +156,9 @@ repositories it should send events. Go to [https://github.com/settings/apps/**APP_NAME**](https://github.com/settings/apps/**APP_NAME**) and select the menu item **Install App** on the left-hand side. 
-On the next page you should see a page with a list of accounts and organisations you can install the app on. Choose one and click on the "`Install`" button next to it. +On the next page you should see a list of accounts and organisations you can install the app on. Choose one and click on the Install button next to it. -This leads to a page where you can select the repositories on whose the app should react to. Here, for the sake of simplicity, choose just `GH_ACCOUNT/software-layer` as described in the [prerequisites](#prerequisites). Select one, multiple, or all and click on the "`Install`" button. +This leads to a page where you can select the repositories whose events the app should react to. Here, for the sake of simplicity, choose "Only select repositories", then open the pull-down menu named "Select repositories" and in there select `GH_ACCOUNT/software-layer` (`GH_ACCOUNT` is the GitHub account mentioned in section [prerequisites](#prerequisites)). Finally, click on the Install button. ## Step 4: Installing the EESSI bot on a `bot machine` @@ -176,7 +177,7 @@ cd eessi-bot-software-layer pwd ``` -Note the output of `pwd`. This will be used to replace `PATH_TO_EESSI_BOT` in the +Take note of the output of `pwd`. This will be used to replace `PATH_TO_EESSI_BOT` in the configuration file `app.cfg` (see [Step 5.4](#step5.4)). In the remainder of this page we will refer to this directory as `PATH_TO_EESSI_BOT`. @@ -208,7 +209,7 @@ pip install -r requirements.txt Note, before you can start the bot components (see below), you have to activate the virtual environment with `source venv_eessi_bot_p37/bin/activate`. -You can exit the virtual environment simply by running `deactivate`. +You can exit the virtual environment by running `deactivate`. 
### Step 4.1: Installing tools to access S3 bucket From 29de868298c5d03b5d2fbf844b060d8b11e192a3 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Wed, 25 Jun 2025 13:40:13 +0200 Subject: [PATCH 038/132] improve sections 5.1 to 5.3 --- README.md | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 507c7ef1..23a2d314 100644 --- a/README.md +++ b/README.md @@ -290,9 +290,9 @@ For both the event handler and the job manager you need a private key (see [Step ### Step 5.1: GitHub Personal Access Token (PAT) -Create a Personal Access Token (PAT) for your GitHub account via the page [https://github.com/settings/tokens](https://github.com/settings/tokens) where you find a button "`Generate new token`". +Create a Personal Access Token (PAT) for your GitHub account via the page [https://github.com/settings/tokens](https://github.com/settings/tokens) where you find a button Generate new token. -Give it meaningful name (field titled "`Note`"), and set the expiration date. Then select the scopes this PAT will be used for. Then click "`Generate token`". +Give it a meaningful name in the field titled **Note**, and set the expiration date. Then select the scopes this PAT will be used for. Finally, click Generate token. On the result page, take note/copy the resulting token string -- it will only be shown once. @@ -314,29 +314,20 @@ On the `bot machine` set the environment variable `$GITHUB_APP_SECTRET_TOKEN`: export GITHUB_APP_SECRET_TOKEN='THE_SECRET_TOKEN_STRING' ``` -in which you replace `THE_SECRET_TOKEN_STRING` with the actual token. +in which you replace `THE_SECRET_TOKEN_STRING` with the secret token you have created in [Step 2](#step2). Note that depending on the characters used in the string you will likely have to use _single quotes_ (`'...'`) when setting the value of the environment variable. 
### Step 5.3: Create a private key and store it on the `bot machine` -The private key is needed to let the app authenticate when updating information at the repository such as commenting on PRs, adding labels, etc. You can create the key at the page of the GitHub App you have registered in [Step 2](#step2). +The private key is needed to let the app authenticate when updating information at the repository such as commenting on pull requests, adding labels, etc. You can create the key at the page of the GitHub App you have registered in [Step 2](#step2). -Open the page [https://github.com/settings/apps](https://github.com/settings/apps) and then click on the icon left to the name of the GitHub App for the EESSI bot or the "`Edit`" button for the app. +Open the page [https://github.com/settings/apps](https://github.com/settings/apps) and then click on the icon left to the name of the GitHub App for the EESSI bot or the Edit button for the app. -Near the end of the page you will find a section "`Private keys`" where you can create a private key by clicking on the button "`Generate a private key`". +Near the end of the page you will find a section **Private keys** where you can create a private key by clicking on the button Generate a private key. The private key should be automatically downloaded to your system. Copy it to the `bot machine` and note the full path to it (`PATH_TO_PRIVATE_KEY`). -For example: the private key is on your LOCAL computer. To transfer it to the -`bot machine` use the `scp` command for example: - -```bash -scp PATH_TO_PRIVATE_KEY_FILE_LOCAL_COMPUTER REMOTE_USERNAME@TARGET_HOST:TARGET/PATH -``` - -The location to where the private key is copied on the bot machine (`TARGET/PATH`) should be noted for `PATH_TO_PRIVATE_KEY`. - ### Step 5.4: Create the configuration file `app.cfg` If there is no `app.cfg` in the directory `PATH_TO_EESSI_BOT` yet, create an initial version from `app.cfg.example`. 
From a0150864386b942ca85021415933c342411709e6 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Wed, 25 Jun 2025 15:12:23 +0200 Subject: [PATCH 039/132] half way through revising app.cfg settings --- README.md | 181 +++++++++++++++++++++++++++--------------------- app.cfg.example | 88 ++++++++++++++--------- 2 files changed, 159 insertions(+), 110 deletions(-) diff --git a/README.md b/README.md index 23a2d314..07ae8dfa 100644 --- a/README.md +++ b/README.md @@ -352,7 +352,7 @@ Replace '`123456`' with the id of your GitHub App. You can find the id of your G app_name = 'MY-bot' ``` -The `app_name` specifies a short name for your bot. It will appear in comments to a pull request. For example, it could include the name of the cluster where the bot runs and a label representing the user that runs the bot, like `hal9000-bot`. +The `app_name` specifies a short name for your bot. It will appear in comments to a pull request. For example, it could include the name of the cluster where the bot runs and a label representing the user that runs the bot, like `hal9000-bot`. The name will be used when signing files uploaded to an S3 bucket. Thus, the name has to be the same that is used as value for `namespaces` in the `allowed_signers` file used during the ingestion procedure (see [https://github.com/EESSI/filesystem-layer](https://github.com/EESSI/filesystem-layer)). _Note: avoid putting an actual username here as it will be visible on potentially publicly accessible GitHub pages._ @@ -362,9 +362,9 @@ installation_id = 12345678 Replace '`12345678`' with the id of the _installation_ of your GitHub App (see [Step 3](#step3)). -You find the installation id of your GitHub App via the page [GitHub Apps](https://github.com/settings/apps). On this page, select the app you have registered in [Step 2](#step2). For determining the `installation_id` select "`Install App`" in the menu on the left-hand side. 
Then click on the gearwheel button of the installation (to the right of the "`Installed`" label). The URL of the resulting page contains the `installation_id` -- the number after the last "/". +You find the installation id of your GitHub App via the page [Applications](https://github.com/settings/installations). On this page, select the app you have registered in [Step 2](#step2) by clicking on the Configure button. The installation id is the number at the end of the URL of the resulting page. -The `installation_id` is also provided in the payload of every event within the top-level record named "`installation`". You can see the events and their payload on the webpage of your Smee.io channel (`https://smee.io/CHANNEL-ID`). Alternatively, you can see the events in the "`Advanced`" section of your GitHub App: open the [GitHub Apps](https://github.com/settings/apps) page, select the app you have registered in [Step 2](#step2), and choose "`Advanced`" in the menu on the left-hand side. +The `installation_id` is also provided in the payload of every event within the top-level record named "`installation`". You can see the events and their payload on the webpage of your Smee.io channel (`https://smee.io/CHANNEL-ID`). Alternatively, you can see the events in the **Advanced** section of your GitHub App: open the [GitHub Apps](https://github.com/settings/apps) page, select the app you have registered in [Step 2](#step2), and choose **Advanced** in the menu on the left-hand side. ```ini private_key = PATH_TO_PRIVATE_KEY @@ -372,6 +372,37 @@ private_key = PATH_TO_PRIVATE_KEY Replace `PATH_TO_PRIVATE_KEY` with the path you have noted in [Step 5.3](#step5.3). +#### `[bot_control]` section + +The `[bot_control]` section contains settings for configuring the feature to +send commands to the bot. + +```ini +command_permission = GH_ACCOUNT_1 GH_ACCOUNT_2 ... 
+``` + +The `command_permission` setting defines which GitHub accounts can send commands +to the bot (via new PR comments). 
If the value is empty _no_ GitHub account can send +commands. + +```ini +command_response_fmt = FORMAT_MARKDOWN_AND_HTML +``` + +`command_response_fmt` allows to customize the format of the comments about the handling of bot +commands. The format needs to include `{app_name}`, `{comment_response}` and +`{comment_result}`. `{app_name}` is replaced with the name of the bot instance. +`{comment_response}` is replaced with information about parsing the comment +for commands before any command is run. `{comment_result}` is replaced with +information about the result of the command that was run (can be empty). + +```ini +chatlevel = basic +``` + +`chatlevel` defines the amount of comments the bot writes into PRs (incognito - no comments, minimal - respond with single comment on bot commands `help`, `show_config`, `status` and `build` and update job progress, basic - minimal + report failures, or chatty - comments on any event being processed) +chatlevel = basic + #### `[buildenv]` section The `[buildenv]` section contains information about the build environment. @@ -509,7 +540,7 @@ submit_command = /usr/bin/sbatch `submit_command` is the full path to the Slurm job submission command used for submitting batch jobs. You may want to verify if `sbatch` is provided at that path or determine its actual location (using `which sbatch`). ```ini -build_permission = GH_ACCOUNT_1 GH_ACCOUNT_2 ... +build_permission = GH_ACCOUNT GH_ACCOUNT_2 -NOT_ALLOWED_GH_ACCOUNT_NAME- ... ``` `build_permission` defines which GitHub accounts have the permission to trigger @@ -530,6 +561,8 @@ allow_update_submit_opts = false `allow_update_submit_opts` determines whether or not to allow updating the submit options via custom module `det_submit_opts` provided by the pull request being processed. +Should only be enabled (true) with care because this will result in code from the target +repository being executed by the event handler process, that is, not in a compute job. 
```ini allowed_exportvars = ["NAME1=value_1a", "NAME1=value_1b", "NAME2=value_2"] @@ -539,7 +572,7 @@ allowed_exportvars = ["NAME1=value_1a", "NAME1=value_1b", "NAME2=value_2"] variables) that are allowed to be specified in a PR command with the `exportvariable` filter. To specify multiple environment variables, multiple `exportvariable` filters must be used (one per variable). These variables will -be exported into the build environment before running the bot/build.sh script. +be exported into the build environment before running the `bot/build.sh` script. The bot build script makes use of the variable `SKIP_TESTS` to determine if ReFrame tests shall be skipped or not. Default is not to skip them. To allow the @@ -549,6 +582,12 @@ use of the variable the setting could look like allowed_exportvars = ["SKIP_TESTS=yes", "SKIP_TESTS=no"] ``` +A reasonable default setting is + +```ini +allowed_exportvars = [] +``` + ```ini clone_git_repo_via = https ``` @@ -576,48 +615,69 @@ should be cloned. This can be either: Note that the `bot: status` command doesn't work with SSH keys; you'll still need a Github token for that to work. -#### `[bot_control]` section +#### `[deploycfg]` section -The `[bot_control]` section contains settings for configuring the feature to -send commands to the bot. +The `[deploycfg]` section defines settings for uploading built artefacts (tarballs). ```ini -command_permission = GH_ACCOUNT_1 GH_ACCOUNT_2 ... +artefact_upload_script = PATH_TO_EESSI_BOT/scripts/eessi-upload-to-staging ``` -The `command_permission` setting defines which GitHub accounts can send commands -to the bot (via new PR comments). If the value is empty _no_ GitHub account can send -commands. +`artefact_upload_script` provides the location for the script used for uploading built software packages to an S3 bucket. 
```ini -command_response_fmt = FORMAT_MARKDOWN_AND_HTML +# example: same bucket for all target repos +bucket_name = "eessi-staging" ``` -`command_response_fmt` allows to customize the format of the comments about the handling of bot -commands. The format needs to include `{app_name}`, `{comment_response}` and -`{comment_result}`. `{app_name}` is replaced with the name of the bot instance. -`{comment_response}` is replaced with information about parsing the comment -for commands before any command is run. `{comment_result}` is replaced with -information about the result of the command that was run (can be empty). +```ini +# example: bucket to use depends on target repo identifier (see setting +# `repo_target_map`) +# the key is the identifier of a repo, while the value is the name of the bucket +bucket_name = { + "eessi.io-2023.06-software": "eessi.io-staging-2023.06", + "eessi.io-2025.06-software": "eessi.io-2025.06" +} +``` -#### `[deploycfg]` section +`bucket_name` is the name of the bucket used for uploading of artefacts. +The bucket must be available on the default server (`https://${bucket_name}.s3.amazonaws.com`), or the one provided via `endpoint_url`. -The `[deploycfg]` section defines settings for uploading built artefacts (tarballs). +`bucket_name` can be specified as a string value to use the same bucket for all target repos, or it can be mapping from target repo id to bucket name. ```ini -artefact_upload_script = PATH_TO_EESSI_BOT/scripts/eessi-upload-to-staging +deploy_permission = GH_ACCOUNT_1 GH_ACCOUNT_2 ... ``` -`artefact_upload_script` provides the location for the script used for uploading built software packages to an S3 bucket. +The `deploy_permission` setting defines which GitHub accounts can trigger the +deployment procedure. The value can be empty (_no_ GitHub account can trigger the +deployment), or a space delimited list of GitHub accounts. 
+ +```ini +endpoint_url = URL_TO_S3_SERVER +``` + +`endpoint_url` provides an endpoint (URL) to a server hosting an S3 bucket. The +server could be hosted by a commercial cloud provider like AWS or Azure, or +running in a private environment, for example, using Minio. In EESSI, the bot uploads +artefacts to the bucket which will be periodically scanned by the ingestion procedure at the Stratum 0 server. + +```ini +no_deploy_permission_comment = Label `bot:deploy` has been set by user `{deploy_labeler}`, but this person does not have permission to trigger deployments +``` + +This defines a message that is added to the status table in a PR comment +corresponding to a job whose artefact should have been uploaded (e.g., after +setting the `bot:deploy` label). ```ini signing = { - REPO_ID: { - "script": PATH_TO_SIGN_SCRIPT, - "key": PATH_TO_KEY_FILE, - "container_runtime": PATH_TO_CONTAINER_RUNTIME - }, ... + "REPO_ID": { + "script": "PATH_TO_SIGN_SCRIPT", + "key": "PATH_TO_KEY_FILE", + "container_runtime": "PATH_TO_CONTAINER_RUNTIME" + } } ``` @@ -631,42 +691,21 @@ for signing. The bot calls the script with the two arguments: 1. private key (as provided by the attribute 'key') 2. path to the file to be signed (the upload script will determine that) -NOTE (on `container_runtime`), signing requires a recent installation of OpenSSH -(8.2 or newer). If the frontend where the event handler runs does not have that -version installed, you can specify a container runtime via the `container_runtime` -attribute below. Currently, only Singularity or Apptainer are supported. -Note (on the key), make sure the file permissions are restricted to `0600` (only -readable+writable by the file owner, or the signing will likely fail. -Note (on json format), make sure no trailing commas are used after any elements -or parsing/loading the json will likely fail. Also, the whole value should start -at a new line and be indented as shown above. 
- -```ini -endpoint_url = URL_TO_S3_SERVER -``` - -`endpoint_url` provides an endpoint (URL) to a server hosting an S3 bucket. The -server could be hosted by a commercial cloud provider like AWS or Azure, or -running in a private environment, for example, using Minio. The bot uploads -artefacts to the bucket which will be periodically scanned by the ingestion procedure at the Stratum 0 server. -```ini -# example: same bucket for all target repos -bucket_name = "eessi-staging" -``` - -```ini -# example: bucket to use depends on target repo -bucket_name = { - "eessi-pilot-2023.06": "eessi-staging-2023.06", - "eessi.io-2023.06": "software.eessi.io-2023.06", -} -``` - -`bucket_name` is the name of the bucket used for uploading of artefacts. -The bucket must be available on the default server (`https://${bucket_name}.s3.amazonaws.com`), or the one provided via `endpoint_url`. - -`bucket_name` can be specified as a string value to use the same bucket for all target repos, or it can be mapping from target repo id to bucket name. +> [!NOTE] +> Wrt `container_runtime`, signing requires a recent installation of OpenSSH +> (8.2 or newer). If the frontend where the event handler runs does not have that +> version installed, you can specify a container runtime via the `container_runtime` +> attribute below. Currently, only Singularity or Apptainer are supported. +> [!NOTE] +> Wrt to the private key file, make sure the file permissions are restricted to `0600` +> (only readable+writable by the file owner) or the signing will likely fail. +> [!NOTE] +> Wrt to the JSON-like format, make sure commas are only used for separating elements +> or parsing/loading the json will likely fail. Also, the whole value should start +> at a new line and be indented as shown above. +> [!NOTE] +> As shown in the example, use double quotes for all keys and values. 
```ini upload_policy = once @@ -681,22 +720,6 @@ The `upload_policy` defines what policy is used for uploading built artefacts to |`once`|Only once upload any built artefact for the build target.| |`none`|Do not upload any built artefacts.| -```ini -deploy_permission = GH_ACCOUNT_1 GH_ACCOUNT_2 ... -``` - -The `deploy_permission` setting defines which GitHub accounts can trigger the -deployment procedure. The value can be empty (_no_ GitHub account can trigger the -deployment), or a space delimited list of GitHub accounts. - -```ini -no_deploy_permission_comment = Label `bot:deploy` has been set by user `{deploy_labeler}`, but this person does not have permission to trigger deployments -``` - -This defines a message that is added to the status table in a PR comment -corresponding to a job whose artefact should have been uploaded (e.g., after -setting the `bot:deploy` label). - ```ini metadata_prefix = LOCATION_WHERE_METADATA_FILE_GETS_DEPOSITED artefact_prefix = LOCATION_WHERE_TARBALL_GETS_DEPOSITED diff --git a/app.cfg.example b/app.cfg.example index e57d9ee9..0631ae24 100644 --- a/app.cfg.example +++ b/app.cfg.example @@ -61,15 +61,20 @@ command_response_fmt = {comment_result} -# chattiness level of the bot in terms of writing comments into PRs (minimal, basic, or chatty) +# chattiness level of the bot in terms of writing comments into PRs +# (incognito - no comments, minimal - respond with single comment on bot +# commands `help`, `show_config`, `status` and `build` and update job +# progress, basic - minimal + report failures, or chatty - comments on +# any event being processed) chatlevel = basic [buildenv] -# name of the job script used for building an EESSI stack +# name of the job script that is submitted by the event handler (e.g., +# used for building an EESSI stack) build_job_script = PATH_TO_EESSI_BOT/scripts/bot-build.slurm -# path to directory on shared filesystem that can be used for sharing data across build jobs -# (for example source tarballs used 
by EasyBuild) +# path to a directory on a shared filesystem that can be used for sharing +# data across build jobs (for example source tarballs used by EasyBuild) shared_fs_path = PATH_TO_SHARED_DIRECTORY # Path (directory) to which build logs for (only) failing builds should be copied by bot/build.sh script @@ -150,16 +155,17 @@ slurm_params = --hold # full path to the job submission command submit_command = /usr/bin/sbatch -# which GH account has the permission to trigger the build (by setting -# the label 'bot:build' (apparently this cannot be restricted on GitHub) -# if value is left/empty everyone can trigger the build -# value can be a space delimited list of GH accounts -build_permission = +# defines which GitHub accounts have the permission to trigger +# build jobs, i.e., for which accounts the bot acts on `bot: build ...` +# commands. If the value is left empty, everyone can trigger build jobs. +build_permission = -NOT_ALLOWED_GH_ACCOUNT_NAME- # template for comment when user who set a label has no permission to trigger build jobs no_build_permission_comment = Label `bot:build` has been set by user `{build_labeler}`, but this person does not have permission to trigger builds # whether or not to allow updating the submit options via custom module det_submit_opts +# Should only be enabled (true) with care because this will result in code from the target +# repository being executed by the event handler process, that is, not in a compute job. allow_update_submit_opts = false # defines which name-value pairs (environment variables) are allowed to be @@ -174,6 +180,9 @@ allow_update_submit_opts = false # 'exportvariable:SKIP_TESTS=yes' as a filter, the key-value pair would be # "SKIP_TESTS=yes". 
# allowed_exportvars = ["NAME1=value_1a", "NAME1=value_1b", "NAME2=value_2"] +# +# It's safe to use the following line as default setting: +allowed_exportvars = [] # mechanisn to use to clone Git repository # 'https' to clone via HTTPS (git clone https://github.com//) @@ -199,6 +208,22 @@ clone_git_repo_via = https # script for uploading built software packages artefact_upload_script = PATH_TO_EESSI_BOT/scripts/eessi-upload-to-staging +# bucket name: +# the value can be a simple string, to always use same bucket regardless of +# the target repo, or can be a mapping of a target repo id (see also +# setting repo_target_map) to a bucket name as in +# bucket_name = { +# "eessi.io-2023.06-software": "eessi.io-staging-2023.06", +# "eessi.io-2025.06-software": "software.eessi.io-2023.06" +# } +bucket_name = eessi-staging + +# which GH account has the permission to trigger the deployment by setting +# the label 'bot:deploy' (apparently this cannot be restricted on GitHub) +# if value is left/empty _no one_ can trigger the deployment +# value can be a space delimited list of GH accounts +deploy_permission = + # URL to S3/minio bucket # if attribute is set, bucket_base will be constructed as follows # bucket_base=${endpoint_url}/${bucket_name} @@ -209,37 +234,47 @@ artefact_upload_script = PATH_TO_EESSI_BOT/scripts/eessi-upload-to-staging # - The latter variant is used for AWS S3 services. 
endpoint_url = URL_TO_S3_SERVER -# bucket name: -# can be a string value, to always use same bucket regardless of target repo, -# or can be a mapping of target repo id (see also repo_target_map) to bucket name -# like: bucket_name = {"eessi-pilot-2023.06": "eessi-staging-pilot-2023.06", "eessi.io-2023.06": "software.eessi.io-2023.06"} -bucket_name = eessi-staging +# template for comment when user who set a label has no permission to trigger deploying artefacts +no_deploy_permission_comment = Label `bot:deploy` has been set by user `{deploy_labeler}`, but this person does not have permission to trigger deployments # settings for signing artefacts with JSON-like format -# REPO_ID: { "script": PATH_TO_SIGN_SCRIPT, "key": PATH_TO_KEY_FILE, "container_runtime": PATH_TO_CONTAINER_RUNTIME } +# +# "REPO_ID": { "script": "PATH_TO_SIGN_SCRIPT", "key": "PATH_TO_KEY_FILE", "container_runtime": "PATH_TO_CONTAINER_RUNTIME" } +# +# REPO_ID is the repository ID. Repository IDs are defined in a file `repos.cfg` +# (see setting `repos_cfg_dir`). +# # If PATH_TO_SIGN_SCRIPT is a relative path, the script must reside in the # checked out pull request of the target repository (e.g., # EESSI/software-layer). +# # The bot calls the script with the two arguments: # 1. private key (as provided by the attribute 'key') # 2. path to the file to be signed (the upload script will determine that) +# # NOTE (on "container_runtime"), signing requires a recent installation of OpenSSH # (8.2 or newer). If the frontend where the event handler runs does not have that # version installed, you can specify a container runtime via the 'container_runtime' # attribute below. Currently, only Singularity or Apptainer are supported. -# NOTE (on the key), make sure the file permissions are restricted to `0600` (only -# readable+writable by the file owner, or the signing will likely fail. 
-# Note (on json format), make sure no trailing commas are used after any elements +# NOTE (on the private key file), make sure the file permissions are restricted to `0600` +# (only readable+writable by the file owner) or the signing will likely fail. +# NOTE (on json format), make sure no trailing commas are used after any elements # or parsing/loading the json will likely fail. Also, the whole value should start # at a new line and be indented as shown below. +# NOTE (on the JSON-like format), make sure commas are only used for separating elements +# or parsing/loading the json will likely fail. Also, the whole value should start +# at a new line and be indented as shown below. +# NOTE (on double quotes), as shown in the example below, use double quotes for all keys +# and values. signing = { - "eessi.io-2023.06-software: { - "script": PATH_TO_SIGN_SCRIPT, - "key": PATH_TO_EESSI_BOT/config/user-site-system.key, - "container_runtime": PATH_TO_CONTAINER_RUNTIME + "eessi.io-2023.06-software": { + "script": "PATH_TO_SIGN_SCRIPT", + "key": "PATH_TO_EESSI_BOT/config/user-site-system.key", + "container_runtime": "PATH_TO_CONTAINER_RUNTIME" } } + # upload policy: defines what policy is used for uploading built artefacts # to an S3 bucket # 'all' ..: upload all artefacts (mulitple uploads of the same artefact possible) @@ -249,15 +284,6 @@ signing = # 'none' : do not upload any built artefacts upload_policy = once -# which GH account has the permission to trigger the deployment (by setting -# the label 'bot:deploy' (apparently this cannot be restricted on GitHub) -# if value is left/empty everyone can trigger the deployment -# value can be a space delimited list of GH accounts -deploy_permission = - -# template for comment when user who set a label has no permission to trigger deploying artefacts -no_deploy_permission_comment = Label `bot:deploy` has been set by user `{deploy_labeler}`, but this person does not have permission to trigger deployments - # settings for 
where (directory) in the S3 bucket to store the metadata file and # the artefact # - Can be a string value to always use the same 'prefix' regardless of the target From 673c41177d007bd4973109cd81faddee14ebe2d6 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Thu, 26 Jun 2025 10:38:03 +0200 Subject: [PATCH 040/132] improved app.cfg section and cfg example file --- README.md | 164 ++++++++++++++++++++++++++---------------------- app.cfg.example | 111 +++++++++++++++++--------------- 2 files changed, 148 insertions(+), 127 deletions(-) diff --git a/README.md b/README.md index 07ae8dfa..a76c6724 100644 --- a/README.md +++ b/README.md @@ -456,6 +456,14 @@ package repositories. Typically these settings are set in the prologue of a Slurm job. However, when entering the [EESSI compatibility layer](https://www.eessi.io/docs/compatibility_layer), most environment settings are cleared. Hence, they need to be set again at a later stage. +```ini +job_name = JOB_NAME +``` + +Replace `JOB_NAME` with a string of at least 3 characters that is used as job +name when a job is submitted. This is used to filter jobs, e.g., should be used +to make sure that multiple bot instances can run in the same Slurm environment. + ```ini job_delay_begin_factor = 2 ``` @@ -483,14 +491,6 @@ job from the event handler to the job manager. Values are also, the status update in the PR comment of the job is extended by noting the `EligibleTime` -```ini -job_name = JOB_NAME -``` - -Replace `JOB_NAME` with a string of at least 3 characters that is used as job -name when a job is submitted. This is used to filter jobs, e.g., should be used -to make sure that multiple bot instances can run in the same Slurm environment. - ```ini jobs_base_dir = PATH_TO_JOBS_BASE_DIR ``` @@ -625,6 +625,15 @@ artefact_upload_script = PATH_TO_EESSI_BOT/scripts/eessi-upload-to-staging `artefact_upload_script` provides the location for the script used for uploading built software packages to an S3 bucket. 
+```ini +endpoint_url = URL_TO_S3_SERVER +``` + +`endpoint_url` provides an endpoint (URL) to a server hosting an S3 bucket. The +server could be hosted by a commercial cloud provider like AWS or Azure, or +running in a private environment, for example, using Minio. In EESSI, the bot uploads +artefacts to the bucket which will be periodically scanned by the ingestion procedure at the Stratum 0 server. + ```ini # example: same bucket for all target repos bucket_name = "eessi-staging" @@ -646,21 +655,25 @@ The bucket must be available on the default server (`https://${bucket_name}.s3.a `bucket_name` can be specified as a string value to use the same bucket for all target repos, or it can be mapping from target repo id to bucket name. ```ini -deploy_permission = GH_ACCOUNT_1 GH_ACCOUNT_2 ... +upload_policy = once ``` -The `deploy_permission` setting defines which GitHub accounts can trigger the -deployment procedure. The value can be empty (_no_ GitHub account can trigger the -deployment), or a space delimited list of GitHub accounts. +The `upload_policy` defines what policy is used for uploading built artefacts to an S3 bucket. + +|`upload_policy` value|Policy| +|:--------|:--------------------------------| +|`all`|Upload all artefacts (mulitple uploads of the same artefact possible).| +|`latest`|For each build target (prefix in artefact name `eessi-VERSION-{software,init,compat}-OS-ARCH)` only upload the latest built artefact.| +|`once`|Only once upload any built artefact for the build target.| +|`none`|Do not upload any built artefacts.| ```ini -endpoint_url = URL_TO_S3_SERVER +deploy_permission = GH_ACCOUNT_1 GH_ACCOUNT_2 ... ``` -`endpoint_url` provides an endpoint (URL) to a server hosting an S3 bucket. The -server could be hosted by a commercial cloud provider like AWS or Azure, or -running in a private environment, for example, using Minio. 
In EESSI, the bot uploads -artefacts to the bucket which will be periodically scanned by the ingestion procedure at the Stratum 0 server. +The `deploy_permission` setting defines which GitHub accounts can trigger the +deployment procedure. The value can be empty (_no_ GitHub account can trigger the +deployment), or a space delimited list of GitHub accounts. ```ini no_deploy_permission_comment = Label `bot:deploy` has been set by user `{deploy_labeler}`, but this person does not have permission to trigger deployments @@ -670,56 +683,6 @@ This defines a message that is added to the status table in a PR comment corresponding to a job whose artefact should have been uploaded (e.g., after setting the `bot:deploy` label). -```ini -signing = - { - "REPO_ID": { - "script": "PATH_TO_SIGN_SCRIPT", - "key": "PATH_TO_KEY_FILE", - "container_runtime": "PATH_TO_CONTAINER_RUNTIME" - } - } -``` - -`signing` provides a setting for signing artefacts. The value uses a JSON-like format -with `REPO_ID` being the repository ID. Repository IDs are defined in a file -`repos.cfg` (see setting `repos_cfg_dir`), `script` provides the location of the -script that is used to sign a file. If the location is a relative path, the script -must reside in the checked out pull request of the target repository (e.g., -EESSI/software-layer). `key` points to the file of the key being used -for signing. The bot calls the script with the two arguments: - -1. private key (as provided by the attribute 'key') -2. path to the file to be signed (the upload script will determine that) - -> [!NOTE] -> Wrt `container_runtime`, signing requires a recent installation of OpenSSH -> (8.2 or newer). If the frontend where the event handler runs does not have that -> version installed, you can specify a container runtime via the `container_runtime` -> attribute below. Currently, only Singularity or Apptainer are supported. 
-> [!NOTE] -> Wrt to the private key file, make sure the file permissions are restricted to `0600` -> (only readable+writable by the file owner) or the signing will likely fail. -> [!NOTE] -> Wrt to the JSON-like format, make sure commas are only used for separating elements -> or parsing/loading the json will likely fail. Also, the whole value should start -> at a new line and be indented as shown above. -> [!NOTE] -> As shown in the example, use double quotes for all keys and values. - -```ini -upload_policy = once -``` - -The `upload_policy` defines what policy is used for uploading built artefacts to an S3 bucket. - -|`upload_policy` value|Policy| -|:--------|:--------------------------------| -|`all`|Upload all artefacts (mulitple uploads of the same artefact possible).| -|`latest`|For each build target (prefix in artefact name `eessi-VERSION-{software,init,compat}-OS-ARCH)` only upload the latest built artefact.| -|`once`|Only once upload any built artefact for the build target.| -|`none`|Do not upload any built artefacts.| - ```ini metadata_prefix = LOCATION_WHERE_METADATA_FILE_GETS_DEPOSITED artefact_prefix = LOCATION_WHERE_TARBALL_GETS_DEPOSITED @@ -759,15 +722,54 @@ artefact_prefix = { If left empty, the old/legacy prefix is being used. +```ini +signing = + { + "REPO_ID": { + "script": "PATH_TO_SIGN_SCRIPT", + "key": "PATH_TO_KEY_FILE", + "container_runtime": "PATH_TO_CONTAINER_RUNTIME" + } + } +``` + +`signing` provides a setting for signing artefacts. The value uses a JSON-like format +with `REPO_ID` being the repository ID. Repository IDs are defined in a file +`repos.cfg` (see setting `repos_cfg_dir`), `script` provides the location of the +script that is used to sign a file. If the location is a relative path, the script +must reside in the checked out pull request of the target repository (e.g., +EESSI/software-layer). `key` points to the file of the key being used +for signing. The bot calls the script with the two arguments: + +1. 
private key (as provided by the attribute 'key') +2. path to the file to be signed (the upload script will determine that) + +> [!NOTE] +> Wrt `container_runtime`, signing requires a recent installation of OpenSSH +> (8.2 or newer). If the frontend where the event handler runs does not have that +> version installed, you can specify a container runtime via the `container_runtime` +> attribute below. Currently, only Singularity or Apptainer are supported. +> [!NOTE] +> Wrt to the private key file, make sure the file permissions are restricted to `0600` +> (only readable+writable by the file owner) or the signing will likely fail. +> [!NOTE] +> Wrt to the JSON-like format, make sure commas are only used for separating elements +> or parsing/loading the json will likely fail. Also, the whole value should start +> at a new line and be indented as shown above. +> [!NOTE] +> As shown in the example, use double quotes for all keys and values. + #### `[architecturetargets]` section The section `[architecturetargets]` defines for which targets (OS/SUBDIR), (for example `linux/x86_64/amd/zen2`) the EESSI bot should submit jobs, and which additional `sbatch` parameters will be used for requesting a compute node with the CPU microarchitecture needed to build the software stack. ```ini -arch_target_map = { "linux/x86_64/generic" : "--constraint shape=c4.2xlarge", "linux/x86_64/amd/zen2" : "--constraint shape=c5a.2xlarge" } +arch_target_map = { + "linux/x86_64/generic": "--partition x86-64-generic-node", + "linux/x86_64/amd/zen2": "--partition x86-64-amd-zen2-node" } ``` -The map has one-to-many entries of the format `OS/SUBDIR : +The map has one-to-many entries of the format `OS/SUBDIR: ADDITIONAL_SBATCH_PARAMETERS`. 
For your cluster, you will have to figure out which microarchitectures (`SUBDIR`) are available (as `OS` only `linux` is currently supported) and how to instruct Slurm to allocate nodes with that @@ -776,7 +778,7 @@ architecture to a job (`ADDITIONAL_SBATCH_PARAMETERS`). Note, if you do not have to specify additional parameters to `sbatch` to request a compute node with a specific microarchitecture, you can just write something like: ```ini -arch_target_map = { "linux/x86_64/generic" : "" } +arch_target_map = { "linux/x86_64/generic": "" } ``` #### `[repo_targets]` section @@ -787,8 +789,8 @@ by `OS/SUBDIR` which correspond to settings in the `arch_target_map`. ```ini repo_target_map = { - "OS_SUBDIR_1" : ["REPO_ID_1_1","REPO_ID_1_2"], - "OS_SUBDIR_2" : ["REPO_ID_2_1","REPO_ID_2_2"] } + "OS_SUBDIR_1": ["REPO_ID_1_1","REPO_ID_1_2"], + "OS_SUBDIR_2": ["REPO_ID_2_1","REPO_ID_2_2"] } ``` For each `OS/SUBDIR` combination a list of available repository IDs can be @@ -798,13 +800,13 @@ The repository IDs are defined in a separate file, say `repos.cfg` which is stored in the directory defined via `repos_cfg_dir`: ```ini -repos_cfg_dir = PATH_TO_SHARED_DIRECTORY/cfg_bundles +repos_cfg_dir = PATH_TO_SHARED_DIRECTORY/repos ``` The `repos.cfg` file also uses the `ini` format as follows ```ini -[eessi-2023.06] +[eessi.io-2023.06-software] repo_name = software.eessi.io repo_version = 2023.06 config_bundle = eessi.io-cfg_files.tgz @@ -868,7 +870,7 @@ scontrol_command = /usr/bin/scontrol The `[submitted_job_comments]` section specifies templates for messages about newly submitted jobs. -DEPRECATED setting (use `awaits_release_delayed_begin_msg` and/or `awaits_release_hold_release_msg`) +The following setting is no longer used since bot release v0.7.0. Instead, use the replacement settings `awaits_release_delayed_begin_msg` and/or `awaits_release_hold_release_msg`. 
```ini awaits_release = job id `{job_id}` awaits release by job manager @@ -1004,6 +1006,18 @@ git_apply_tip = _Tip: This can usually be resolved by syncing your branch and re `git_apply_tip` should guide the contributor/maintainer about resolving the cause of `git apply` failing. +```ini +pr_diff_failure = Unable to obtain PR diff. +``` + +The value of `pr_diff_failure` is shown when the `.diff` file could not be obtained. + +```ini +pr_diff_tip = _Tip: This could be a problem with SSH access to the repository._ +``` + +The value of `pr_diff_tip` should guide the maintainer / bot administrator about resolving the cause for the failing procedure to obtain the `.diff` file. + #### `[clean_up]` section The `[clean_up]` section includes settings related to cleaning up disk used by merged (and closed) PRs. diff --git a/app.cfg.example b/app.cfg.example index 0631ae24..e1972026 100644 --- a/app.cfg.example +++ b/app.cfg.example @@ -95,6 +95,10 @@ container_cachedir = PATH_TO_SHARED_DIRECTORY # http_proxy = http://PROXY_DNS:3128/ # https_proxy = http://PROXY_DNS:3128/ +# Used to give all jobs of a bot instance the same name. Can be used to allow +# multiple bot instances running on the same Slurm cluster. +job_name = prod + # The job_delay_begin_factor setting defines how many times the poll_interval a # job's begin (EligibleTime) from now should be delayed if the handover protocol # is set to `delayed_begin` (see setting `job_handover_protocol`). That is, if @@ -114,10 +118,6 @@ job_delay_begin_factor = 2 # the 'EligibleTime' job_handover_protocol = hold_release -# Used to give all jobs of a bot instance the same name. Can be used to allow -# multiple bot instances running on the same Slurm cluster. 
-job_name = prod - # directory under which the bot prepares directories per job # structure created is as follows: YYYY.MM/pr_PR_NUMBER/event_EVENT_ID/run_RUN_NUMBER/OS+SUBDIR jobs_base_dir = $HOME/jobs @@ -208,6 +208,16 @@ clone_git_repo_via = https # script for uploading built software packages artefact_upload_script = PATH_TO_EESSI_BOT/scripts/eessi-upload-to-staging +# URL to S3/minio bucket +# if attribute is set, bucket_base will be constructed as follows +# bucket_base=${endpoint_url}/${bucket_name} +# otherwise, bucket_base will be constructed as follows +# bucket_base=https://${bucket_name}.s3.amazonaws.com +# - The former variant is used for non AWS S3 services, eg, minio, or when +# the bucket name is not provided in the hostname (see latter case). +# - The latter variant is used for AWS S3 services. +endpoint_url = URL_TO_S3_SERVER + # bucket name: # the value can be a simple string, to always use same bucket regardless of # the target repo, or can be a mapping of a target repo id (see also @@ -218,25 +228,46 @@ artefact_upload_script = PATH_TO_EESSI_BOT/scripts/eessi-upload-to-staging # } bucket_name = eessi-staging +# upload policy: defines what policy is used for uploading built artefacts +# to an S3 bucket +# 'all' ..: upload all artefacts (mulitple uploads of the same artefact possible) +# 'latest': for each build target (eessi-VERSION-{software,init,compat}-OS-ARCH) +# only upload the latest built artefact +# 'once' : only once upload any built artefact for the build target +# 'none' : do not upload any built artefacts +upload_policy = once + # which GH account has the permission to trigger the deployment by setting # the label 'bot:deploy' (apparently this cannot be restricted on GitHub) # if value is left/empty _no one_ can trigger the deployment # value can be a space delimited list of GH accounts deploy_permission = -# URL to S3/minio bucket -# if attribute is set, bucket_base will be constructed as follows -# 
bucket_base=${endpoint_url}/${bucket_name} -# otherwise, bucket_base will be constructed as follows -# bucket_base=https://${bucket_name}.s3.amazonaws.com -# - The former variant is used for non AWS S3 services, eg, minio, or when -# the bucket name is not provided in the hostname (see latter case). -# - The latter variant is used for AWS S3 services. -endpoint_url = URL_TO_S3_SERVER - # template for comment when user who set a label has no permission to trigger deploying artefacts no_deploy_permission_comment = Label `bot:deploy` has been set by user `{deploy_labeler}`, but this person does not have permission to trigger deployments +# settings for where (directory) in the S3 bucket to store the metadata file and +# the artefact +# - Can be a string value to always use the same 'prefix' regardless of the target +# CVMFS repository, or can be a mapping of a target repository id (see also +# repo_target_map) to a prefix. +# - The prefix itself can use some (environment) variables that are set within +# the script. Currently those are: +# * 'github_repository' (which would be expanded to the full name of the GitHub +# repository, e.g., 'EESSI/software-layer'), +# * 'legacy_aws_path' (which expands to the legacy/old prefix being used for +# storing artefacts/metadata files) and +# * 'pull_request_number' (which would be expanded to the number of the pull +# request from which the artefact originates). +# - The list of supported variables can be shown by running +# `scripts/eessi-upload-to-staging --list-variables`. +# - Examples: +# metadata_prefix = {"eessi.io-2023.06": "new/${github_repository}/${pull_request_number}"} +# artefact_prefix = {"eessi-pilot-2023.06": "", "eessi.io-2023.06": "new/${github_repository}/${pull_request_number}"} +# If left empty, the old/legacy prefix is being used. 
+metadata_prefix = +artefact_prefix = + # settings for signing artefacts with JSON-like format # # "REPO_ID": { "script": "PATH_TO_SIGN_SCRIPT", "key": "PATH_TO_KEY_FILE", "container_runtime": "PATH_TO_CONTAINER_RUNTIME" } @@ -275,52 +306,24 @@ signing = } } -# upload policy: defines what policy is used for uploading built artefacts -# to an S3 bucket -# 'all' ..: upload all artefacts (mulitple uploads of the same artefact possible) -# 'latest': for each build target (eessi-VERSION-{software,init,compat}-OS-ARCH) -# only upload the latest built artefact -# 'once' : only once upload any built artefact for the build target -# 'none' : do not upload any built artefacts -upload_policy = once - -# settings for where (directory) in the S3 bucket to store the metadata file and -# the artefact -# - Can be a string value to always use the same 'prefix' regardless of the target -# CVMFS repository, or can be a mapping of a target repository id (see also -# repo_target_map) to a prefix. -# - The prefix itself can use some (environment) variables that are set within -# the script. Currently those are: -# * 'github_repository' (which would be expanded to the full name of the GitHub -# repository, e.g., 'EESSI/software-layer'), -# * 'legacy_aws_path' (which expands to the legacy/old prefix being used for -# storing artefacts/metadata files) and -# * 'pull_request_number' (which would be expanded to the number of the pull -# request from which the artefact originates). -# - The list of supported variables can be shown by running -# `scripts/eessi-upload-to-staging --list-variables`. -# - Examples: -# metadata_prefix = {"eessi.io-2023.06": "new/${github_repository}/${pull_request_number}"} -# artefact_prefix = {"eessi-pilot-2023.06": "", "eessi.io-2023.06": "new/${github_repository}/${pull_request_number}"} -# If left empty, the old/legacy prefix is being used. 
-metadata_prefix = -artefact_prefix = - [architecturetargets] -# defines both for which architectures the bot will build -# and what submission parameters shall be used -arch_target_map = { "linux/x86_64/generic" : "--constraint shape=c4.2xlarge", "linux/x86_64/amd/zen2": "--constraint shape=c5a.2xlarge" } +# defines for which architectures the bot will build and what job submission +# parameters shall be used to allocate a compute node with the correct CPU microarchitecture +arch_target_map = { + "linux/x86_64/generic": "--partition x86-64-generic-node", + "linux/x86_64/amd/zen2": "--partition x86-64-amd-zen2-node" } [repo_targets] # defines for which repository a arch_target should be build for # -# EESSI/2021.12 and NESSI/2022.11 -repo_target_map = { "linux/x86_64/amd/zen2" : ["eessi-2021.12","nessi.no-2022.11"] } +# EESSI/2023.06 and EESSI/2025.06 +repo_target_map = { + "linux/x86_64/amd/zen2" : ["eessi.io-2023.06-software","eessi.io-2025.06-software"] } # points to definition of repositories (default repository defined by build container) -repos_cfg_dir = PATH_TO_SHARED_DIRECTORY/cfg_bundles +repos_cfg_dir = PATH_TO_SHARED_DIRECTORY/repos # configuration for event handler which receives events from a GitHub repository. @@ -356,7 +359,8 @@ scontrol_command = /usr/bin/scontrol # are removed, the output (in PR comments) will lack important # information. 
[submitted_job_comments] -awaits_release = job id `{job_id}` awaits release by job manager +# awaits_release is no longer used since bot release v0.7.0 +# awaits_release = job id `{job_id}` awaits release by job manager awaits_release_delayed_begin_msg = job id `{job_id}` will be eligible to start in about {delay_seconds} seconds awaits_release_hold_release_msg = job id `{job_id}` awaits release by job manager initial_comment = New job on instance `{app_name}` for CPU micro-architecture `{arch_name}`{accelerator_spec} for repository `{repo_id}` in job dir `{symlink}` @@ -366,6 +370,7 @@ with_accelerator =  and accelerator `{accelerator}` [new_job_comments] awaits_launch = job awaits launch by Slurm scheduler{extra_info} + [running_job_comments] running_job = job `{job_id}` is running @@ -374,6 +379,7 @@ running_job = job `{job_id}` is running job_result_unknown_fmt =
:shrug: UNKNOWN _(click triangle for detailed information)_
  • Job results file `{filename}` does not exist in job directory, or parsing it failed.
  • No artefacts were found/reported.
job_test_unknown_fmt =
:shrug: UNKNOWN _(click triangle for detailed information)_
  • Job test file `{filename}` does not exist in job directory, or parsing it failed.
+ [download_pr_comments] git_clone_failure = Unable to clone the target repository. git_clone_tip = _Tip: This could be a connection failure. Try again and if the issue remains check if the address is correct_. @@ -386,6 +392,7 @@ git_apply_tip = _Tip: This can usually be resolved by syncing your branch and re pr_diff_failure = Unable to obtain PR diff. pr_diff_tip = _Tip: This could be a problem with SSH access to the repository._ + [clean_up] trash_bin_dir = $HOME/trash_bin moved_job_dirs_comment = PR merged! Moved `{job_dirs}` to `{trash_bin_dir}` From 05786f3af80dcd37a3ccbacecd8cab21a1b50dba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 27 Jun 2025 21:12:13 +0200 Subject: [PATCH 041/132] apply suggestions from code review Co-authored-by: Pedro Santos Neves <10762799+Neves-P@users.noreply.github.com> --- README.md | 13 +++++++------ app.cfg.example | 8 ++++---- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index a76c6724..19f0fe11 100644 --- a/README.md +++ b/README.md @@ -89,7 +89,7 @@ and then running it with node_modules/smee-client/bin/smee.js --url https://smee.io/CHANNEL-ID ``` -Another port can be used by adding the `--port PORT` argument, for example, +Another port can be used by adding the `--port PORT` argument. This can be particularly useful if you have multiple bot instances running on the same cluster, in which case you'd want a different port for each. As an example, one could use the non-default port 3030 in this way: ```bash node_modules/smee-client/bin/smee.js --url https://smee.io/CHANNEL-ID --port 3030 @@ -158,7 +158,7 @@ Go to [https://github.com/settings/apps/**APP_NAME**](https://github.com/setting On the next page you should see a list of accounts and organisations you can install the app on. Choose one and click on the Install button next to it. -This leads to a page where you can select the repositories on whose the app should react to. 
Here, for the sake of simplicity, choose "Only select repositories", then open the pull-down menu named "Select repositories" and in there select `GH_ACCOUNT/software-layer` (`GH_ACCOUNT` is the GitHub account mentioned in section [prerequisites](#prerequisites)). Finally, click on the Install button. +This leads to a page where you can select the repositories where the app should react to. Here, for the sake of simplicity, choose "Only select repositories", then open the pull-down menu named "Select repositories" and in there select `GH_ACCOUNT/software-layer` (`GH_ACCOUNT` is the GitHub account mentioned in section [prerequisites](#prerequisites)). Finally, click on the Install button. ## Step 4: Installing the EESSI bot on a `bot machine` @@ -326,7 +326,7 @@ Open the page [https://github.com/settings/apps](https://github.com/settings/app Near the end of the page you will find a section **Private keys** where you can create a private key by clicking on the button Generate a private key. -The private key should be automatically downloaded to your system. Copy it to the `bot machine` and note the full path to it (`PATH_TO_PRIVATE_KEY`). +The private key should be automatically downloaded to your system. Copy it to the `bot machine` and note the full path to it (`PATH_TO_PRIVATE_KEY`). Also note down the day when the key was generated. The keys should be rotated every 6 months. ### Step 5.4: Create the configuration file `app.cfg` @@ -362,7 +362,7 @@ installation_id = 12345678 Replace '`12345678`' with the id of the _installation_ of your GitHub App (see [Step 3](#step3)). -You find the installation id of your GitHub App via the page [Applications](https://github.com/settings/installations). On this page, select the app you have registered in [Step 2](#step2) by clicking on the Configure button. The installation id is shown as the URL of the page. +You find the installation id of your GitHub App via the page [Applications](https://github.com/settings/installations). 
On this page, select the app you have registered in [Step 2](#step2) by clicking on the Configure button. The installation id is shown as the number after the last `/` of the page's URL. The `installation_id` is also provided in the payload of every event within the top-level record named "`installation`". You can see the events and their payload on the webpage of your Smee.io channel (`https://smee.io/CHANNEL-ID`). Alternatively, you can see the events in the **Advanced** section of your GitHub App: open the [GitHub Apps](https://github.com/settings/apps) page, select the app you have registered in [Step 2](#step2), and choose **Advanced** in the menu on the left-hand side. @@ -754,8 +754,9 @@ for signing. The bot calls the script with the two arguments: > (only readable+writable by the file owner) or the signing will likely fail. > [!NOTE] > Wrt to the JSON-like format, make sure commas are only used for separating elements -> or parsing/loading the json will likely fail. Also, the whole value should start -> at a new line and be indented as shown above. +> and that there is no trailing comma on the last element, or parsing/loading the json +> will likely fail. Also, the whole value should start a new line and be indented as shown +> above. > [!NOTE] > As shown in the example, use double quotes for all keys and values. diff --git a/app.cfg.example b/app.cfg.example index e1972026..c6bb2c69 100644 --- a/app.cfg.example +++ b/app.cfg.example @@ -273,7 +273,7 @@ artefact_prefix = # "REPO_ID": { "script": "PATH_TO_SIGN_SCRIPT", "key": "PATH_TO_KEY_FILE", "container_runtime": "PATH_TO_CONTAINER_RUNTIME" } # # REPO_ID is the repository ID. Repository IDs are defined in a file `repos.cfg` -# (see setting `repos_cfg_dir`). +# and _must_ match it (see setting `repos_cfg_dir`). # # If PATH_TO_SIGN_SCRIPT is a relative path, the script must reside in the # checked out pull request of the target repository (e.g., @@ -289,9 +289,9 @@ artefact_prefix = # attribute below. 
Currently, only Singularity or Apptainer are supported. # NOTE (on the private key file), make sure the file permissions are restricted to `0600` # (only readable+writable by the file owner) or the signing will likely fail. -# NOTE (on json format), make sure no trailing commas are used after any elements -# or parsing/loading the json will likely fail. Also, the whole value should start -# at a new line and be indented as shown below. +# NOTE (on json format), make sure trailing commas are used after any elements +# except for the last one or parsing/loading the json will likely fail. Also, the +# whole value should start a new line and be indented as shown below. # NOTE (on the JSON-like format), make sure commas are only used for separating elements # or parsing/loading the json will likely fail. Also, the whole value should start # at a new line and be indented as shown below. From e3041c7baaf6543a5223cca71ebe874786dc7de3 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Fri, 27 Jun 2025 21:34:41 +0200 Subject: [PATCH 042/132] addressing more suggested changes --- README.md | 18 +++++++++++++++--- app.cfg.example | 3 --- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 19f0fe11..201d8174 100644 --- a/README.md +++ b/README.md @@ -352,7 +352,16 @@ Replace '`123456`' with the id of your GitHub App. You can find the id of your G app_name = 'MY-bot' ``` -The `app_name` specifies a short name for your bot. It will appear in comments to a pull request. For example, it could include the name of the cluster where the bot runs and a label representing the user that runs the bot, like `hal9000-bot`. The name will be used when signing files uploaded to an S3 bucket. Thus, the name has to be the same that is used as value for `namespaces` in the `allowed_signers` file used during the ingestion procedure (see [https://github.com/EESSI/filesystem-layer](https://github.com/EESSI/filesystem-layer)). 
+The `app_name` specifies a short name for your bot. It will appear in comments to +a pull request. For example, it could include the name of the cluster where the +bot runs and a label representing the user that runs the bot, like `hal9000-bot`. +The name will be used when signing files uploaded to an S3 bucket. Thus, the name +has to be the same that is used as value for `namespaces` in the +`allowed_signers` file used during the ingestion procedure (see +[https://github.com/EESSI/filesystem-layer](https://github.com/EESSI/filesystem-layer)). +The file `allowed_signers` is provided by another (private) repository. More +information on its content can be obtained from the manual page for `ssh-keygen` +or from the sign script which is available as `scripts/sign_verify_file_ssh.sh`. _Note: avoid putting an actual username here as it will be visible on potentially publicly accessible GitHub pages._ @@ -540,12 +549,15 @@ submit_command = /usr/bin/sbatch `submit_command` is the full path to the Slurm job submission command used for submitting batch jobs. You may want to verify if `sbatch` is provided at that path or determine its actual location (using `which sbatch`). ```ini -build_permission = GH_ACCOUNT GH_ACCOUNT_2 -NOT_ALLOWED_GH_ACCOUNT_NAME- ... +build_permission = -NOT_ALLOWED_GH_ACCOUNT_NAME- [...] ``` `build_permission` defines which GitHub accounts have the permission to trigger build jobs, i.e., for which accounts the bot acts on `bot: build ...` commands. -If the value is left empty, everyone can trigger build jobs. +If the value is left empty, everyone can trigger build jobs. The string +`-NOT_ALLOWED_GH_ACCOUNT_NAME-` in the example above is not an allowed account +name on GitHub. Thus, one could not - by accident - give build permissions to an +unknown account. ```ini no_build_permission_comment = The `bot: build ...` command has been used by user `{build_labeler}`, but this person does not have permission to trigger builds. 
diff --git a/app.cfg.example b/app.cfg.example index c6bb2c69..63905832 100644 --- a/app.cfg.example +++ b/app.cfg.example @@ -289,9 +289,6 @@ artefact_prefix = # attribute below. Currently, only Singularity or Apptainer are supported. # NOTE (on the private key file), make sure the file permissions are restricted to `0600` # (only readable+writable by the file owner) or the signing will likely fail. -# NOTE (on json format), make sure trailing commas are used after any elements -# except for the last one or parsing/loading the json will likely fail. Also, the -# whole value should start a new line and be indented as shown below. # NOTE (on the JSON-like format), make sure commas are only used for separating elements # or parsing/loading the json will likely fail. Also, the whole value should start # at a new line and be indented as shown below. From baa16feea32b1cb632b47a956e56b192d91a0dbf Mon Sep 17 00:00:00 2001 From: Pedro Santos Neves <10762799+Neves-P@users.noreply.github.com> Date: Sat, 28 Jun 2025 12:07:29 +0200 Subject: [PATCH 043/132] Remove trailing white spaces Keep linter happy --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 201d8174..cc4d0ff5 100644 --- a/README.md +++ b/README.md @@ -766,8 +766,8 @@ for signing. The bot calls the script with the two arguments: > (only readable+writable by the file owner) or the signing will likely fail. > [!NOTE] > Wrt to the JSON-like format, make sure commas are only used for separating elements -> and that there is no trailing comma on the last element, or parsing/loading the json -> will likely fail. Also, the whole value should start a new line and be indented as shown +> and that there is no trailing comma on the last element, or parsing/loading the json +> will likely fail. Also, the whole value should start a new line and be indented as shown > above. > [!NOTE] > As shown in the example, use double quotes for all keys and values. 
From b54fdf4d19695d4a7052d29a9e289b5a2fa6f71e Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 30 Jun 2025 13:57:19 +0200 Subject: [PATCH 044/132] Fix some small issues with non-existing keys --- tasks/build.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tasks/build.py b/tasks/build.py index 6fdf2fac..168e3246 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -638,10 +638,14 @@ def prepare_jobs(pr, cfg, event_info, action_filter): # }, # 'virtual_partition_name2': { # ... etc + # DEBUG LOGGING + log(f"arch_map: {arch_map})") for virtual_partition_name, partition_info in arch_map.items(): + # DEBUG LOGGING + log(f"virtual_partition_name: {virtual_partition_name}, partition_info={partition_info}") # Unpack for convenience arch_dir = partition_info['cpu_subdir'] - if partition_info['accel']: + if 'accel' in partition_info: # Use the accelerator as defined by the action_filter. We check if this is valid for the current # virtual partition later arch_dir += accelerator @@ -671,7 +675,7 @@ def prepare_jobs(pr, cfg, event_info, action_filter): } # Optionally add accelerator to the context check = False - if partition_info['accel']: + if 'accel' in partition_info: # Create a context for each accelerator, check if _any_ of them is valid # (one is enough to continue) for accel in partition_info['accel']: @@ -708,7 +712,7 @@ def prepare_jobs(pr, cfg, event_info, action_filter): ) comment_download_pr(base_repo_name, pr, download_pr_exit_code, download_pr_error, error_stage) # prepare job configuration file 'job.cfg' in directory /cfg - log(f"{fn}(): arch = '{arch}' => cpu_target = '{partition_info['cpu_subdir']}' , " + log(f"{fn}(): virtual partition = '{virtual_partition_name}' => cpu_target = '{partition_info['cpu_subdir']}' , " f"os_type = '{partition_info['os']}', accelerator = '{accelerator}'") prepare_job_cfg(job_dir, build_env_cfg, repocfg, repo_id, partition_info['cpu_subdir'], From 1e6c3d9f2b2eee991b27e1e8c8c7199ce27e7912
Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 30 Jun 2025 17:32:24 +0200 Subject: [PATCH 045/132] Avoid doing string += None for the arch_dir if accelerator = None. Also, make sure to print the same information to the logs, regardless of whether this is about a partition that has accelerators defined or not. Finally, make sure that if we have not hit a match by the end of the loop over all accelerators, we continue to the next iteration of the loop over the repo-targets, so that the job-dir preparation is skipped for the current one --- tasks/build.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/tasks/build.py b/tasks/build.py index 168e3246..757423bd 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -645,7 +645,7 @@ def prepare_jobs(pr, cfg, event_info, action_filter): log(f"virtual_partition_name: {virtual_partition_name}, partition_info={partition_info}") # Unpack for convenience arch_dir = partition_info['cpu_subdir'] - if 'accel' in partition_info: + if 'accel' in partition_info and accelerator is not None: # Use the accelerator as defined by the action_filter. 
We check if this is valid for the current # virtual partition later arch_dir += accelerator @@ -676,14 +676,26 @@ def prepare_jobs(pr, cfg, event_info, action_filter): # Optionally add accelerator to the context check = False if 'accel' in partition_info: - # Create a context for each accelerator, check if _any_ of them is valid - # (one is enough to continue) + match = False + # Create a context for each accelerator defined in app.cfg, then + # check if _any_ of them is valid (one is enough to continue) for accel in partition_info['accel']: context['accelerator'] = accel log(f"{fn}(): context is '{json.dumps(context, indent=4)}'") - check = check | action_filter.check_filters(context) - if not check: - log(f"{fn}(): none of the contexts satisfy filter(s), skipping") + # TODO: it seems the check_filters does not enforce the accelerator to be present in the context - that should be implemented + if not action_filter.check_filters(context): + log(f"{fn}(): context does NOT satisfy filter(s), skipping") + continue + # check = check | action_filter.check_filters(context) + else: + log(f"{fn}(): context DOES satisfy filter(s), going on with job") + match = True + # Break as soon as we have found a valid context, it means the build args are valid + # for at least one of the accelerators in this virtual partition, that's enough + break + # If we get to this point, and none of the contexts matched the filter, we should continue to the + # next iteration of the partition_info['repo_targets'] loop + if not match: continue else: log(f"{fn}(): context is '{json.dumps(context, indent=4)}'") @@ -713,7 +725,7 @@ def prepare_jobs(pr, cfg, event_info, action_filter): comment_download_pr(base_repo_name, pr, download_pr_exit_code, download_pr_error, error_stage) # prepare job configuration file 'job.cfg' in directory /cfg log(f"{fn}(): virtual partition = '{virtual_partition_name}' => cpu_target = '{partition_info['cpu_subdir']}' , " - f"os_type = '{partition_info['os']}', 
accelerator = '{accelerator}'") + f"os_type = '{partition_info['os']}', requested accelerator = '{accelerator}'") prepare_job_cfg(job_dir, build_env_cfg, repocfg, repo_id, partition_info['cpu_subdir'], partition_info['os'], accelerator) From e000c91768dbdde73e0025985edf21cf0b2a9d70 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 30 Jun 2025 17:33:54 +0200 Subject: [PATCH 046/132] Make sure that if the context (i.e. app.cfg) defines AN accelerator, AND if the build command does NOT, we explicitely check that the defined accelerator is 'None'. This allows skipping CPU-only builds on accelerated partitions. --- tools/filter.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tools/filter.py b/tools/filter.py index 0caa2af8..9665c7c5 100644 --- a/tools/filter.py +++ b/tools/filter.py @@ -303,4 +303,16 @@ def check_filters(self, context): else: check = False break + + # If the context declares an accelerator, but the build command did not (i.e. no action filter is defined for the accelerator component) + # then the check should only return True if "None" was the accelerator defined in the context + # This ensures that on partitions that no CPU-only builds are done on accelerated partitions, unless these partitions are + # explicitly configured with "None" as _one_ of the valid accelerators in their `accel:` list in app.cfg + if ( + FILTER_COMPONENT_ACCEL in context and not + any(af.component == FILTER_COMPONENT_ACCEL for af in self.action_filters) + ): + if not context[FILTER_COMPONENT_ACCEL] == "None": + check = False + return check From 40ceefe938a0f831f7122c0c06865ee69e16626b Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 30 Jun 2025 17:35:07 +0200 Subject: [PATCH 047/132] Some cleanup --- tasks/build.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tasks/build.py b/tasks/build.py index 757423bd..30cfcabc 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -638,11 +638,8 @@ def prepare_jobs(pr, cfg, 
event_info, action_filter): # }, # 'virtual_partition_name2': { # ... etc - # DEBUG LOGGING - log(f"arch_map: {arch_map})") for virtual_partition_name, partition_info in arch_map.items(): - # DEBUG LOGGING - log(f"virtual_partition_name: {virtual_partition_name}, partition_info={partition_info}") + log(f"{fn}(): virtual_partition_name is {virtual_partition_name}, partition_info is {partition_info}") # Unpack for convenience arch_dir = partition_info['cpu_subdir'] if 'accel' in partition_info and accelerator is not None: @@ -682,7 +679,6 @@ def prepare_jobs(pr, cfg, event_info, action_filter): for accel in partition_info['accel']: context['accelerator'] = accel log(f"{fn}(): context is '{json.dumps(context, indent=4)}'") - # TODO: it seems the check_filters does not enforce the accelerator to be present in the context - that should be implemented if not action_filter.check_filters(context): log(f"{fn}(): context does NOT satisfy filter(s), skipping") continue From fad4f47babc490cb8e8711ec565f993e22ae8931 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 1 Jul 2025 16:54:50 +0200 Subject: [PATCH 048/132] Remove repo_target_map from config, and all occurences that import it, as it is now replaced by the repo list in the arch_target_map. Also, fix hound issues --- app.cfg.example | 79 +++++++++++++++++++++++++++++++++----- eessi_bot_event_handler.py | 26 ++++++------- tasks/build.py | 37 +++++++----------- tools/config.py | 1 - tools/filter.py | 9 +++-- 5 files changed, 98 insertions(+), 54 deletions(-) diff --git a/app.cfg.example b/app.cfg.example index 63905832..bf7911b5 100644 --- a/app.cfg.example +++ b/app.cfg.example @@ -306,19 +306,78 @@ signing = [architecturetargets] # defines for which architectures the bot will build and what job submission -# parameters shall be used to allocate a compute node with the correct +# parameters shall be used to allocate a compute node with the correct +# The keys of the arch_target_map are virtual partition names. 
They don't have any meaning in the bot code, +# and can thus be chosen as desired. +# Note that you are responsible that ANY bot:build command ONLY matches a single virtual partition! +# If multiple partitions match the same bot:build command, a failure will be triggered in the job dir preparation arch_target_map = { - "linux/x86_64/generic": "--partition x86-64-generic-node", - "linux/x86_64/amd/zen2": "--partition x86-64-amd-zen2-node" } - + # This is a CPU-based partition. We do not specify an "accel" property explicitly. In this case, invoking the bot + # with ANY accelerator command will trigger a build on this partition, as long as the CPU type matches. E.g. + # bot: build instance:xyz repo:eessi.io-2023.06-software arch:zen2 accel:nvidia/cc90 + # will cause the event_filter to mark this virtual partition as a valid match, and will use it to start building + # for zen2 + nvidia/cc90. Thus, by not specifying an "accel" property, this partition may be used for + # cross-compilation for any accelerator. + "cpu_zen2": { + "os": "linux", + "cpu_subdir": "x86_64/amd/zen2", + "slurm_params": "-p rome --nodes 1 --ntasks-per-node 16 --cpus-per-task 1", + "repo_targets": ["eessi.io-2023.06-compat","eessi.io-2023.06-software"] + }, + # This is a CPU partition. We specify an explicit "accel": "None" property. Thus, this partition will only be + # used if the bot build command does NOT contain an accel argument, e.g. + # bot: build instance:xyz repo:eessi.io-2023.06-software arch:zen4 + # will cause the event_filter to mark this virtual partition as a valid match, and will use it to start building + # for zen4. + # When invoking the bot with an accelerator command, such as + # bot: build instance:xyz repo:eessi.io-2023.06-software arch:zen4 accel:nvidia/cc90 + # the event_filter will NOT mark this virtual partition as a valid match. 
This is intentional, as this particular + # (example) cluster has a native zen4+cc90 partition (gpu_h100) and we want this command to trigger a native build + # on that partition, rather than cross-compiling on this cpu_zen4 partition. + # One could still allow cross-compilation for other accelerator architectures, e.g. cc70 and cc80 by defining + # "accel": ["nvidia/cc70", "nvidia/cc80"] + "cpu_zen4": { + "os": "linux", + "cpu_subdir": "x86_64/amd/zen4", + "accel": ["None"], + "slurm_params": "-p genoa --nodes 1 --ntasks-per-node 24 --cpus-per-task 1", + "repo_targets": ["eessi.io-2023.06-compat","eessi.io-2023.06-software"] + }, + # This is a GPU partition. We specify an explicit "accel" property. Thus, only if the bot build command + # specifies that explicit accelerator in combination with the relevant CPU type, + # bot: build instance:xyz repo:eessi.io-2023.06-software arch:icelake accel:nvidia/cc80 + # will a build be triggered on this partition + # If you want to use this partition also for CPU only builds, you can alter the "accel" property to + # "accel": ["None", "nvidia/cc80"] + "gpu_a100": { + "os": "linux", + "cpu_subdir": "x86_64/intel/icelake", + "accel": ["nvidia/cc80"], + "slurm_params": "-p gpu_a100 --nodes 1 --tasks-per-node 18 --cpus-per-task 1 --gpus-per-node 1", + "repo_targets": ["eessi.io-2023.06-compat","eessi.io-2023.06-software"] + }, + # This is a GPU partition. We specify an explicit "accel" property. 
Thus, only if the bot build command + # specifies that explicit accelerator in combination with the relevant CPU type, + # bot: build instance:xyz repo:eessi.io-2023.06-software arch:zen4 accel:nvidia/cc90 + # will a build be triggered on this partition + # If you want to use this partition also for cross-compiling for cc70 and cc80 architectures, you can alter + # the "accel" property to + # "accel": ["nvidia/cc70", "nvidia/cc80", "nvidia/cc90"] + # Note that setting: + # "accel": ["None", "nvidia/cc90"] + # is invalid here, since it would lead to both the cpu_zen4 and the gpu_h100 partitions matching the build command + # bot: build instance:xyz repo:eessi.io-2023.06-software arch:zen4 accel:nvidia/cc90 + # This would cause the same job dir to be prepared twice, for different virtual partitions, which will lead + # to an error in the job preparation step + "gpu_h100": { + "os": "linux", + "cpu_subdir": "x86_64/amd/zen4", + "accel": ["nvidia/cc90"], + "slurm_params": "-p gpu_h100 --nodes 1 --tasks-per-node 16 --cpus-per-task 1 --gpus-per-node 1", + "repo_targets": ["eessi.io-2023.06-compat","eessi.io-2023.06-software"] + }} [repo_targets] -# defines for which repository a arch_target should be build for -# -# EESSI/2023.06 and EESSI/2025.06 -repo_target_map = { - "linux/x86_64/amd/zen2" : ["eessi.io-2023.06-software","eessi.io-2025.06-software"] } - # points to definition of repositories (default repository defined by build container) repos_cfg_dir = PATH_TO_SHARED_DIRECTORY/repos diff --git a/eessi_bot_event_handler.py b/eessi_bot_event_handler.py index b1d4123a..df777914 100644 --- a/eessi_bot_event_handler.py +++ b/eessi_bot_event_handler.py @@ -104,7 +104,6 @@ config.SECTION_JOB_MANAGER: [ config.JOB_MANAGER_SETTING_POLL_INTERVAL], # required config.SECTION_REPO_TARGETS: [ - config.REPO_TARGETS_SETTING_REPO_TARGET_MAP, # required config.REPO_TARGETS_SETTING_REPOS_CFG_DIR], # required config.SECTION_SUBMITTED_JOB_COMMENTS: [ 
config.SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT, # required @@ -412,22 +411,19 @@ def handle_pull_request_opened_event(self, event_info, pr, req_chatlevel=ChatLev # TODO check if PR already has a comment with arch targets and # repositories arch_map = get_architecture_targets(self.cfg) - repo_cfg = get_repo_cfg(self.cfg) comment = f"Instance `{app_name}` is configured to build for:" - architectures = ['/'.join(arch.split('/')[1:]) for arch in arch_map.keys()] - comment += "\n- architectures: " - if len(architectures) > 0: - comment += f"{', '.join([f'`{arch}`' for arch in architectures])}" - else: - comment += "none" - repositories = list(set([repo_id for repo_ids in repo_cfg[config.REPO_TARGETS_SETTING_REPO_TARGET_MAP].values() - for repo_id in repo_ids])) - comment += "\n- repositories: " - if len(repositories) > 0: - comment += f"{', '.join([f'`{repo_id}`' for repo_id in repositories])}" - else: - comment += "none" + for partition_num, arch in enumerate(arch_map): + comment += f"\n- partition {partition_num+1}:" + if "os" in arch: + comment += f"\n - os: {arch[os]}" + if "cpu_subdir" in arch: + comment += f"\n - architecture: {arch[cpu_subdir]}" + if "repo_targets" in arch: + comment += f"\n - repositories: {arch[repo_targets]}" + if "accel" in arch: + comment += f"\n - accelerators: {arch[accel]}" + comment += "\n" self.log(f"PR opened: comment '{comment}'") diff --git a/tasks/build.py b/tasks/build.py index 30cfcabc..bc5e5fb3 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -241,8 +241,6 @@ def get_repo_cfg(cfg): Returns: (dict): dictionary containing repository settings as follows - {config.REPO_TARGETS_SETTING_REPOS_CFG_DIR: path to repository config directory as defined in 'app.cfg'} - - {config.REPO_TARGETS_SETTING_REPO_TARGET_MAP: json of - config.REPO_TARGETS_SETTING_REPO_TARGET_MAP value as defined in 'app.cfg'} - for all sections [repo_id] defined in config.REPO_TARGETS_SETTING_REPOS_CFG_DIR/repos.cfg add a mapping {repo_id: dictionary 
containing settings of that section} """ @@ -259,21 +257,6 @@ def get_repo_cfg(cfg): settings_repos_cfg_dir = config.REPO_TARGETS_SETTING_REPOS_CFG_DIR repo_cfg[settings_repos_cfg_dir] = repo_cfg_org.get(settings_repos_cfg_dir, None) - repo_map = {} - try: - repo_map_str = repo_cfg_org.get(config.REPO_TARGETS_SETTING_REPO_TARGET_MAP) - log(f"{fn}(): repo_map '{repo_map_str}'") - - if repo_map_str is not None: - repo_map = json.loads(repo_map_str) - - log(f"{fn}(): repo_map '{json.dumps(repo_map)}'") - except json.JSONDecodeError as err: - print(err) - error(f"{fn}(): Value for repo_map ({repo_map_str}) could not be decoded.") - - repo_cfg[config.REPO_TARGETS_SETTING_REPO_TARGET_MAP] = repo_map - if repo_cfg[config.REPO_TARGETS_SETTING_REPOS_CFG_DIR] is None: return repo_cfg @@ -627,12 +610,14 @@ def prepare_jobs(pr, cfg, event_info, action_filter): return [] jobs = [] - # This loop assumes the following structure for arch_target_map: + # This loop assumes the following structure for arch_target_map + # Note that 'accel' is a list, to easily allow a single CPU partition to be used for cross compilation + # for a lot of accelerator targets # arch_target_map = { # 'virtual_partition_name': { # 'os': 'linux', # 'cpu_subdir': 'x86_64/amd/zen4', - # 'accel': ['nvidia/cc90'], # Make this a list, so that we can easily cross compile for a large list with one defined virtual partition + # 'accel': ['nvidia/cc90'], # 'slurm_params': '-p genoa ', # 'repo_targets': ["eessi.io-2023.06-compat","eessi.io-2023.06-software"], # }, @@ -648,9 +633,9 @@ def prepare_jobs(pr, cfg, event_info, action_filter): arch_dir += accelerator arch_dir.replace('/', '_') # check if repo_targets is defined for this virtual partition - if not 'repo_targets' in partition_info: + if 'repo_targets' not in partition_info: log(f"{fn}(): skipping arch {virtual_partition_name}, " - "because no repo_targets were defined for this (virtual) partition") + "because no repo_targets were defined for this (virtual) 
partition") continue for repo_id in partition_info['repo_targets']: # ensure repocfg contains information about the repository repo_id if repo_id != EESSI @@ -671,7 +656,6 @@ def prepare_jobs(pr, cfg, event_info, action_filter): "instance": app_name } # Optionally add accelerator to the context - check = False if 'accel' in partition_info: match = False # Create a context for each accelerator defined in app.cfg, then @@ -720,8 +704,13 @@ def prepare_jobs(pr, cfg, event_info, action_filter): ) comment_download_pr(base_repo_name, pr, download_pr_exit_code, download_pr_error, error_stage) # prepare job configuration file 'job.cfg' in directory /cfg - log(f"{fn}(): virtual partition = '{virtual_partition_name}' => cpu_target = '{partition_info['cpu_subdir']}' , " - f"os_type = '{partition_info['os']}', requested accelerator = '{accelerator}'") + msg = f"{fn}(): virtual partition = '{virtual_partition_name}' => " + msg += f"configured cpu_target = '{partition_info['cpu_subdir']}' , " + msg += f"configured os = '{partition_info['os']}', " + if 'accel' in partition_info: + msg += f"configured accelerator(s) = '{partition_info['accel']}, " + msg += f"requested accelerator = '{accelerator}'" + log(msg) prepare_job_cfg(job_dir, build_env_cfg, repocfg, repo_id, partition_info['cpu_subdir'], partition_info['os'], accelerator) diff --git a/tools/config.py b/tools/config.py index fc790d56..d0ec6498 100644 --- a/tools/config.py +++ b/tools/config.py @@ -110,7 +110,6 @@ NEW_JOB_COMMENTS_SETTING_AWAITS_LAUNCH = 'awaits_launch' SECTION_REPO_TARGETS = 'repo_targets' -REPO_TARGETS_SETTING_REPO_TARGET_MAP = 'repo_target_map' REPO_TARGETS_SETTING_REPOS_CFG_DIR = 'repos_cfg_dir' SECTION_RUNNING_JOB_COMMENTS = 'running_job_comments' diff --git a/tools/filter.py b/tools/filter.py index 9665c7c5..2c4146bb 100644 --- a/tools/filter.py +++ b/tools/filter.py @@ -304,10 +304,11 @@ def check_filters(self, context): check = False break - # If the context declares an accelerator, but the build 
command did not (i.e. no action filter is defined for the accelerator component) - # then the check should only return True if "None" was the accelerator defined in the context - # This ensures that on partitions that no CPU-only builds are done on accelerated partitions, unless these partitions are - # explicitly configured with "None" as _one_ of the valid accelerators in their `accel:` list in app.cfg + # If the context declares an accelerator, but the build command did not (i.e. no action filter is defined + # for the accelerator component) then the check should only return True if "None" was the accelerator defined + # in the context. This ensures that no CPU-only builds are done on accelerated partitions, unless these + # partitions are explicitly configured with "None" as _one_ of the valid accelerators in their `accel:` list + # in app.cfg if ( FILTER_COMPONENT_ACCEL in context and not any(af.component == FILTER_COMPONENT_ACCEL for af in self.action_filters) From b4032a6becd6596a5377ca5049b1bc8dc40b1e5c Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 1 Jul 2025 16:56:42 +0200 Subject: [PATCH 049/132] Fix quotation of keys --- eessi_bot_event_handler.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/eessi_bot_event_handler.py b/eessi_bot_event_handler.py index df777914..78bb8e0c 100644 --- a/eessi_bot_event_handler.py +++ b/eessi_bot_event_handler.py @@ -416,13 +416,13 @@ def handle_pull_request_opened_event(self, event_info, pr, req_chatlevel=ChatLev for partition_num, arch in enumerate(arch_map): comment += f"\n- partition {partition_num+1}:" if "os" in arch: - comment += f"\n - os: {arch[os]}" + comment += f"\n - os: {arch['os']}" if "cpu_subdir" in arch: - comment += f"\n - architecture: {arch[cpu_subdir]}" + comment += f"\n - architecture: {arch['cpu_subdir']}" if "repo_targets" in arch: - comment += f"\n - repositories: {arch[repo_targets]}" + comment += f"\n - repositories: {arch['repo_targets']}" if "accel" in 
arch: - comment += f"\n - accelerators: {arch[accel]}" + comment += f"\n - accelerators: {arch['accel']}" comment += "\n" self.log(f"PR opened: comment '{comment}'") From 3898df7a4fb25dbc45d064907bff6390ff4dde40 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 1 Jul 2025 16:58:00 +0200 Subject: [PATCH 050/132] Fix flake8 issue --- eessi_bot_event_handler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/eessi_bot_event_handler.py b/eessi_bot_event_handler.py index 78bb8e0c..10767c99 100644 --- a/eessi_bot_event_handler.py +++ b/eessi_bot_event_handler.py @@ -29,8 +29,8 @@ # Local application imports (anything from EESSI/eessi-bot-software-layer) from connections import github -from tasks.build import check_build_permission, get_architecture_targets, get_repo_cfg, \ - request_bot_build_issue_comments, submit_build_jobs +from tasks.build import check_build_permission, get_architecture_targets, request_bot_build_issue_comments, \ + submit_build_jobs from tasks.deploy import deploy_built_artefacts, determine_job_dirs from tasks.clean_up import move_to_trash_bin from tools import config From ef0c43045ac2ef8f2b73e20be254f87ddd7e2e5a Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 1 Jul 2025 17:10:10 +0200 Subject: [PATCH 051/132] Unpack the actual arch_target_map by accessing it with a key to get to individual partitions when printing the config --- eessi_bot_event_handler.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/eessi_bot_event_handler.py b/eessi_bot_event_handler.py index 10767c99..dfefe5df 100644 --- a/eessi_bot_event_handler.py +++ b/eessi_bot_event_handler.py @@ -414,15 +414,18 @@ def handle_pull_request_opened_event(self, event_info, pr, req_chatlevel=ChatLev comment = f"Instance `{app_name}` is configured to build for:" for partition_num, arch in enumerate(arch_map): - comment += f"\n- partition {partition_num+1}:" - if "os" in arch: - comment += f"\n - os: 
{arch['os']}" - if "cpu_subdir" in arch: - comment += f"\n - architecture: {arch['cpu_subdir']}" - if "repo_targets" in arch: - comment += f"\n - repositories: {arch['repo_targets']}" - if "accel" in arch: - comment += f"\n - accelerators: {arch['accel']}" + # Do not print virtual partition names, a bot admin may not want to share those + # Instead, just number them + comment += f"\n- Partition {partition_num+1}:" + current_partition = arch_map[arch] + if "os" in current_partition: + comment += f"\n - OS: {current_partition['os']}" + if "cpu_subdir" in current_partition: + comment += f"\n - CPU architecture: {current_partition['cpu_subdir']}" + if "repo_targets" in current_partition: + comment += f"\n - Repositories: {current_partition['repo_targets']}" + if "accel" in current_partition: + comment += f"\n - Accelerators: {current_partition['accel']}" comment += "\n" self.log(f"PR opened: comment '{comment}'") From e3df69044a3eff188da8b3a8c67dfa08b0500bc7 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 8 Jul 2025 15:40:34 +0200 Subject: [PATCH 052/132] Fix mistake in build path --- tasks/build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/build.py b/tasks/build.py index bc5e5fb3..3031901e 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -630,7 +630,7 @@ def prepare_jobs(pr, cfg, event_info, action_filter): if 'accel' in partition_info and accelerator is not None: # Use the accelerator as defined by the action_filter. 
We check if this is valid for the current # virtual partition later - arch_dir += accelerator + arch_dir += f"/{accelerator}" arch_dir.replace('/', '_') # check if repo_targets is defined for this virtual partition if 'repo_targets' not in partition_info: From 424a0012309152c66bfbb5506db7622284d9aba2 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 9 Jul 2025 14:46:42 +0200 Subject: [PATCH 053/132] Parse on: and for: options, and pass the correct values on to the command filters --- tools/commands.py | 74 ++++++++++++++++++++++++++++++++++++++++------- tools/filter.py | 11 +++---- 2 files changed, 67 insertions(+), 18 deletions(-) diff --git a/tools/commands.py b/tools/commands.py index b842cc19..18f3f228 100644 --- a/tools/commands.py +++ b/tools/commands.py @@ -85,19 +85,71 @@ def __init__(self, cmd_str): """ # TODO add function name to log messages cmd_as_list = cmd_str.split() - self.command = cmd_as_list[0] + self.command = cmd_as_list[0] # E.g. 'build' or 'help' + # TODO always init self.action_filters with empty EESSIBotActionFilter? 
if len(cmd_as_list) > 1: - arg_str = " ".join(cmd_as_list[1:]) - try: - self.action_filters = EESSIBotActionFilter(arg_str) - except EESSIBotActionFilterError as err: - log(f"ERROR: EESSIBotActionFilterError - {err.args}") - self.action_filters = None - raise EESSIBotCommandError("invalid action filter") - except Exception as err: - log(f"Unexpected err={err}, type(err)={type(err)}") - raise + + # Extract arguments for the action filters + # By default, everything that follows the 'on:' argument (until the next space) is + # considered part of the argument list for the action filters + target_args = [] + other_filter_args = [] + on_found = False + for arg in cmd_as_list[1:]: + if arg.startswith('on:'): + on_found = True + # Extract everything after 'on:' and split by comma + filter_content = arg[3:] # Remove 'on:' prefix + target_args.extend(filter_content.split(',')) + elif not arg.startswith('off:'): + # Anything that is not 'on:' or 'for:' should just be passed on as normal + # No further parsing of the value is needed + other_filter_args.extend([arg]) + + # If no 'on:' is found in the argument list, everything that follows the 'for:' argument + # (until the next space) is considered the argument list for the action filters + # Essentially, this represents a native build, i.e. the hardware we build on should be the + # hardware we build on + if not on_found: + for arg in cmd_as_list[1:]: + if arg.startswith('for:'): + # Extract everything after the 'for:' suffix and split by comma + filter_content=arg[4:] + target_args.extend(filter_content.split(',')) + + # Join the filter arguments and pass to EESSIBotActionFilter + # At this point, target_args is e.g. ["arch=amd/zen2","accel=nvidia/cc90"] + # But EESSIBotActionFilter expects e.g. 
"arch:amd/zen2 accel:nvidia/cc90" + # First, normalize to the ["arch:amd/zen2", "accel:nvidia/cc90"] format + normalized_filters = [] + if target_args: + for filter_item in target_args: + if '=' in filter_item: + component, pattern = filter_item.split('=', 1) + normalized_filters.append(f"{component}:{pattern}") + + # Add the other filter args to the normalized filters. The other_filter_args are already colon-separated + # so no special parsing needed there + log(f"Extracted filter arguments related to hardware target: {normalized_filters}") + log(f"Other extracted filter arguments: {other_filter_args}") + normalized_filters += other_filter_args + + # Finally, change into a space-separated string, as expected by EESSIBotActionFilter + # e.g "arch:amd/zen2 accel:nvidia/cc90 repo:my.repo.io" + if normalized_filters: + arg_str = " ".join(normalized_filters) + try: + log(f"Passing the following arguments to the EESSIBotActionFilter: {arg_str}") + self.action_filters = EESSIBotActionFilter(arg_str) + except EESSIBotActionFilterError as err: + log(f"ERROR: EESSIBotActionFilterError - {err.args}") + self.action_filters = None + raise EESSIBotCommandError("invalid action filter") + except Exception as err: + log(f"Unexpected err={err}, type(err)={type(err)}") + raise + # No arguments were passed to the command self.command else: self.action_filters = EESSIBotActionFilter("") diff --git a/tools/filter.py b/tools/filter.py index 2c4146bb..21984b1b 100644 --- a/tools/filter.py +++ b/tools/filter.py @@ -304,16 +304,13 @@ def check_filters(self, context): check = False break - # If the context declares an accelerator, but the build command did not (i.e. no action filter is defined - # for the accelerator component) then the check should only return True if "None" was the accelerator defined - # in the context. 
This ensures that no CPU-only builds are done on accelerated partitions, unless these - # partitions are explicitly configured with "None" as _one_ of the valid accelerators in their `accel:` list - # in app.cfg + # If the context declares an accelerator, enforce that a filter is defined for this component as well + # I.e. this enforces that a context with accelerator will only be used if an accelerator is explicitely + # requested in the build command, thus preventing CPU-only builds on GPU nodes (unless explicitely intended) if ( FILTER_COMPONENT_ACCEL in context and not any(af.component == FILTER_COMPONENT_ACCEL for af in self.action_filters) ): - if not context[FILTER_COMPONENT_ACCEL] == "None": - check = False + check = False return check From 3f00e5196dbecc726e3a9955c01a390ea44328e7 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 10 Jul 2025 12:51:35 +0200 Subject: [PATCH 054/132] Make sure that the for: arguments are used as build parameters --- eessi_bot_event_handler.py | 2 +- tasks/build.py | 21 ++++++++++++--------- tools/commands.py | 11 ++++++++--- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/eessi_bot_event_handler.py b/eessi_bot_event_handler.py index dfefe5df..320d0ad8 100644 --- a/eessi_bot_event_handler.py +++ b/eessi_bot_event_handler.py @@ -531,7 +531,7 @@ def handle_bot_command_build(self, event_info, bot_command): build_msg = '' if check_build_permission(pr, event_info): # use filter from command - submitted_jobs = submit_build_jobs(pr, event_info, bot_command.action_filters) + submitted_jobs = submit_build_jobs(pr, event_info, bot_command.action_filters, bot_command.build_params) if submitted_jobs is None or len(submitted_jobs) == 0: build_msg = "\n - no jobs were submitted" else: diff --git a/tasks/build.py b/tasks/build.py index 3031901e..797e95d8 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -33,7 +33,7 @@ from tools import config, cvmfs_repository, job_metadata, pr_comments, run_cmd import 
tools.filter as tools_filter from tools.pr_comments import ChatLevels, create_comment - +from tools.build_params import BUILD_PARAM_ARCH, BUILD_PARAM_ACCEL # defaults (used if not specified via, eg, 'app.cfg') DEFAULT_JOB_TIME_LIMIT = "24:00:00" @@ -551,7 +551,7 @@ def prepare_export_vars_file(job_dir, exportvars): log(f"{fn}(): created exported variables file {export_vars_path}") -def prepare_jobs(pr, cfg, event_info, action_filter): +def prepare_jobs(pr, cfg, event_info, action_filter, build_params): """ Prepare all jobs whose context matches the given filter. Preparation includes creating a working directory for a job, downloading the pull request into @@ -562,6 +562,7 @@ def prepare_jobs(pr, cfg, event_info, action_filter): cfg (ConfigParser): instance holding full configuration (typically read from 'app.cfg') event_info (dict): event received by event_handler action_filter (EESSIBotActionFilter): used to filter which jobs shall be prepared + build_params (EESSIBotBuildParams): dict that contains the build parameters for the job Returns: (list): list of the prepared jobs @@ -705,15 +706,16 @@ def prepare_jobs(pr, cfg, event_info, action_filter): comment_download_pr(base_repo_name, pr, download_pr_exit_code, download_pr_error, error_stage) # prepare job configuration file 'job.cfg' in directory /cfg msg = f"{fn}(): virtual partition = '{virtual_partition_name}' => " - msg += f"configured cpu_target = '{partition_info['cpu_subdir']}' , " + msg += f"requested cpu_target = '{partition_info['cpu_subdir']}, " + msg += f"build cpu_target = '{build_params[BUILD_PARAM_ARCH]}', " msg += f"configured os = '{partition_info['os']}', " if 'accel' in partition_info: - msg += f"configured accelerator(s) = '{partition_info['accel']}, " - msg += f"requested accelerator = '{accelerator}'" + msg += f"requested accelerator(s) = '{partition_info['accel']}, " + msg += f"build accelerator = '{build_params[BUILD_PARAM_ACCEL]}'" log(msg) - prepare_job_cfg(job_dir, build_env_cfg, 
repocfg, repo_id, partition_info['cpu_subdir'], - partition_info['os'], accelerator) + prepare_job_cfg(job_dir, build_env_cfg, repocfg, repo_id, build_params[BUILD_PARAM_ARCH], + partition_info['os'], build_params[BUILD_PARAM_ACCEL]) if exportvars: prepare_export_vars_file(job_dir, exportvars) @@ -1032,7 +1034,7 @@ def create_pr_comment(job, job_id, app_name, pr, symlink): return None -def submit_build_jobs(pr, event_info, action_filter): +def submit_build_jobs(pr, event_info, action_filter, build_params): """ Create build jobs for a pull request by preparing jobs which match the given filters, submitting them, adding comments to the pull request on GitHub and @@ -1042,6 +1044,7 @@ def submit_build_jobs(pr, event_info, action_filter): pr (github.PullRequest.PullRequest): instance representing the pull request event_info (dict): event received by event_handler action_filter (EESSIBotActionFilter): used to filter which jobs shall be prepared + build_params (EESSIBotBuildParams): dict that contains the build parameters for the job Returns: (dict): dictionary mapping a job id to a github.IssueComment.IssueComment @@ -1054,7 +1057,7 @@ def submit_build_jobs(pr, event_info, action_filter): app_name = cfg[config.SECTION_GITHUB].get(config.GITHUB_SETTING_APP_NAME) # setup job directories (one per element in product of architecture x repositories) - jobs = prepare_jobs(pr, cfg, event_info, action_filter) + jobs = prepare_jobs(pr, cfg, event_info, action_filter, build_params) # return if there are no jobs to be submitted if not jobs: diff --git a/tools/commands.py b/tools/commands.py index 18f3f228..360279f5 100644 --- a/tools/commands.py +++ b/tools/commands.py @@ -18,7 +18,7 @@ # Local application imports (anything from EESSI/eessi-bot-software-layer) from tools.filter import EESSIBotActionFilter, EESSIBotActionFilterError - +from tools.build_params import EESSIBotBuildParams def contains_any_bot_command(body): """ @@ -89,7 +89,6 @@ def __init__(self, cmd_str): # TODO 
always init self.action_filters with empty EESSIBotActionFilter? if len(cmd_as_list) > 1: - # Extract arguments for the action filters # By default, everything that follows the 'on:' argument (until the next space) is # considered part of the argument list for the action filters @@ -102,7 +101,13 @@ def __init__(self, cmd_str): # Extract everything after 'on:' and split by comma filter_content = arg[3:] # Remove 'on:' prefix target_args.extend(filter_content.split(',')) - elif not arg.startswith('off:'): + elif arg.startswith('for:'): + # Anything listed as 'for:' is build parameters + build_params = arg[4:] + # EESSIBotBuildParams is essentially a dict, but parses the input argument + # according to the expected argument format for 'for:' + self.build_params = EESSIBotBuildParams(build_params) + else: # Anything that is not 'on:' or 'for:' should just be passed on as normal # No further parsing of the value is needed other_filter_args.extend([arg]) From 2625b304a62a718f0f807cdd8a775aac43190205 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 10 Jul 2025 14:10:47 +0200 Subject: [PATCH 055/132] Change path for job dir so that it represents the 'for' architectures --- tasks/build.py | 26 +++++++++++++------------- tools/job_metadata.py | 2 ++ 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/tasks/build.py b/tasks/build.py index 797e95d8..ce563a2f 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -615,27 +615,25 @@ def prepare_jobs(pr, cfg, event_info, action_filter, build_params): # Note that 'accel' is a list, to easily allow a single CPU partition to be used for cross compilation # for a lot of accelerator targets # arch_target_map = { - # 'virtual_partition_name': { + # 'node_type_name': { # 'os': 'linux', # 'cpu_subdir': 'x86_64/amd/zen4', # 'accel': ['nvidia/cc90'], # 'slurm_params': '-p genoa ', # 'repo_targets': ["eessi.io-2023.06-compat","eessi.io-2023.06-software"], # }, - # 'virtual_partition_name2': { + # 'node_type_name2': { # 
... etc - for virtual_partition_name, partition_info in arch_map.items(): - log(f"{fn}(): virtual_partition_name is {virtual_partition_name}, partition_info is {partition_info}") + for node_type_name, partition_info in arch_map.items(): + log(f"{fn}(): node_type_name is {node_type_name}, partition_info is {partition_info}") # Unpack for convenience - arch_dir = partition_info['cpu_subdir'] - if 'accel' in partition_info and accelerator is not None: - # Use the accelerator as defined by the action_filter. We check if this is valid for the current - # virtual partition later - arch_dir += f"/{accelerator}" + arch_dir = build_params[BUILD_PARAM_ARCH] + if BUILD_PARAM_ACCEL in build_params: + arch_dir += f"/{build_params[BUILD_PARAM_ACCEL]}" arch_dir.replace('/', '_') # check if repo_targets is defined for this virtual partition if 'repo_targets' not in partition_info: - log(f"{fn}(): skipping arch {virtual_partition_name}, " + log(f"{fn}(): skipping arch {node_type_name}, " "because no repo_targets were defined for this (virtual) partition") continue for repo_id in partition_info['repo_targets']: @@ -705,7 +703,7 @@ def prepare_jobs(pr, cfg, event_info, action_filter, build_params): ) comment_download_pr(base_repo_name, pr, download_pr_exit_code, download_pr_error, error_stage) # prepare job configuration file 'job.cfg' in directory /cfg - msg = f"{fn}(): virtual partition = '{virtual_partition_name}' => " + msg = f"{fn}(): node type = '{node_type_name}' => " msg += f"requested cpu_target = '{partition_info['cpu_subdir']}, " msg += f"build cpu_target = '{build_params[BUILD_PARAM_ARCH]}', " msg += f"configured os = '{partition_info['os']}', " @@ -715,7 +713,7 @@ def prepare_jobs(pr, cfg, event_info, action_filter, build_params): log(msg) prepare_job_cfg(job_dir, build_env_cfg, repocfg, repo_id, build_params[BUILD_PARAM_ARCH], - partition_info['os'], build_params[BUILD_PARAM_ACCEL]) + partition_info['os'], build_params[BUILD_PARAM_ACCEL], node_type_name) if exportvars: 
prepare_export_vars_file(job_dir, exportvars) @@ -732,7 +730,7 @@ def prepare_jobs(pr, cfg, event_info, action_filter, build_params): return jobs -def prepare_job_cfg(job_dir, build_env_cfg, repos_cfg, repo_id, software_subdir, os_type, accelerator): +def prepare_job_cfg(job_dir, build_env_cfg, repos_cfg, repo_id, software_subdir, os_type, accelerator, node_type_name): """ Set up job configuration file 'job.cfg' in directory /cfg @@ -744,6 +742,7 @@ def prepare_job_cfg(job_dir, build_env_cfg, repos_cfg, repo_id, software_subdir, software_subdir (string): software subdirectory to build for (e.g., 'x86_64/generic') os_type (string): type of the os (e.g., 'linux') accelerator (string): defines accelerator to build for (e.g., 'nvidia/cc80') + node_type_name (string): the node type name, as configured in app.cfg Returns: None (implicitly) @@ -814,6 +813,7 @@ def prepare_job_cfg(job_dir, build_env_cfg, repos_cfg, repo_id, software_subdir, job_cfg_arch_section = job_metadata.JOB_CFG_ARCHITECTURE_SECTION job_cfg[job_cfg_arch_section] = {} + job_cfg[job_cfg_arch_section][job_metadata.JOB_CFG_ARCHITECTURE_NODE_TYPE] = node_type_name job_cfg[job_cfg_arch_section][job_metadata.JOB_CFG_ARCHITECTURE_SOFTWARE_SUBDIR] = software_subdir job_cfg[job_cfg_arch_section][job_metadata.JOB_CFG_ARCHITECTURE_OS_TYPE] = os_type job_cfg[job_cfg_arch_section][job_metadata.JOB_CFG_ARCHITECTURE_ACCELERATOR] = accelerator if accelerator else '' diff --git a/tools/job_metadata.py b/tools/job_metadata.py index 7b7b8d0a..f5ee21ce 100644 --- a/tools/job_metadata.py +++ b/tools/job_metadata.py @@ -34,7 +34,9 @@ JOB_CFG_UPLOAD_STEP = "upload_step" # JWD/cfg/$JOB_CFG_FILENAME + JOB_CFG_ARCHITECTURE_SECTION = "architecture" +JOB_CFG_ARCHITECTURE_NODE_TYPE = "node_type" JOB_CFG_ARCHITECTURE_OS_TYPE = "os_type" JOB_CFG_ARCHITECTURE_SOFTWARE_SUBDIR = "software_subdir" JOB_CFG_ARCHITECTURE_ACCELERATOR = "accelerator" From ffa2303cfc777151a2bef1de83e21198736a06bf Mon Sep 17 00:00:00 2001 From: Caspar van 
Leeuwen Date: Mon, 14 Jul 2025 12:38:18 +0200 Subject: [PATCH 056/132] More extensive reporting by the bot on what to build for/on --- app.cfg.example | 2 +- tasks/build.py | 48 ++++++++++++++++++++++++++++++++++-------------- 2 files changed, 35 insertions(+), 15 deletions(-) diff --git a/app.cfg.example b/app.cfg.example index bf7911b5..453cdb64 100644 --- a/app.cfg.example +++ b/app.cfg.example @@ -419,7 +419,7 @@ scontrol_command = /usr/bin/scontrol # awaits_release = job id `{job_id}` awaits release by job manager awaits_release_delayed_begin_msg = job id `{job_id}` will be eligible to start in about {delay_seconds} seconds awaits_release_hold_release_msg = job id `{job_id}` awaits release by job manager -initial_comment = New job on instance `{app_name}` for CPU micro-architecture `{arch_name}`{accelerator_spec} for repository `{repo_id}` in job dir `{symlink}` +initial_comment = New job on instance `{app_name}` for repository `{repo_id}`\nBuilding on: `{on_arch}`{on_accelerator}\nBuilding for: `{for_arch}`{for_accelerator}\nJob dir: `{symlink}` with_accelerator =  and accelerator `{accelerator}` diff --git a/tasks/build.py b/tasks/build.py index ce563a2f..006dc17a 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -20,6 +20,7 @@ # Standard library imports from collections import namedtuple import configparser +import codecs from datetime import datetime, timezone import json import os @@ -952,7 +953,7 @@ def submit_job(job, cfg): return job_id, symlink -def create_pr_comment(job, job_id, app_name, pr, symlink): +def create_pr_comment(job, job_id, app_name, pr, symlink, build_params): """ Create a comment to the pull request for a newly submitted job @@ -962,6 +963,7 @@ def create_pr_comment(job, job_id, app_name, pr, symlink): app_name (string): name of the app pr (github.PullRequest.PullRequest): instance representing the pull request symlink (string): symlink from main pr_ dir to job dir + build_params (EESSIBotBuildParams): dict that contains the build 
parameters for the job Returns: github.IssueComment.IssueComment instance or None (note, github refers to @@ -969,16 +971,24 @@ def create_pr_comment(job, job_id, app_name, pr, symlink): """ fn = sys._getframe().f_code.co_name - # obtain arch from job.arch_target which has the format OS/ARCH - arch_name = '-'.join(job.arch_target.split('/')[1:]) + # Obtain the architecture on which we are building from job.arch_target, which has the format OS/ARCH + on_arch = '-'.join(job.arch_target.split('/')[1:]) + + # Obtain the architecture to build for + for_arch = build_params[BUILD_PARAM_ARCH] submitted_job_comments_cfg = config.read_config()[config.SECTION_SUBMITTED_JOB_COMMENTS] - # set string for accelerator if job.accelerator is defined/set (e.g., not None) - accelerator_spec_str = '' + # Set string for accelerator to build on + accelerator_spec = f"{submitted_job_comments_cfg[config.SUBMITTED_JOB_COMMENTS_SETTING_WITH_ACCELERATOR]}" + on_accelerator_str = '' if job.accelerator: - accelerator_spec = f"{submitted_job_comments_cfg[config.SUBMITTED_JOB_COMMENTS_SETTING_WITH_ACCELERATOR]}" - accelerator_spec_str = accelerator_spec.format(accelerator=job.accelerator) + on_accelerator_str = accelerator_spec.format(accelerator=job.accelerator) + + # Set string for accelerator to build for + for_accelerator_str = '' + if BUILD_PARAM_ACCEL in build_params: + for_accelerator_str = accelerator_spec.format(accelerator=build_params[BUILD_PARAM_ACCEL]) # get current date and time dt = datetime.now(timezone.utc) @@ -986,6 +996,9 @@ def create_pr_comment(job, job_id, app_name, pr, symlink): # construct initial job comment buildenv = config.read_config()[config.SECTION_BUILDENV] job_handover_protocol = buildenv.get(config.BUILDENV_SETTING_JOB_HANDOVER_PROTOCOL) + raw_comment_template = submitted_job_comments_cfg[config.SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT] + # Support using escape chars in the INITIAL_COMMENT, that means \n should be interpreted as unicode + 
initial_comment_template = codecs.decode(raw_comment_template, 'unicode_escape') if job_handover_protocol == config.JOB_HANDOVER_PROTOCOL_DELAYED_BEGIN: release_msg_string = config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_DELAYED_BEGIN_MSG release_comment_template = submitted_job_comments_cfg[release_msg_string] @@ -994,34 +1007,41 @@ def create_pr_comment(job, job_id, app_name, pr, symlink): poll_interval = int(job_manager_cfg.get(config.JOB_MANAGER_SETTING_POLL_INTERVAL)) delay_factor = float(buildenv.get(config.BUILDENV_SETTING_JOB_DELAY_BEGIN_FACTOR, 2)) eligible_in_seconds = int(poll_interval * delay_factor) - job_comment = (f"{submitted_job_comments_cfg[config.SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT]}" + job_comment = (f"{initial_comment_template}" f"\n|date|job status|comment|\n" f"|----------|----------|------------------------|\n" f"|{dt.strftime('%b %d %X %Z %Y')}|" f"submitted|" f"{release_comment_template}|").format( app_name=app_name, - arch_name=arch_name, + on_arch=on_arch, + for_arch=for_arch, symlink=symlink, repo_id=job.repo_id, job_id=job_id, delay_seconds=eligible_in_seconds, - accelerator_spec=accelerator_spec_str) + on_accelerator=on_accelerator_str, + for_accelerator=for_accelerator_str) else: release_msg_string = config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_HOLD_RELEASE_MSG release_comment_template = submitted_job_comments_cfg[release_msg_string] - job_comment = (f"{submitted_job_comments_cfg[config.SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT]}" + job_comment = (f"{initial_comment_template}" f"\n|date|job status|comment|\n" f"|----------|----------|------------------------|\n" f"|{dt.strftime('%b %d %X %Z %Y')}|" f"submitted|" f"{release_comment_template}|").format( app_name=app_name, - arch_name=arch_name, + on_arch=on_arch, + for_arch=for_arch, symlink=symlink, repo_id=job.repo_id, job_id=job_id, - accelerator_spec=accelerator_spec_str) + on_accelerator=on_accelerator_str, + for_accelerator=for_accelerator_str) + + # 
Make sure newline characters are taken as new line characters, not as literal \n + job_comment='\n'.join(job_comment.split('\n')) # create comment to pull request repo_name = pr.base.repo.full_name @@ -1072,7 +1092,7 @@ def submit_build_jobs(pr, event_info, action_filter, build_params): job_id, symlink = submit_job(job, cfg) # create pull request comment to report about the submitted job - pr_comment = create_pr_comment(job, job_id, app_name, pr, symlink) + pr_comment = create_pr_comment(job, job_id, app_name, pr, symlink, build_params) job_id_to_comment_map[job_id] = pr_comment pr_comment = pr_comments.PRComment(pr.base.repo.full_name, pr.number, pr_comment.id) From c98e7e89b5e816f17d51db2df7485d5a19633ecd Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 14 Jul 2025 12:45:00 +0200 Subject: [PATCH 057/132] This is no longer needed, as it is done with the codecs (decode) now --- tasks/build.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tasks/build.py b/tasks/build.py index 006dc17a..08bafe21 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -1040,9 +1040,6 @@ def create_pr_comment(job, job_id, app_name, pr, symlink, build_params): on_accelerator=on_accelerator_str, for_accelerator=for_accelerator_str) - # Make sure newline characters are taken as new line characters, not as literal \n - job_comment='\n'.join(job_comment.split('\n')) - # create comment to pull request repo_name = pr.base.repo.full_name issue_comment = create_comment(repo_name, pr.number, job_comment, ChatLevels.MINIMAL) From 5904f11c4644c39269d9fcf139b78dc1839b1354 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 14 Jul 2025 17:02:53 +0200 Subject: [PATCH 058/132] Print real arch_target_map keys when doing show_config --- app.cfg.example | 2 +- eessi_bot_event_handler.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/app.cfg.example b/app.cfg.example index 453cdb64..9ee912eb 100644 --- a/app.cfg.example +++ b/app.cfg.example @@ -308,7 +308,7 
@@ signing = # defines for which architectures the bot will build and what job submission # parameters shall be used to allocate a compute node with the correct # The keys of the arch_target_map are virtual partition names. They don't have any meaning in the bot code, -# and can thus be chosen as desired. +# and can thus be chosen as desired. They are publicly visible though if a bot:show_config command is issued. # Note that you are responsible that ANY bot:build command ONLY matches a single virtual partition! # If multiple partitions match the same bot:build command, a failure will be triggered in the job dir preparation arch_target_map = { diff --git a/eessi_bot_event_handler.py b/eessi_bot_event_handler.py index 320d0ad8..ce35d362 100644 --- a/eessi_bot_event_handler.py +++ b/eessi_bot_event_handler.py @@ -413,10 +413,10 @@ def handle_pull_request_opened_event(self, event_info, pr, req_chatlevel=ChatLev arch_map = get_architecture_targets(self.cfg) comment = f"Instance `{app_name}` is configured to build for:" - for partition_num, arch in enumerate(arch_map): + for arch in arch_map: # Do not print virtual partition names, a bot admin may not want to share those # Instead, just number them - comment += f"\n- Partition {partition_num+1}:" + comment += f"\n- Partition {arch}:" current_partition = arch_map[arch] if "os" in current_partition: comment += f"\n - OS: {current_partition['os']}" From 7c869f00a869150ec8d2ad0cfa1f1ea1ed1f8390 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 14 Jul 2025 17:20:18 +0200 Subject: [PATCH 059/132] Reduce number of possible accelerators per node type to one. 
Nodes with multiple types of accelerators in a single node can be configured as two separate node types if needed --- tasks/build.py | 37 ++++++++++--------------------------- 1 file changed, 10 insertions(+), 27 deletions(-) diff --git a/tasks/build.py b/tasks/build.py index 08bafe21..8e30cfb4 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -657,26 +657,14 @@ def prepare_jobs(pr, cfg, event_info, action_filter, build_params): } # Optionally add accelerator to the context if 'accel' in partition_info: - match = False - # Create a context for each accelerator defined in app.cfg, then - # check if _any_ of them is valid (one is enough to continue) - for accel in partition_info['accel']: - context['accelerator'] = accel - log(f"{fn}(): context is '{json.dumps(context, indent=4)}'") - if not action_filter.check_filters(context): - log(f"{fn}(): context does NOT satisfy filter(s), skipping") - continue - # check = check | action_filter.check_filters(context) - else: - log(f"{fn}(): context DOES satisfy filter(s), going on with job") - match = True - # Break as soon as we have found a valid context, it means the build args are valid - # for at least one of the accelerators in this virtual partition, that's enough - break - # If we get to this point, and none of the contexts matched the filter, we should continue to the - # next iteration of the partition_info['repo_targets'] loop - if not match: + context['accelerator'] = accel + log(f"{fn}(): context is '{json.dumps(context, indent=4)}'") + if not action_filter.check_filters(context): + log(f"{fn}(): context does NOT satisfy filter(s), skipping") continue + # check = check | action_filter.check_filters(context) + else: + log(f"{fn}(): context DOES satisfy filter(s), going on with job") else: log(f"{fn}(): context is '{json.dumps(context, indent=4)}'") if not action_filter.check_filters(context): @@ -686,13 +674,7 @@ def prepare_jobs(pr, cfg, event_info, action_filter, build_params): log(f"{fn}(): context DOES 
satisfy filter(s), going on with job") # we reached this point when the filter matched (otherwise we # 'continue' with the next repository) - # for each match of the filter we create a specific job directory - # however, matching CPU architectures works differently to handling - # accelerators; multiple CPU architectures defined in arch_target_map - # can match the (CPU) architecture component of a filter; in - # contrast, the value of the accelerator filter is just passed down - # to scripts in bot/ directory of the pull request (see function - # prepare_job_cfg and creation of Job tuple below) + # We create a specific job directory for the architecture that is going to be build 'for:' job_dir = os.path.join(run_dir, arch_dir, repo_id) os.makedirs(job_dir, exist_ok=True) log(f"{fn}(): job_dir '{job_dir}'") @@ -710,7 +692,8 @@ def prepare_jobs(pr, cfg, event_info, action_filter, build_params): msg += f"configured os = '{partition_info['os']}', " if 'accel' in partition_info: msg += f"requested accelerator(s) = '{partition_info['accel']}, " - msg += f"build accelerator = '{build_params[BUILD_PARAM_ACCEL]}'" + if BUILD_PARAM_ACCEL in build_params: + msg += f"build accelerator = '{build_params[BUILD_PARAM_ACCEL]}'" log(msg) prepare_job_cfg(job_dir, build_env_cfg, repocfg, repo_id, build_params[BUILD_PARAM_ARCH], From 179ab0ae13415042974d7822e88b4fa5da27c288 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 14 Jul 2025 17:24:22 +0200 Subject: [PATCH 060/132] Fix app.cfg for the fact that partition_info['accel'] is now a string, not a list --- app.cfg.example | 6 +++--- tasks/build.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/app.cfg.example b/app.cfg.example index 9ee912eb..10f97d6b 100644 --- a/app.cfg.example +++ b/app.cfg.example @@ -339,7 +339,7 @@ arch_target_map = { "cpu_zen4": { "os": "linux", "cpu_subdir": "x86_64/amd/zen4", - "accel": ["None"], + "accel": "None", "slurm_params": "-p genoa --nodes 1 --ntasks-per-node 24 
--cpus-per-task 1", "repo_targets": ["eessi.io-2023.06-compat","eessi.io-2023.06-software"] }, @@ -352,7 +352,7 @@ arch_target_map = { "gpu_a100": { "os": "linux", "cpu_subdir": "x86_64/intel/icelake", - "accel": ["nvidia/cc80"], + "accel": "nvidia/cc80", "slurm_params": "-p gpu_a100 --nodes 1 --tasks-per-node 18 --cpus-per-task 1 --gpus-per-node 1", "repo_targets": ["eessi.io-2023.06-compat","eessi.io-2023.06-software"] }, @@ -372,7 +372,7 @@ arch_target_map = { "gpu_h100": { "os": "linux", "cpu_subdir": "x86_64/amd/zen4", - "accel": ["nvidia/cc90"], + "accel": "nvidia/cc90", "slurm_params": "-p gpu_h100 --nodes 1 --tasks-per-node 16 --cpus-per-task 1 --gpus-per-node 1", "repo_targets": ["eessi.io-2023.06-compat","eessi.io-2023.06-software"] }} diff --git a/tasks/build.py b/tasks/build.py index 8e30cfb4..7e41ef44 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -657,7 +657,7 @@ def prepare_jobs(pr, cfg, event_info, action_filter, build_params): } # Optionally add accelerator to the context if 'accel' in partition_info: - context['accelerator'] = accel + context['accelerator'] = partition_info['accel'] log(f"{fn}(): context is '{json.dumps(context, indent=4)}'") if not action_filter.check_filters(context): log(f"{fn}(): context does NOT satisfy filter(s), skipping") From a9a25850b2640eda1351ede973426d87e477af9c Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 14 Jul 2025 17:38:49 +0200 Subject: [PATCH 061/132] Make sure that we don't access a dict item that doesn't exist --- tasks/build.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tasks/build.py b/tasks/build.py index 7e41ef44..e69aea73 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -631,6 +631,9 @@ def prepare_jobs(pr, cfg, event_info, action_filter, build_params): arch_dir = build_params[BUILD_PARAM_ARCH] if BUILD_PARAM_ACCEL in build_params: arch_dir += f"/{build_params[BUILD_PARAM_ACCEL]}" + build_for_accel = build_params[BUILD_PARAM_ACCEL] + else: + 
build_for_accel = '' arch_dir.replace('/', '_') # check if repo_targets is defined for this virtual partition if 'repo_targets' not in partition_info: @@ -692,12 +695,11 @@ def prepare_jobs(pr, cfg, event_info, action_filter, build_params): msg += f"configured os = '{partition_info['os']}', " if 'accel' in partition_info: msg += f"requested accelerator(s) = '{partition_info['accel']}, " - if BUILD_PARAM_ACCEL in build_params: - msg += f"build accelerator = '{build_params[BUILD_PARAM_ACCEL]}'" + msg += f"build accelerator = '{build_for_accel}'" log(msg) prepare_job_cfg(job_dir, build_env_cfg, repocfg, repo_id, build_params[BUILD_PARAM_ARCH], - partition_info['os'], build_params[BUILD_PARAM_ACCEL], node_type_name) + partition_info['os'], build_for_accel, node_type_name) if exportvars: prepare_export_vars_file(job_dir, exportvars) From 51a9c740833746a24179e7e186e0107234d2c06d Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 14 Jul 2025 18:01:36 +0200 Subject: [PATCH 062/132] Make sure a context match fails if the context doesn't provide e.g. an accelerator, but such an accelerator is requested --- tools/filter.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/filter.py b/tools/filter.py index 21984b1b..ddc58352 100644 --- a/tools/filter.py +++ b/tools/filter.py @@ -303,6 +303,10 @@ def check_filters(self, context): else: check = False break + # Action filter wasn't found in the context, we won't allow this + else: + check = False + break # If the context declares an accelerator, enforce that a filter is defined for this component as well # I.e. 
this enforces that a context with accelerator will only be used if an accelerator is explicitely From b12c9114479dc81bd6baabfa9d13749ccdd672dc Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 15 Jul 2025 17:38:43 +0200 Subject: [PATCH 063/132] Make old config items invalid, rename to node_type and node_type_map, make sure we can do bot:status for the new formatted printing, etc --- app.cfg.example | 70 +++++--------- eessi_bot_event_handler.py | 17 ++-- eessi_bot_job_manager.py | 3 +- tasks/build.py | 181 ++++++++++++++++++++++++++++++------- tools/config.py | 43 ++++++++- 5 files changed, 222 insertions(+), 92 deletions(-) diff --git a/app.cfg.example b/app.cfg.example index 10f97d6b..7da0147d 100644 --- a/app.cfg.example +++ b/app.cfg.example @@ -305,37 +305,29 @@ signing = [architecturetargets] -# defines for which architectures the bot will build and what job submission -# parameters shall be used to allocate a compute node with the correct -# The keys of the arch_target_map are virtual partition names. They don't have any meaning in the bot code, -# and can thus be chosen as desired. They are publicly visible though if a bot:show_config command is issued. -# Note that you are responsible that ANY bot:build command ONLY matches a single virtual partition! -# If multiple partitions match the same bot:build command, a failure will be triggered in the job dir preparation -arch_target_map = { - # This is a CPU-based partition. We do not specify an "accel" property explicitly. In this case, invoking the bot - # with ANY accelerator command will trigger a build on this partition, as long as the CPU type matches. E.g. - # bot: build instance:xyz repo:eessi.io-2023.06-software arch:zen2 accel:nvidia/cc90 - # will cause the event_filter to mark this virtual partition as a valid match, and will use it to start building - # for zen2 + nvidia/cc90.
Thus, by not specifying an "accel" property, this partition may be used for - # cross-compilation for any accelerator. +# arch_target_map has been replaced by node_type_map +# arch_target_map = { +# } + +# Each entry in node_type_map describes a node type: os, what CPU architecture it has (cpu_subdir), the Slurm parameters +# that need to be passed to submit to it, which repository targets (repo_targets) can be built for on this node type +# and (optionally) which accelerator ('accel') it has +# All are strings, except repo_targets, which is a list of strings +# Note that the Slurm parameters should typically be chosen such that a single type of node (with one specific type of +# CPU and one specific type of GPU) is allocated. +# Below is an example configuration for a system that contains 4 types of nodes: zen2 CPU nodes, zen4 CPU nodes, +# GPU nodes with an icelake CPU and A100 GPU, GPU nodes with a zen4 CPU and an H100 GPU +# The 'on:' argument to the bot build command determines which node type will be allocated for the build job +# E.g. 'bot:build on:arch=zen4,accel=nvidia/cc90 for:...' will match the gpu_h100 node type below +# If no 'on:' argument is passed to the build command, the 'for:' argument is used instead +# E.g. 'bot:build for:arch=icelake,accel=nvidia/cc80' will match the gpu_a100 node type below +node_type_map = { "cpu_zen2": { "os": "linux", "cpu_subdir": "x86_64/amd/zen2", "slurm_params": "-p rome --nodes 1 --ntasks-per-node 16 --cpus-per-task 1", "repo_targets": ["eessi.io-2023.06-compat","eessi.io-2023.06-software"] }, - # This is a CPU partition. We specify an explicit "accel": "None" property. Thus, this partition will only be - # used if the bot build command does NOT contain an accel argument, e.g. - # bot: build instance:xyz repo:eessi.io-2023.06-software arch:zen4 - # will cause the event_filter to mark this virtual partition as a valid match, and will use it to start building - # for zen4.
- # When invoking the bot with an accelerator command, such as - # bot: build instance:xyz repo:eessi.io-2023.06-software arch:zen4 accel:nvidia/cc90 - # the event_filter will NOT mark this virtual partition as a valid match. This is intentional, as this particular - # (example) cluster has a native zen4+cc90 partition (gpu_h100) and we want this command to trigger a native build - # on that partition, rather than cross-compiling on this cpu_zen4 partition. - # One could still allow cross-compilation for other accelerator architectures, e.g. cc70 and cc80 by defining - # "accel": ["nvidia/cc70", "nvidia/cc80"] "cpu_zen4": { "os": "linux", "cpu_subdir": "x86_64/amd/zen4", @@ -343,12 +335,6 @@ arch_target_map = { "slurm_params": "-p genoa --nodes 1 --ntasks-per-node 24 --cpus-per-task 1", "repo_targets": ["eessi.io-2023.06-compat","eessi.io-2023.06-software"] }, - # This is a GPU partition. We specify an explicit "accel" property. Thus, only if the bot build command - # specifies that explicit accelerator in combination with the relevant CPU type, - # bot: build instance:xyz repo:eessi.io-2023.06-software arch:icelake accel:nvidia/cc80 - # will a build be triggered on this partition - # If you want to use this partition also for CPU only builds, you can alter the "accel" property to - # "accel": ["None", "nvidia/cc80"] "gpu_a100": { "os": "linux", "cpu_subdir": "x86_64/intel/icelake", @@ -356,19 +342,6 @@ arch_target_map = { "slurm_params": "-p gpu_a100 --nodes 1 --tasks-per-node 18 --cpus-per-task 1 --gpus-per-node 1", "repo_targets": ["eessi.io-2023.06-compat","eessi.io-2023.06-software"] }, - # This is a GPU partition. We specify an explicit "accel" property. 
Thus, only if the bot build command - # specifies that explicit accelerator in combination with the relevant CPU type, - # bot: build instance:xyz repo:eessi.io-2023.06-software arch:zen4 accel:nvidia/cc90 - # will a build be triggered on this partition - # If you want to use this partition also for cross-compiling for cc70 and cc80 architectures, you can alter - # the "accel" property to - # "accel": ["nvidia/cc70", "nvidia/cc80", "nvidia/cc90"] - # Note that setting: - # "accel": ["None", "nvidia/cc90"] - # is invalid here, since it would lead to both the cpu_zen4 and the gpu_h100 partitions matching the build command - # bot: build instance:xyz repo:eessi.io-2023.06-software arch:zen4 accel:nvidia/cc90 - # This would cause the same job dir to be prepared twice, for different virtual partitions, which will lead - # to an error in the job preparation step "gpu_h100": { "os": "linux", "cpu_subdir": "x86_64/amd/zen4", @@ -415,12 +388,15 @@ scontrol_command = /usr/bin/scontrol # are removed, the output (in PR comments) will lack important # information. 
[submitted_job_comments] -# awaits_release is no longer used since bot release v0.7.0 -# awaits_release = job id `{job_id}` awaits release by job manager +awaits_release = job id `{job_id}` awaits release by job manager awaits_release_delayed_begin_msg = job id `{job_id}` will be eligible to start in about {delay_seconds} seconds awaits_release_hold_release_msg = job id `{job_id}` awaits release by job manager -initial_comment = New job on instance `{app_name}` for repository `{repo_id}`\nBuilding on: `{on_arch}`{on_accelerator}\nBuilding for: `{for_arch}`{for_accelerator}\nJob dir: `{symlink}` +new_job_instance_repo = New job on instance `{app_name}` for repository `{repo_id}` +build_on_arch = Building on: `{on_arch}`{on_accelerator} +build_for_arch = Building for: `{for_arch}`{for_accelerator} +jobdir = Job dir: `{symlink}` with_accelerator =  and accelerator `{accelerator}` +# initial_comment = New job on instance `{app_name}` for repository `{repo_id}`\nBuilding on: `{on_arch}`{on_accelerator}\nBuilding for: `{for_arch}`{for_accelerator}\nJob dir: `{symlink}` # no longer used [new_job_comments] diff --git a/eessi_bot_event_handler.py b/eessi_bot_event_handler.py index ce35d362..9bae8c48 100644 --- a/eessi_bot_event_handler.py +++ b/eessi_bot_event_handler.py @@ -29,7 +29,7 @@ # Local application imports (anything from EESSI/eessi-bot-software-layer) from connections import github -from tasks.build import check_build_permission, get_architecture_targets, request_bot_build_issue_comments, \ +from tasks.build import check_build_permission, get_node_types, request_bot_build_issue_comments, \ submit_build_jobs from tasks.deploy import deploy_built_artefacts, determine_job_dirs from tasks.clean_up import move_to_trash_bin @@ -43,7 +43,7 @@ REQUIRED_CONFIG = { config.SECTION_ARCHITECTURETARGETS: [ - config.ARCHITECTURETARGETS_SETTING_ARCH_TARGET_MAP], # required + config.NODE_TYPE_MAP], # required config.SECTION_BOT_CONTROL: [ # config.BOT_CONTROL_SETTING_CHATLEVEL, # 
optional config.BOT_CONTROL_SETTING_COMMAND_PERMISSION, # required @@ -106,7 +106,10 @@ config.SECTION_REPO_TARGETS: [ config.REPO_TARGETS_SETTING_REPOS_CFG_DIR], # required config.SECTION_SUBMITTED_JOB_COMMENTS: [ - config.SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT, # required + config.SUBMITTED_JOB_COMMENTS_SETTING_INSTANCE_REPO, # required + config.SUBMITTED_JOB_COMMENTS_SETTING_BUILD_ON_ARCH, # required + config.SUBMITTED_JOB_COMMENTS_SETTING_BUILD_FOR_ARCH, # required + config.SUBMITTED_JOB_COMMENTS_SETTING_JOBDIR, # required # config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE, # optional config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_DELAYED_BEGIN_MSG, # required config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_HOLD_RELEASE_MSG, # required @@ -410,14 +413,14 @@ def handle_pull_request_opened_event(self, event_info, pr, req_chatlevel=ChatLev app_name = self.cfg[config.SECTION_GITHUB][config.GITHUB_SETTING_APP_NAME] # TODO check if PR already has a comment with arch targets and # repositories - arch_map = get_architecture_targets(self.cfg) + node_map = get_node_types(self.cfg) comment = f"Instance `{app_name}` is configured to build for:" - for arch in arch_map: + for node in node_map: # Do not print virtual partition names, a bot admin may not want to share those # Instead, just number them comment += f"\n- Partition {arch}:" - current_partition = arch_map[arch] + current_partition = node_map[node] if "os" in current_partition: comment += f"\n - OS: {current_partition['os']}" if "cpu_subdir" in current_partition: @@ -691,7 +694,7 @@ def main(): opts = event_handler_parse() # config is read and checked for settings to raise an exception early when the event_handler starts. 
- if config.check_required_cfg_settings(REQUIRED_CONFIG): + if config.check_cfg_settings(REQUIRED_CONFIG): print("Configuration check: PASSED") else: print("Configuration check: FAILED") diff --git a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py index 4fcf9af3..fedfb0ba 100644 --- a/eessi_bot_job_manager.py +++ b/eessi_bot_job_manager.py @@ -73,7 +73,6 @@ config.RUNNING_JOB_COMMENTS_SETTING_RUNNING_JOB] # required } - class EESSIBotSoftwareLayerJobManager: """ Class for representing the job manager of the build-and-deploy bot. It @@ -623,7 +622,7 @@ def main(): # config is read and checked for settings to raise an exception early when # the job_manager runs - if config.check_required_cfg_settings(REQUIRED_CONFIG): + if config.check_cfg_settings(REQUIRED_CONFIG): print("Configuration check: PASSED") else: print("Configuration check: FAILED") diff --git a/tasks/build.py b/tasks/build.py index e69aea73..1b229d96 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -24,7 +24,9 @@ from datetime import datetime, timezone import json import os +import re import shutil +import string import sys # Third party imports (anything installed into the local Python environment) @@ -180,26 +182,47 @@ def get_build_env_cfg(cfg): return config_data -def get_architecture_targets(cfg): - """ - Obtain mappings of architecture targets to Slurm parameters +def get_node_types(cfg): + """Obtain mappings of node types to Slurm parameters Args: cfg (ConfigParser): ConfigParser instance holding full configuration (typically read from 'app.cfg') Returns: - (dict): dictionary mapping architecture targets (format - OS/SOFTWARE_SUBDIR) to architecture specific Slurm job submission - parameters + (dict): Dictionary mapping node types names (arbitrary text) node properties + such as the OS, CPU software subdir, supported repositories, accelerator (optionally) + as well as the slurm parameters to allocate such a type of node """ fn = sys._getframe().f_code.co_name - architecture_targets = 
cfg[config.SECTION_ARCHITECTURETARGETS] + node_types = cfg[config.SECTION_ARCHITECTURETARGETS] + + node_type_map = json.loads(node_types.get(config.NODE_TYPE_MAP)) + log(f"{fn}(): node type map '{json.dumps(node_type_map)}'") + return node_type_map - arch_target_map = json.loads(architecture_targets.get(config.ARCHITECTURETARGETS_SETTING_ARCH_TARGET_MAP)) - log(f"{fn}(): arch target map '{json.dumps(arch_target_map)}'") - return arch_target_map +# Replaced by get_node_types +# def get_architecture_targets(cfg): +# """ +# Obtain mappings of architecture targets to Slurm parameters +# +# Args: +# cfg (ConfigParser): ConfigParser instance holding full configuration +# (typically read from 'app.cfg') +# +# Returns: +# (dict): dictionary mapping architecture targets (format +# OS/SOFTWARE_SUBDIR) to architecture specific Slurm job submission +# parameters +# """ +# fn = sys._getframe().f_code.co_name +# +# architecture_targets = cfg[config.SECTION_ARCHITECTURETARGETS] +# +# arch_target_map = json.loads(architecture_targets.get(config.ARCHITECTURETARGETS_SETTING_ARCH_TARGET_MAP)) +# log(f"{fn}(): arch target map '{json.dumps(arch_target_map)}'") +# return arch_target_map def get_allowed_exportvars(cfg): @@ -572,7 +595,7 @@ def prepare_jobs(pr, cfg, event_info, action_filter, build_params): app_name = cfg[config.SECTION_GITHUB].get(config.GITHUB_SETTING_APP_NAME) build_env_cfg = get_build_env_cfg(cfg) - arch_map = get_architecture_targets(cfg) + node_map = get_node_types(cfg) repocfg = get_repo_cfg(cfg) allowed_exportvars = get_allowed_exportvars(cfg) @@ -625,7 +648,7 @@ def prepare_jobs(pr, cfg, event_info, action_filter, build_params): # }, # 'node_type_name2': { # ... 
etc - for node_type_name, partition_info in arch_map.items(): + for node_type_name, partition_info in node_map.items(): log(f"{fn}(): node_type_name is {node_type_name}, partition_info is {partition_info}") # Unpack for convenience arch_dir = build_params[BUILD_PARAM_ARCH] @@ -846,7 +869,7 @@ def submit_job(job, cfg): # instances run on the same system job_name = cfg[config.SECTION_BUILDENV].get(config.BUILDENV_SETTING_JOB_NAME) - # add a default time limit of 24h to the job submit command if no other time + # add a default time limit of 24h to the job submit comnand if no other time # limit is specified already all_opts_str = " ".join([build_env_cfg[config.BUILDENV_SETTING_SLURM_PARAMS], job.slurm_opts]) all_opts_list = all_opts_str.split(" ") @@ -981,9 +1004,15 @@ def create_pr_comment(job, job_id, app_name, pr, symlink, build_params): # construct initial job comment buildenv = config.read_config()[config.SECTION_BUILDENV] job_handover_protocol = buildenv.get(config.BUILDENV_SETTING_JOB_HANDOVER_PROTOCOL) - raw_comment_template = submitted_job_comments_cfg[config.SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT] + # NO LONGER NEEDED now that we have cut up the sentence into different config items + # raw_comment_template = submitted_job_comments_cfg[config.SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT] + new_job_instance_repo = submitted_job_comments_cfg[config.SUBMITTED_JOB_COMMENTS_SETTING_INSTANCE_REPO] + build_on_arch = submitted_job_comments_cfg[config.SUBMITTED_JOB_COMMENTS_SETTING_BUILD_ON_ARCH] + build_for_arch = submitted_job_comments_cfg[config.SUBMITTED_JOB_COMMENTS_SETTING_BUILD_FOR_ARCH] + jobdir = submitted_job_comments_cfg[config.SUBMITTED_JOB_COMMENTS_SETTING_JOBDIR] + # NO LONGER NEEDED now that we have cut up the sentence into different config items # Support using escape chars in the INITIAL_COMMENT, that means \n should be interpreted as unicode - initial_comment_template = codecs.decode(raw_comment_template, 'unicode_escape') + # 
initial_comment_template = codecs.decode(raw_comment_template, 'unicode_escape') if job_handover_protocol == config.JOB_HANDOVER_PROTOCOL_DELAYED_BEGIN: release_msg_string = config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_DELAYED_BEGIN_MSG release_comment_template = submitted_job_comments_cfg[release_msg_string] @@ -992,8 +1021,11 @@ def create_pr_comment(job, job_id, app_name, pr, symlink, build_params): poll_interval = int(job_manager_cfg.get(config.JOB_MANAGER_SETTING_POLL_INTERVAL)) delay_factor = float(buildenv.get(config.BUILDENV_SETTING_JOB_DELAY_BEGIN_FACTOR, 2)) eligible_in_seconds = int(poll_interval * delay_factor) - job_comment = (f"{initial_comment_template}" - f"\n|date|job status|comment|\n" + job_comment = (f"{new_job_instance_repo}\n" + f"{build_on_arch}\n" + f"{build_for_arch}\n" + f"{jobdir}\n" + f"|date|job status|comment|\n" f"|----------|----------|------------------------|\n" f"|{dt.strftime('%b %d %X %Z %Y')}|" f"submitted|" @@ -1010,8 +1042,11 @@ def create_pr_comment(job, job_id, app_name, pr, symlink, build_params): else: release_msg_string = config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_HOLD_RELEASE_MSG release_comment_template = submitted_job_comments_cfg[release_msg_string] - job_comment = (f"{initial_comment_template}" - f"\n|date|job status|comment|\n" + job_comment = (f"{new_job_instance_repo}\n" + f"{build_on_arch}\n" + f"{build_for_arch}\n" + f"{jobdir}\n" + f"|date|job status|comment|\n" f"|----------|----------|------------------------|\n" f"|{dt.strftime('%b %d %X %Z %Y')}|" f"submitted|" @@ -1125,11 +1160,57 @@ def check_build_permission(pr, event_info): return True +def template_to_regex(format_str, with_eol=True): + """ + Converts a formatting string into a regex that can extract all the formatted + parts of the string. If with_eol is True, it assumes the formatted string is followed by an end-of-line + character. 
This is a requirement if it has to successfully match a formatting string that ends with a formatting + field. + + Args: + format_str (string): a formatting string, with template placeholders. + with_eol (bool, optional): a boolean, indicating if the formatting string is expected to be followed by + an end of line character + + """ + + # string.Formatter.parse yields 4-tuples of literal text, field name, format spec, and conversion + # E.g. if format_str = "This is my {app} it is currently {status}." + # list(formatter.parse(format_str)) = [ + # ("This is my ", "app", "", None), + # (" it is currently ", "status", "", None), + # (".", None, None, None), + # ] + formatter = string.Formatter() + regex_parts = [] + + for literal_text, field_name, _, _ in formatter.parse(format_str): + # We use re.escape to escape any special characters in the literal_text, as we want to match those literally + regex_parts.append(re.escape(literal_text)) + if field_name is not None: + # Create a non-greedy, named capture group. Note that the pattern itself is an f-string, + # so we get the actual field name as the name of the capture group + # We match any character, but in a non-greedy way. Thus, as soon as it can match the next + # literal text section, it will - thus assuming that that's the end of the field + # We use .* to allow for empty fields (such as the optional accelerator fields) + regex_parts.append(f"(?P<{field_name}>.*?)") + + # Finally, make sure we append a $ to the regex. This is necessary because of our non-greedy matching + # strategy. Otherwise, a formatting string that ends with a formatting item would capture an empty string + # for that field, because it doesn't find anything to match after (and it is non-greedy).
With the $, it has + # something to match after the field, thus making sure it matches the whole field + # This does assume that + full_pattern = ''.join(regex_parts) + if with_eol: + full_pattern += "$" + return re.compile(full_pattern) + + def request_bot_build_issue_comments(repo_name, pr_number): """ Query the github API for the issue_comments in a pr. - Archs: + Args: repo_name (string): name of the repository (format USER_OR_ORGANISATION/REPOSITORY) pr_number (int): number og the pr @@ -1139,7 +1220,7 @@ def request_bot_build_issue_comments(repo_name, pr_number): """ fn = sys._getframe().f_code.co_name - status_table = {'arch': [], 'date': [], 'status': [], 'url': [], 'result': []} + status_table = {'on arch': [], 'for arch': [], 'for repo': [], 'date': [], 'status': [], 'url': [], 'result': []} cfg = config.read_config() # for loop because github has max 100 items per request. @@ -1154,19 +1235,55 @@ def request_bot_build_issue_comments(repo_name, pr_number): for comment in comments: # iterate through the comments to find the one where the status of the build was in submitted_job_comments_section = cfg[config.SECTION_SUBMITTED_JOB_COMMENTS] - initial_comment_fmt = submitted_job_comments_section[config.SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT] - if initial_comment_fmt[:20] in comment['body']: + instance_repo_fmt = submitted_job_comments_section[config.SUBMITTED_JOB_COMMENTS_SETTING_INSTANCE_REPO] + instance_repo_re = template_to_regex(instance_repo_fmt) + comment_body = comment['body'].split('\n') + print(f"Matching string {comment_body[0]} with re: {instance_repo_re}") + instance_repo_match = re.match(instance_repo_re, comment_body[0]) + # Check if this body starts with an initial comment from the bot (first item is always the instance + repo + # it is building for) + # Then, check that it has at least 4 lines so that we can safely index up to that number + if instance_repo_match and len(comment_body) >= 4: + print(f"Instance match: 
{instance_repo_match.groupdict()}") + on_arch_fmt = submitted_job_comments_section[config.SUBMITTED_JOB_COMMENTS_SETTING_BUILD_ON_ARCH] + on_arch_re = template_to_regex(on_arch_fmt) + print(f"Matching string {comment_body[1]} with re: {on_arch_re}") + on_arch_match = re.match(on_arch_re, comment_body[1]) + for_arch_fmt = submitted_job_comments_section[config.SUBMITTED_JOB_COMMENTS_SETTING_BUILD_FOR_ARCH] + for_arch_re = template_to_regex(for_arch_fmt) + print(f"Matching string {comment_body[2]} with re: {for_arch_re}") + for_arch_match = re.match(for_arch_re, comment_body[2]) + print(f"On arch match: {on_arch_match.groupdict()}") + print(f"For arch match: {for_arch_match.groupdict()}") + # Does everything match (it should, if we already had an instance_repo_match, but good to be sure) + if on_arch_match and for_arch_match: + instance_repo_dict = instance_repo_match.groupdict() + on_arch_dict = on_arch_match.groupdict() + for_arch_dict = for_arch_match.groupdict() + # Play it safe + # TODO: probably log something in the 'else' case + if 'on_arch' in on_arch_dict: + status_table['on arch'].append(on_arch_dict['on_arch']) + if 'for_arch' in for_arch_dict: + status_table['for arch'].append(for_arch_dict['for_arch']) + if 'repo_id' in instance_repo_dict: + status_table['for repo'].append(instance_repo_dict['repo_id']) + + # TODO: extract the building on, building for and repository name, and put those in the table + # NOTE: previously, we could use arch. We can't anymore, since we also want the 'for' architecture + # that is not available in this scope + # We should probably split the SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT into different components + # and create a regex out of those components? 
# get archictecture from comment['body'] - first_line = comment['body'].split('\n')[0] - arch_map = get_architecture_targets(cfg) - for arch in arch_map.keys(): - # drop the first element in arch (which names the OS type) and join the remaining items with '-' - target_arch = '-'.join(arch.split('/')[1:]) - if target_arch in first_line: - status_table['arch'].append(target_arch) - else: - log(f"{fn}(): target_arch '{target_arch}' not found in first line '{first_line}'") + node_map = get_node_types(cfg) + # for arch in node_map.keys(): + # # drop the first element in arch (which names the OS type) and join the remaining items with '-' + # target_arch = '-'.join(arch.split('/')[1:]) + # if target_arch in first_line: + # status_table['arch'].append(target_arch) + # else: + # log(f"{fn}(): target_arch '{target_arch}' not found in first line '{first_line}'") # get date, status, url and result from the markdown table comment_table = comment['body'][comment['body'].find('|'):comment['body'].rfind('|')+1] diff --git a/tools/config.py b/tools/config.py index d0ec6498..ddfe9e3f 100644 --- a/tools/config.py +++ b/tools/config.py @@ -30,7 +30,8 @@ # sectionname_SETTING_settingname for any setting with name settingname in # section sectionname SECTION_ARCHITECTURETARGETS = 'architecturetargets' -ARCHITECTURETARGETS_SETTING_ARCH_TARGET_MAP = 'arch_target_map' +ARCHITECTURETARGETS_SETTING_ARCH_TARGET_MAP = 'arch_target_map' # Obsolete, replaced by NODE_TYPE_MAP +NODE_TYPE_MAP = 'node_type_map' SECTION_BOT_CONTROL = 'bot_control' BOT_CONTROL_SETTING_COMMAND_PERMISSION = 'command_permission' @@ -120,6 +121,10 @@ SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE = 'awaits_release' SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_DELAYED_BEGIN_MSG = 'awaits_release_delayed_begin_msg' SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_HOLD_RELEASE_MSG = 'awaits_release_hold_release_msg' +SUBMITTED_JOB_COMMENTS_SETTING_INSTANCE_REPO = 'new_job_instance_repo' 
+SUBMITTED_JOB_COMMENTS_SETTING_BUILD_ON_ARCH = 'build_on_arch' +SUBMITTED_JOB_COMMENTS_SETTING_BUILD_FOR_ARCH = 'build_for_arch' +SUBMITTED_JOB_COMMENTS_SETTING_JOBDIR = 'jobdir' SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT = 'initial_comment' SUBMITTED_JOB_COMMENTS_SETTING_WITH_ACCELERATOR = 'with_accelerator' @@ -135,6 +140,26 @@ JOB_HANDOVER_PROTOCOL_HOLD_RELEASE } +# Allows us to error on config items that were removed +FORBIDDEN_CONFIG = { + SECTION_ARCHITECTURETARGETS: [ + ( + ARCHITECTURETARGETS_SETTING_ARCH_TARGET_MAP, + f"Config invalid: '{ARCHITECTURETARGETS_SETTING_ARCH_TARGET_MAP}' was removed and replaced by " + f"'{NODE_TYPE_MAP}'. See app.cfg.example for details." + ) + ], + SECTION_SUBMITTED_JOB_COMMENTS: [ + ( + SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT, + f"Config invalid: '{SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT}' was removed and replaced by " + f"'{SUBMITTED_JOB_COMMENTS_SETTING_INSTANCE_REPO}', '{SUBMITTED_JOB_COMMENTS_SETTING_BUILD_ON_ARCH}', " + f"'{SUBMITTED_JOB_COMMENTS_SETTING_BUILD_FOR_ARCH}' and '{SUBMITTED_JOB_COMMENTS_SETTING_JOBDIR}'. " + "See app.cfg.example for details." + ) + ] +} + def read_config(path='app.cfg'): """ @@ -158,10 +183,10 @@ def read_config(path='app.cfg'): return config -def check_required_cfg_settings(req_settings, path="app.cfg"): +def check_cfg_settings(req_settings, path="app.cfg"): """ - Reads the config file, checks if it contains the required settings, - if not logs an error message and exits. + Reads the config file, checks if it contains the required settings, and if it does not contain forbidden + (i.e. removed) settings. If the check fails, logs an error message and exits. 
Args: req_settings (dict (str, list)): required settings @@ -181,4 +206,14 @@ def check_required_cfg_settings(req_settings, path="app.cfg"): for item in req_settings[section]: if item not in cfg[section]: error(f'Missing configuration item "{item}" in section "{section}" of configuration file {path}.') + + # Check for forbidden arguments + for section in FORBIDDEN_CONFIG: + if section in cfg: + for item in FORBIDDEN_CONFIG[section]: + # First element of the tuple is the forbidden config item, check if its in the section + if item[0] in cfg[section]: + # Item 1 contains a specific error message + error(item[1]) + return True From 697dc6ea1baa432e790756bb9d94e8df9fb3c97d Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 16 Jul 2025 11:29:05 +0200 Subject: [PATCH 064/132] Update the status command to account for the new on:... for:... syntax --- eessi_bot_event_handler.py | 13 ++-- tasks/build.py | 121 +++++++++++++++++++++++++------------ 2 files changed, 93 insertions(+), 41 deletions(-) diff --git a/eessi_bot_event_handler.py b/eessi_bot_event_handler.py index 9bae8c48..723a6139 100644 --- a/eessi_bot_event_handler.py +++ b/eessi_bot_event_handler.py @@ -590,10 +590,12 @@ def handle_bot_command_status(self, event_info, bot_command): comment_status = '' comment_status += "\nThis is the status of all the `bot: build` commands:" - comment_status += "\n|arch|result|date|status|url|" - comment_status += "\n|----|------|----|------|---|" + comment_status += "\n|on|for|repo|result|date|status|url|" + comment_status += "\n|----|----|----|------|----|------|---|" for x in range(0, len(status_table['date'])): - comment_status += f"\n|{status_table['arch'][x]}|" + comment_status += f"\n|{status_table['on arch'][x]}|" + comment_status += f"{status_table['for arch'][x]}|" + comment_status += f"{status_table['for repo'][x]}|" comment_status += f"{status_table['result'][x]}|" comment_status += f"{status_table['date'][x]}|" comment_status += 
f"{status_table['status'][x]}|" @@ -601,7 +603,10 @@ def handle_bot_command_status(self, event_info, bot_command): self.log(f"Overview of finished builds: comment '{comment_status}'") issue_comment = create_comment(repo_name, pr_number, comment_status, ChatLevels.MINIMAL) - return issue_comment + if issue_comment: + return f"\n - added status comment {issue_comment.html_url}" + else: + return "\n - failed to create status comment" def start(self, app, port=3000): """ diff --git a/tasks/build.py b/tasks/build.py index 1b229d96..f5a64577 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -1206,6 +1206,15 @@ def template_to_regex(format_str, with_eol=True): return re.compile(full_pattern) +class PartialFormatDict(dict): + """ + A dictionary class that allows for missing keys - and will just return {key} in that case. + This can be used to partially format some, but not all placeholders in a formatting string. + """ + def __missing__(self, key): + return "{" + key + "}" + + def request_bot_build_issue_comments(repo_name, pr_number): """ Query the github API for the issue_comments in a pr. 
@@ -1235,6 +1244,7 @@ def request_bot_build_issue_comments(repo_name, pr_number): for comment in comments: # iterate through the comments to find the one where the status of the build was in submitted_job_comments_section = cfg[config.SECTION_SUBMITTED_JOB_COMMENTS] + accelerator_fmt = submitted_job_comments_section[config.SUBMITTED_JOB_COMMENTS_SETTING_WITH_ACCELERATOR] instance_repo_fmt = submitted_job_comments_section[config.SUBMITTED_JOB_COMMENTS_SETTING_INSTANCE_REPO] instance_repo_re = template_to_regex(instance_repo_fmt) comment_body = comment['body'].split('\n') @@ -1244,46 +1254,83 @@ def request_bot_build_issue_comments(repo_name, pr_number): # it is building for) # Then, check that it has at least 4 lines so that we can safely index up to that number if instance_repo_match and len(comment_body) >= 4: + log(f"{fn}(): found bot build response in issue, processing...") + # First, extract the repo_id print(f"Instance match: {instance_repo_match.groupdict()}") + log(f"{fn}(): found build for repository: {instance_repo_match.group('repo_id')}") + status_table['for repo'].append(instance_repo_match.group('repo_id')) + + # TODO: this unconditionally adds the accelerator_fmt, but that's only needed _if an accelerator was used_ + # We should split these cases. Probably by first doing a match _with_ accelerator (the most specific case) + # If that fails to match, we continue to match without accelerator + + # Then, try to match the architecture we build on. 
+ # First try this including accelerator, to see if one was defined on_arch_fmt = submitted_job_comments_section[config.SUBMITTED_JOB_COMMENTS_SETTING_BUILD_ON_ARCH] - on_arch_re = template_to_regex(on_arch_fmt) - print(f"Matching string {comment_body[1]} with re: {on_arch_re}") - on_arch_match = re.match(on_arch_re, comment_body[1]) + on_arch_fmt_with_accel = on_arch_fmt.format_map(PartialFormatDict(on_accelerator=accelerator_fmt)) + on_arch_re_with_accel = template_to_regex(on_arch_fmt_with_accel) + print(f"Matching string {comment_body[1]} with re: {on_arch_re_with_accel}") + on_arch_match = re.match(on_arch_re_with_accel, comment_body[1]) + if on_arch_match: + # Pattern with accelerator matched, append to status_table + print(f"On arch match: {on_arch_match.groupdict()}") + log(f"{fn}(): found build on architecture: {on_arch_match.group('on_arch')}, " + f"with accelerator {on_arch_match.group('accelerator')}") + status_table['on arch'].append(f"`{on_arch_match.group('on_arch')}`, " + f"`{on_arch_match.group('accelerator')}`") + else: + # Pattern with accelerator did not match, retry without accelerator + on_arch_re = template_to_regex(on_arch_fmt) + print(f"Matching string {comment_body[1]} with re: {on_arch_re}") + on_arch_match = re.match(on_arch_re, comment_body[1]) + if on_arch_match: + # Pattern without accelerator matched, append to status_table + print(f"On arch match: {on_arch_match.groupdict()}") + log(f"{fn}(): found build on architecture: {on_arch_match.group('on_arch')}") + status_table['on arch'].append(f"`{on_arch_match.group('on_arch')}`") + else: + # This shouldn't happen: we had an instance_repo_match, but no match for the 'on architecture' + msg = "Could not match regular expression for extracting the architecture to build on.\n" + msg += "String to be matched:\n" + msg += f"{comment_body[1]}\n" + msg += "First regex attempted:\n" + msg += f"{on_arch_re_with_accel.pattern}\n" + msg += "Second regex attempted:\n" + msg += 
f"{on_arch_re.pattern}\n" + raise ValueError(msg) + # Now, do the same for the architecture we build for. I.e. first, try to match including accelerator for_arch_fmt = submitted_job_comments_section[config.SUBMITTED_JOB_COMMENTS_SETTING_BUILD_FOR_ARCH] - for_arch_re = template_to_regex(for_arch_fmt) - print(f"Matching string {comment_body[2]} with re: {for_arch_re}") - for_arch_match = re.match(for_arch_re, comment_body[2]) - print(f"On arch match: {on_arch_match.groupdict()}") - print(f"For arch match: {for_arch_match.groupdict()}") - # Does everything match (it should, if we already had an instance_repo_match, but good to be sure) - if on_arch_match and for_arch_match: - instance_repo_dict = instance_repo_match.groupdict() - on_arch_dict = on_arch_match.groupdict() - for_arch_dict = for_arch_match.groupdict() - # Play it safe - # TODO: probably log something in the 'else' case - if 'on_arch' in on_arch_dict: - status_table['on arch'].append(on_arch_dict['on_arch']) - if 'for_arch' in for_arch_dict: - status_table['for arch'].append(for_arch_dict['for_arch']) - if 'repo_id' in instance_repo_dict: - status_table['for repo'].append(instance_repo_dict['repo_id']) - - # TODO: extract the building on, building for and repository name, and put those in the table - # NOTE: previously, we could use arch. We can't anymore, since we also want the 'for' architecture - # that is not available in this scope - # We should probably split the SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT into different components - # and create a regex out of those components? 
- - # get archictecture from comment['body'] - node_map = get_node_types(cfg) - # for arch in node_map.keys(): - # # drop the first element in arch (which names the OS type) and join the remaining items with '-' - # target_arch = '-'.join(arch.split('/')[1:]) - # if target_arch in first_line: - # status_table['arch'].append(target_arch) - # else: - # log(f"{fn}(): target_arch '{target_arch}' not found in first line '{first_line}'") + for_arch_fmt_with_accel = for_arch_fmt.format_map(PartialFormatDict(for_accelerator=accelerator_fmt)) + for_arch_re_with_accel = template_to_regex(for_arch_fmt_with_accel) + print(f"Matching string {comment_body[2]} with re: {for_arch_re_with_accel}") + for_arch_match = re.match(for_arch_re_with_accel, comment_body[2]) + if for_arch_match: + # Pattern with accelerator matched, append to status_table + print(f"For arch match: {for_arch_match.groupdict()}") + log(f"{fn}(): found build for architecture: {for_arch_match.group('for_arch')}, " + f"with accelerator {for_arch_match.group('accelerator')}") + status_table['for arch'].append(f"`{for_arch_match.group('for_arch')}`, " + f"`{for_arch_match.group('accelerator')}`") + else: + # Pattern with accelerator did not match, retry without accelerator + for_arch_re = template_to_regex(for_arch_fmt) + print(f"Matching string {comment_body[1]} with re: {for_arch_re}") + for_arch_match = re.match(for_arch_re, comment_body[2]) + if for_arch_match: + # Pattern without accelerator matched, append to status_table + print(f"For arch match: {for_arch_match.groupdict()}") + log(f"{fn}(): found build for architecture: {for_arch_match.group('for_arch')}") + status_table['for arch'].append(f"`{for_arch_match.group('for_arch')}`") + else: + # This shouldn't happen: we had an instance_repo_match, but no match for the 'on architecture' + msg = "Could not match regular expression for extracting the architecture to build for.\n" + msg += "String to be matched:\n" + msg += f"{comment_body[2]}\n" + msg += "First 
regex attempted:\n" + msg += f"{for_arch_re_with_accel.pattern}\n" + msg += "Second regex attempted:\n" + msg += f"{for_arch_re.pattern}\n" + raise ValueError(msg) # get date, status, url and result from the markdown table comment_table = comment['body'][comment['body'].find('|'):comment['body'].rfind('|')+1] From c0fe051b88e8ffd92f1b2f5b98e353c0f82c3cae Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 16 Jul 2025 11:34:04 +0200 Subject: [PATCH 065/132] Remove debugging print statements --- tasks/build.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/tasks/build.py b/tasks/build.py index f5a64577..54516b84 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -1248,7 +1248,6 @@ def request_bot_build_issue_comments(repo_name, pr_number): instance_repo_fmt = submitted_job_comments_section[config.SUBMITTED_JOB_COMMENTS_SETTING_INSTANCE_REPO] instance_repo_re = template_to_regex(instance_repo_fmt) comment_body = comment['body'].split('\n') - print(f"Matching string {comment_body[0]} with re: {instance_repo_re}") instance_repo_match = re.match(instance_repo_re, comment_body[0]) # Check if this body starts with an initial comment from the bot (first item is always the instance + repo # it is building for) @@ -1256,7 +1255,6 @@ def request_bot_build_issue_comments(repo_name, pr_number): if instance_repo_match and len(comment_body) >= 4: log(f"{fn}(): found bot build response in issue, processing...") # First, extract the repo_id - print(f"Instance match: {instance_repo_match.groupdict()}") log(f"{fn}(): found build for repository: {instance_repo_match.group('repo_id')}") status_table['for repo'].append(instance_repo_match.group('repo_id')) @@ -1269,11 +1267,9 @@ def request_bot_build_issue_comments(repo_name, pr_number): on_arch_fmt = submitted_job_comments_section[config.SUBMITTED_JOB_COMMENTS_SETTING_BUILD_ON_ARCH] on_arch_fmt_with_accel = on_arch_fmt.format_map(PartialFormatDict(on_accelerator=accelerator_fmt)) on_arch_re_with_accel = 
template_to_regex(on_arch_fmt_with_accel) - print(f"Matching string {comment_body[1]} with re: {on_arch_re_with_accel}") on_arch_match = re.match(on_arch_re_with_accel, comment_body[1]) if on_arch_match: # Pattern with accelerator matched, append to status_table - print(f"On arch match: {on_arch_match.groupdict()}") log(f"{fn}(): found build on architecture: {on_arch_match.group('on_arch')}, " f"with accelerator {on_arch_match.group('accelerator')}") status_table['on arch'].append(f"`{on_arch_match.group('on_arch')}`, " @@ -1281,11 +1277,9 @@ def request_bot_build_issue_comments(repo_name, pr_number): else: # Pattern with accelerator did not match, retry without accelerator on_arch_re = template_to_regex(on_arch_fmt) - print(f"Matching string {comment_body[1]} with re: {on_arch_re}") on_arch_match = re.match(on_arch_re, comment_body[1]) if on_arch_match: # Pattern without accelerator matched, append to status_table - print(f"On arch match: {on_arch_match.groupdict()}") log(f"{fn}(): found build on architecture: {on_arch_match.group('on_arch')}") status_table['on arch'].append(f"`{on_arch_match.group('on_arch')}`") else: @@ -1302,11 +1296,9 @@ def request_bot_build_issue_comments(repo_name, pr_number): for_arch_fmt = submitted_job_comments_section[config.SUBMITTED_JOB_COMMENTS_SETTING_BUILD_FOR_ARCH] for_arch_fmt_with_accel = for_arch_fmt.format_map(PartialFormatDict(for_accelerator=accelerator_fmt)) for_arch_re_with_accel = template_to_regex(for_arch_fmt_with_accel) - print(f"Matching string {comment_body[2]} with re: {for_arch_re_with_accel}") for_arch_match = re.match(for_arch_re_with_accel, comment_body[2]) if for_arch_match: # Pattern with accelerator matched, append to status_table - print(f"For arch match: {for_arch_match.groupdict()}") log(f"{fn}(): found build for architecture: {for_arch_match.group('for_arch')}, " f"with accelerator {for_arch_match.group('accelerator')}") status_table['for arch'].append(f"`{for_arch_match.group('for_arch')}`, " @@ -1314,11 
+1306,9 @@ def request_bot_build_issue_comments(repo_name, pr_number): else: # Pattern with accelerator did not match, retry without accelerator for_arch_re = template_to_regex(for_arch_fmt) - print(f"Matching string {comment_body[1]} with re: {for_arch_re}") for_arch_match = re.match(for_arch_re, comment_body[2]) if for_arch_match: # Pattern without accelerator matched, append to status_table - print(f"For arch match: {for_arch_match.groupdict()}") log(f"{fn}(): found build for architecture: {for_arch_match.group('for_arch')}") status_table['for arch'].append(f"`{for_arch_match.group('for_arch')}`") else: From f179b664667a285e3dceef5728b5c5237e343936 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 16 Jul 2025 11:45:08 +0200 Subject: [PATCH 066/132] Warn about the removal of the repo_target_map --- app.cfg.example | 5 +++++ tools/config.py | 8 ++++++++ 2 files changed, 13 insertions(+) diff --git a/app.cfg.example b/app.cfg.example index 7da0147d..b357fbab 100644 --- a/app.cfg.example +++ b/app.cfg.example @@ -351,6 +351,11 @@ node_type_map = { }} [repo_targets] + +# No longer used, repo targets are now specified per node type in the node_type_map +# repo_target_map = { +# "linux/x86_64/amd/zen2" : ["eessi.io-2023.06-software","eessi.io-2025.06-software"] } + # points to definition of repositories (default repository defined by build container) repos_cfg_dir = PATH_TO_SHARED_DIRECTORY/repos diff --git a/tools/config.py b/tools/config.py index ddfe9e3f..7f814ea4 100644 --- a/tools/config.py +++ b/tools/config.py @@ -111,6 +111,7 @@ NEW_JOB_COMMENTS_SETTING_AWAITS_LAUNCH = 'awaits_launch' SECTION_REPO_TARGETS = 'repo_targets' +REPO_TARGETS_SETTING_REPO_TARGET_MAP = 'repo_target_map' REPO_TARGETS_SETTING_REPOS_CFG_DIR = 'repos_cfg_dir' SECTION_RUNNING_JOB_COMMENTS = 'running_job_comments' @@ -149,6 +150,13 @@ f"'{NODE_TYPE_MAP}'. See app.cfg.example for details." 
) ], + SECTION_REPO_TARGETS: [ + ( + REPO_TARGETS_SETTING_REPO_TARGET_MAP, + f"Config invalid: '{REPO_TARGETS_SETTING_REPO_TARGET_MAP} was removed. Repository targets can now be " + f"specified within the '{NODE_TYPE_MAP}' dictionary. See app.cfg.example for details." + ) + ], SECTION_SUBMITTED_JOB_COMMENTS: [ ( SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT, From aad663e189f5af3aa7c798ef863f398960380371 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 16 Jul 2025 11:45:41 +0200 Subject: [PATCH 067/132] Fix typo --- app.cfg.example | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app.cfg.example b/app.cfg.example index b357fbab..63d3155a 100644 --- a/app.cfg.example +++ b/app.cfg.example @@ -305,7 +305,7 @@ signing = [architecturetargets] -# arch_target_map has been replaced by node_typ_map +# arch_target_map has been replaced by node_type_map # arch_target_map = { # } From be8c7d0553004829891b2f0cd1a4c47e6915b9ab Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 16 Jul 2025 11:48:18 +0200 Subject: [PATCH 068/132] Fix hound issues --- eessi_bot_event_handler.py | 4 +--- tasks/build.py | 9 +++------ 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/eessi_bot_event_handler.py b/eessi_bot_event_handler.py index 723a6139..7287d533 100644 --- a/eessi_bot_event_handler.py +++ b/eessi_bot_event_handler.py @@ -417,9 +417,7 @@ def handle_pull_request_opened_event(self, event_info, pr, req_chatlevel=ChatLev comment = f"Instance `{app_name}` is configured to build for:" for node in node_map: - # Do not print virtual partition names, a bot admin may not want to share those - # Instead, just number them - comment += f"\n- Partition {arch}:" + comment += f"\n- Partition {node}:" current_partition = node_map[node] if "os" in current_partition: comment += f"\n - OS: {current_partition['os']}" diff --git a/tasks/build.py b/tasks/build.py index 54516b84..71d55de9 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -20,7 +20,6 @@ # 
Standard library imports from collections import namedtuple import configparser -import codecs from datetime import datetime, timezone import json import os @@ -1254,14 +1253,11 @@ def request_bot_build_issue_comments(repo_name, pr_number): # Then, check that it has at least 4 lines so that we can safely index up to that number if instance_repo_match and len(comment_body) >= 4: log(f"{fn}(): found bot build response in issue, processing...") + # First, extract the repo_id log(f"{fn}(): found build for repository: {instance_repo_match.group('repo_id')}") status_table['for repo'].append(instance_repo_match.group('repo_id')) - # TODO: this unconditionally adds the accelerator_fmt, but that's only needed _if an accelerator was used_ - # We should split these cases. Probably by first doing a match _with_ accelerator (the most specific case) - # If that fails to match, we continue to match without accelerator - # Then, try to match the architecture we build on. # First try this including accelerator, to see if one was defined on_arch_fmt = submitted_job_comments_section[config.SUBMITTED_JOB_COMMENTS_SETTING_BUILD_ON_ARCH] @@ -1292,6 +1288,7 @@ def request_bot_build_issue_comments(repo_name, pr_number): msg += "Second regex attempted:\n" msg += f"{on_arch_re.pattern}\n" raise ValueError(msg) + # Now, do the same for the architecture we build for. I.e. 
first, try to match including accelerator for_arch_fmt = submitted_job_comments_section[config.SUBMITTED_JOB_COMMENTS_SETTING_BUILD_FOR_ARCH] for_arch_fmt_with_accel = for_arch_fmt.format_map(PartialFormatDict(for_accelerator=accelerator_fmt)) @@ -1302,7 +1299,7 @@ def request_bot_build_issue_comments(repo_name, pr_number): log(f"{fn}(): found build for architecture: {for_arch_match.group('for_arch')}, " f"with accelerator {for_arch_match.group('accelerator')}") status_table['for arch'].append(f"`{for_arch_match.group('for_arch')}`, " - f"`{for_arch_match.group('accelerator')}`") + f"`{for_arch_match.group('accelerator')}`") else: # Pattern with accelerator did not match, retry without accelerator for_arch_re = template_to_regex(for_arch_fmt) From 7f766f4b3639304f8c03fec1daf4a382c7505ddf Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 16 Jul 2025 11:49:48 +0200 Subject: [PATCH 069/132] Format releveant output of show_config as code --- eessi_bot_event_handler.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/eessi_bot_event_handler.py b/eessi_bot_event_handler.py index 7287d533..a14ace5c 100644 --- a/eessi_bot_event_handler.py +++ b/eessi_bot_event_handler.py @@ -417,16 +417,16 @@ def handle_pull_request_opened_event(self, event_info, pr, req_chatlevel=ChatLev comment = f"Instance `{app_name}` is configured to build for:" for node in node_map: - comment += f"\n- Partition {node}:" + comment += f"\n- Partition `{node}`:" current_partition = node_map[node] if "os" in current_partition: - comment += f"\n - OS: {current_partition['os']}" + comment += f"\n - OS: `{current_partition['os']}`" if "cpu_subdir" in current_partition: - comment += f"\n - CPU architecture: {current_partition['cpu_subdir']}" + comment += f"\n - CPU architecture: `{current_partition['cpu_subdir']}`" if "repo_targets" in current_partition: - comment += f"\n - Repositories: {current_partition['repo_targets']}" + comment += f"\n - Repositories: 
`{current_partition['repo_targets']}`" if "accel" in current_partition: - comment += f"\n - Accelerators: {current_partition['accel']}" + comment += f"\n - Accelerators: `{current_partition['accel']}`" comment += "\n" self.log(f"PR opened: comment '{comment}'") From d205598c787f9162de5b75afaf5784b0051cab40 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 16 Jul 2025 11:50:40 +0200 Subject: [PATCH 070/132] Rephrase to make things more clear --- eessi_bot_event_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eessi_bot_event_handler.py b/eessi_bot_event_handler.py index a14ace5c..d7beca4d 100644 --- a/eessi_bot_event_handler.py +++ b/eessi_bot_event_handler.py @@ -415,7 +415,7 @@ def handle_pull_request_opened_event(self, event_info, pr, req_chatlevel=ChatLev # repositories node_map = get_node_types(self.cfg) - comment = f"Instance `{app_name}` is configured to build for:" + comment = f"Instance `{app_name}` is configured to build on:" for node in node_map: comment += f"\n- Partition `{node}`:" current_partition = node_map[node] From ebcc7fd9c801ac6b6a8d5168cdbac9965799266e Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 16 Jul 2025 11:56:51 +0200 Subject: [PATCH 071/132] Forgot to add this new file... 
--- tools/build_params.py | 75 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 tools/build_params.py diff --git a/tools/build_params.py b/tools/build_params.py new file mode 100644 index 00000000..81bc7bc5 --- /dev/null +++ b/tools/build_params.py @@ -0,0 +1,75 @@ +# This file is part of the EESSI build-and-deploy bot, +# see https://github.com/EESSI/eessi-bot-software-layer +# +# The bot helps with requests to add software installations to the +# EESSI software layer, see https://github.com/EESSI/software-layer +# +# author: Caspar van Leeuwen +# +# license: GPLv2 +# + +from tools.filter import FILTER_COMPONENT_ACCEL, FILTER_COMPONENT_ARCH + +# Define these constants with the same values. We want the arguments passed to +# on: and for: to use the same keywords +BUILD_PARAM_ACCEL = FILTER_COMPONENT_ACCEL +BUILD_PARAM_ARCH = FILTER_COMPONENT_ARCH +BUILD_PARAMS = [ + BUILD_PARAM_ACCEL, + BUILD_PARAM_ARCH +] + +class EESSIBotBuildParamsValueError(Exception): + """ + Exception to be raised when an inappropriate value is specified for a build parameter + """ + pass + +class EESSIBotBuildParamsNameError(Exception): + """ + Exception to be raised when an unkown build parameter name is specified + """ + pass + +class EESSIBotBuildParams(dict): + """ + Class for representing build parameters. 
Essentially, this is a dictionary class + but with some additional parsing for the constructor + """ + def __init__(self, build_parameters): + """ + EESSIBotBuildParams constructor + + Args: + build_params (string): string containing comma separated build parameters + Example: "arch:amd/zen4,accel:nvidia/cc90" + + Raises: + EESSIBotBuildParamsNameError: raised if parsing an unknown build parameter + string + EESSIBotBuildParamsValueError: raised if an invalid value is passed for a build parameter + """ + build_param_dict = {} + + # Loop over defined build parameters argument + build_params_list = build_parameters.split(',') + for item in build_params_list: + # Separate build parameter name and value + build_param = item.split('=') + if len(build_param) != 2: + msg = f"Expected argument {item} to be split into exactly two parts when splitting by '=', " + msg += f"but the number of items after splitting is {len(build_param)}" + raise EESSIBotBuildParamsValueError(msg) + param_found = False + for full_param_name in BUILD_PARAMS: + # Identify which build param we are matching + if full_param_name.startswith(build_param[0]): + param_found = True + # Store the value of the build parameter by it's full name + build_param_dict[full_param_name] = build_param[1] + if not param_found: + msg = f"Build parameter {build_param[0]} not found. 
Known build parameters: {BUILD_PARAMS}" + raise EESSIBotBuildParamsNameError(msg) + + super().__init__(build_param_dict) From 0a8bc9b9465d15059139d3742cbf91ec79208a9f Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 16 Jul 2025 11:58:15 +0200 Subject: [PATCH 072/132] Fix hound issues --- tools/build_params.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/build_params.py b/tools/build_params.py index 81bc7bc5..4ba34886 100644 --- a/tools/build_params.py +++ b/tools/build_params.py @@ -20,18 +20,21 @@ BUILD_PARAM_ARCH ] + class EESSIBotBuildParamsValueError(Exception): """ Exception to be raised when an inappropriate value is specified for a build parameter """ pass + class EESSIBotBuildParamsNameError(Exception): """ Exception to be raised when an unkown build parameter name is specified """ pass + class EESSIBotBuildParams(dict): """ Class for representing build parameters. Essentially, this is a dictionary class @@ -65,9 +68,9 @@ def __init__(self, build_parameters): for full_param_name in BUILD_PARAMS: # Identify which build param we are matching if full_param_name.startswith(build_param[0]): - param_found = True - # Store the value of the build parameter by it's full name - build_param_dict[full_param_name] = build_param[1] + param_found = True + # Store the value of the build parameter by it's full name + build_param_dict[full_param_name] = build_param[1] if not param_found: msg = f"Build parameter {build_param[0]} not found. 
Known build parameters: {BUILD_PARAMS}" raise EESSIBotBuildParamsNameError(msg) From 81257dbb5f690745c299c5653ebeb0eeb4e5731a Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 16 Jul 2025 12:05:08 +0200 Subject: [PATCH 073/132] Update build params call signature --- tests/test_task_build.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tests/test_task_build.py b/tests/test_task_build.py index 1c289947..eef91879 100644 --- a/tests/test_task_build.py +++ b/tests/test_task_build.py @@ -29,6 +29,7 @@ # Local application imports (anything from EESSI/eessi-bot-software-layer) from tasks.build import Job, create_pr_comment from tools import run_cmd, run_subprocess +from tools.build_params import EESSIBotBuildParams from tools.job_metadata import create_metadata_file, read_metadata_file from tools.pr_comments import PRComment, get_submitted_job_comment @@ -287,6 +288,7 @@ def test_create_pr_comment_succeeds(monkeypatch, mocked_github, tmpdir): ym = datetime.today().strftime('%Y.%m') pr_number = 1 job = Job(tmpdir, "test/architecture", "EESSI", "--speed-up", ym, pr_number, "fpga/magic") + build_params = EESSIBotBuildParams("arch:amd/zen4,accel:nvidia/cc90") job_id = "123" app_name = "pytest" @@ -295,7 +297,7 @@ def test_create_pr_comment_succeeds(monkeypatch, mocked_github, tmpdir): repo = mocked_github.get_repo(repo_name) pr = repo.get_pull(pr_number) symlink = "/symlink" - comment = create_pr_comment(job, job_id, app_name, pr, symlink) + comment = create_pr_comment(job, job_id, app_name, pr, symlink, build_params) assert comment.id == 1 # check if created comment includes jobid? 
print("VERIFYING PR COMMENT") @@ -317,6 +319,7 @@ def test_create_pr_comment_succeeds_none(monkeypatch, mocked_github, tmpdir): ym = datetime.today().strftime('%Y.%m') pr_number = 1 job = Job(tmpdir, "test/architecture", "EESSI", "--speed-up", ym, pr_number, "fpga/magic") + build_params = EESSIBotBuildParams("arch:amd/zen4,accel:nvidia/cc90") job_id = "123" app_name = "pytest" @@ -325,7 +328,7 @@ def test_create_pr_comment_succeeds_none(monkeypatch, mocked_github, tmpdir): repo = mocked_github.get_repo(repo_name) pr = repo.get_pull(pr_number) symlink = "/symlink" - comment = create_pr_comment(job, job_id, app_name, pr, symlink) + comment = create_pr_comment(job, job_id, app_name, pr, symlink, build_params) assert comment is None @@ -343,6 +346,7 @@ def test_create_pr_comment_raises_once_then_succeeds(monkeypatch, mocked_github, ym = datetime.today().strftime('%Y.%m') pr_number = 1 job = Job(tmpdir, "test/architecture", "EESSI", "--speed-up", ym, pr_number, "fpga/magic") + build_params = EESSIBotBuildParams("arch:amd/zen4,accel:nvidia/cc90") job_id = "123" app_name = "pytest" @@ -351,7 +355,7 @@ def test_create_pr_comment_raises_once_then_succeeds(monkeypatch, mocked_github, repo = mocked_github.get_repo(repo_name) pr = repo.get_pull(pr_number) symlink = "/symlink" - comment = create_pr_comment(job, job_id, app_name, pr, symlink) + comment = create_pr_comment(job, job_id, app_name, pr, symlink, build_params) assert comment.id == 1 assert pr.create_call_count == 2 @@ -369,6 +373,7 @@ def test_create_pr_comment_always_raises(monkeypatch, mocked_github, tmpdir): ym = datetime.today().strftime('%Y.%m') pr_number = 1 job = Job(tmpdir, "test/architecture", "EESSI", "--speed-up", ym, pr_number, "fpga/magic") + build_params = EESSIBotBuildParams("arch:amd/zen4,accel:nvidia/cc90") job_id = "123" app_name = "pytest" @@ -378,7 +383,7 @@ def test_create_pr_comment_always_raises(monkeypatch, mocked_github, tmpdir): pr = repo.get_pull(pr_number) symlink = "/symlink" with 
pytest.raises(Exception) as err: - create_pr_comment(job, job_id, app_name, pr, symlink) + create_pr_comment(job, job_id, app_name, pr, symlink, build_params) assert err.type == CreateIssueCommentException assert pr.create_call_count == 3 @@ -396,6 +401,7 @@ def test_create_pr_comment_three_raises(monkeypatch, mocked_github, tmpdir): ym = datetime.today().strftime('%Y.%m') pr_number = 1 job = Job(tmpdir, "test/architecture", "EESSI", "--speed-up", ym, pr_number, "fpga/magic") + build_params = EESSIBotBuildParams("arch:amd/zen4,accel:nvidia/cc90") job_id = "123" app_name = "pytest" @@ -405,7 +411,7 @@ def test_create_pr_comment_three_raises(monkeypatch, mocked_github, tmpdir): pr = repo.get_pull(pr_number) symlink = "/symlink" with pytest.raises(Exception) as err: - create_pr_comment(job, job_id, app_name, pr, symlink) + create_pr_comment(job, job_id, app_name, pr, symlink, build_params) assert err.type == CreateIssueCommentException assert pr.create_call_count == 3 From f974463389222e2b44bea895a480637e802e3f28 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 16 Jul 2025 12:07:33 +0200 Subject: [PATCH 074/132] Fix example argument, and argument used to create build parameters in the test --- tools/build_params.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/build_params.py b/tools/build_params.py index 4ba34886..05eb63eb 100644 --- a/tools/build_params.py +++ b/tools/build_params.py @@ -45,8 +45,8 @@ def __init__(self, build_parameters): EESSIBotBuildParams constructor Args: - build_params (string): string containing comma separated build parameters - Example: "arch:amd/zen4,accel:nvidia/cc90" + build_parameters (string): string containing comma separated build parameters + Example: "arch=amd/zen4,accel=nvidia/cc90" Raises: EESSIBotBuildParamsNameError: raised if parsing an unknown build parameter From 4104796084c244e516211e27d58c47c66d942e1e Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 16 Jul 2025 12:09:50 
+0200 Subject: [PATCH 075/132] Forgot to actually git add this file again... anyway, updated the syntax for the string to create build parameter objects --- tests/test_task_build.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_task_build.py b/tests/test_task_build.py index eef91879..af49ac9b 100644 --- a/tests/test_task_build.py +++ b/tests/test_task_build.py @@ -288,7 +288,7 @@ def test_create_pr_comment_succeeds(monkeypatch, mocked_github, tmpdir): ym = datetime.today().strftime('%Y.%m') pr_number = 1 job = Job(tmpdir, "test/architecture", "EESSI", "--speed-up", ym, pr_number, "fpga/magic") - build_params = EESSIBotBuildParams("arch:amd/zen4,accel:nvidia/cc90") + build_params = EESSIBotBuildParams("arch=amd/zen4,accel=nvidia/cc90") job_id = "123" app_name = "pytest" @@ -319,7 +319,7 @@ def test_create_pr_comment_succeeds_none(monkeypatch, mocked_github, tmpdir): ym = datetime.today().strftime('%Y.%m') pr_number = 1 job = Job(tmpdir, "test/architecture", "EESSI", "--speed-up", ym, pr_number, "fpga/magic") - build_params = EESSIBotBuildParams("arch:amd/zen4,accel:nvidia/cc90") + build_params = EESSIBotBuildParams("arch=amd/zen4,accel=nvidia/cc90") job_id = "123" app_name = "pytest" @@ -346,7 +346,7 @@ def test_create_pr_comment_raises_once_then_succeeds(monkeypatch, mocked_github, ym = datetime.today().strftime('%Y.%m') pr_number = 1 job = Job(tmpdir, "test/architecture", "EESSI", "--speed-up", ym, pr_number, "fpga/magic") - build_params = EESSIBotBuildParams("arch:amd/zen4,accel:nvidia/cc90") + build_params = EESSIBotBuildParams("arch=amd/zen4,accel=nvidia/cc90") job_id = "123" app_name = "pytest" @@ -373,7 +373,7 @@ def test_create_pr_comment_always_raises(monkeypatch, mocked_github, tmpdir): ym = datetime.today().strftime('%Y.%m') pr_number = 1 job = Job(tmpdir, "test/architecture", "EESSI", "--speed-up", ym, pr_number, "fpga/magic") - build_params = EESSIBotBuildParams("arch:amd/zen4,accel:nvidia/cc90") + build_params 
= EESSIBotBuildParams("arch=amd/zen4,accel=nvidia/cc90") job_id = "123" app_name = "pytest" @@ -401,7 +401,7 @@ def test_create_pr_comment_three_raises(monkeypatch, mocked_github, tmpdir): ym = datetime.today().strftime('%Y.%m') pr_number = 1 job = Job(tmpdir, "test/architecture", "EESSI", "--speed-up", ym, pr_number, "fpga/magic") - build_params = EESSIBotBuildParams("arch:amd/zen4,accel:nvidia/cc90") + build_params = EESSIBotBuildParams("arch=amd/zen4,accel=nvidia/cc90") job_id = "123" app_name = "pytest" From 0b82386098b5952cb9f67d3d75c8dafd020f800f Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 16 Jul 2025 12:16:44 +0200 Subject: [PATCH 076/132] Update the app.cfg used for the unit tests to account for the changes in this PR --- tests/test_app.cfg | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/test_app.cfg b/tests/test_app.cfg index 4f833bbf..56c1d6cc 100644 --- a/tests/test_app.cfg +++ b/tests/test_app.cfg @@ -21,7 +21,10 @@ job_handover_protocol = hold_release awaits_release = job id `{job_id}` awaits release by job manager awaits_release_delayed_begin_msg = job id `{job_id}` will be eligible to start in about {delay_seconds} seconds awaits_release_hold_release_msg = job id `{job_id}` awaits release by job manager -initial_comment = New job on instance `{app_name}` for CPU micro-architecture `{arch_name}`{accelerator_spec} for repository `{repo_id}` in job dir `{symlink}` +new_job_instance_repo = New job on instance `{app_name}` for repository `{repo_id}` +build_on_arch = Building on: `{on_arch}`{on_accelerator} +build_for_arch = Building for: `{for_arch}`{for_accelerator} +jobdir = Job dir: `{symlink}` with_accelerator =  and accelerator `{accelerator}` [new_job_comments] From 372a7fe36b14486cbbe2eeeb20c5e2f703c9dfcf Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 16 Jul 2025 12:36:00 +0200 Subject: [PATCH 077/132] Update tests for new requirement that all filters have to be present in context --- 
tests/test_tools_filter.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/tests/test_tools_filter.py b/tests/test_tools_filter.py index b689aa60..3684046a 100644 --- a/tests/test_tools_filter.py +++ b/tests/test_tools_filter.py @@ -231,21 +231,27 @@ def test_match_empty_context(complex_filter): assert expected == actual -def test_match_architecture_context(complex_filter): +# Test if it matches a context that does NOT contain all components +def test_match_sparse_context(complex_filter): context = {"architecture": "x86_64/intel/cascadelake"} - expected = True + expected = False actual = complex_filter.check_filters(context) assert expected == actual - -def test_match_architecture_job_context(complex_filter): - context = {"architecture": "x86_64/intel/cascadelake", "job": 1234} +def test_matching_context(complex_filter): + context = {"architecture": "x86_64/intel/cascadelake", "repository": "nessi.no-2022.A", "instance": "A"} expected = True actual = complex_filter.check_filters(context) assert expected == actual +def test_non_match_archictecture_context(complex_filter): + context = {"architecture": "x86_64/amd/zen4", "repository": "EESSI", "instance": "mybot", "job": 1234} + expected = False + actual = complex_filter.check_filters(context) + assert expected == actual + -def test_non_match_architecture_repository_context(complex_filter): +def test_non_match_repository_context(complex_filter): context = {"architecture": "x86_64/intel/cascadelake", "repository": "EESSI"} expected = False actual = complex_filter.check_filters(context) From d2be02a2a20141319a8380d694a7482420f97ab9 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 16 Jul 2025 12:40:58 +0200 Subject: [PATCH 078/132] Update tests to accommodate the new behaviour of filter checking that all filter components need to be present in the context for it to be a (possible) match --- tests/test_tools_filter.py | 20 ++++++++++++++++++-- 1 file changed, 18
insertions(+), 2 deletions(-) diff --git a/tests/test_tools_filter.py b/tests/test_tools_filter.py index 3684046a..ffee4b9e 100644 --- a/tests/test_tools_filter.py +++ b/tests/test_tools_filter.py @@ -231,19 +231,21 @@ def test_match_empty_context(complex_filter): assert expected == actual -# Test if it matches a context that does NOT contain all components +# A context lacking keys for components in the filter shouldn't match def test_match_sparse_context(complex_filter): context = {"architecture": "x86_64/intel/cascadelake"} expected = False actual = complex_filter.check_filters(context) assert expected == actual + def test_matching_context(complex_filter): context = {"architecture": "x86_64/intel/cascadelake", "repository": "nessi.no-2022.A", "instance": "A"} expected = True actual = complex_filter.check_filters(context) assert expected == actual + def test_non_match_archictecture_context(complex_filter): context = {"architecture": "x86_64/amd/zen4", "repository": "EESSI", "instance": "mybot", "job": 1234} expected = False @@ -252,11 +254,25 @@ def test_non_match_archictecture_context(complex_filter): def test_non_match_repository_context(complex_filter): - context = {"architecture": "x86_64/intel/cascadelake", "repository": "EESSI"} + context = {"architecture": "x86_64/intel/cascadelake", "repository": "EESSI", "instance": "A"} expected = False actual = complex_filter.check_filters(context) assert expected == actual +def test_non_match_instance_context(complex_filter): + context = {"architecture": "x86_64/intel/cascadelake", "repository": "nessi.no-2022.A", "instance": "B"} + expected = False + actual = complex_filter.check_filters(context) + assert expected == actual + + +# If additional keys are present in the context for which no filter component is defined +# it should not prevent a match +def test_match_additional_context(complex_filter): + context = {"architecture": "x86_64/intel/cascadelake", "repository": "nessi.no-2022.A", "instance": "A", "job": 1234} 
+ expected = True + actual = complex_filter.check_filters(context) + assert expected == actual @pytest.fixture def arch_filter_slash_syntax(): From 6b3a118656ef90bb69ba60c181bcc6580278265b Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 16 Jul 2025 12:41:44 +0200 Subject: [PATCH 079/132] Fix hound issues --- tests/test_tools_filter.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_tools_filter.py b/tests/test_tools_filter.py index ffee4b9e..df464659 100644 --- a/tests/test_tools_filter.py +++ b/tests/test_tools_filter.py @@ -269,11 +269,13 @@ def test_non_match_instance_context(complex_filter): # If additional keys are present in the context for which no filter component is defined # it should not prevent a match def test_match_additional_context(complex_filter): - context = {"architecture": "x86_64/intel/cascadelake", "repository": "nessi.no-2022.A", "instance": "A", "job": 1234} + context = {"architecture": "x86_64/intel/cascadelake", "repository": "nessi.no-2022.A", "instance": "A", + "job": 1234} expected = True actual = complex_filter.check_filters(context) assert expected == actual + @pytest.fixture def arch_filter_slash_syntax(): af = EESSIBotActionFilter("") From 3b310f5fdec375ed64899edbe9c47e2b4b85401d Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 16 Jul 2025 12:43:24 +0200 Subject: [PATCH 080/132] Fix flake8 issues --- eessi_bot_job_manager.py | 1 + tests/test_tools_filter.py | 1 + tools/commands.py | 3 ++- 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py index fedfb0ba..1efb9d85 100644 --- a/eessi_bot_job_manager.py +++ b/eessi_bot_job_manager.py @@ -73,6 +73,7 @@ config.RUNNING_JOB_COMMENTS_SETTING_RUNNING_JOB] # required } + class EESSIBotSoftwareLayerJobManager: """ Class for representing the job manager of the build-and-deploy bot. 
It diff --git a/tests/test_tools_filter.py b/tests/test_tools_filter.py index df464659..c6516cf7 100644 --- a/tests/test_tools_filter.py +++ b/tests/test_tools_filter.py @@ -259,6 +259,7 @@ def test_non_match_repository_context(complex_filter): actual = complex_filter.check_filters(context) assert expected == actual + def test_non_match_instance_context(complex_filter): context = {"architecture": "x86_64/intel/cascadelake", "repository": "nessi.no-2022.A", "instance": "B"} expected = False diff --git a/tools/commands.py b/tools/commands.py index 360279f5..a44e1fc8 100644 --- a/tools/commands.py +++ b/tools/commands.py @@ -20,6 +20,7 @@ from tools.filter import EESSIBotActionFilter, EESSIBotActionFilterError from tools.build_params import EESSIBotBuildParams + def contains_any_bot_command(body): """ Checks if argument contains any bot command. @@ -120,7 +121,7 @@ def __init__(self, cmd_str): for arg in cmd_as_list[1:]: if arg.startswith('for:'): # Extract everything after the 'for:' suffix and split by comma - filter_content=arg[4:] + filter_content = arg[4:] target_args.extend(filter_content.split(',')) # Join the filter arguments and pass to EESSIBotActionFilter From 624109eef10549c6cc7e48ec703fc17596c43cb2 Mon Sep 17 00:00:00 2001 From: laraPPr Date: Thu, 17 Jul 2025 15:13:29 +0200 Subject: [PATCH 081/132] update the scontrol command to work on system with multiple clusters Signed-off-by: laraPPr --- eessi_bot_job_manager.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py index 4fcf9af3..23c32f3b 100644 --- a/eessi_bot_job_manager.py +++ b/eessi_bot_job_manager.py @@ -118,6 +118,8 @@ def get_current_jobs(self): squeue_cmd = "%s --long --noheader --user=%s" % (self.poll_command, username) if self.job_name: squeue_cmd += " --name='%s'" % self.job_name + # Format the output of SLURM + squeue_cmd += " --Format JobId:100@,Cluster:100@,Partition:100@,State:100@,Reason:100" 
squeue_output, squeue_err, squeue_exitcode = run_cmd( squeue_cmd, "get_current_jobs(): squeue command", @@ -139,14 +141,18 @@ def get_current_jobs(self): # Note, all output lines of squeue are processed because we run it with # --noheader. for line in lines: - job = line.rstrip().split() - if len(job) >= 9: - job_id = job[0] - state = job[4] + job = line.rstrip().split('@') + print(job) + if len(job) == 5: + print(job) + job_id = job[0].rstrip() + state = job[3].rstrip() current_jobs[job_id] = { "jobid": job_id, + "cluster": job[1].rstrip(), + "partition": job[2].rstrip(), "state": state, - "reason": job[8], + "reason": job[4].rstrip(), } if state in bad_state_messages: log("Job {} in state {}: {}".format(job_id, state, bad_state_messages[state])) @@ -296,6 +302,14 @@ def process_new_job(self, new_job): """ job_id = new_job["jobid"] + # check if their is a placeholder value in the scontrol_command + if bool(re.search(r'%\([^)]+\)s', self.scontrol_command)): + placeholders = re.findall(r'%\(([^)]+)\)s', self.scontrol_command) + for placeholder in placeholders: + if placeholder == 'new_job["cluster"]': + self.scontrol_command = self.scontrol_command % {placeholder: new_job["cluster"]} + print(new_job['cluster']) + print(self.scontrol_command) scontrol_cmd = "%s --oneliner show jobid %s" % ( self.scontrol_command, job_id, From b14043ec6966804cd305f646cf0d7c9a76e3666c Mon Sep 17 00:00:00 2001 From: Lara Ramona Peeters <49882639+laraPPr@users.noreply.github.com> Date: Thu, 17 Jul 2025 15:16:59 +0200 Subject: [PATCH 082/132] clean up --- eessi_bot_job_manager.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py index 23c32f3b..d6c59854 100644 --- a/eessi_bot_job_manager.py +++ b/eessi_bot_job_manager.py @@ -142,9 +142,7 @@ def get_current_jobs(self): # --noheader. 
for line in lines: job = line.rstrip().split('@') - print(job) if len(job) == 5: - print(job) job_id = job[0].rstrip() state = job[3].rstrip() current_jobs[job_id] = { @@ -308,8 +306,6 @@ def process_new_job(self, new_job): for placeholder in placeholders: if placeholder == 'new_job["cluster"]': self.scontrol_command = self.scontrol_command % {placeholder: new_job["cluster"]} - print(new_job['cluster']) - print(self.scontrol_command) scontrol_cmd = "%s --oneliner show jobid %s" % ( self.scontrol_command, job_id, From de0bd1c96cc7e2ba2e42c04c95bfbc07480ddec3 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 21 Jul 2025 11:03:12 +0200 Subject: [PATCH 083/132] Removed some comments that were only there for development, no longer needed --- tasks/build.py | 29 +---------------------------- 1 file changed, 1 insertion(+), 28 deletions(-) diff --git a/tasks/build.py b/tasks/build.py index 71d55de9..89084867 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -201,28 +201,6 @@ def get_node_types(cfg): log(f"{fn}(): node type map '{json.dumps(node_type_map)}'") return node_type_map -# Replaced by get_node_types -# def get_architecture_targets(cfg): -# """ -# Obtain mappings of architecture targets to Slurm parameters -# -# Args: -# cfg (ConfigParser): ConfigParser instance holding full configuration -# (typically read from 'app.cfg') -# -# Returns: -# (dict): dictionary mapping architecture targets (format -# OS/SOFTWARE_SUBDIR) to architecture specific Slurm job submission -# parameters -# """ -# fn = sys._getframe().f_code.co_name -# -# architecture_targets = cfg[config.SECTION_ARCHITECTURETARGETS] -# -# arch_target_map = json.loads(architecture_targets.get(config.ARCHITECTURETARGETS_SETTING_ARCH_TARGET_MAP)) -# log(f"{fn}(): arch target map '{json.dumps(arch_target_map)}'") -# return arch_target_map - def get_allowed_exportvars(cfg): """ @@ -868,7 +846,7 @@ def submit_job(job, cfg): # instances run on the same system job_name = 
cfg[config.SECTION_BUILDENV].get(config.BUILDENV_SETTING_JOB_NAME) - # add a default time limit of 24h to the job submit comnand if no other time + # add a default time limit of 24h to the job submit command if no other time # limit is specified already all_opts_str = " ".join([build_env_cfg[config.BUILDENV_SETTING_SLURM_PARAMS], job.slurm_opts]) all_opts_list = all_opts_str.split(" ") @@ -1003,15 +981,10 @@ def create_pr_comment(job, job_id, app_name, pr, symlink, build_params): # construct initial job comment buildenv = config.read_config()[config.SECTION_BUILDENV] job_handover_protocol = buildenv.get(config.BUILDENV_SETTING_JOB_HANDOVER_PROTOCOL) - # NO LONGER NEEDED now that we have cut up the sentence into different config items - # raw_comment_template = submitted_job_comments_cfg[config.SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT] new_job_instance_repo = submitted_job_comments_cfg[config.SUBMITTED_JOB_COMMENTS_SETTING_INSTANCE_REPO] build_on_arch = submitted_job_comments_cfg[config.SUBMITTED_JOB_COMMENTS_SETTING_BUILD_ON_ARCH] build_for_arch = submitted_job_comments_cfg[config.SUBMITTED_JOB_COMMENTS_SETTING_BUILD_FOR_ARCH] jobdir = submitted_job_comments_cfg[config.SUBMITTED_JOB_COMMENTS_SETTING_JOBDIR] - # NO LONGER NEEDED now that we have cut up the sentence into different config items - # Support using escape chars in the INITIAL_COMMENT, that means \n should be interpreted as unicode - # initial_comment_template = codecs.decode(raw_comment_template, 'unicode_escape') if job_handover_protocol == config.JOB_HANDOVER_PROTOCOL_DELAYED_BEGIN: release_msg_string = config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_DELAYED_BEGIN_MSG release_comment_template = submitted_job_comments_cfg[release_msg_string] From d4ecc7be4561057de41701a3b6cfef3c85009d29 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen <33718780+casparvl@users.noreply.github.com> Date: Thu, 24 Jul 2025 14:51:43 +0200 Subject: [PATCH 084/132] Apply suggestions from code review MIME-Version: 
1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clarify descriptions in app.cfg.example Co-authored-by: Bob Dröge --- app.cfg.example | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/app.cfg.example b/app.cfg.example index 63d3155a..53a06ed4 100644 --- a/app.cfg.example +++ b/app.cfg.example @@ -309,18 +309,21 @@ signing = # arch_target_map = { # } -# Each entry in node_type_map describes a node type: os, what CPU architecture it has (cpu_subdir) the SLURM parameters -# that need to be passed to submit to it, which repository targets (repo_targets) can be build for on this node type -# and (optionally) which accelerators ('accel') -# All are strings, except repo_targets, which is a list of strings +# Each entry in the node_type_map dictionary describes a build node type. The key is a (descriptive) name for this build node, and its value is a dictionary containing the following build node properties as key-value pairs: + - os: its operating system (os) + - cpu_subdir: its CPU architecture + - slurm_params: the SLURM parameters that need to be passed to submit jobs to it + - repo_targets: supported repository targets for this node type + - accel (optional): which accelerators this node has +# All are strings, except repo_targets, which is a list of strings. # Note that the Slurm parameters should typically be chosen such that a single type of node (with one specific type of # CPU and one specific type of GPU) should be allocated. # Below is an example configuration for a system that contains 4 types of nodes: zen2 CPU nodes, zen4 CPU nodes, -# GPU nodes with an icelake CPU and A100 GPU, GPu nodes with a zen4 CPU and an H100 GPU -# The 'on:' argument to the bot build command determines which node type will be allocated for the build job -# E.g. 'bot:build on:arch=zen4,accel=nvidia/cc90 for:...' 
will match the gpu_h100 node type below -# If no 'on:' argument is passed to the build command, the 'for:' argument is used instead -# E.g. 'bot:build for:arch=icelake,accel=nvidia/cc80' will match the gpu_a100 node type below +# GPU nodes with an icelake CPU and A100 GPU, GPU nodes with a zen4 CPU and an H100 GPU. +# The 'on:' argument to the bot build command determines which node type will be allocated for the build job, +# e.g. 'bot:build on:arch=zen4,accel=nvidia/cc90 for:...' will match the gpu_h100 node type below. +# If no 'on:' argument is passed to the build command, the 'for:' argument is used instead, +# e.g. 'bot:build for:arch=icelake,accel=nvidia/cc80' will match the gpu_a100 node type below. node_type_map = { "cpu_zen2": { "os": "linux", From af731e19bba67ff16937454a8943bf1769c7dc86 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 28 Jul 2025 16:53:00 +0200 Subject: [PATCH 085/132] Re-comment the awaits_release, as this was done in develop as well. This option is no longer used --- app.cfg.example | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/app.cfg.example b/app.cfg.example index 53a06ed4..62caa332 100644 --- a/app.cfg.example +++ b/app.cfg.example @@ -396,7 +396,8 @@ scontrol_command = /usr/bin/scontrol # are removed, the output (in PR comments) will lack important # information.
[submitted_job_comments] -awaits_release = job id `{job_id}` awaits release by job manager +# awaits_release is no longer used since bot release v0.7.0 +# awaits_release = job id `{job_id}` awaits release by job manager awaits_release_delayed_begin_msg = job id `{job_id}` will be eligible to start in about {delay_seconds} seconds awaits_release_hold_release_msg = job id `{job_id}` awaits release by job manager new_job_instance_repo = New job on instance `{app_name}` for repository `{repo_id}` From d48b35553fd38ef83dda6e85bf81dee453e2f25b Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 28 Jul 2025 17:02:50 +0200 Subject: [PATCH 086/132] Replace Partition with Node type in show_config output. Also, update docstring for status command to show the correct return type (string) --- eessi_bot_event_handler.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/eessi_bot_event_handler.py b/eessi_bot_event_handler.py index d7beca4d..ffd5211d 100644 --- a/eessi_bot_event_handler.py +++ b/eessi_bot_event_handler.py @@ -417,7 +417,7 @@ def handle_pull_request_opened_event(self, event_info, pr, req_chatlevel=ChatLev comment = f"Instance `{app_name}` is configured to build on:" for node in node_map: - comment += f"\n- Partition `{node}`:" + comment += f"\n- Node type `{node}`:" current_partition = node_map[node] if "os" in current_partition: comment += f"\n - OS: `{current_partition['os']}`" @@ -578,8 +578,8 @@ def handle_bot_command_status(self, event_info, bot_command): bot_command (EESSIBotCommand): command to be handled Returns: - github.IssueComment.IssueComment (note, github refers to - PyGithub, not the github from the internal connections module) + (string): list item with a link to the issue comment that was created + containing the status overview """ self.log("processing bot command 'status'") repo_name = event_info['raw_request_body']['repository']['full_name'] From 6017433494a2188e77f4593996b01296cb406382 Mon Sep 17 00:00:00 2001 
From: Caspar van Leeuwen Date: Mon, 28 Jul 2025 20:15:32 +0200 Subject: [PATCH 087/132] Processed various smaller review comments for tasks/build.py. Elaborated on the docstring and comments of the template_to_regex function, since its functionality may be a bit hard (abstract) to understand otherwise. --- tasks/build.py | 58 +++++++++++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/tasks/build.py b/tasks/build.py index 89084867..6f66912a 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -612,19 +612,9 @@ def prepare_jobs(pr, cfg, event_info, action_filter, build_params): return [] jobs = [] - # This loop assumes the following structure for arch_target_map - # Note that 'accel' is a list, to easily allow a single CPU partition to be used for cross compilation - # for a lot of accelerator targets - # arch_target_map = { - # 'node_type_name': { - # 'os': 'linux', - # 'cpu_subdir': 'x86_64/amd/zen4', - # 'accel': ['nvidia/cc90'], - # 'slurm_params': '-p genoa ', - # 'repo_targets': ["eessi.io-2023.06-compat","eessi.io-2023.06-software"], - # }, - # 'node_type_name2': { - # ... etc + # Looping over all node types in the node_map to create a context for each node type and repository + # configured there. Then, check the action filters against these configs to find matching ones.
+ # If there is a match, prepare the job dir and create the Job object for node_type_name, partition_info in node_map.items(): log(f"{fn}(): node_type_name is {node_type_name}, partition_info is {partition_info}") # Unpack for convenience @@ -661,20 +651,13 @@ def prepare_jobs(pr, cfg, event_info, action_filter, build_params): # Optionally add accelerator to the context if 'accel' in partition_info: context['accelerator'] = partition_info['accel'] - log(f"{fn}(): context is '{json.dumps(context, indent=4)}'") - if not action_filter.check_filters(context): - log(f"{fn}(): context does NOT satisfy filter(s), skipping") - continue - # check = check | action_filter.check_filters(context) - else: - log(f"{fn}(): context DOES satisfy filter(s), going on with job") - else: - log(f"{fn}(): context is '{json.dumps(context, indent=4)}'") - if not action_filter.check_filters(context): - log(f"{fn}(): context does NOT satisfy filter(s), skipping") - continue - else: - log(f"{fn}(): context DOES satisfy filter(s), going on with job") + + log(f"{fn}(): context is '{json.dumps(context, indent=4)}'") + if not action_filter.check_filters(context): + log(f"{fn}(): context does NOT satisfy filter(s), skipping") + continue + else: + log(f"{fn}(): context DOES satisfy filter(s), going on with job") # we reached this point when the filter matched (otherwise we # 'continue' with the next repository) # We create a specific job directory for the architecture that is going to be build 'for:' @@ -1139,6 +1122,16 @@ def template_to_regex(format_str, with_eol=True): character. This is a requirement if it has to succesfully match a formatting string that ends with a formatting field. 
+ Example: if one function creates a formatted string + value = "my_field_value" + format_str = f"This is my string, with a custom field: {my_field}\n" + formatted_string = format_str.format(my_field=value) + Another function can then grab the original value of my_field by doing: + my_re = template_to_regex(format_str) + match_object = re.match(my_re, formatted_string) + match_object['my_field'] then contains "my_field_value" + This is useful when e.g. one function posts a GitHub comment, and another wants to extract information from that + Args: format_str (string): a formatting string, with template placeholders. with_eol (bool, optional): a boolean, indicating if the formatting string is expected to be followed by @@ -1160,8 +1153,10 @@ def template_to_regex(format_str, with_eol=True): # We use re.escape to escape any special characters in the literal_text, as we want to match those literally regex_parts.append(re.escape(literal_text)) if field_name is not None: - # Create a non-greedy, named capture group. Note that the name itself as an f-string + # Create a non-greedy, named capture group. Note that the {field_name} itself is a format specifier # So we get the actual field name as the name of the capture group + # In other words, if our format_str is "My string with {a_field}" then the named capture group will be + # called 'a_field' # We match any character, but in a non-greedy way. Thus, as soon as it can match the next # literal text section, it will - thus assuming that that's the end of the field # We use .* to allow for empty fields (such as the optional accelerator fields) @@ -1171,7 +1166,12 @@ def template_to_regex(format_str, with_eol=True): # strategy. Otherwise, a formatting string that ends with a formatting item would only match the first letter # of the field, because it doesn't find anything to match after (and it is non-greedy). 
With the $, it has # something to match after the field, thus making sure it matches the whole field - # This does assume that + # This does assume that the format_str in the string to be matched is indeed followed by and end-of-line character + # I.e. if a function that creates the formatted string does + # my_string = f"{format_str}\n" + # (i.e. has an end-of-line after the format specifier) it can be matched by another function that does + # my_re = template_to_regex(format_str) + # re.match(my_re, my_string) full_pattern = ''.join(regex_parts) if with_eol: full_pattern += "$" From 279e08fb5f9aace633e3d717d21ef82d735c5347 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen <33718780+casparvl@users.noreply.github.com> Date: Tue, 29 Jul 2025 13:48:43 +0200 Subject: [PATCH 088/132] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bob Dröge --- eessi_bot_event_handler.py | 18 +++++++++--------- tests/test_tools_filter.py | 2 +- tools/commands.py | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/eessi_bot_event_handler.py b/eessi_bot_event_handler.py index ffd5211d..a8beff82 100644 --- a/eessi_bot_event_handler.py +++ b/eessi_bot_event_handler.py @@ -418,15 +418,15 @@ def handle_pull_request_opened_event(self, event_info, pr, req_chatlevel=ChatLev comment = f"Instance `{app_name}` is configured to build on:" for node in node_map: comment += f"\n- Node type `{node}`:" - current_partition = node_map[node] - if "os" in current_partition: - comment += f"\n - OS: `{current_partition['os']}`" - if "cpu_subdir" in current_partition: - comment += f"\n - CPU architecture: `{current_partition['cpu_subdir']}`" - if "repo_targets" in current_partition: - comment += f"\n - Repositories: `{current_partition['repo_targets']}`" - if "accel" in current_partition: - comment += f"\n - Accelerators: `{current_partition['accel']}`" + current_node_type = node_map[node] + if "os" 
in current_node_type: + comment += f"\n - OS: `{current_node_type['os']}`" + if "cpu_subdir" in current_node_type: + comment += f"\n - CPU architecture: `{current_node_type['cpu_subdir']}`" + if "repo_targets" in current_node_type: + comment += f"\n - Repositories: `{current_node_type['repo_targets']}`" + if "accel" in current_node_type: + comment += f"\n - Accelerators: `{current_node_type['accel']}`" comment += "\n" self.log(f"PR opened: comment '{comment}'") diff --git a/tests/test_tools_filter.py b/tests/test_tools_filter.py index c6516cf7..26cd31cd 100644 --- a/tests/test_tools_filter.py +++ b/tests/test_tools_filter.py @@ -246,7 +246,7 @@ def test_matching_context(complex_filter): assert expected == actual -def test_non_match_archictecture_context(complex_filter): +def test_non_match_architecture_context(complex_filter): context = {"architecture": "x86_64/amd/zen4", "repository": "EESSI", "instance": "mybot", "job": 1234} expected = False actual = complex_filter.check_filters(context) diff --git a/tools/commands.py b/tools/commands.py index a44e1fc8..3902ab70 100644 --- a/tools/commands.py +++ b/tools/commands.py @@ -115,7 +115,7 @@ def __init__(self, cmd_str): # If no 'on:' is found in the argument list, everything that follows the 'for:' argument # (until the next space) is considered the argument list for the action filters - # Essentially, this represents a native build, i.e. the hardware we build on should be the + # Essentially, this represents a native build, i.e. 
the hardware we build for should be the # hardware we build on if not on_found: for arg in cmd_as_list[1:]: From 80f5f1db9a7587d7d62f9527ed426a5a968c59d4 Mon Sep 17 00:00:00 2001 From: casparvl casparvl Date: Tue, 29 Jul 2025 13:18:34 +0000 Subject: [PATCH 089/132] Fix indentation issue --- tasks/build.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tasks/build.py b/tasks/build.py index 6f66912a..d72439f1 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -651,13 +651,13 @@ def prepare_jobs(pr, cfg, event_info, action_filter, build_params): # Optionally add accelerator to the context if 'accel' in partition_info: context['accelerator'] = partition_info['accel'] + log(f"{fn}(): context is '{json.dumps(context, indent=4)}'") - log(f"{fn}(): context is '{json.dumps(context, indent=4)}'") - if not action_filter.check_filters(context): - log(f"{fn}(): context does NOT satisfy filter(s), skipping") - continue - else: - log(f"{fn}(): context DOES satisfy filter(s), going on with job") + if not action_filter.check_filters(context): + log(f"{fn}(): context does NOT satisfy filter(s), skipping") + continue + else: + log(f"{fn}(): context DOES satisfy filter(s), going on with job") # we reached this point when the filter matched (otherwise we # 'continue' with the next repository) # We create a specific job directory for the architecture that is going to be build 'for:' From 2f3c0aee679a20f3f31c8e2c49cf587e4fc34ece Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen <33718780+casparvl@users.noreply.github.com> Date: Thu, 31 Jul 2025 16:13:46 +0200 Subject: [PATCH 090/132] Update tasks/build.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bob Dröge --- tasks/build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tasks/build.py b/tasks/build.py index d72439f1..6148227e 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -1166,7 +1166,7 @@ def 
template_to_regex(format_str, with_eol=True): # strategy. Otherwise, a formatting string that ends with a formatting item would only match the first letter # of the field, because it doesn't find anything to match after (and it is non-greedy). With the $, it has # something to match after the field, thus making sure it matches the whole field - # This does assume that the format_str in the string to be matched is indeed followed by and end-of-line character + # This does assume that the format_str in the string to be matched is indeed followed by an end-of-line character # I.e. if a function that creates the formatted string does # my_string = f"{format_str}\n" # (i.e. has an end-of-line after the format specifier) it can be matched by another function that does From 03e720d6966be1b895c8756d94b3000bc000c7d4 Mon Sep 17 00:00:00 2001 From: laraPPr Date: Mon, 4 Aug 2025 11:29:04 +0200 Subject: [PATCH 091/132] add exception and simply using placeholder in scontrol command Signed-off-by: laraPPr --- eessi_bot_job_manager.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py index 23c32f3b..6860eafd 100644 --- a/eessi_bot_job_manager.py +++ b/eessi_bot_job_manager.py @@ -156,6 +156,8 @@ def get_current_jobs(self): } if state in bad_state_messages: log("Job {} in state {}: {}".format(job_id, state, bad_state_messages[state])) + else: + raise Exception(f"The output of {squeue_cmd} does not have 5 job parameters") return current_jobs @@ -302,14 +304,10 @@ def process_new_job(self, new_job): """ job_id = new_job["jobid"] - # check if their is a placeholder value in the scontrol_command - if bool(re.search(r'%\([^)]+\)s', self.scontrol_command)): - placeholders = re.findall(r'%\(([^)]+)\)s', self.scontrol_command) - for placeholder in placeholders: - if placeholder == 'new_job["cluster"]': - self.scontrol_command = self.scontrol_command % {placeholder: new_job["cluster"]} - 
print(new_job['cluster']) - print(self.scontrol_command) + # if placeholder "cluster" is used in scontrol command + self.scontrol_command = self.scontrol_command % { + 'new_job["cluster"]': new_job["cluster"] + } scontrol_cmd = "%s --oneliner show jobid %s" % ( self.scontrol_command, job_id, From 53b032f3a692342d673a4681b2979d2edbc6d958 Mon Sep 17 00:00:00 2001 From: laraPPr Date: Mon, 4 Aug 2025 11:35:28 +0200 Subject: [PATCH 092/132] make stylecheck happy Signed-off-by: laraPPr --- eessi_bot_job_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py index 62d66f2b..cd7e3cc3 100644 --- a/eessi_bot_job_manager.py +++ b/eessi_bot_job_manager.py @@ -303,9 +303,9 @@ def process_new_job(self, new_job): job_id = new_job["jobid"] # if placeholder "cluster" is used in scontrol command - self.scontrol_command = self.scontrol_command % { + self.scontrol_command = self.scontrol_command % { 'new_job["cluster"]': new_job["cluster"] - } + } scontrol_cmd = "%s --oneliner show jobid %s" % ( self.scontrol_command, job_id, From 519428f39796cfef81815068d13e5880fd6c20b9 Mon Sep 17 00:00:00 2001 From: laraPPr Date: Mon, 4 Aug 2025 11:43:58 +0200 Subject: [PATCH 093/132] some more error logging for the scontrol command Signed-off-by: laraPPr --- eessi_bot_job_manager.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py index cd7e3cc3..ff2684ac 100644 --- a/eessi_bot_job_manager.py +++ b/eessi_bot_job_manager.py @@ -303,9 +303,15 @@ def process_new_job(self, new_job): job_id = new_job["jobid"] # if placeholder "cluster" is used in scontrol command - self.scontrol_command = self.scontrol_command % { - 'new_job["cluster"]': new_job["cluster"] - } + try: + placeholder = 'new_job["cluster"]' + self.scontrol_command = self.scontrol_command % { + placeholder: new_job["cluster"] + } + except KeyError: + log(f"Failed to process 
placeholder in scontrol_command. Expected {placeholder} or nothing.") + raise + scontrol_cmd = "%s --oneliner show jobid %s" % ( self.scontrol_command, job_id, From e42b34e4373bdb23cc0e55c6c0018cd2e10da791 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 4 Aug 2025 16:52:41 +0200 Subject: [PATCH 094/132] Update readme for new node_type_map functionality --- README.md | 74 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 46 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index cc4d0ff5..5612b04b 100644 --- a/README.md +++ b/README.md @@ -777,37 +777,38 @@ for signing. The bot calls the script with the two arguments: The section `[architecturetargets]` defines for which targets (OS/SUBDIR), (for example `linux/x86_64/amd/zen2`) the EESSI bot should submit jobs, and which additional `sbatch` parameters will be used for requesting a compute node with the CPU microarchitecture needed to build the software stack. ```ini -arch_target_map = { - "linux/x86_64/generic": "--partition x86-64-generic-node", - "linux/x86_64/amd/zen2": "--partition x86-64-amd-zen2-node" } +node_type_map = { + "cpu_zen2": { + "os": "linux", + "cpu_subdir": "x86_64/amd/zen2", + "slurm_params": "-p rome --nodes 1 --ntasks-per-node 16 --cpus-per-task 1", + "repo_targets": ["eessi.io-2023.06-compat","eessi.io-2023.06-software"] + }, + "gpu_h100": { + "os": "linux", + "cpu_subdir": "x86_64/amd/zen4", + "accel": "nvidia/cc90", + "slurm_params": "-p gpu_h100 --nodes 1 --tasks-per-node 16 --cpus-per-task 1 --gpus-per-node 1", + "repo_targets": ["eessi.io-2023.06-compat","eessi.io-2023.06-software"] + }} ``` -The map has one-to-many entries of the format `OS/SUBDIR: -ADDITIONAL_SBATCH_PARAMETERS`. For your cluster, you will have to figure out -which microarchitectures (`SUBDIR`) are available (as `OS` only `linux` is -currently supported) and how to instruct Slurm to allocate nodes with that -architecture to a job (`ADDITIONAL_SBATCH_PARAMETERS`). 
+Each entry in the `node_type_map` dictionary describes a build node type. The key is a (descriptive) name for this build node, and its value is a dictionary containing the following build node properties as key-value pairs: + - `os`: its operating system (os) + - `cpu_subdir`: its CPU architecture + - `slurm_params`: the SLURM parameters that need to be passed to submit jobs to it + - `repo_targets`: supported repository targets for this node type + - `accel` (optional): which accelerators this node has +All values are strings, except repo_targets, which is a list of strings. Repository targets listed in `repo_target` should correspond to the repository IDs as defined in the `repos.cfg` file in the `repos_cfg_dir` (see below). -Note, if you do not have to specify additional parameters to `sbatch` to request a compute node with a specific microarchitecture, you can just write something like: +Note that the Slurm parameters should typically be chosen such that a single type of node (with one specific type of CPU and one specific type of GPU) should be allocated. + +To command the bot to build on the `cpu_zen2` node type above, one would give the command `bot:build on:arch=zen2 ...`. To command the bot to build on the `gpu_h100` node type, one would give the command `bot:build on:arch=zen4,accel=nvidia/cc90 ...` -```ini -arch_target_map = { "linux/x86_64/generic": "" } -``` #### `[repo_targets]` section The `[repo_targets]` section defines for which repositories and architectures the bot can run a job. -Repositories are referenced by IDs (or `repo_id`). Architectures are identified -by `OS/SUBDIR` which correspond to settings in the `arch_target_map`. - -```ini -repo_target_map = { - "OS_SUBDIR_1": ["REPO_ID_1_1","REPO_ID_1_2"], - "OS_SUBDIR_2": ["REPO_ID_2_1","REPO_ID_2_2"] } -``` - -For each `OS/SUBDIR` combination a list of available repository IDs can be -provided. 
The repository IDs are defined in a separate file, say `repos.cfg` which is stored in the directory defined via `repos_cfg_dir`: @@ -911,19 +912,36 @@ event handler will throw an exception when formatting the update of the PR comment corresponding to the job. ```ini -initial_comment = New job on instance `{app_name}` for architecture `{arch_name}`{accelerator_spec} for repository `{repo_id}` in job dir `{symlink}` +new_job_instance_repo = New job on instance `{app_name}` for repository `{repo_id}` ``` -`initial_comment` is used to create a comment to a PR when a new job has been -created. Note, the part '{accelerator_spec}' is only filled-in by the bot if the -argument 'accelerator' to the `bot: build` command has been used. +`new_job_instance_repo` is used as the first line in a comment to a PR when a new job has been created. + +```ini +build_on_arch = Building on: `{on_arch}`{on_accelerator} +``` + +`build_on_arch` is used as the second line in a comment to a PR when a new job has been created. Note that the `on_accelerator` spec is only filled-in by the bot if the `on:...,accel=...` has been passed to the bot. + +```ini +build_for_arch = Building for: `{for_arch}`{for_accelerator} +``` + +`build_for_arch` is used as the third line in a comment to a PR when a new job has been created. Note that the `for_accelerator` spec is only filled-in by the bot if the `for:...,accel=...` has been passed to the bot. + +```ini +jobdir = Job dir: `{symlink}` +``` + +`jobdir` is used as the fourth line in a comment to a PR when a new job has been created. + ```ini with_accelerator =  and accelerator `{accelerator}` ``` `with_accelerator` is used to provide information about the accelerator the job -should build for if and only if the argument `accelerator:X/Y` has been provided. +should build for if and only if the argument `on:...,accel=...` or `for:...,accel=...` has been provided. 
#### `[new_job_comments]` section From 0b4e22fc3b2cba23bc19c49b07745ca69f822db2 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 4 Aug 2025 17:04:56 +0200 Subject: [PATCH 095/132] Add examples of bot build commands --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 5612b04b..0caf77b9 100644 --- a/README.md +++ b/README.md @@ -803,7 +803,11 @@ All values are strings, except repo_targets, which is a list of strings. Reposit Note that the Slurm parameters should typically be chosen such that a single type of node (with one specific type of CPU and one specific type of GPU) should be allocated. -To command the bot to build on the `cpu_zen2` node type above, one would give the command `bot:build on:arch=zen2 ...`. To command the bot to build on the `gpu_h100` node type, one would give the command `bot:build on:arch=zen4,accel=nvidia/cc90 ...` +To command the bot to build on the `cpu_zen2` node type above, one would give the command `bot:build on:arch=zen2 ...`. To command the bot to build on the `gpu_h100` node type, one would give the command `bot:build on:arch=zen4,accel=nvidia/cc90 ...`. + +For a native build (i.e. building for `zen2` on a `zen2` node), one can pass `bot:build on:arch=zen2 for:arch=x86_64/amd/zen2`, or use the short-hand `bot:build for:arch=x86_64/amd/zen2` (i.e. omitting the `on` argument implies a native build). This will trigger a build on the `cpu_zen2` node type (as configured above) and prepare a configuration file in the job directory that instructs to build for a `zen2` CPU architecture. + +For cross-compiling GPU code for Nvidia Compute Capabiltiy 8.0 (and a `zen2` CPU architecture), one would instruct the bot with `bot:build on:arch=zen2 for:arch=x86_64/amd/zen2,accel=nvidia/cc80`. 
This will trigger a build on the `cpu_zen2` node type (as configured above) and prepare a configuration file in the job directory that instructs to build for a `zen2` CPU architecture with an `nvidia/cc80` GPU architecture. #### `[repo_targets]` section From be7efe42036eaf4bae399bfe1b789e65a19ccbc5 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 5 Aug 2025 10:58:15 +0200 Subject: [PATCH 096/132] Process review comments --- README.md | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 0caf77b9..54edfe9d 100644 --- a/README.md +++ b/README.md @@ -805,14 +805,20 @@ Note that the Slurm parameters should typically be chosen such that a single typ To command the bot to build on the `cpu_zen2` node type above, one would give the command `bot:build on:arch=zen2 ...`. To command the bot to build on the `gpu_h100` node type, one would give the command `bot:build on:arch=zen4,accel=nvidia/cc90 ...`. -For a native build (i.e. building for `zen2` on a `zen2` node), one can pass `bot:build on:arch=zen2 for:arch=x86_64/amd/zen2`, or use the short-hand `bot:build for:arch=x86_64/amd/zen2` (i.e. omitting the `on` argument implies a native build). This will trigger a build on the `cpu_zen2` node type (as configured above) and prepare a configuration file in the job directory that instructs to build for a `zen2` CPU architecture. +For a native build (i.e. building for `zen2` on a `zen2` node), one can pass `bot:build on:arch=zen2 for:arch=x86_64/amd/zen2`, or use the short-hand `bot:build for:arch=x86_64/amd/zen2` (i.e. omitting the `on` argument implies a native build; note that the reverse, omitting the `for` argument, does not work). This will trigger a build on the `cpu_zen2` node type (as configured above) and prepare a configuration file in the job directory that instructs to build for a `zen2` CPU architecture. 
-For cross-compiling GPU code for Nvidia Compute Capabiltiy 8.0 (and a `zen2` CPU architecture), one would instruct the bot with `bot:build on:arch=zen2 for:arch=x86_64/amd/zen2,accel=nvidia/cc80`. This will trigger a build on the `cpu_zen2` node type (as configured above) and prepare a configuration file in the job directory that instructs to build for a `zen2` CPU architecture with an `nvidia/cc80` GPU architecture. +For cross-compiling GPU code for NVIDIA Compute Capabiltiy 8.0 (and a `zen2` CPU architecture), one would instruct the bot with `bot:build on:arch=zen2 for:arch=x86_64/amd/zen2,accel=nvidia/cc80`. This will trigger a build on the `cpu_zen2` node type (as configured above) and prepare a configuration file in the job directory that instructs to build for a `zen2` CPU architecture with an `nvidia/cc80` GPU architecture. +Note that the `arch_target_map` and `repo_target_map` (used in version <=0.8.0) configuration option was replaced by `node_type_map`. The `arch_target_map` and `repo_target_map` that would be equivalent to the `node_type_map` above was + +```ini +arch_target_map = { "linux/x86_64/amd/zen2": "-p rome --nodes 1 --ntasks-per-node 16 --cpus-per-task 1", "linux/x86_64/amd/zen4": "-p gpu_h100 --nodes 1 --tasks-per-node 16 --cpus-per-task 1 --gpus-per-node 1" } +repo_target_map = { "linux/x86_64/amd/zen2": ["eessi.io-2023.06-compat","eessi.io-2023.06-software"], "linux/x86_64/amd/zen4": ["eessi.io-2023.06-compat","eessi.io-2023.06-software"] } +``` #### `[repo_targets]` section -The `[repo_targets]` section defines for which repositories and architectures the bot can run a job. 
+The `[repo_targets]` section defines where the configuration for the repository targets defined in the `node_type_map` can be found The repository IDs are defined in a separate file, say `repos.cfg` which is stored in the directory defined via `repos_cfg_dir`: From ae691e6f61696df102ee258e652dd6790cd08499 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 5 Aug 2025 11:00:02 +0200 Subject: [PATCH 097/132] Fix linting errors --- README.md | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 54edfe9d..11ea821b 100644 --- a/README.md +++ b/README.md @@ -794,11 +794,13 @@ node_type_map = { ``` Each entry in the `node_type_map` dictionary describes a build node type. The key is a (descriptive) name for this build node, and its value is a dictionary containing the following build node properties as key-value pairs: - - `os`: its operating system (os) - - `cpu_subdir`: its CPU architecture - - `slurm_params`: the SLURM parameters that need to be passed to submit jobs to it - - `repo_targets`: supported repository targets for this node type - - `accel` (optional): which accelerators this node has + +- `os`: its operating system (os) +- `cpu_subdir`: its CPU architecture +- `slurm_params`: the SLURM parameters that need to be passed to submit jobs to it +- `repo_targets`: supported repository targets for this node type +- `accel` (optional): which accelerators this node has + All values are strings, except repo_targets, which is a list of strings. Repository targets listed in `repo_target` should correspond to the repository IDs as defined in the `repos.cfg` file in the `repos_cfg_dir` (see below). Note that the Slurm parameters should typically be chosen such that a single type of node (with one specific type of CPU and one specific type of GPU) should be allocated. @@ -945,7 +947,6 @@ jobdir = Job dir: `{symlink}` `jobdir` is used as the fourth line in a comment to a PR when a new job has been created. 
- ```ini with_accelerator =  and accelerator `{accelerator}` ``` From 2ad48ec91b44143a7029e8a673f1969f97bf92fb Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen <33718780+casparvl@users.noreply.github.com> Date: Tue, 5 Aug 2025 13:48:46 +0200 Subject: [PATCH 098/132] Update README.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bob Dröge --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 11ea821b..d4177c1c 100644 --- a/README.md +++ b/README.md @@ -811,7 +811,7 @@ For a native build (i.e. building for `zen2` on a `zen2` node), one can pass `bo For cross-compiling GPU code for NVIDIA Compute Capabiltiy 8.0 (and a `zen2` CPU architecture), one would instruct the bot with `bot:build on:arch=zen2 for:arch=x86_64/amd/zen2,accel=nvidia/cc80`. This will trigger a build on the `cpu_zen2` node type (as configured above) and prepare a configuration file in the job directory that instructs to build for a `zen2` CPU architecture with an `nvidia/cc80` GPU architecture. -Note that the `arch_target_map` and `repo_target_map` (used in version <=0.8.0) configuration option was replaced by `node_type_map`. The `arch_target_map` and `repo_target_map` that would be equivalent to the `node_type_map` above was +Note that the `arch_target_map` and `repo_target_map` (used in version <=0.8.0) configuration options were replaced by `node_type_map`. 
The `arch_target_map` and `repo_target_map` that would be equivalent to the `node_type_map` above are: ```ini arch_target_map = { "linux/x86_64/amd/zen2": "-p rome --nodes 1 --ntasks-per-node 16 --cpus-per-task 1", "linux/x86_64/amd/zen4": "-p gpu_h100 --nodes 1 --tasks-per-node 16 --cpus-per-task 1 --gpus-per-node 1" } From 769150ddd53dfef21176c0919ac41ef735e84abe Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen <33718780+casparvl@users.noreply.github.com> Date: Tue, 5 Aug 2025 13:50:16 +0200 Subject: [PATCH 099/132] Update README.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bob Dröge --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d4177c1c..a35e675d 100644 --- a/README.md +++ b/README.md @@ -820,7 +820,7 @@ repo_target_map = { "linux/x86_64/amd/zen2": ["eessi.io-2023.06-compat","eessi.i #### `[repo_targets]` section -The `[repo_targets]` section defines where the configuration for the repository targets defined in the `node_type_map` can be found +The `[repo_targets]` section defines where the configuration for the repository targets defined in the `node_type_map` can be found. The repository IDs are defined in a separate file, say `repos.cfg` which is stored in the directory defined via `repos_cfg_dir`: From b4d86e0e0649fd2dd1d1b6758926c641148a72ba Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen <33718780+casparvl@users.noreply.github.com> Date: Tue, 5 Aug 2025 13:58:16 +0200 Subject: [PATCH 100/132] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a35e675d..c139f1d9 100644 --- a/README.md +++ b/README.md @@ -805,7 +805,7 @@ All values are strings, except repo_targets, which is a list of strings. 
Reposit Note that the Slurm parameters should typically be chosen such that a single type of node (with one specific type of CPU and one specific type of GPU) should be allocated. -To command the bot to build on the `cpu_zen2` node type above, one would give the command `bot:build on:arch=zen2 ...`. To command the bot to build on the `gpu_h100` node type, one would give the command `bot:build on:arch=zen4,accel=nvidia/cc90 ...`. +To command the bot to build on the `cpu_zen2` node type above, one would give the command `bot:build on:arch=zen2 for:...`. To command the bot to build on the `gpu_h100` node type, one would give the command `bot:build on:arch=zen4,accel=nvidia/cc90 for:...`. For a native build (i.e. building for `zen2` on a `zen2` node), one can pass `bot:build on:arch=zen2 for:arch=x86_64/amd/zen2`, or use the short-hand `bot:build for:arch=x86_64/amd/zen2` (i.e. omitting the `on` argument implies a native build; note that the reverse, omitting the `for` argument, does not work). This will trigger a build on the `cpu_zen2` node type (as configured above) and prepare a configuration file in the job directory that instructs to build for a `zen2` CPU architecture. 
From 72c39a44306736edd3146b9c186077b8d877ee5f Mon Sep 17 00:00:00 2001 From: laraPPr Date: Wed, 6 Aug 2025 11:33:17 +0200 Subject: [PATCH 101/132] simplify using placeholder for scontrol command Signed-off-by: laraPPr --- eessi_bot_job_manager.py | 45 ++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py index 0ea474b2..baa01859 100644 --- a/eessi_bot_job_manager.py +++ b/eessi_bot_job_manager.py @@ -117,7 +117,7 @@ def get_current_jobs(self): squeue_cmd = "%s --long --noheader --user=%s" % (self.poll_command, username) if self.job_name: - squeue_cmd += " --name='%s'" % self.job_name + squeue_cmd += " --name=%s" % self.job_name # Format the output of SLURM squeue_cmd += " --Format JobId:100@,Cluster:100@,Partition:100@,State:100@,Reason:100" squeue_output, squeue_err, squeue_exitcode = run_cmd( @@ -140,22 +140,23 @@ def get_current_jobs(self): # get job info, logging any Slurm issues # Note, all output lines of squeue are processed because we run it with # --noheader. 
- for line in lines: - job = line.rstrip().split('@') - if len(job) == 5: - job_id = job[0].rstrip() - state = job[3].rstrip() - current_jobs[job_id] = { - "jobid": job_id, - "cluster": job[1].rstrip(), - "partition": job[2].rstrip(), - "state": state, - "reason": job[4].rstrip(), - } - if state in bad_state_messages: - log("Job {} in state {}: {}".format(job_id, state, bad_state_messages[state])) - else: - raise Exception(f"The output of {squeue_cmd} does not have 5 job parameters") + if lines != ['']: + for line in lines: + job = line.rstrip().split('@') + if len(job) == 5: + job_id = job[0].rstrip() + state = job[3].rstrip() + current_jobs[job_id] = { + "jobid": job_id, + "cluster": job[1].rstrip(), + "partition": job[2].rstrip(), + "state": state, + "reason": job[4].rstrip(), + } + if state in bad_state_messages: + log("Job {} in state {}: {}".format(job_id, state, bad_state_messages[state])) + else: + raise Exception(f"The output of {squeue_cmd} does not have 5 job parameters") return current_jobs @@ -302,14 +303,12 @@ def process_new_job(self, new_job): """ job_id = new_job["jobid"] - # if placeholder "cluster" is used in scontrol command + # if placeholder is used in scontrol command try: - placeholder = 'new_job["cluster"]' - self.scontrol_command = self.scontrol_command % { - placeholder: new_job["cluster"] - } + self.scontrol_command = self.scontrol_command % new_job except KeyError: - log(f"Failed to process placeholder in scontrol_command. 
Expected {placeholder} or nothing.") + log(f"Failed to process {self.scontrol_command}.") + log(f"Information on placeholder is not collected in new_job: {new_job}.") raise scontrol_cmd = "%s --oneliner show jobid %s" % ( From 19c28f5fcef4942184082aece72050c38733421b Mon Sep 17 00:00:00 2001 From: laraPPr Date: Wed, 6 Aug 2025 11:36:42 +0200 Subject: [PATCH 102/132] remove unnecessary change Signed-off-by: laraPPr --- eessi_bot_job_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py index baa01859..14c10abb 100644 --- a/eessi_bot_job_manager.py +++ b/eessi_bot_job_manager.py @@ -117,7 +117,7 @@ def get_current_jobs(self): squeue_cmd = "%s --long --noheader --user=%s" % (self.poll_command, username) if self.job_name: - squeue_cmd += " --name=%s" % self.job_name + squeue_cmd += " --name='%s'" % self.job_name # Format the output of SLURM squeue_cmd += " --Format JobId:100@,Cluster:100@,Partition:100@,State:100@,Reason:100" squeue_output, squeue_err, squeue_exitcode = run_cmd( From 9433bea664a06c7d244c5aa97a4d6fb8b7c970fa Mon Sep 17 00:00:00 2001 From: laraPPr Date: Wed, 6 Aug 2025 11:44:56 +0200 Subject: [PATCH 103/132] use rstrip in a list comprehension Signed-off-by: laraPPr --- eessi_bot_job_manager.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py index 14c10abb..e9fbf4b7 100644 --- a/eessi_bot_job_manager.py +++ b/eessi_bot_job_manager.py @@ -142,16 +142,16 @@ def get_current_jobs(self): # --noheader.
if lines != ['']: for line in lines: - job = line.rstrip().split('@') + job = [x.rstrip() for x in line.rstrip().split('@')] if len(job) == 5: - job_id = job[0].rstrip() - state = job[3].rstrip() + job_id = job[0] + state = job[3] current_jobs[job_id] = { "jobid": job_id, - "cluster": job[1].rstrip(), - "partition": job[2].rstrip(), + "cluster": job[1], + "partition": job[2], "state": state, - "reason": job[4].rstrip(), + "reason": job[4], } if state in bad_state_messages: log("Job {} in state {}: {}".format(job_id, state, bad_state_messages[state])) From 09a4842388078b39b8ceeded0973d6c1acbb08e3 Mon Sep 17 00:00:00 2001 From: Lara Ramona Peeters <49882639+laraPPr@users.noreply.github.com> Date: Wed, 6 Aug 2025 15:08:53 +0200 Subject: [PATCH 104/132] Update eessi_bot_job_manager.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Thomas Röblitz --- eessi_bot_job_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py index e9fbf4b7..d0423923 100644 --- a/eessi_bot_job_manager.py +++ b/eessi_bot_job_manager.py @@ -303,7 +303,7 @@ def process_new_job(self, new_job): """ job_id = new_job["jobid"] - # if placeholder is used in scontrol command + # processing placeholders in scontrol command which is defined in the bot's app.cfg (setting `scontrol_command`) try: self.scontrol_command = self.scontrol_command % new_job except KeyError: From a2e52ddc347053a77e442783a73e0059dfd3b239 Mon Sep 17 00:00:00 2001 From: laraPPr Date: Thu, 7 Aug 2025 10:05:27 +0200 Subject: [PATCH 105/132] Update README Signed-off-by: laraPPr --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index c139f1d9..3f87cf2f 100644 --- a/README.md +++ b/README.md @@ -891,6 +891,8 @@ scontrol_command = /usr/bin/scontrol ``` `scontrol_command` is the full path to the Slurm command used for manipulating existing jobs. 
You may want to verify if `scontrol` is provided at that path or determine its actual location (via `which scontrol`). +It is also possible to add placeholder values to the scontrol_command. This might be necessary on systems where the name of the cluster needs to be passed in order to manage the jobs. For example: `/usr/bin/scontrol --clusters=%%(cluster)s`. +Only placeholders defined in `current_jobs` can be included in the scontrol_command; these are currently jobid, cluster, partition, state and reason. #### `[submitted_job_comments]` section From 63fa9f34da936a0a413fccc124c26033cc25ed82 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 7 Aug 2025 10:28:01 +0200 Subject: [PATCH 106/132] Set a default self.build_params = None on constructing the EESSIBotCommand. Then, in handling the bot build command, don't submit a job if self.build_params wasn't defined --- eessi_bot_event_handler.py | 6 ++++++ tools/commands.py | 1 + 2 files changed, 7 insertions(+) diff --git a/eessi_bot_event_handler.py b/eessi_bot_event_handler.py index a8beff82..f9517003 100644 --- a/eessi_bot_event_handler.py +++ b/eessi_bot_event_handler.py @@ -530,6 +530,12 @@ def handle_bot_command_build(self, event_info, bot_command): pr_number = event_info['raw_request_body']['issue']['number'] pr = gh.get_repo(repo_name).get_pull(pr_number) build_msg = '' + # Require that build_params is defined, it is required. Otherwise, return early + if bot_command.build_params is None: + build_msg = "No 'for:' argument was passed to the bot:build command.
This argumen is required, so " + build_msg += "not submitting build jobs" + return build_msg + if check_build_permission(pr, event_info): # use filter from command submitted_jobs = submit_build_jobs(pr, event_info, bot_command.action_filters, bot_command.build_params) diff --git a/tools/commands.py b/tools/commands.py index 3902ab70..fe45d079 100644 --- a/tools/commands.py +++ b/tools/commands.py @@ -87,6 +87,7 @@ def __init__(self, cmd_str): # TODO add function name to log messages cmd_as_list = cmd_str.split() self.command = cmd_as_list[0] # E.g. 'build' or 'help' + self.build_params = None # TODO always init self.action_filters with empty EESSIBotActionFilter? if len(cmd_as_list) > 1: From 4360e3dde19cf0ecafb3a770cc59a3f09bfd7e61 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 7 Aug 2025 10:59:54 +0200 Subject: [PATCH 107/132] Fix typo --- eessi_bot_event_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eessi_bot_event_handler.py b/eessi_bot_event_handler.py index f9517003..7cd45a57 100644 --- a/eessi_bot_event_handler.py +++ b/eessi_bot_event_handler.py @@ -532,7 +532,7 @@ def handle_bot_command_build(self, event_info, bot_command): build_msg = '' # Require that build_params is defined, it is required. Otherwise, return early if bot_command.build_params is None: - build_msg = "No 'for:' argument was passed to the bot:build command. This argumen is required, so " + build_msg = "No 'for:' argument was passed to the bot:build command. 
This argument is required, so " build_msg += "not submitting build jobs" return build_msg From 1ad4640b2ae9f0050b3bc8e052dadd99b803ae79 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 18 Aug 2025 18:07:24 +0200 Subject: [PATCH 108/132] WIP: trying to get a status command that only prints the last status for each 'for' architecture --- eessi_bot_event_handler.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/eessi_bot_event_handler.py b/eessi_bot_event_handler.py index a8beff82..1c0ccbdc 100644 --- a/eessi_bot_event_handler.py +++ b/eessi_bot_event_handler.py @@ -586,6 +586,34 @@ def handle_bot_command_status(self, event_info, bot_command): pr_number = event_info['raw_request_body']['issue']['number'] status_table = request_bot_build_issue_comments(repo_name, pr_number) + + # TODO: make the block until 'status_table = status_table_last' conditional on the bot command + # If the bot command is something like 'bot:status=last', then we should execute this sorting block + # First, add a timestamp for the date, so that we can use it for sorting + dates = status_table['date'] + timestamps = [] + for date in dates: + date_object = datetime.datetime.strptime(date, "%b %d %X %Z %Y") + timestamps.append(int(date_object.timestamp())) + status_table['timestamp'] = timestamps + + # Figure out the sorting indices, so that things are sorted first by the 'for arch', and then by 'date' + sorted_indices = sorted(range(len(status_table['for arch'])), key=lambda x: (status_table['for arch'[x], status_table['timestamp'][x])) + # Reverse, so that the newest builds are first + sorted_indices.reverse() + # Apply the sorted indices to get a sorted table + sorted_table = {key: [status_table[key][i] for i in sorted_indices] for key in status_table} + + # Keep only the first entry for each 'for arch', as that is now the newest + status_table_last = {'on arch': [], 'for arch': [], 'for repo': [], 'date': [], 'status': [], 'url': [], 'result': []} + 
for x in range(0, len(status_table['date'])): + if status_table['for arch'][x] not in status_table_last['for arch']: + for key in status_table_last: + status_table_last[key].append(status_table[key][x]) + + # overwrite the original status_table + status_table = status_table_last + comment_status = '' comment_status += "\nThis is the status of all the `bot: build` commands:" comment_status += "\n|on|for|repo|result|date|status|url|" From 7a55d304f38c5c9405c9ff47840b250290e3dfc3 Mon Sep 17 00:00:00 2001 From: Lara Ramona Peeters <49882639+laraPPr@users.noreply.github.com> Date: Tue, 19 Aug 2025 14:27:17 +0200 Subject: [PATCH 109/132] Update README.md Co-authored-by: Caspar van Leeuwen <33718780+casparvl@users.noreply.github.com> --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 3f87cf2f..b565756a 100644 --- a/README.md +++ b/README.md @@ -891,8 +891,7 @@ scontrol_command = /usr/bin/scontrol ``` `scontrol_command` is the full path to the Slurm command used for manipulating existing jobs. You may want to verify if `scontrol` is provided at that path or determine its actual location (via `which scontrol`). -It is also possible to add placeholder values to the scontrol_command. This might be necessary on systems where the name of the clusters needs to be passed in order to manage the jobs. For example: `/usr/bin/scontrol --clusters=%%(cluster)s`. -Only placeholders defined in `current_jobs` can be included in the scontrol_command this is currently jobid, cluster, partition, state and reason. +It is also possible to add placeholder values to the scontrol_command. These placeholders can capture output from the `squeue` command that the bot runs internally, and pass it back to the `scontrol_command`. An example where this is in a setup where multiple clusters are managed by the same SLURM instance, and the `scontrol_command` for that instance needs to get the correct cluster name passed. 
This can be achieved by defining `scontrol_command = /usr/bin/scontrol --clusters=%%(cluster)s`. Valid placeholder names are currently: `jobid`, `cluster`, `partition`, `state`, and `reason`. #### `[submitted_job_comments]` section From e34925748ab0d71fe0458c9ca40516ca9ab77958 Mon Sep 17 00:00:00 2001 From: Lara Ramona Peeters <49882639+laraPPr@users.noreply.github.com> Date: Tue, 19 Aug 2025 14:37:08 +0200 Subject: [PATCH 110/132] Update README.md Co-authored-by: Caspar van Leeuwen <33718780+casparvl@users.noreply.github.com> --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b565756a..dd6b3234 100644 --- a/README.md +++ b/README.md @@ -891,7 +891,7 @@ scontrol_command = /usr/bin/scontrol ``` `scontrol_command` is the full path to the Slurm command used for manipulating existing jobs. You may want to verify if `scontrol` is provided at that path or determine its actual location (via `which scontrol`). -It is also possible to add placeholder values to the scontrol_command. These placeholders can capture output from the `squeue` command that the bot runs internally, and pass it back to the `scontrol_command`. An example where this is in a setup where multiple clusters are managed by the same SLURM instance, and the `scontrol_command` for that instance needs to get the correct cluster name passed. This can be achieved by defining `scontrol_command = /usr/bin/scontrol --clusters=%%(cluster)s`. Valid placeholder names are currently: `jobid`, `cluster`, `partition`, `state`, and `reason`. +It is also possible to add placeholder values to the scontrol_command. These placeholders can capture output from the `squeue` command that the bot runs internally, and pass it back to the `scontrol_command`. An example where this may be useful is in a setup where multiple clusters are managed by the same SLURM instance, and the `scontrol_command` for that instance needs to get the correct cluster name passed. 
This can be achieved by defining `scontrol_command = /usr/bin/scontrol --clusters=%%(cluster)s`. Valid placeholder names are currently: `jobid`, `cluster`, `partition`, `state`, and `reason`. #### `[submitted_job_comments]` section From 9a5bf4d6050d1b036c396f21366c9d9e652b5bed Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 19 Aug 2025 14:51:08 +0200 Subject: [PATCH 111/132] Add support for passing general arguments to a command, so that we can support bot:status last_build. Then, implement functionality in the status command that makes sure only the last build result for each architecture is printed --- eessi_bot_event_handler.py | 60 +++++++++++++++++++++----------------- tools/commands.py | 12 ++++++-- 2 files changed, 42 insertions(+), 30 deletions(-) diff --git a/eessi_bot_event_handler.py b/eessi_bot_event_handler.py index 1c0ccbdc..c7f02c0b 100644 --- a/eessi_bot_event_handler.py +++ b/eessi_bot_event_handler.py @@ -586,33 +586,39 @@ def handle_bot_command_status(self, event_info, bot_command): pr_number = event_info['raw_request_body']['issue']['number'] status_table = request_bot_build_issue_comments(repo_name, pr_number) - - # TODO: make the block until 'status_table = status_table_last' conditional on the bot command - # If the bot command is something like 'bot:status=last', then we should execute this sorting block - # First, add a timestamp for the date, so that we can use it for sorting - dates = status_table['date'] - timestamps = [] - for date in dates: - date_object = datetime.datetime.strptime(date, "%b %d %X %Z %Y") - timestamps.append(int(date_object.timestamp())) - status_table['timestamp'] = timestamps - - # Figure out the sorting indices, so that things are sorted first by the 'for arch', and then by 'date' - sorted_indices = sorted(range(len(status_table['for arch'])), key=lambda x: (status_table['for arch'[x], status_table['timestamp'][x])) - # Reverse, so that the newest builds are first - sorted_indices.reverse() - # Apply 
the sorted indices to get a sorted table - sorted_table = {key: [status_table[key][i] for i in sorted_indices] for key in status_table} - - # Keep only the first entry for each 'for arch', as that is now the newest - status_table_last = {'on arch': [], 'for arch': [], 'for repo': [], 'date': [], 'status': [], 'url': [], 'result': []} - for x in range(0, len(status_table['date'])): - if status_table['for arch'][x] not in status_table_last['for arch']: - for key in status_table_last: - status_table_last[key].append(status_table[key][x]) - - # overwrite the original status_table - status_table = status_table_last + if 'last_build' in bot_command.general_args: + # If the bot command is something like 'bot:status last_build', then only retain the last build for each + # architecture in the status_table + # To do this, we first insert a timestamp to facilitate sorting by time + # Then, we obtain sorting indices that first sort by architecture, then by build time + # Then, we reverse the sorting, so that the last build (highest timestamp) for each architecture occurs + # first. 
+ # Finally, we copy the table, but each time we encounter an entry for an architecture that we've already + # copied, we ignore it, since - as a result of the sorting - the second entry is always older than the + # first + dates = status_table['date'] + timestamps = [] + for date in dates: + date_object = datetime.datetime.strptime(date, "%b %d %X %Z %Y") + timestamps.append(int(date_object.timestamp())) + status_table['timestamp'] = timestamps + + # Figure out the sorting indices, so that things are sorted first by the 'for arch', and then by 'date' + sorted_indices = sorted(range(len(status_table['for arch'])), key=lambda x: (status_table['for arch'[x], status_table['timestamp'][x])) + # Reverse, so that the newest builds are first + sorted_indices.reverse() + # Apply the sorted indices to get a sorted table + sorted_table = {key: [status_table[key][i] for i in sorted_indices] for key in status_table} + + # Keep only the first entry for each 'for arch', as that is now the newest + status_table_last = {'on arch': [], 'for arch': [], 'for repo': [], 'date': [], 'status': [], 'url': [], 'result': []} + for x in range(0, len(status_table['date'])): + if status_table['for arch'][x] not in status_table_last['for arch']: + for key in status_table_last: + status_table_last[key].append(status_table[key][x]) + + # overwrite the original status_table + status_table = status_table_last comment_status = '' comment_status += "\nThis is the status of all the `bot: build` commands:" diff --git a/tools/commands.py b/tools/commands.py index 3902ab70..122e363a 100644 --- a/tools/commands.py +++ b/tools/commands.py @@ -87,6 +87,7 @@ def __init__(self, cmd_str): # TODO add function name to log messages cmd_as_list = cmd_str.split() self.command = cmd_as_list[0] # E.g. 'build' or 'help' + self.general_args = [] # TODO always init self.action_filters with empty EESSIBotActionFilter? 
if len(cmd_as_list) > 1: @@ -109,9 +110,14 @@ def __init__(self, cmd_str): # according to the expected argument format for 'for:' self.build_params = EESSIBotBuildParams(build_params) else: - # Anything that is not 'on:' or 'for:' should just be passed on as normal - # No further parsing of the value is needed - other_filter_args.extend([arg]) + # Anything that is not 'on:' or 'for:' + # Check if it's a filter argument, if so, pass it on to other_filter_args without further parsing + # If it's not a filter argument, it is a general argument - just store it so any other function + # can read it + if ':' in arg: + other_filter_args.extend([arg]) + else: + self.general_args.append(arg) # If no 'on:' is found in the argument list, everything that follows the 'for:' argument # (until the next space) is considered the argument list for the action filters From 583cacce5b09dbb6d2eff680f2506060459b5b23 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 19 Aug 2025 14:53:31 +0200 Subject: [PATCH 112/132] Fix typo --- eessi_bot_event_handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eessi_bot_event_handler.py b/eessi_bot_event_handler.py index c7f02c0b..dcf8d4ed 100644 --- a/eessi_bot_event_handler.py +++ b/eessi_bot_event_handler.py @@ -604,7 +604,7 @@ def handle_bot_command_status(self, event_info, bot_command): status_table['timestamp'] = timestamps # Figure out the sorting indices, so that things are sorted first by the 'for arch', and then by 'date' - sorted_indices = sorted(range(len(status_table['for arch'])), key=lambda x: (status_table['for arch'[x], status_table['timestamp'][x])) + sorted_indices = sorted(range(len(status_table['for arch'])), key=lambda x: (status_table['for arch'][x], status_table['timestamp'][x])) # Reverse, so that the newest builds are first sorted_indices.reverse() # Apply the sorted indices to get a sorted table From 16a4e0f8c905bb2f1b6424637bcc50d072f0254a Mon Sep 17 00:00:00 2001 
From: laraPPr Date: Tue, 
19 Aug 2025 16:03:29 +0200 Subject: [PATCH 113/132] add description to app.cfg.example Signed-off-by: laraPPr --- app.cfg.example | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/app.cfg.example b/app.cfg.example index 62caa332..0b393a4c 100644 --- a/app.cfg.example +++ b/app.cfg.example @@ -384,6 +384,11 @@ poll_command = /usr/bin/squeue poll_interval = 60 # full path to the command for manipulating existing jobs +# It is also possible to add placeholder values to the scontrol_command. +# An example where this may be useful is in a setup where multiple clusters are managed by the same SLURM instance, +# and the `scontrol_command` for that instance needs to get the correct cluster name passed. +# This can be achieved by defining `scontrol_command = /usr/bin/scontrol --clusters=%%(cluster)s`. +# Valid placeholder names are currently: `jobid`, `cluster`, `partition`, `state`, and `reason`. scontrol_command = /usr/bin/scontrol From 25dd76d3667690f1a01512e2b955ebe60d2d3a23 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 19 Aug 2025 17:50:08 +0200 Subject: [PATCH 114/132] Make sure that commands still function if no action filters were defined. Also, log the general args to the log --- tools/commands.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tools/commands.py b/tools/commands.py index 122e363a..64ce7fc7 100644 --- a/tools/commands.py +++ b/tools/commands.py @@ -88,6 +88,7 @@ def __init__(self, cmd_str): cmd_as_list = cmd_str.split() self.command = cmd_as_list[0] # E.g. 'build' or 'help' self.general_args = [] + self.action_filters = None # TODO always init self.action_filters with empty EESSIBotActionFilter? 
if len(cmd_as_list) > 1: @@ -145,6 +146,7 @@ def __init__(self, cmd_str): # so no special parsing needed there log(f"Extracted filter arguments related to hardware target: {normalized_filters}") log(f"Other extracted filter arguments: {other_filter_args}") + log(f"Other general arguments: {self.general_args}") normalized_filters += other_filter_args # Finally, change into a space-separated string, as expected by EESSIBotActionFilter @@ -175,5 +177,8 @@ def to_string(self): Returns: string: the string representation created by the method """ - action_filters_str = self.action_filters.to_string() - return f"{' '.join([self.command, action_filters_str]).rstrip()}" + if self.action_filters is None: + return "" + else: + action_filters_str = self.action_filters.to_string() + return f"{' '.join([self.command, action_filters_str]).rstrip()}" From 296942f8b7e5ab82900e733960bc46914bf3df1f Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 19 Aug 2025 17:51:24 +0200 Subject: [PATCH 115/132] Fix some bugs. Also, make sure to actually use the sorted table when composing the table with only the last builds. 
Finally, add resorting for an output that is sorted alphabetically by 'for' architecture --- eessi_bot_event_handler.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/eessi_bot_event_handler.py b/eessi_bot_event_handler.py index dcf8d4ed..a42c1e53 100644 --- a/eessi_bot_event_handler.py +++ b/eessi_bot_event_handler.py @@ -599,7 +599,7 @@ def handle_bot_command_status(self, event_info, bot_command): dates = status_table['date'] timestamps = [] for date in dates: - date_object = datetime.datetime.strptime(date, "%b %d %X %Z %Y") + date_object = datetime.strptime(date, "%b %d %X %Z %Y") timestamps.append(int(date_object.timestamp())) status_table['timestamp'] = timestamps @@ -609,16 +609,23 @@ def handle_bot_command_status(self, event_info, bot_command): sorted_indices.reverse() # Apply the sorted indices to get a sorted table sorted_table = {key: [status_table[key][i] for i in sorted_indices] for key in status_table} + self.log(f"Sorted status table: {sorted_table}") # Keep only the first entry for each 'for arch', as that is now the newest status_table_last = {'on arch': [], 'for arch': [], 'for repo': [], 'date': [], 'status': [], 'url': [], 'result': []} - for x in range(0, len(status_table['date'])): - if status_table['for arch'][x] not in status_table_last['for arch']: + for x in range(0, len(sorted_table['date'])): + if sorted_table['for arch'][x] not in status_table_last['for arch']: + self.log(f"arch: {sorted_table['for arch'][x]} not yet in status_table_last") for key in status_table_last: - status_table_last[key].append(status_table[key][x]) + self.log(f"Adding to '{key}' and the value {sorted_table[key][x]}") + status_table_last[key].append(sorted_table[key][x]) + + # Re-sort, now only on 'for arch', for nicer viewing + sorted_indices = sorted(range(len(status_table_last['for arch'])), key=lambda x: status_table_last['for arch'][x]) + sorted_table_last = {key: [status_table_last[key][i] for i in sorted_indices] for 
key in status_table_last} # overwrite the original status_table - status_table = status_table_last + status_table = sorted_table_last comment_status = '' comment_status += "\nThis is the status of all the `bot: build` commands:" From d335c3dec1e2c2b7aff8d65ae6c0ce0943826e65 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 19 Aug 2025 17:59:56 +0200 Subject: [PATCH 116/132] Fix too long lines --- eessi_bot_event_handler.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/eessi_bot_event_handler.py b/eessi_bot_event_handler.py index a42c1e53..c0e26f86 100644 --- a/eessi_bot_event_handler.py +++ b/eessi_bot_event_handler.py @@ -604,7 +604,8 @@ def handle_bot_command_status(self, event_info, bot_command): status_table['timestamp'] = timestamps # Figure out the sorting indices, so that things are sorted first by the 'for arch', and then by 'date' - sorted_indices = sorted(range(len(status_table['for arch'])), key=lambda x: (status_table['for arch'][x], status_table['timestamp'][x])) + key_func = lambda x: (status_table['for arch'][x], status_table['timestamp'][x]) + sorted_indices = sorted(range(len(status_table['for arch'])), key=key_func) # Reverse, so that the newest builds are first sorted_indices.reverse() # Apply the sorted indices to get a sorted table @@ -612,7 +613,9 @@ def handle_bot_command_status(self, event_info, bot_command): self.log(f"Sorted status table: {sorted_table}") # Keep only the first entry for each 'for arch', as that is now the newest - status_table_last = {'on arch': [], 'for arch': [], 'for repo': [], 'date': [], 'status': [], 'url': [], 'result': []} + status_table_last = { + 'on arch': [], 'for arch': [], 'for repo': [], 'date': [], 'status': [], 'url': [], 'result': [] + } for x in range(0, len(sorted_table['date'])): if sorted_table['for arch'][x] not in status_table_last['for arch']: self.log(f"arch: {sorted_table['for arch'][x]} not yet in status_table_last") @@ -621,7 +624,8 @@ def 
handle_bot_command_status(self, event_info, bot_command): status_table_last[key].append(sorted_table[key][x]) # Re-sort, now only on 'for arch', for nicer viewing - sorted_indices = sorted(range(len(status_table_last['for arch'])), key=lambda x: status_table_last['for arch'][x]) + key_func = lambda x: status_table_last['for arch'][x] + sorted_indices = sorted(range(len(status_table_last['for arch'])), key=key_func) sorted_table_last = {key: [status_table_last[key][i] for i in sorted_indices] for key in status_table_last} # overwrite the original status_table From 3fc6a6ef536cf15e114ada7ac37e72362e75de4e Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 19 Aug 2025 18:04:04 +0200 Subject: [PATCH 117/132] Don't define the lambda's separately --- eessi_bot_event_handler.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/eessi_bot_event_handler.py b/eessi_bot_event_handler.py index c0e26f86..74886a0e 100644 --- a/eessi_bot_event_handler.py +++ b/eessi_bot_event_handler.py @@ -604,8 +604,10 @@ def handle_bot_command_status(self, event_info, bot_command): status_table['timestamp'] = timestamps # Figure out the sorting indices, so that things are sorted first by the 'for arch', and then by 'date' - key_func = lambda x: (status_table['for arch'][x], status_table['timestamp'][x]) - sorted_indices = sorted(range(len(status_table['for arch'])), key=key_func) + sorted_indices = sorted( + range(len(status_table['for arch'])), + key=lambda x: (status_table['for arch'][x], status_table['timestamp'][x]) + ) # Reverse, so that the newest builds are first sorted_indices.reverse() # Apply the sorted indices to get a sorted table @@ -624,8 +626,10 @@ def handle_bot_command_status(self, event_info, bot_command): status_table_last[key].append(sorted_table[key][x]) # Re-sort, now only on 'for arch', for nicer viewing - key_func = lambda x: status_table_last['for arch'][x] - sorted_indices = sorted(range(len(status_table_last['for arch'])), 
key=key_func) + sorted_indices = sorted( + range(len(status_table_last['for arch'])), + key=lambda x: status_table_last['for arch'][x] + ) sorted_table_last = {key: [status_table_last[key][i] for i in sorted_indices] for key in status_table_last} # overwrite the original status_table From fc3c23dc8a08bf5780cc6faa6a39f3993324a5e2 Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Wed, 20 Aug 2025 14:25:13 +0200 Subject: [PATCH 118/132] don't replace self.scontrol_command in process_new_job, to keep placeholder and allow processing jobs that require different placeholder values --- eessi_bot_job_manager.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py index d0423923..238174c0 100644 --- a/eessi_bot_job_manager.py +++ b/eessi_bot_job_manager.py @@ -305,18 +305,18 @@ def process_new_job(self, new_job): # processing placeholders in scontrol command which is defined in the bot's app.cfg (setting `scontrol_command`) try: - self.scontrol_command = self.scontrol_command % new_job + templated_scontrol_command = self.scontrol_command % new_job except KeyError: log(f"Failed to process {self.scontrol_command}.") log(f"Information on placeholder is not collected in new_job: {new_job}.") raise - scontrol_cmd = "%s --oneliner show jobid %s" % ( - self.scontrol_command, + cmd = "%s --oneliner show jobid %s" % ( + templated_scontrol_command, job_id, ) scontrol_output, scontrol_err, scontrol_exitcode = run_cmd( - scontrol_cmd, + cmd, "process_new_job(): scontrol command", log_file=self.logfile, ) @@ -366,7 +366,7 @@ def process_new_job(self, new_job): extra_info = '' if self.job_handover_protocol == config.JOB_HANDOVER_PROTOCOL_HOLD_RELEASE: release_cmd = "%s release %s" % ( - self.scontrol_command, + templated_scontrol_command, job_id, ) From 0fdfed4efe9b020d3e8faf3b8c1437aad0d3518e Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Wed, 20 Aug 2025 14:29:52 +0200 Subject: [PATCH 119/132] fix comment 
--- eessi_bot_job_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py index 238174c0..fd67b913 100644 --- a/eessi_bot_job_manager.py +++ b/eessi_bot_job_manager.py @@ -321,7 +321,7 @@ def process_new_job(self, new_job): log_file=self.logfile, ) - # parse output of 'scontrol_cmd' + # parse output of scontrol command that fetches job info job_info = self.parse_scontrol_show_job_output(str(scontrol_output)) # check if job_info contains 'WorkDir', if not we cannot process the job From 4b6d62a959f8c2fb5ced3bed96c9c146d227a79c Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Thu, 21 Aug 2025 09:54:07 +0200 Subject: [PATCH 120/132] release notes for v0.9.9 --- RELEASE_NOTES | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/RELEASE_NOTES b/RELEASE_NOTES index eb5b4ebd..43e1b9af 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -1,6 +1,36 @@ This file contains a description of the major changes to the EESSI build-and-deploy bot. For more detailed information, please see the git log. +v0.9.0 (21 August 2025) +-------------------------- + +This is a minor release of the EESSI build-and-deploy bot. + +Note! Though it is a minor release it includes breaking changes of the user +interface for triggering build jobs. 
For details see below and documentation at +https://www.eessi.io/docs/bot/#building + +Bug fixes: +* revised and updated app.cfg.example and README.md (#325) +* do not overwrite config value for scontrol command (#335) + +Improvements: +* add support for cloning target repository via ssh (#300) +* major refactoring of the definition of build targets and breaking change of + the user interface to trigger build (#312, #329, #331) + * `bot:build` filters `architecture:` and `accel:` are replaced by `on:` and + `for:` (for details see documentation at https://www.eessi.io/docs/bot/#building) +* add CI to build and publish smee-client container image (#321, #322) +* make space before bot command optional (#324) +* support template values in the `scontrol` command (#327) +* support additional parameter for `bot:status` command (#335) + +Changes to 'app.cfg' settings (see README.md and app.cfg.example for details): +* NEW (optional) 'clone_git_repo_via' in section '[buildenv]' +* NEW (required) 'pr_diff_failure' in section '[download_pr_comments]' +* NEW (required) 'pr_diff_tip' in section '[download_pr_comments]' + + v0.8.0 (23 May 2025) -------------------------- From e0800ebcb970d4a3babd1e436b1ea4346d48f2d7 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Thu, 21 Aug 2025 13:10:49 +0200 Subject: [PATCH 121/132] add missing information --- RELEASE_NOTES | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 43e1b9af..7818036e 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -18,14 +18,20 @@ Improvements: * add support for cloning target repository via ssh (#300) * major refactoring of the definition of build targets and breaking change of the user interface to trigger build (#312, #329, #331) + * `arch_target_map` is replaced by `node_type_map` which provides a more + comprehensive/flexible approach to define architectures that are available + for build jobs * `bot:build` filters `architecture:` and `accel:` are 
replaced by `on:` and `for:` (for details see documentation at https://www.eessi.io/docs/bot/#building) * add CI to build and publish smee-client container image (#321, #322) * make space before bot command optional (#324) * support template values in the `scontrol` command (#327) -* support additional parameter for `bot:status` command (#335) +* support additional parameter for `bot:status` command (#334) Changes to 'app.cfg' settings (see README.md and app.cfg.example for details): +* REMOVED (required) 'arch_target_map' in section '[buildenv]', replaced by + 'node_type_map' +* NEW (required) 'node_type_map' in section '[buildenv]' * NEW (optional) 'clone_git_repo_via' in section '[buildenv]' * NEW (required) 'pr_diff_failure' in section '[download_pr_comments]' * NEW (required) 'pr_diff_tip' in section '[download_pr_comments]' From 42d95ae94b5a75f76ecc2f6464ea577a17ce969e Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Thu, 21 Aug 2025 13:26:38 +0200 Subject: [PATCH 122/132] replacing more arch_target_map with node_type_map --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index dd6b3234..8aa1b931 100644 --- a/README.md +++ b/README.md @@ -1081,7 +1081,7 @@ moved and where. # Step 6: Creating a ReFrame configuration file for the test step (only needed when building for the [EESSI software layer](https://github.com/EESSI/software-layer)) -Part of the test step of the EESSI software layer is running the EESSI test suite. This requires putting a ReFrame configuration file in place that describes the partitions in the `arch_target_map` of the bot config. +Part of the test step of the EESSI software layer is running the EESSI test suite. This requires putting a ReFrame configuration file in place that describes the partitions in the `node_type_map` of the bot config. 
You can find general documentation on how to write a ReFrame config file in the [EESSI documentation](https://www.eessi.io/docs/test-suite/ReFrame-configuration-file/). However, some specifics apply when setting things up for the test step: @@ -1196,7 +1196,7 @@ site_configuration = { ## Approach 2: describing a virtual node -In this approach, we describe a virtual node configuration for which the size matches exactly what is allocated by the bot (through the `slurm_params` and `arch_target_map`). In this example, we'll assume that this node has 4 GPUs and 72 cores, distributed over 2 sockets each consisting of 1 NUMA domain. We also assume our bot is configured with `slurm_params = --hold --nodes=1 --export=None --time=0:30:0` and `arch_target_map = {"linux/x86_64/intel/skylake_avx512" : "--partition=gpu --cpus-per-task=18 --gpus-per-node 1"}`, i.e. it effectively allocates a quarter node. We describe a virtual partition for ReFrame as if this quarter node is a full node, i.e. we pretend it is a partition with 18 cores and 1 GPU per node, with 1 socket. +In this approach, we describe a virtual node configuration for which the size matches exactly what is allocated by the bot (through the `slurm_params` and `node_type_map`). In this example, we'll assume that this node has 4 GPUs and 72 cores, distributed over 2 sockets each consisting of 1 NUMA domain. We also assume our bot is configured with `slurm_params = --hold --nodes=1 --export=None --time=0:30:0` and `node_type_map = {"linux/x86_64/intel/skylake_avx512" : "--partition=gpu --cpus-per-task=18 --gpus-per-node 1"}`, i.e. it effectively allocates a quarter node. We describe a virtual partition for ReFrame as if this quarter node is a full node, i.e. we pretend it is a partition with 18 cores and 1 GPU per node, with 1 socket. We would first have to hardcode the CPU configuration. 
@@ -1246,7 +1246,7 @@ For the GPU configuration, we simply put: } ``` -To match the fact that we allocate 1 GPU in the `arch_target_map`. +To match the fact that we allocate 1 GPU in the `node_type_map`. ### Complete example config From fdc3b3adc3bddae4162fde3b0245f2e657614aba Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Thu, 21 Aug 2025 17:38:01 +0200 Subject: [PATCH 123/132] add PR with additional fixes for README.md --- RELEASE_NOTES | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 7818036e..216b2196 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -17,7 +17,7 @@ Bug fixes: Improvements: * add support for cloning target repository via ssh (#300) * major refactoring of the definition of build targets and breaking change of - the user interface to trigger build (#312, #329, #331) + the user interface to trigger build (#312, #329, #331, #337) * `arch_target_map` is replaced by `node_type_map` which provides a more comprehensive/flexible approach to define architectures that are available for build jobs From c7e61aa93db39df0a2cc13c4a3e6f480c3d102e7 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Thu, 21 Aug 2025 18:13:13 +0200 Subject: [PATCH 124/132] improvements/fixes taking review into account --- RELEASE_NOTES | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 216b2196..2fdb11a8 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -6,9 +6,9 @@ v0.9.0 (21 August 2025) This is a minor release of the EESSI build-and-deploy bot. -Note! Though it is a minor release it includes breaking changes of the user -interface for triggering build jobs. For details see below and documentation at -https://www.eessi.io/docs/bot/#building +Note! Though it is a minor release it includes breaking changes of the bot +configuration and the user interface for triggering build jobs. 
For details +see below and documentation at https://www.eessi.io/docs/bot/#building Bug fixes: * revised and updated app.cfg.example and README.md (#325) @@ -20,18 +20,22 @@ Improvements: the user interface to trigger build (#312, #329, #331, #337) * `arch_target_map` is replaced by `node_type_map` which provides a more comprehensive/flexible approach to define architectures that are available - for build jobs + for build jobs; for details, see https://github.com/EESSI/eessi-bot-software-layer?tab=readme-ov-file#architecturetargets-section + * the `repo_target_map` setting is removed because the information is now + included in the `node_type_map` setting * `bot:build` filters `architecture:` and `accel:` are replaced by `on:` and `for:` (for details see documentation at https://www.eessi.io/docs/bot/#building) * add CI to build and publish smee-client container image (#321, #322) * make space before bot command optional (#324) -* support template values in the `scontrol` command (#327) -* support additional parameter for `bot:status` command (#334) +* support template values in the `scontrol` command (#327, #335) +* support additional parameter (`last_build`) for `bot:status` command (#334) Changes to 'app.cfg' settings (see README.md and app.cfg.example for details): -* REMOVED (required) 'arch_target_map' in section '[buildenv]', replaced by +* REMOVED (required) 'arch_target_map' in section '[architecturetargets]', replaced by 'node_type_map' -* NEW (required) 'node_type_map' in section '[buildenv]' +* REMOVED (required) 'repo_target_map' in section '[repo_targets]', replaced by + 'node_type_map' +* NEW (required) 'node_type_map' in section '[architecturetargets]' * NEW (optional) 'clone_git_repo_via' in section '[buildenv]' * NEW (required) 'pr_diff_failure' in section '[download_pr_comments]' * NEW (required) 'pr_diff_tip' in section '[download_pr_comments]' From 405fb2f84d24c7d128c585b2092d271f66ff0bd5 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Thomas=20R=C3=B6blitz?= Date: Fri, 22 Aug 2025 07:30:55 +0200 Subject: [PATCH 125/132] rephrase info about breaking changes Co-authored-by: Kenneth Hoste --- RELEASE_NOTES | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 2fdb11a8..55e86a14 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -16,8 +16,8 @@ Bug fixes: Improvements: * add support for cloning target repository via ssh (#300) -* major refactoring of the definition of build targets and breaking change of - the user interface to trigger build (#312, #329, #331, #337) +* major refactoring of the definition of build targets and breaking changes in + the bot configuration and the user interface to trigger builds (#312, #329, #331, #337) * `arch_target_map` is replaced by `node_type_map` which provides a more comprehensive/flexible approach to define architectures that are available for build jobs; for details, see https://github.com/EESSI/eessi-bot-software-layer?tab=readme-ov-file#architecturetargets-section From 447a05c851f6a408a1bcb93a3e672af6389f99e2 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Fri, 22 Aug 2025 07:34:16 +0200 Subject: [PATCH 126/132] date bump --- RELEASE_NOTES | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 55e86a14..346fbc08 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -1,7 +1,7 @@ This file contains a description of the major changes to the EESSI build-and-deploy bot. For more detailed information, please see the git log. -v0.9.0 (21 August 2025) +v0.9.0 (22 August 2025) -------------------------- This is a minor release of the EESSI build-and-deploy bot.
From d4d649cb474fbb62b6db1d31add259b658388b95 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Fri, 22 Aug 2025 08:28:36 +0200 Subject: [PATCH 127/132] remove duplicate information --- RELEASE_NOTES | 1 - 1 file changed, 1 deletion(-) diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 346fbc08..4b61e797 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -12,7 +12,6 @@ see below and documentation at https://www.eessi.io/docs/bot/#building Bug fixes: * revised and updated app.cfg.example and README.md (#325) -* do not overwrite config value for scontrol command (#335) Improvements: * add support for cloning target repository via ssh (#300) From 1e70f3ea5bb8bd87698a41fa9214d5dcf4faf789 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Fri, 22 Aug 2025 08:56:15 +0200 Subject: [PATCH 128/132] update upload-artifact version to 4.6.2 --- .github/workflows/scorecards.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index 96379ba1..6dfef965 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -67,7 +67,7 @@ jobs: # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF # format to the repository Actions tab. 
- name: "Upload artifact" - uses: actions/upload-artifact@3cea5372237819ed00197afe530f5a7ea3e805c8 # v3.1.0 + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: SARIF file path: results.sarif From 05ead1dbb9f8ac3de9cc76b1c3c74463efda115e Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Fri, 22 Aug 2025 09:05:28 +0200 Subject: [PATCH 129/132] also run scorecards when pushing/pull_requesting to develop branch --- .github/workflows/scorecards.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index 6dfef965..2c2989ee 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -17,10 +17,9 @@ on: schedule: - cron: '25 15 * * 3' push: - branches: [ "main" ] + branches: [ "main", "develop" ] pull_request: - branches: - - main + branches: [ "main", "develop" ] # Declare default permissions as read only. permissions: read-all From c6220febe469c4624b100ff633644195295daf34 Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Fri, 22 Aug 2025 09:12:12 +0200 Subject: [PATCH 130/132] change action version 4.3.3 --- .github/workflows/scorecards.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index 2c2989ee..5b8ff0c3 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -66,7 +66,7 @@ jobs: # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF # format to the repository Actions tab. 
- name: "Upload artifact" - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 + uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3 with: name: SARIF file path: results.sarif From 7e197ae37009e7eaea0827e91544759854514010 Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Fri, 22 Aug 2025 09:20:24 +0200 Subject: [PATCH 131/132] also mention in v0.9.0 release notes that initial_comment was replaced in bot configuration by new_job_instance_repo/build_on_arch/build_for_arch/jobdir --- RELEASE_NOTES | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/RELEASE_NOTES b/RELEASE_NOTES index 4b61e797..6dd9bac6 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -22,6 +22,10 @@ Improvements: for build jobs; for details, see https://github.com/EESSI/eessi-bot-software-layer?tab=readme-ov-file#architecturetargets-section * the `repo_target_map` setting is removed because the information is now included in the `node_type_map` setting + * the `initial_comment` setting in the `[submitted_job_comments]` section of the bot configuration + has been replaced with separate settings: `new_job_instance_repo`, `build_on_arch`, `build_for_arch`, `jobdir`; + for details, see https://github.com/EESSI/eessi-bot-software-layer/blob/develop/README.md#submitted_job_comments-section + and the example bot configuration `app.cfg.example` * `bot:build` filters `architecture:` and `accel:` are replaced by `on:` and `for:` (for details see documentation at https://www.eessi.io/docs/bot/#building) * add CI to build and publish smee-client container image (#321, #322) From fc4d2045fb75e87814e3c5e3facbcad389d2d9fe Mon Sep 17 00:00:00 2001 From: Thomas Roeblitz Date: Fri, 22 Aug 2025 09:21:49 +0200 Subject: [PATCH 132/132] update version of codeql-action/upload-sarif to v3.25.6 --- .github/workflows/scorecards.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index
5b8ff0c3..a0025b23 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -74,6 +74,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@807578363a7869ca324a79039e6db9c843e0e100 # v2.1.27 + uses: github/codeql-action/upload-sarif@9fdb3e49720b44c48891d036bb502feb25684276 # v3.25.6 with: sarif_file: results.sarif