diff --git a/.github/workflows/tests_scripts.yml b/.github/workflows/tests_scripts.yml
new file mode 100644
index 00000000..1f8d012b
--- /dev/null
+++ b/.github/workflows/tests_scripts.yml
@@ -0,0 +1,39 @@
+# documentation: https://help.github.com/en/articles/workflow-syntax-for-github-actions
+name: Tests for scripts
+on:
+ push:
+ paths:
+ - scripts/sign_verify_file_ssh.sh
+ pull_request:
+ paths:
+ - scripts/sign_verify_file_ssh.sh
+permissions:
+ contents: read # to fetch code (actions/checkout)
+jobs:
+ build:
+ runs-on: ubuntu-24.04
+ steps:
+ - name: checkout
+ uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+
+ - name: test sign_verify_file_ssh.sh script
+ run: |
+ # Create a PEM format ssh identity
+ ssh-keygen -t rsa -b 4096 -m PEM -f id_rsa.pem -N ""
+ # Create a file to sign
+ echo "Very important stuff" > out.txt
+ export FILE_TO_SIGN="out.txt"
+ # Sign the file
+ ./scripts/sign_verify_file_ssh.sh sign id_rsa.pem "$FILE_TO_SIGN"
+ # Create an allowed_signers file based on the public key
+ echo -n "allowed_identity " > allowed_signers
+ cat id_rsa.pem.pub >> allowed_signers
+ # Verify the signature
+ ./scripts/sign_verify_file_ssh.sh verify allowed_signers "$FILE_TO_SIGN"
+ # Make a new signature that does not appear in the allowed signers file
+ ssh-keygen -t rsa -b 4096 -m PEM -f id_rsa.alt.pem -N ""
+ # Replace the allowed signers file
+ echo -n "disallowed_identity " > allowed_signers
+ cat id_rsa.alt.pem.pub >> allowed_signers
+ # Make sure signature checking fails in this case
+ ./scripts/sign_verify_file_ssh.sh verify allowed_signers "$FILE_TO_SIGN" && exit 1 || echo "Expected failure for unknown identity"
diff --git a/README.md b/README.md
index 836cc168..77c9cdd1 100644
--- a/README.md
+++ b/README.md
@@ -375,6 +375,30 @@ package repositories. Typically these settings are set in the prologue of a
Slurm job. However, when entering the [EESSI compatibility layer](https://www.eessi.io/docs/compatibility_layer),
most environment settings are cleared. Hence, they need to be set again at a later stage.
+```
+job_delay_begin_factor = 2
+```
+The `job_delay_begin_factor` setting defines how many times the `poll_interval` a
+job's begin (EligibleTime) from now should be delayed if the handover protocol
+is set to `delayed_begin` (see setting `job_handover_protocol`). That is, if
+the `job_delay_begin_factor` is set to five (5) the delay time is calculated as
+5 * `poll_interval`. The event handler would use 2 as the default value when
+submitting jobs.
+
+```
+job_handover_protocol = hold_release
+```
+The `job_handover_protocol` setting defines which method is used to handover a
+job from the event handler to the job manager. Values are
+ - `hold_release` (job is submitted with `--hold`, job manager removes the hold
+ with `scontrol release`)
+ - `delayed_begin` (job is submitted with `--begin=now+(5 * poll_interval)` and
+ any `--hold` is removed from the submission parameters); see setting
+ `poll_interval` further below; this is useful if the
+ bot account cannot run `scontrol release` to remove the hold of the job;
+ also, the status update in the PR comment of the job is extended by noting
+ the `EligibleTime`
+
```
job_name = JOB_NAME
```
@@ -403,6 +427,17 @@ on a compute/worker node. You may have to change this if temporary storage under
environment variable `$EESSI_TMPDIR`. The value is expanded only inside a running
job. Thus, typical job environment variables (like `$USER` or `$SLURM_JOB_ID`) may be used to isolate jobs running
simultaneously on the same compute node.
+
+```
+site_config_script = /path/to/script/if/any
+```
+`site_config_script` specifies the path to a script that - if it exists - is
+sourced in the build job before any `bot/*` script is run. This allows to
+customize the build environment due to specifics of the build site/cluster.
+Note, such customizations could also be performed by putting them into a
+module file and use the setting `load_modules` (see above). However, the
+setting `site_config_script` provides a low threshold for achieving this, too.
+
```
slurm_params = "--hold"
```
@@ -433,6 +468,22 @@ allow_update_submit_opts = false
options via custom module `det_submit_opts` provided by the pull request being
processed.
+```
+allowed_exportvars = ["NAME1=value_1a", "NAME1=value_1b", "NAME2=value_2"]
+```
+`allowed_exportvars` defines a list of name-value pairs (environment
+variables) that are allowed to be specified in a PR command with the
+`exportvariable` filter. To specify multiple environment variables, multiple
+`exportvariable` filters must be used (one per variable). These variables will
+be exported into the build environment before running the bot/build.sh script.
+
+The bot build script makes use of the variable `SKIP_TESTS` to determine if
+ReFrame tests shall be skipped or not. Default is not to skip them. To allow the
+use of the variable the setting could look like
+```
+allowed_exportvars = ["SKIP_TESTS=yes", "SKIP_TESTS=no"]
+```
+
#### `[bot_control]` section
@@ -464,6 +515,35 @@ artefact_upload_script = PATH_TO_EESSI_BOT/scripts/eessi-upload-to-staging
```
`artefact_upload_script` provides the location for the script used for uploading built software packages to an S3 bucket.
+```
+signing =
+ {
+ REPO_ID: {
+ "script": PATH_TO_SIGN_SCRIPT,
+ "key": PATH_TO_KEY_FILE,
+ "container_runtime": PATH_TO_CONTAINER_RUNTIME
+ }, ...
+ }
+```
+`signing` provides a setting for signing artefacts. The value uses a JSON-like format
+with `REPO_ID` being the repository ID. Repository IDs are defined in a file
+`repos.cfg` (see setting `repos_cfg_dir`), `script` provides the location of the
+script that is used to sign a file. If the location is a relative path, the script
+must reside in the checked out pull request of the target repository (e.g.,
+EESSI/software-layer). `key` points to the file of the key being used
+for signing. The bot calls the script with the two arguments:
+ 1. private key (as provided by the attribute 'key')
+ 2. path to the file to be signed (the upload script will determine that)
+NOTE (on `container_runtime`), signing requires a recent installation of OpenSSH
+(8.2 or newer). If the frontend where the event handler runs does not have that
+version installed, you can specify a container runtime via the `container_runtime`
+attribute below. Currently, only Singularity or Apptainer are supported.
+Note (on the key), make sure the file permissions are restricted to `0600` (only
+readable+writable by the file owner), or the signing will likely fail.
+Note (on json format), make sure no trailing commas are used after any elements
+or parsing/loading the json will likely fail. Also, the whole value should start
+at a new line and be indented as shown above.
+
```
endpoint_url = URL_TO_S3_SERVER
```
@@ -645,12 +725,30 @@ scontrol_command = /usr/bin/scontrol
#### `[submitted_job_comments]` section
The `[submitted_job_comments]` section specifies templates for messages about newly submitted jobs.
+
+DEPRECATED setting (use `awaits_release_delayed_begin_msg` and/or `awaits_release_hold_release_msg`)
```
awaits_release = job id `{job_id}` awaits release by job manager
```
`awaits_release` is used to provide a status update of a job (shown as a row in the job's status
table).
+```
+awaits_release_delayed_begin_msg = job id `{job_id}` will be eligible to start in about {delay_seconds} seconds
+```
+`awaits_release_delayed_begin_msg` is used when the `job_handover_protocol` is
+set to `delayed_begin`. Note, both `{job_id}` and `{delay_seconds}` need to be
+present in the value or the event handler will throw an exception when formatting
+the update of the PR comment corresponding to the job.
+
+```
+awaits_release_hold_release_msg = job id `{job_id}` awaits release by job manager
+```
+`awaits_release_hold_release_msg` is used when the `job_handover_protocol` is
+set to `hold_release`. Note, `{job_id}` needs to be present in the value or the
+event handler will throw an exception when formatting the update of the PR
+comment corresponding to the job.
+
```
initial_comment = New job on instance `{app_name}` for architecture `{arch_name}`{accelerator_spec} for repository `{repo_id}` in job dir `{symlink}`
```
@@ -756,7 +854,237 @@ moved_job_dirs_comment = PR merged! Moved `{job_dirs}` to `{trash_bin_dir}`
Template that is used by the bot to add a comment to a PR noting down which directories have been
moved and where.
-# Instructions to run the bot components
+# Step 6: Creating a ReFrame configuration file for the test step (only needed when building for the [EESSI software layer](https://github.com/EESSI/software-layer))
+Part of the test step of the EESSI software layer is running the EESSI test suite. This requires putting a ReFrame configuration file in place that describes the partitions in the `arch_target_map` of the bot config.
+
+You can find general documentation on how to write a ReFrame config file in the [EESSI documentation](https://www.eessi.io/docs/test-suite/ReFrame-configuration-file/). However, some specifics apply when setting things up for the test step:
+
+- The configuration file has to be in `{shared_fs_path}/reframe_config.py` (recommended) or you have to set `RFM_CONFIG_FILES` to point to the configuration file and you have to make sure that is a location that is available (mounted) in the build container.
+- The system name _has_ to be `BotBuildTests`
+- Partition names should be ${EESSI_SOFTWARE_SUBDIR//\//_} for non-accelerator partitions and ${EESSI_SOFTWARE_SUBDIR//\//_}_${EESSI_ACCELERATOR_TARGET//\//_} for accelerator partitions. In words: the partition name should be the software subdir, replacing slashes with underscores, and for accelerators appending the accelerator target (again replacing slashes with underscores). E.g. x86_64_intel_skylake_avx512_nvidia_cc80 would be a valid partition name for a partition with Intel skylake's + Nvidia A100s.
+- The `scheduler` should be `local`, as the bot already schedules the job (ReFrame should just locally spawn the tests in the allocation created by the bot).
+- The `access` field should not be used by ReFrame if the local scheduler is defined, you can simply omit this keyword.
+
+To configure the number of GPUs and CPUs, we have two options:
+1. We describe the physical node in the ReFrame configuration file and set the `REFRAME_SCALE_TAG` environment variable to match the size of the allocation that you specify in your bot config. E.g. if your bot config allocates 1/4th of a node, one would set `REFRAME_SCALE_TAG=1_4_node` in the environment of the job submitted by the bot.
+2. We describe a virtual node configuration that matches the size of the allocation created by the bot (and we use the default `REFRAME_SCALE_TAG=1_node`, you don't have to set this explicitly).
+
+The first approach is the easiest, and thus recommended, since you can use CPU autodetection by ReFrame. The second approach allows for more flexibility.
+
+## Approach 1 (recommended): describing the physical node and setting the `REFRAME_SCALE_TAG` to match the bot config's allocation size
+In this approach, we describe the physical node configuration. That means: the amount of physical CPUs and GPUs present in the node.
+
+For the CPU part, we can rely on ReFrame's CPU autodetection: if `remote_detect` is set to `True` in the general section of the config, and no CPU topology information is provided in the ReFrame configuration file, ReFrame will automatically detect the [CPU topology](https://reframe-hpc.readthedocs.io/en/stable/config_reference.html#config.systems.partitions.processor).
+
+For the GPU part, we need to configure the vendor and the amount of GPUs. E.g. for a partition with 4 Nvidia GPUs per node:
+```
+'partition': {
+...
+ 'extras': {
+ GPU_VENDOR: GPU_VENDORS[NVIDIA],
+ },
+ 'devices': [
+ {
+ 'type': DEVICE_TYPES[GPU],
+ 'num_devices': 4,
+ }
+ ]
+}
+```
+
+Now, we need to make sure ReFrame only starts tests that have scales that fit within the allocation created by the bot. E.g. on a GPU node, it would be quite common to only allocate a single GPU for building GPU software. In the above example, that means only a quarter node. We can make sure the EESSI test suite only runs tests that fit within a 25% of the physical node described above by making sure the `REFRAME_SCALE_TAG` environment variable is set to `1_4_node`. You can find a list of all valid values for the `REFRAME_SCALE_TAG` by checking the `SCALES` constant in the [EESSI test suite](https://github.com/EESSI/test-suite/blob/main/eessi/testsuite/constants.py).
+
+Note that if you had e.g. a node with 6 GPUs per node, and you were building on 1 GPU, you probably want to go for Approach 2, since `1_6_node` is not a known scale in the EESSI test suite. Although you could set `REFRAME_SCALE_TAG=1_8_node`, this would lead to undefined behavior for the amount of GPUs allocated (may be 1, may be 0). For CPU-based nodes, this could however be a reasonable approach.
+
+Note that if for _some_ partitions you use e.g. quarter nodes, and for some full nodes, you'll have to set the `REFRAME_SCALE_TAG` conditionally based on the node architecture. You could e.g. do this in a `.bashrc` that has some conditional logic to determine the node type and set the corresponding scale. Alternatively, you could use Approach 2.
+
+### Complete example config
+In this example, we assume a node with 4 A100 GPUs (compute capability `cc80`) and 72 CPU cores (Intel Skylake) and 512 GB of memory (of which 491520 MiB is useable by SLURM jobs; on this system the rest is reserved for the OS):
+```
+from eessi.testsuite.common_config import common_logging_config
+from eessi.testsuite.constants import * # noqa: F403
+
+
+site_configuration = {
+ 'systems': [
+ {
+ 'name': 'BotBuildTests', # The system HAS to have this name, do NOT change it
+ 'descr': 'Software-layer bot',
+ 'hostnames': ['.*'],
+ 'modules_system': 'lmod',
+ 'partitions': [
+ {
+ 'name': 'x86_64_intel_skylake_avx512_nvidia_cc80',
+ 'scheduler': 'local',
+ 'launcher': 'mpirun',
+ 'environs': ['default'],
+ 'features': [
+ FEATURES[GPU] # We want this to run GPU-based tests from the EESSI test suite
+ ] + list(SCALES.keys()),
+ 'resources': [
+ {
+ 'name': 'memory',
+ 'options': ['--mem={size}'],
+ }
+ ],
+ 'extras': {
+ # Make sure to round down, otherwise a job might ask for more mem than is available
+ # per node
+ 'mem_per_node': 491520, # in MiB (512 GB minus some reserved for the OS)
+ GPU_VENDOR: GPU_VENDORS[NVIDIA],
+ },
+ 'devices': [
+ {
+ 'type': DEVICE_TYPES[GPU],
+ 'num_devices': 4,
+ }
+ ],
+ 'max_jobs': 1
+ },
+ ]
+ }
+ ],
+ 'environments': [
+ {
+ 'name': 'default',
+ 'cc': 'cc',
+ 'cxx': '',
+ 'ftn': ''
+ }
+ ],
+ 'general': [
+ {
+ 'purge_environment': True,
+ 'resolve_module_conflicts': False, # avoid loading the module before submitting the job
+ 'remote_detect': True, # Make sure to automatically detect the CPU topology
+ }
+ ],
+ 'logging': common_logging_config(),
+}
+```
+
+## Approach 2: describing a virtual node
+In this approach, we describe a virtual node configuration for which the size matches exactly what is allocated by the bot (through the `slurm_params` and `arch_target_map`). In this example, we'll assume that this node has 4 GPUs and 72 cores, distributed over 2 sockets each consisting of 1 NUMA domain. We also assume our bot is configured with `slurm_params = --hold --nodes=1 --export=None --time=0:30:0` and `arch_target_map = {"linux/x86_64/intel/skylake_avx512" : "--partition=gpu --cpus-per-task=18 --gpus-per-node 1"}`, i.e. it effectively allocates a quarter node. We describe a virtual partition for ReFrame as if this quarter node is a full node, i.e. we pretend it is a partition with 18 cores and 1 GPU per node, with 1 socket.
+
+We would first have to hardcode the CPU configuration.
+```
+'partition': {
+...
+ 'processor': {
+ "num_cpus": 18,
+ "num_cpus_per_core": 1,
+ "num_cpus_per_socket": 18,
+ "num_sockets": 1,
+ "topology": {
+ "numa_nodes": [
+ # As stated, the 18 cores are on a single NUMA domain. Thus, the bitmask should be a sequence of 18 1's, which is 3ffff in hexadecimal representation
+ "0x3ffff", # a bit mask of 111111111111111111, i.e. cores 0-17 are on this NUMA domain
+ ],
+ },
+ }
+}
+```
+
+Note that if instead, this node would have had 8 NUMA domains (4 per socket), the 18 cores would correspond to 2 NUMA domains and we would have had to define:
+```
+"numa_nodes": [
+ "0x001ff", # a bit mask of 000000000111111111, i.e. cores 0-8 are on this NUMA domain
+ "0x3fe00", # a bit mask of 111111111000000000, i.e. cores 9-17 are on this NUMA domain
+]
+```
+
+Note that the `topology` dictionary in a ReFrame configuration file can contain more information, such as the bitmasks for the CPU sockets and cores, as well as information on the caches (see [here](https://reframe-hpc.readthedocs.io/en/stable/config_reference.html#config.systems.partitions.processor.topology)). Currently, that information is not needed by the EESSI test suite, but that may change if tests are added that utilize such information to execute efficiently.
+
+For the GPU configuration, we simply put:
+```
+'partition': {
+...
+ 'extras': {
+ GPU_VENDOR: GPU_VENDORS[NVIDIA],
+ },
+ 'devices': [
+ {
+ 'type': DEVICE_TYPES[GPU],
+ 'num_devices': 1,
+ }
+ ]
+}
+```
+To match the fact that we allocate 1 GPU in the `arch_target_map`.
+
+### Complete example config
+In this example, we assume a node with 4 A100 GPUs (compute capability `cc80`) and 72 CPU cores (Intel Skylake) and 512 GB of memory (of which 491520 MiB is useable by SLURM jobs; on this system the rest is reserved for the OS). We also assume the bot configuration is such for this partition that 1/4th of these nodes gets allocated for a build job:
+```
+site_configuration = {
+ 'systems': [
+ {
+ 'name': 'BotBuildTests', # The system HAS to have this name, do NOT change it
+ 'descr': 'Software-layer bot',
+ 'hostnames': ['.*'],
+ 'modules_system': 'lmod',
+ 'partitions': [
+ {
+ 'name': 'x86_64_intel_skylake_avx512_nvidia_cc80',
+ 'scheduler': 'local',
+ 'launcher': 'mpirun',
+ 'environs': ['default'],
+ 'features': [
+ FEATURES[GPU] # We want this to run GPU-based tests from the EESSI test suite
+ ] + list(SCALES.keys()),
+ 'resources': [
+ {
+ 'name': 'memory',
+ 'options': ['--mem={size}'],
+ }
+ ],
+ 'extras': {
+ # Make sure to round down, otherwise a job might ask for more mem than is available
+ # per node
+ 'mem_per_node': 122880, # in MiB (1/4th of 491520 MiB)
+ GPU_VENDOR: GPU_VENDORS[NVIDIA],
+ },
+ 'devices': [
+ {
+ 'type': DEVICE_TYPES[GPU],
+ 'num_devices': 1,
+ }
+ ],
+ 'processor': {
+ "num_cpus": 18,
+ "num_cpus_per_core": 1,
+ "num_cpus_per_socket": 18,
+ "num_sockets": 1,
+ "topology": {
+ "numa_nodes": [
+ # As stated, the 18 cores are on a single NUMA domain. Thus, the bitmask should be a sequence of 18 1's, which is 3ffff in hexadecimal representation
+ "0x3ffff",
+ ],
+ },
+ },
+ 'max_jobs': 1
+ },
+ ]
+ }
+ ],
+ 'environments': [
+ {
+ 'name': 'default',
+ 'cc': 'cc',
+ 'cxx': '',
+ 'ftn': ''
+ }
+ ],
+ 'general': [
+ {
+ 'purge_environment': True,
+ 'resolve_module_conflicts': False, # avoid loading the module before submitting the job
+ }
+ ],
+ 'logging': common_logging_config(),
+}
+```
+
+# Step 7: Instructions to run the bot components
The bot consists of three components:
* the Smee client;
@@ -765,7 +1093,7 @@ The bot consists of three components:
Running the Smee client was explained in [Step 1](#step1).
-## Step 6.1: Running the event handler
+## Step 7.1: Running the event handler
As the event handler may run for a long time, it is advised to run it in a `screen` or `tmux` session.
The event handler is provided by the [`eessi_bot_event_handler.py`](https://github.com/EESSI/eessi-bot-software-layer/blob/main/eessi_bot_event_handler.py) Python script.
@@ -788,7 +1116,7 @@ The event handler writes log information to the files `pyghee.log` and
Note, if you run the bot on a frontend of a cluster with multiple frontends make sure that both the Smee client and the event handler run on the same system!
-## Step 6.2: Running the job manager
+## Step 7.2: Running the job manager
As the job manager may run for a long time, it is advised to run it in a `screen` or `tmux` session.
The job manager is provided by the [`eessi_bot_job_manager_layer.py`](https://github.com/EESSI/eessi-bot-software-layer/blob/main/eessi_bot_job_manager.py) Python script. You can run the job manager from the directory `eessi-bot-software-layer` simply by:
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
index dd5378e1..5dc4bf33 100644
--- a/RELEASE_NOTES
+++ b/RELEASE_NOTES
@@ -1,6 +1,37 @@
This file contains a description of the major changes to the EESSI
build-and-deploy bot. For more detailed information, please see the git log.
+v0.7.0 (13 March 2025)
+--------------------------
+
+This is a minor release of the EESSI build-and-deploy bot.
+
+Bug fixes:
+* bot only reports moving to trash_bin when relevant (#292)
+
+Improvements:
+* add support for specifying that build job script is located in another repository (#283)
+* implement exportvariable filter (#288, #291)
+ * see related configuration setting `allowed_exportvars`
+* add alternative method to submit job (using `--begin=now+SOME_DELAY`) (#297)
+ * also see the new related configuration settings `job_handover_protocol` and `job_delay_begin_factor`
+* set the local_tmp that is configured for a site as tmpdir in bot build job script (#299)
+* add setting for a script to customize build environment (#302)
+* add support for signing tarball and metadata file and uploading signatures to S3 bucket (#303)
+* add SSH signing script `sign_verify_file_ssh.sh` (#304)
+* updates of the docs (#293, #298)
+
+Changes to 'app.cfg' settings (see README.md and app.cfg.example for details):
+* NEW (optional) 'allowed_exportvars' in section '[buildenv]'
+* NEW (required) 'awaits_release_delayed_begin_msg' in section '[submitted_job_comments]'
+* NEW (required) 'awaits_release_hold_release_msg' in section '[submitted_job_comments]'
+* DEPRECATED (optional) 'awaits_release' in section '[submitted_job_comments]'
+* NEW (optional) 'job_delay_begin_factor' in section '[buildenv]'
+* NEW (required) 'job_handover_protocol' in section '[buildenv]'
+* NEW (optional) 'signing' in section '[deploycfg]'
+* NEW (optional) 'site_config_script' in section '[buildenv]'
+
+
v0.6.0 (18 September 2024)
--------------------------
diff --git a/app.cfg.example b/app.cfg.example
index 152de2bc..f9b296f6 100644
--- a/app.cfg.example
+++ b/app.cfg.example
@@ -88,6 +88,25 @@ container_cachedir = PATH_TO_SHARED_DIRECTORY
# http_proxy = http://PROXY_DNS:3128/
# https_proxy = http://PROXY_DNS:3128/
+# The job_delay_begin_factor setting defines how many times the poll_interval a
+# job's begin (EligibleTime) from now should be delayed if the handover protocol
+# is set to `delayed_begin` (see setting `job_handover_protocol`). That is, if
+# the job_delay_begin_factor is set to five (5) the delay time is calculated as
+# 5 * poll_interval. The event handler would use 2 as the default factor when
+# submitting jobs.
+job_delay_begin_factor = 2
+
+# The job_handover_protocol setting defines which method is used to handover a
+# job from the event handler to the job manager. Values are
+# - hold_release (job is submitted with '--hold', job manager removes the hold
+# with 'scontrol release')
+# - delayed_begin (job is submitted with '--begin=now+(5 * poll_interval)' and
+# any '--hold' is removed from the submission parameters); this is useful if the
+# bot account cannot run 'scontrol release' to remove the hold of the job;
+# also, the status update in the PR comment of the job is extended by noting
+# the 'EligibleTime'
+job_handover_protocol = hold_release
+
# Used to give all jobs of a bot instance the same name. Can be used to allow
# multiple bot instances running on the same Slurm cluster.
job_name = prod
@@ -110,6 +129,15 @@ load_modules =
# variables that are only set inside a Slurm job
local_tmp = /tmp/$USER/EESSI
+# PATH to a script that - if it exists - is sourced in the build job
+# before any 'bot/*' script is run. This allows to customize the
+# build environment due to specifics of the build site/cluster.
+# Note, such customizations could also be performed by putting them
+# into a module file and using the setting 'load_modules' (see above).
+# However, the setting 'site_config_script' provides a low threshold
+# for achieving this, too.
+site_config_script = /path/to/script/if/any
+
# parameters to be added to all job submissions
# NOTE do not quote parameter string. Quotes are retained when reading in config and
# then the whole 'string' is recognised as a single parameter.
@@ -132,6 +160,19 @@ no_build_permission_comment = Label `bot:build` has been set by user `{build_lab
# whether or not to allow updating the submit options via custom module det_submit_opts
allow_update_submit_opts = false
+# defines which name-value pairs (environment variables) are allowed to be
+# exported into the build environment via 'exportvariable' filters
+# The bot build script makes use of the variable 'SKIP_TESTS' to determine if
+# ReFrame tests shall be skipped or not. Default value is 'no'. If the value is
+# 'yes' and the exportvariable filter is added to a bot build command
+# ('export:SKIP_TESTS=yes'), ReFrame tests are skipped.
+# NOTE, the setting is optional and commented by default. If you want to enable
+# this feature ('exportvariable' filters), uncomment the line below and define
+# meaningful key-value pair(s). For example, to enable the use of
+# 'exportvariable:SKIP_TESTS=yes' as a filter, the key-value pair would be
+# "SKIP_TESTS=yes".
+# allowed_exportvars = ["NAME1=value_1a", "NAME1=value_1b", "NAME2=value_2"]
+
[deploycfg]
# script for uploading built software packages
@@ -153,6 +194,31 @@ endpoint_url = URL_TO_S3_SERVER
# like: bucket_name = {"eessi-pilot-2023.06": "eessi-staging-pilot-2023.06", "eessi.io-2023.06": "software.eessi.io-2023.06"}
bucket_name = eessi-staging
+# settings for signing artefacts with JSON-like format
+# REPO_ID: { "script": PATH_TO_SIGN_SCRIPT, "key": PATH_TO_KEY_FILE, "container_runtime": PATH_TO_CONTAINER_RUNTIME }
+# If PATH_TO_SIGN_SCRIPT is a relative path, the script must reside in the
+# checked out pull request of the target repository (e.g.,
+# EESSI/software-layer).
+# The bot calls the script with the two arguments:
+# 1. private key (as provided by the attribute 'key')
+# 2. path to the file to be signed (the upload script will determine that)
+# NOTE (on "container_runtime"), signing requires a recent installation of OpenSSH
+# (8.2 or newer). If the frontend where the event handler runs does not have that
+# version installed, you can specify a container runtime via the 'container_runtime'
+# attribute below. Currently, only Singularity or Apptainer are supported.
+# NOTE (on the key), make sure the file permissions are restricted to `0600` (only
+# readable+writable by the file owner), or the signing will likely fail.
+# Note (on json format), make sure no trailing commas are used after any elements
+# or parsing/loading the json will likely fail. Also, the whole value should start
+# at a new line and be indented as shown below.
+signing =
+ {
+ "eessi.io-2023.06-software: {
+ "script": PATH_TO_SIGN_SCRIPT,
+ "key": PATH_TO_EESSI_BOT/config/user-site-system.key,
+ "container_runtime": PATH_TO_CONTAINER_RUNTIME
+ }
+ }
# upload policy: defines what policy is used for uploading built artefacts
# to an S3 bucket
# 'all' ..: upload all artefacts (mulitple uploads of the same artefact possible)
@@ -244,12 +310,14 @@ scontrol_command = /usr/bin/scontrol
# information.
[submitted_job_comments]
awaits_release = job id `{job_id}` awaits release by job manager
+awaits_release_delayed_begin_msg = job id `{job_id}` will be eligible to start in about {delay_seconds} seconds
+awaits_release_hold_release_msg = job id `{job_id}` awaits release by job manager
initial_comment = New job on instance `{app_name}` for CPU micro-architecture `{arch_name}`{accelerator_spec} for repository `{repo_id}` in job dir `{symlink}`
with_accelerator = and accelerator `{accelerator}`
[new_job_comments]
-awaits_launch = job awaits launch by Slurm scheduler
+awaits_launch = job awaits launch by Slurm scheduler{extra_info}
[running_job_comments]
running_job = job `{job_id}` is running
diff --git a/eessi_bot_event_handler.py b/eessi_bot_event_handler.py
index 00a1db81..5895fbfb 100644
--- a/eessi_bot_event_handler.py
+++ b/eessi_bot_event_handler.py
@@ -56,6 +56,8 @@
# config.BUILDENV_SETTING_CVMFS_CUSTOMIZATIONS, # optional
# config.BUILDENV_SETTING_HTTPS_PROXY, # optional
# config.BUILDENV_SETTING_HTTP_PROXY, # optional
+ # config.BUILDENV_SETTING_JOB_DELAY_BEGIN_FACTOR, # optional (default: 2)
+ config.BUILDENV_SETTING_JOB_HANDOVER_PROTOCOL, # required
config.BUILDENV_SETTING_JOB_NAME, # required
config.BUILDENV_SETTING_JOBS_BASE_DIR, # required
# config.BUILDENV_SETTING_LOAD_MODULES, # optional
@@ -75,6 +77,7 @@
# config.DEPLOYCFG_SETTING_ENDPOINT_URL, # optional
config.DEPLOYCFG_SETTING_METADATA_PREFIX, # (required)
config.DEPLOYCFG_SETTING_NO_DEPLOY_PERMISSION_COMMENT, # required
+ # config.DEPLOYCFG_SETTING_SIGNING, # optional
config.DEPLOYCFG_SETTING_UPLOAD_POLICY], # required
config.SECTION_DOWNLOAD_PR_COMMENTS: [
config.DOWNLOAD_PR_COMMENTS_SETTING_CURL_FAILURE, # required
@@ -92,12 +95,18 @@
config.GITHUB_SETTING_APP_NAME, # required
config.GITHUB_SETTING_INSTALLATION_ID, # required
config.GITHUB_SETTING_PRIVATE_KEY], # required
+ # the poll interval setting is required for the alternative job handover
+ # protocol (delayed_begin)
+ config.SECTION_JOB_MANAGER: [
+ config.JOB_MANAGER_SETTING_POLL_INTERVAL], # required
config.SECTION_REPO_TARGETS: [
config.REPO_TARGETS_SETTING_REPO_TARGET_MAP, # required
config.REPO_TARGETS_SETTING_REPOS_CFG_DIR], # required
config.SECTION_SUBMITTED_JOB_COMMENTS: [
config.SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT, # required
- config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE, # required
+ # config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE, # optional
+ config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_DELAYED_BEGIN_MSG, # required
+ config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_HOLD_RELEASE_MSG, # required
config.SUBMITTED_JOB_COMMENTS_SETTING_WITH_ACCELERATOR], # required
}
@@ -636,33 +645,37 @@ def handle_pull_request_closed_event(self, event_info, pr):
self.log(f"PR {pr.number}: determining directories to be moved to trash bin")
job_dirs = determine_job_dirs(pr.number)
- # 2) Get trash_bin_dir from configs
- trash_bin_root_dir = self.cfg[config.SECTION_CLEAN_UP][config.CLEAN_UP_SETTING_TRASH_BIN_ROOT_DIR]
-
- repo_name = request_body['repository']['full_name']
- dt_start = datetime.now(timezone.utc)
- trash_bin_dir = "/".join([trash_bin_root_dir, repo_name, dt_start.strftime('%Y.%m.%d')])
-
- # Subdirectory with date of move. Also with repository name. Handle symbolic links (later?)
- # cron job deletes symlinks?
-
- # 3) move the directories to the trash_bin
- self.log(f"PR {pr.number}: moving directories to trash bin {trash_bin_dir}")
- move_to_trash_bin(trash_bin_dir, job_dirs)
- dt_end = datetime.now(timezone.utc)
- dt_delta = dt_end - dt_start
- seconds_elapsed = dt_delta.days * 24 * 3600 + dt_delta.seconds
- self.log(f"PR {pr.number}: moved directories to trash bin {trash_bin_dir} (took {seconds_elapsed} seconds)")
+ if job_dirs == []:
+ self.log(f"PR {pr.number}: No job directories found; nothing to move.")
+ else:
+ # 2) Get trash_bin_dir from configs
+ trash_bin_root_dir = self.cfg[config.SECTION_CLEAN_UP][config.CLEAN_UP_SETTING_TRASH_BIN_ROOT_DIR]
- # 4) report move to pull request
- repo_name = pr.base.repo.full_name
- gh = github.get_instance()
- repo = gh.get_repo(repo_name)
- pull_request = repo.get_pull(pr.number)
- clean_up_comment = self.cfg[config.SECTION_CLEAN_UP][config.CLEAN_UP_SETTING_MOVED_JOB_DIRS_COMMENT]
- moved_comment = clean_up_comment.format(job_dirs=job_dirs, trash_bin_dir=trash_bin_dir)
- issue_comment = pull_request.create_issue_comment(moved_comment)
- return issue_comment
+ repo_name = request_body['repository']['full_name']
+ dt_start = datetime.now(timezone.utc)
+ trash_bin_dir = "/".join([trash_bin_root_dir, repo_name, dt_start.strftime('%Y.%m.%d')])
+
+ # Subdirectory with date of move. Also with repository name. Handle symbolic links (later?)
+ # cron job deletes symlinks?
+
+ # 3) move the directories to the trash_bin
+ self.log(f"PR {pr.number}: moving directories to trash bin {trash_bin_dir}")
+ move_to_trash_bin(trash_bin_dir, job_dirs)
+ dt_end = datetime.now(timezone.utc)
+ dt_delta = dt_end - dt_start
+ seconds_elapsed = dt_delta.days * 24 * 3600 + dt_delta.seconds
+ self.log(f"PR {pr.number}: moved directories to trash bin {trash_bin_dir} (took {seconds_elapsed} seconds)")
+
+ # 4) report move to pull request
+
+ repo_name = pr.base.repo.full_name
+ gh = github.get_instance()
+ repo = gh.get_repo(repo_name)
+ pull_request = repo.get_pull(pr.number)
+ clean_up_comment = self.cfg[config.SECTION_CLEAN_UP][config.CLEAN_UP_SETTING_MOVED_JOB_DIRS_COMMENT]
+ moved_comment = clean_up_comment.format(job_dirs=job_dirs, trash_bin_dir=trash_bin_dir)
+ issue_comment = pull_request.create_issue_comment(moved_comment)
+ return issue_comment
def main():
diff --git a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py
index bb0c6dd8..4fcf9af3 100644
--- a/eessi_bot_job_manager.py
+++ b/eessi_bot_job_manager.py
@@ -51,6 +51,7 @@
# settings that are required in 'app.cfg'
REQUIRED_CONFIG = {
config.SECTION_BUILDENV: [
+ config.BUILDENV_SETTING_JOB_HANDOVER_PROTOCOL, # required
config.BUILDENV_SETTING_JOB_NAME], # required
config.SECTION_FINISHED_JOB_COMMENTS: [
config.FINISHED_JOB_COMMENTS_SETTING_JOB_RESULT_UNKNOWN_FMT, # required
@@ -91,6 +92,9 @@ def __init__(self):
self.job_name = buildenv_cfg.get(config.BUILDENV_SETTING_JOB_NAME)
if self.job_name and len(self.job_name) < 3:
raise Exception(f"job name ({self.job_name}) is shorter than 3 characters")
+ self.job_handover_protocol = buildenv_cfg.get(config.BUILDENV_SETTING_JOB_HANDOVER_PROTOCOL)
+ if self.job_handover_protocol not in config.JOB_HANDOVER_PROTOCOLS_SET:
+ raise Exception(f"job handover protocol ({self.job_handover_protocol}) is unknown")
def get_current_jobs(self):
"""
@@ -256,6 +260,25 @@ def determine_finished_jobs(self, known_jobs, current_jobs):
return finished_jobs
+ def parse_scontrol_show_job_output(self, output):
+ """
+ The output of 'scontrol --oneliner show job' is a list of key=value pairs
+ separated by whitespaces.
+
+ Args:
+ output (string): the output of the scontrol command
+
+ Returns:
+ (dict): Returns a dictionary of the key-value pairs
+ """
+ job_info = {}
+ stripped_output = output.strip()
+ for pair in stripped_output.split():
+ key, value = pair.split('=', 1)
+ job_info[key] = value
+
+ return job_info
+
def process_new_job(self, new_job):
"""
Process a new job by verifying that it is a bot job and if so
@@ -283,19 +306,20 @@ def process_new_job(self, new_job):
log_file=self.logfile,
)
- # parse output of 'scontrol_cmd' to determine the job's working
- # directory
- match = re.search(r".* WorkDir=(\S+) .*",
- str(scontrol_output))
- if match:
+ # parse output of 'scontrol_cmd'
+ job_info = self.parse_scontrol_show_job_output(str(scontrol_output))
+
+ # check if job_info contains 'WorkDir', if not we cannot process the job
+ # further
+ if 'WorkDir' in job_info:
log(
"process_new_job(): work dir of job %s: '%s'"
- % (job_id, match.group(1)),
+ % (job_id, job_info['WorkDir']),
self.logfile,
)
job_metadata_path = "%s/_bot_job%s.metadata" % (
- match.group(1),
+ job_info['WorkDir'],
job_id,
)
@@ -313,21 +337,34 @@ def process_new_job(self, new_job):
symlink_source = os.path.join(self.submitted_jobs_dir, job_id)
log(
"process_new_job(): create a symlink: %s -> %s"
- % (symlink_source, match.group(1)),
+ % (symlink_source, job_info['WorkDir']),
self.logfile,
)
- os.symlink(match.group(1), symlink_source)
-
- release_cmd = "%s release %s" % (
- self.scontrol_command,
- job_id,
- )
+ os.symlink(job_info['WorkDir'], symlink_source)
+
+ # handle different job handover protocols
+ # *_HOLD_RELEASE: job was submitted with '--hold' and shall be
+ # released with 'scontrol release JOB_ID'
+ # *_DELAYED_BEGIN: job was submitted with '--begin=now+SOMEDELAY',
+ # no extra action is needed
+ job_status = ''
+ extra_info = ''
+ if self.job_handover_protocol == config.JOB_HANDOVER_PROTOCOL_HOLD_RELEASE:
+ release_cmd = "%s release %s" % (
+ self.scontrol_command,
+ job_id,
+ )
- release_output, release_err, release_exitcode = run_cmd(
- release_cmd,
- "process_new_job(): scontrol command",
- log_file=self.logfile,
- )
+ release_output, release_err, release_exitcode = run_cmd(
+ release_cmd,
+ "process_new_job(): scontrol command",
+ log_file=self.logfile,
+ )
+ job_status = 'released'
+ extra_info = ''
+ elif self.job_handover_protocol == config.JOB_HANDOVER_PROTOCOL_DELAYED_BEGIN:
+ job_status = 'received'
+ extra_info = f" (eligible to start from {job_info['EligibleTime']})"
# update PR defined by repo and pr_number stored in the job's
# metadata file
@@ -356,8 +393,9 @@ def process_new_job(self, new_job):
if "comment_id" in new_job:
new_job_comments_cfg = config.read_config()[config.SECTION_NEW_JOB_COMMENTS]
dt = datetime.now(timezone.utc)
- update = "\n|%s|released|" % dt.strftime("%b %d %X %Z %Y")
- update += f"{new_job_comments_cfg[config.NEW_JOB_COMMENTS_SETTING_AWAITS_LAUNCH]}|"
+ update = "\n|%s|%s|" % (dt.strftime("%b %d %X %Z %Y"), job_status)
+ description_col_fmt = new_job_comments_cfg[config.NEW_JOB_COMMENTS_SETTING_AWAITS_LAUNCH]
+ update += f"{description_col_fmt.format(extra_info=extra_info)}|"
update_comment(new_job["comment_id"], pr, update)
else:
log(
diff --git a/scripts/bot-build.slurm b/scripts/bot-build.slurm
index bb0faa91..593bd158 100755
--- a/scripts/bot-build.slurm
+++ b/scripts/bot-build.slurm
@@ -10,6 +10,7 @@
#
# author: Kenneth Hoste (@boegel)
# author: Thomas Roeblitz (@trz42)
+# author: Sam Moors (@smoors)
#
# license: GPLv2
#
@@ -22,7 +23,70 @@
# - the directory may contain any additional files references in job.cfg,
# for example, repos.cfg and configuration file bundles for repositories
+# set default for SKIP_TESTS (don't skip ReFrame tests)
+SKIP_TESTS=no
+
echo "Starting bot-build.slurm"
+EXPORT_VARS_SCRIPT=cfg/export_vars.sh
+if [ -f ${EXPORT_VARS_SCRIPT} ]; then
+ echo "${EXPORT_VARS_SCRIPT} script found in '${PWD}', so sourcing it!"
+ source ${EXPORT_VARS_SCRIPT}
+ echo "$EXPORT_VARS_SCRIPT finished"
+else
+ echo "could not find ${EXPORT_VARS_SCRIPT} script in '${PWD}', skipping" >&2
+fi
+
+# First, read if there is a local_tmp or a site_config_script defined in
+# the site_config section of cfg/job.cfg
+# - local_tmp is used to define what location/storage folder is used as a base
+# for creating temporary directories
+# - site_config_script points to a script that is used to customize local
+# settings for build jobs; if the script exists, it is sourced
+JOB_CFG=cfg/job.cfg
+inside_site_config=false
+local_tmp_value=""
+site_config_script_value=""
+while IFS= read -r line; do
+ # Check if we've reached [site_config]
+ if [[ $line =~ ^\[site_config\]$ ]]; then
+ inside_site_config=true
+ continue
+ fi
+
+ # If another section starts and we haven't found local_tmp or
+ # site_config_script, don't try to match
+ if [[ $line =~ ^\[.*\]$ && $inside_site_config == true ]]; then
+ inside_site_config=false
+ fi
+
+ # Extract 'local_tmp' or 'site_config_script' when inside [site_config]
+ # and leave while loop when both are found
+ if $inside_site_config && [[ $line =~ ^local_tmp\ *=\ *([^[:space:]]+) ]]; then
+ local_tmp_value="${BASH_REMATCH[1]}"
+ fi
+ if $inside_site_config && [[ $line =~ ^site_config_script\ *=\ *([^[:space:]]+) ]]; then
+ site_config_script_value="${BASH_REMATCH[1]}"
+ fi
+ if [[ -n "$local_tmp_value" ]] && [[ -n "$site_config_script_value" ]]; then
+ break
+ fi
+done < "$JOB_CFG"
+if [[ -n "${local_tmp_value}" ]]; then
+ local_tmp_value=$(envsubst <<< ${local_tmp_value})
+ # Ensure dir exists before calling mktemp
+ mkdir -p ${local_tmp_value}
+ local_tmp_value=$(mktemp -d --tmpdir=${local_tmp_value} eessi_job.XXXXXXXXXX)
+ echo "Overwriting current TMPDIR '$TMPDIR' with the value '${local_tmp_value}', as configured in cfg/job.cfg"
+ export TMPDIR="${local_tmp_value}"
+fi
+
+if [[ -n "${site_config_script_value}" ]] && [[ -r ${site_config_script_value} ]]; then
+ echo "Sourcing site config script '${site_config_script_value}'"
+ source "${site_config_script_value}"
+else
+ echo "Site config script defined as '${site_config_script_value}' does not exist; ignoring it"
+fi
+
BOT_BUILD_SCRIPT=bot/build.sh
if [ -f ${BOT_BUILD_SCRIPT} ]; then
echo "${BOT_BUILD_SCRIPT} script found in '${PWD}', so running it!"
@@ -47,14 +111,19 @@ artefacts =
EOF
fi
echo "check build step finished"
-TEST_SCRIPT=bot/test.sh
-if [ -f ${TEST_SCRIPT} ]; then
- echo "${TEST_SCRIPT} script found in '${PWD}', so running it!"
- ${TEST_SCRIPT}
- echo "${TEST_SCRIPT} finished"
-else
- echo "could not find ${TEST_SCRIPT} script in '${PWD}'" >&2
+
+# SKIP_TESTS can be defined as export variable in the bot's config and then added to bot commands (export:SKIP_TESTS=yes)
+if [[ "${SKIP_TESTS}" != "yes" ]]; then
+ TEST_SCRIPT=bot/test.sh
+ if [ -f ${TEST_SCRIPT} ]; then
+ echo "${TEST_SCRIPT} script found in '${PWD}', so running it!"
+ ${TEST_SCRIPT}
+ echo "${TEST_SCRIPT} finished"
+ else
+ echo "could not find ${TEST_SCRIPT} script in '${PWD}'" >&2
+ fi
fi
+
CHECK_TEST_SCRIPT=bot/check-test.sh
if [ -f ${CHECK_TEST_SCRIPT} ]; then
echo "${CHECK_TEST_SCRIPT} script found in '${PWD}', so running it!"
diff --git a/scripts/eessi-upload-to-staging b/scripts/eessi-upload-to-staging
index b5e4482d..25fd9675 100755
--- a/scripts/eessi-upload-to-staging
+++ b/scripts/eessi-upload-to-staging
@@ -83,6 +83,9 @@ function display_help
echo " ingestion procedure" >&2
echo " -l | --list-variables - list variables that are available" >&2
echo " for expansion" >&2
+ echo " -k | --sign-key SCRIPT_KEY - specify location of the key to be" >&2
+ echo " used to sign artefacts and metadata" >&2
+ echo " files [optional; default: don't sign]" >&2
echo " -m | --metadata-prefix PREFIX - a directory to which the metadata" >&2
echo " file shall be uploaded; BASH variable" >&2
echo " expansion will be applied; arg '-l'" >&2
@@ -93,6 +96,13 @@ function display_help
echo " link the upload to a PR" >&2
echo " -r | --repository FULL_NAME - a repository name ACCOUNT/REPONAME;" >&2
echo " used to link the upload to a PR" >&2
+ echo " -s | --sign-script SCRIPT_PATH - path to script that is used to sign" >&2
+ echo " artefacts and metadata files. The" >&2
+ echo " script is called with two arguments:" >&2
+ echo " KEY file_to_sign. The KEY is the one" >&2
+ echo " provided via option --sign-key. The" >&2
+ echo " latter is determined by this script." >&2
+ echo " [optional; default: don't sign]" >&2
}
if [[ $# -lt 1 ]]; then
@@ -120,6 +130,8 @@ endpoint_url=
pr_comment_id="none"
pull_request_number="none"
github_repository="EESSI/software-layer"
+sign_key=
+sign_script=
# provided via options in the bot's config file app.cfg and/or command line argument
metadata_prefix=
@@ -155,6 +167,14 @@ while [[ $# -gt 0 ]]; do
pr_comment_id="$2"
shift 2
;;
+ -k|--sign-key)
+ sign_key=$2
+ if [[ ! -r "${sign_key}" ]]; then
+ echo "Error: SSH key '${sign_key}' to be used for signing doesn't exist or cannot be read" >&2
+ exit 1
+ fi
+ shift 2
+ ;;
-m|--metadata-prefix)
metadata_prefix="$2"
shift 2
@@ -171,6 +191,14 @@ while [[ $# -gt 0 ]]; do
github_repository="$2"
shift 2
;;
+ -s|--sign-script)
+ sign_script=$2
+ if [[ ! -x "${sign_script}" ]]; then
+ echo "Error: Script '${sign_script}' to be used for signing doesn't exist or is not executable" >&2
+ exit 1
+ fi
+ shift 2
+ ;;
-*|--*)
echo "Error: Unknown option: $1" >&2
exit 1
@@ -185,6 +213,21 @@ done
# restore potentially parsed filename(s) into $*
set -- "${POSITIONAL_ARGS[@]}"
+# ensure that either none or both of $sign_key and $sign_script are defined
+if [[ -n "${sign_key}" ]] && [[ -n "${sign_script}" ]]; then
+ sign=1
+elif [[ -n "${sign_key}" ]]; then
+ sign=0
+ echo "Error: Signing requires a key (${sign_key}) AND a script (${sign_script}); likely the bot config is incomplete" >&2
+ exit 1
+elif [[ -n "${sign_script}" ]]; then
+ sign=0
+ echo "Error: Signing requires a key (${sign_key}) AND a script (${sign_script}); likely the bot config is incomplete" >&2
+ exit 1
+else
+ sign=0
+fi
+
# infer bucket_base:
# if endpoint_url is not set (assume AWS S3 is used),
# bucket_base=https://${bucket_name}.s3.amazonaws.com/
@@ -217,6 +260,33 @@ for file in "$*"; do
aws_path=$(envsubst <<< "${artefact_prefix}")
fi
aws_file=$(basename ${file})
+ # 1st sign artefact, and upload signature
+ if [[ "${sign}" = "1" ]]; then
+ # sign artefact
+ ${sign_script} sign ${sign_key} ${file}
+ # TODO check if signing worked (just check exit code == 0)
+ sig_file=${file}.sig
+ aws_sig_file=${aws_file}.sig
+
+ # uploading signature
+ echo " store artefact signature at ${aws_path}/${aws_sig_file}"
+ upload_to_staging_bucket \
+ "${sig_file}" \
+ "${bucket_name}" \
+ "${aws_path}/${aws_sig_file}" \
+ "${endpoint_url}"
+ else
+ echo "no signing method defined; not signing artefact"
+ fi
+
+ echo Uploading to "${url}"
+ echo " store artefact at ${aws_path}/${aws_file}"
+ upload_to_staging_bucket \
+ "${file}" \
+ "${bucket_name}" \
+ "${aws_path}/${aws_file}" \
+ "${endpoint_url}"
+
echo "Creating metadata file"
url="${bucket_base}/${aws_path}/${aws_file}"
echo "create_metadata_file file=${file} \
@@ -229,17 +299,11 @@ for file in "$*"; do
"${github_repository}" \
"${pull_request_number}" \
"${pr_comment_id}")
+ aws_metadata_file=${aws_file}.meta.txt
+ # TODO check that creating the metadata file succeeded
echo "metadata:"
cat ${metadata_file}
- echo Uploading to "${url}"
- echo " store artefact at ${aws_path}/${aws_file}"
- upload_to_staging_bucket \
- "${file}" \
- "${bucket_name}" \
- "${aws_path}/${aws_file}" \
- "${endpoint_url}"
-
if [ -z ${metadata_prefix} ]; then
aws_path=${legacy_aws_path}
else
@@ -247,6 +311,23 @@ for file in "$*"; do
export github_repository
aws_path=$(envsubst <<< "${metadata_prefix}")
fi
+ # 2nd sign metadata file, and upload signature
+ if [[ "${sign}" = "1" ]]; then
+ # sign metadata file
+ ${sign_script} sign ${sign_key} ${metadata_file}
+ # TODO check if signing worked (just check exit code == 0)
+ sig_metadata_file=${metadata_file}.sig
+ aws_sig_metadata_file=${aws_metadata_file}.sig
+
+ echo " store metadata signature at ${aws_path}/${aws_sig_metadata_file}"
+ upload_to_staging_bucket \
+ "${sig_metadata_file}" \
+ "${bucket_name}" \
+ "${aws_path}/${aws_sig_metadata_file}" \
+ "${endpoint_url}"
+ else
+ echo "no signing method defined; not signing metadata file"
+ fi
echo " store metadata file at ${aws_path}/${aws_file}.meta.txt"
upload_to_staging_bucket \
"${metadata_file}" \
diff --git a/scripts/sign_verify_file_ssh.sh b/scripts/sign_verify_file_ssh.sh
new file mode 100755
index 00000000..679ea7d6
--- /dev/null
+++ b/scripts/sign_verify_file_ssh.sh
@@ -0,0 +1,156 @@
+#!/bin/bash
+#
+# SSH Signature Signing and Verification Script
+# - Sign a file using an SSH private key.
+# - Verify a signed file using an allowed signers file.
+#
+# Generates a signature file named `<file_to_sign>.sig` in the same directory.
+#
+# Author: Alan O'Cais
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+# Usage message
+usage() {
+ cat <<EOF
+Usage:
+  $0 sign <private_key> <file_to_sign>
+  $0 verify <allowed_signers_file> <file_to_verify> [signature_file]
+
+Options:
+  sign:
+    - <private_key>: Path to SSH private key (use KEY_PASSPHRASE env for passphrase)
+    - <file_to_sign>: File to sign
+
+  verify:
+    - <allowed_signers_file>: Path to the allowed signers file
+    - <file_to_verify>: File to verify
+    - [signature_file]: Optional, defaults to '<file_to_verify>.sig'
+
+Example allowed signers format:
+  identity_1 <public_key>
+EOF
+ exit 9
+}
+
+# Error codes
+FILE_PROBLEM=1
+CONVERSION_FAILURE=2
+VALIDATION_FAILED=3
+
+# Ensure minimum arguments
+[ "$#" -lt 3 ] && usage
+
+MODE="$1"
+FILE_TO_SIGN="$3"
+
+# Ensure the target file exists
+if [ ! -f "$FILE_TO_SIGN" ]; then
+ echo "Error: File '$FILE_TO_SIGN' not found."
+ exit $FILE_PROBLEM
+fi
+
+# Use a very conservative umask throughout this script since we are dealing with sensitive things
+umask 077 || { echo "Error: Failed to set 077 umask."; exit $FILE_PROBLEM; }
+
+# Create a restricted temporary directory and ensure cleanup on exit
+TEMP_DIR=$(mktemp -d) || { echo "Error: Failed to create temporary directory."; exit $FILE_PROBLEM; }
+trap 'rm -rf "$TEMP_DIR"' EXIT
+
+# Converts the SSH private key to OpenSSH format and generates a public key
+convert_private_key() {
+ local input_key="$1"
+ local output_key="$2"
+
+ echo "Converting SSH key to OpenSSH format..."
+ cp "$input_key" "$output_key" || { echo "Error: Failed to copy $input_key to $output_key"; exit $FILE_PROBLEM; }
+
+ # This saves the key in the default OpenSSH format (which is required for signing)
+ ssh-keygen -p -f "$output_key" -P "${KEY_PASSPHRASE:-}" -N "${KEY_PASSPHRASE:-}" || {
+ echo "Error: Failed to convert key to OpenSSH format."
+ exit $CONVERSION_FAILURE
+ }
+
+ # Extract the public key from the private key
+ ssh-keygen -y -f "$input_key" -P "${KEY_PASSPHRASE:-}" > "${output_key}.pub" || {
+ echo "Error: Failed to extract public key."
+ exit $CONVERSION_FAILURE
+ }
+}
+
+# Sign mode
+if [ "$MODE" == "sign" ]; then
+ PRIVATE_KEY="$2"
+ TEMP_KEY="$TEMP_DIR/converted_key"
+ SIG_FILE="${FILE_TO_SIGN}.sig"
+
+ # Check for key and existing signature
+ [ ! -f "$PRIVATE_KEY" ] && { echo "Error: Private key not found."; exit $FILE_PROBLEM; }
+ [ -f "$SIG_FILE" ] && { echo "Error: Signature already exists. Remove to re-sign."; exit $FILE_PROBLEM; }
+
+ convert_private_key "$PRIVATE_KEY" "$TEMP_KEY"
+
+ echo "Signing the file..."
+ ssh-keygen -Y sign -f "$TEMP_KEY" -P "${KEY_PASSPHRASE:-}" -n file "$FILE_TO_SIGN"
+
+ [ ! -f "$SIG_FILE" ] && { echo "Error: Signing failed."; exit $FILE_PROBLEM; }
+ echo "Signature created: $SIG_FILE"
+
+ cat </cfg. This file will be
+ sourced before running the bot/build.sh script.
+
+ Args:
+ job_dir (string): working directory of the job
+ exportvars (list): strings of the form VAR=VALUE to be exported
+
+ Returns:
+ None (implicitly)
+ """
+ fn = sys._getframe().f_code.co_name
+
+ content = '\n'.join(f'export {x}' for x in exportvars)
+ export_vars_path = os.path.join(job_dir, 'cfg', EXPORT_VARS_FILE)
+
+ with open(export_vars_path, 'w') as file:
+ file.write(content)
+
+ log(f"{fn}(): created exported variables file {export_vars_path}")
+
+
def prepare_jobs(pr, cfg, event_info, action_filter):
"""
Prepare all jobs whose context matches the given filter. Preparation includes
@@ -465,6 +562,7 @@ def prepare_jobs(pr, cfg, event_info, action_filter):
build_env_cfg = get_build_env_cfg(cfg)
arch_map = get_architecture_targets(cfg)
repocfg = get_repo_cfg(cfg)
+ allowed_exportvars = get_allowed_exportvars(cfg)
base_repo_name = pr.base.repo.full_name
log(f"{fn}(): pr.base.repo.full_name '{base_repo_name}'")
@@ -491,6 +589,16 @@ def prepare_jobs(pr, cfg, event_info, action_filter):
log(f"{fn}(): found no accelerator requirement")
accelerator = None
+ # determine exportvars from action_filter argument
+ exportvars = action_filter.get_filter_by_component(tools_filter.FILTER_COMPONENT_EXPORT)
+
+ # all exportvar filters must be allowed in order to run any jobs
+ if exportvars:
+ not_allowed = [x for x in exportvars if x not in allowed_exportvars]
+ if not_allowed:
+ log(f"{fn}(): exportvariable(s) {not_allowed} not allowed")
+ return []
+
jobs = []
for arch, slurm_opt in arch_map.items():
arch_dir = arch.replace('/', '_')
@@ -545,6 +653,9 @@ def prepare_jobs(pr, cfg, event_info, action_filter):
prepare_job_cfg(job_dir, build_env_cfg, repocfg, repo_id, cpu_target, os_type, accelerator)
+ if exportvars:
+ prepare_export_vars_file(job_dir, exportvars)
+
# enlist jobs to proceed
job = Job(job_dir, arch, repo_id, slurm_opt, year_month, pr_id, accelerator)
jobs.append(job)
@@ -580,6 +691,7 @@ def prepare_job_cfg(job_dir, build_env_cfg, repos_cfg, repo_id, software_subdir,
# repository's definition, some combine two values):
# [site_config]
# local_tmp = config.BUILDENV_SETTING_LOCAL_TMP
+ # site_config_script = config.BUILDENV_SETTING_SITE_CONFIG_SCRIPT
# shared_fs_path = config.BUILDENV_SETTING_SHARED_FS_PATH
# build_logs_dir = config.BUILDENV_SETTING_BUILD_LOGS_DIR
#
@@ -604,6 +716,7 @@ def prepare_job_cfg(job_dir, build_env_cfg, repos_cfg, repo_id, software_subdir,
config.BUILDENV_SETTING_LOAD_MODULES: job_metadata.JOB_CFG_SITE_CONFIG_LOAD_MODULES,
config.BUILDENV_SETTING_LOCAL_TMP: job_metadata.JOB_CFG_SITE_CONFIG_LOCAL_TMP,
config.BUILDENV_SETTING_SHARED_FS_PATH: job_metadata.JOB_CFG_SITE_CONFIG_SHARED_FS_PATH,
+ config.BUILDENV_SETTING_SITE_CONFIG_SCRIPT: job_metadata.JOB_CFG_SITE_CONFIG_SITE_CONFIG_SCRIPT,
}
for build_env_key, job_cfg_key in build_env_to_job_cfg_keys.items():
if build_env_cfg[build_env_key]:
@@ -709,13 +822,50 @@ def submit_job(job, cfg):
job = job._replace(slurm_opts=det_submit_opts(job))
log(f"{fn}(): updated job.slurm_opts: {job.slurm_opts}")
+ build_job_script = build_env_cfg[config.BUILDENV_SETTING_BUILD_JOB_SCRIPT]
+ if isinstance(build_job_script, str):
+ build_job_script_path = build_job_script
+ log(f"{fn}(): path to build job script: {build_job_script_path}")
+ elif isinstance(build_job_script, dict):
+ build_job_script_repo = build_job_script.get('repo')
+ if build_job_script_repo:
+ log(f"{fn}(): repository in which build job script is located: {build_job_script_repo}")
+ else:
+ error(f"Failed to determine repository in which build job script is located from: {build_job_script}")
+
+ build_job_script_path = build_job_script.get('path')
+ if build_job_script_path:
+ log(f"{fn}(): path to build job script in repository: {build_job_script_path}")
+ else:
+ error(f"Failed to determine path of build job script in repository from: {build_job_script}")
+
+ # clone repo to temporary directory, and correctly set path to build job script
+ repo_subdir = build_job_script_repo.split('/')[-1]
+ if repo_subdir.endswith('.git'):
+ repo_subdir = repo_subdir[:-4]
+ target_dir = os.path.join(job.working_dir, repo_subdir)
+ os.makedirs(target_dir, exist_ok=True)
+
+ clone_output, clone_error, clone_exit_code = clone_git_repo(build_job_script_repo, target_dir)
+ if clone_exit_code == 0:
+ log(f"{fn}(): repository {build_job_script_repo} cloned to {target_dir}")
+ else:
+ error(f"Failed to clone repository {build_job_script_repo}: {clone_error}")
+
+ build_job_script_path = os.path.join(target_dir, build_job_script_path)
+ else:
+ error(f"Incorrect build job script specification, unknown type: {build_job_script}")
+
+ if not os.path.exists(build_job_script_path):
+ error(f"Build job script not found at {build_job_script_path}")
+
command_line = ' '.join([
build_env_cfg[config.BUILDENV_SETTING_SUBMIT_COMMAND],
build_env_cfg[config.BUILDENV_SETTING_SLURM_PARAMS],
time_limit,
job.slurm_opts] +
([f"--job-name='{job_name}'"] if job_name else []) +
- [build_env_cfg[config.BUILDENV_SETTING_BUILD_JOB_SCRIPT]])
+ [build_job_script_path])
cmdline_output, cmdline_error, cmdline_exit_code = run_cmd(command_line,
"submit job for target '%s'" % job.arch_target,
@@ -770,18 +920,44 @@ def create_pr_comment(job, job_id, app_name, pr, gh, symlink):
dt = datetime.now(timezone.utc)
# construct initial job comment
- job_comment = (f"{submitted_job_comments_cfg[config.SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT]}"
- f"\n|date|job status|comment|\n"
- f"|----------|----------|------------------------|\n"
- f"|{dt.strftime('%b %d %X %Z %Y')}|"
- f"submitted|"
- f"{submitted_job_comments_cfg[config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE]}|").format(
- app_name=app_name,
- arch_name=arch_name,
- symlink=symlink,
- repo_id=job.repo_id,
- job_id=job_id,
- accelerator_spec=accelerator_spec_str)
+ buildenv = config.read_config()[config.SECTION_BUILDENV]
+ job_handover_protocol = buildenv.get(config.BUILDENV_SETTING_JOB_HANDOVER_PROTOCOL)
+ if job_handover_protocol == config.JOB_HANDOVER_PROTOCOL_DELAYED_BEGIN:
+ release_msg_string = config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_DELAYED_BEGIN_MSG
+ release_comment_template = submitted_job_comments_cfg[release_msg_string]
+ # calculate delay from poll_interval and delay_factor
+ job_manager_cfg = config.read_config()[config.SECTION_JOB_MANAGER]
+ poll_interval = int(job_manager_cfg.get(config.JOB_MANAGER_SETTING_POLL_INTERVAL))
+ delay_factor = float(buildenv.get(config.BUILDENV_SETTING_JOB_DELAY_BEGIN_FACTOR, 2))
+ eligible_in_seconds = int(poll_interval * delay_factor)
+ job_comment = (f"{submitted_job_comments_cfg[config.SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT]}"
+ f"\n|date|job status|comment|\n"
+ f"|----------|----------|------------------------|\n"
+ f"|{dt.strftime('%b %d %X %Z %Y')}|"
+ f"submitted|"
+ f"{release_comment_template}|").format(
+ app_name=app_name,
+ arch_name=arch_name,
+ symlink=symlink,
+ repo_id=job.repo_id,
+ job_id=job_id,
+ delay_seconds=eligible_in_seconds,
+ accelerator_spec=accelerator_spec_str)
+ else:
+ release_msg_string = config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_HOLD_RELEASE_MSG
+ release_comment_template = submitted_job_comments_cfg[release_msg_string]
+ job_comment = (f"{submitted_job_comments_cfg[config.SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT]}"
+ f"\n|date|job status|comment|\n"
+ f"|----------|----------|------------------------|\n"
+ f"|{dt.strftime('%b %d %X %Z %Y')}|"
+ f"submitted|"
+ f"{release_comment_template}|").format(
+ app_name=app_name,
+ arch_name=arch_name,
+ symlink=symlink,
+ repo_id=job.repo_id,
+ job_id=job_id,
+ accelerator_spec=accelerator_spec_str)
# create comment to pull request
repo_name = pr.base.repo.full_name
diff --git a/tasks/deploy.py b/tasks/deploy.py
index 32e7705f..2d36d24e 100644
--- a/tasks/deploy.py
+++ b/tasks/deploy.py
@@ -265,6 +265,12 @@ def upload_artefact(job_dir, payload, timestamp, repo_name, pr_number, pr_commen
bucket_spec = deploycfg.get(config.DEPLOYCFG_SETTING_BUCKET_NAME)
metadata_prefix = deploycfg.get(config.DEPLOYCFG_SETTING_METADATA_PREFIX)
artefact_prefix = deploycfg.get(config.DEPLOYCFG_SETTING_ARTEFACT_PREFIX)
+ signing_str = deploycfg.get(config.DEPLOYCFG_SETTING_SIGNING) or ''
+ try:
+ signing = json.loads(signing_str)
+ except json.decoder.JSONDecodeError:
+ signing = {}
+ log(f"{funcname}(): error initialising signing from ({signing_str})")
# if bucket_spec value looks like a dict, try parsing it as such
if bucket_spec.lstrip().startswith('{'):
@@ -334,11 +340,29 @@ def upload_artefact(job_dir, payload, timestamp, repo_name, pr_number, pr_commen
return
# run 'eessi-upload-to-staging {abs_path}'
+ # (1) construct command line
+ # (2) setup container environment (for signing artefacts ...) if needed
+ # (3) run command
+
# (1) construct command line
# script assumes a few defaults:
# bucket_name = 'eessi-staging'
# if endpoint_url not set use EESSI S3 bucket
- # (2) run command
+ do_signing = signing and target_repo_id in signing
+ sign_args = []
+ if do_signing:
+ sign_key_str = signing[target_repo_id][config.DEPLOYCFG_SETTING_SIGNING_KEY]
+ sign_key_path = os.path.abspath(sign_key_str)
+ sign_args.extend(['--sign-key', sign_key_path])
+ sign_script_str = signing[target_repo_id][config.DEPLOYCFG_SETTING_SIGNING_SCRIPT]
+ # if the script path does not begin with '/', assume its location is relative to the job directory
+ # (that's because the script is provided by the target repository)
+ if sign_script_str.startswith('/'):
+ sign_script_path = os.path.abspath(sign_script_str)
+ else:
+ sign_script_path = os.path.abspath(os.path.join(job_dir, sign_script_str))
+ sign_args.extend(['--sign-script', sign_script_path])
+
cmd_args = [artefact_upload_script, ]
if len(artefact_prefix_arg) > 0:
cmd_args.extend(['--artefact-prefix', artefact_prefix_arg])
@@ -351,11 +375,61 @@ def upload_artefact(job_dir, payload, timestamp, repo_name, pr_number, pr_commen
cmd_args.extend(['--pr-comment-id', str(pr_comment_id)])
cmd_args.extend(['--pull-request-number', str(pr_number)])
cmd_args.extend(['--repository', repo_name])
+ cmd_args.extend(sign_args)
cmd_args.append(abs_path)
- upload_cmd = ' '.join(cmd_args)
- # run_cmd does all the logging we might need
- out, err, ec = run_cmd(upload_cmd, 'Upload artefact to S3 bucket', raise_on_error=False)
+ # (2) setup container environment (for signing artefacts ...) if needed
+ # determine container to run (from job.cfg)
+ # determine container cache dir (from job.cfg)
+ # setup directory for temporary container storage (previous_tmp/upload_step)
+ # define miscellaneous args (--home ...)
+ run_in_container = (
+ do_signing and
+ config.DEPLOYCFG_SETTING_SIGNING_CONTAINER_RUNTIME in signing[target_repo_id]
+ )
+ container_cmd = []
+ my_env = {}
+ if run_in_container:
+ container = jobcfg[job_metadata.JOB_CFG_REPOSITORY_SECTION][job_metadata.JOB_CFG_REPOSITORY_CONTAINER]
+ cachedir = jobcfg[job_metadata.JOB_CFG_SITE_CONFIG_SECTION][job_metadata.JOB_CFG_SITE_CONFIG_CONTAINER_CACHEDIR]
+ upload_tmp_dir = os.path.join(job_dir, job_metadata.JOB_CFG_PREVIOUS_TMP, job_metadata.JOB_CFG_UPLOAD_STEP)
+ os.makedirs(upload_tmp_dir, exist_ok=True)
+ container_runtime = signing[target_repo_id][config.DEPLOYCFG_SETTING_SIGNING_CONTAINER_RUNTIME]
+
+ # determine (additional) bind mounts from paths used to call upload script and its arguments
+ # - assumes that all paths begin with '/'
+ bind_mounts = set()
+ # first add parent of job_dir and real path of the parent
+ job_parent_dir = os.path.dirname(job_dir)
+ bind_mounts.add(job_parent_dir)
+ real_job_parent_dir = os.path.realpath(job_parent_dir)
+ if job_parent_dir != real_job_parent_dir:
+ bind_mounts.add(real_job_parent_dir)
+ # now, process all args that begin with '/'
+ for arg in cmd_args:
+ if arg.startswith('/'):
+ arg_dir = os.path.dirname(arg)
+ bind_mounts.add(arg_dir)
+ # also, determine the real path for arg_dir and add it if it's different to arg_dir
+ real_dir = os.path.realpath(arg_dir)
+ if arg_dir != real_dir:
+ bind_mounts.add(real_dir)
+
+ container_cmd = [container_runtime, ]
+ container_cmd.extend(['exec'])
+ # avoid that $HOME 'leaks' in due to system settings
+ container_cmd.extend(['--no-home'])
+ for bind in bind_mounts:
+ container_cmd.extend(['--bind', bind])
+ container_cmd.extend([container])
+ my_env = {
+ 'SINGULARITY_CACHEDIR': cachedir,
+ 'SINGULARITY_TMPDIR': upload_tmp_dir
+ }
+
+ cmd_and_args = ' '.join(container_cmd + cmd_args)
+ log(f"command to launch upload script: {cmd_and_args}")
+ out, err, ec = run_cmd(cmd_and_args, 'Upload artefact to S3 bucket', raise_on_error=False, env=my_env)
if ec == 0:
# add file to 'job_dir/../uploaded.txt'
diff --git a/tests/test_app.cfg b/tests/test_app.cfg
index 43e11bf9..84161ba0 100644
--- a/tests/test_app.cfg
+++ b/tests/test_app.cfg
@@ -12,12 +12,15 @@
# sample config file for tests (some functions run config.read_config()
# which reads app.cfg by default)
[buildenv]
+job_handover_protocol = hold_release
[job_manager]
# variable 'comment' under 'submitted_job_comments' should not be changed as there are regular expression patterns matching it
[submitted_job_comments]
awaits_release = job id `{job_id}` awaits release by job manager
+awaits_release_delayed_begin_msg = job id `{job_id}` will be eligible to start in about {delay_seconds} seconds
+awaits_release_hold_release_msg = job id `{job_id}` awaits release by job manager
initial_comment = New job on instance `{app_name}` for CPU micro-architecture `{arch_name}`{accelerator_spec} for repository `{repo_id}` in job dir `{symlink}`
with_accelerator = and accelerator `{accelerator}`
diff --git a/tools/__init__.py b/tools/__init__.py
index 640cae17..0e7d2028 100644
--- a/tools/__init__.py
+++ b/tools/__init__.py
@@ -23,7 +23,7 @@
# TODO do we really need two functions (run_cmd and run_subprocess) for
# running a command?
-def run_cmd(cmd, log_msg='', working_dir=None, log_file=None, raise_on_error=True):
+def run_cmd(cmd, log_msg='', working_dir=None, log_file=None, raise_on_error=True, env=None):
"""
Runs a command in the shell and raises an error if one occurs.
@@ -33,6 +33,7 @@ def run_cmd(cmd, log_msg='', working_dir=None, log_file=None, raise_on_error=Tru
working_dir (string): location of the job's working directory
log_file (string): path to log file
raise_on_error (bool): if True raise an exception in case of error
+ env (dict): environment settings for running the command
Returns:
tuple of 3 elements containing
@@ -45,7 +46,7 @@ def run_cmd(cmd, log_msg='', working_dir=None, log_file=None, raise_on_error=Tru
raise_on_error is True
"""
# TODO use common method for logging function name in log messages
- stdout, stderr, exit_code = run_subprocess(cmd, log_msg, working_dir, log_file)
+ stdout, stderr, exit_code = run_subprocess(cmd, log_msg, working_dir, log_file, env)
if exit_code != 0:
error_msg = (
@@ -66,7 +67,7 @@ def run_cmd(cmd, log_msg='', working_dir=None, log_file=None, raise_on_error=Tru
return stdout, stderr, exit_code
-def run_subprocess(cmd, log_msg, working_dir, log_file):
+def run_subprocess(cmd, log_msg='', working_dir=None, log_file=None, env=None):
"""
Runs a command in the shell. No error is raised if the command fails.
@@ -75,6 +76,7 @@ def run_subprocess(cmd, log_msg, working_dir, log_file):
log_msg (string): purpose of the command
working_dir (string): location of the job's working directory
log_file (string): path to log file
+ env (dict): environment settings for running the command
Returns:
tuple of 3 elements containing
@@ -91,7 +93,12 @@ def run_subprocess(cmd, log_msg, working_dir, log_file):
else:
log(f"run_subprocess(): Running '{cmd}' in directory '{working_dir}'", log_file=log_file)
+ my_env = os.environ.copy()
+ if env is not None:
+ my_env.update(env)
+
result = subprocess.run(cmd,
+ env=my_env,
cwd=working_dir,
shell=True,
encoding="UTF-8",
diff --git a/tools/config.py b/tools/config.py
index ff641ebb..5d0c6a7e 100644
--- a/tools/config.py
+++ b/tools/config.py
@@ -25,7 +25,7 @@
# Local application imports (anything from EESSI/eessi-bot-software-layer)
from .logging import error
-# define configration constants
+# define configuration constants
# SECTION_sectionname for any section name in app.cfg
# sectionname_SETTING_settingname for any setting with name settingname in
# section sectionname
@@ -37,6 +37,7 @@
BOT_CONTROL_SETTING_COMMAND_RESPONSE_FMT = 'command_response_fmt'
SECTION_BUILDENV = 'buildenv'
+BUILDENV_SETTING_ALLOWED_EXPORTVARS = 'allowed_exportvars'
BUILDENV_SETTING_ALLOW_UPDATE_SUBMIT_OPTS = 'allow_update_submit_opts'
BUILDENV_SETTING_BUILD_JOB_SCRIPT = 'build_job_script'
BUILDENV_SETTING_BUILD_LOGS_DIR = 'build_logs_dir'
@@ -45,12 +46,15 @@
BUILDENV_SETTING_CVMFS_CUSTOMIZATIONS = 'cvmfs_customizations'
BUILDENV_SETTING_HTTPS_PROXY = 'https_proxy'
BUILDENV_SETTING_HTTP_PROXY = 'http_proxy'
+BUILDENV_SETTING_JOB_DELAY_BEGIN_FACTOR = 'job_delay_begin_factor'
+BUILDENV_SETTING_JOB_HANDOVER_PROTOCOL = 'job_handover_protocol'
BUILDENV_SETTING_JOB_NAME = 'job_name'
BUILDENV_SETTING_JOBS_BASE_DIR = 'jobs_base_dir'
BUILDENV_SETTING_LOAD_MODULES = 'load_modules'
BUILDENV_SETTING_LOCAL_TMP = 'local_tmp'
BUILDENV_SETTING_NO_BUILD_PERMISSION_COMMENT = 'no_build_permission_comment'
BUILDENV_SETTING_SHARED_FS_PATH = 'shared_fs_path'
+BUILDENV_SETTING_SITE_CONFIG_SCRIPT = 'site_config_script'
BUILDENV_SETTING_SLURM_PARAMS = 'slurm_params'
BUILDENV_SETTING_SUBMIT_COMMAND = 'submit_command'
@@ -62,6 +66,10 @@
DEPLOYCFG_SETTING_ENDPOINT_URL = 'endpoint_url'
DEPLOYCFG_SETTING_METADATA_PREFIX = 'metadata_prefix'
DEPLOYCFG_SETTING_NO_DEPLOY_PERMISSION_COMMENT = 'no_deploy_permission_comment'
+DEPLOYCFG_SETTING_SIGNING = 'signing'
+DEPLOYCFG_SETTING_SIGNING_CONTAINER_RUNTIME = 'container_runtime'
+DEPLOYCFG_SETTING_SIGNING_KEY = 'key'
+DEPLOYCFG_SETTING_SIGNING_SCRIPT = 'script'
DEPLOYCFG_SETTING_UPLOAD_POLICY = 'upload_policy'
SECTION_DOWNLOAD_PR_COMMENTS = 'download_pr_comments'
@@ -105,7 +113,10 @@
RUNNING_JOB_COMMENTS_SETTING_RUNNING_JOB = 'running_job'
SECTION_SUBMITTED_JOB_COMMENTS = 'submitted_job_comments'
+# SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE is DEPRECATED
SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE = 'awaits_release'
+SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_DELAYED_BEGIN_MSG = 'awaits_release_delayed_begin_msg'
+SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_HOLD_RELEASE_MSG = 'awaits_release_hold_release_msg'
SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT = 'initial_comment'
SUBMITTED_JOB_COMMENTS_SETTING_WITH_ACCELERATOR = 'with_accelerator'
@@ -113,6 +124,14 @@
CLEAN_UP_SETTING_TRASH_BIN_ROOT_DIR = 'trash_bin_dir'
CLEAN_UP_SETTING_MOVED_JOB_DIRS_COMMENT = 'moved_job_dirs_comment'
+# definitions of the allowed values for the job_handover_protocol setting
+JOB_HANDOVER_PROTOCOL_DELAYED_BEGIN = 'delayed_begin'
+JOB_HANDOVER_PROTOCOL_HOLD_RELEASE = 'hold_release'
+JOB_HANDOVER_PROTOCOLS_SET = {
+ JOB_HANDOVER_PROTOCOL_DELAYED_BEGIN,
+ JOB_HANDOVER_PROTOCOL_HOLD_RELEASE
+}
+
def read_config(path='app.cfg'):
"""
diff --git a/tools/filter.py b/tools/filter.py
index 8b14f5eb..0caa2af8 100644
--- a/tools/filter.py
+++ b/tools/filter.py
@@ -20,16 +20,18 @@
# (none yet)
-# NOTE because one can use any prefix of one of the four components below to
+# NOTE because one can use any prefix of one of the components below to
# define a filter, we need to make sure that no two filters share the same
# prefix OR we have to change the handling of filters.
FILTER_COMPONENT_ACCEL = 'accelerator'
FILTER_COMPONENT_ARCH = 'architecture'
+FILTER_COMPONENT_EXPORT = 'exportvariable'
FILTER_COMPONENT_INST = 'instance'
FILTER_COMPONENT_JOB = 'job'
FILTER_COMPONENT_REPO = 'repository'
FILTER_COMPONENTS = [FILTER_COMPONENT_ACCEL,
FILTER_COMPONENT_ARCH,
+ FILTER_COMPONENT_EXPORT,
FILTER_COMPONENT_INST,
FILTER_COMPONENT_JOB,
FILTER_COMPONENT_REPO
diff --git a/tools/job_metadata.py b/tools/job_metadata.py
index d4000199..7b7b8d0a 100644
--- a/tools/job_metadata.py
+++ b/tools/job_metadata.py
@@ -29,6 +29,10 @@
JOB_CFG_DIRECTORY_NAME = "cfg"
JOB_CFG_FILENAME = "job.cfg"
+# job previous_tmp directory and subdirectories
+JOB_CFG_PREVIOUS_TMP = "previous_tmp"
+JOB_CFG_UPLOAD_STEP = "upload_step"
+
# JWD/cfg/$JOB_CFG_FILENAME
JOB_CFG_ARCHITECTURE_SECTION = "architecture"
JOB_CFG_ARCHITECTURE_OS_TYPE = "os_type"
@@ -50,6 +54,7 @@
JOB_CFG_SITE_CONFIG_LOAD_MODULES = "load_modules"
JOB_CFG_SITE_CONFIG_LOCAL_TMP = "local_tmp"
JOB_CFG_SITE_CONFIG_SHARED_FS_PATH = "shared_fs_path"
+JOB_CFG_SITE_CONFIG_SITE_CONFIG_SCRIPT = "site_config_script"
# JWD/_bot_jobJOBID.metadata
JOB_PR_SECTION = "PR"