diff --git a/.github/workflows/tests_scripts.yml b/.github/workflows/tests_scripts.yml new file mode 100644 index 00000000..1f8d012b --- /dev/null +++ b/.github/workflows/tests_scripts.yml @@ -0,0 +1,39 @@ +# documentation: https://help.github.com/en/articles/workflow-syntax-for-github-actions +name: Tests for scripts +on: + push: + paths: + - scripts/sign_verify_file_ssh.sh + pull_request: + paths: + - scripts/sign_verify_file_ssh.sh +permissions: + contents: read # to fetch code (actions/checkout) +jobs: + build: + runs-on: ubuntu-24.04 + steps: + - name: checkout + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + + - name: test sign_verify_file_ssh.sh script + run: | + # Create a PEM format ssh identity + ssh-keygen -t rsa -b 4096 -m PEM -f id_rsa.pem -N "" + # Create a file to sign + echo "Very important stuff" > out.txt + export FILE_TO_SIGN="out.txt" + # Sign the file + ./scripts/sign_verify_file_ssh.sh sign id_rsa.pem "$FILE_TO_SIGN" + # Create an allowed_signers file based on the public key + echo -n "allowed_identity " > allowed_signers + cat id_rsa.pem.pub >> allowed_signers + # Verify the signature + ./scripts/sign_verify_file_ssh.sh verify allowed_signers "$FILE_TO_SIGN" + # Make a new signature that does not appear in the allowed signers file + ssh-keygen -t rsa -b 4096 -m PEM -f id_rsa.alt.pem -N "" + # Replace the allowed signers file + echo -n "disallowed_identity " > allowed_signers + cat id_rsa.alt.pem.pub >> allowed_signers + # Make sure signature checking fails in this case + ./scripts/sign_verify_file_ssh.sh verify allowed_signers "$FILE_TO_SIGN" && exit 1 || echo "Expected failure for unknown identity" diff --git a/README.md b/README.md index 836cc168..77c9cdd1 100644 --- a/README.md +++ b/README.md @@ -375,6 +375,30 @@ package repositories. Typically these settings are set in the prologue of a Slurm job. 
However, when entering the [EESSI compatibility layer](https://www.eessi.io/docs/compatibility_layer), most environment settings are cleared. Hence, they need to be set again at a later stage. +``` +job_delay_begin_factor = 2 +``` +The `job_delay_begin_factor` setting defines how many times the `poll_interval` a +job's begin (EligibleTime) from now should be delayed if the handover protocol +is set to `delayed_begin` (see setting `job_handover_protocol`). That is, if +the `job_delay_begin_factor` is set to five (5) the delay time is calculated as +5 * `poll_interval`. The event manager would use 2 as default value when +submitting jobs. + +``` +job_handover_protocol = hold_release +``` +The `job_handover_protocol` setting defines which method is used to handover a +job from the event handler to the job manager. Values are + - `hold_release` (job is submitted with `--hold`, job manager removes the hold + with `scontrol release`) + - `delayed_begin` (job is submitted with `--begin=now+(5 * poll_interval)` and + any `--hold` is removed from the submission parameters); see setting + `poll_interval` further below; this is useful if the + bot account cannot run `scontrol release` to remove the hold of the job; + also, the status update in the PR comment of the job is extended by noting + the `EligibleTime` + ``` job_name = JOB_NAME ``` @@ -403,6 +427,17 @@ on a compute/worker node. You may have to change this if temporary storage under environment variable `$EESSI_TMPDIR`. The value is expanded only inside a running job. Thus, typical job environment variables (like `$USER` or `$SLURM_JOB_ID`) may be used to isolate jobs running simultaneously on the same compute node. + +``` +site_config_script = /path/to/script/if/any +``` +`site_config_script` specifies the path to a script that - if it exists - is +sourced in the build job before any `bot/*` script is run. This allows to +customize the build environment due to specifics of the build site/cluster. 
+Note, such customizations could also be performed by putting them into a +module file and use the setting `load_modules` (see above). However, the +setting `site_config_script` provides a low threshold for achieving this, too. + ``` slurm_params = "--hold" ``` @@ -433,6 +468,22 @@ allow_update_submit_opts = false options via custom module `det_submit_opts` provided by the pull request being processed. +``` +allowed_exportvars = ["NAME1=value_1a", "NAME1=value_1b", "NAME2=value_2"] +``` +`allowed_exportvars` defines a list of name-value pairs (environment +variables) that are allowed to be specified in a PR command with the +`exportvariable` filter. To specify multiple environment variables, multiple +`exportvariable` filters must be used (one per variable). These variables will +be exported into the build environment before running the bot/build.sh script. + +The bot build script makes use of the variable `SKIP_TESTS` to determine if +ReFrame tests shall be skipped or not. Default is not to skip them. To allow the +use of the variable the setting could look like +``` +allowed_exportvars = ["SKIP_TESTS=yes", "SKIP_TESTS=no"] +``` + #### `[bot_control]` section @@ -464,6 +515,35 @@ artefact_upload_script = PATH_TO_EESSI_BOT/scripts/eessi-upload-to-staging ``` `artefact_upload_script` provides the location for the script used for uploading built software packages to an S3 bucket. +``` +signing = + { + REPO_ID: { + "script": PATH_TO_SIGN_SCRIPT, + "key": PATH_TO_KEY_FILE, + "container_runtime": PATH_TO_CONTAINER_RUNTIME + }, ... + } +``` +`signing` provides a setting for signing artefacts. The value uses a JSON-like format +with `REPO_ID` being the repository ID. Repository IDs are defined in a file +`repos.cfg` (see setting `repos_cfg_dir`), `script` provides the location of the +script that is used to sign a file. If the location is a relative path, the script +must reside in the checked out pull request of the target repository (e.g., +EESSI/software-layer). 
`key` points to the file of the key being used +for signing. The bot calls the script with the two arguments: + 1. private key (as provided by the attribute 'key') + 2. path to the file to be signed (the upload script will determine that) +NOTE (on `container_runtime`), signing requires a recent installation of OpenSSH +(8.2 or newer). If the frontend where the event handler runs does not have that +version installed, you can specify a container runtime via the `container_runtime` +attribute below. Currently, only Singularity or Apptainer are supported. +Note (on the key), make sure the file permissions are restricted to `0600` (only +readable+writable by the file owner, or the signing will likely fail. +Note (on json format), make sure no trailing commas are used after any elements +or parsing/loading the json will likely fail. Also, the whole value should start +at a new line and be indented as shown above. + ``` endpoint_url = URL_TO_S3_SERVER ``` @@ -645,12 +725,30 @@ scontrol_command = /usr/bin/scontrol #### `[submitted_job_comments]` section The `[submitted_job_comments]` section specifies templates for messages about newly submitted jobs. + +DEPRECATED setting (use `awaits_release_delayed_begin_msg` and/or `awaits_release_hold_release_msg`) ``` awaits_release = job id `{job_id}` awaits release by job manager ``` `awaits_release` is used to provide a status update of a job (shown as a row in the job's status table). +``` +awaits_release_delayed_begin_msg = job id `{job_id}` will be eligible to start in about {delay_seconds} seconds +``` +`awaits_release_delayed_begin_msg` is used when the `job_handover_protocol` is +set to `delayed_begin`. Note, both `{job_id}` and `{delay_seconds}` need to be +present in the value or the event handler will throw an exception when formatting +the update of the PR comment corresponding to the job. 
+ +``` +awaits_release_hold_release_msg = job id `{job_id}` awaits release by job manager +``` +`awaits_release_hold_release_msg` is used when the `job_handover_protocol` is +set to `hold_release`. Note, `{job_id}` needs to be present in the value or the +event handler will throw an exception when formatting the update of the PR +comment corresponding to the job. + ``` initial_comment = New job on instance `{app_name}` for architecture `{arch_name}`{accelerator_spec} for repository `{repo_id}` in job dir `{symlink}` ``` @@ -756,7 +854,237 @@ moved_job_dirs_comment = PR merged! Moved `{job_dirs}` to `{trash_bin_dir}` Template that is used by the bot to add a comment to a PR noting down which directories have been moved and where. -# Instructions to run the bot components +# Step 6: Creating a ReFrame configuration file for the test step (only needed when building for the [EESSI software layer](https://github.com/EESSI/software-layer)) +Part of the test step of the EESSI software layer is running the EESSI test suite. This requires putting a ReFrame configuration file in place that describes the partitions in the `arch_target_map` of the bot config. + +You can find general documentation on how to write a ReFrame config file in the [EESSI documentation](https://www.eessi.io/docs/test-suite/ReFrame-configuration-file/). However, some specifics apply when setting things up for the test step: + +- The configuration file has to be in `{shared_fs_path}/reframe_config.py` (recommended) or you have to set `RFM_CONFIG_FILES` to point to the configuration file and you have to make sure that is a location that is available (mounted) in the build container. +- The system name _has_ to be `BotBuildTests` +- Partition names should be ${EESSI_SOFTWARE_SUBDIR//\//_} for non-accelerator partitions and ${EESSI_SOFTWARE_SUBDIR//\//_}_${EESSI_ACCELERATOR_TARGET//\//_} for accelerator partitions. 
In words: the partition name should be the software subdir, replacing slashes with underscores, and for accelerators appending the accelerator target (again replacing slashes with underscores). E.g. x86_64_intel_skylake_avx512_nvidia_cc80 would be a valid partition name for a partition with Intel skylake's + Nvidia A100s.\ +- The `scheduler` should be `local`, as the bot already schedules the job (ReFrame should just locally spawn the tests in the allocation created by the bot). +- The `access` field should not be used by ReFrame if the local scheduler is defined, you can simply omit this keyword. + +To configure the number of GPUs and CPUs, we have two options: +1. We describe the physical node in the ReFrame configuration file and set the `REFRAME_SCALE_TAG` environment variable to match the size of the allocation that you specify in your bot config. E.g. if your bot config allocates 1/4th of a node, one would set `REFRAME_SCALE_TAG=1_4_node` in the environment of the job submitted by the bot. +2. We describe a virtual node configuration that matches the size of the allcation created by the bot (and we use the default `REFRAME_SCALE_TAG=1_node`, you don't have to set this explicitely). + +The first approach is the easiest, and thus recommended, since you can use CPU autodetection by ReFrame. The second approach allows for more flexibility. + +## Approach 1 (recommended): describing the physical node and setting the `REFRAME_SCALE_TAG` to match the bot config's allocation size +In this approach, we describe the physical node configuration. That means: the amount of physical CPUs and GPUs present in the node. 
+ +For the CPU part, we can rely on ReFrame's CPU autodetection: if `remote_detect` is set to `True` in the general section of the config, and no CPU topology information is provided in the ReFrame configuration file, ReFrame will automatically detect the [CPU topology](https://reframe-hpc.readthedocs.io/en/stable/config_reference.html#config.systems.partitions.processor). + +For the GPU part, we need to configure the vendor and the amount of GPUs. E.g. for a partition with 4 Nvidia GPUs per node: +``` +'partition': { +... + 'extras': { + GPU_VENDOR: GPU_VENDORS[NVIDIA], + }, + 'devices': [ + { + 'type': DEVICE_TYPES[GPU], + 'num_devices': 4, + } + ] +} +``` + +Now, we need to make sure ReFrame only starts tests that have scales that fit within the allocation created by the bot. E.g. on a GPU node, it would be quite common to only allocate a single GPU for building GPU software. In the above example, that means only a quarter node. We can make sure the EESSI test suite only runs tests that fit within a 25% of the physical node described above by making sure the `REFRAM_SCALE_TAG` environment variable is set to `1_4_node`. You can find a list of all valid values for the `REFRAME_SCALE_TAG` by checking the `SCALES` constant in the [EESSI test suite](https://github.com/EESSI/test-suite/blob/main/eessi/testsuite/constants.py). + +Note that if you had e.g. a node with 6 GPUs per node, and you were building on 1 GPU, you probably want to go for Approach 2, since `1_6_node` is not a known scale in the EESSI test suite. Although you could set `REFRAME_SCALE_TAG=1_8_node`, this would lead to undefined behavior for the amount of GPUs allocated (may be 1, may be 0). For CPU-based nodes, this could however be a reasonable approach. + +Note that if for _some_ partitions you use e.g. quarter nodes, and for some full nodes, you'll have to set the `REFRAME_SCALE_TAG` conditionally based on the node architecture. You could e.g. 
do this in a `.bashrc` that has some conditional logic to determine the node type and set the corresponding scale. Alternatively, you could use Approach 2. + +### Complete example config +In this example, we assume a node with 4 A100 GPUs (compute capability `cc80`) and 72 CPU cores (Intel Skylake) and 512 GB of memory (of which 491520 MiB is useable by SLURM jobs; on this system the rest is reserved for the OS): +``` +from eessi.testsuite.common_config import common_logging_config +from eessi.testsuite.constants import * # noqa: F403 + + +site_configuration = { + 'systems': [ + { + 'name': 'BotBuildTests', # The system HAS to have this name, do NOT change it + 'descr': 'Software-layer bot', + 'hostnames': ['.*'], + 'modules_system': 'lmod', + 'partitions': [ + { + 'name': 'x86_64_intel_skylake_avx512_nvidia_cc80', + 'scheduler': 'local', + 'launcher': 'mpirun', + 'environs': ['default'], + 'features': [ + FEATURES[GPU] # We want this to run GPU-based tests from the EESSI test suite + ] + list(SCALES.keys()), + 'resources': [ + { + 'name': 'memory', + 'options': ['--mem={size}'], + } + ], + 'extras': { + # Make sure to round down, otherwise a job might ask for more mem than is available + # per node + 'mem_per_node': 491520, # in MiB (512 GB minus some reserved for the OS) + GPU_VENDOR: GPU_VENDORS[NVIDIA], + }, + 'devices': [ + { + 'type': DEVICE_TYPES[GPU], + 'num_devices': 4, + } + ], + 'max_jobs': 1 + }, + ] + } + ], + 'environments': [ + { + 'name': 'default', + 'cc': 'cc', + 'cxx': '', + 'ftn': '' + } + ], + 'general': [ + { + 'purge_environment': True, + 'resolve_module_conflicts': False, # avoid loading the module before submitting the job + 'remote_detect': True, # Make sure to automatically detect the CPU topology + } + ], + 'logging': common_logging_config(), +} +``` + +## Approach 2: describing a virtual node +In this approach, we describe a virtual node configuration for which the size matches exactly what is allocated by the bot (through the 
`slurm_params` and `arch_target_map`). In this example, we'll assume that this node has 4 GPUs and 72 cores, distributed over 2 sockets each consisting of 1 NUMA domain. We also assume our bot is configured with `slurm_params = --hold --nodes=1 --export=None --time=0:30:0` and `arch_target_map = {"linux/x86_64/intel/skylake_avx512" : "--partition=gpu --cpus-per-task=18 --gpus-per-node 1"}`, i.e. it effectively allocates a quarter node. We describe a virtual partition for ReFrame as if this quarter node is a full node, i.e. we pretend it is a partition with 18 cores and 1 GPU per node, with 1 socket. + +We would first have to hardcode the CPU configuration. +``` +'partition': { +... + 'processor': { + "num_cpus": 18, + "num_cpus_per_core": 1, + "num_cpus_per_socket": 18, + "num_sockets": 1, + "topology": { + "numa_nodes": [ + # As stated, the 18 cores are on a single NUMA domain. Thus, the bitmask should be a sequence of 18 1's, which is 3ffff in hexadecimal representation + "0x3ffff", # a bit mask of 111111111111111111, i.e. cores 0-17 are on this NUMA domain + ], + }, + } +} +``` + +Note that if instead, this node would have had 8 NUMA domains (4 per socket), the 18 cores would correspond to 2 NUMA domains and we would have had to define: +``` +"numa_nodes": [ + "0x001ff", # a bit mask of 000000000111111111, i.e. cores 0-8 are on this NUMA domain + "0x3fe00", # a bit mask of 111111111000000000, i.e. cores 9-17 are on this NUMA domain +] +``` + +Note that the `topology` dictionary in a ReFrame configuration file can contain more information, such as the bitmasks for the CPU sockets and cores, as well as information on the caches (see [here](https://reframe-hpc.readthedocs.io/en/stable/config_reference.html#config.systems.partitions.processor.topology)). Currently, that information is not needed by the EESSI test suite, but that may change if tests are added that utilize such information to execute efficiently. 
+ +For the GPU configuration, we simply put: +``` +'partition': { +... + 'extras': { + GPU_VENDOR: GPU_VENDORS[NVIDIA], + }, + 'devices': [ + { + 'type': DEVICE_TYPES[GPU], + 'num_devices': 1, + } + ] +} +``` +To match the fact that we allocate 1 GPU in the `arch_target_map`. + +### Complete example config +In this example, we assume a node with 4 A100 GPUs (compute capability `cc80`) and 72 CPU cores (Intel Skylake) and 512 GB of memory (of which 491520 MiB is useable by SLURM jobs; on this system the rest is reserved for the OS). We also assume the bot configuration is such for this partition that 1/4th of these nodes gets allocated for a build job: +``` +site_configuration = { + 'systems': [ + { + 'name': 'BotBuildTests', # The system HAS to have this name, do NOT change it + 'descr': 'Software-layer bot', + 'hostnames': ['.*'], + 'modules_system': 'lmod', + 'partitions': [ + { + 'name': 'x86_64_intel_skylake_avx512_nvidia_cc80', + 'scheduler': 'local', + 'launcher': 'mpirun', + 'environs': ['default'], + 'features': [ + FEATURES[GPU] # We want this to run GPU-based tests from the EESSI test suite + ] + list(SCALES.keys()), + 'resources': [ + { + 'name': 'memory', + 'options': ['--mem={size}'], + } + ], + 'extras': { + # Make sure to round down, otherwise a job might ask for more mem than is available + # per node + 'mem_per_node': 122880, # in MiB (1/4th of 491520 MiB) + GPU_VENDOR: GPU_VENDORS[NVIDIA], + }, + 'devices': [ + { + 'type': DEVICE_TYPES[GPU], + 'num_devices': 1, + } + ], + 'processor': { + "num_cpus": 18, + "num_cpus_per_core": 1, + "num_cpus_per_socket": 18, + "num_sockets": 1, + "topology": { + "numa_nodes": [ + # As stated, the 18 cores are on a single NUMA domain. 
Thus, the bitmask should be a sequence of 18 1's, which is 3ffff in hexadecimal representation + "0x3ffff", + ], + }, + }, + 'max_jobs': 1 + }, + ] + } + ], + 'environments': [ + { + 'name': 'default', + 'cc': 'cc', + 'cxx': '', + 'ftn': '' + } + ], + 'general': [ + { + 'purge_environment': True, + 'resolve_module_conflicts': False, # avoid loading the module before submitting the job + } + ], + 'logging': common_logging_config(), +} +``` + +# Step 7: Instructions to run the bot components The bot consists of three components: * the Smee client; @@ -765,7 +1093,7 @@ The bot consists of three components: Running the Smee client was explained in [Step 1](#step1). -## Step 6.1: Running the event handler +## Step 7.1: Running the event handler As the event handler may run for a long time, it is advised to run it in a `screen` or `tmux` session. The event handler is provided by the [`eessi_bot_event_handler.py`](https://github.com/EESSI/eessi-bot-software-layer/blob/main/eessi_bot_event_handler.py) Python script. @@ -788,7 +1116,7 @@ The event handler writes log information to the files `pyghee.log` and Note, if you run the bot on a frontend of a cluster with multiple frontends make sure that both the Smee client and the event handler run on the same system! -## Step 6.2: Running the job manager +## Step 7.2: Running the job manager As the job manager may run for a long time, it is advised to run it in a `screen` or `tmux` session. The job manager is provided by the [`eessi_bot_job_manager_layer.py`](https://github.com/EESSI/eessi-bot-software-layer/blob/main/eessi_bot_job_manager.py) Python script. You can run the job manager from the directory `eessi-bot-software-layer` simply by: diff --git a/RELEASE_NOTES b/RELEASE_NOTES index dd5378e1..5dc4bf33 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -1,6 +1,37 @@ This file contains a description of the major changes to the EESSI build-and-deploy bot. For more detailed information, please see the git log. 
+v0.7.0 (13 March 2025) +-------------------------- + +This is a minor release of the EESSI build-and-deploy bot. + +Bug fixes: +* bot only reports moving to trash_bin when relevant (#292) + +Improvements: +* add support for specifying that build job script is located in another repository (#283) +* implement exportvariable filter (#288, #291) + * see related configuration setting `allowed_exportvars` +* add alternative method to submit job (using `--begin=now+SOME_DELAY`) (#297) + * also see the new related configuration settings `job_handover_protocol` and `job_delay_begin_factor` +* set the local_tmp that is configured for a site as tmpdir in bot build job script (#299) +* add setting for a script to customize build environment (#302) +* add support for signing tarball and metadata file and uploading signatures to S3 bucket (#303) +* add SSH signing script `sign_verify_file_ssh.sh` (#304) +* updates of the docs (#293, #298) + +Changes to 'app.cfg' settings (see README.md and app.cfg.example for details): +* NEW (optional) 'allowed_exportvars' in section '[buildenv]' +* NEW (required) 'awaits_release_hold_release_msg' in section '[submitted_job_comments]' +* NEW (required) 'awaits_release_hold_release_msg' in section '[submitted_job_comments]' +* DEPRECATED (optional) 'awaits_release' in section '[submitted_job_comments]' +* NEW (optional) 'job_delay_begin_factor' in section '[buildenv]' +* NEW (required) 'job_handover_protocol' in section '[buildenv]' +* NEW (optional) 'signing' in section '[deploycfg]' +* NEW (optional) 'site_config_script' in section '[buildenv]' + + v0.6.0 (18 September 2024) -------------------------- diff --git a/app.cfg.example b/app.cfg.example index 152de2bc..f9b296f6 100644 --- a/app.cfg.example +++ b/app.cfg.example @@ -88,6 +88,25 @@ container_cachedir = PATH_TO_SHARED_DIRECTORY # http_proxy = http://PROXY_DNS:3128/ # https_proxy = http://PROXY_DNS:3128/ +# The job_delay_begin_factor setting defines how many times the poll_interval a 
+# job's begin (EligibleTime) from now should be delayed if the handover protocol +# is set to `delayed_begin` (see setting `job_handover_protocol`). That is, if +# the job_delay_begin_factor is set to five (5) the delay time is calculated as +# 5 * poll_interval. The event manager would use 2 as the default factor when +# submitting jobs. +job_delay_begin_factor = 2 + +# The job_handover_protocol setting defines which method is used to handover a +# job from the event handler to the job manager. Values are +# - hold_release (job is submitted with '--hold', job manager removes the hold +# with 'scontrol release') +# - delayed_begin (job is submitted with '--begin=now+(5 * poll_interval)' and +# any '--hold' is removed from the submission parameters); this is useful if the +# bot account cannot run 'scontrol release' to remove the hold of the job; +# also, the status update in the PR comment of the job is extended by noting +# the 'EligibleTime' +job_handover_protocol = hold_release + # Used to give all jobs of a bot instance the same name. Can be used to allow # multiple bot instances running on the same Slurm cluster. job_name = prod @@ -110,6 +129,15 @@ load_modules = # variables that are only set inside a Slurm job local_tmp = /tmp/$USER/EESSI +# PATH to a script that - if it exists - is sourced in the build job +# before any 'bot/*' script is run. This allows to customize the +# build environment due to specifics of the build site/cluster. +# Note, such customizations could also be performed by putting them +# into a module file and using the setting 'load_modules' (see above). +# However, the setting 'site_config_script' provides a low threshold +# for achieving this, too. +site_config_script = /path/to/script/if/any + # parameters to be added to all job submissions # NOTE do not quote parameter string. Quotes are retained when reading in config and # then the whole 'string' is recognised as a single parameter. 
@@ -132,6 +160,19 @@ no_build_permission_comment = Label `bot:build` has been set by user `{build_lab # whether or not to allow updating the submit options via custom module det_submit_opts allow_update_submit_opts = false +# defines which name-value pairs (environment variables) are allowed to be +# exported into the build environment via 'exportvariable' filters +# The bot build script makes use of the variable 'SKIP_TESTS' to determine if +# ReFrame tests shall be skipped or not. Default value is 'no'. If the value is +# 'yes' and the exportvariable filter is added to a bot build command +# ('export:SKIP_TESTS=yes'), ReFrame tests are skipped. +# NOTE, the setting is optional and commented by default. If you want to enable +# this feature ('exportvariable' filters), uncomment the line below and define +# meaningful key-value pair(s). For example, to enable the use of +# 'exportvariable:SKIP_TESTS=yes' as a filter, the key-value pair would be +# "SKIP_TESTS=yes". +# allowed_exportvars = ["NAME1=value_1a", "NAME1=value_1b", "NAME2=value_2"] + [deploycfg] # script for uploading built software packages @@ -153,6 +194,31 @@ endpoint_url = URL_TO_S3_SERVER # like: bucket_name = {"eessi-pilot-2023.06": "eessi-staging-pilot-2023.06", "eessi.io-2023.06": "software.eessi.io-2023.06"} bucket_name = eessi-staging +# settings for signing artefacts with JSON-like format +# REPO_ID: { "script": PATH_TO_SIGN_SCRIPT, "key": PATH_TO_KEY_FILE, "container_runtime": PATH_TO_CONTAINER_RUNTIME } +# If PATH_TO_SIGN_SCRIPT is a relative path, the script must reside in the +# checked out pull request of the target repository (e.g., +# EESSI/software-layer). +# The bot calls the script with the two arguments: +# 1. private key (as provided by the attribute 'key') +# 2. path to the file to be signed (the upload script will determine that) +# NOTE (on "container_runtime"), signing requires a recent installation of OpenSSH +# (8.2 or newer). 
If the frontend where the event handler runs does not have that +# version installed, you can specify a container runtime via the 'container_runtime' +# attribute below. Currently, only Singularity or Apptainer are supported. +# NOTE (on the key), make sure the file permissions are restricted to `0600` (only +# readable+writable by the file owner, or the signing will likely fail. +# Note (on json format), make sure no trailing commas are used after any elements +# or parsing/loading the json will likely fail. Also, the whole value should start +# at a new line and be indented as shown below. +signing = + { + "eessi.io-2023.06-software: { + "script": PATH_TO_SIGN_SCRIPT, + "key": PATH_TO_EESSI_BOT/config/user-site-system.key, + "container_runtime": PATH_TO_CONTAINER_RUNTIME + } + } # upload policy: defines what policy is used for uploading built artefacts # to an S3 bucket # 'all' ..: upload all artefacts (mulitple uploads of the same artefact possible) @@ -244,12 +310,14 @@ scontrol_command = /usr/bin/scontrol # information. 
[submitted_job_comments] awaits_release = job id `{job_id}` awaits release by job manager +awaits_release_delayed_begin_msg = job id `{job_id}` will be eligible to start in about {delay_seconds} seconds +awaits_release_hold_release_msg = job id `{job_id}` awaits release by job manager initial_comment = New job on instance `{app_name}` for CPU micro-architecture `{arch_name}`{accelerator_spec} for repository `{repo_id}` in job dir `{symlink}` with_accelerator =  and accelerator `{accelerator}` [new_job_comments] -awaits_launch = job awaits launch by Slurm scheduler +awaits_launch = job awaits launch by Slurm scheduler{extra_info} [running_job_comments] running_job = job `{job_id}` is running diff --git a/eessi_bot_event_handler.py b/eessi_bot_event_handler.py index 00a1db81..5895fbfb 100644 --- a/eessi_bot_event_handler.py +++ b/eessi_bot_event_handler.py @@ -56,6 +56,8 @@ # config.BUILDENV_SETTING_CVMFS_CUSTOMIZATIONS, # optional # config.BUILDENV_SETTING_HTTPS_PROXY, # optional # config.BUILDENV_SETTING_HTTP_PROXY, # optional + # config.BUILDENV_SETTING_JOB_DELAY_BEGIN_FACTOR, # optional (default: 2) + config.BUILDENV_SETTING_JOB_HANDOVER_PROTOCOL, # required config.BUILDENV_SETTING_JOB_NAME, # required config.BUILDENV_SETTING_JOBS_BASE_DIR, # required # config.BUILDENV_SETTING_LOAD_MODULES, # optional @@ -75,6 +77,7 @@ # config.DEPLOYCFG_SETTING_ENDPOINT_URL, # optional config.DEPLOYCFG_SETTING_METADATA_PREFIX, # (required) config.DEPLOYCFG_SETTING_NO_DEPLOY_PERMISSION_COMMENT, # required + # config.DEPLOYCFG_SETTING_SIGNING, # optional config.DEPLOYCFG_SETTING_UPLOAD_POLICY], # required config.SECTION_DOWNLOAD_PR_COMMENTS: [ config.DOWNLOAD_PR_COMMENTS_SETTING_CURL_FAILURE, # required @@ -92,12 +95,18 @@ config.GITHUB_SETTING_APP_NAME, # required config.GITHUB_SETTING_INSTALLATION_ID, # required config.GITHUB_SETTING_PRIVATE_KEY], # required + # the poll interval setting is required for the alternative job handover + # protocol (delayed_begin) + 
config.SECTION_JOB_MANAGER: [ + config.JOB_MANAGER_SETTING_POLL_INTERVAL], # required config.SECTION_REPO_TARGETS: [ config.REPO_TARGETS_SETTING_REPO_TARGET_MAP, # required config.REPO_TARGETS_SETTING_REPOS_CFG_DIR], # required config.SECTION_SUBMITTED_JOB_COMMENTS: [ config.SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT, # required - config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE, # required + # config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE, # optional + config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_DELAYED_BEGIN_MSG, # required + config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_HOLD_RELEASE_MSG, # required config.SUBMITTED_JOB_COMMENTS_SETTING_WITH_ACCELERATOR], # required } @@ -636,33 +645,37 @@ def handle_pull_request_closed_event(self, event_info, pr): self.log(f"PR {pr.number}: determining directories to be moved to trash bin") job_dirs = determine_job_dirs(pr.number) - # 2) Get trash_bin_dir from configs - trash_bin_root_dir = self.cfg[config.SECTION_CLEAN_UP][config.CLEAN_UP_SETTING_TRASH_BIN_ROOT_DIR] - - repo_name = request_body['repository']['full_name'] - dt_start = datetime.now(timezone.utc) - trash_bin_dir = "/".join([trash_bin_root_dir, repo_name, dt_start.strftime('%Y.%m.%d')]) - - # Subdirectory with date of move. Also with repository name. Handle symbolic links (later?) - # cron job deletes symlinks? 
- - # 3) move the directories to the trash_bin - self.log(f"PR {pr.number}: moving directories to trash bin {trash_bin_dir}") - move_to_trash_bin(trash_bin_dir, job_dirs) - dt_end = datetime.now(timezone.utc) - dt_delta = dt_end - dt_start - seconds_elapsed = dt_delta.days * 24 * 3600 + dt_delta.seconds - self.log(f"PR {pr.number}: moved directories to trash bin {trash_bin_dir} (took {seconds_elapsed} seconds)") + if job_dirs == []: + self.log(f"PR {pr.number}: No job directories found; nothing to move.") + else: + # 2) Get trash_bin_dir from configs + trash_bin_root_dir = self.cfg[config.SECTION_CLEAN_UP][config.CLEAN_UP_SETTING_TRASH_BIN_ROOT_DIR] - # 4) report move to pull request - repo_name = pr.base.repo.full_name - gh = github.get_instance() - repo = gh.get_repo(repo_name) - pull_request = repo.get_pull(pr.number) - clean_up_comment = self.cfg[config.SECTION_CLEAN_UP][config.CLEAN_UP_SETTING_MOVED_JOB_DIRS_COMMENT] - moved_comment = clean_up_comment.format(job_dirs=job_dirs, trash_bin_dir=trash_bin_dir) - issue_comment = pull_request.create_issue_comment(moved_comment) - return issue_comment + repo_name = request_body['repository']['full_name'] + dt_start = datetime.now(timezone.utc) + trash_bin_dir = "/".join([trash_bin_root_dir, repo_name, dt_start.strftime('%Y.%m.%d')]) + + # Subdirectory with date of move. Also with repository name. Handle symbolic links (later?) + # cron job deletes symlinks? 
+ + # 3) move the directories to the trash_bin + self.log(f"PR {pr.number}: moving directories to trash bin {trash_bin_dir}") + move_to_trash_bin(trash_bin_dir, job_dirs) + dt_end = datetime.now(timezone.utc) + dt_delta = dt_end - dt_start + seconds_elapsed = dt_delta.days * 24 * 3600 + dt_delta.seconds + self.log(f"PR {pr.number}: moved directories to trash bin {trash_bin_dir} (took {seconds_elapsed} seconds)") + + # 4) report move to pull request + + repo_name = pr.base.repo.full_name + gh = github.get_instance() + repo = gh.get_repo(repo_name) + pull_request = repo.get_pull(pr.number) + clean_up_comment = self.cfg[config.SECTION_CLEAN_UP][config.CLEAN_UP_SETTING_MOVED_JOB_DIRS_COMMENT] + moved_comment = clean_up_comment.format(job_dirs=job_dirs, trash_bin_dir=trash_bin_dir) + issue_comment = pull_request.create_issue_comment(moved_comment) + return issue_comment def main(): diff --git a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py index bb0c6dd8..4fcf9af3 100644 --- a/eessi_bot_job_manager.py +++ b/eessi_bot_job_manager.py @@ -51,6 +51,7 @@ # settings that are required in 'app.cfg' REQUIRED_CONFIG = { config.SECTION_BUILDENV: [ + config.BUILDENV_SETTING_JOB_HANDOVER_PROTOCOL, # required config.BUILDENV_SETTING_JOB_NAME], # required config.SECTION_FINISHED_JOB_COMMENTS: [ config.FINISHED_JOB_COMMENTS_SETTING_JOB_RESULT_UNKNOWN_FMT, # required @@ -91,6 +92,9 @@ def __init__(self): self.job_name = buildenv_cfg.get(config.BUILDENV_SETTING_JOB_NAME) if self.job_name and len(self.job_name) < 3: raise Exception(f"job name ({self.job_name}) is shorter than 3 characters") + self.job_handover_protocol = buildenv_cfg.get(config.BUILDENV_SETTING_JOB_HANDOVER_PROTOCOL) + if self.job_handover_protocol not in config.JOB_HANDOVER_PROTOCOLS_SET: + raise Exception(f"job handover protocol ({self.job_handover_protocol}) is unknown") def get_current_jobs(self): """ @@ -256,6 +260,25 @@ def determine_finished_jobs(self, known_jobs, current_jobs): return finished_jobs + def 
parse_scontrol_show_job_output(self, output): + """ + The output of 'scontrol --oneliner show job' is a list of key=value pairs + separated by whitespaces. + + Args: + output (string): the output of the scontrol command + + Returns: + (dict): Returns a dictionary of the key-value pairs + """ + job_info = {} + stripped_output = output.strip() + for pair in stripped_output.split(): + key, value = pair.split('=', 1) + job_info[key] = value + + return job_info + def process_new_job(self, new_job): """ Process a new job by verifying that it is a bot job and if so @@ -283,19 +306,20 @@ def process_new_job(self, new_job): log_file=self.logfile, ) - # parse output of 'scontrol_cmd' to determine the job's working - # directory - match = re.search(r".* WorkDir=(\S+) .*", - str(scontrol_output)) - if match: + # parse output of 'scontrol_cmd' + job_info = self.parse_scontrol_show_job_output(str(scontrol_output)) + + # check if job_info contains 'WorkDir', if not we cannot process the job + # further + if 'WorkDir' in job_info: log( "process_new_job(): work dir of job %s: '%s'" - % (job_id, match.group(1)), + % (job_id, job_info['WorkDir']), self.logfile, ) job_metadata_path = "%s/_bot_job%s.metadata" % ( - match.group(1), + job_info['WorkDir'], job_id, ) @@ -313,21 +337,34 @@ def process_new_job(self, new_job): symlink_source = os.path.join(self.submitted_jobs_dir, job_id) log( "process_new_job(): create a symlink: %s -> %s" - % (symlink_source, match.group(1)), + % (symlink_source, job_info['WorkDir']), self.logfile, ) - os.symlink(match.group(1), symlink_source) - - release_cmd = "%s release %s" % ( - self.scontrol_command, - job_id, - ) + os.symlink(job_info['WorkDir'], symlink_source) + + # handle different job handover protocols + # *_HOLD_RELEASE: job was submitted with '--hold' and shall be + # released with 'scontrol release JOB_ID' + # *_DELAYED_BEGIN: job was submitted with '--begin=now+SOMEDELAY', + # no extra action is needed + job_status = '' + extra_info = '' + 
if self.job_handover_protocol == config.JOB_HANDOVER_PROTOCOL_HOLD_RELEASE: + release_cmd = "%s release %s" % ( + self.scontrol_command, + job_id, + ) - release_output, release_err, release_exitcode = run_cmd( - release_cmd, - "process_new_job(): scontrol command", - log_file=self.logfile, - ) + release_output, release_err, release_exitcode = run_cmd( + release_cmd, + "process_new_job(): scontrol command", + log_file=self.logfile, + ) + job_status = 'released' + extra_info = '' + elif self.job_handover_protocol == config.JOB_HANDOVER_PROTOCOL_DELAYED_BEGIN: + job_status = 'received' + extra_info = f" (eligible to start from {job_info['EligibleTime']})" # update PR defined by repo and pr_number stored in the job's # metadata file @@ -356,8 +393,9 @@ def process_new_job(self, new_job): if "comment_id" in new_job: new_job_comments_cfg = config.read_config()[config.SECTION_NEW_JOB_COMMENTS] dt = datetime.now(timezone.utc) - update = "\n|%s|released|" % dt.strftime("%b %d %X %Z %Y") - update += f"{new_job_comments_cfg[config.NEW_JOB_COMMENTS_SETTING_AWAITS_LAUNCH]}|" + update = "\n|%s|%s|" % (dt.strftime("%b %d %X %Z %Y"), job_status) + description_col_fmt = new_job_comments_cfg[config.NEW_JOB_COMMENTS_SETTING_AWAITS_LAUNCH] + update += f"{description_col_fmt.format(extra_info=extra_info)}|" update_comment(new_job["comment_id"], pr, update) else: log( diff --git a/scripts/bot-build.slurm b/scripts/bot-build.slurm index bb0faa91..593bd158 100755 --- a/scripts/bot-build.slurm +++ b/scripts/bot-build.slurm @@ -10,6 +10,7 @@ # # author: Kenneth Hoste (@boegel) # author: Thomas Roeblitz (@trz42) +# author: Sam Moors (@smoors) # # license: GPLv2 # @@ -22,7 +23,70 @@ # - the directory may contain any additional files references in job.cfg, # for example, repos.cfg and configuration file bundles for repositories +# set default for SKIP_TESTS (don't skip ReFrame tests) +SKIP_TESTS=no + echo "Starting bot-build.slurm" +EXPORT_VARS_SCRIPT=cfg/export_vars.sh +if [ -f 
${EXPORT_VARS_SCRIPT} ]; then + echo "${EXPORT_VARS_SCRIPT} script found in '${PWD}', so sourcing it!" + source ${EXPORT_VARS_SCRIPT} + echo "$EXPORT_VARS_SCRIPT finished" +else + echo "could not find ${EXPORT_VARS_SCRIPT} script in '${PWD}', skipping" >&2 +fi + +# First, read if there is a local_tmp or a site_config_script defined in +# the site_config section of cfg/job.cfg +# - local_tmp is used to define what location/storage folder is used as a base +# for creating temporary directories +# - site_config_script points to a script that is used to customize local +# settings for build jobs; if the script exists, it is sourced +JOB_CFG=cfg/job.cfg +inside_site_config=false +local_tmp_value="" +site_config_script_value="" +while IFS= read -r line; do + # Check if we've reached [site_config] + if [[ $line =~ ^\[site_config\]$ ]]; then + inside_site_config=true + continue + fi + + # If another section starts and we haven't found local_tmp or + # site_config_script, don't try to match + if [[ $line =~ ^\[.*\]$ && $inside_site_config == true ]]; then + inside_site_config=false + fi + + # Extract 'local_tmp' or 'site_config_script' when inside [site_config] + # and leave while loop when both are found + if $inside_site_config && [[ $line =~ ^local_tmp\ *=\ *([^[:space:]]+) ]]; then + local_tmp_value="${BASH_REMATCH[1]}" + fi + if $inside_site_config && [[ $line =~ ^site_config_script\ *=\ *([^[:space:]]+) ]]; then + site_config_script_value="${BASH_REMATCH[1]}" + fi + if [[ -n "$local_tmp_value" ]] && [[ -n "$site_config_script_value" ]]; then + break + fi +done < "$JOB_CFG" +if [[ -n "${local_tmp_value}" ]]; then + local_tmp_value=$(envsubst <<< ${local_tmp_value}) + # Ensure dir exists before calling mktemp + mkdir -p ${local_tmp_value} + local_tmp_value=$(mktemp -d --tmpdir=${local_tmp_value} eessi_job.XXXXXXXXXX) + echo "Overwriting current TMPDIR '$TMPDIR' with the value '${local_tmp_value}', as configured in cfg/job.cfg" + export TMPDIR="${local_tmp_value}" +fi + 
+if [[ -n "${site_config_script_value}" ]] && [[ -r ${site_config_script_value} ]]; then + echo "Sourcing site config script '${site_config_script_value}'" + source "${site_config_script_value}" +else + echo "Site config script defined as '${site_config_script_value}' does not exist; ignoring it" +fi + BOT_BUILD_SCRIPT=bot/build.sh if [ -f ${BOT_BUILD_SCRIPT} ]; then echo "${BOT_BUILD_SCRIPT} script found in '${PWD}', so running it!" @@ -47,14 +111,19 @@ artefacts = EOF fi echo "check build step finished" -TEST_SCRIPT=bot/test.sh -if [ -f ${TEST_SCRIPT} ]; then - echo "${TEST_SCRIPT} script found in '${PWD}', so running it!" - ${TEST_SCRIPT} - echo "${TEST_SCRIPT} finished" -else - echo "could not find ${TEST_SCRIPT} script in '${PWD}'" >&2 + +# SKIP_TESTS can be defined as export variable in the bot's config and then added to bot commands (export:SKIP_TESTS=yes) +if [[ "${SKIP_TESTS}" != "yes" ]]; then + TEST_SCRIPT=bot/test.sh + if [ -f ${TEST_SCRIPT} ]; then + echo "${TEST_SCRIPT} script found in '${PWD}', so running it!" + ${TEST_SCRIPT} + echo "${TEST_SCRIPT} finished" + else + echo "could not find ${TEST_SCRIPT} script in '${PWD}'" >&2 + fi fi + CHECK_TEST_SCRIPT=bot/check-test.sh if [ -f ${CHECK_TEST_SCRIPT} ]; then echo "${CHECK_TEST_SCRIPT} script found in '${PWD}', so running it!" 
diff --git a/scripts/eessi-upload-to-staging b/scripts/eessi-upload-to-staging index b5e4482d..25fd9675 100755 --- a/scripts/eessi-upload-to-staging +++ b/scripts/eessi-upload-to-staging @@ -83,6 +83,9 @@ function display_help echo " ingestion procedure" >&2 echo " -l | --list-variables - list variables that are available" >&2 echo " for expansion" >&2 + echo " -k | --sign-key SCRIPT_KEY - specify location of the key to be" >&2 + echo " used to sign artefacts and metadata" >&2 + echo " files [optional; default: don't sign]" >&2 echo " -m | --metadata-prefix PREFIX - a directory to which the metadata" >&2 echo " file shall be uploaded; BASH variable" >&2 echo " expansion will be applied; arg '-l'" >&2 @@ -93,6 +96,13 @@ function display_help echo " link the upload to a PR" >&2 echo " -r | --repository FULL_NAME - a repository name ACCOUNT/REPONAME;" >&2 echo " used to link the upload to a PR" >&2 + echo " -s | --sign-script SCRIPT_PATH - path to script that is used to sign" >&2 + echo " artefacts and metadata files. The" >&2 + echo " script is called with two arguments:" >&2 + echo " KEY file_to_sign. The KEY is the one" >&2 + echo " provided via option --sign-key. The" >&2 + echo " latter is determined by this script." >&2 + echo " [optional; default: don't sign]" >&2 } if [[ $# -lt 1 ]]; then @@ -120,6 +130,8 @@ endpoint_url= pr_comment_id="none" pull_request_number="none" github_repository="EESSI/software-layer" +sign_key= +sign_script= # provided via options in the bot's config file app.cfg and/or command line argument metadata_prefix= @@ -155,6 +167,14 @@ while [[ $# -gt 0 ]]; do pr_comment_id="$2" shift 2 ;; + -k|--sign-key) + sign_key=$2 + if [[ ! 
-r "${sign_key}" ]]; then + echo "Error: SSH key '${sign_key}' to be used for signing doesn't exist or cannot be read" >&2 + exit 1 + fi + shift 2 + ;; -m|--metadata-prefix) metadata_prefix="$2" shift 2 @@ -171,6 +191,14 @@ while [[ $# -gt 0 ]]; do github_repository="$2" shift 2 ;; + -s|--sign-script) + sign_script=$2 + if [[ ! -x "${sign_script}" ]]; then + echo "Error: Script '${sign_script}' to be used for signing doesn't exist or is not executable" >&2 + exit 1 + fi + shift 2 + ;; -*|--*) echo "Error: Unknown option: $1" >&2 exit 1 @@ -185,6 +213,21 @@ done # restore potentially parsed filename(s) into $* set -- "${POSITIONAL_ARGS[@]}" +# ensure that either none or both of $sign_key and $sign_script are defined +if [[ -n "${sign_key}" ]] && [[ -n "${sign_script}" ]]; then + sign=1 +elif [[ -n "${sign_key}" ]]; then + sign=0 + echo "Error: Signing requires a key (${sign_key}) AND a script (${sign_script}); likely the bot config is incomplete" >&2 + exit 1 +elif [[ -n "${sign_script}" ]]; then + sign=0 + echo "Error: Signing requires a key (${sign_key}) AND a script (${sign_script}); likely the bot config is incomplete" >&2 + exit 1 +else + sign=0 +fi + # infer bucket_base: # if endpoint_url is not set (assume AWS S3 is used), # bucket_base=https://${bucket_name}.s3.amazonaws.com/ @@ -217,6 +260,33 @@ for file in "$*"; do aws_path=$(envsubst <<< "${artefact_prefix}") fi aws_file=$(basename ${file}) + # 1st sign artefact, and upload signature + if [[ "${sign}" = "1" ]]; then + # sign artefact + ${sign_script} sign ${sign_key} ${file} + # TODO check if signing worked (just check exit code == 0) + sig_file=${file}.sig + aws_sig_file=${aws_file}.sig + + # uploading signature + echo " store artefact signature at ${aws_path}/${aws_sig_file}" + upload_to_staging_bucket \ + "${sig_file}" \ + "${bucket_name}" \ + "${aws_path}/${aws_sig_file}" \ + "${endpoint_url}" + else + echo "no signing method defined; not signing artefact" + fi + + echo Uploading to "${url}" + echo " 
store artefact at ${aws_path}/${aws_file}" + upload_to_staging_bucket \ + "${file}" \ + "${bucket_name}" \ + "${aws_path}/${aws_file}" \ + "${endpoint_url}" + echo "Creating metadata file" url="${bucket_base}/${aws_path}/${aws_file}" echo "create_metadata_file file=${file} \ @@ -229,17 +299,11 @@ for file in "$*"; do "${github_repository}" \ "${pull_request_number}" \ "${pr_comment_id}") + aws_metadata_file=${aws_file}.meta.txt + # TODO check that creating the metadata file succeeded echo "metadata:" cat ${metadata_file} - echo Uploading to "${url}" - echo " store artefact at ${aws_path}/${aws_file}" - upload_to_staging_bucket \ - "${file}" \ - "${bucket_name}" \ - "${aws_path}/${aws_file}" \ - "${endpoint_url}" - if [ -z ${metadata_prefix} ]; then aws_path=${legacy_aws_path} else @@ -247,6 +311,23 @@ for file in "$*"; do export github_repository aws_path=$(envsubst <<< "${metadata_prefix}") fi + # 2nd sign metadata file, and upload signature + if [[ "${sign}" = "1" ]]; then + # sign metadata file + ${sign_script} sign ${sign_key} ${metadata_file} + # TODO check if signing worked (just check exit code == 0) + sig_metadata_file=${metadata_file}.sig + aws_sig_metadata_file=${aws_metadata_file}.sig + + echo " store metadata signature at ${aws_path}/${aws_sig_metadata_file}" + upload_to_staging_bucket \ + "${sig_metadata_file}" \ + "${bucket_name}" \ + "${aws_path}/${aws_sig_metadata_file}" \ + "${endpoint_url}" + else + echo "no signing method defined; not signing metadata file" + fi echo " store metadata file at ${aws_path}/${aws_file}.meta.txt" upload_to_staging_bucket \ "${metadata_file}" \ diff --git a/scripts/sign_verify_file_ssh.sh b/scripts/sign_verify_file_ssh.sh new file mode 100755 index 00000000..679ea7d6 --- /dev/null +++ b/scripts/sign_verify_file_ssh.sh @@ -0,0 +1,156 @@ +#!/bin/bash +# +# SSH Signature Signing and Verification Script +# - Sign a file using an SSH private key. +# - Verify a signed file using an allowed signers file. 
+# +# Generates a signature file named `.sig` in the same directory. +# +# Author: Alan O'Cais +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +# Usage message +usage() { + cat < + $0 verify [signature_file] + +Options: + sign: + - : Path to SSH private key (use KEY_PASSPHRASE env for passphrase) + - : File to sign + + verify: + - : Path to the allowed signers file + - : File to verify + - [signature_file]: Optional, defaults to '.sig' + +Example allowed signers format: + identity_1 +EOF + exit 9 +} + +# Error codes +FILE_PROBLEM=1 +CONVERSION_FAILURE=2 +VALIDATION_FAILED=3 + +# Ensure minimum arguments +[ "$#" -lt 3 ] && usage + +MODE="$1" +FILE_TO_SIGN="$3" + +# Ensure the target file exists +if [ ! -f "$FILE_TO_SIGN" ]; then + echo "Error: File '$FILE_TO_SIGN' not found." + exit $FILE_PROBLEM +fi + +# Use a very conservative umask throughout this script since we are dealing with sensitive things +umask 077 || { echo "Error: Failed to set 077 umask."; exit $FILE_PROBLEM; } + +# Create a restricted temporary directory and ensure cleanup on exit +TEMP_DIR=$(mktemp -d) || { echo "Error: Failed to create temporary directory."; exit $FILE_PROBLEM; } +trap 'rm -rf "$TEMP_DIR"' EXIT + +# Converts the SSH private key to OpenSSH format and generates a public key +convert_private_key() { + local input_key="$1" + local output_key="$2" + + echo "Converting SSH key to OpenSSH format..." 
+ cp "$input_key" "$output_key" || { echo "Error: Failed to copy $input_key to $output_key"; exit $FILE_PROBLEM; } + + # This saves the key in the default OpenSSH format (which is required for signing) + ssh-keygen -p -f "$output_key" -P "${KEY_PASSPHRASE:-}" -N "${KEY_PASSPHRASE:-}" || { + echo "Error: Failed to convert key to OpenSSH format." + exit $CONVERSION_FAILURE + } + + # Extract the public key from the private key + ssh-keygen -y -f "$input_key" -P "${KEY_PASSPHRASE:-}" > "${output_key}.pub" || { + echo "Error: Failed to extract public key." + exit $CONVERSION_FAILURE + } +} + +# Sign mode +if [ "$MODE" == "sign" ]; then + PRIVATE_KEY="$2" + TEMP_KEY="$TEMP_DIR/converted_key" + SIG_FILE="${FILE_TO_SIGN}.sig" + + # Check for key and existing signature + [ ! -f "$PRIVATE_KEY" ] && { echo "Error: Private key not found."; exit $FILE_PROBLEM; } + [ -f "$SIG_FILE" ] && { echo "Error: Signature already exists. Remove to re-sign."; exit $FILE_PROBLEM; } + + convert_private_key "$PRIVATE_KEY" "$TEMP_KEY" + + echo "Signing the file..." + ssh-keygen -Y sign -f "$TEMP_KEY" -P "${KEY_PASSPHRASE:-}" -n file "$FILE_TO_SIGN" + + [ ! -f "$SIG_FILE" ] && { echo "Error: Signing failed."; exit $FILE_PROBLEM; } + echo "Signature created: $SIG_FILE" + + cat </cfg. This file will be + sourced before running the bot/build.sh script. + + Args: + job_dir (string): working directory of the job + exportvars (list): strings of the form VAR=VALUE to be exported + + Returns: + None (implicitly) + """ + fn = sys._getframe().f_code.co_name + + content = '\n'.join(f'export {x}' for x in exportvars) + export_vars_path = os.path.join(job_dir, 'cfg', EXPORT_VARS_FILE) + + with open(export_vars_path, 'w') as file: + file.write(content) + + log(f"{fn}(): created exported variables file {export_vars_path}") + + def prepare_jobs(pr, cfg, event_info, action_filter): """ Prepare all jobs whose context matches the given filter. 
Preparation includes @@ -465,6 +562,7 @@ def prepare_jobs(pr, cfg, event_info, action_filter): build_env_cfg = get_build_env_cfg(cfg) arch_map = get_architecture_targets(cfg) repocfg = get_repo_cfg(cfg) + allowed_exportvars = get_allowed_exportvars(cfg) base_repo_name = pr.base.repo.full_name log(f"{fn}(): pr.base.repo.full_name '{base_repo_name}'") @@ -491,6 +589,16 @@ def prepare_jobs(pr, cfg, event_info, action_filter): log(f"{fn}(): found no accelerator requirement") accelerator = None + # determine exportvars from action_filter argument + exportvars = action_filter.get_filter_by_component(tools_filter.FILTER_COMPONENT_EXPORT) + + # all exportvar filters must be allowed in order to run any jobs + if exportvars: + not_allowed = [x for x in exportvars if x not in allowed_exportvars] + if not_allowed: + log(f"{fn}(): exportvariable(s) {not_allowed} not allowed") + return [] + jobs = [] for arch, slurm_opt in arch_map.items(): arch_dir = arch.replace('/', '_') @@ -545,6 +653,9 @@ def prepare_jobs(pr, cfg, event_info, action_filter): prepare_job_cfg(job_dir, build_env_cfg, repocfg, repo_id, cpu_target, os_type, accelerator) + if exportvars: + prepare_export_vars_file(job_dir, exportvars) + # enlist jobs to proceed job = Job(job_dir, arch, repo_id, slurm_opt, year_month, pr_id, accelerator) jobs.append(job) @@ -580,6 +691,7 @@ def prepare_job_cfg(job_dir, build_env_cfg, repos_cfg, repo_id, software_subdir, # repository's definition, some combine two values): # [site_config] # local_tmp = config.BUILDENV_SETTING_LOCAL_TMP + # site_config_script = config.BUILDENV_SETTING_SITE_CONFIG_SCRIPT # shared_fs_path = config.BUILDENV_SETTING_SHARED_FS_PATH # build_logs_dir = config.BUILDENV_SETTING_BUILD_LOGS_DIR # @@ -604,6 +716,7 @@ def prepare_job_cfg(job_dir, build_env_cfg, repos_cfg, repo_id, software_subdir, config.BUILDENV_SETTING_LOAD_MODULES: job_metadata.JOB_CFG_SITE_CONFIG_LOAD_MODULES, config.BUILDENV_SETTING_LOCAL_TMP: job_metadata.JOB_CFG_SITE_CONFIG_LOCAL_TMP, 
config.BUILDENV_SETTING_SHARED_FS_PATH: job_metadata.JOB_CFG_SITE_CONFIG_SHARED_FS_PATH, + config.BUILDENV_SETTING_SITE_CONFIG_SCRIPT: job_metadata.JOB_CFG_SITE_CONFIG_SITE_CONFIG_SCRIPT, } for build_env_key, job_cfg_key in build_env_to_job_cfg_keys.items(): if build_env_cfg[build_env_key]: @@ -709,13 +822,50 @@ def submit_job(job, cfg): job = job._replace(slurm_opts=det_submit_opts(job)) log(f"{fn}(): updated job.slurm_opts: {job.slurm_opts}") + build_job_script = build_env_cfg[config.BUILDENV_SETTING_BUILD_JOB_SCRIPT] + if isinstance(build_job_script, str): + build_job_script_path = build_job_script + log(f"{fn}(): path to build job script: {build_job_script_path}") + elif isinstance(build_job_script, dict): + build_job_script_repo = build_job_script.get('repo') + if build_job_script_repo: + log(f"{fn}(): repository in which build job script is located: {build_job_script_repo}") + else: + error(f"Failed to determine repository in which build job script is located from: {build_job_script}") + + build_job_script_path = build_job_script.get('path') + if build_job_script_path: + log(f"{fn}(): path to build job script in repository: {build_job_script_path}") + else: + error(f"Failed to determine path of build job script in repository from: {build_job_script}") + + # clone repo to temporary directory, and correctly set path to build job script + repo_subdir = build_job_script_repo.split('/')[-1] + if repo_subdir.endswith('.git'): + repo_subdir = repo_subdir[:-4] + target_dir = os.path.join(job.working_dir, repo_subdir) + os.makedirs(target_dir, exist_ok=True) + + clone_output, clone_error, clone_exit_code = clone_git_repo(build_job_script_repo, target_dir) + if clone_exit_code == 0: + log(f"{fn}(): repository {build_job_script_repo} cloned to {target_dir}") + else: + error(f"Failed to clone repository {build_job_script_repo}: {clone_error}") + + build_job_script_path = os.path.join(target_dir, build_job_script_path) + else: + error(f"Incorrect build job script 
specification, unknown type: {build_job_script}") + + if not os.path.exists(build_job_script_path): + error(f"Build job script not found at {build_job_script_path}") + command_line = ' '.join([ build_env_cfg[config.BUILDENV_SETTING_SUBMIT_COMMAND], build_env_cfg[config.BUILDENV_SETTING_SLURM_PARAMS], time_limit, job.slurm_opts] + ([f"--job-name='{job_name}'"] if job_name else []) + - [build_env_cfg[config.BUILDENV_SETTING_BUILD_JOB_SCRIPT]]) + [build_job_script_path]) cmdline_output, cmdline_error, cmdline_exit_code = run_cmd(command_line, "submit job for target '%s'" % job.arch_target, @@ -770,18 +920,44 @@ def create_pr_comment(job, job_id, app_name, pr, gh, symlink): dt = datetime.now(timezone.utc) # construct initial job comment - job_comment = (f"{submitted_job_comments_cfg[config.SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT]}" - f"\n|date|job status|comment|\n" - f"|----------|----------|------------------------|\n" - f"|{dt.strftime('%b %d %X %Z %Y')}|" - f"submitted|" - f"{submitted_job_comments_cfg[config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE]}|").format( - app_name=app_name, - arch_name=arch_name, - symlink=symlink, - repo_id=job.repo_id, - job_id=job_id, - accelerator_spec=accelerator_spec_str) + buildenv = config.read_config()[config.SECTION_BUILDENV] + job_handover_protocol = buildenv.get(config.BUILDENV_SETTING_JOB_HANDOVER_PROTOCOL) + if job_handover_protocol == config.JOB_HANDOVER_PROTOCOL_DELAYED_BEGIN: + release_msg_string = config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_DELAYED_BEGIN_MSG + release_comment_template = submitted_job_comments_cfg[release_msg_string] + # calculate delay from poll_interval and delay_factor + job_manager_cfg = config.read_config()[config.SECTION_JOB_MANAGER] + poll_interval = int(job_manager_cfg.get(config.JOB_MANAGER_SETTING_POLL_INTERVAL)) + delay_factor = float(buildenv.get(config.BUILDENV_SETTING_JOB_DELAY_BEGIN_FACTOR, 2)) + eligible_in_seconds = int(poll_interval * delay_factor) + job_comment = 
(f"{submitted_job_comments_cfg[config.SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT]}" + f"\n|date|job status|comment|\n" + f"|----------|----------|------------------------|\n" + f"|{dt.strftime('%b %d %X %Z %Y')}|" + f"submitted|" + f"{release_comment_template}|").format( + app_name=app_name, + arch_name=arch_name, + symlink=symlink, + repo_id=job.repo_id, + job_id=job_id, + delay_seconds=eligible_in_seconds, + accelerator_spec=accelerator_spec_str) + else: + release_msg_string = config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_HOLD_RELEASE_MSG + release_comment_template = submitted_job_comments_cfg[release_msg_string] + job_comment = (f"{submitted_job_comments_cfg[config.SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT]}" + f"\n|date|job status|comment|\n" + f"|----------|----------|------------------------|\n" + f"|{dt.strftime('%b %d %X %Z %Y')}|" + f"submitted|" + f"{release_comment_template}|").format( + app_name=app_name, + arch_name=arch_name, + symlink=symlink, + repo_id=job.repo_id, + job_id=job_id, + accelerator_spec=accelerator_spec_str) # create comment to pull request repo_name = pr.base.repo.full_name diff --git a/tasks/deploy.py b/tasks/deploy.py index 32e7705f..2d36d24e 100644 --- a/tasks/deploy.py +++ b/tasks/deploy.py @@ -265,6 +265,12 @@ def upload_artefact(job_dir, payload, timestamp, repo_name, pr_number, pr_commen bucket_spec = deploycfg.get(config.DEPLOYCFG_SETTING_BUCKET_NAME) metadata_prefix = deploycfg.get(config.DEPLOYCFG_SETTING_METADATA_PREFIX) artefact_prefix = deploycfg.get(config.DEPLOYCFG_SETTING_ARTEFACT_PREFIX) + signing_str = deploycfg.get(config.DEPLOYCFG_SETTING_SIGNING) or '' + try: + signing = json.loads(signing_str) + except json.decoder.JSONDecodeError: + signing = {} + log(f"{funcname}(): error initialising signing from ({signing_str})") # if bucket_spec value looks like a dict, try parsing it as such if bucket_spec.lstrip().startswith('{'): @@ -334,11 +340,29 @@ def upload_artefact(job_dir, payload, timestamp, 
repo_name, pr_number, pr_commen return # run 'eessi-upload-to-staging {abs_path}' + # (1) construct command line + # (2) setup container environment (for signing artefacts ...) if needed + # (3) run command + # (1) construct command line # script assumes a few defaults: # bucket_name = 'eessi-staging' # if endpoint_url not set use EESSI S3 bucket - # (2) run command + do_signing = signing and target_repo_id in signing + sign_args = [] + if do_signing: + sign_key_str = signing[target_repo_id][config.DEPLOYCFG_SETTING_SIGNING_KEY] + sign_key_path = os.path.abspath(sign_key_str) + sign_args.extend(['--sign-key', sign_key_path]) + sign_script_str = signing[target_repo_id][config.DEPLOYCFG_SETTING_SIGNING_SCRIPT] + # if script begins not with '/', assume its location is relative to the job directory + # (that's because the script is provided by the target repository) + if sign_script_str.startswith('/'): + sign_script_path = os.path.abspath(sign_script_str) + else: + sign_script_path = os.path.abspath(os.path.join(job_dir, sign_script_str)) + sign_args.extend(['--sign-script', sign_script_path]) + cmd_args = [artefact_upload_script, ] if len(artefact_prefix_arg) > 0: cmd_args.extend(['--artefact-prefix', artefact_prefix_arg]) @@ -351,11 +375,61 @@ def upload_artefact(job_dir, payload, timestamp, repo_name, pr_number, pr_commen cmd_args.extend(['--pr-comment-id', str(pr_comment_id)]) cmd_args.extend(['--pull-request-number', str(pr_number)]) cmd_args.extend(['--repository', repo_name]) + cmd_args.extend(sign_args) cmd_args.append(abs_path) - upload_cmd = ' '.join(cmd_args) - # run_cmd does all the logging we might need - out, err, ec = run_cmd(upload_cmd, 'Upload artefact to S3 bucket', raise_on_error=False) + # (2) setup container environment (for signing artefacts ...) 
if needed + # determine container to run (from job.cfg) + # determine container cache dir (from job.cfg) + # setup directory for temporary container storage (previous_tmp/upload_step) + # define miscellaneous args (--home ...) + run_in_container = ( + do_signing and + config.DEPLOYCFG_SETTING_SIGNING_CONTAINER_RUNTIME in signing[target_repo_id] + ) + container_cmd = [] + my_env = {} + if run_in_container: + container = jobcfg[job_metadata.JOB_CFG_REPOSITORY_SECTION][job_metadata.JOB_CFG_REPOSITORY_CONTAINER] + cachedir = jobcfg[job_metadata.JOB_CFG_SITE_CONFIG_SECTION][job_metadata.JOB_CFG_SITE_CONFIG_CONTAINER_CACHEDIR] + upload_tmp_dir = os.path.join(job_dir, job_metadata.JOB_CFG_PREVIOUS_TMP, job_metadata.JOB_CFG_UPLOAD_STEP) + os.makedirs(upload_tmp_dir, exist_ok=True) + container_runtime = signing[target_repo_id][config.DEPLOYCFG_SETTING_SIGNING_CONTAINER_RUNTIME] + + # determine (additional) bind mounts from paths used to call upload script and its arguments + # - assumes that all paths begin with '/' + bind_mounts = set() + # first add parent of job_dir and real path of the parent + job_parent_dir = os.path.dirname(job_dir) + bind_mounts.add(job_parent_dir) + real_job_parent_dir = os.path.realpath(job_parent_dir) + if job_parent_dir != real_job_parent_dir: + bind_mounts.add(real_job_parent_dir) + # now, process all args that begin with '/' + for arg in cmd_args: + if arg.startswith('/'): + arg_dir = os.path.dirname(arg) + bind_mounts.add(arg_dir) + # also, determine the real path for arg_dir and add it if it's different to arg_dir + real_dir = os.path.realpath(arg_dir) + if arg_dir != real_dir: + bind_mounts.add(real_dir) + + container_cmd = [container_runtime, ] + container_cmd.extend(['exec']) + # avoid that $HOME 'leaks' in due to system settings + container_cmd.extend(['--no-home']) + for bind in bind_mounts: + container_cmd.extend(['--bind', bind]) + container_cmd.extend([container]) + my_env = { + 'SINGULARITY_CACHEDIR': cachedir, + 
'SINGULARITY_TMPDIR': upload_tmp_dir + } + + cmd_and_args = ' '.join(container_cmd + cmd_args) + log(f"command to launch upload script: {cmd_and_args}") + out, err, ec = run_cmd(cmd_and_args, 'Upload artefact to S3 bucket', raise_on_error=False, env=my_env) if ec == 0: # add file to 'job_dir/../uploaded.txt' diff --git a/tests/test_app.cfg b/tests/test_app.cfg index 43e11bf9..84161ba0 100644 --- a/tests/test_app.cfg +++ b/tests/test_app.cfg @@ -12,12 +12,15 @@ # sample config file for tests (some functions run config.read_config() # which reads app.cfg by default) [buildenv] +job_handover_protocol = hold_release [job_manager] # variable 'comment' under 'submitted_job_comments' should not be changed as there are regular expression patterns matching it [submitted_job_comments] awaits_release = job id `{job_id}` awaits release by job manager +awaits_release_delayed_begin_msg = job id `{job_id}` will be eligible to start in about {delay_seconds} seconds +awaits_release_hold_release_msg = job id `{job_id}` awaits release by job manager initial_comment = New job on instance `{app_name}` for CPU micro-architecture `{arch_name}`{accelerator_spec} for repository `{repo_id}` in job dir `{symlink}` with_accelerator =  and accelerator `{accelerator}` diff --git a/tools/__init__.py b/tools/__init__.py index 640cae17..0e7d2028 100644 --- a/tools/__init__.py +++ b/tools/__init__.py @@ -23,7 +23,7 @@ # TODO do we really need two functions (run_cmd and run_subprocess) for # running a command? -def run_cmd(cmd, log_msg='', working_dir=None, log_file=None, raise_on_error=True): +def run_cmd(cmd, log_msg='', working_dir=None, log_file=None, raise_on_error=True, env=None): """ Runs a command in the shell and raises an error if one occurs. 
@@ -33,6 +33,7 @@ def run_cmd(cmd, log_msg='', working_dir=None, log_file=None, raise_on_error=Tru working_dir (string): location of the job's working directory log_file (string): path to log file raise_on_error (bool): if True raise an exception in case of error + env (dict): environment settings for running the command Returns: tuple of 3 elements containing @@ -45,7 +46,7 @@ def run_cmd(cmd, log_msg='', working_dir=None, log_file=None, raise_on_error=Tru raise_on_error is True """ # TODO use common method for logging function name in log messages - stdout, stderr, exit_code = run_subprocess(cmd, log_msg, working_dir, log_file) + stdout, stderr, exit_code = run_subprocess(cmd, log_msg, working_dir, log_file, env) if exit_code != 0: error_msg = ( @@ -66,7 +67,7 @@ def run_cmd(cmd, log_msg='', working_dir=None, log_file=None, raise_on_error=Tru return stdout, stderr, exit_code -def run_subprocess(cmd, log_msg, working_dir, log_file): +def run_subprocess(cmd, log_msg='', working_dir=None, log_file=None, env=None): """ Runs a command in the shell. No error is raised if the command fails. 
@@ -75,6 +76,7 @@ def run_subprocess(cmd, log_msg, working_dir, log_file): log_msg (string): purpose of the command working_dir (string): location of the job's working directory log_file (string): path to log file + env (dict): environment settings for running the command Returns: tuple of 3 elements containing @@ -91,7 +93,12 @@ def run_subprocess(cmd, log_msg, working_dir, log_file): else: log(f"run_subprocess(): Running '{cmd}' in directory '{working_dir}'", log_file=log_file) + my_env = os.environ.copy() + if env is not None: + my_env.update(env) + result = subprocess.run(cmd, + env=my_env, cwd=working_dir, shell=True, encoding="UTF-8", diff --git a/tools/config.py b/tools/config.py index ff641ebb..5d0c6a7e 100644 --- a/tools/config.py +++ b/tools/config.py @@ -25,7 +25,7 @@ # Local application imports (anything from EESSI/eessi-bot-software-layer) from .logging import error -# define configration constants +# define configuration constants # SECTION_sectionname for any section name in app.cfg # sectionname_SETTING_settingname for any setting with name settingname in # section sectionname @@ -37,6 +37,7 @@ BOT_CONTROL_SETTING_COMMAND_RESPONSE_FMT = 'command_response_fmt' SECTION_BUILDENV = 'buildenv' +BUILDENV_SETTING_ALLOWED_EXPORTVARS = 'allowed_exportvars' BUILDENV_SETTING_ALLOW_UPDATE_SUBMIT_OPTS = 'allow_update_submit_opts' BUILDENV_SETTING_BUILD_JOB_SCRIPT = 'build_job_script' BUILDENV_SETTING_BUILD_LOGS_DIR = 'build_logs_dir' @@ -45,12 +46,15 @@ BUILDENV_SETTING_CVMFS_CUSTOMIZATIONS = 'cvmfs_customizations' BUILDENV_SETTING_HTTPS_PROXY = 'https_proxy' BUILDENV_SETTING_HTTP_PROXY = 'http_proxy' +BUILDENV_SETTING_JOB_DELAY_BEGIN_FACTOR = 'job_delay_begin_factor' +BUILDENV_SETTING_JOB_HANDOVER_PROTOCOL = 'job_handover_protocol' BUILDENV_SETTING_JOB_NAME = 'job_name' BUILDENV_SETTING_JOBS_BASE_DIR = 'jobs_base_dir' BUILDENV_SETTING_LOAD_MODULES = 'load_modules' BUILDENV_SETTING_LOCAL_TMP = 'local_tmp' BUILDENV_SETTING_NO_BUILD_PERMISSION_COMMENT = 
'no_build_permission_comment' BUILDENV_SETTING_SHARED_FS_PATH = 'shared_fs_path' +BUILDENV_SETTING_SITE_CONFIG_SCRIPT = 'site_config_script' BUILDENV_SETTING_SLURM_PARAMS = 'slurm_params' BUILDENV_SETTING_SUBMIT_COMMAND = 'submit_command' @@ -62,6 +66,10 @@ DEPLOYCFG_SETTING_ENDPOINT_URL = 'endpoint_url' DEPLOYCFG_SETTING_METADATA_PREFIX = 'metadata_prefix' DEPLOYCFG_SETTING_NO_DEPLOY_PERMISSION_COMMENT = 'no_deploy_permission_comment' +DEPLOYCFG_SETTING_SIGNING = 'signing' +DEPLOYCFG_SETTING_SIGNING_CONTAINER_RUNTIME = 'container_runtime' +DEPLOYCFG_SETTING_SIGNING_KEY = 'key' +DEPLOYCFG_SETTING_SIGNING_SCRIPT = 'script' DEPLOYCFG_SETTING_UPLOAD_POLICY = 'upload_policy' SECTION_DOWNLOAD_PR_COMMENTS = 'download_pr_comments' @@ -105,7 +113,10 @@ RUNNING_JOB_COMMENTS_SETTING_RUNNING_JOB = 'running_job' SECTION_SUBMITTED_JOB_COMMENTS = 'submitted_job_comments' +# SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE is DEPRECATED SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE = 'awaits_release' +SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_DELAYED_BEGIN_MSG = 'awaits_release_delayed_begin_msg' +SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_HOLD_RELEASE_MSG = 'awaits_release_hold_release_msg' SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT = 'initial_comment' SUBMITTED_JOB_COMMENTS_SETTING_WITH_ACCELERATOR = 'with_accelerator' @@ -113,6 +124,14 @@ CLEAN_UP_SETTING_TRASH_BIN_ROOT_DIR = 'trash_bin_dir' CLEAN_UP_SETTING_MOVED_JOB_DIRS_COMMENT = 'moved_job_dirs_comment' +# definition of values +JOB_HANDOVER_PROTOCOL_DELAYED_BEGIN = 'delayed_begin' +JOB_HANDOVER_PROTOCOL_HOLD_RELEASE = 'hold_release' +JOB_HANDOVER_PROTOCOLS_SET = { + JOB_HANDOVER_PROTOCOL_DELAYED_BEGIN, + JOB_HANDOVER_PROTOCOL_HOLD_RELEASE +} + def read_config(path='app.cfg'): """ diff --git a/tools/filter.py b/tools/filter.py index 8b14f5eb..0caa2af8 100644 --- a/tools/filter.py +++ b/tools/filter.py @@ -20,16 +20,18 @@ # (none yet) -# NOTE because one can use any prefix of one of the four components below to +# 
NOTE because one can use any prefix of one of the components below to # define a filter, we need to make sure that no two filters share the same # prefix OR we have to change the handling of filters. FILTER_COMPONENT_ACCEL = 'accelerator' FILTER_COMPONENT_ARCH = 'architecture' +FILTER_COMPONENT_EXPORT = 'exportvariable' FILTER_COMPONENT_INST = 'instance' FILTER_COMPONENT_JOB = 'job' FILTER_COMPONENT_REPO = 'repository' FILTER_COMPONENTS = [FILTER_COMPONENT_ACCEL, FILTER_COMPONENT_ARCH, + FILTER_COMPONENT_EXPORT, FILTER_COMPONENT_INST, FILTER_COMPONENT_JOB, FILTER_COMPONENT_REPO diff --git a/tools/job_metadata.py b/tools/job_metadata.py index d4000199..7b7b8d0a 100644 --- a/tools/job_metadata.py +++ b/tools/job_metadata.py @@ -29,6 +29,10 @@ JOB_CFG_DIRECTORY_NAME = "cfg" JOB_CFG_FILENAME = "job.cfg" +# job previous_tmp directory and sub directories +JOB_CFG_PREVIOUS_TMP = "previous_tmp" +JOB_CFG_UPLOAD_STEP = "upload_step" + # JWD/cfg/$JOB_CFG_FILENAME JOB_CFG_ARCHITECTURE_SECTION = "architecture" JOB_CFG_ARCHITECTURE_OS_TYPE = "os_type" @@ -50,6 +54,7 @@ JOB_CFG_SITE_CONFIG_LOAD_MODULES = "load_modules" JOB_CFG_SITE_CONFIG_LOCAL_TMP = "local_tmp" JOB_CFG_SITE_CONFIG_SHARED_FS_PATH = "shared_fs_path" +JOB_CFG_SITE_CONFIG_SITE_CONFIG_SCRIPT = "site_config_script" # JWD/_bot_jobJOBID.metadata JOB_PR_SECTION = "PR"