diff --git a/.github/workflows/build_smee_client_container_image.yaml b/.github/workflows/build_smee_client_container_image.yaml new file mode 100644 index 00000000..a42d1e83 --- /dev/null +++ b/.github/workflows/build_smee_client_container_image.yaml @@ -0,0 +1,64 @@ +name: Build and publish a Smee client container image + +on: + push: + paths: + - containers/Dockerfile.smee-client + - .github/workflows/build_smee_client_container_image.yaml + pull_request: + paths: + - containers/Dockerfile.smee-client + - .github/workflows/build_smee_client_container_image.yaml + +# Declare default permissions as read only. +permissions: read-all + +jobs: + docker_build_smee_client: + runs-on: ubuntu-latest + permissions: + packages: write + steps: + - name: Check out the repo + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + + - name: Login to GitHub Container Registry + if: github.event_name != 'pull_request' + uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Convert and store repository owner in lowercase, replace colon in tag names by hyphen + run: | + echo REPOSITORY_OWNER=$(echo ${{ github.repository_owner }} | tr '[:upper:]' '[:lower:]') >> $GITHUB_ENV + + - name: Set up QEMU + uses: docker/setup-qemu-action@29109295f81e9208d7d86ff1c6c12d2833863392 # v3.6.0 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0 + + - name: Cache Docker layers + uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3 + with: + path: ${{ runner.temp }}/.buildx-cache + key: ${{ runner.os }}-buildx-${{ github.sha }} + restore-keys: | + ${{ runner.os }}-buildx- + + - name: Build and push + uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # v6.18.0 + with: + tags: ghcr.io/${{ env.REPOSITORY_OWNER }}/smee-client + 
platforms: linux/amd64,linux/arm64 + push: ${{ github.event_name != 'pull_request' }} # don't publish if this is part of an open PR + file: containers/Dockerfile.smee-client + cache-from: type=local,src=${{ runner.temp }}/.buildx-cache + cache-to: type=local,dest=${{ runner.temp }}/.buildx-cache-new,mode=max + + - name: Move cache + run: | + rm -rf ${{ runner.temp }}/.buildx-cache + mv ${{ runner.temp }}/.buildx-cache-new ${{ runner.temp }}/.buildx-cache diff --git a/.github/workflows/markdown-lint.yml b/.github/workflows/markdown-lint.yml new file mode 100644 index 00000000..c9b204b3 --- /dev/null +++ b/.github/workflows/markdown-lint.yml @@ -0,0 +1,33 @@ +# This file is part of the EESSI build-and-deploy bot, +# see https://github.com/EESSI/eessi-bot-software-layer +# +# The bot helps with requests to add software installations to the +# EESSI software layer, see https://github.com/EESSI/software-layer +# +# author: Thomas Roeblitz (@trz42) +# +# license: GPLv2 +# + +name: Markdown Lint +on: [push, pull_request] +# Declare default permissions as read only. +permissions: read-all + +jobs: + markdown-lint: + runs-on: ubuntu-24.04 + steps: + - name: Checkout + uses: actions/checkout@93ea575cb5d8a053eaa0ac8fa3b40d7e05a33cc8 # v3.1.0 + + - name: Setup Node.js + uses: actions/setup-node@60edb5dd545a775178f52524783378180af0d1f8 # v4.0.0 + with: + node-version: '18' + + - name: Install markdownlint-cli + run: npm install -g markdownlint-cli + + - name: Run markdownlint + run: markdownlint "**/*.md" --ignore .git \ No newline at end of file diff --git a/.github/workflows/scorecards.yml b/.github/workflows/scorecards.yml index 96379ba1..a0025b23 100644 --- a/.github/workflows/scorecards.yml +++ b/.github/workflows/scorecards.yml @@ -17,10 +17,9 @@ on: schedule: - cron: '25 15 * * 3' push: - branches: [ "main" ] + branches: [ "main", "develop" ] pull_request: - branches: - - main + branches: [ "main", "develop" ] # Declare default permissions as read only. 
permissions: read-all @@ -67,7 +66,7 @@ jobs: # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF # format to the repository Actions tab. - name: "Upload artifact" - uses: actions/upload-artifact@3cea5372237819ed00197afe530f5a7ea3e805c8 # v3.1.0 + uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3 with: name: SARIF file path: results.sarif @@ -75,6 +74,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@807578363a7869ca324a79039e6db9c843e0e100 # v2.1.27 + uses: github/codeql-action/upload-sarif@9fdb3e49720b44c48891d036bb502feb25684276 # v3.25.6 with: sarif_file: results.sarif diff --git a/.markdownlint.json b/.markdownlint.json new file mode 100644 index 00000000..79ce1120 --- /dev/null +++ b/.markdownlint.json @@ -0,0 +1,10 @@ +{ + "default": true, + "MD013": false, + "MD033": false, + "MD041": false, + "MD024": false, + "MD026": { + "punctuation": ".,;:!" + } +} \ No newline at end of file diff --git a/README.md b/README.md index 77c9cdd1..8aa1b931 100644 --- a/README.md +++ b/README.md @@ -18,114 +18,147 @@ The bot consists of two main components provided in this repository: ## Prerequisites -- GitHub account(s) (two needed for a development scenario), referring to them - as `YOU_1` and `YOU_2` below -- A fork, say `YOU_1/software-layer`, of - [EESSI/software-layer](https://github.com/EESSI/software-layer) and a fork, - say `YOU_2/software-layer` of your first fork if you want to emulate the - bot's behaviour but not change EESSI's repository. The EESSI bot will act on - events triggered for the target repository (in this context, either - `EESSI/software-layer` or `YOU_1/software-layer`). +- GitHub account, say `GH_ACCOUNT` +- A fork, say `GH_ACCOUNT/software-layer`, of + [EESSI/software-layer](https://github.com/EESSI/software-layer). 
The EESSI bot will act on + events triggered for a repository its corresponding GitHub App was installed into. + To install the GitHub App into a repository, the GitHub App needs to be + configured such that it can be installed into any repository or all + repositories belonging to an account/organisation and the installer + (account/person who performs the "installation") has permissions to perform the + installation. - Access to a frontend/login node/service node of a Slurm cluster where the EESSI bot components will run. For the sake of brevity, we call this node simply `bot machine`. - `singularity` with version 3.6 or newer _OR_ `apptainer` with version 1.0 or newer on the compute nodes of the Slurm cluster. -- On the cluster frontend (or where the bot components run), different tools - may be needed to run the Smee client. For `x86_64`, `singularity` or - `apptainer` are sufficient. For `aarch64`, the package manager `npm` is - needed. +- On the `bot machine`, different tools may be needed to run the Smee client. + The Smee client is available via a docker container and can be run with + `singularity` or `apptainer`. Alternatively, the package manager `npm` may be + used to install the Smee client. Running via the EESSI-built container is + preferred. - The EESSI bot components and the (build) jobs will frequently access the Internet. Hence, worker nodes and the `bot machine` of the Slurm cluster need access to the Internet (either directly or via an HTTP proxy). -## Step 1: Smee.io channel and smee client +## Step 1: Relaying events via Smee -We use [smee.io](https://smee.io) as a service to relay events from GitHub -to the EESSI bot. To do so, create a new channel via https://smee.io and note +### Step 1a: Create a Smee channel for your own/test scenario + +_EESSI uses specific Smee channels. 
Access to them is restricted for +EESSI-internal use._ +For development and testing purposes, one can use [smee.io](https://smee.io) as a service to relay events from GitHub +to the EESSI bot. To do so, create a new channel via [smee.io](https://smee.io) and note the URL, e.g., `https://smee.io/CHANNEL-ID`. -On the `bot machine` we need a tool which receives events relayed from -`https://smee.io/CHANNEL-ID` and forwards it to the EESSI bot. We use the Smee -client for this. +### Step 1b: Install Smee client on `bot machine` -On machines with `x86_64` architecture, the Smee client can be run via a -container as follows +On the `bot machine` we need a tool (the Smee client) which receives events relayed from +`https://smee.io/CHANNEL-ID` and forwards them to the EESSI bot event handler. -``` -singularity pull docker://deltaprojects/smee-client -singularity run smee-client_latest.sif --url https://smee.io/CHANNEL-ID +NOTE, both options below rely on software (the Smee client) that is provided by +3rd parties. Use any of these options at your own risk! + +#### EESSI-built container for Smee client (PREFERRED OPTION) + +The Smee client can be run via a container as follows + +```bash +apptainer run docker://ghcr.io/eessi/smee-client:latest --url https://smee.io/CHANNEL-ID ``` or -``` -singularity pull docker://deltaprojects/smee-client -singularity run smee-client_latest.sif --port 3030 --url https://smee.io/CHANNEL-ID +```bash +apptainer run docker://ghcr.io/eessi/smee-client:latest --url https://smee.io/CHANNEL-ID --port 3030 ``` for specifying a different port than the default (3000). 
-On machines with `aarch64` architecture, we can install the the smee client via -the `npm` package manager as follows +#### Use Node.js-based Smee client (alternative option) -``` +The Smee client can be installed via the package manager `npm` as follows + +```bash npm install smee-client ``` -and then running it with the default port (3000) +and then running it with -``` +```bash node_modules/smee-client/bin/smee.js --url https://smee.io/CHANNEL-ID ``` -Another port can be used by adding the `--port PORT` argument, for example, +Another port can be used by adding the `--port PORT` argument. This can be particularly useful if you have multiple bot instances running on the same cluster, in which case you'd want a different port for each. As an example, one could use the non-default port 3030 in this way: -``` -node_modules/smee-client/bin/smee.js --port 3030 --url https://smee.io/CHANNEL-ID +```bash +node_modules/smee-client/bin/smee.js --url https://smee.io/CHANNEL-ID --port 3030 ``` -## Step 2: Registering GitHub App +## Step 2: Registering a GitHub App We need to: -* register a GitHub App; -* link it to the `smee.io` channel; -* set a secret token to verify the webhook sender; -* set some permissions for the GitHub app; -* subscribe the GitHub app to selected events; -* define that this GitHub app should only be installed in your GitHub account (or organisation). 
- -At the [app settings page](https://github.com/settings/apps) click "`New GitHub App`" and fill in the page, in particular the following fields: -- GitHub App name: give the app a name of you choice -- Homepage URL: use the Smee.io channel (`https://smee.io/CHANNEL-ID`) created in [Step 1](#step1) -- Webhook URL: use the Smee.io channel (`https://smee.io/CHANNEL-ID`) created in [Step 1](#step1) -- Webhook secret: create a secret token which is used to verify the webhook sender, for example using: + +- register a GitHub App +- link it to the `smee.io` channel +- set a secret token used by GitHub to sign webhooks and used by the EESSI bot to + verify that a received event originates from GitHub +- set some permissions for the GitHub app +- subscribe the GitHub app to selected events +- generate a private key (via GitHub GUI) + +At the [app settings page](https://github.com/settings/apps) click New GitHub App and fill in the page, in particular the following fields: + +- **GitHub App name**: give the app a name of your choice +- **Homepage URL**: can use the Smee.io channel (`https://smee.io/CHANNEL-ID`) created in [Step 1](#step1) +- **Webhook URL**: MUST use the Smee.io channel (`https://smee.io/CHANNEL-ID`) created in [Step 1](#step1) +- **Secret**: create a secret token which is used to verify the webhook sender, for example using: + ```shell python3 -c 'import secrets; print(secrets.token_hex(64))' ``` -- Permissions: assign the required permissions to the app (e.g., read access to commits, issues, pull requests); - - Make sure to assign read and write access to the Pull requests and Issues in "Repository permissions" section; these permisions can be changed later on; - - Make sure to accept the new permissions from the "Install App" section that you can reach via the menu on the left hand side. 
- - Then select the wheel right next to your installed app, or use the link `https://github.com/settings/installations/INSTALLATION_ID` - - Once the page is open you will be able to accept the new permissions there. - - Some permissions (e.g., metadata) will be selected automatically because of others you have chosen. -- Events: subscribe the app to events it shall react on (e.g., related to pull requests and comments) -- Select that the app can only be installed by this (your) GitHub account or organisation. +- **Permissions**: assign the required permissions to the app + - Under "Repository permissions" assign "Read and write" for both "Issues" and + "Pull requests" + + > [!NOTE] + > "Read and write" permissions to "Pull requests" gives the bot powerful + > means to _mess_ with your pull requests. Unfortunately, there is currently no way + > around this or the bot could not create comments in pull requests. + +- **Subscribe to events**: subscribe the app to events it shall react on + - Select "Issue comment" and "Pull request" (Note, they may only be selectable + after the required _Permissions_ have been chosen above.) +- **Where can this GitHub App be installed?** + - Select "Only on this account" + +Click on Create GitHub App to create the app, then generate a private key +(see below). -Click on "`Create GitHub App`" to complete this step. +### Generate private key -## Step 3: Installing GitHub App +After clicking Create GitHub App you will be informed with a banner +to generate a private key. You can follow the link in the banner or simply +scroll down to the section "Private keys" -_Note, this will trigger the first event (`installation`). While the EESSI bot is not running yet, you can inspect this via the webpage for your Smee channel. Just open `https://smee.io/CHANNEL-ID` in a browser, and browse through the information included in the event. 
Naturally, some of the information will be different for other types of events._ +Generate the private key (it is downloaded automatically) and note the SHA256 string (to +more easily identify the key later on). + +## Step 3: Installing the GitHub App into a repository + +> [!NOTE] +> This will trigger the first event (`installation`). While the EESSI bot is not running yet, you can inspect this via the webpage for your Smee channel. Just open `https://smee.io/CHANNEL-ID` in a browser, and browse through the information included in the event. Naturally, some of the information will be different for other types of events. -You also need to *install* the GitHub App -- essentially telling GitHub to link the app to an account and one, several, or all repositories on whose events the app then should act upon. +You also need to _install_ the GitHub App -- essentially telling GitHub for which +repositories it should send events. -Go to https://github.com/settings/apps and select the app you want to install by clicking on the icon left to the app's name or on the "`Edit`" button right next to the name of the app. +Go to [https://github.com/settings/apps/**APP_NAME**](https://github.com/settings/apps/**APP_NAME**) and select the menu item +**Install App** on the left-hand side. -On the next page you should see the menu item "`Install App`" on the left-hand side. When you click on this you should see a page with a list of accounts and organisations you can install the app on. Choose one and click on the "`Install`" button next to it. +On the next page you should see a list of accounts and organisations you can install the app on. Choose one and click on the Install button next to it. -This leads to a page where you can select the repositories on whose the app should react to. Here, for the sake of simplicity, choose just `YOU_1/software-layer` as described in the [prerequisites](#prerequisites). Select one, multiple, or all and click on the "`Install`" button. 
+This leads to a page where you can select the repositories whose events the app should react to. Here, for the sake of simplicity, choose "Only select repositories", then open the pull-down menu named "Select repositories" and in there select `GH_ACCOUNT/software-layer` (`GH_ACCOUNT` is the GitHub account mentioned in section [prerequisites](#prerequisites)). Finally, click on the Install button. ## Step 4: Installing the EESSI bot on a `bot machine` @@ -133,22 +166,26 @@ The EESSI bot for the software layer is available from [EESSI/eessi-bot-software Get the EESSI bot _installed_ onto the `bot machine` by running something like -``` +```bash git clone https://github.com/EESSI/eessi-bot-software-layer.git ``` + Determine the full path to bot directory: -``` + +```bash cd eessi-bot-software-layer pwd ``` -Note the output of `pwd`. This will be used to replace `PATH_TO_EESSI_BOT` in the + +Take note of the output of `pwd`. This will be used to replace `PATH_TO_EESSI_BOT` in the configuration file `app.cfg` (see [Step 5.4](#step5.4)). In the remainder of this page we will refer to this directory as `PATH_TO_EESSI_BOT`. If you want to develop the EESSI bot, it is recommended that you fork the [EESSI/eessi-bot-software-layer](https://github.com/EESSI/eessi-bot-software-layer) repository and use the fork on the `bot machine`. If you want to work with a specific pull request for the bot, say number 42, you can obtain the corresponding code with the following commands: -``` + +```bash git clone https://github.com/EESSI/eessi-bot-software-layer.git cd eessi-bot-software-layer pwd @@ -157,7 +194,8 @@ git checkout PR42 ``` The EESSI bot requires some Python packages to be installed, which are specified in the [`requirements.txt`](https://github.com/EESSI/eessi-bot-software-layer/tree/main/requirements.txt) file. It is recommended to install these in a virtual environment based on Python 3.7 or newer. 
See the commands below for an example on how to set up the virtual environment, activate it, and install the requirements for the EESSI bot. These commands assume that you are in the `eessi-bot-software-layer` directory: -``` + +```bash # assumption here is that you start from *within* the eessi-bot-software-layer directory cd .. python3.7 -m venv venv_eessi_bot_p37 @@ -171,7 +209,7 @@ pip install -r requirements.txt Note, before you can start the bot components (see below), you have to activate the virtual environment with `source venv_eessi_bot_p37/bin/activate`. -You can exit the virtual environment simply by running `deactivate`. +You can exit the virtual environment by running `deactivate`. ### Step 4.1: Installing tools to access S3 bucket @@ -180,8 +218,9 @@ The script uploads an artefact and an associated metadata file to an S3 bucket. It needs two tools for this: -* the `aws` command to actually upload the files; -* the `jq` command to create the metadata file. + +- the `aws` command to actually upload the files; +- the `jq` command to create the metadata file. This section describes how these tools are installed and configured on the `bot machine`. @@ -189,7 +228,7 @@ This section describes how these tools are installed and configured on the `bot Create a new directory, say `PATH_TO_EESSI_BOT/tools` and change into it. -``` +```bash mkdir PATH_TO_EESSI_BOT/tools cd PATH_TO_EESSI_BOT/tools ``` @@ -217,19 +256,24 @@ Next, install the tool `jq` into the same directory into which `aws` was installed in (for example `PATH_TO_EESSI_BOT/tools`). Download `jq` from `https://github.com/stedolan/jq/releases` into that directory by running, for example, -``` + +```bash cd PATH_TO_EESSI_BOT/tools curl https://github.com/stedolan/jq/releases/download/jq-1.6/jq-linux64 -o jq-linux64 ``` + You may check if there are newer releases and choose a different package depending on your operating system. 
Update the permissions of the downloaded tool (`jq-linux64` for the above `curl` example) with -``` + +```bash chmod +x jq-linux64 ``` + Finally, create a symbolic link for `jq` by running -``` + +```bash ln -s jq-linux64 jq ``` @@ -238,21 +282,23 @@ Check that the `jq` command works by running `jq --version`. ## Step 5: Configuring the EESSI bot on the `bot machine` For the event handler, you need to set up two environment variables: -* `$GITHUB_TOKEN` (see [Step 5.1](#step5.1)) -* `$GITHUB_APP_SECRET_TOKEN` (see [Step 5.2](#step5.2)). + +- `$GITHUB_TOKEN` (see [Step 5.1](#step5.1)) +- `$GITHUB_APP_SECRET_TOKEN` (see [Step 5.2](#step5.2)). For both the event handler and the job manager you need a private key (see [Step 5.3](#step5.3)). ### Step 5.1: GitHub Personal Access Token (PAT) -Create a Personal Access Token (PAT) for your GitHub account via the page https://github.com/settings/tokens where you find a button "`Generate new token`". +Create a Personal Access Token (PAT) for your GitHub account via the page [https://github.com/settings/tokens](https://github.com/settings/tokens) where you find a button Generate new token. -Give it meaningful name (field titled "`Note`"), and set the expiration date. Then select the scopes this PAT will be used for. Then click "`Generate token`". +Give it a meaningful name in the field titled **Note**, and set the expiration date. Then select the scopes this PAT will be used for. Finally, click Generate token. On the result page, take note/copy the resulting token string -- it will only be shown once. On the `bot machine` set the environment variable `$GITHUB_TOKEN`: -``` + +```bash export GITHUB_TOKEN='THE_TOKEN_STRING' ``` @@ -263,79 +309,120 @@ in which you replace `THE_TOKEN_STRING` with the actual token. The GitHub App Secret Token is used to verify the webhook sender. You should have created one already when registering a new GitHub App in [Step 2](#step2). 
On the `bot machine` set the environment variable `$GITHUB_APP_SECTRET_TOKEN`: -``` + +```bash export GITHUB_APP_SECRET_TOKEN='THE_SECRET_TOKEN_STRING' ``` -in which you replace `THE_SECRET_TOKEN_STRING` with the actual token. +in which you replace `THE_SECRET_TOKEN_STRING` with the secret token you have created in [Step 2](#step2). -Note that depending on the characters used in the string you will likely have to use *single quotes* (`'...'`) when setting the value of the environment variable. +Note that depending on the characters used in the string you will likely have to use _single quotes_ (`'...'`) when setting the value of the environment variable. ### Step 5.3: Create a private key and store it on the `bot machine` -The private key is needed to let the app authenticate when updating information at the repository such as commenting on PRs, adding labels, etc. You can create the key at the page of the GitHub App you have registered in [Step 2](#step2). - -Open the page https://github.com/settings/apps and then click on the icon left to the name of the GitHub App for the EESSI bot or the "`Edit`" button for the app. +The private key is needed to let the app authenticate when updating information at the repository such as commenting on pull requests, adding labels, etc. You can create the key at the page of the GitHub App you have registered in [Step 2](#step2). -Near the end of the page you will find a section "`Private keys`" where you can create a private key by clicking on the button "`Generate a private key`". +Open the page [https://github.com/settings/apps](https://github.com/settings/apps) and then click on the icon left to the name of the GitHub App for the EESSI bot or the Edit button for the app. -The private key should be automatically downloaded to your system. Copy it to the `bot machine` and note the full path to it (`PATH_TO_PRIVATE_KEY`). 
+Near the end of the page you will find a section **Private keys** where you can create a private key by clicking on the button Generate a private key. -For example: the private key is on your LOCAL computer. To transfer it to the -`bot machine` use the `scp` command for example: -``` -scp PATH_TO_PRIVATE_KEY_FILE_LOCAL_COMPUTER REMOTE_USERNAME@TARGET_HOST:TARGET/PATH -``` -The location to where the private key is copied on the bot machine (`TARGET/PATH`) should be noted for `PATH_TO_PRIVATE_KEY`. +The private key should be automatically downloaded to your system. Copy it to the `bot machine` and note the full path to it (`PATH_TO_PRIVATE_KEY`). Also note down the day when the key was generated. The keys should be rotated every 6 months. ### Step 5.4: Create the configuration file `app.cfg` If there is no `app.cfg` in the directory `PATH_TO_EESSI_BOT` yet, create an initial version from `app.cfg.example`. -``` +```bash cp -i app.cfg.example app.cfg ``` The example file (`app.cfg.example`) includes notes on what you have to adjust to run the bot in your environment. - #### `[github]` section The section `[github]` contains information for connecting to GitHub: -``` + +```ini app_id = 123456 ``` + Replace '`123456`' with the id of your GitHub App. You can find the id of your GitHub App via the page [GitHub Apps](https://github.com/settings/apps). On this page, select the app you have registered in [Step 2](#step2). On the opened page you will find the `app_id` in the section headed "`About`" listed as "`App ID`". -``` + +```ini app_name = 'MY-bot' ``` -The `app_name` specifies a short name for your bot. It will appear in comments to a pull request. For example, it could include the name of the cluster where the bot runs and a label representing the user that runs the bot, like `hal9000-bot`. -*Note: avoid putting an actual username here as it will be visible on potentially publicly accessible GitHub pages.* +The `app_name` specifies a short name for your bot. 
It will appear in comments to +a pull request. For example, it could include the name of the cluster where the +bot runs and a label representing the user that runs the bot, like `hal9000-bot`. +The name will be used when signing files uploaded to an S3 bucket. Thus, the name +has to be the same that is used as value for `namespaces` in the +`allowed_signers` file used during the ingestion procedure (see +[https://github.com/EESSI/filesystem-layer](https://github.com/EESSI/filesystem-layer)). +The file `allowed_signers` is provided by another (private) repository. More +information on its content can be obtained from the manual page for `ssh-keygen` +or from the sign script which is available as `scripts/sign_verify_file_ssh.sh`. -``` +_Note: avoid putting an actual username here as it will be visible on potentially publicly accessible GitHub pages._ + +```ini installation_id = 12345678 ``` -Replace '`12345678`' with the id of the *installation* of your GitHub App (see [Step 3](#step3)). -You find the installation id of your GitHub App via the page [GitHub Apps](https://github.com/settings/apps). On this page, select the app you have registered in [Step 2](#step2). For determining the `installation_id` select "`Install App`" in the menu on the left-hand side. Then click on the gearwheel button of the installation (to the right of the "`Installed`" label). The URL of the resulting page contains the `installation_id` -- the number after the last "/". +Replace '`12345678`' with the id of the _installation_ of your GitHub App (see [Step 3](#step3)). -The `installation_id` is also provided in the payload of every event within the top-level record named "`installation`". You can see the events and their payload on the webpage of your Smee.io channel (`https://smee.io/CHANNEL-ID`). 
Alternatively, you can see the events in the "`Advanced`" section of your GitHub App: open the [GitHub Apps](https://github.com/settings/apps) page, select the app you have registered in [Step 2](#step2), and choose "`Advanced`" in the menu on the left-hand side. -``` +You find the installation id of your GitHub App via the page [Applications](https://github.com/settings/installations). On this page, select the app you have registered in [Step 2](#step2) by clicking on the Configure button. The installation id is shown as the number after the last `/` of the page's URL. + +The `installation_id` is also provided in the payload of every event within the top-level record named "`installation`". You can see the events and their payload on the webpage of your Smee.io channel (`https://smee.io/CHANNEL-ID`). Alternatively, you can see the events in the **Advanced** section of your GitHub App: open the [GitHub Apps](https://github.com/settings/apps) page, select the app you have registered in [Step 2](#step2), and choose **Advanced** in the menu on the left-hand side. + +```ini private_key = PATH_TO_PRIVATE_KEY ``` + Replace `PATH_TO_PRIVATE_KEY` with the path you have noted in [Step 5.3](#step5.3). +#### `[bot_control]` section + +The `[bot_control]` section contains settings for configuring the feature to +send commands to the bot. + +```ini +command_permission = GH_ACCOUNT_1 GH_ACCOUNT_2 ... +``` + +The `command_permission` setting defines which GitHub accounts can send commands +to the bot (via new PR comments). If the value is empty _no_ GitHub account can send +commands. + +```ini +command_response_fmt = FORMAT_MARKDOWN_AND_HTML +``` + +`command_response_fmt` allows to customize the format of the comments about the handling of bot +commands. The format needs to include `{app_name}`, `{comment_response}` and +`{comment_result}`. `{app_name}` is replaced with the name of the bot instance. 
+`{comment_response}` is replaced with information about parsing the comment +for commands before any command is run. `{comment_result}` is replaced with +information about the result of the command that was run (can be empty). + +```ini +chatlevel = basic +``` + +`chatlevel` defines the amount of comments the bot writes into PRs (incognito - no comments, minimal - respond with single comment on bot commands `help`, `show_config`, `status` and `build` and update job progress, basic - minimal + report failures, or chatty - comments on any event being processed). + #### `[buildenv]` section The `[buildenv]` section contains information about the build environment. -``` + +```ini build_job_script = PATH_TO_EESSI_BOT/scripts/bot-build.slurm ``` + `build_job_script` points to the job script which will be submitted by the bot event handler. -``` +```ini shared_fs_path = PATH_TO_SHARED_DIRECTORY ``` @@ -343,7 +430,7 @@ Via `shared_fs_path` the path to a directory on a shared filesystem (NFS, etc.) which can be leveraged by the `bot/build.sh` script to store files that should be available across build jobs (software source tarballs, for example). -``` +```ini build_logs_dir = PATH_TO_BUILD_LOGS_DIR ``` @@ -351,33 +438,45 @@ If build logs should be copied to a particular (shared) directory under certain for example when a build failed, the `build_logs_dir` can be set to the path to which logs should be copied by the `bot/build.sh` script. -``` +```ini container_cachedir = PATH_TO_SHARED_DIRECTORY ``` + `container_cachedir` may be used to reuse downloaded container image files across jobs, so jobs can launch containers more quickly. -``` +```ini cvmfs_customizations = { "/etc/cvmfs/default.local": "CVMFS_HTTP_PROXY=\"http://PROXY_DNS_NAME:3128|http://PROXY_IP_ADDRESS:3128\"" } ``` + It may happen that we need to customize the [CernVM-FS](https://cernvm.cern.ch/fs/) configuration for the build job. 
The value of `cvmfs_customizations` is a dictionary which maps a file name to an entry that needs to be appended to that file. In the example line above, the configuration of `CVMFS_HTTP_PROXY` is appended to the file `/etc/cvmfs/default.local`. The CernVM-FS configuration can be commented out, unless there is a need to customize the CernVM-FS configuration. -``` +```ini http_proxy = http://PROXY_DNS:3128/ https_proxy = http://PROXY_DNS:3128/ ``` + If compute nodes have no direct internet connection, we need to set `http(s)_proxy` or commands such as `pip3` and `eb` (EasyBuild) cannot download software from package repositories. Typically these settings are set in the prologue of a Slurm job. However, when entering the [EESSI compatibility layer](https://www.eessi.io/docs/compatibility_layer), most environment settings are cleared. Hence, they need to be set again at a later stage. +```ini +job_name = JOB_NAME ``` + +Replace `JOB_NAME` with a string of at least 3 characters that is used as job +name when a job is submitted. This is used to filter jobs, e.g., should be used +to make sure that multiple bot instances can run in the same Slurm environment. + +```ini job_delay_begin_factor = 2 ``` + The `job_delay_begin_factor` setting defines how many times the `poll_interval` a job's begin (EligibleTime) from now should be delayed if the handover protocol is set to `delayed_begin` (see setting `job_handover_protocol`). That is, if @@ -385,42 +484,40 @@ the `job_delay_begin_factor` is set to five (5) the delay time is calculated as 5 * `poll_interval`. The event manager would use 2 as default value when submitting jobs. -``` +```ini job_handover_protocol = hold_release ``` + The `job_handover_protocol` setting defines which method is used to handover a job from the event handler to the job manager. 
Values are - - `hold_release` (job is submitted with `--hold`, job manager removes the hold - with `scontrol release`) - - `delayed_begin` (job is submitted with `--begin=now+(5 * poll_interval)` and - any `--hold` is removed from the submission parameters); see setting - `poll_interval` further below; this is useful if the - bot account cannot run `scontrol release` to remove the hold of the job; - also, the status update in the PR comment of the job is extended by noting - the `EligibleTime` -``` -job_name = JOB_NAME -``` -Replace `JOB_NAME` with a string of at least 3 characters that is used as job -name when a job is submitted. This is used to filter jobs, e.g., should be used -to make sure that multiple bot instances can run in the same Slurm environment. +- `hold_release` (job is submitted with `--hold`, job manager removes the hold + with `scontrol release`) +- `delayed_begin` (job is submitted with `--begin=now+(5 * poll_interval)` and + any `--hold` is removed from the submission parameters); see setting + `poll_interval` further below; this is useful if the + bot account cannot run `scontrol release` to remove the hold of the job; + also, the status update in the PR comment of the job is extended by noting + the `EligibleTime` -``` +```ini jobs_base_dir = PATH_TO_JOBS_BASE_DIR ``` + Replace `PATH_TO_JOBS_BASE_DIR` with an absolute filepath like `/home/YOUR_USER_NAME/jobs` (or another path of your choice). Per job the directory structure under `jobs_base_dir` is `YYYY.MM/pr_PR_NUMBER/event_EVENT_ID/run_RUN_NUMBER/OS+SUBDIR`. The base directory will contain symlinks using the job ids pointing to the job's working directory `YYYY.MM/...`. -``` +```ini load_modules = MODULE1/VERSION1,MODULE2/VERSION2,... ``` + `load_modules` provides a means to load modules in the `build_job_script`. None to several modules can be provided in a comma-separated list. It is read by the bot and handed over to `build_job_script` via the `--load-modules` option. 
-``` +```ini local_tmp = /tmp/$USER/EESSI ``` + `local_tmp` specifies the path to a temporary directory on the node building the software, i.e., on a compute/worker node. You may have to change this if temporary storage under `/tmp` does not exist or is too small. This setting will be used for the @@ -428,9 +525,10 @@ environment variable `$EESSI_TMPDIR`. The value is expanded only inside a runnin job. Thus, typical job environment variables (like `$USER` or `$SLURM_JOB_ID`) may be used to isolate jobs running simultaneously on the same compute node. -``` +```ini site_config_script = /path/to/script/if/any ``` + `site_config_script` specifies the path to a script that - if it exists - is sourced in the build job before any `bot/*` script is run. This allows to customize the build environment due to specifics of the build site/cluster. @@ -438,130 +536,128 @@ Note, such customizations could also be performed by putting them into a module file and use the setting `load_modules` (see above). However, the setting `site_config_script` provides a low threshold for achieving this, too. -``` +```ini slurm_params = "--hold" ``` `slurm_params` defines additional parameters for submitting batch jobs. `"--hold"` should be kept or the bot might not work as intended (the release step done by the job manager component of the bot would be circumvented). Additional parameters, for example, to specify an account, a partition, or any other parameters supported by the [`sbatch` command](https://slurm.schedmd.com/sbatch.html), may be added to customize the job submission. -``` + +```ini submit_command = /usr/bin/sbatch ``` + `submit_command` is the full path to the Slurm job submission command used for submitting batch jobs. You may want to verify if `sbatch` is provided at that path or determine its actual location (using `which sbatch`). +```ini +build_permission = -NOT_ALLOWED_GH_ACCOUNT_NAME- [...] ``` -build_permission = GH_ACCOUNT_1 GH_ACCOUNT_2 ... 
-``` + `build_permission` defines which GitHub accounts have the permission to trigger build jobs, i.e., for which accounts the bot acts on `bot: build ...` commands. -If the value is left empty, everyone can trigger build jobs. +If the value is left empty, everyone can trigger build jobs. The string +`-NOT_ALLOWED_GH_ACCOUNT_NAME-` in the example above is not an allowed account +name on GitHub. Thus, one could not - by accident - give build permissions to an +unknown account. -``` +```ini no_build_permission_comment = The `bot: build ...` command has been used by user `{build_labeler}`, but this person does not have permission to trigger builds. ``` + `no_build_permission_comment` defines a comment (template) that is used when the account trying to trigger build jobs has no permission to do so. -``` +```ini allow_update_submit_opts = false ``` + `allow_update_submit_opts` determines whether or not to allow updating the submit options via custom module `det_submit_opts` provided by the pull request being processed. +Should only be enabled (true) with care because this will result in code from the target +repository being executed by the event handler process, that is, not in a compute job. -``` +```ini allowed_exportvars = ["NAME1=value_1a", "NAME1=value_1b", "NAME2=value_2"] ``` + `allowed_exportvars` defines a list of name-value pairs (environment variables) that are allowed to be specified in a PR command with the `exportvariable` filter. To specify multiple environment variables, multiple `exportvariable` filters must be used (one per variable). These variables will -be exported into the build environment before running the bot/build.sh script. +be exported into the build environment before running the `bot/build.sh` script. The bot build script makes use of the variable `SKIP_TESTS` to determine if ReFrame tests shall be skipped or not. Default is not to skip them. 
To allow the use of the variable the setting could look like -``` + +```ini allowed_exportvars = ["SKIP_TESTS=yes", "SKIP_TESTS=no"] ``` +A reasonable default setting is -#### `[bot_control]` section - -The `[bot_control]` section contains settings for configuring the feature to -send commands to the bot. -``` -command_permission = GH_ACCOUNT_1 GH_ACCOUNT_2 ... +```ini +allowed_exportvars = [] ``` -The `command_permission` setting defines which GitHub accounts can send commands -to the bot (via new PR comments). If the value is empty *no* GitHub account can send -commands. +```ini +clone_git_repo_via = https ``` -command_response_fmt = FORMAT_MARKDOWN_AND_HTML -``` -`command_response_fmt` allows to customize the format of the comments about the handling of bot -commands. The format needs to include `{app_name}`, `{comment_response}` and -`{comment_result}`. `{app_name}` is replaced with the name of the bot instance. -`{comment_response}` is replaced with information about parsing the comment -for commands before any command is run. `{comment_result}` is replaced with -information about the result of the command that was run (can be empty). +The `clone_git_repo_via` setting specifies via which mechanism the Git repository +should be cloned. This can be either: + +- `https` (default): clone repository via HTTPS with `git clone https://github.com/<owner>/<repo>` +- `ssh`: clone repository via SSH with `git clone git@github.com:<owner>/<repo>.git` + In case of using 'ssh', one may need additional steps to ensure that the bot uses the right SSH key and does not ask for a passphrase (if the key used is protected with one).
Here are a few things to consider: +- if the SSH key to be used does not have a standard name (e.g., `id_rsa`), add the following entry to `~/.ssh/config` in the bot's account + + ```bash + Host github.com + User git + IdentityFile ~/.ssh/NAME_OF_PRIVATE_KEY_FILE + ``` + +- if the key is protected by a passphrase (**highly recommended**), run an SSH agent and add the key to it + + ```bash + eval $(ssh-agent -s) + ssh-add ~/.ssh/NAME_OF_PRIVATE_KEY_FILE + ``` + +Note that the `bot: status` command doesn't work with SSH keys; you'll still need a GitHub token for that to work. #### `[deploycfg]` section The `[deploycfg]` section defines settings for uploading built artefacts (tarballs). -``` + +```ini artefact_upload_script = PATH_TO_EESSI_BOT/scripts/eessi-upload-to-staging ``` -`artefact_upload_script` provides the location for the script used for uploading built software packages to an S3 bucket. -``` -signing = - { - REPO_ID: { - "script": PATH_TO_SIGN_SCRIPT, - "key": PATH_TO_KEY_FILE, - "container_runtime": PATH_TO_CONTAINER_RUNTIME - }, ... - } -``` -`signing` provides a setting for signing artefacts. The value uses a JSON-like format -with `REPO_ID` being the repository ID. Repository IDs are defined in a file -`repos.cfg` (see setting `repos_cfg_dir`), `script` provides the location of the -script that is used to sign a file. If the location is a relative path, the script -must reside in the checked out pull request of the target repository (e.g., -EESSI/software-layer). `key` points to the file of the key being used -for signing. The bot calls the script with the two arguments: - 1. private key (as provided by the attribute 'key') - 2. path to the file to be signed (the upload script will determine that) -NOTE (on `container_runtime`), signing requires a recent installation of OpenSSH -(8.2 or newer).
If the frontend where the event handler runs does not have that -version installed, you can specify a container runtime via the `container_runtime` -attribute below. Currently, only Singularity or Apptainer are supported. -Note (on the key), make sure the file permissions are restricted to `0600` (only -readable+writable by the file owner, or the signing will likely fail. -Note (on json format), make sure no trailing commas are used after any elements -or parsing/loading the json will likely fail. Also, the whole value should start -at a new line and be indented as shown above. +`artefact_upload_script` provides the location for the script used for uploading built software packages to an S3 bucket. -``` +```ini endpoint_url = URL_TO_S3_SERVER ``` + `endpoint_url` provides an endpoint (URL) to a server hosting an S3 bucket. The server could be hosted by a commercial cloud provider like AWS or Azure, or -running in a private environment, for example, using Minio. The bot uploads +running in a private environment, for example, using Minio. In EESSI, the bot uploads artefacts to the bucket which will be periodically scanned by the ingestion procedure at the Stratum 0 server. - ```ini # example: same bucket for all target repos bucket_name = "eessi-staging" ``` + ```ini -# example: bucket to use depends on target repo +# example: bucket to use depends on target repo identifier (see setting +# `repo_target_map`) +# the key is the identifier of a repo, while the value is the name of the bucket bucket_name = { - "eessi-pilot-2023.06": "eessi-staging-2023.06", - "eessi.io-2023.06": "software.eessi.io-2023.06", + "eessi.io-2023.06-software": "eessi.io-staging-2023.06", + "eessi.io-2025.06-software": "eessi.io-2025.06" } ``` @@ -570,8 +666,7 @@ The bucket must be available on the default server (`https://${bucket_name}.s3.a `bucket_name` can be specified as a string value to use the same bucket for all target repos, or it can be mapping from target repo id to bucket name. 
- -``` +```ini upload_policy = once ``` @@ -584,22 +679,23 @@ The `upload_policy` defines what policy is used for uploading built artefacts to |`once`|Only once upload any built artefact for the build target.| |`none`|Do not upload any built artefacts.| -``` +```ini deploy_permission = GH_ACCOUNT_1 GH_ACCOUNT_2 ... ``` + The `deploy_permission` setting defines which GitHub accounts can trigger the -deployment procedure. The value can be empty (*no* GitHub account can trigger the +deployment procedure. The value can be empty (_no_ GitHub account can trigger the deployment), or a space delimited list of GitHub accounts. -``` +```ini no_deploy_permission_comment = Label `bot:deploy` has been set by user `{deploy_labeler}`, but this person does not have permission to trigger deployments ``` + This defines a message that is added to the status table in a PR comment corresponding to a job whose artefact should have been uploaded (e.g., after setting the `bot:deploy` label). - -``` +```ini metadata_prefix = LOCATION_WHERE_METADATA_FILE_GETS_DEPOSITED artefact_prefix = LOCATION_WHERE_TARBALL_GETS_DEPOSITED ``` @@ -612,13 +708,14 @@ repository id (see also `repo_target_map` below) to a prefix. The prefix itself can use some (environment) variables that are set within the upload script (see `artefact_upload_script` above). Currently those are: - * `'${github_repository}'` (which would be expanded to the full name of the GitHub - repository, e.g., `EESSI/software-layer`), - * `'${legacy_aws_path}'` (which expands to the legacy/old prefix being used for - storing artefacts/metadata files, the old prefix is - `EESSI_VERSION/TARBALL_TYPE/OS_TYPE/CPU_ARCHITECTURE/TIMESTAMP/`), _and_ - * `'${pull_request_number}'` (which would be expanded to the number of the pull - request from which the artefact originates). 
+ +- `'${github_repository}'` (which would be expanded to the full name of the GitHub + repository, e.g., `EESSI/software-layer`), +- `'${legacy_aws_path}'` (which expands to the legacy/old prefix being used for + storing artefacts/metadata files, the old prefix is + `EESSI_VERSION/TARBALL_TYPE/OS_TYPE/CPU_ARCHITECTURE/TIMESTAMP/`), _and_ +- `'${pull_request_number}'` (which would be expanded to the number of the pull + request from which the artefact originates). Note, it's important to single-quote (`'`) the variables as shown above, because they may likely not be defined when the bot calls the upload script. @@ -626,60 +723,123 @@ The list of supported variables can be shown by running `scripts/eessi-upload-to-staging --list-variables`. **Examples:** -``` + +```ini metadata_prefix = {"eessi.io-2023.06": "new/${github_repository}/${pull_request_number}"} artefact_prefix = { "eessi-pilot-2023.06": "", "eessi.io-2023.06": "new/${github_repository}/${pull_request_number}" } ``` + If left empty, the old/legacy prefix is being used. +```ini +signing = + { + "REPO_ID": { + "script": "PATH_TO_SIGN_SCRIPT", + "key": "PATH_TO_KEY_FILE", + "container_runtime": "PATH_TO_CONTAINER_RUNTIME" + } + } +``` + +`signing` provides a setting for signing artefacts. The value uses a JSON-like format +with `REPO_ID` being the repository ID. Repository IDs are defined in a file +`repos.cfg` (see setting `repos_cfg_dir`), `script` provides the location of the +script that is used to sign a file. If the location is a relative path, the script +must reside in the checked out pull request of the target repository (e.g., +EESSI/software-layer). `key` points to the file of the key being used +for signing. The bot calls the script with the two arguments: + +1. private key (as provided by the attribute 'key') +2. path to the file to be signed (the upload script will determine that) + +> [!NOTE] +> Wrt `container_runtime`, signing requires a recent installation of OpenSSH +> (8.2 or newer). 
If the frontend where the event handler runs does not have that +> version installed, you can specify a container runtime via the `container_runtime` +> attribute below. Currently, only Singularity or Apptainer are supported. +> [!NOTE] +> Wrt the private key file, make sure the file permissions are restricted to `0600` +> (only readable+writable by the file owner) or the signing will likely fail. +> [!NOTE] +> Wrt the JSON-like format, make sure commas are only used for separating elements +> and that there is no trailing comma on the last element, or parsing/loading the JSON +> will likely fail. Also, the whole value should start on a new line and be indented as shown +> above. +> [!NOTE] +> As shown in the example, use double quotes for all keys and values. + #### `[architecturetargets]` section The section `[architecturetargets]` defines for which targets (OS/SUBDIR), (for example `linux/x86_64/amd/zen2`) the EESSI bot should submit jobs, and which additional `sbatch` parameters will be used for requesting a compute node with the CPU microarchitecture needed to build the software stack. -``` -arch_target_map = { "linux/x86_64/generic" : "--constraint shape=c4.2xlarge", "linux/x86_64/amd/zen2" : "--constraint shape=c5a.2xlarge" } -``` -The map has one-to-many entries of the format `OS/SUBDIR : -ADDITIONAL_SBATCH_PARAMETERS`. For your cluster, you will have to figure out -which microarchitectures (`SUBDIR`) are available (as `OS` only `linux` is -currently supported) and how to instruct Slurm to allocate nodes with that -architecture to a job (`ADDITIONAL_SBATCH_PARAMETERS`).
-Note, if you do not have to specify additional parameters to `sbatch` to request a compute node with a specific microarchitecture, you can just write something like: -``` -arch_target_map = { "linux/x86_64/generic" : "" } +```ini +node_type_map = { + "cpu_zen2": { + "os": "linux", + "cpu_subdir": "x86_64/amd/zen2", + "slurm_params": "-p rome --nodes 1 --ntasks-per-node 16 --cpus-per-task 1", + "repo_targets": ["eessi.io-2023.06-compat","eessi.io-2023.06-software"] + }, + "gpu_h100": { + "os": "linux", + "cpu_subdir": "x86_64/amd/zen4", + "accel": "nvidia/cc90", + "slurm_params": "-p gpu_h100 --nodes 1 --tasks-per-node 16 --cpus-per-task 1 --gpus-per-node 1", + "repo_targets": ["eessi.io-2023.06-compat","eessi.io-2023.06-software"] + }} ``` -#### `[repo_targets]` section +Each entry in the `node_type_map` dictionary describes a build node type. The key is a (descriptive) name for this build node, and its value is a dictionary containing the following build node properties as key-value pairs: -The `[repo_targets]` section defines for which repositories and architectures the bot can run a job. -Repositories are referenced by IDs (or `repo_id`). Architectures are identified -by `OS/SUBDIR` which correspond to settings in the `arch_target_map`. +- `os`: its operating system (os) +- `cpu_subdir`: its CPU architecture +- `slurm_params`: the SLURM parameters that need to be passed to submit jobs to it +- `repo_targets`: supported repository targets for this node type +- `accel` (optional): which accelerators this node has +All values are strings, except `repo_targets`, which is a list of strings. Repository targets listed in `repo_targets` should correspond to the repository IDs as defined in the `repos.cfg` file in the `repos_cfg_dir` (see below). + +Note that the Slurm parameters should typically be chosen such that a single type of node (with one specific type of CPU and one specific type of GPU) is allocated.
+To command the bot to build on the `cpu_zen2` node type above, one would give the command `bot:build on:arch=zen2 for:...`. To command the bot to build on the `gpu_h100` node type, one would give the command `bot:build on:arch=zen4,accel=nvidia/cc90 for:...`. + +For a native build (i.e. building for `zen2` on a `zen2` node), one can pass `bot:build on:arch=zen2 for:arch=x86_64/amd/zen2`, or use the short-hand `bot:build for:arch=x86_64/amd/zen2` (i.e. omitting the `on` argument implies a native build; note that the reverse, omitting the `for` argument, does not work). This will trigger a build on the `cpu_zen2` node type (as configured above) and prepare a configuration file in the job directory that instructs to build for a `zen2` CPU architecture. + +For cross-compiling GPU code for NVIDIA Compute Capability 8.0 (and a `zen2` CPU architecture), one would instruct the bot with `bot:build on:arch=zen2 for:arch=x86_64/amd/zen2,accel=nvidia/cc80`. This will trigger a build on the `cpu_zen2` node type (as configured above) and prepare a configuration file in the job directory that instructs to build for a `zen2` CPU architecture with an `nvidia/cc80` GPU architecture. + +Note that the `arch_target_map` and `repo_target_map` (used in version <=0.8.0) configuration options were replaced by `node_type_map`.
The `arch_target_map` and `repo_target_map` that would be equivalent to the `node_type_map` above are: + +```ini +arch_target_map = { "linux/x86_64/amd/zen2": "-p rome --nodes 1 --ntasks-per-node 16 --cpus-per-task 1", "linux/x86_64/amd/zen4": "-p gpu_h100 --nodes 1 --tasks-per-node 16 --cpus-per-task 1 --gpus-per-node 1" } +repo_target_map = { "linux/x86_64/amd/zen2": ["eessi.io-2023.06-compat","eessi.io-2023.06-software"], "linux/x86_64/amd/zen4": ["eessi.io-2023.06-compat","eessi.io-2023.06-software"] } ``` -repo_target_map = { - "OS_SUBDIR_1" : ["REPO_ID_1_1","REPO_ID_1_2"], - "OS_SUBDIR_2" : ["REPO_ID_2_1","REPO_ID_2_2"] } -``` -For each `OS/SUBDIR` combination a list of available repository IDs can be -provided. + +#### `[repo_targets]` section + +The `[repo_targets]` section defines where the configuration for the repository targets defined in the `node_type_map` can be found. The repository IDs are defined in a separate file, say `repos.cfg` which is stored in the directory defined via `repos_cfg_dir`: + +```ini +repos_cfg_dir = PATH_TO_SHARED_DIRECTORY/repos ``` -repos_cfg_dir = PATH_TO_SHARED_DIRECTORY/cfg_bundles -``` + The `repos.cfg` file also uses the `ini` format as follows + ```ini -[eessi-2023.06] +[eessi.io-2023.06-software] repo_name = software.eessi.io repo_version = 2023.06 config_bundle = eessi.io-cfg_files.tgz config_map = {"eessi.io/eessi.io.pub":"/etc/cvmfs/keys/eessi.io/eessi.io.pub", "default.local":"/etc/cvmfs/default.local", "eessi.io.conf":"/etc/cvmfs/domain.d/eessi.io.conf"} container = docker://ghcr.io/eessi/build-node:debian11 ``` + The repository id is given in brackets (`[eessi-2023.06]`). Then the name of the repository (`repo_name`) and the version (`repo_version`) are defined. Next, a tarball containing configuration files for CernVM-FS is specified (`config_bundle`). 
The `config_map` setting maps entries of that tarball to locations inside @@ -691,171 +851,237 @@ The `repos.cfg` file may contain multiple definitions of repositories. #### `[event_handler]` section The `[event_handler]` section contains information required by the bot event handler component. -``` + +```ini log_path = /path/to/eessi_bot_event_handler.log ``` -`log_path` specifies the path to the event handler log. + +`log_path` specifies the path to the event handler log. #### `[job_manager]` section The `[job_manager]` section contains information needed by the job manager. -``` +```ini log_path = /path/to/eessi_bot_job_manager.log ``` -`log_path` specifies the path to the job manager log. -``` +`log_path` specifies the path to the job manager log. + +```ini job_ids_dir = /home/USER/jobs/ids ``` + `job_ids_dir` specifies where the job manager should store information about jobs being tracked. Under this directory it will store information about submitted/running jobs under a subdirectory named '`submitted`', and about finished jobs under a subdirectory named '`finished`'. -``` + +```ini poll_command = /usr/bin/squeue ``` + `poll_command` is the full path to the Slurm command that can be used for checking which jobs exist. You may want to verify if `squeue` is provided at that path or determine its actual location (via `which squeue`). -``` + +```ini poll_interval = 60 ``` + `poll_interval` defines how often the job manager checks the status of the jobs. The unit of the value is seconds. -``` + +```ini scontrol_command = /usr/bin/scontrol ``` + `scontrol_command` is the full path to the Slurm command used for manipulating existing jobs. You may want to verify if `scontrol` is provided at that path or determine its actual location (via `which scontrol`). +It is also possible to add placeholder values to the scontrol_command. These placeholders can capture output from the `squeue` command that the bot runs internally, and pass it back to the `scontrol_command`. 
An example where this may be useful is in a setup where multiple clusters are managed by the same SLURM instance, and the `scontrol_command` for that instance needs to get the correct cluster name passed. This can be achieved by defining `scontrol_command = /usr/bin/scontrol --clusters=%%(cluster)s`. Valid placeholder names are currently: `jobid`, `cluster`, `partition`, `state`, and `reason`. #### `[submitted_job_comments]` section The `[submitted_job_comments]` section specifies templates for messages about newly submitted jobs. -DEPRECATED setting (use `awaits_release_delayed_begin_msg` and/or `awaits_release_hold_release_msg`) -``` +The following setting is no longer used since bot release v0.7.0. Instead, use the replacement settings `awaits_release_delayed_begin_msg` and/or `awaits_release_hold_release_msg`. + +```ini awaits_release = job id `{job_id}` awaits release by job manager ``` + `awaits_release` is used to provide a status update of a job (shown as a row in the job's status table). -``` +```ini awaits_release_delayed_begin_msg = job id `{job_id}` will be eligible to start in about {delay_seconds} seconds ``` + `awaits_release_delayed_begin_msg` is used when the `job_handover_protocol` is set to `delayed_begin`. Note, both `{job_id}` and `{delay_seconds}` need to be present in the value or the event handler will throw an exception when formatting the update of the PR comment corresponding to the job. -``` +```ini awaits_release_hold_release_msg = job id `{job_id}` awaits release by job manager ``` + `awaits_release_hold_release_msg` is used when the `job_handover_protocol` is set to `hold_release`. Note, `{job_id}` needs to be present in the value or the event handler will throw an exception when formatting the update of the PR comment corresponding to the job. 
+```ini +new_job_instance_repo = New job on instance `{app_name}` for repository `{repo_id}` +``` + +`new_job_instance_repo` is used as the first line in a comment to a PR when a new job has been created. + +```ini +build_on_arch = Building on: `{on_arch}`{on_accelerator} ``` -initial_comment = New job on instance `{app_name}` for architecture `{arch_name}`{accelerator_spec} for repository `{repo_id}` in job dir `{symlink}` + +`build_on_arch` is used as the second line in a comment to a PR when a new job has been created. Note that the `on_accelerator` spec is only filled-in by the bot if the `on:...,accel=...` has been passed to the bot. + +```ini +build_for_arch = Building for: `{for_arch}`{for_accelerator} ``` -`initial_comment` is used to create a comment to a PR when a new job has been -created. Note, the part '{accelerator_spec}' is only filled-in by the bot if the -argument 'accelerator' to the `bot: build` command has been used. + +`build_for_arch` is used as the third line in a comment to a PR when a new job has been created. Note that the `for_accelerator` spec is only filled-in by the bot if the `for:...,accel=...` has been passed to the bot. + +```ini +jobdir = Job dir: `{symlink}` ``` + +`jobdir` is used as the fourth line in a comment to a PR when a new job has been created. + +```ini with_accelerator =  and accelerator `{accelerator}` ``` + `with_accelerator` is used to provide information about the accelerator the job -should build for if and only if the argument `accelerator:X/Y` has been provided. +should build for if and only if the argument `on:...,accel=...` or `for:...,accel=...` has been provided. #### `[new_job_comments]` section The `[new_job_comments]` section sets templates for messages about jobs whose `hold` flag was released. -``` + +```ini awaits_launch = job awaits launch by Slurm scheduler ``` + `awaits_launch` specifies the status update that is used when the `hold` flag of a job has been removed. 
#### `[running_job_comments]` section The `[running_job_comments]` section sets templates for messages about jobs that are running. -``` + +```ini running_job = job `{job_id}` is running ``` + `running_job` specifies the status update for a job that started running. #### `[finished_job_comments]` section The `[finished_job_comments]` section sets templates for messages about finished jobs. -``` + +```ini job_result_unknown_fmt =
:shrug: UNKNOWN _(click triangle for details)_
  • Job results file `{filename}` does not exist in job directory, or parsing it failed.
  • No artefacts were found/reported.
``` + `job_result_unknown_fmt` is used in case no result file (produced by `bot/check-build.sh` provided by target repository) was found. -``` +```ini job_test_unknown_fmt =
:shrug: UNKNOWN _(click triangle for details)_
  • Job test file `{filename}` does not exist in job directory, or parsing it failed.
``` + `job_test_unknown_fmt` is used in case no test file (produced by `bot/check-test.sh` provided by target repository) was found. - #### `[download_pr_comments]` section The `[download_pr_comments]` section sets templates for messages related to downloading the contents of a pull request. -``` + +```ini git_clone_failure = Unable to clone the target repository. ``` + `git_clone_failure` is shown when `git clone` failed. -``` +```ini git_clone_tip = _Tip: This could be a connection failure. Try again and if the issue remains check if the address is correct_. ``` + `git_clone_tip` should contain some hint on how to deal with the issue. It is shown when `git clone` failed. -``` +```ini git_checkout_failure = Unable to checkout to the correct branch. ``` + `git_checkout_failure` is shown when `git checkout` failed. -``` +```ini git_checkout_tip = _Tip: Ensure that the branch name is correct and the target branch is available._ ``` + `git_checkout_tip` should contain some hint on how to deal with the failure. It is shown when `git checkout` failed. -``` +```ini curl_failure = Unable to download the `.diff` file. ``` + `curl_failure` is shown when downloading the `PR_NUMBER.diff` -``` + +```ini curl_tip = _Tip: This could be a connection failure. Try again and if the issue remains check if the address is correct_ ``` + `curl_tip` should help in how to deal with failing downloads of the `.diff` file. -``` +```ini git_apply_failure = Unable to download or merge changes between the source branch and the destination branch. ``` + `git_apply_failure` is shown when applying the `.diff` file with `git apply` failed. -``` +```ini git_apply_tip = _Tip: This can usually be resolved by syncing your branch and resolving any merge conflicts._ ``` + `git_apply_tip` should guide the contributor/maintainer about resolving the cause of `git apply` failing. +```ini +pr_diff_failure = Unable to obtain PR diff. 
+``` + +The value of `pr_diff_failure` is shown when the `.diff` file could not be obtained. + +```ini +pr_diff_tip = _Tip: This could be a problem with SSH access to the repository._ +``` + +The value of `pr_diff_tip` should guide the maintainer / bot administrator about resolving the cause for the failing procedure to obtain the `.diff` file. + #### `[clean_up]` section The `[clean_up]` section includes settings related to cleaning up disk used by merged (and closed) PRs. -``` + +```ini trash_bin_dir = PATH/TO/TRASH_BIN_DIRECTORY ``` + Ideally this is on the same filesystem used by `jobs_base_dir` and `job_ids_dir` to efficiently move data into the trash bin. If it resides on a different filesystem, the data will be copied. -``` +```ini moved_job_dirs_comment = PR merged! Moved `{job_dirs}` to `{trash_bin_dir}` ``` + Template that is used by the bot to add a comment to a PR noting down which directories have been moved and where. # Step 6: Creating a ReFrame configuration file for the test step (only needed when building for the [EESSI software layer](https://github.com/EESSI/software-layer)) -Part of the test step of the EESSI software layer is running the EESSI test suite. This requires putting a ReFrame configuration file in place that describes the partitions in the `arch_target_map` of the bot config. + +Part of the test step of the EESSI software layer is running the EESSI test suite. This requires putting a ReFrame configuration file in place that describes the partitions in the `node_type_map` of the bot config. You can find general documentation on how to write a ReFrame config file in the [EESSI documentation](https://www.eessi.io/docs/test-suite/ReFrame-configuration-file/). 
However, some specifics apply when setting things up for the test step: @@ -865,19 +1091,22 @@ You can find general documentation on how to write a ReFrame config file in the - The `scheduler` should be `local`, as the bot already schedules the job (ReFrame should just locally spawn the tests in the allocation created by the bot). - The `access` field should not be used by ReFrame if the local scheduler is defined, you can simply omit this keyword. -To configure the number of GPUs and CPUs, we have two options: +To configure the number of GPUs and CPUs, we have two options: + 1. We describe the physical node in the ReFrame configuration file and set the `REFRAME_SCALE_TAG` environment variable to match the size of the allocation that you specify in your bot config. E.g. if your bot config allocates 1/4th of a node, one would set `REFRAME_SCALE_TAG=1_4_node` in the environment of the job submitted by the bot. 2. We describe a virtual node configuration that matches the size of the allocation created by the bot (and we use the default `REFRAME_SCALE_TAG=1_node`, you don't have to set this explicitly). The first approach is the easiest, and thus recommended, since you can use CPU autodetection by ReFrame. The second approach allows for more flexibility. ## Approach 1 (recommended): describing the physical node and setting the `REFRAME_SCALE_TAG` to match the bot config's allocation size + In this approach, we describe the physical node configuration. That means: the amount of physical CPUs and GPUs present in the node. For the CPU part, we can rely on ReFrame's CPU autodetection: if `remote_detect` is set to `True` in the general section of the config, and no CPU topology information is provided in the ReFrame configuration file, ReFrame will automatically detect the [CPU topology](https://reframe-hpc.readthedocs.io/en/stable/config_reference.html#config.systems.partitions.processor). For the GPU part, we need to configure the vendor and the amount of GPUs. E.g.
for a partition with 4 Nvidia GPUs per node: -``` + +```json 'partition': { ... 'extras': { @@ -899,8 +1128,10 @@ Note that if you had e.g. a node with 6 GPUs per node, and you were building on Note that if for _some_ partitions you use e.g. quarter nodes, and for some full nodes, you'll have to set the `REFRAME_SCALE_TAG` conditionally based on the node architecture. You could e.g. do this in a `.bashrc` that has some conditional logic to determine the node type and set the corresponding scale. Alternatively, you could use Approach 2. ### Complete example config + In this example, we assume a node with 4 A100 GPUs (compute capability `cc80`) and 72 CPU cores (Intel Skylake) and 512 GB of memory (of which 491520 MiB is useable by SLURM jobs; on this system the rest is reserved for the OS): -``` + +```python from eessi.testsuite.common_config import common_logging_config from eessi.testsuite.constants import * # noqa: F403 @@ -964,10 +1195,12 @@ site_configuration = { ``` ## Approach 2: describing a virtual node -In this approach, we describe a virtual node configuration for which the size matches exactly what is allocated by the bot (through the `slurm_params` and `arch_target_map`). In this example, we'll assume that this node has 4 GPUs and 72 cores, distributed over 2 sockets each consisting of 1 NUMA domain. We also assume our bot is configured with `slurm_params = --hold --nodes=1 --export=None --time=0:30:0` and `arch_target_map = {"linux/x86_64/intel/skylake_avx512" : "--partition=gpu --cpus-per-task=18 --gpus-per-node 1"}`, i.e. it effectively allocates a quarter node. We describe a virtual partition for ReFrame as if this quarter node is a full node, i.e. we pretend it is a partition with 18 cores and 1 GPU per node, with 1 socket. + +In this approach, we describe a virtual node configuration for which the size matches exactly what is allocated by the bot (through the `slurm_params` and `node_type_map`). 
In this example, we'll assume that this node has 4 GPUs and 72 cores, distributed over 2 sockets each consisting of 1 NUMA domain. We also assume our bot is configured with `slurm_params = --hold --nodes=1 --export=None --time=0:30:0` and `node_type_map = {"linux/x86_64/intel/skylake_avx512" : "--partition=gpu --cpus-per-task=18 --gpus-per-node 1"}`, i.e. it effectively allocates a quarter node. We describe a virtual partition for ReFrame as if this quarter node is a full node, i.e. we pretend it is a partition with 18 cores and 1 GPU per node, with 1 socket. We would first have to hardcode the CPU configuration. -``` + +```json 'partition': { ... 'processor': { @@ -986,17 +1219,19 @@ We would first have to hardcode the CPU configuration. ``` Note that if instead, this node would have had 8 NUMA domains (4 per socket), the 18 cores would correspond to 2 NUMA domains and we would have had to define: -``` + +```json "numa_nodes": [ "0x001ff", # a bit mask of 000000000111111111, i.e. cores 0-8 are on this NUMA domain "0x3fe00", # a bit mask of 111111111000000000, i.e. cores 9-17 are on this NUMA domain ] ``` -Note that the `topology` dictionary in a ReFrame configuration file can contain more information, such as the bitmasks for the CPU sockets and cores, as well as information on the caches (see [here](https://reframe-hpc.readthedocs.io/en/stable/config_reference.html#config.systems.partitions.processor.topology)). Currently, that information is not needed by the EESSI test suite, but that may change if tests are added that utilize such information to execute efficiently. +Note that the `topology` dictionary in a ReFrame configuration file can contain more information, such as the bitmasks for the CPU sockets and cores, as well as information on the caches (see [ReFrame docs](https://reframe-hpc.readthedocs.io/en/stable/config_reference.html#config.systems.partitions.processor.topology)). 
Currently, that information is not needed by the EESSI test suite, but that may change if tests are added that utilize such information to execute efficiently. For the GPU configuration, we simply put: -``` + +```json 'partition': { ... 'extras': { @@ -1010,11 +1245,14 @@ For the GPU configuration, we simply put: ] } ``` -To match the fact that we allocate 1 GPU in the `arch_target_map`. + +To match the fact that we allocate 1 GPU in the `node_type_map`. ### Complete example config + In this example, we assume a node with 4 A100 GPUs (compute capability `cc80`) and 72 CPU cores (Intel Skylake) and 512 GB of memory (of which 491520 MiB is useable by SLURM jobs; on this system the rest is reserved for the OS). We also assume the bot configuration is such for this partition that 1/4th of these nodes gets allocated for a build job: -``` + +```python site_configuration = { 'systems': [ { @@ -1087,13 +1325,15 @@ site_configuration = { # Step 7: Instructions to run the bot components The bot consists of three components: -* the Smee client; -* the event handler; -* the job manager. + +- the Smee client; +- the event handler; +- the job manager. Running the Smee client was explained in [Step 1](#step1). ## Step 7.1: Running the event handler + As the event handler may run for a long time, it is advised to run it in a `screen` or `tmux` session. The event handler is provided by the [`eessi_bot_event_handler.py`](https://github.com/EESSI/eessi-bot-software-layer/blob/main/eessi_bot_event_handler.py) Python script. @@ -1102,13 +1342,17 @@ Change directory to `eessi-bot-software-layer` (which was created by cloning the repository in [Step 4](#step4) - either the original one from EESSI, or your fork). Then, simply run the event handler script: -``` + +```bash ./event_handler.sh ``` + If multiple instances on the `bot machine` are being executed, you may need to run the event handler and the Smee client with a different port (default is 3000). 
The event handler can receive events on a different port by adding the parameter `--port PORTNUMBER`, for example, -``` + +```bash ./event_handler.sh --port 3030 ``` + See [Step 1](#step1) for telling the Smee client on which port the event handler receives events. The event handler writes log information to the files `pyghee.log` and @@ -1117,17 +1361,19 @@ The event handler writes log information to the files `pyghee.log` and Note, if you run the bot on a frontend of a cluster with multiple frontends make sure that both the Smee client and the event handler run on the same system! ## Step 7.2: Running the job manager + As the job manager may run for a long time, it is advised to run it in a `screen` or `tmux` session. The job manager is provided by the [`eessi_bot_job_manager.py`](https://github.com/EESSI/eessi-bot-software-layer/blob/main/eessi_bot_job_manager.py) Python script. You can run the job manager from the directory `eessi-bot-software-layer` simply by: -``` +```bash ./job_manager.sh ``` It will run in an infinite loop monitoring jobs and acting on their state changes. If you want to limit the execution of the job manager, you can use these options: + |Option|Argument| |------|--------| |`-i` / `--max-manager-iterations`|Any number _z_: _z_ < 0 - run the main loop indefinitely, _z_ == 0 - don't run the main loop, _z_ > 0 - run the main loop _z_ times| @@ -1135,9 +1381,10 @@ If you want to limit the execution of the job manager, you can use these options: An example command would be -``` +```bash ./job_manager.sh -i 1 -j 1234 ``` + to run the main loop exactly once for the job with ID `1234`. The job manager writes log information to the file `eessi_bot_job_manager.log`. @@ -1155,7 +1402,7 @@ Both Git and Curl need to have access to the target repo.
A convenient way to access a private repo via a Github token is by adding the following lines to your `~/.netrc` and `~/.curlrc` files: -``` +```bash # ~/.netrc machine github.com login oauth @@ -1166,8 +1413,7 @@ login oauth password ``` -``` +```bash # ~/.curlrc --netrc ``` - diff --git a/RELEASE_NOTES b/RELEASE_NOTES index eb5b4ebd..6dd9bac6 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -1,6 +1,49 @@ This file contains a description of the major changes to the EESSI build-and-deploy bot. For more detailed information, please see the git log. +v0.9.0 (22 August 2025) +-------------------------- + +This is a minor release of the EESSI build-and-deploy bot. + +Note! Though it is a minor release it includes breaking changes of the bot +configuration and the user interface for triggering build jobs. For details +see below and documentation at https://www.eessi.io/docs/bot/#building + +Bug fixes: +* revised and updated app.cfg.example and README.md (#325) + +Improvements: +* add support for cloning target repository via ssh (#300) +* major refactoring of the definition of build targets and breaking change in + bot configuration and of the user interface to trigger builds (#312, #329, #331, #337) + * `arch_target_map` is replaced by `node_type_map` which provides a more + comprehensive/flexible approach to define architectures that are available + for build jobs; for details, see https://github.com/EESSI/eessi-bot-software-layer?tab=readme-ov-file#architecturetargets-section + * the `repo_target_map` setting is removed because the information is now + included in the `node_type_map` setting + * the `initial_comment` setting in the `[submitted_job_comments]` section of the bot configuration + has been replaced with separate settings: `new_job_instance_repo`, `build_on_arch`, `build_for_arch`, `jobdir`; + for details, see https://github.com/EESSI/eessi-bot-software-layer/blob/develop/README.md#submitted_job_comments-section + and the example bot configuration 
`app.cfg.example`; + * `bot:build` filters `architecture:` and `accel:` are replaced by `on:` and + `for:` (for details see documentation at https://www.eessi.io/docs/bot/#building) +* add CI to build and publish smee-client container image (#321, #322) +* make space before bot command optional (#324) +* support template values in the `scontrol` command (#327, #335) +* support additional parameter (`last_build`) for `bot:status` command (#334) + +Changes to 'app.cfg' settings (see README.md and app.cfg.example for details): +* REMOVED (required) 'arch_target_map' in section '[architecturetargets]', replaced by + 'node_type_map' +* REMOVED (required) 'repo_target_map' in section '[repo_targets]', replaced by + 'node_type_map' +* NEW (required) 'node_type_map' in section '[architecturetargets]' +* NEW (optional) 'clone_git_repo_via' in section '[buildenv]' +* NEW (required) 'pr_diff_failure' in section '[download_pr_comments]' +* NEW (required) 'pr_diff_tip' in section '[download_pr_comments]' + + v0.8.0 (23 May 2025) -------------------------- diff --git a/app.cfg.example b/app.cfg.example index f0a4ec49..0b393a4c 100644 --- a/app.cfg.example +++ b/app.cfg.example @@ -61,15 +61,20 @@ command_response_fmt = {comment_result} -# chattiness level of the bot in terms of writing comments into PRs (minimal, basic, or chatty) +# chattiness level of the bot in terms of writing comments into PRs +# (incognito - no comments, minimal - respond with single comment on bot +# commands `help`, `show_config`, `status` and `build` and update job +# progress, basic - minimal + report failures, or chatty - comments on +# any event being processed) chatlevel = basic [buildenv] -# name of the job script used for building an EESSI stack +# name of the job script that is submitted by the event handler (e.g., +# used for building an EESSI stack) build_job_script = PATH_TO_EESSI_BOT/scripts/bot-build.slurm -# path to directory on shared filesystem that can be used for sharing data across 
build jobs -# (for example source tarballs used by EasyBuild) +# path to a directory on a shared filesystem that can be used for sharing +# data across build jobs (for example source tarballs used by EasyBuild) shared_fs_path = PATH_TO_SHARED_DIRECTORY # Path (directory) to which build logs for (only) failing builds should be copied by bot/build.sh script @@ -90,6 +95,10 @@ container_cachedir = PATH_TO_SHARED_DIRECTORY # http_proxy = http://PROXY_DNS:3128/ # https_proxy = http://PROXY_DNS:3128/ +# Used to give all jobs of a bot instance the same name. Can be used to allow +# multiple bot instances running on the same Slurm cluster. +job_name = prod + # The job_delay_begin_factor setting defines how many times the poll_interval a # job's begin (EligibleTime) from now should be delayed if the handover protocol # is set to `delayed_begin` (see setting `job_handover_protocol`). That is, if @@ -109,10 +118,6 @@ job_delay_begin_factor = 2 # the 'EligibleTime' job_handover_protocol = hold_release -# Used to give all jobs of a bot instance the same name. Can be used to allow -# multiple bot instances running on the same Slurm cluster. -job_name = prod - # directory under which the bot prepares directories per job # structure created is as follows: YYYY.MM/pr_PR_NUMBER/event_EVENT_ID/run_RUN_NUMBER/OS+SUBDIR jobs_base_dir = $HOME/jobs @@ -150,16 +155,17 @@ slurm_params = --hold # full path to the job submission command submit_command = /usr/bin/sbatch -# which GH account has the permission to trigger the build (by setting -# the label 'bot:build' (apparently this cannot be restricted on GitHub) -# if value is left/empty everyone can trigger the build -# value can be a space delimited list of GH accounts -build_permission = +# defines which GitHub accounts have the permission to trigger +# build jobs, i.e., for which accounts the bot acts on `bot: build ...` +# commands. If the value is left empty, everyone can trigger build jobs. 
+build_permission = -NOT_ALLOWED_GH_ACCOUNT_NAME- # template for comment when user who set a label has no permission to trigger build jobs no_build_permission_comment = Label `bot:build` has been set by user `{build_labeler}`, but this person does not have permission to trigger builds # whether or not to allow updating the submit options via custom module det_submit_opts +# Should only be enabled (true) with care because this will result in code from the target +# repository being executed by the event handler process, that is, not in a compute job. allow_update_submit_opts = false # defines which name-value pairs (environment variables) are allowed to be @@ -174,7 +180,29 @@ allow_update_submit_opts = false # 'exportvariable:SKIP_TESTS=yes' as a filter, the key-value pair would be # "SKIP_TESTS=yes". # allowed_exportvars = ["NAME1=value_1a", "NAME1=value_1b", "NAME2=value_2"] +# +# It's safe to use the following line as default setting: +allowed_exportvars = [] + +# mechanism to use to clone Git repository +# 'https' to clone via HTTPS (git clone https://github.com//) +# In case of using 'ssh', one may need additional steps to ensure that the bot +# uses the right ssh key and does not ask for a passphrase (if the key used is +# protected with one).
Here are a few things to consider: +# - if the ssh key to be used does not have a standard name (e.g., 'id_rsa'), +# add the following entry to '~/.ssh/config' in the bot's account +# +# Host github.com +# User git +# IdentityFile ~/.ssh/NAME_OF_PRIVATE_KEY_FILE +# +# - if the key is protected by a passphrase (**highly recommended**), run an +# SSH agent and add the key to it (with the following two commands) +# +# eval $(ssh-agent -s) +# ssh-add ~/.ssh/NAME_OF_PRIVATE_KEY_FILE +clone_git_repo_via = https [deploycfg] # script for uploading built software packages @@ -191,36 +219,15 @@ artefact_upload_script = PATH_TO_EESSI_BOT/scripts/eessi-upload-to-staging endpoint_url = URL_TO_S3_SERVER # bucket name: -# can be a string value, to always use same bucket regardless of target repo, -# or can be a mapping of target repo id (see also repo_target_map) to bucket name -# like: bucket_name = {"eessi-pilot-2023.06": "eessi-staging-pilot-2023.06", "eessi.io-2023.06": "software.eessi.io-2023.06"} +# the value can be a simple string, to always use same bucket regardless of +# the target repo, or can be a mapping of a target repo id (see also +# setting repo_target_map) to a bucket name as in +# bucket_name = { +# "eessi.io-2023.06-software": "eessi.io-staging-2023.06", +# "eessi.io-2025.06-software": "software.eessi.io-2023.06" +# } bucket_name = eessi-staging -# settings for signing artefacts with JSON-like format -# REPO_ID: { "script": PATH_TO_SIGN_SCRIPT, "key": PATH_TO_KEY_FILE, "container_runtime": PATH_TO_CONTAINER_RUNTIME } -# If PATH_TO_SIGN_SCRIPT is a relative path, the script must reside in the -# checked out pull request of the target repository (e.g., -# EESSI/software-layer). -# The bot calls the script with the two arguments: -# 1. private key (as provided by the attribute 'key') -# 2. path to the file to be signed (the upload script will determine that) -# NOTE (on "container_runtime"), signing requires a recent installation of OpenSSH -# (8.2 or newer). 
If the frontend where the event handler runs does not have that -# version installed, you can specify a container runtime via the 'container_runtime' -# attribute below. Currently, only Singularity or Apptainer are supported. -# NOTE (on the key), make sure the file permissions are restricted to `0600` (only -# readable+writable by the file owner, or the signing will likely fail. -# Note (on json format), make sure no trailing commas are used after any elements -# or parsing/loading the json will likely fail. Also, the whole value should start -# at a new line and be indented as shown below. -signing = - { - "eessi.io-2023.06-software: { - "script": PATH_TO_SIGN_SCRIPT, - "key": PATH_TO_EESSI_BOT/config/user-site-system.key, - "container_runtime": PATH_TO_CONTAINER_RUNTIME - } - } # upload policy: defines what policy is used for uploading built artefacts # to an S3 bucket # 'all' ..: upload all artefacts (multiple uploads of the same artefact possible) @@ -230,9 +237,9 @@ signing = # 'none' : do not upload any built artefacts upload_policy = once -# which GH account has the permission to trigger the deployment (by setting +# which GH account has the permission to trigger the deployment by setting # the label 'bot:deploy' (apparently this cannot be restricted on GitHub) -# if value is left/empty everyone can trigger the deployment +# if value is left/empty _no one_ can trigger the deployment # value can be a space delimited list of GH accounts deploy_permission = @@ -261,21 +268,99 @@ no_deploy_permission_comment = Label `bot:deploy` has been set by user `{deploy_ metadata_prefix = artefact_prefix = +# settings for signing artefacts with JSON-like format +# +# "REPO_ID": { "script": "PATH_TO_SIGN_SCRIPT", "key": "PATH_TO_KEY_FILE", "container_runtime": "PATH_TO_CONTAINER_RUNTIME" } +# +# REPO_ID is the repository ID. Repository IDs are defined in a file `repos.cfg` +# and _must_ match it (see setting `repos_cfg_dir`).
+# +# If PATH_TO_SIGN_SCRIPT is a relative path, the script must reside in the +# checked out pull request of the target repository (e.g., +# EESSI/software-layer). +# +# The bot calls the script with the two arguments: +# 1. private key (as provided by the attribute 'key') +# 2. path to the file to be signed (the upload script will determine that) +# +# NOTE (on "container_runtime"), signing requires a recent installation of OpenSSH +# (8.2 or newer). If the frontend where the event handler runs does not have that +# version installed, you can specify a container runtime via the 'container_runtime' +# attribute below. Currently, only Singularity or Apptainer are supported. +# NOTE (on the private key file), make sure the file permissions are restricted to `0600` +# (only readable+writable by the file owner) or the signing will likely fail. +# NOTE (on the JSON-like format), make sure commas are only used for separating elements +# or parsing/loading the json will likely fail. Also, the whole value should start +# at a new line and be indented as shown below. +# NOTE (on double quotes), as shown in the example below, use double quotes for all keys +# and values. +signing = + { + "eessi.io-2023.06-software": { + "script": "PATH_TO_SIGN_SCRIPT", + "key": "PATH_TO_EESSI_BOT/config/user-site-system.key", + "container_runtime": "PATH_TO_CONTAINER_RUNTIME" + } + } -[architecturetargets] -# defines both for which architectures the bot will build -# and what submission parameters shall be used -arch_target_map = { "linux/x86_64/generic" : "--constraint shape=c4.2xlarge", "linux/x86_64/amd/zen2": "--constraint shape=c5a.2xlarge" } +[architecturetargets] +# arch_target_map has been replaced by node_type_map +# arch_target_map = { +# } + +# Each entry in the node_type_map dictionary describes a build node type. 
The key is a (descriptive) name for this build node, and its value is a dictionary containing the following build node properties as key-value pairs: +# - os: its operating system (os) +# - cpu_subdir: its CPU architecture +# - slurm_params: the SLURM parameters that need to be passed to submit jobs to it +# - repo_targets: supported repository targets for this node type +# - accel (optional): which accelerators this node has +# All are strings, except repo_targets, which is a list of strings. +# Note that the Slurm parameters should typically be chosen such that a single type of node (with one specific type of +# CPU and one specific type of GPU) should be allocated. +# Below is an example configuration for a system that contains 4 types of nodes: zen2 CPU nodes, zen4 CPU nodes, +# GPU nodes with an icelake CPU and A100 GPU, GPU nodes with a zen4 CPU and an H100 GPU. +# The 'on:' argument to the bot build command determines which node type will be allocated for the build job, +# e.g. 'bot:build on:arch=zen4,accel=nvidia/cc90 for:...' will match the gpu_h100 node type below. +# If no 'on:' argument is passed to the build command, the 'for:' argument is used instead, +# e.g. 'bot:build for:arch=icelake,accel=nvidia/cc80' will match the gpu_a100 node type below.
+node_type_map = { + "cpu_zen2": { + "os": "linux", + "cpu_subdir": "x86_64/amd/zen2", + "slurm_params": "-p rome --nodes 1 --ntasks-per-node 16 --cpus-per-task 1", + "repo_targets": ["eessi.io-2023.06-compat","eessi.io-2023.06-software"] + }, + "cpu_zen4": { + "os": "linux", + "cpu_subdir": "x86_64/amd/zen4", + "accel": "None", + "slurm_params": "-p genoa --nodes 1 --ntasks-per-node 24 --cpus-per-task 1", + "repo_targets": ["eessi.io-2023.06-compat","eessi.io-2023.06-software"] + }, + "gpu_a100": { + "os": "linux", + "cpu_subdir": "x86_64/intel/icelake", + "accel": "nvidia/cc80", + "slurm_params": "-p gpu_a100 --nodes 1 --tasks-per-node 18 --cpus-per-task 1 --gpus-per-node 1", + "repo_targets": ["eessi.io-2023.06-compat","eessi.io-2023.06-software"] + }, + "gpu_h100": { + "os": "linux", + "cpu_subdir": "x86_64/amd/zen4", + "accel": "nvidia/cc90", + "slurm_params": "-p gpu_h100 --nodes 1 --tasks-per-node 16 --cpus-per-task 1 --gpus-per-node 1", + "repo_targets": ["eessi.io-2023.06-compat","eessi.io-2023.06-software"] + }} [repo_targets] -# defines for which repository a arch_target should be build for -# -# EESSI/2021.12 and NESSI/2022.11 -repo_target_map = { "linux/x86_64/amd/zen2" : ["eessi-2021.12","nessi.no-2022.11"] } + +# No longer used, repo targets are now specified per node type in the node_type_map +# repo_target_map = { +# "linux/x86_64/amd/zen2" : ["eessi.io-2023.06-software","eessi.io-2025.06-software"] } # points to definition of repositories (default repository defined by build container) -repos_cfg_dir = PATH_TO_SHARED_DIRECTORY/cfg_bundles +repos_cfg_dir = PATH_TO_SHARED_DIRECTORY/repos # configuration for event handler which receives events from a GitHub repository. @@ -299,6 +384,11 @@ poll_command = /usr/bin/squeue poll_interval = 60 # full path to the command for manipulating existing jobs +# It is also possible to add placeholder values to the scontrol_command. 
+# An example where this may be useful is in a setup where multiple clusters are managed by the same SLURM instance, +# and the `scontrol_command` for that instance needs to get the correct cluster name passed. +# This can be achieved by defining `scontrol_command = /usr/bin/scontrol --clusters=%%(cluster)s`. +# Valid placeholder names are currently: `jobid`, `cluster`, `partition`, `state`, and `reason`. scontrol_command = /usr/bin/scontrol @@ -311,16 +401,22 @@ scontrol_command = /usr/bin/scontrol # are removed, the output (in PR comments) will lack important # information. [submitted_job_comments] -awaits_release = job id `{job_id}` awaits release by job manager +# awaits_release is no longer used since bot release v0.7.0 +# awaits_release = job id `{job_id}` awaits release by job manager awaits_release_delayed_begin_msg = job id `{job_id}` will be eligible to start in about {delay_seconds} seconds awaits_release_hold_release_msg = job id `{job_id}` awaits release by job manager -initial_comment = New job on instance `{app_name}` for CPU micro-architecture `{arch_name}`{accelerator_spec} for repository `{repo_id}` in job dir `{symlink}` +new_job_instance_repo = New job on instance `{app_name}` for repository `{repo_id}` +build_on_arch = Building on: `{on_arch}`{on_accelerator} +build_for_arch = Building for: `{for_arch}`{for_accelerator} +jobdir = Job dir: `{symlink}` with_accelerator =  and accelerator `{accelerator}` +# initial_comment = New job on instance `{app_name}` for repository `{repo_id}`\nBuilding on: `{on_arch}`{on_accelerator}\nBuilding for: `{for_arch}`{for_accelerator}\nJob dir: `{symlink}` # no longer used [new_job_comments] awaits_launch = job awaits launch by Slurm scheduler{extra_info} + [running_job_comments] running_job = job `{job_id}` is running @@ -329,6 +425,7 @@ running_job = job `{job_id}` is running job_result_unknown_fmt =
:shrug: UNKNOWN _(click triangle for detailed information)_
  • Job results file `{filename}` does not exist in job directory, or parsing it failed.
  • No artefacts were found/reported.
job_test_unknown_fmt =
:shrug: UNKNOWN _(click triangle for detailed information)_
  • Job test file `{filename}` does not exist in job directory, or parsing it failed.
+ [download_pr_comments] git_clone_failure = Unable to clone the target repository. git_clone_tip = _Tip: This could be a connection failure. Try again and if the issue remains check if the address is correct_. @@ -338,6 +435,9 @@ curl_failure = Unable to download the `.diff` file. curl_tip = _Tip: This could be a connection failure. Try again and if the issue remains check if the address is correct_ git_apply_failure = Unable to download or merge changes between the source branch and the destination branch. git_apply_tip = _Tip: This can usually be resolved by syncing your branch and resolving any merge conflicts._ +pr_diff_failure = Unable to obtain PR diff. +pr_diff_tip = _Tip: This could be a problem with SSH access to the repository._ + [clean_up] trash_bin_dir = $HOME/trash_bin diff --git a/containers/Dockerfile.smee-client b/containers/Dockerfile.smee-client new file mode 100644 index 00000000..51376e43 --- /dev/null +++ b/containers/Dockerfile.smee-client @@ -0,0 +1,7 @@ +ARG smee_client_version=4.2.1 + +FROM node:lts-alpine +ARG smee_client_version +RUN npm install --global smee-client@${smee_client_version} +ENTRYPOINT ["smee"] +CMD ["--help"] diff --git a/eessi_bot_event_handler.py b/eessi_bot_event_handler.py index 84128305..66cb6977 100644 --- a/eessi_bot_event_handler.py +++ b/eessi_bot_event_handler.py @@ -29,8 +29,8 @@ # Local application imports (anything from EESSI/eessi-bot-software-layer) from connections import github -from tasks.build import check_build_permission, get_architecture_targets, get_repo_cfg, \ - request_bot_build_issue_comments, submit_build_jobs +from tasks.build import check_build_permission, get_node_types, request_bot_build_issue_comments, \ + submit_build_jobs from tasks.deploy import deploy_built_artefacts, determine_job_dirs from tasks.clean_up import move_to_trash_bin from tools import config @@ -43,7 +43,7 @@ REQUIRED_CONFIG = { config.SECTION_ARCHITECTURETARGETS: [ - config.ARCHITECTURETARGETS_SETTING_ARCH_TARGET_MAP], # 
required + config.NODE_TYPE_MAP], # required config.SECTION_BOT_CONTROL: [ # config.BOT_CONTROL_SETTING_CHATLEVEL, # optional config.BOT_CONTROL_SETTING_COMMAND_PERMISSION, # required @@ -54,6 +54,7 @@ config.BUILDENV_SETTING_BUILD_LOGS_DIR, # optional+recommended config.BUILDENV_SETTING_BUILD_PERMISSION, # optional+recommended config.BUILDENV_SETTING_CONTAINER_CACHEDIR, # optional+recommended + # config.BUILDENV_SETTING_CLONE_GIT_REPO_VIA, # optional # config.BUILDENV_SETTING_CVMFS_CUSTOMIZATIONS, # optional # config.BUILDENV_SETTING_HTTPS_PROXY, # optional # config.BUILDENV_SETTING_HTTP_PROXY, # optional @@ -88,7 +89,9 @@ config.DOWNLOAD_PR_COMMENTS_SETTING_GIT_CHECKOUT_FAILURE, # required config.DOWNLOAD_PR_COMMENTS_SETTING_GIT_CHECKOUT_TIP, # required config.DOWNLOAD_PR_COMMENTS_SETTING_GIT_CLONE_FAILURE, # required - config.DOWNLOAD_PR_COMMENTS_SETTING_GIT_CLONE_TIP], # required + config.DOWNLOAD_PR_COMMENTS_SETTING_GIT_CLONE_TIP, # required + config.DOWNLOAD_PR_COMMENTS_SETTING_PR_DIFF_FAILURE, # required + config.DOWNLOAD_PR_COMMENTS_SETTING_PR_DIFF_TIP], # required config.SECTION_EVENT_HANDLER: [ config.EVENT_HANDLER_SETTING_LOG_PATH], # required config.SECTION_GITHUB: [ @@ -101,10 +104,12 @@ config.SECTION_JOB_MANAGER: [ config.JOB_MANAGER_SETTING_POLL_INTERVAL], # required config.SECTION_REPO_TARGETS: [ - config.REPO_TARGETS_SETTING_REPO_TARGET_MAP, # required config.REPO_TARGETS_SETTING_REPOS_CFG_DIR], # required config.SECTION_SUBMITTED_JOB_COMMENTS: [ - config.SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT, # required + config.SUBMITTED_JOB_COMMENTS_SETTING_INSTANCE_REPO, # required + config.SUBMITTED_JOB_COMMENTS_SETTING_BUILD_ON_ARCH, # required + config.SUBMITTED_JOB_COMMENTS_SETTING_BUILD_FOR_ARCH, # required + config.SUBMITTED_JOB_COMMENTS_SETTING_JOBDIR, # required # config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE, # optional config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_DELAYED_BEGIN_MSG, # required 
config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_HOLD_RELEASE_MSG, # required @@ -408,23 +413,21 @@ def handle_pull_request_opened_event(self, event_info, pr, req_chatlevel=ChatLev app_name = self.cfg[config.SECTION_GITHUB][config.GITHUB_SETTING_APP_NAME] # TODO check if PR already has a comment with arch targets and # repositories - arch_map = get_architecture_targets(self.cfg) - repo_cfg = get_repo_cfg(self.cfg) - - comment = f"Instance `{app_name}` is configured to build for:" - architectures = ['/'.join(arch.split('/')[1:]) for arch in arch_map.keys()] - comment += "\n- architectures: " - if len(architectures) > 0: - comment += f"{', '.join([f'`{arch}`' for arch in architectures])}" - else: - comment += "none" - repositories = list(set([repo_id for repo_ids in repo_cfg[config.REPO_TARGETS_SETTING_REPO_TARGET_MAP].values() - for repo_id in repo_ids])) - comment += "\n- repositories: " - if len(repositories) > 0: - comment += f"{', '.join([f'`{repo_id}`' for repo_id in repositories])}" - else: - comment += "none" + node_map = get_node_types(self.cfg) + + comment = f"Instance `{app_name}` is configured to build on:" + for node in node_map: + comment += f"\n- Node type `{node}`:" + current_node_type = node_map[node] + if "os" in current_node_type: + comment += f"\n - OS: `{current_node_type['os']}`" + if "cpu_subdir" in current_node_type: + comment += f"\n - CPU architecture: `{current_node_type['cpu_subdir']}`" + if "repo_targets" in current_node_type: + comment += f"\n - Repositories: `{current_node_type['repo_targets']}`" + if "accel" in current_node_type: + comment += f"\n - Accelerators: `{current_node_type['accel']}`" + comment += "\n" self.log(f"PR opened: comment '{comment}'") @@ -527,9 +530,15 @@ def handle_bot_command_build(self, event_info, bot_command): pr_number = event_info['raw_request_body']['issue']['number'] pr = gh.get_repo(repo_name).get_pull(pr_number) build_msg = '' + # Require that build_params is defined, it is required. 
Otherwise, return early + if bot_command.build_params is None: + build_msg = "No 'for:' argument was passed to the bot:build command. This argument is required, so " + build_msg += "not submitting build jobs" + return build_msg + if check_build_permission(pr, event_info): # use filter from command - submitted_jobs = submit_build_jobs(pr, event_info, bot_command.action_filters) + submitted_jobs = submit_build_jobs(pr, event_info, bot_command.action_filters, bot_command.build_params) if submitted_jobs is None or len(submitted_jobs) == 0: build_msg = "\n - no jobs were submitted" else: @@ -575,20 +584,71 @@ def handle_bot_command_status(self, event_info, bot_command): bot_command (EESSIBotCommand): command to be handled Returns: - github.IssueComment.IssueComment (note, github refers to - PyGithub, not the github from the internal connections module) + (string): list item with a link to the issue comment that was created + containing the status overview """ self.log("processing bot command 'status'") repo_name = event_info['raw_request_body']['repository']['full_name'] pr_number = event_info['raw_request_body']['issue']['number'] status_table = request_bot_build_issue_comments(repo_name, pr_number) + if 'last_build' in bot_command.general_args: + # If the bot command is something like 'bot:status =last_build', then only retain the last build for each + # architecture in the status_table + # To do this, we first insert a timestamp to facilitate sorting by time + # Then, we obtain sorting indices that first sort by architecture, then by build time + # Then, we reverse the sorting, so that the last build (highest timestamp) for each architecture occurs + # first. 
+ # Finally, we copy the table, but each time we encounter an entry for an architecture that we've already + # copied, we ignore it, since - as a result of the sorting - the second entry is always older than the + # first + dates = status_table['date'] + timestamps = [] + for date in dates: + date_object = datetime.strptime(date, "%b %d %X %Z %Y") + timestamps.append(int(date_object.timestamp())) + status_table['timestamp'] = timestamps + + # Figure out the sorting indices, so that things are sorted first by the 'for arch', and then by 'date' + sorted_indices = sorted( + range(len(status_table['for arch'])), + key=lambda x: (status_table['for arch'][x], status_table['timestamp'][x]) + ) + # Reverse, so that the newest builds are first + sorted_indices.reverse() + # Apply the sorted indices to get a sorted table + sorted_table = {key: [status_table[key][i] for i in sorted_indices] for key in status_table} + self.log(f"Sorted status table: {sorted_table}") + + # Keep only the first entry for each 'for arch', as that is now the newest + status_table_last = { + 'on arch': [], 'for arch': [], 'for repo': [], 'date': [], 'status': [], 'url': [], 'result': [] + } + for x in range(0, len(sorted_table['date'])): + if sorted_table['for arch'][x] not in status_table_last['for arch']: + self.log(f"arch: {sorted_table['for arch'][x]} not yet in status_table_last") + for key in status_table_last: + self.log(f"Adding to '{key}' and the value {sorted_table[key][x]}") + status_table_last[key].append(sorted_table[key][x]) + + # Re-sort, now only on 'for arch', for nicer viewing + sorted_indices = sorted( + range(len(status_table_last['for arch'])), + key=lambda x: status_table_last['for arch'][x] + ) + sorted_table_last = {key: [status_table_last[key][i] for i in sorted_indices] for key in status_table_last} + + # overwrite the original status_table + status_table = sorted_table_last + comment_status = '' comment_status += "\nThis is the status of all the `bot: build` commands:" - 
comment_status += "\n|arch|result|date|status|url|" - comment_status += "\n|----|------|----|------|---|" + comment_status += "\n|on|for|repo|result|date|status|url|" + comment_status += "\n|----|----|----|------|----|------|---|" for x in range(0, len(status_table['date'])): - comment_status += f"\n|{status_table['arch'][x]}|" + comment_status += f"\n|{status_table['on arch'][x]}|" + comment_status += f"{status_table['for arch'][x]}|" + comment_status += f"{status_table['for repo'][x]}|" comment_status += f"{status_table['result'][x]}|" comment_status += f"{status_table['date'][x]}|" comment_status += f"{status_table['status'][x]}|" @@ -596,7 +656,10 @@ def handle_bot_command_status(self, event_info, bot_command): self.log(f"Overview of finished builds: comment '{comment_status}'") issue_comment = create_comment(repo_name, pr_number, comment_status, ChatLevels.MINIMAL) - return issue_comment + if issue_comment: + return f"\n - added status comment {issue_comment.html_url}" + else: + return "\n - failed to create status comment" def start(self, app, port=3000): """ @@ -689,7 +752,7 @@ def main(): opts = event_handler_parse() # config is read and checked for settings to raise an exception early when the event_handler starts. 
- if config.check_required_cfg_settings(REQUIRED_CONFIG): + if config.check_cfg_settings(REQUIRED_CONFIG): print("Configuration check: PASSED") else: print("Configuration check: FAILED") diff --git a/eessi_bot_job_manager.py b/eessi_bot_job_manager.py index 4fcf9af3..fd67b913 100644 --- a/eessi_bot_job_manager.py +++ b/eessi_bot_job_manager.py @@ -118,6 +118,8 @@ def get_current_jobs(self): squeue_cmd = "%s --long --noheader --user=%s" % (self.poll_command, username) if self.job_name: squeue_cmd += " --name='%s'" % self.job_name + # Format the output of SLURM + squeue_cmd += " --Format JobId:100@,Cluster:100@,Partition:100@,State:100@,Reason:100" squeue_output, squeue_err, squeue_exitcode = run_cmd( squeue_cmd, "get_current_jobs(): squeue command", @@ -138,18 +140,23 @@ def get_current_jobs(self): # get job info, logging any Slurm issues # Note, all output lines of squeue are processed because we run it with # --noheader. - for line in lines: - job = line.rstrip().split() - if len(job) >= 9: - job_id = job[0] - state = job[4] - current_jobs[job_id] = { - "jobid": job_id, - "state": state, - "reason": job[8], - } - if state in bad_state_messages: - log("Job {} in state {}: {}".format(job_id, state, bad_state_messages[state])) + if lines != ['']: + for line in lines: + job = [x.rstrip() for x in line.rstrip().split('@')] + if len(job) == 5: + job_id = job[0] + state = job[3] + current_jobs[job_id] = { + "jobid": job_id, + "cluster": job[1], + "partition": job[2], + "state": state, + "reason": job[4], + } + if state in bad_state_messages: + log("Job {} in state {}: {}".format(job_id, state, bad_state_messages[state])) + else: + raise Exception(f"The output of {squeue_cmd} does not have 5 job parameters") return current_jobs @@ -296,17 +303,25 @@ def process_new_job(self, new_job): """ job_id = new_job["jobid"] - scontrol_cmd = "%s --oneliner show jobid %s" % ( - self.scontrol_command, + # processing placeholders in scontrol command which is defined in the bot's 
app.cfg (setting `scontrol_command`) + try: + templated_scontrol_command = self.scontrol_command % new_job + except KeyError: + log(f"Failed to process {self.scontrol_command}.") + log(f"Information on placeholder is not collected in new_job: {new_job}.") + raise + + cmd = "%s --oneliner show jobid %s" % ( + templated_scontrol_command, job_id, ) scontrol_output, scontrol_err, scontrol_exitcode = run_cmd( - scontrol_cmd, + cmd, "process_new_job(): scontrol command", log_file=self.logfile, ) - # parse output of 'scontrol_cmd' + # parse output of scontrol command that fetches job info job_info = self.parse_scontrol_show_job_output(str(scontrol_output)) # check if job_info contains 'WorkDir', if not we cannot process the job @@ -351,7 +366,7 @@ def process_new_job(self, new_job): extra_info = '' if self.job_handover_protocol == config.JOB_HANDOVER_PROTOCOL_HOLD_RELEASE: release_cmd = "%s release %s" % ( - self.scontrol_command, + templated_scontrol_command, job_id, ) @@ -623,7 +638,7 @@ def main(): # config is read and checked for settings to raise an exception early when # the job_manager runs - if config.check_required_cfg_settings(REQUIRED_CONFIG): + if config.check_cfg_settings(REQUIRED_CONFIG): print("Configuration check: PASSED") else: print("Configuration check: FAILED") diff --git a/tasks/build.py b/tasks/build.py index 517c4077..6148227e 100644 --- a/tasks/build.py +++ b/tasks/build.py @@ -23,7 +23,9 @@ from datetime import datetime, timezone import json import os +import re import shutil +import string import sys # Third party imports (anything installed into the local Python environment) @@ -33,7 +35,7 @@ from tools import config, cvmfs_repository, job_metadata, pr_comments, run_cmd import tools.filter as tools_filter from tools.pr_comments import ChatLevels, create_comment - +from tools.build_params import BUILD_PARAM_ARCH, BUILD_PARAM_ACCEL # defaults (used if not specified via, eg, 'app.cfg') DEFAULT_JOB_TIME_LIMIT = "24:00:00" @@ -42,7 +44,8 @@ 
_ERROR_CURL = "curl" _ERROR_GIT_APPLY = "git apply" _ERROR_GIT_CHECKOUT = "git checkout" -_ERROR_GIT_CLONE = "curl" +_ERROR_GIT_CLONE = "git clone" +_ERROR_PR_DIFF = "pr_diff" _ERROR_NONE = "none" # other constants @@ -171,29 +174,32 @@ def get_build_env_cfg(cfg): log(f"{fn}(): load_modules '{load_modules}'") config_data[config.BUILDENV_SETTING_LOAD_MODULES] = load_modules + clone_git_repo_via = buildenv.get(config.BUILDENV_SETTING_CLONE_GIT_REPO_VIA, None) + log(f"{fn}(): clone_git_repo_via '{clone_git_repo_via}'") + config_data[config.BUILDENV_SETTING_CLONE_GIT_REPO_VIA] = clone_git_repo_via + return config_data -def get_architecture_targets(cfg): - """ - Obtain mappings of architecture targets to Slurm parameters +def get_node_types(cfg): + """Obtain mappings of node types to Slurm parameters Args: cfg (ConfigParser): ConfigParser instance holding full configuration (typically read from 'app.cfg') Returns: - (dict): dictionary mapping architecture targets (format - OS/SOFTWARE_SUBDIR) to architecture specific Slurm job submission - parameters + (dict): Dictionary mapping node type names (arbitrary text) to node properties + such as the OS, CPU software subdir, supported repositories, accelerator (optionally) + as well as the slurm parameters to allocate such a type of node """ fn = sys._getframe().f_code.co_name - architecture_targets = cfg[config.SECTION_ARCHITECTURETARGETS] + node_types = cfg[config.SECTION_ARCHITECTURETARGETS] - arch_target_map = json.loads(architecture_targets.get(config.ARCHITECTURETARGETS_SETTING_ARCH_TARGET_MAP)) - log(f"{fn}(): arch target map '{json.dumps(arch_target_map)}'") - return arch_target_map + node_type_map = json.loads(node_types.get(config.NODE_TYPE_MAP)) + log(f"{fn}(): node type map '{json.dumps(node_type_map)}'") + return node_type_map def get_allowed_exportvars(cfg): @@ -236,8 +242,6 @@ def get_repo_cfg(cfg): Returns: (dict): dictionary containing repository settings as follows - {config.REPO_TARGETS_SETTING_REPOS_CFG_DIR: 
path to repository config directory as defined in 'app.cfg'} - - {config.REPO_TARGETS_SETTING_REPO_TARGET_MAP: json of - config.REPO_TARGETS_SETTING_REPO_TARGET_MAP value as defined in 'app.cfg'} - for all sections [repo_id] defined in config.REPO_TARGETS_SETTING_REPOS_CFG_DIR/repos.cfg add a mapping {repo_id: dictionary containing settings of that section} """ @@ -254,21 +258,6 @@ def get_repo_cfg(cfg): settings_repos_cfg_dir = config.REPO_TARGETS_SETTING_REPOS_CFG_DIR repo_cfg[settings_repos_cfg_dir] = repo_cfg_org.get(settings_repos_cfg_dir, None) - repo_map = {} - try: - repo_map_str = repo_cfg_org.get(config.REPO_TARGETS_SETTING_REPO_TARGET_MAP) - log(f"{fn}(): repo_map '{repo_map_str}'") - - if repo_map_str is not None: - repo_map = json.loads(repo_map_str) - - log(f"{fn}(): repo_map '{json.dumps(repo_map)}'") - except json.JSONDecodeError as err: - print(err) - error(f"{fn}(): Value for repo_map ({repo_map_str}) could not be decoded.") - - repo_cfg[config.REPO_TARGETS_SETTING_REPO_TARGET_MAP] = repo_map - if repo_cfg[config.REPO_TARGETS_SETTING_REPOS_CFG_DIR] is None: return repo_cfg @@ -379,7 +368,7 @@ def clone_git_repo(repo, path): return (clone_output, clone_error, clone_exit_code) -def download_pr(repo_name, branch_name, pr, arch_job_dir): +def download_pr(repo_name, branch_name, pr, arch_job_dir, clone_via=None): """ Download pull request to job working directory @@ -388,6 +377,7 @@ def download_pr(repo_name, branch_name, pr, arch_job_dir): branch_name (string): name of the base branch of the pull request pr (github.PullRequest.PullRequest): instance representing the pull request arch_job_dir (string): working directory of the job to be submitted + clone_via (string): mechanism to clone Git repository, should be 'https' (default) or 'ssh' Returns: None (implicitly), in case an error is caught in the git clone, git checkout, curl, @@ -400,7 +390,29 @@ def download_pr(repo_name, branch_name, pr, arch_job_dir): # - 'git checkout' base branch of pull 
request # - 'curl' diff for pull request # - 'git apply' diff file - clone_output, clone_error, clone_exit_code = clone_git_repo(f'https://github.com/{repo_name}', arch_job_dir) + log(f"Cloning Git repo via: {clone_via}") + if clone_via in (None, 'https'): + repo_url = f'https://github.com/{repo_name}' + pr_diff_cmd = ' '.join([ + 'curl -L', + '-H "Accept: application/vnd.github.diff"', + '-H "X-GitHub-Api-Version: 2022-11-28"', + f'https://api.github.com/repos/{repo_name}/pulls/{pr.number} > {pr.number}.diff', + ]) + elif clone_via == 'ssh': + repo_url = f'git@github.com:{repo_name}.git' + pr_diff_cmd = ' && '.join([ + f"git fetch origin pull/{pr.number}/head:pr{pr.number}", + f"git diff $(git merge-base pr{pr.number} HEAD) pr{pr.number} > {pr.number}.diff", + ]) + else: + clone_output = '' + clone_error = f"Unknown mechanism to clone Git repo: {clone_via}" + clone_exit_code = 1 + error_stage = _ERROR_GIT_CLONE + return clone_output, clone_error, clone_exit_code, error_stage + + clone_output, clone_error, clone_exit_code = clone_git_repo(repo_url, arch_job_dir) if clone_exit_code != 0: error_stage = _ERROR_GIT_CLONE return clone_output, clone_error, clone_exit_code, error_stage @@ -417,24 +429,18 @@ def download_pr(repo_name, branch_name, pr, arch_job_dir): error_stage = _ERROR_GIT_CHECKOUT return checkout_output, checkout_err, checkout_exit_code, error_stage - curl_cmd = ' '.join([ - 'curl -L', - '-H "Accept: application/vnd.github.diff"', - '-H "X-GitHub-Api-Version: 2022-11-28"', - f'https://api.github.com/repos/{repo_name}/pulls/{pr.number} > {pr.number}.diff', - ]) - log(f'curl with command {curl_cmd}') - curl_output, curl_error, curl_exit_code = run_cmd( - curl_cmd, "Obtain patch", arch_job_dir, raise_on_error=False + log(f'obtaining PR diff with command {pr_diff_cmd}') + pr_diff_output, pr_diff_error, pr_diff_exit_code = run_cmd( + pr_diff_cmd, "obtain PR diff", arch_job_dir, raise_on_error=False ) - if curl_exit_code != 0: - error_stage = _ERROR_CURL - 
return curl_output, curl_error, curl_exit_code, error_stage + if pr_diff_exit_code != 0: + error_stage = _ERROR_PR_DIFF + return pr_diff_output, pr_diff_error, pr_diff_exit_code, error_stage git_apply_cmd = f'git apply {pr.number}.diff' log(f'git apply with command {git_apply_cmd}') git_apply_output, git_apply_error, git_apply_exit_code = run_cmd( - git_apply_cmd, "Apply patch", arch_job_dir, raise_on_error=False + git_apply_cmd, "apply patch", arch_job_dir, raise_on_error=False ) if git_apply_exit_code != 0: error_stage = _ERROR_GIT_APPLY @@ -481,6 +487,12 @@ def comment_download_pr(base_repo_name, pr, download_pr_exit_code, download_pr_e download_comment = (f"```{download_pr_error}```\n" f"{download_pr_comments_cfg[config.DOWNLOAD_PR_COMMENTS_SETTING_GIT_APPLY_FAILURE]}" f"\n{download_pr_comments_cfg[config.DOWNLOAD_PR_COMMENTS_SETTING_GIT_APPLY_TIP]}") + elif error_stage == _ERROR_PR_DIFF: + download_comment = (f"```{download_pr_error}```\n" + f"{download_pr_comments_cfg[config.DOWNLOAD_PR_COMMENTS_SETTING_PR_DIFF_FAILURE]}" + f"\n{download_pr_comments_cfg[config.DOWNLOAD_PR_COMMENTS_SETTING_PR_DIFF_TIP]}") + else: + download_comment = f"```{download_pr_error}```" download_comment = pr_comments.create_comment( repo_name=base_repo_name, pr_number=pr.number, comment=download_comment, req_chatlevel=ChatLevels.MINIMAL @@ -540,7 +552,7 @@ def prepare_export_vars_file(job_dir, exportvars): log(f"{fn}(): created exported variables file {export_vars_path}") -def prepare_jobs(pr, cfg, event_info, action_filter): +def prepare_jobs(pr, cfg, event_info, action_filter, build_params): """ Prepare all jobs whose context matches the given filter. 
Preparation includes creating a working directory for a job, downloading the pull request into @@ -551,6 +563,7 @@ def prepare_jobs(pr, cfg, event_info, action_filter): cfg (ConfigParser): instance holding full configuration (typically read from 'app.cfg') event_info (dict): event received by event_handler action_filter (EESSIBotActionFilter): used to filter which jobs shall be prepared + build_params (EESSIBotBuildParams): dict that contains the build parameters for the job Returns: (list): list of the prepared jobs @@ -559,7 +572,7 @@ def prepare_jobs(pr, cfg, event_info, action_filter): app_name = cfg[config.SECTION_GITHUB].get(config.GITHUB_SETTING_APP_NAME) build_env_cfg = get_build_env_cfg(cfg) - arch_map = get_architecture_targets(cfg) + node_map = get_node_types(cfg) repocfg = get_repo_cfg(cfg) allowed_exportvars = get_allowed_exportvars(cfg) @@ -599,13 +612,25 @@ def prepare_jobs(pr, cfg, event_info, action_filter): return [] jobs = [] - for arch, slurm_opt in arch_map.items(): - arch_dir = arch.replace('/', '_') - # check if repo_target_map contains an entry for {arch} - if arch not in repocfg[config.REPO_TARGETS_SETTING_REPO_TARGET_MAP]: - log(f"{fn}(): skipping arch {arch} because repo target map does not define repositories to build for") + # Looping over all node types in the node_map to create a context for each node type and repository + # configured there. Then, check the action filters against these configs to find matching ones. 
+ # If there is a match, prepare the job dir and create the Job object + for node_type_name, partition_info in node_map.items(): + log(f"{fn}(): node_type_name is {node_type_name}, partition_info is {partition_info}") + # Unpack for convenience + arch_dir = build_params[BUILD_PARAM_ARCH] + if BUILD_PARAM_ACCEL in build_params: + arch_dir += f"/{build_params[BUILD_PARAM_ACCEL]}" + build_for_accel = build_params[BUILD_PARAM_ACCEL] + else: + build_for_accel = '' + arch_dir = arch_dir.replace('/', '_') + # check if repo_targets is defined for this virtual partition + if 'repo_targets' not in partition_info: + log(f"{fn}(): skipping arch {node_type_name}, " + "because no repo_targets were defined for this (virtual) partition") continue - for repo_id in repocfg[config.REPO_TARGETS_SETTING_REPO_TARGET_MAP][arch]: + for repo_id in partition_info['repo_targets']: # ensure repocfg contains information about the repository repo_id if repo_id != EESSI # Note, EESSI is a bad/misleading name, it should be more like AS_IN_CONTAINER if (repo_id != "EESSI" and repo_id != "EESSI-pilot") and repo_id not in repocfg: @@ -618,8 +643,16 @@ def prepare_jobs(pr, cfg, event_info, action_filter): # false --> log & continue to next iteration of for loop if action_filter: log(f"{fn}(): checking filter {action_filter.to_string()}") - context = {"architecture": arch, "repository": repo_id, "instance": app_name} + context = { + "architecture": partition_info['cpu_subdir'], + "repository": repo_id, + "instance": app_name + } + # Optionally add accelerator to the context + if 'accel' in partition_info: + context['accelerator'] = partition_info['accel'] log(f"{fn}(): context is '{json.dumps(context, indent=4)}'") + if not action_filter.check_filters(context): log(f"{fn}(): context does NOT satisfy filter(s), skipping") continue @@ -627,36 +660,36 @@ def prepare_jobs(pr, cfg, event_info, action_filter): log(f"{fn}(): context DOES satisfy filter(s), going on with job") # we reached this point when the filter 
matched (otherwise we # 'continue' with the next repository) - # for each match of the filter we create a specific job directory - # however, matching CPU architectures works differently to handling - # accelerators; multiple CPU architectures defined in arch_target_map - # can match the (CPU) architecture component of a filter; in - # contrast, the value of the accelerator filter is just passed down - # to scripts in bot/ directory of the pull request (see function - # prepare_job_cfg and creation of Job tuple below) + # We create a specific job directory for the architecture that is going to be built 'for:' job_dir = os.path.join(run_dir, arch_dir, repo_id) os.makedirs(job_dir, exist_ok=True) log(f"{fn}(): job_dir '{job_dir}'") # TODO optimisation? download once, copy and cleanup initial copy? + clone_git_repo_via = build_env_cfg.get(config.BUILDENV_SETTING_CLONE_GIT_REPO_VIA) download_pr_output, download_pr_error, download_pr_exit_code, error_stage = download_pr( - base_repo_name, base_branch_name, pr, job_dir + base_repo_name, base_branch_name, pr, job_dir, clone_via=clone_git_repo_via, ) comment_download_pr(base_repo_name, pr, download_pr_exit_code, download_pr_error, error_stage) # prepare job configuration file 'job.cfg' in directory /cfg - cpu_target = '/'.join(arch.split('/')[1:]) - os_type = arch.split('/')[0] - - log(f"{fn}(): arch = '{arch}' => cpu_target = '{cpu_target}' , os_type = '{os_type}'" - f", accelerator = '{accelerator}'") - - prepare_job_cfg(job_dir, build_env_cfg, repocfg, repo_id, cpu_target, os_type, accelerator) + msg = f"{fn}(): node type = '{node_type_name}' => " + msg += f"requested cpu_target = '{partition_info['cpu_subdir']}', " + msg += f"build cpu_target = '{build_params[BUILD_PARAM_ARCH]}', " + msg += f"configured os = '{partition_info['os']}', " + if 'accel' in partition_info: + msg += f"requested accelerator(s) = '{partition_info['accel']}', " + msg += f"build accelerator = '{build_for_accel}'" + log(msg) + + 
prepare_job_cfg(job_dir, build_env_cfg, repocfg, repo_id, build_params[BUILD_PARAM_ARCH], + partition_info['os'], build_for_accel, node_type_name) if exportvars: prepare_export_vars_file(job_dir, exportvars) # enlist jobs to proceed - job = Job(job_dir, arch, repo_id, slurm_opt, year_month, pr_id, accelerator) + job = Job(job_dir, partition_info['cpu_subdir'], repo_id, partition_info['slurm_params'], year_month, + pr_id, accelerator) jobs.append(job) log(f"{fn}(): {len(jobs)} jobs to proceed after applying white list") @@ -666,7 +699,7 @@ def prepare_jobs(pr, cfg, event_info, action_filter): return jobs -def prepare_job_cfg(job_dir, build_env_cfg, repos_cfg, repo_id, software_subdir, os_type, accelerator): +def prepare_job_cfg(job_dir, build_env_cfg, repos_cfg, repo_id, software_subdir, os_type, accelerator, node_type_name): """ Set up job configuration file 'job.cfg' in directory /cfg @@ -678,6 +711,7 @@ def prepare_job_cfg(job_dir, build_env_cfg, repos_cfg, repo_id, software_subdir, software_subdir (string): software subdirectory to build for (e.g., 'x86_64/generic') os_type (string): type of the os (e.g., 'linux') accelerator (string): defines accelerator to build for (e.g., 'nvidia/cc80') + node_type_name (string): the node type name, as configured in app.cfg Returns: None (implicitly) @@ -748,6 +782,7 @@ def prepare_job_cfg(job_dir, build_env_cfg, repos_cfg, repo_id, software_subdir, job_cfg_arch_section = job_metadata.JOB_CFG_ARCHITECTURE_SECTION job_cfg[job_cfg_arch_section] = {} + job_cfg[job_cfg_arch_section][job_metadata.JOB_CFG_ARCHITECTURE_NODE_TYPE] = node_type_name job_cfg[job_cfg_arch_section][job_metadata.JOB_CFG_ARCHITECTURE_SOFTWARE_SUBDIR] = software_subdir job_cfg[job_cfg_arch_section][job_metadata.JOB_CFG_ARCHITECTURE_OS_TYPE] = os_type job_cfg[job_cfg_arch_section][job_metadata.JOB_CFG_ARCHITECTURE_ACCELERATOR] = accelerator if accelerator else '' @@ -886,7 +921,7 @@ def submit_job(job, cfg): return job_id, symlink -def create_pr_comment(job, 
job_id, app_name, pr, symlink): +def create_pr_comment(job, job_id, app_name, pr, symlink, build_params): """ Create a comment to the pull request for a newly submitted job @@ -896,6 +931,7 @@ def create_pr_comment(job, job_id, app_name, pr, symlink): app_name (string): name of the app pr (github.PullRequest.PullRequest): instance representing the pull request symlink (string): symlink from main pr_ dir to job dir + build_params (EESSIBotBuildParams): dict that contains the build parameters for the job Returns: github.IssueComment.IssueComment instance or None (note, github refers to @@ -903,16 +939,24 @@ def create_pr_comment(job, job_id, app_name, pr, symlink): """ fn = sys._getframe().f_code.co_name - # obtain arch from job.arch_target which has the format OS/ARCH - arch_name = '-'.join(job.arch_target.split('/')[1:]) + # Obtain the architecture on which we are building from job.arch_target, which has the format OS/ARCH + on_arch = '-'.join(job.arch_target.split('/')[1:]) + + # Obtain the architecture to build for + for_arch = build_params[BUILD_PARAM_ARCH] submitted_job_comments_cfg = config.read_config()[config.SECTION_SUBMITTED_JOB_COMMENTS] - # set string for accelerator if job.accelerator is defined/set (e.g., not None) - accelerator_spec_str = '' + # Set string for accelerator to build on + accelerator_spec = f"{submitted_job_comments_cfg[config.SUBMITTED_JOB_COMMENTS_SETTING_WITH_ACCELERATOR]}" + on_accelerator_str = '' if job.accelerator: - accelerator_spec = f"{submitted_job_comments_cfg[config.SUBMITTED_JOB_COMMENTS_SETTING_WITH_ACCELERATOR]}" - accelerator_spec_str = accelerator_spec.format(accelerator=job.accelerator) + on_accelerator_str = accelerator_spec.format(accelerator=job.accelerator) + + # Set string for accelerator to build for + for_accelerator_str = '' + if BUILD_PARAM_ACCEL in build_params: + for_accelerator_str = accelerator_spec.format(accelerator=build_params[BUILD_PARAM_ACCEL]) # get current date and time dt = 
datetime.now(timezone.utc) @@ -920,6 +964,10 @@ def create_pr_comment(job, job_id, app_name, pr, symlink): # construct initial job comment buildenv = config.read_config()[config.SECTION_BUILDENV] job_handover_protocol = buildenv.get(config.BUILDENV_SETTING_JOB_HANDOVER_PROTOCOL) + new_job_instance_repo = submitted_job_comments_cfg[config.SUBMITTED_JOB_COMMENTS_SETTING_INSTANCE_REPO] + build_on_arch = submitted_job_comments_cfg[config.SUBMITTED_JOB_COMMENTS_SETTING_BUILD_ON_ARCH] + build_for_arch = submitted_job_comments_cfg[config.SUBMITTED_JOB_COMMENTS_SETTING_BUILD_FOR_ARCH] + jobdir = submitted_job_comments_cfg[config.SUBMITTED_JOB_COMMENTS_SETTING_JOBDIR] if job_handover_protocol == config.JOB_HANDOVER_PROTOCOL_DELAYED_BEGIN: release_msg_string = config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_DELAYED_BEGIN_MSG release_comment_template = submitted_job_comments_cfg[release_msg_string] @@ -928,34 +976,44 @@ def create_pr_comment(job, job_id, app_name, pr, symlink): poll_interval = int(job_manager_cfg.get(config.JOB_MANAGER_SETTING_POLL_INTERVAL)) delay_factor = float(buildenv.get(config.BUILDENV_SETTING_JOB_DELAY_BEGIN_FACTOR, 2)) eligible_in_seconds = int(poll_interval * delay_factor) - job_comment = (f"{submitted_job_comments_cfg[config.SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT]}" - f"\n|date|job status|comment|\n" + job_comment = (f"{new_job_instance_repo}\n" + f"{build_on_arch}\n" + f"{build_for_arch}\n" + f"{jobdir}\n" + f"|date|job status|comment|\n" f"|----------|----------|------------------------|\n" f"|{dt.strftime('%b %d %X %Z %Y')}|" f"submitted|" f"{release_comment_template}|").format( app_name=app_name, - arch_name=arch_name, + on_arch=on_arch, + for_arch=for_arch, symlink=symlink, repo_id=job.repo_id, job_id=job_id, delay_seconds=eligible_in_seconds, - accelerator_spec=accelerator_spec_str) + on_accelerator=on_accelerator_str, + for_accelerator=for_accelerator_str) else: release_msg_string = 
config.SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_HOLD_RELEASE_MSG release_comment_template = submitted_job_comments_cfg[release_msg_string] - job_comment = (f"{submitted_job_comments_cfg[config.SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT]}" - f"\n|date|job status|comment|\n" + job_comment = (f"{new_job_instance_repo}\n" + f"{build_on_arch}\n" + f"{build_for_arch}\n" + f"{jobdir}\n" + f"|date|job status|comment|\n" f"|----------|----------|------------------------|\n" f"|{dt.strftime('%b %d %X %Z %Y')}|" f"submitted|" f"{release_comment_template}|").format( app_name=app_name, - arch_name=arch_name, + on_arch=on_arch, + for_arch=for_arch, symlink=symlink, repo_id=job.repo_id, job_id=job_id, - accelerator_spec=accelerator_spec_str) + on_accelerator=on_accelerator_str, + for_accelerator=for_accelerator_str) # create comment to pull request repo_name = pr.base.repo.full_name @@ -968,7 +1026,7 @@ def create_pr_comment(job, job_id, app_name, pr, symlink): return None -def submit_build_jobs(pr, event_info, action_filter): +def submit_build_jobs(pr, event_info, action_filter, build_params): """ Create build jobs for a pull request by preparing jobs which match the given filters, submitting them, adding comments to the pull request on GitHub and @@ -978,6 +1036,7 @@ def submit_build_jobs(pr, event_info, action_filter): pr (github.PullRequest.PullRequest): instance representing the pull request event_info (dict): event received by event_handler action_filter (EESSIBotActionFilter): used to filter which jobs shall be prepared + build_params (EESSIBotBuildParams): dict that contains the build parameters for the job Returns: (dict): dictionary mapping a job id to a github.IssueComment.IssueComment @@ -990,7 +1049,7 @@ def submit_build_jobs(pr, event_info, action_filter): app_name = cfg[config.SECTION_GITHUB].get(config.GITHUB_SETTING_APP_NAME) # setup job directories (one per element in product of architecture x repositories) - jobs = prepare_jobs(pr, cfg, event_info, 
action_filter) + jobs = prepare_jobs(pr, cfg, event_info, action_filter, build_params) # return if there are no jobs to be submitted if not jobs: @@ -1005,7 +1064,7 @@ def submit_build_jobs(pr, event_info, action_filter): job_id, symlink = submit_job(job, cfg) # create pull request comment to report about the submitted job - pr_comment = create_pr_comment(job, job_id, app_name, pr, symlink) + pr_comment = create_pr_comment(job, job_id, app_name, pr, symlink, build_params) job_id_to_comment_map[job_id] = pr_comment pr_comment = pr_comments.PRComment(pr.base.repo.full_name, pr.number, pr_comment.id) @@ -1056,11 +1115,83 @@ def check_build_permission(pr, event_info): return True +def template_to_regex(format_str, with_eol=True): + """ + Converts a formatting string into a regex that can extract all the formatted + parts of the string. If with_eol is True, it assumes the formatted string is followed by an end-of-line + character. This is a requirement if it has to successfully match a formatting string that ends with a formatting + field. + + Example: if one function creates a formatted string + value = "my_field_value" + format_str = "This is my string, with a custom field: {my_field}\n" + formatted_string = format_str.format(my_field=value) + Another function can then grab the original value of my_field by doing: + my_re = template_to_regex(format_str) + match_object = re.match(my_re, formatted_string) + match_object['my_field'] then contains "my_field_value" + This is useful when e.g. one function posts a GitHub comment, and another wants to extract information from that + + Args: + format_str (string): a formatting string, with template placeholders. 
+ with_eol (bool, optional): a boolean, indicating if the formatting string is expected to be followed by + an end of line character + + """ + + # string.Formatter returns a 4-tuple of literal text, field name, format spec, and conversion + # E.g if format_str = "This is my {app} it is currently {status}" + # formatter = [ + # ("This is my", "app", "", None), + # ("it is currently", "status", "", None), + # ("", None, None, None), + # ] + formatter = string.Formatter() + regex_parts = [] + + for literal_text, field_name, _, _ in formatter.parse(format_str): + # We use re.escape to escape any special characters in the literal_text, as we want to match those literally + regex_parts.append(re.escape(literal_text)) + if field_name is not None: + # Create a non-greedy, named capture group. Note that the {field_name} itself is a format specifier + # So we get the actual field name as the name of the capture group + # In other words, if our format_str is "My string with {a_field}" then the named capture group will be + # called 'a_field' + # We match any character, but in a non-greedy way. Thus, as soon as it can match the next + # literal text section, it will - thus assuming that that's the end of the field + # We use .* to allow for empty fields (such as the optional accelerator fields) + regex_parts.append(f"(?P<{field_name}>.*?)") + + # Finally, make sure we append a $ to the regex. This is necessary because of our non-greedy matching + # strategy. Otherwise, a formatting string that ends with a formatting item would only match the first letter + # of the field, because it doesn't find anything to match after (and it is non-greedy). With the $, it has + # something to match after the field, thus making sure it matches the whole field + # This does assume that the format_str in the string to be matched is indeed followed by an end-of-line character + # I.e. if a function that creates the formatted string does + # my_string = f"{format_str}\n" + # (i.e. 
has an end-of-line after the format specifier) it can be matched by another function that does + # my_re = template_to_regex(format_str) + # re.match(my_re, my_string) + full_pattern = ''.join(regex_parts) + if with_eol: + full_pattern += "$" + return re.compile(full_pattern) + + +class PartialFormatDict(dict): + """ + A dictionary class that allows for missing keys - and will just return {key} in that case. + This can be used to partially format some, but not all placeholders in a formatting string. + """ + def __missing__(self, key): + return "{" + key + "}" + + def request_bot_build_issue_comments(repo_name, pr_number): """ Query the github API for the issue_comments in a pr. - Archs: + Args: repo_name (string): name of the repository (format USER_OR_ORGANISATION/REPOSITORY) pr_number (int): number og the pr @@ -1070,7 +1201,7 @@ def request_bot_build_issue_comments(repo_name, pr_number): """ fn = sys._getframe().f_code.co_name - status_table = {'arch': [], 'date': [], 'status': [], 'url': [], 'result': []} + status_table = {'on arch': [], 'for arch': [], 'for repo': [], 'date': [], 'status': [], 'url': [], 'result': []} cfg = config.read_config() # for loop because github has max 100 items per request. 
@@ -1085,19 +1216,81 @@ def request_bot_build_issue_comments(repo_name, pr_number): for comment in comments: # iterate through the comments to find the one where the status of the build was in submitted_job_comments_section = cfg[config.SECTION_SUBMITTED_JOB_COMMENTS] - initial_comment_fmt = submitted_job_comments_section[config.SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT] - if initial_comment_fmt[:20] in comment['body']: - - # get archictecture from comment['body'] - first_line = comment['body'].split('\n')[0] - arch_map = get_architecture_targets(cfg) - for arch in arch_map.keys(): - # drop the first element in arch (which names the OS type) and join the remaining items with '-' - target_arch = '-'.join(arch.split('/')[1:]) - if target_arch in first_line: - status_table['arch'].append(target_arch) + accelerator_fmt = submitted_job_comments_section[config.SUBMITTED_JOB_COMMENTS_SETTING_WITH_ACCELERATOR] + instance_repo_fmt = submitted_job_comments_section[config.SUBMITTED_JOB_COMMENTS_SETTING_INSTANCE_REPO] + instance_repo_re = template_to_regex(instance_repo_fmt) + comment_body = comment['body'].split('\n') + instance_repo_match = re.match(instance_repo_re, comment_body[0]) + # Check if this body starts with an initial comment from the bot (first item is always the instance + repo + # it is building for) + # Then, check that it has at least 4 lines so that we can safely index up to that number + if instance_repo_match and len(comment_body) >= 4: + log(f"{fn}(): found bot build response in issue, processing...") + + # First, extract the repo_id + log(f"{fn}(): found build for repository: {instance_repo_match.group('repo_id')}") + status_table['for repo'].append(instance_repo_match.group('repo_id')) + + # Then, try to match the architecture we build on. 
+ # First try this including accelerator, to see if one was defined + on_arch_fmt = submitted_job_comments_section[config.SUBMITTED_JOB_COMMENTS_SETTING_BUILD_ON_ARCH] + on_arch_fmt_with_accel = on_arch_fmt.format_map(PartialFormatDict(on_accelerator=accelerator_fmt)) + on_arch_re_with_accel = template_to_regex(on_arch_fmt_with_accel) + on_arch_match = re.match(on_arch_re_with_accel, comment_body[1]) + if on_arch_match: + # Pattern with accelerator matched, append to status_table + log(f"{fn}(): found build on architecture: {on_arch_match.group('on_arch')}, " + f"with accelerator {on_arch_match.group('accelerator')}") + status_table['on arch'].append(f"`{on_arch_match.group('on_arch')}`, " + f"`{on_arch_match.group('accelerator')}`") + else: + # Pattern with accelerator did not match, retry without accelerator + on_arch_re = template_to_regex(on_arch_fmt) + on_arch_match = re.match(on_arch_re, comment_body[1]) + if on_arch_match: + # Pattern without accelerator matched, append to status_table + log(f"{fn}(): found build on architecture: {on_arch_match.group('on_arch')}") + status_table['on arch'].append(f"`{on_arch_match.group('on_arch')}`") + else: + # This shouldn't happen: we had an instance_repo_match, but no match for the 'on architecture' + msg = "Could not match regular expression for extracting the architecture to build on.\n" + msg += "String to be matched:\n" + msg += f"{comment_body[1]}\n" + msg += "First regex attempted:\n" + msg += f"{on_arch_re_with_accel.pattern}\n" + msg += "Second regex attempted:\n" + msg += f"{on_arch_re.pattern}\n" + raise ValueError(msg) + + # Now, do the same for the architecture we build for. I.e. 
first, try to match including accelerator + for_arch_fmt = submitted_job_comments_section[config.SUBMITTED_JOB_COMMENTS_SETTING_BUILD_FOR_ARCH] + for_arch_fmt_with_accel = for_arch_fmt.format_map(PartialFormatDict(for_accelerator=accelerator_fmt)) + for_arch_re_with_accel = template_to_regex(for_arch_fmt_with_accel) + for_arch_match = re.match(for_arch_re_with_accel, comment_body[2]) + if for_arch_match: + # Pattern with accelerator matched, append to status_table + log(f"{fn}(): found build for architecture: {for_arch_match.group('for_arch')}, " + f"with accelerator {for_arch_match.group('accelerator')}") + status_table['for arch'].append(f"`{for_arch_match.group('for_arch')}`, " + f"`{for_arch_match.group('accelerator')}`") + else: + # Pattern with accelerator did not match, retry without accelerator + for_arch_re = template_to_regex(for_arch_fmt) + for_arch_match = re.match(for_arch_re, comment_body[2]) + if for_arch_match: + # Pattern without accelerator matched, append to status_table + log(f"{fn}(): found build for architecture: {for_arch_match.group('for_arch')}") + status_table['for arch'].append(f"`{for_arch_match.group('for_arch')}`") else: - log(f"{fn}(): target_arch '{target_arch}' not found in first line '{first_line}'") + # This shouldn't happen: we had an instance_repo_match, but no match for the 'on architecture' + msg = "Could not match regular expression for extracting the architecture to build for.\n" + msg += "String to be matched:\n" + msg += f"{comment_body[2]}\n" + msg += "First regex attempted:\n" + msg += f"{for_arch_re_with_accel.pattern}\n" + msg += "Second regex attempted:\n" + msg += f"{for_arch_re.pattern}\n" + raise ValueError(msg) # get date, status, url and result from the markdown table comment_table = comment['body'][comment['body'].find('|'):comment['body'].rfind('|')+1] diff --git a/tests/test_app.cfg b/tests/test_app.cfg index 4f833bbf..56c1d6cc 100644 --- a/tests/test_app.cfg +++ b/tests/test_app.cfg @@ -21,7 +21,10 @@ 
job_handover_protocol = hold_release awaits_release = job id `{job_id}` awaits release by job manager awaits_release_delayed_begin_msg = job id `{job_id}` will be eligible to start in about {delay_seconds} seconds awaits_release_hold_release_msg = job id `{job_id}` awaits release by job manager -initial_comment = New job on instance `{app_name}` for CPU micro-architecture `{arch_name}`{accelerator_spec} for repository `{repo_id}` in job dir `{symlink}` +new_job_instance_repo = New job on instance `{app_name}` for repository `{repo_id}` +build_on_arch = Building on: `{on_arch}`{on_accelerator} +build_for_arch = Building for: `{for_arch}`{for_accelerator} +jobdir = Job dir: `{symlink}` with_accelerator =  and accelerator `{accelerator}` [new_job_comments] diff --git a/tests/test_task_build.py b/tests/test_task_build.py index 1c289947..af49ac9b 100644 --- a/tests/test_task_build.py +++ b/tests/test_task_build.py @@ -29,6 +29,7 @@ # Local application imports (anything from EESSI/eessi-bot-software-layer) from tasks.build import Job, create_pr_comment from tools import run_cmd, run_subprocess +from tools.build_params import EESSIBotBuildParams from tools.job_metadata import create_metadata_file, read_metadata_file from tools.pr_comments import PRComment, get_submitted_job_comment @@ -287,6 +288,7 @@ def test_create_pr_comment_succeeds(monkeypatch, mocked_github, tmpdir): ym = datetime.today().strftime('%Y.%m') pr_number = 1 job = Job(tmpdir, "test/architecture", "EESSI", "--speed-up", ym, pr_number, "fpga/magic") + build_params = EESSIBotBuildParams("arch=amd/zen4,accel=nvidia/cc90") job_id = "123" app_name = "pytest" @@ -295,7 +297,7 @@ def test_create_pr_comment_succeeds(monkeypatch, mocked_github, tmpdir): repo = mocked_github.get_repo(repo_name) pr = repo.get_pull(pr_number) symlink = "/symlink" - comment = create_pr_comment(job, job_id, app_name, pr, symlink) + comment = create_pr_comment(job, job_id, app_name, pr, symlink, build_params) assert comment.id == 1 # 
check if created comment includes jobid? print("VERIFYING PR COMMENT") @@ -317,6 +319,7 @@ def test_create_pr_comment_succeeds_none(monkeypatch, mocked_github, tmpdir): ym = datetime.today().strftime('%Y.%m') pr_number = 1 job = Job(tmpdir, "test/architecture", "EESSI", "--speed-up", ym, pr_number, "fpga/magic") + build_params = EESSIBotBuildParams("arch=amd/zen4,accel=nvidia/cc90") job_id = "123" app_name = "pytest" @@ -325,7 +328,7 @@ def test_create_pr_comment_succeeds_none(monkeypatch, mocked_github, tmpdir): repo = mocked_github.get_repo(repo_name) pr = repo.get_pull(pr_number) symlink = "/symlink" - comment = create_pr_comment(job, job_id, app_name, pr, symlink) + comment = create_pr_comment(job, job_id, app_name, pr, symlink, build_params) assert comment is None @@ -343,6 +346,7 @@ def test_create_pr_comment_raises_once_then_succeeds(monkeypatch, mocked_github, ym = datetime.today().strftime('%Y.%m') pr_number = 1 job = Job(tmpdir, "test/architecture", "EESSI", "--speed-up", ym, pr_number, "fpga/magic") + build_params = EESSIBotBuildParams("arch=amd/zen4,accel=nvidia/cc90") job_id = "123" app_name = "pytest" @@ -351,7 +355,7 @@ def test_create_pr_comment_raises_once_then_succeeds(monkeypatch, mocked_github, repo = mocked_github.get_repo(repo_name) pr = repo.get_pull(pr_number) symlink = "/symlink" - comment = create_pr_comment(job, job_id, app_name, pr, symlink) + comment = create_pr_comment(job, job_id, app_name, pr, symlink, build_params) assert comment.id == 1 assert pr.create_call_count == 2 @@ -369,6 +373,7 @@ def test_create_pr_comment_always_raises(monkeypatch, mocked_github, tmpdir): ym = datetime.today().strftime('%Y.%m') pr_number = 1 job = Job(tmpdir, "test/architecture", "EESSI", "--speed-up", ym, pr_number, "fpga/magic") + build_params = EESSIBotBuildParams("arch=amd/zen4,accel=nvidia/cc90") job_id = "123" app_name = "pytest" @@ -378,7 +383,7 @@ def test_create_pr_comment_always_raises(monkeypatch, mocked_github, tmpdir): pr = 
repo.get_pull(pr_number) symlink = "/symlink" with pytest.raises(Exception) as err: - create_pr_comment(job, job_id, app_name, pr, symlink) + create_pr_comment(job, job_id, app_name, pr, symlink, build_params) assert err.type == CreateIssueCommentException assert pr.create_call_count == 3 @@ -396,6 +401,7 @@ def test_create_pr_comment_three_raises(monkeypatch, mocked_github, tmpdir): ym = datetime.today().strftime('%Y.%m') pr_number = 1 job = Job(tmpdir, "test/architecture", "EESSI", "--speed-up", ym, pr_number, "fpga/magic") + build_params = EESSIBotBuildParams("arch=amd/zen4,accel=nvidia/cc90") job_id = "123" app_name = "pytest" @@ -405,7 +411,7 @@ def test_create_pr_comment_three_raises(monkeypatch, mocked_github, tmpdir): pr = repo.get_pull(pr_number) symlink = "/symlink" with pytest.raises(Exception) as err: - create_pr_comment(job, job_id, app_name, pr, symlink) + create_pr_comment(job, job_id, app_name, pr, symlink, build_params) assert err.type == CreateIssueCommentException assert pr.create_call_count == 3 diff --git a/tests/test_tools_filter.py b/tests/test_tools_filter.py index b689aa60..26cd31cd 100644 --- a/tests/test_tools_filter.py +++ b/tests/test_tools_filter.py @@ -231,27 +231,52 @@ def test_match_empty_context(complex_filter): assert expected == actual -def test_match_architecture_context(complex_filter): +# A context lacking keys for components in the filter shouldn't match +def test_match_sparse_context(complex_filter): context = {"architecture": "x86_64/intel/cascadelake"} - expected = True + expected = False actual = complex_filter.check_filters(context) assert expected == actual -def test_match_architecture_job_context(complex_filter): - context = {"architecture": "x86_64/intel/cascadelake", "job": 1234} +def test_matching_context(complex_filter): + context = {"architecture": "x86_64/intel/cascadelake", "repository": "nessi.no-2022.A", "instance": "A"} expected = True actual = complex_filter.check_filters(context) assert expected == actual 
-def test_non_match_architecture_repository_context(complex_filter): - context = {"architecture": "x86_64/intel/cascadelake", "repository": "EESSI"} +def test_non_match_architecture_context(complex_filter): + context = {"architecture": "x86_64/amd/zen4", "repository": "EESSI", "instance": "mybot", "job": 1234} + expected = False + actual = complex_filter.check_filters(context) + assert expected == actual + + +def test_non_match_repository_context(complex_filter): + context = {"architecture": "x86_64/intel/cascadelake", "repository": "EESSI", "instance": "A"} + expected = False + actual = complex_filter.check_filters(context) + assert expected == actual + + +def test_non_match_instance_context(complex_filter): + context = {"architecture": "x86_64/intel/cascadelake", "repository": "nessi.no-2022.A", "instance": "B"} expected = False actual = complex_filter.check_filters(context) assert expected == actual +# If additional keys are present in the context for which no filter component is defined +# it should not prevent a match +def test_match_additional_context(complex_filter): + context = {"architecture": "x86_64/intel/cascadelake", "repository": "nessi.no-2022.A", "instance": "A", + "job": 1234} + expected = True + actual = complex_filter.check_filters(context) + assert expected == actual + + @pytest.fixture def arch_filter_slash_syntax(): af = EESSIBotActionFilter("") diff --git a/tools/build_params.py b/tools/build_params.py new file mode 100644 index 00000000..05eb63eb --- /dev/null +++ b/tools/build_params.py @@ -0,0 +1,78 @@ +# This file is part of the EESSI build-and-deploy bot, +# see https://github.com/EESSI/eessi-bot-software-layer +# +# The bot helps with requests to add software installations to the +# EESSI software layer, see https://github.com/EESSI/software-layer +# +# author: Caspar van Leeuwen +# +# license: GPLv2 +# + +from tools.filter import FILTER_COMPONENT_ACCEL, FILTER_COMPONENT_ARCH + +# Define these constants with the same values. 
We want the arguments passed to +# on: and for: to use the same keywords +BUILD_PARAM_ACCEL = FILTER_COMPONENT_ACCEL +BUILD_PARAM_ARCH = FILTER_COMPONENT_ARCH +BUILD_PARAMS = [ + BUILD_PARAM_ACCEL, + BUILD_PARAM_ARCH +] + + +class EESSIBotBuildParamsValueError(Exception): + """ + Exception to be raised when an inappropriate value is specified for a build parameter + """ + pass + + +class EESSIBotBuildParamsNameError(Exception): + """ + Exception to be raised when an unkown build parameter name is specified + """ + pass + + +class EESSIBotBuildParams(dict): + """ + Class for representing build parameters. Essentially, this is a dictionary class + but with some additional parsing for the constructor + """ + def __init__(self, build_parameters): + """ + EESSIBotBuildParams constructor + + Args: + build_parameters (string): string containing comma separated build parameters + Example: "arch=amd/zen4,accel=nvidia/cc90" + + Raises: + EESSIBotBuildParamsNameError: raised if parsing an unknown build parameter + string + EESSIBotBuildParamsValueError: raised if an invalid value is passed for a build parameter + """ + build_param_dict = {} + + # Loop over defined build parameters argument + build_params_list = build_parameters.split(',') + for item in build_params_list: + # Separate build parameter name and value + build_param = item.split('=') + if len(build_param) != 2: + msg = f"Expected argument {item} to be split into exactly two parts when splitting by '=', " + msg += f"but the number of items after splitting is {len(build_param)}" + raise EESSIBotBuildParamsValueError(msg) + param_found = False + for full_param_name in BUILD_PARAMS: + # Identify which build param we are matching + if full_param_name.startswith(build_param[0]): + param_found = True + # Store the value of the build parameter by it's full name + build_param_dict[full_param_name] = build_param[1] + if not param_found: + msg = f"Build parameter {build_param[0]} not found. 
Known build parameters: {BUILD_PARAMS}" + raise EESSIBotBuildParamsNameError(msg) + + super().__init__(build_param_dict) diff --git a/tools/commands.py b/tools/commands.py index 5db8f7f7..bd80e339 100644 --- a/tools/commands.py +++ b/tools/commands.py @@ -18,6 +18,7 @@ # Local application imports (anything from EESSI/eessi-bot-software-layer) from tools.filter import EESSIBotActionFilter, EESSIBotActionFilterError +from tools.build_params import EESSIBotBuildParams def contains_any_bot_command(body): @@ -46,11 +47,14 @@ def get_bot_command(line): fn = sys._getframe().f_code.co_name log(f"{fn}(): searching for bot command in '{line}'") - match = re.search('^bot: (.*)$', line) - # TODO add log messages for both cases + regex = re.compile('^bot:[ ]?(.*)$') + match = regex.search(line) if match: - return match.group(1).rstrip() + cmd = match.group(1).rstrip() + log(f"{fn}(): Bot command found in '{line}': {cmd}") + return cmd else: + log(f"{fn}(): No bot command found using pattern '{regex.pattern}' in: {line}") return None @@ -82,19 +86,85 @@ def __init__(self, cmd_str): """ # TODO add function name to log messages cmd_as_list = cmd_str.split() - self.command = cmd_as_list[0] + self.command = cmd_as_list[0] # E.g. 'build' or 'help' + self.general_args = [] + self.action_filters = None + self.build_params = None + # TODO always init self.action_filters with empty EESSIBotActionFilter? 
if len(cmd_as_list) > 1: - arg_str = " ".join(cmd_as_list[1:]) - try: - self.action_filters = EESSIBotActionFilter(arg_str) - except EESSIBotActionFilterError as err: - log(f"ERROR: EESSIBotActionFilterError - {err.args}") - self.action_filters = None - raise EESSIBotCommandError("invalid action filter") - except Exception as err: - log(f"Unexpected err={err}, type(err)={type(err)}") - raise + # Extract arguments for the action filters + # By default, everything that follows the 'on:' argument (until the next space) is + # considered part of the argument list for the action filters + target_args = [] + other_filter_args = [] + on_found = False + for arg in cmd_as_list[1:]: + if arg.startswith('on:'): + on_found = True + # Extract everything after 'on:' and split by comma + filter_content = arg[3:] # Remove 'on:' prefix + target_args.extend(filter_content.split(',')) + elif arg.startswith('for:'): + # Anything listed as 'for:' is build parameters + build_params = arg[4:] + # EESSIBotBuildParams is essentially a dict, but parses the input argument + # according to the expected argument format for 'for:' + self.build_params = EESSIBotBuildParams(build_params) + else: + # Anything that is not 'on:' or 'for:' + # Check if it's a filter argument, if so, pass it on to other_filter_args witout further parsing + # If it's not a filter argument, it is a general argument - just store it so any other function + # can read it + if ':' in arg: + other_filter_args.extend([arg]) + else: + self.general_args.append(arg) + + # If no 'on:' is found in the argument list, everything that follows the 'for:' argument + # (until the next space) is considered the argument list for the action filters + # Essentially, this represents a native build, i.e. 
the hardware we build for should be the + # hardware we build on + if not on_found: + for arg in cmd_as_list[1:]: + if arg.startswith('for:'): + # Extract everything after the 'for:' suffix and split by comma + filter_content = arg[4:] + target_args.extend(filter_content.split(',')) + + # Join the filter arguments and pass to EESSIBotActionFilter + # At this point, target_args is e.g. ["arch=amd/zen2","accel=nvidia/cc90"] + # But EESSIBotActionFilter expects e.g. "arch:amd/zen2 accel:nvidia/cc90" + # First, normalize to the ["arch:amd/zen2", "accel:nvidia/cc90"] format + normalized_filters = [] + if target_args: + for filter_item in target_args: + if '=' in filter_item: + component, pattern = filter_item.split('=', 1) + normalized_filters.append(f"{component}:{pattern}") + + # Add the other filter args to the normalized filters. The other_filter_args are already colon-separated + # so no special parsing needed there + log(f"Extracted filter arguments related to hardware target: {normalized_filters}") + log(f"Other extracted filter arguments: {other_filter_args}") + log(f"Other general arguments: {self.general_args}") + normalized_filters += other_filter_args + + # Finally, change into a space-separated string, as expected by EESSIBotActionFilter + # e.g "arch:amd/zen2 accel:nvidia/cc90 repo:my.repo.io" + if normalized_filters: + arg_str = " ".join(normalized_filters) + try: + log(f"Passing the following arguments to the EESSIBotActionFilter: {arg_str}") + self.action_filters = EESSIBotActionFilter(arg_str) + except EESSIBotActionFilterError as err: + log(f"ERROR: EESSIBotActionFilterError - {err.args}") + self.action_filters = None + raise EESSIBotCommandError("invalid action filter") + except Exception as err: + log(f"Unexpected err={err}, type(err)={type(err)}") + raise + # No arguments were passed to the command self.command else: self.action_filters = EESSIBotActionFilter("") @@ -108,5 +178,8 @@ def to_string(self): Returns: string: the string representation 
created by the method """ - action_filters_str = self.action_filters.to_string() - return f"{' '.join([self.command, action_filters_str]).rstrip()}" + if self.action_filters is None: + return "" + else: + action_filters_str = self.action_filters.to_string() + return f"{' '.join([self.command, action_filters_str]).rstrip()}" diff --git a/tools/config.py b/tools/config.py index 6fe5982c..7f814ea4 100644 --- a/tools/config.py +++ b/tools/config.py @@ -30,7 +30,8 @@ # sectionname_SETTING_settingname for any setting with name settingname in # section sectionname SECTION_ARCHITECTURETARGETS = 'architecturetargets' -ARCHITECTURETARGETS_SETTING_ARCH_TARGET_MAP = 'arch_target_map' +ARCHITECTURETARGETS_SETTING_ARCH_TARGET_MAP = 'arch_target_map' # Obsolete, replaced by NODE_TYPE_MAP +NODE_TYPE_MAP = 'node_type_map' SECTION_BOT_CONTROL = 'bot_control' BOT_CONTROL_SETTING_COMMAND_PERMISSION = 'command_permission' @@ -43,6 +44,7 @@ BUILDENV_SETTING_BUILD_JOB_SCRIPT = 'build_job_script' BUILDENV_SETTING_BUILD_LOGS_DIR = 'build_logs_dir' BUILDENV_SETTING_BUILD_PERMISSION = 'build_permission' +BUILDENV_SETTING_CLONE_GIT_REPO_VIA = 'clone_git_repo_via' BUILDENV_SETTING_CONTAINER_CACHEDIR = 'container_cachedir' BUILDENV_SETTING_CVMFS_CUSTOMIZATIONS = 'cvmfs_customizations' BUILDENV_SETTING_HTTPS_PROXY = 'https_proxy' @@ -82,6 +84,8 @@ DOWNLOAD_PR_COMMENTS_SETTING_GIT_CHECKOUT_TIP = 'git_checkout_tip' DOWNLOAD_PR_COMMENTS_SETTING_GIT_CLONE_FAILURE = 'git_clone_failure' DOWNLOAD_PR_COMMENTS_SETTING_GIT_CLONE_TIP = 'git_clone_tip' +DOWNLOAD_PR_COMMENTS_SETTING_PR_DIFF_FAILURE = 'pr_diff_failure' +DOWNLOAD_PR_COMMENTS_SETTING_PR_DIFF_TIP = 'pr_diff_tip' SECTION_EVENT_HANDLER = 'event_handler' EVENT_HANDLER_SETTING_LOG_PATH = 'log_path' @@ -118,6 +122,10 @@ SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE = 'awaits_release' SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_DELAYED_BEGIN_MSG = 'awaits_release_delayed_begin_msg' SUBMITTED_JOB_COMMENTS_SETTING_AWAITS_RELEASE_HOLD_RELEASE_MSG = 
'awaits_release_hold_release_msg' +SUBMITTED_JOB_COMMENTS_SETTING_INSTANCE_REPO = 'new_job_instance_repo' +SUBMITTED_JOB_COMMENTS_SETTING_BUILD_ON_ARCH = 'build_on_arch' +SUBMITTED_JOB_COMMENTS_SETTING_BUILD_FOR_ARCH = 'build_for_arch' +SUBMITTED_JOB_COMMENTS_SETTING_JOBDIR = 'jobdir' SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT = 'initial_comment' SUBMITTED_JOB_COMMENTS_SETTING_WITH_ACCELERATOR = 'with_accelerator' @@ -133,6 +141,33 @@ JOB_HANDOVER_PROTOCOL_HOLD_RELEASE } +# Allows us to error on config items that were removed +FORBIDDEN_CONFIG = { + SECTION_ARCHITECTURETARGETS: [ + ( + ARCHITECTURETARGETS_SETTING_ARCH_TARGET_MAP, + f"Config invalid: '{ARCHITECTURETARGETS_SETTING_ARCH_TARGET_MAP}' was removed and replaced by " + f"'{NODE_TYPE_MAP}'. See app.cfg.example for details." + ) + ], + SECTION_REPO_TARGETS: [ + ( + REPO_TARGETS_SETTING_REPO_TARGET_MAP, + f"Config invalid: '{REPO_TARGETS_SETTING_REPO_TARGET_MAP} was removed. Repository targets can now be " + f"specified within the '{NODE_TYPE_MAP}' dictionary. See app.cfg.example for details." + ) + ], + SECTION_SUBMITTED_JOB_COMMENTS: [ + ( + SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT, + f"Config invalid: '{SUBMITTED_JOB_COMMENTS_SETTING_INITIAL_COMMENT}' was removed and replaced by " + f"'{SUBMITTED_JOB_COMMENTS_SETTING_INSTANCE_REPO}', '{SUBMITTED_JOB_COMMENTS_SETTING_BUILD_ON_ARCH}', " + f"'{SUBMITTED_JOB_COMMENTS_SETTING_BUILD_FOR_ARCH}' and '{SUBMITTED_JOB_COMMENTS_SETTING_JOBDIR}'. " + "See app.cfg.example for details." + ) + ] +} + def read_config(path='app.cfg'): """ @@ -156,10 +191,10 @@ def read_config(path='app.cfg'): return config -def check_required_cfg_settings(req_settings, path="app.cfg"): +def check_cfg_settings(req_settings, path="app.cfg"): """ - Reads the config file, checks if it contains the required settings, - if not logs an error message and exits. + Reads the config file, checks if it contains the required settings, and if it does not contain forbidden + (i.e. 
removed) settings. If the check fails, logs an error message and exits. Args: req_settings (dict (str, list)): required settings @@ -179,4 +214,14 @@ def check_required_cfg_settings(req_settings, path="app.cfg"): for item in req_settings[section]: if item not in cfg[section]: error(f'Missing configuration item "{item}" in section "{section}" of configuration file {path}.') + + # Check for forbidden arguments + for section in FORBIDDEN_CONFIG: + if section in cfg: + for item in FORBIDDEN_CONFIG[section]: + # First element of the tuple is the forbidden config item, check if its in the section + if item[0] in cfg[section]: + # Item 1 contains a specific error message + error(item[1]) + return True diff --git a/tools/filter.py b/tools/filter.py index 0caa2af8..ddc58352 100644 --- a/tools/filter.py +++ b/tools/filter.py @@ -303,4 +303,18 @@ def check_filters(self, context): else: check = False break + # Action filter wasn't found in the context, we won't allow this + else: + check = False + break + + # If the context declares an accelerator, enforce that a filter is defined for this component as well + # I.e. this enforces that a context with accelerator will only be used if an accelerator is explicitely + # requested in the build command, thus preventing CPU-only builds on GPU nodes (unless explicitely intended) + if ( + FILTER_COMPONENT_ACCEL in context and not + any(af.component == FILTER_COMPONENT_ACCEL for af in self.action_filters) + ): + check = False + return check diff --git a/tools/job_metadata.py b/tools/job_metadata.py index 7b7b8d0a..f5ee21ce 100644 --- a/tools/job_metadata.py +++ b/tools/job_metadata.py @@ -34,7 +34,9 @@ JOB_CFG_UPLOAD_STEP = "upload_step" # JWD/cfg/$JOB_CFG_FILENAME + JOB_CFG_ARCHITECTURE_SECTION = "architecture" +JOB_CFG_ARCHITECTURE_NODE_TYPE = "node_type" JOB_CFG_ARCHITECTURE_OS_TYPE = "os_type" JOB_CFG_ARCHITECTURE_SOFTWARE_SUBDIR = "software_subdir" JOB_CFG_ARCHITECTURE_ACCELERATOR = "accelerator"