From ec1fd6e0bc4ca9a9c876bc8e6afcd58f33ffadfe Mon Sep 17 00:00:00 2001 From: Nick Van Wiggeren Date: Thu, 16 Apr 2026 19:39:57 +0000 Subject: [PATCH 1/9] Migrate e2e tests from Buildkite to GitHub Actions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the eight end-to-end tests (upgrade, backup-restore, three backup-schedule variants, vtorc-vtadmin, unmanaged-tablet, hpa) from the Buildkite public queue onto GitHub Actions using ubuntu-latest-8-cores runners. Each test runs as its own matrix job, in parallel, on a fresh VM per job — which lets us drop Buildkite's per-test concurrency gate since the collisions it was guarding against (shared vitess-operator-pr image tag, fixed localhost port-forward ports, shared kind docker network on sibling-container agents) no longer exist when each job gets its own runner and Docker daemon. utils.sh loses the BUILDKITE_JOB_ID coupling: the variable is renamed to CI_JOB_ID (set from github.run_id + run_attempt + matrix target), the sibling-container networking hack in setupKubectlAccessForCI is removed because kind now runs directly on the runner host, and the docker build progress flag keys off $CI instead. The pre-exit hook that reset perms and cleaned up the shared Docker state is no longer needed — GHA runners are ephemeral. Branch protection on main and release-** will need the new e2e check names added and the Buildkite checks removed once the first run is green. 
Signed-off-by: Nick Van Wiggeren --- .buildkite/hooks/pre-exit | 25 ----- .buildkite/pipeline.yml | 183 -------------------------------- .github/workflows/e2e-test.yaml | 54 ++++++++++ docs/release-process.md | 2 +- test/endtoend/utils.sh | 28 ++--- 5 files changed, 61 insertions(+), 231 deletions(-) delete mode 100644 .buildkite/hooks/pre-exit delete mode 100644 .buildkite/pipeline.yml create mode 100644 .github/workflows/e2e-test.yaml diff --git a/.buildkite/hooks/pre-exit b/.buildkite/hooks/pre-exit deleted file mode 100644 index e738f76e7..000000000 --- a/.buildkite/hooks/pre-exit +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env bash - -# Remove the docker container on which kind is running -# Also removes the volume used by it -docker container rm -v -f kind-${BUILDKITE_JOB_ID}-control-plane -# Remove the docker image created for the local PR code -docker image rm -f vitess-operator-pr:latest - -# This hack exists because vitess-operator modifies the permissions on the git -# checkout during CI from inside docker. This causes future jobs run on the same -# node to fail the git checkout step due to permission errors -# -# Our fix is to reset the perms after each job step. 
We can't run arbitrary -# sudo commands as the buildkite-agent user but we _can_ run the /usr/bin/fix-buildkite-agent-builds-permissions -# tool via sudo -# -# these cmds stolen from: https://github.com/buildkite/elastic-ci-stack-for-aws/blob/da3aef5d96cecb796636a7ac25d7b205a6a0cc90/packer/linux/conf/buildkite-agent/hooks/environment#L117-L141 - -set -euo pipefail - -AGENT_ORG_PIPELINE_DIR="${BUILDKITE_BUILD_CHECKOUT_PATH#"${BUILDKITE_BUILD_PATH}/"}" -AGENT_DIR="${AGENT_ORG_PIPELINE_DIR%%/*}" - -set -x -sudo /usr/bin/fix-buildkite-agent-builds-permissions "$AGENT_DIR" planetscale vitess-operator diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml deleted file mode 100644 index 8c7a67b89..000000000 --- a/.buildkite/pipeline.yml +++ /dev/null @@ -1,183 +0,0 @@ -agents: - queue: "public" - -env: - GO_VERSION_FILE: "go1.26.2.linux-amd64.tar.gz" - -# Mount the docker.sock as to the docker container, so that we are able to -# run docker build command and kind is spawned as a sibling container. 
-steps: - - name: "Upgrade Test" - command: - - apk add --no-progress --quiet g++ make bash gcompat curl mysql-client libc6-compat - - wget -q https://golang.org/dl/$GO_VERSION_FILE - - tar -C /usr/local -xzf $GO_VERSION_FILE - - export PATH=$PATH:/usr/local/go/bin:/bin - - rm $GO_VERSION_FILE - - ln -s /lib/libc.so.6 /usr/lib/libresolv.so.2 - - make upgrade-test - concurrency: 1 - concurrency_group: 'vtop/upgrade-downgrade-test' - timeout_in_minutes: 30 - plugins: - - docker#v3.12.0: - image: "docker:latest" - propagate-environment: true - volumes: - - "/var/run/docker.sock:/var/run/docker.sock" - retry: &retry_policy_tests - # Automatically retry tests on unexpected Buildkite Agent exit codes - automatic: - - exit_status: -1 # Agent lost - limit: 2 - - exit_status: 143 # Graceful agent termination - limit: 2 - - exit_status: 255 # Forceful agent termination - limit: 2 - - - name: "Backup Restore Test" - command: - - apk add --no-progress --quiet g++ make bash gcompat curl mysql-client libc6-compat - - wget -q https://golang.org/dl/$GO_VERSION_FILE - - tar -C /usr/local -xzf $GO_VERSION_FILE - - export PATH=$PATH:/usr/local/go/bin:/bin - - rm $GO_VERSION_FILE - - ln -s /lib/libc.so.6 /usr/lib/libresolv.so.2 - - make backup-restore-test - concurrency: 1 - concurrency_group: 'vtop/backup-restore-test' - timeout_in_minutes: 20 - plugins: - - docker#v3.12.0: - image: "docker:latest" - propagate-environment: true - volumes: - - "/var/run/docker.sock:/var/run/docker.sock" - retry: - <<: *retry_policy_tests - - - name: "Backup Schedule Cluster/Keyspace Scope Test" - command: - - apk add --no-progress --quiet g++ make bash gcompat curl mysql-client libc6-compat - - wget -q https://golang.org/dl/$GO_VERSION_FILE - - tar -C /usr/local -xzf $GO_VERSION_FILE - - export PATH=$PATH:/usr/local/go/bin:/bin - - rm $GO_VERSION_FILE - - ln -s /lib/libc.so.6 /usr/lib/libresolv.so.2 - - make backup-schedule-keyspace-test - concurrency: 1 - concurrency_group: 
'vtop/backup-schedule-keyspace-test' - timeout_in_minutes: 20 - plugins: - - docker#v3.12.0: - image: "docker:latest" - propagate-environment: true - volumes: - - "/var/run/docker.sock:/var/run/docker.sock" - retry: - <<: *retry_policy_tests - - - name: "Backup Schedule Test" - command: - - apk add --no-progress --quiet g++ make bash gcompat curl mysql-client libc6-compat - - wget -q https://golang.org/dl/$GO_VERSION_FILE - - tar -C /usr/local -xzf $GO_VERSION_FILE - - export PATH=$PATH:/usr/local/go/bin:/bin - - rm $GO_VERSION_FILE - - ln -s /lib/libc.so.6 /usr/lib/libresolv.so.2 - - make backup-schedule-test - concurrency: 1 - concurrency_group: 'vtop/backup-schedule-test' - timeout_in_minutes: 20 - plugins: - - docker#v3.12.0: - image: "docker:latest" - propagate-environment: true - volumes: - - "/var/run/docker.sock:/var/run/docker.sock" - retry: - <<: *retry_policy_tests - - - name: "Backup Schedule vtctldclient Method Test" - command: - - apk add --no-progress --quiet g++ make bash gcompat curl mysql-client libc6-compat - - wget -q https://golang.org/dl/$GO_VERSION_FILE - - tar -C /usr/local -xzf $GO_VERSION_FILE - - export PATH=$PATH:/usr/local/go/bin:/bin - - rm $GO_VERSION_FILE - - ln -s /lib/libc.so.6 /usr/lib/libresolv.so.2 - - make backup-schedule-vtctldclient-test - concurrency: 1 - concurrency_group: 'vtop/backup-schedule-vtctldclient-test' - timeout_in_minutes: 20 - plugins: - - docker#v3.12.0: - image: "docker:latest" - propagate-environment: true - volumes: - - "/var/run/docker.sock:/var/run/docker.sock" - retry: - <<: *retry_policy_tests - - - name: "VTOrc and VTAdmin Test" - command: - - apk add --no-progress --quiet g++ make bash gcompat curl mysql-client libc6-compat chromium - - wget -q https://golang.org/dl/$GO_VERSION_FILE - - tar -C /usr/local -xzf $GO_VERSION_FILE - - export PATH=$PATH:/usr/local/go/bin:/bin - - rm $GO_VERSION_FILE - - ln -s /lib/libc.so.6 /usr/lib/libresolv.so.2 - - make vtorc-vtadmin-test - concurrency: 1 - 
concurrency_group: 'vtop/vtorc-vtadmin-test' - timeout_in_minutes: 20 - plugins: - - docker#v3.12.0: - image: "docker:latest" - propagate-environment: true - volumes: - - "/var/run/docker.sock:/var/run/docker.sock" - retry: - <<: *retry_policy_tests - - - name: "Unmanaged Tablet Test" - command: - - apk add --no-progress --quiet g++ make bash gcompat curl mysql-client libc6-compat coreutils - - wget -q https://golang.org/dl/$GO_VERSION_FILE - - tar -C /usr/local -xzf $GO_VERSION_FILE - - export PATH=$PATH:/usr/local/go/bin:/bin - - rm $GO_VERSION_FILE - - ln -s /lib/libc.so.6 /usr/lib/libresolv.so.2 - - make unmanaged-tablet-test - concurrency: 1 - concurrency_group: 'vtop/unmanaged-tablet-test' - timeout_in_minutes: 20 - plugins: - - docker#v3.12.0: - image: "docker:latest" - propagate-environment: true - volumes: - - "/var/run/docker.sock:/var/run/docker.sock" - retry: - <<: *retry_policy_tests - - - name: "HPA Test" - command: - - apk add --no-progress --quiet g++ make bash gcompat curl mysql-client libc6-compat - - wget -q https://golang.org/dl/$GO_VERSION_FILE - - tar -C /usr/local -xzf $GO_VERSION_FILE - - export PATH=$PATH:/usr/local/go/bin:/bin - - rm $GO_VERSION_FILE - - ln -s /lib/libc.so.6 /usr/lib/libresolv.so.2 - - make hpa-test - concurrency: 1 - concurrency_group: 'vtop/hpa-test' - timeout_in_minutes: 20 - plugins: - - docker#v3.12.0: - image: "docker:latest" - propagate-environment: true - volumes: - - "/var/run/docker.sock:/var/run/docker.sock" - retry: - <<: *retry_policy_tests diff --git a/.github/workflows/e2e-test.yaml b/.github/workflows/e2e-test.yaml new file mode 100644 index 000000000..0c46d6643 --- /dev/null +++ b/.github/workflows/e2e-test.yaml @@ -0,0 +1,54 @@ +name: e2e-test +on: + push: + branches: + - main + - release-** + pull_request: + branches: + - main + - release-** + +jobs: + e2e: + name: ${{ matrix.test.name }} + runs-on: ubuntu-latest-8-cores + timeout-minutes: 40 + strategy: + fail-fast: false + matrix: + test: + - name: 
"Upgrade Test" + target: upgrade-test + - name: "Backup Restore Test" + target: backup-restore-test + - name: "Backup Schedule Cluster/Keyspace Scope Test" + target: backup-schedule-keyspace-test + - name: "Backup Schedule Test" + target: backup-schedule-test + - name: "Backup Schedule vtctldclient Method Test" + target: backup-schedule-vtctldclient-test + - name: "VTOrc and VTAdmin Test" + target: vtorc-vtadmin-test + - name: "Unmanaged Tablet Test" + target: unmanaged-tablet-test + - name: "HPA Test" + target: hpa-test + env: + CI_JOB_ID: ${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.test.target }} + steps: + - name: Check out code + uses: actions/checkout@v6 + + - name: Set up Go + uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0 + with: + go-version-file: go.mod + + - name: Install test dependencies + run: | + sudo apt-get update + sudo apt-get install -y mysql-client chromium-browser + + - name: Run ${{ matrix.test.name }} + run: make ${{ matrix.test.target }} diff --git a/docs/release-process.md b/docs/release-process.md index 9e0f51eb8..ff7291416 100644 --- a/docs/release-process.md +++ b/docs/release-process.md @@ -112,7 +112,7 @@ The `upgrade_test.sh`, `backup_restore_test.sh`, `vtorc_vtadmin_test.sh` and `un ##### CI Failures > **Note** -> It is likely that the buildkite tests will fail on the release PR initially because of the unavailability of the latest vitess and vitess-operator docker images. This however doesn't block the release. The tests should be restarted after the said images are built and available. +> It is likely that the end-to-end tests will fail on the release PR initially because of the unavailability of the latest vitess and vitess-operator docker images. This however doesn't block the release. The tests should be restarted after the said images are built and available. 
------------------- diff --git a/test/endtoend/utils.sh b/test/endtoend/utils.sh index 9c2f0f232..585c1af5d 100644 --- a/test/endtoend/utils.sh +++ b/test/endtoend/utils.sh @@ -6,7 +6,7 @@ # set -x shopt -s expand_aliases alias vtctldclient="vtctldclient --server=localhost:15999" -BUILDKITE_JOB_ID="${BUILDKITE_JOB_ID:-0}" +CI_JOB_ID="${CI_JOB_ID:-0}" # Suppress warnings when using MariaDB Client mysql_version="$(mysql --version 2>/dev/null)" @@ -523,9 +523,9 @@ function assertSelect() { function setupBuildContainerImage() { echo "Building the container image" - # Clean up build output in CI + # Use plain progress output in CI so logs are line-buffered and readable. local progress="auto" - if [[ "${BUILDKITE_JOB_ID}" != "0" ]]; then + if [[ -n "${CI:-}" ]]; then progress="plain" fi @@ -535,25 +535,9 @@ function setupBuildContainerImage() { function setupKindCluster() { setupBuildContainerImage createKindCluster - setupKubectlAccessForCI createExampleNamespace } -function setupKubectlAccessForCI() { - if [[ "${BUILDKITE_JOB_ID}" != "0" ]]; then - # The script is being run from buildkite, so we need to do stuff - # https://github.com/kubernetes-sigs/kind/issues/1846#issuecomment-691565834 - # Since kind is running in a sibling container, communicating with it through kubectl is not trivial. - # To accomplish we need to add the current docker container in the same network as the kind container - # and change the kubectl configuration to use the port listed in the internal endpoint instead of the one - # that is exported to the localhost by kind. 
- local docker_container_name - docker_container_name="$(hostname -s)" - docker network connect kind "${docker_container_name}" - kind get kubeconfig --internal --name "kind-${BUILDKITE_JOB_ID}" > "${HOME}/.kube/config" - fi -} - # shellcheck disable=SC2120 # function has an optional argument function setupPortForwarding() { local with_vtadmin="${1:-}" # Pass `with_vtadmin` to also enable port forwarding to VTAdmin @@ -590,14 +574,14 @@ function setupPortForwarding() { function teardownKindCluster() { echo "Deleting the Kind cluster. This also deletes the volume associated with it." - kind delete cluster --name "kind-${BUILDKITE_JOB_ID}" + kind delete cluster --name "kind-${CI_JOB_ID}" } function createKindCluster() { echo "Creating Kind cluster" - kind create cluster --wait 30s --name "kind-${BUILDKITE_JOB_ID}" --image "${KIND_VERSION}" + kind create cluster --wait 30s --name "kind-${CI_JOB_ID}" --image "${KIND_VERSION}" echo "Loading docker image into Kind cluster" - kind load docker-image vitess-operator-pr:latest --name "kind-${BUILDKITE_JOB_ID}" + kind load docker-image vitess-operator-pr:latest --name "kind-${CI_JOB_ID}" } function createExampleNamespace() { From a34028fcdcce3b6575ed50f6aa51c8a04de5dfb1 Mon Sep 17 00:00:00 2001 From: Nick Van Wiggeren Date: Thu, 16 Apr 2026 19:43:06 +0000 Subject: [PATCH 2/9] Use self-hosted vitess-ubuntu-shr-4cpu-16gb runners Signed-off-by: Nick Van Wiggeren --- .github/workflows/e2e-test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/e2e-test.yaml b/.github/workflows/e2e-test.yaml index 0c46d6643..c03e9dcc6 100644 --- a/.github/workflows/e2e-test.yaml +++ b/.github/workflows/e2e-test.yaml @@ -12,7 +12,7 @@ on: jobs: e2e: name: ${{ matrix.test.name }} - runs-on: ubuntu-latest-8-cores + runs-on: vitess-ubuntu-shr-4cpu-16gb timeout-minutes: 40 strategy: fail-fast: false From 2cd70d754ee8b286849f96c150caf118743cd5f1 Mon Sep 17 00:00:00 2001 From: Nick Van Wiggeren Date: Thu, 16 Apr 2026 
19:47:15 +0000 Subject: [PATCH 3/9] Switch e2e runners to depot-ubuntu-22.04-4 Signed-off-by: Nick Van Wiggeren --- .github/workflows/e2e-test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/e2e-test.yaml b/.github/workflows/e2e-test.yaml index c03e9dcc6..025dc302c 100644 --- a/.github/workflows/e2e-test.yaml +++ b/.github/workflows/e2e-test.yaml @@ -12,7 +12,7 @@ on: jobs: e2e: name: ${{ matrix.test.name }} - runs-on: vitess-ubuntu-shr-4cpu-16gb + runs-on: depot-ubuntu-22.04-4 timeout-minutes: 40 strategy: fail-fast: false From c36fbbe6c6848c49fce235c10593e145cce220cd Mon Sep 17 00:00:00 2001 From: Nick Van Wiggeren Date: Thu, 16 Apr 2026 19:50:52 +0000 Subject: [PATCH 4/9] Use vitess-operator-runner 8-core hosted runners Signed-off-by: Nick Van Wiggeren --- .github/workflows/e2e-test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/e2e-test.yaml b/.github/workflows/e2e-test.yaml index 025dc302c..9292303f8 100644 --- a/.github/workflows/e2e-test.yaml +++ b/.github/workflows/e2e-test.yaml @@ -12,7 +12,7 @@ on: jobs: e2e: name: ${{ matrix.test.name }} - runs-on: depot-ubuntu-22.04-4 + runs-on: vitess-operator-runner timeout-minutes: 40 strategy: fail-fast: false From 758114253138f761dae54e4280e3eaa3a1a0348c Mon Sep 17 00:00:00 2001 From: Nick Van Wiggeren Date: Thu, 16 Apr 2026 20:19:57 +0000 Subject: [PATCH 5/9] Fix e2e kind hostname overflow and narrow chromium install MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two issues surfaced on the first GHA run: 1. The `Backup Schedule vtctldclient Method Test` failed kind cluster creation with `sethostname: invalid argument` because the container hostname `kind-${CI_JOB_ID}-control-plane` exceeded the Linux 64-char HOST_NAME_MAX. 
The `github.run_id`+`run_attempt`+target scheme I used was overkill anyway — each GHA job runs in its own ephemeral VM, so there's no collision risk from reusing a short cluster name. Shorten CI_JOB_ID to just the matrix target. 2. The `Unmanaged Tablet Test` failed at `apt install chromium-browser` when the runner's snap layer hit an apparmor error on mesa-2404. That test doesn't even need chromium — only `vtorc-vtadmin-test` uses headless chromium. Narrow the install to only that matrix entry and route through the well-maintained `browser-actions/setup-chrome` action, aliasing the chrome binary to `chromium-browser` so the existing `getChromiumBinaryName` discovery in utils.sh still works. Signed-off-by: Nick Van Wiggeren --- .github/workflows/e2e-test.yaml | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/.github/workflows/e2e-test.yaml b/.github/workflows/e2e-test.yaml index 9292303f8..b227193eb 100644 --- a/.github/workflows/e2e-test.yaml +++ b/.github/workflows/e2e-test.yaml @@ -35,7 +35,9 @@ jobs: - name: "HPA Test" target: hpa-test env: - CI_JOB_ID: ${{ github.run_id }}-${{ github.run_attempt }}-${{ matrix.test.target }} + # Kept short because kind sets the control-plane container hostname to + # "kind--control-plane", and Linux HOST_NAME_MAX is 64. 
+ CI_JOB_ID: ${{ matrix.test.target }} steps: - name: Check out code uses: actions/checkout@v6 @@ -48,7 +50,19 @@ jobs: - name: Install test dependencies run: | sudo apt-get update - sudo apt-get install -y mysql-client chromium-browser + sudo apt-get install -y mysql-client + + - name: Install chromium (vtorc-vtadmin only) + if: matrix.test.target == 'vtorc-vtadmin-test' + uses: browser-actions/setup-chrome@v2 + with: + chrome-version: stable + + - name: Alias chrome as chromium-browser + if: matrix.test.target == 'vtorc-vtadmin-test' + run: | + CHROME_BIN="$(command -v chrome || command -v google-chrome)" + sudo ln -sf "$CHROME_BIN" /usr/local/bin/chromium-browser - name: Run ${{ matrix.test.name }} run: make ${{ matrix.test.target }} From f5826588f396204bb3b0b9fdd6b2210e7ddcd04a Mon Sep 17 00:00:00 2001 From: Nick Van Wiggeren Date: Thu, 16 Apr 2026 20:24:39 +0000 Subject: [PATCH 6/9] Disable Ubuntu 24.04 apparmor userns restriction before kind Every e2e test running under the new GHA runner group failed with mysqld pods in CrashLoopBackOff. The pattern (mysqld exiting ~17ms after spawn, before producing any InnoDB output) and the runner image tag (ubuntu-24.04 / Noble) point to the AppArmor userns restriction that Ubuntu 23.10 introduced: kernel.apparmor_restrict_unprivileged_userns=1 (default) This blocks processes inside nested containers from creating their own user namespaces, which mysqld depends on during startup. Buildkite's public queue runs on an older base, so it doesn't hit this. Workaround is the standard one for kind-in-Ubuntu-24.04 CI: sysctl the restriction off at the start of the job. Also bump inotify limits, which kind wants for its file watchers once a cluster has more than a couple of pods running. 
Signed-off-by: Nick Van Wiggeren --- .github/workflows/e2e-test.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.github/workflows/e2e-test.yaml b/.github/workflows/e2e-test.yaml index b227193eb..d1d3561e2 100644 --- a/.github/workflows/e2e-test.yaml +++ b/.github/workflows/e2e-test.yaml @@ -39,6 +39,18 @@ jobs: # "kind--control-plane", and Linux HOST_NAME_MAX is 64. CI_JOB_ID: ${{ matrix.test.target }} steps: + - name: Prepare runner for kind (Ubuntu 24.04) + run: | + # Ubuntu 24.04 defaults to kernel.apparmor_restrict_unprivileged_userns=1, + # which prevents nested containers inside kind from creating user + # namespaces. mysqld inside the vttablet pods crashes within ~17ms on + # startup as a result. Disable the restriction so kind-in-GHA behaves + # the way BK's older-kernel agents do. + sudo sysctl -w kernel.apparmor_restrict_unprivileged_userns=0 + # kind needs generous inotify limits to watch all pod files. + sudo sysctl -w fs.inotify.max_user_watches=524288 + sudo sysctl -w fs.inotify.max_user_instances=512 + - name: Check out code uses: actions/checkout@v6 From ed30a3bf6f1b8528c2138d74f6e4e78315a9095f Mon Sep 17 00:00:00 2001 From: Nick Van Wiggeren Date: Thu, 16 Apr 2026 21:03:24 +0000 Subject: [PATCH 7/9] Fully disable AppArmor and all userns restrictions before kind MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous apparmor_restrict_unprivileged_userns=0 alone didn't unblock mysqld inside the vttablet pods — 6 of 8 tests still CrashLoopBackOff with the same my.cnf symptom after kind comes up fine. Notably Unmanaged Tablet Test passed (the one test that does not run a vitess-operator-managed mysqld inside the cluster), which pins the remaining breakage on something specific to mysqld-in-nested-container. Ubuntu 24.04 ships multiple layers of restriction on unprivileged user namespaces plus a broader AppArmor profile for Docker. 
Belt-and-suspenders: also clear apparmor_restrict_unprivileged_unconfined, enable unprivileged_userns_clone, raise user.max_user_namespaces, and tear AppArmor down entirely. The runner VM is ephemeral, so neutering AppArmor for the job has zero blast radius. Signed-off-by: Nick Van Wiggeren --- .github/workflows/e2e-test.yaml | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/.github/workflows/e2e-test.yaml b/.github/workflows/e2e-test.yaml index d1d3561e2..603836148 100644 --- a/.github/workflows/e2e-test.yaml +++ b/.github/workflows/e2e-test.yaml @@ -41,13 +41,21 @@ jobs: steps: - name: Prepare runner for kind (Ubuntu 24.04) run: | - # Ubuntu 24.04 defaults to kernel.apparmor_restrict_unprivileged_userns=1, - # which prevents nested containers inside kind from creating user - # namespaces. mysqld inside the vttablet pods crashes within ~17ms on - # startup as a result. Disable the restriction so kind-in-GHA behaves - # the way BK's older-kernel agents do. - sudo sysctl -w kernel.apparmor_restrict_unprivileged_userns=0 - # kind needs generous inotify limits to watch all pod files. + # Ubuntu 24.04 ships with several kernel-level restrictions on + # unprivileged user namespaces and AppArmor enforcement that break + # mysqld when it runs inside a nested container (vittablet pod → + # kind → docker → runner VM). Buildkite's older-kernel agents don't + # hit these, so vitess-operator's e2e tests have never needed them. + # Since the runner VM is ephemeral we can just turn the restrictions + # off wholesale rather than crafting a minimal AppArmor profile. 
+ sudo sysctl -w kernel.apparmor_restrict_unprivileged_userns=0 || true + sudo sysctl -w kernel.apparmor_restrict_unprivileged_unconfined=0 || true + sudo sysctl -w kernel.unprivileged_userns_clone=1 || true + sudo sysctl -w user.max_user_namespaces=65536 || true + sudo systemctl stop apparmor.service || true + sudo systemctl disable apparmor.service || true + sudo aa-teardown || true + # kind needs generous inotify limits once pod count grows. sudo sysctl -w fs.inotify.max_user_watches=524288 sudo sysctl -w fs.inotify.max_user_instances=512 From f9a47371dcd0b0f1d9b0d7bd46ad310a7003dfc1 Mon Sep 17 00:00:00 2001 From: Nick Van Wiggeren Date: Thu, 16 Apr 2026 21:29:08 +0000 Subject: [PATCH 8/9] =?UTF-8?q?Drop=20apparmor=20teardown=20=E2=80=94=20it?= =?UTF-8?q?=20breaks=20the=20docker=20build?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Stopping apparmor.service and running aa-teardown in the previous commit unloads all AppArmor profiles from the kernel, including the docker-default profile that BuildKit applies to build containers. The operator image build then died with `runc run failed ... unable to apply apparmor profile`, vitess-operator-pr:latest never got built, kind load fell through, and every pod sat at ErrImageNeverPull. Keep the userns sysctls (they were the actual target of the fix) and leave the AppArmor service and profiles alone so Docker keeps working. 
Signed-off-by: Nick Van Wiggeren --- .github/workflows/e2e-test.yaml | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/.github/workflows/e2e-test.yaml b/.github/workflows/e2e-test.yaml index 603836148..7085b9930 100644 --- a/.github/workflows/e2e-test.yaml +++ b/.github/workflows/e2e-test.yaml @@ -42,19 +42,18 @@ jobs: - name: Prepare runner for kind (Ubuntu 24.04) run: | # Ubuntu 24.04 ships with several kernel-level restrictions on - # unprivileged user namespaces and AppArmor enforcement that break - # mysqld when it runs inside a nested container (vittablet pod → - # kind → docker → runner VM). Buildkite's older-kernel agents don't - # hit these, so vitess-operator's e2e tests have never needed them. - # Since the runner VM is ephemeral we can just turn the restrictions - # off wholesale rather than crafting a minimal AppArmor profile. + # unprivileged user namespaces that break mysqld when it runs inside + # a nested container (vttablet pod → kind → docker → runner VM). + # Buildkite's older-kernel agents don't hit these. + # + # Only sysctls here — do NOT stop apparmor.service or run + # aa-teardown, because Docker/BuildKit applies the docker-default + # AppArmor profile to build containers and will fail with + # "unable to apply apparmor profile" if the profile is unloaded. sudo sysctl -w kernel.apparmor_restrict_unprivileged_userns=0 || true sudo sysctl -w kernel.apparmor_restrict_unprivileged_unconfined=0 || true sudo sysctl -w kernel.unprivileged_userns_clone=1 || true sudo sysctl -w user.max_user_namespaces=65536 || true - sudo systemctl stop apparmor.service || true - sudo systemctl disable apparmor.service || true - sudo aa-teardown || true # kind needs generous inotify limits once pod count grows. 
sudo sysctl -w fs.inotify.max_user_watches=524288 sudo sysctl -w fs.inotify.max_user_instances=512 From 8bfedb65759ed70455ec6896d4945e7c5fe7f476 Mon Sep 17 00:00:00 2001 From: Nick Van Wiggeren Date: Thu, 16 Apr 2026 21:56:58 +0000 Subject: [PATCH 9/9] Build operator image before tearing down AppArmor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous attempt to neuter AppArmor broke the docker build because BuildKit requires the docker-default profile to launch build containers. Previous attempt to keep AppArmor on while just setting userns sysctls left mysqld still crashing in nested pods (same my.cnf symptom, 5 of 8 e2e tests red). Split the work across two workflow steps: 1. Build operator image — runs with AppArmor still up so BuildKit is happy and produces vitess-operator-pr:latest. 2. Disable AppArmor before kind — stops apparmor.service and runs aa-teardown now that no more docker builds need to happen. setupBuildContainerImage in utils.sh gains a docker-image-inspect short-circuit so the in-test build call is a no-op when CI already pre-built the image. Locally (no pre-built image) it still builds as before. Signed-off-by: Nick Van Wiggeren --- .github/workflows/e2e-test.yaml | 18 ++++++++++++++++++ test/endtoend/utils.sh | 8 ++++++++ 2 files changed, 26 insertions(+) diff --git a/.github/workflows/e2e-test.yaml b/.github/workflows/e2e-test.yaml index 7085b9930..263a7570a 100644 --- a/.github/workflows/e2e-test.yaml +++ b/.github/workflows/e2e-test.yaml @@ -83,5 +83,23 @@ jobs: CHROME_BIN="$(command -v chrome || command -v google-chrome)" sudo ln -sf "$CHROME_BIN" /usr/local/bin/chromium-browser + - name: Build operator image + # Build before we tear down AppArmor. BuildKit refuses to start build + # containers once the docker-default AppArmor profile is unloaded. + # The image is tagged vitess-operator-pr:latest, which the test's + # setupBuildContainerImage will detect and skip rebuilding. 
+ run: docker build --progress plain --file build/Dockerfile.release --tag vitess-operator-pr:latest . + + - name: Disable AppArmor before kind + # Now tear down AppArmor so the kind container and the pods it runs + # (in particular mysqld inside vttablet pods) aren't subject to Ubuntu + # 24.04's docker-default profile, which appears to be what's killing + # mysqld on startup ("Failed to open required defaults file" within + # ~17ms of spawn even though mysqlctld just wrote the file). + run: | + sudo systemctl stop apparmor.service || true + sudo systemctl disable apparmor.service || true + sudo aa-teardown || true + - name: Run ${{ matrix.test.name }} run: make ${{ matrix.test.target }} diff --git a/test/endtoend/utils.sh b/test/endtoend/utils.sh index 585c1af5d..38e813581 100644 --- a/test/endtoend/utils.sh +++ b/test/endtoend/utils.sh @@ -521,6 +521,14 @@ function assertSelect() { } function setupBuildContainerImage() { + # Skip the build if the image is already present. This lets CI build the + # image in a dedicated step before disabling AppArmor (BuildKit refuses to + # run if the docker-default AppArmor profile is unloaded). + if docker image inspect vitess-operator-pr:latest >/dev/null 2>&1; then + echo "vitess-operator-pr:latest already present, skipping build" + return + fi + echo "Building the container image" # Use plain progress output in CI so logs are line-buffered and readable.