diff --git a/.github/release-drafter.yml b/.github/release-drafter.yml new file mode 100644 index 00000000..4d2e32a6 --- /dev/null +++ b/.github/release-drafter.yml @@ -0,0 +1,64 @@ +name-template: 'v$RESOLVED_VERSION' +tag-template: 'v$RESOLVED_VERSION' +categories: + - title: 'Features' + labels: + - 'feature' + - 'enhancement' + - title: 'Bug Fixes' + labels: + - 'bugfix' + - title: 'Maintenance' + labels: + - 'chore' + - 'dependencies' + - 'documentation' +change-template: '- $TITLE @$AUTHOR (#$NUMBER)' +change-title-escapes: '\<*_&' # You can add # and @ to disable mentions, and add ` to disable code blocks. +version-resolver: + major: + labels: + - 'major' + minor: + labels: + - 'minor' + patch: + labels: + - 'patch' + default: patch +exclude-labels: + - 'skip-changelog' +autolabeler: + - label: 'api-change' + files: + - 'api/**' + - label: 'controllers' + files: + - 'controllers/**' + - 'internal/**' + - label: 'bugfix' + branch: + - '/fix\/.+/' + - '/bugfix\/.+/' + - label: 'feature' + branch: + - '/feature\/.+/' + - '/feat\/.+/' + - label: 'enhancement' + branch: + - '/enh\/.+/' + - label: 'chore' + branch: + - '/chore\/.+/' + - label: 'dependencies' + branch: + - '/deps\/.+/' + - '/renovate\/.+/' + - label: 'documentation' + files: + - '**/*.md' + branch: + - '/docs\/.+/' +template: | + ## Changes + $CHANGES diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7b50150e..46b845f4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,11 +2,29 @@ name: CI on: pull_request: - branches: [ master ] + branches: [ main ] push: - branches: [ master ] + branches: [ main ] + +# CI only builds and tests — no writes to the repo, releases, or packages. +permissions: + contents: read jobs: + image-multiarch: + # Build-only assertion that the operator image builds for every published + # platform. 
Deliberately sets up buildx WITHOUT QEMU: the Dockerfile builder + # is pinned to $BUILDPLATFORM and Go cross-compiles via GOARCH, so both legs + # must build natively on this amd64 runner with no emulation. If the + # --platform=$BUILDPLATFORM pin regresses, the arm64 leg fails here (exec + # format error) instead of silently breaking the tag-release publish. + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: docker/setup-buildx-action@v3.3.0 + - name: Build multi-arch image (no push, no QEMU) + run: docker buildx build --platform linux/amd64,linux/arm64 -t etcd-operator:buildtest . + verify: runs-on: ubuntu-latest steps: @@ -18,10 +36,12 @@ jobs: cache: true - name: codegen drift - # If a contributor edits an API type without re-running codegen, - # this gate catches it before CRDs and deepcopy ship out of sync - # with the Go types. Runs before `make test` so the as-committed - # state of generated files is what we check. + # If a contributor edits an API type or +kubebuilder:rbac markers + # without re-running codegen, this gate catches it before the chart's + # CRDs (charts/etcd-operator/crd-bases), manager RBAC rules + # (charts/etcd-operator/files/manager-role-rules.yaml), or deepcopy ship + # out of sync. Runs before `make test` so the as-committed state of + # generated files is what we check. run: | make generate manifests if ! git diff --quiet --exit-code; then diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml new file mode 100644 index 00000000..5d17a319 --- /dev/null +++ b/.github/workflows/docker-publish.yml @@ -0,0 +1,79 @@ +name: Docker publish + +# Tag-based image release. Pushing a semver tag (e.g. v0.5.0) builds the +# operator image multi-arch, pushes it to GHCR under this repo's name, and +# signs it with cosign. This is the same shape as the legacy v1alpha1 release +# process, retargeted at ghcr.io// via the built-in GITHUB_TOKEN +# (no Docker Hub secrets needed). 
+# +# Release order: push the tag FIRST (this builds ghcr.io/.../etcd-operator:<tag>), +# then publish the GitHub release for that tag — release-assets.yml renders the +# install manifests pointing at this image. + +on: + push: + tags: [ 'v*.*.*' ] + +env: + REGISTRY: ghcr.io + # github.repository is <owner>/<repo>, e.g. cozystack/etcd-operator + IMAGE_NAME: ${{ github.repository }} + +jobs: + build: + runs-on: ubuntu-22.04 + permissions: + contents: read + packages: write + # Needed for the keyless cosign identity challenge (sigstore/fulcio). + id-token: write + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install cosign + uses: sigstore/cosign-installer@v3.5.0 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3.3.0 + + - name: Log into registry ${{ env.REGISTRY }} + uses: docker/login-action@v3.2.0 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract Docker metadata + id: meta + uses: docker/metadata-action@v5.5.1 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + # Pin the published tag to the exact git ref (e.g. v0.5.0). This is + # the SAME source release-assets.yml uses for the IMG it bakes into + # the install manifest (its RELEASE_TAG is github.ref_name too), so + # the image that ships and the image the manifest references are + # provably identical. Don't rely on metadata-action's implicit + # default: it also emits a moving `latest` and its default tag set is + # easy to misread — explicit keeps the publish/manifest contract clear. + tags: | + type=raw,value=${{ github.ref_name }} + + - name: Build and push Docker image + id: build-and-push + uses: docker/build-push-action@v6 + with: + context: . 
+ push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + platforms: linux/amd64,linux/arm64 + cache-from: type=gha + cache-to: type=gha,mode=max + + - name: Sign the published Docker image + env: + TAGS: ${{ steps.meta.outputs.tags }} + DIGEST: ${{ steps.build-and-push.outputs.digest }} + run: echo "${TAGS}" | xargs -I {} cosign sign --yes {}@${DIGEST} diff --git a/.github/workflows/helm-publish.yml b/.github/workflows/helm-publish.yml new file mode 100644 index 00000000..59f2373e --- /dev/null +++ b/.github/workflows/helm-publish.yml @@ -0,0 +1,75 @@ +name: Helm publish + +# Tag-based Helm chart release. Pushing a semver tag packages +# charts/etcd-operator and pushes it as an OCI chart to GHCR under the org's +# charts repo (ghcr.io//charts/etcd-operator), versioned from the tag. +# Same shape as the legacy v1alpha1 helm-publish, retargeted at this org and +# using the built-in GITHUB_TOKEN. +on: + push: + tags: [ 'v*.*.*' ] + +env: + REGISTRY: ghcr.io + CHARTS_REPOSITORY: ${{ github.repository_owner }}/charts + CHART_NAME: etcd-operator + +jobs: + build: + runs-on: ubuntu-22.04 + permissions: + contents: read + packages: write + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + # make manifests regenerates the CRDs and manager RBAC rules (controller-gen) + # straight into the chart, so the published package always matches the + # tagged API types and +kubebuilder:rbac markers — never a stale committed + # copy. (ci.yml's drift gate already enforces this on PRs; this is belt-and- + # suspenders at publish time.) 
+ - uses: actions/setup-go@v5 + with: + go-version-file: go.mod + cache: true + + - name: Install Helm + uses: azure/setup-helm@v4 + with: + version: 'v3.16.4' + + - name: Regenerate CRDs and RBAC into the chart + run: make manifests + + - name: Resolve chart versions from tag + env: + REF_NAME: ${{ github.ref_name }} + run: | + TAG="$REF_NAME" + echo "RELEASE_TAG=${TAG}" >> "$GITHUB_ENV" + # Chart version is semver without the leading v; appVersion keeps it. + echo "RELEASE_TAG_TRIMMED_V=${TAG#v}" >> "$GITHUB_ENV" + + - name: Helm registry login + env: + ACTOR: ${{ github.actor }} + TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + helm registry login \ + --username "$ACTOR" \ + --password "$TOKEN" \ + "${{ env.REGISTRY }}" + + - name: Package chart + working-directory: charts + run: | + helm package "${{ env.CHART_NAME }}" \ + --version "${RELEASE_TAG_TRIMMED_V}" \ + --app-version "${RELEASE_TAG}" + + - name: Push chart + working-directory: charts + run: | + helm push "${{ env.CHART_NAME }}-${RELEASE_TAG_TRIMMED_V}.tgz" \ + "oci://${{ env.REGISTRY }}/${{ env.CHARTS_REPOSITORY }}" diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml deleted file mode 100644 index 54e0bc6e..00000000 --- a/.github/workflows/publish.yml +++ /dev/null @@ -1,31 +0,0 @@ -name: Build and Publish Docker Image - -on: - push: - branches: [ master ] - -jobs: - build-and-publish: - runs-on: ubuntu-latest - steps: - ## checks out our project source code - - uses: actions/checkout@v2 - - ## Builds our docker image! - - name: Build the Docker image - run: docker build . 
--file Dockerfile --tag lllamnyp/etcd-operator:$(date +%s) - - ## Publishes our image to Docker Hub 😎 - - name: Get image tag - id: get_tag - run: echo "IMAGE_TAG=$(date +%s)" >> $GITHUB_ENV - - name: Publish to Registry - uses: elgohr/Publish-Docker-Github-Action@master - with: - ## the name of our image - name: lllamnyp/etcd-operator - ## Here we pass in our Docker Username - username: ${{ secrets.DOCKER_USERNAME }} - ## and our Docker password which - password: ${{ secrets.DOCKER_PASSWORD }} - tags: "latest,${{ env.IMAGE_TAG }}" diff --git a/.github/workflows/release-assets.yml b/.github/workflows/release-assets.yml new file mode 100644 index 00000000..1e853381 --- /dev/null +++ b/.github/workflows/release-assets.yml @@ -0,0 +1,119 @@ +name: Upload release assets + +# When a GitHub release is created for a tag, render the install manifests for +# that tag's image and attach them to the release. `make build-dist-manifests` +# is `helm template` of the chart with image.repository/tag set to the released +# ref; the chart renders image == OPERATOR_IMAGE, so the attached YAML deploys +# the matching operator and its snapshot/restore agent. For consumers who +# kubectl-apply rather than helm-install. + +on: + release: + types: [ created ] + workflow_dispatch: + +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }} + +jobs: + release-assets: + runs-on: ubuntu-22.04 + permissions: + contents: write + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-go@v5 + with: + go-version-file: go.mod + cache: true + + # build-dist-manifests renders the chart via `helm template`. 
+ - name: Install Helm + uses: azure/setup-helm@v4 + with: + version: 'v3.16.4' + + - name: Resolve release tag + env: + REF_NAME: ${{ github.ref_name }} + run: echo "RELEASE_TAG=$REF_NAME" >> "$GITHUB_ENV" + + - name: Render install manifests + run: make build-dist-manifests IMG="${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:${RELEASE_TAG}" + + - uses: svenstaro/upload-release-action@2.9.0 + with: + repo_token: ${{ secrets.GITHUB_TOKEN }} + file: dist/etcd-operator.yaml + asset_name: etcd-operator.yaml + tag: ${{ github.ref }} + overwrite: true + + - uses: svenstaro/upload-release-action@2.9.0 + with: + repo_token: ${{ secrets.GITHUB_TOKEN }} + file: dist/etcd-operator.crds.yaml + asset_name: etcd-operator.crds.yaml + tag: ${{ github.ref }} + overwrite: true + + - uses: svenstaro/upload-release-action@2.9.0 + with: + repo_token: ${{ secrets.GITHUB_TOKEN }} + file: dist/etcd-operator.non-crds.yaml + asset_name: etcd-operator.non-crds.yaml + tag: ${{ github.ref }} + overwrite: true + + # Standalone client CLIs (kubectl-etcd plugin + etcd-migrate). They are not in + # the operator image (client-side / admin tools), so they ship as + # cross-compiled release binaries. Separate job: no Helm needed, and a failure + # here doesn't block the manifest assets above. 
+ cli-binaries: + runs-on: ubuntu-22.04 + permissions: + contents: write + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-go@v5 + with: + go-version-file: go.mod + cache: true + + - name: Resolve release tag + env: + REF_NAME: ${{ github.ref_name }} + run: echo "RELEASE_TAG=$REF_NAME" >> "$GITHUB_ENV" + + - name: Cross-compile CLIs + run: make dist-cli VERSION="${RELEASE_TAG}" + + - name: Upload etcd-migrate binaries + uses: svenstaro/upload-release-action@2.9.0 + with: + repo_token: ${{ secrets.GITHUB_TOKEN }} + file: dist/etcd-migrate-* + file_glob: true + tag: ${{ github.ref }} + overwrite: true + + - name: Upload kubectl-etcd binaries + uses: svenstaro/upload-release-action@2.9.0 + with: + repo_token: ${{ secrets.GITHUB_TOKEN }} + file: dist/kubectl-etcd-* + file_glob: true + tag: ${{ github.ref }} + overwrite: true + + - name: Upload CLI checksums + uses: svenstaro/upload-release-action@2.9.0 + with: + repo_token: ${{ secrets.GITHUB_TOKEN }} + file: dist/cli-SHA256SUMS.txt + asset_name: cli-SHA256SUMS.txt + tag: ${{ github.ref }} + overwrite: true diff --git a/.github/workflows/release-drafter.yml b/.github/workflows/release-drafter.yml new file mode 100644 index 00000000..5d18ee13 --- /dev/null +++ b/.github/workflows/release-drafter.yml @@ -0,0 +1,36 @@ +name: Release Drafter + +# Maintains a draft GitHub release on main, accumulating merged-PR titles into +# categorised notes. Publishing that draft for a tag is what triggers +# release-assets.yml; pushing the tag is what triggers docker-publish.yml. + +on: + push: + branches: [ main ] + # pull_request_target runs in the context of the BASE branch (main) with a + # read/write token, which is what lets the autolabeler label PRs from forks. + # It is used ONLY to read PR metadata (title, labels, changed-file globs). 
+ pull_request_target: + types: [ opened, reopened, synchronize ] + workflow_dispatch: + +jobs: + release-drafter: + runs-on: ubuntu-22.04 + permissions: + contents: write + pull-requests: write + steps: + # SECURITY: never add `actions/checkout` of the PR head (or run any code + # from the PR) in this job. pull_request_target grants a write token and + # secrets while running base-branch config; checking out + executing fork + # code under it is the canonical fork-to-RCE pattern. release-drafter + # touches no repo code, so this job stays safe as long as nothing here + # checks out untrusted refs. + - uses: release-drafter/release-drafter@3f0f87098bd6b5c5b9a36d49c41d998ea58f9348 # v6.0.0 + with: + disable-releaser: ${{ github.ref != 'refs/heads/main' }} + config-name: release-drafter.yml + commitish: main + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/release-smoke.yml b/.github/workflows/release-smoke.yml new file mode 100644 index 00000000..b4a057c6 --- /dev/null +++ b/.github/workflows/release-smoke.yml @@ -0,0 +1,63 @@ +name: Release install smoke + +# Gates changes to the tag-release machinery by running the full ship-then- +# install flow on a throwaway kind cluster (hack/release-smoke.sh), for BOTH +# shipped install paths: the rendered manifest (release-assets.yml) and the +# Helm chart (helm-publish.yml). Each builds the image, installs it, and +# asserts the operator comes up and reconciles a 1-node cluster. Catches +# release-pipeline regressions (image/manifest tag drift, broken OPERATOR_IMAGE +# wiring, CRDs that don't apply, a chart missing an RBAC rule) on the PR that +# introduces them, not on the first real tag. The image is loaded into kind, +# never pushed — no registry credentials. 
+on: + pull_request: + paths: + - '.github/workflows/release-smoke.yml' + - '.github/workflows/docker-publish.yml' + - '.github/workflows/release-assets.yml' + - '.github/workflows/helm-publish.yml' + - 'hack/release-smoke.sh' + - 'charts/**' + - 'Makefile' + - 'Dockerfile' + - 'api/**' + workflow_dispatch: + +concurrency: + group: release-smoke-${{ github.ref }} + cancel-in-progress: true + +jobs: + smoke: + runs-on: ubuntu-latest + timeout-minutes: 30 + permissions: + contents: read + strategy: + fail-fast: false + matrix: + mode: [ manifest, helm ] + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-go@v5 + with: + go-version-file: go.mod + cache: true + + - name: Install Helm + if: matrix.mode == 'helm' + uses: azure/setup-helm@v4 + with: + version: 'v3.16.4' + + - name: release-install smoke (${{ matrix.mode }}, kind) + # Builds the image, installs via the matrix path, and asserts the + # operator is Available and a 1-node EtcdCluster reaches READY. On + # failure the script's EXIT trap dumps cluster state before teardown. + run: | + if [ "${{ matrix.mode }}" = helm ]; then + make helm-smoke + else + make release-smoke + fi diff --git a/.gitignore b/.gitignore index ed60d564..e6d6e8be 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,9 @@ bin /kubectl-etcd # etcd-migrate tool: same rule — build to bin/, never commit the root-level artifact /etcd-migrate +# release install manifests rendered by `make build-dist-manifests`; the +# release-assets workflow regenerates and attaches these per tag +/dist # Test binary, build with `go test -c` *.test diff --git a/Dockerfile b/Dockerfile index 82729f64..117e934f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,10 @@ -# Build the manager binary -FROM golang:1.25.10 AS builder +# Build the manager binary. +# Pin the builder to the BUILD platform so a multi-arch `buildx --platform` +# build runs the builder natively (no QEMU) and Go cross-compiles via the +# GOARCH=${TARGETARCH} below. 
Without this, buildx instantiates the builder as +# the target arch and `go build` runs under emulation — which fails the arm64 +# leg on an amd64 runner that has no binfmt registered. +FROM --platform=$BUILDPLATFORM golang:1.25.10 AS builder ARG TARGETOS ARG TARGETARCH diff --git a/Makefile b/Makefile index 64c47dd3..80060fa4 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,11 @@ # Image URL to use all building/pushing image targets IMG ?= controller:latest + +# Version stamped into the standalone CLIs (etcd-migrate, kubectl-etcd) via +# -ldflags. Defaults to `git describe`; the release workflow passes the tag. +VERSION ?= $(shell git describe --tags --always --dirty 2>/dev/null || echo dev) +CLI_LDFLAGS ?= -X main.version=$(VERSION) # ENVTEST_K8S_VERSION is derived from the k8s.io/api version in go.mod so a # dependency bump automatically pulls the matching envtest assets — no need # to remember to update this in two places. (Pattern stolen from @@ -42,8 +47,20 @@ help: ## Display this help. ##@ Development .PHONY: manifests -manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and CustomResourceDefinition objects. - $(CONTROLLER_GEN) rbac:roleName=manager-role crd webhook paths="./..." output:crd:artifacts:config=config/crd/bases +manifests: controller-gen yq ## Generate CRDs and the manager RBAC rules straight into the Helm chart. + # CRDs land in charts/etcd-operator/crd-bases/ (templates/crds.yaml renders + # them, with the helm.sh/resource-policy:keep annotation); the manager + # ClusterRole rules land in charts/etcd-operator/files/ for templates/rbac.yaml + # to pull in via .Files.Get. ci.yml's codegen-drift gate (make manifests + + # git diff) then guards BOTH against drift from the API types and the + # +kubebuilder:rbac markers — no second source of truth, no grep guard. + $(CONTROLLER_GEN) rbac:roleName=manager-role crd paths="./..." 
\ + output:crd:artifacts:config=charts/etcd-operator/crd-bases \ + output:rbac:artifacts:config=charts/etcd-operator/files + # controller-gen emits a whole ClusterRole; the chart only needs its rules + # (it wraps them in a release-named, labelled ClusterRole of its own). + $(YQ) eval '.rules' charts/etcd-operator/files/role.yaml > charts/etcd-operator/files/manager-role-rules.yaml + rm -f charts/etcd-operator/files/role.yaml .PHONY: generate generate: controller-gen ## Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations. @@ -69,6 +86,14 @@ test-e2e: ## Run the e2e suite against the current kubeconfig context (expects c e2e: ## Provision a kind cluster with cert-manager and Kamaji, deploy the operator, run the e2e suite. KEEP_CLUSTER=1 keeps the cluster for debugging. hack/e2e.sh +.PHONY: release-smoke +release-smoke: ## Smoke-test the tag-release manifest install path on kind: build image -> render dist manifests -> apply -> assert operator Available and a 1-node cluster READY. KEEP_CLUSTER=1 keeps the cluster. + hack/release-smoke.sh + +.PHONY: helm-smoke +helm-smoke: ## Smoke-test the Helm chart install path on kind: build image -> helm install chart -> assert operator Available and a 1-node cluster READY. KEEP_CLUSTER=1 keeps the cluster. + INSTALL_MODE=helm hack/release-smoke.sh + ##@ Build .PHONY: build @@ -77,11 +102,27 @@ build: manifests generate fmt vet ## Build manager binary. .PHONY: kubectl-etcd kubectl-etcd: fmt vet ## Build the kubectl-etcd plugin binary. - go build -o bin/kubectl-etcd ./cmd/kubectl-etcd + go build -ldflags "$(CLI_LDFLAGS)" -o bin/kubectl-etcd ./cmd/kubectl-etcd .PHONY: etcd-migrate etcd-migrate: fmt vet ## Build the etcd-migrate (legacy v1alpha1 -> v1alpha2) CLI binary. 
- go build -o bin/etcd-migrate ./cmd/etcd-migrate + go build -ldflags "$(CLI_LDFLAGS)" -o bin/etcd-migrate ./cmd/etcd-migrate + +.PHONY: dist-cli +dist-cli: ## Cross-compile etcd-migrate and kubectl-etcd into dist/ for release (linux/darwin x amd64/arm64). VERSION stamps the binary version. + # Produces the standalone CLIs the release-assets workflow attaches to a + # release, named <cmd>-<os>-<arch>, plus a SHA256 checksum file. These are + # client-side tools (kubectl-etcd is a kubectl plugin, etcd-migrate is an + # admin-run migration CLI), so they ship as binaries, not in the operator image. The loop exits on the first failed build so a partial set is never checksummed. + mkdir -p dist + for os in linux darwin; do for arch in amd64 arm64; do \ + for cmd in etcd-migrate kubectl-etcd; do \ + echo "building dist/$$cmd-$$os-$$arch"; \ + CGO_ENABLED=0 GOOS=$$os GOARCH=$$arch \ + go build -ldflags "$(CLI_LDFLAGS)" -o dist/$$cmd-$$os-$$arch ./cmd/$$cmd || exit 1; \ + done; \ + done; done + cd dist && if command -v sha256sum >/dev/null 2>&1; then sha256sum etcd-migrate-* kubectl-etcd-*; else shasum -a 256 etcd-migrate-* kubectl-etcd-*; fi > cli-SHA256SUMS.txt .PHONY: run run: manifests generate fmt vet ## Run a controller from your host. @@ -115,28 +156,47 @@ docker-buildx: test ## Build and push docker image for the manager for cross-pla - docker buildx rm project-v3-builder rm Dockerfile.cross -##@ Deployment - -ifndef ignore-not-found - ignore-not-found = false -endif +.PHONY: build-dist-manifests +build-dist-manifests: manifests generate require-helm yq ## Render the release install manifests into dist/ for IMG. + # Produces the YAMLs the release-assets workflow attaches to a tag, for users + # who kubectl-apply instead of helm-install: + # dist/etcd-operator.yaml – everything (Namespace + CRDs + operator) + # dist/etcd-operator.crds.yaml – CRDs only + # dist/etcd-operator.non-crds.yaml – everything except CRDs + # This is just `helm template` of the chart, so the rendered manifest IS the + # chart: the image == OPERATOR_IMAGE wiring and the RBAC come from one source. 
+ # namespace.create emits the Namespace so a bare `kubectl apply -f + # etcd-operator.yaml` is self-contained. Rendering is pure — no tracked file + # is mutated. Pass IMG=/etcd-operator:. + mkdir -p dist + img='$(IMG)'; $(HELM) template etcd-operator charts/etcd-operator \ + --namespace etcd-operator-system \ + --set image.repository="$${img%:*}" --set image.tag="$${img##*:}" \ + --set namespace.create=true \ + > dist/etcd-operator.yaml + $(YQ) eval 'select(.kind != "CustomResourceDefinition")' dist/etcd-operator.yaml > dist/etcd-operator.non-crds.yaml + $(YQ) eval 'select(.kind == "CustomResourceDefinition")' dist/etcd-operator.yaml > dist/etcd-operator.crds.yaml -.PHONY: install -install: manifests kustomize ## Install CRDs into the K8s cluster specified in ~/.kube/config. - $(KUSTOMIZE) build config/crd | kubectl apply -f - +##@ Deployment -.PHONY: uninstall -uninstall: manifests kustomize ## Uninstall CRDs from the K8s cluster specified in ~/.kube/config. Call with ignore-not-found=true to ignore resource not found errors during deletion. - $(KUSTOMIZE) build config/crd | kubectl delete --ignore-not-found=$(ignore-not-found) -f - +# The Helm-driven install targets below. The chart is the single source of +# truth for CRDs, RBAC, and the manager Deployment. +HELM_RELEASE ?= etcd-operator +NAMESPACE ?= etcd-operator-system .PHONY: deploy -deploy: manifests kustomize ## Deploy controller to the K8s cluster specified in ~/.kube/config. - cd config/manager && $(KUSTOMIZE) edit set image controller=${IMG} - $(KUSTOMIZE) build config/default | kubectl apply -f - +deploy: manifests require-helm ## Install/upgrade the operator (CRDs + RBAC + manager) via Helm. Pass IMG=/etcd-operator:. + # The chart renders image == OPERATOR_IMAGE, so there is no separate image- + # replacement step; CRDs are templated into the release so `helm upgrade` + # keeps them current. The IMG split into repository:tag handles registry ports. 
+ img='$(IMG)'; $(HELM) upgrade --install $(HELM_RELEASE) charts/etcd-operator \ + --namespace $(NAMESPACE) --create-namespace \ + --set image.repository="$${img%:*}" --set image.tag="$${img##*:}" \ + --wait --timeout 5m .PHONY: undeploy -undeploy: ## Undeploy controller from the K8s cluster specified in ~/.kube/config. Call with ignore-not-found=true to ignore resource not found errors during deletion. - $(KUSTOMIZE) build config/default | kubectl delete --ignore-not-found=$(ignore-not-found) -f - +undeploy: require-helm ## Uninstall the operator release. CRDs carry resource-policy:keep, so EtcdClusters survive — delete them (and the CRDs) by hand to wipe data. + $(HELM) uninstall $(HELM_RELEASE) --namespace $(NAMESPACE) ##@ Build Dependencies @@ -146,14 +206,17 @@ $(LOCALBIN): mkdir -p $(LOCALBIN) ## Tool Versions -KUSTOMIZE_VERSION ?= v5.6.0 CONTROLLER_TOOLS_VERSION ?= v0.18.0 +YQ_VERSION ?= v4.44.1 ## Tool Binaries (version-suffixed so a version bump auto-triggers reinstall ## and stale builds of an old version stay on disk alongside the new one). -KUSTOMIZE ?= $(LOCALBIN)/kustomize-$(KUSTOMIZE_VERSION) CONTROLLER_GEN ?= $(LOCALBIN)/controller-gen-$(CONTROLLER_TOOLS_VERSION) ENVTEST ?= $(LOCALBIN)/setup-envtest +YQ ?= $(LOCALBIN)/yq-$(YQ_VERSION) +# Helm is the one tool we don't vendor (no clean `go install`); it must be on +# PATH. release-smoke/e2e and the publish workflows install it via setup-helm. +HELM ?= helm # go-install-tool installs $2@$3 under $1. `go install` drops the binary at # $LOCALBIN/, so we rename it after install to the version-suffixed @@ -168,10 +231,12 @@ mv "$$(echo "$(1)" | sed "s/-$(3)$$//")" $(1) ;\ } endef -.PHONY: kustomize -kustomize: $(KUSTOMIZE) ## Download kustomize locally if necessary. -$(KUSTOMIZE): $(LOCALBIN) - $(call go-install-tool,$(KUSTOMIZE),sigs.k8s.io/kustomize/kustomize/v5,$(KUSTOMIZE_VERSION)) +.PHONY: require-helm +require-helm: ## Assert Helm is on PATH (used by deploy/undeploy/build-dist-manifests). 
+ @command -v $(HELM) >/dev/null 2>&1 || { \ + echo "ERROR: helm not found on PATH. Install Helm v3.16+ (https://helm.sh/docs/intro/install/)."; \ + exit 1; \ + } .PHONY: controller-gen controller-gen: $(CONTROLLER_GEN) ## Download controller-gen locally if necessary. @@ -182,3 +247,8 @@ $(CONTROLLER_GEN): $(LOCALBIN) envtest: $(ENVTEST) ## Download envtest-setup locally if necessary. $(ENVTEST): $(LOCALBIN) test -s $(LOCALBIN)/setup-envtest || GOBIN=$(LOCALBIN) go install sigs.k8s.io/controller-runtime/tools/setup-envtest@latest + +.PHONY: yq +yq: $(YQ) ## Download yq locally if necessary. +$(YQ): $(LOCALBIN) + $(call go-install-tool,$(YQ),github.com/mikefarah/yq/v4,$(YQ_VERSION)) diff --git a/README.md b/README.md index e8a2d0a0..aa9774d0 100644 --- a/README.md +++ b/README.md @@ -37,12 +37,12 @@ No multi-user / per-tenant RBAC inside etcd — single-user `root` auth is avail ## Quick start ```sh -# 1. Install CRDs and the operator. Builds an image and pushes it to your -# registry; substitute IMG= for a prebuilt tag if you have one. The cluster -# must be able to pull from — for local clusters (kind / -# minikube / k3d) sideload the image or use an ephemeral registry such as -# ttl.sh, otherwise the operator Deployment will sit in ImagePullBackOff. -make install +# 1. Install the operator (CRDs + RBAC + manager) with Helm. Builds an image and +# pushes it to your registry; substitute IMG= for a prebuilt tag if you have +# one. The cluster must be able to pull from — for local +# clusters (kind / minikube / k3d) sideload the image or use an ephemeral +# registry such as ttl.sh, otherwise the Deployment sits in ImagePullBackOff. +# `make deploy` runs `helm upgrade --install` (needs helm v3.16+ on PATH). make docker-build docker-push deploy IMG=/etcd-operator: # 2. Create a cluster. 
diff --git a/api/v1alpha2/validation_envtest_test.go b/api/v1alpha2/validation_envtest_test.go index f28483e8..156c9dbf 100644 --- a/api/v1alpha2/validation_envtest_test.go +++ b/api/v1alpha2/validation_envtest_test.go @@ -77,11 +77,12 @@ func TestMain(m *testing.M) { os.Exit(code) } -// crdBasesDir resolves config/crd/bases relative to this test file — -// go test's CWD is the package directory and the CRDs live two levels up. +// crdBasesDir resolves the chart's raw generated CRDs relative to this test +// file — go test's CWD is the package directory and the CRDs (written by +// `make manifests`) live under charts/etcd-operator/crd-bases two levels up. func crdBasesDir() string { _, here, _, _ := runtime.Caller(0) - return filepath.Join(filepath.Dir(here), "..", "..", "config", "crd", "bases") + return filepath.Join(filepath.Dir(here), "..", "..", "charts", "etcd-operator", "crd-bases") } func ptr32(v int32) *int32 { return &v } diff --git a/charts/etcd-operator/.helmignore b/charts/etcd-operator/.helmignore new file mode 100644 index 00000000..c59a757d --- /dev/null +++ b/charts/etcd-operator/.helmignore @@ -0,0 +1,6 @@ +# Patterns to ignore when building packages. +*.tmpl +.git +.gitignore +*.tgz +README.md.gotmpl diff --git a/charts/etcd-operator/Chart.yaml b/charts/etcd-operator/Chart.yaml new file mode 100644 index 00000000..cf3dba73 --- /dev/null +++ b/charts/etcd-operator/Chart.yaml @@ -0,0 +1,11 @@ +apiVersion: v2 +name: etcd-operator +description: Kubernetes operator for running etcd clusters (etcd-operator.cozystack.io/v1alpha2) +type: application +# Placeholders. The release pipeline (helm-publish.yml) sets the real values +# from the git tag: --version <tag without the leading v>, --app-version <tag>. 
+version: 0.0.0 +appVersion: "v0.0.0" +home: https://github.com/cozystack/etcd-operator +sources: + - https://github.com/cozystack/etcd-operator diff --git a/config/crd/bases/etcd-operator.cozystack.io_etcdclusters.yaml b/charts/etcd-operator/crd-bases/etcd-operator.cozystack.io_etcdclusters.yaml similarity index 100% rename from config/crd/bases/etcd-operator.cozystack.io_etcdclusters.yaml rename to charts/etcd-operator/crd-bases/etcd-operator.cozystack.io_etcdclusters.yaml diff --git a/config/crd/bases/etcd-operator.cozystack.io_etcdmembers.yaml b/charts/etcd-operator/crd-bases/etcd-operator.cozystack.io_etcdmembers.yaml similarity index 100% rename from config/crd/bases/etcd-operator.cozystack.io_etcdmembers.yaml rename to charts/etcd-operator/crd-bases/etcd-operator.cozystack.io_etcdmembers.yaml diff --git a/config/crd/bases/etcd-operator.cozystack.io_etcdsnapshots.yaml b/charts/etcd-operator/crd-bases/etcd-operator.cozystack.io_etcdsnapshots.yaml similarity index 100% rename from config/crd/bases/etcd-operator.cozystack.io_etcdsnapshots.yaml rename to charts/etcd-operator/crd-bases/etcd-operator.cozystack.io_etcdsnapshots.yaml diff --git a/charts/etcd-operator/files/manager-role-rules.yaml b/charts/etcd-operator/files/manager-role-rules.yaml new file mode 100644 index 00000000..4ad4b945 --- /dev/null +++ b/charts/etcd-operator/files/manager-role-rules.yaml @@ -0,0 +1,120 @@ +- apiGroups: + - "" + resources: + - persistentvolumeclaims + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - "" + resources: + - pods + verbs: + - create + - delete + - get + - list + - patch + - watch +- apiGroups: + - "" + resources: + - pods/log + verbs: + - get +- apiGroups: + - "" + resources: + - secrets + verbs: + - get + - list + - watch +- apiGroups: + - "" + resources: + - services + verbs: + - create + - get + - list + - patch + - update + - watch +- apiGroups: + - batch + resources: + - jobs + verbs: + - create + - delete + - get 
+ - list + - watch +- apiGroups: + - cert-manager.io + resources: + - certificates + verbs: + - create + - get + - list + - patch + - update + - watch +- apiGroups: + - etcd-operator.cozystack.io + resources: + - etcdclusters + verbs: + - get + - list + - watch +- apiGroups: + - etcd-operator.cozystack.io + resources: + - etcdclusters/finalizers + - etcdmembers/finalizers + - etcdsnapshots/finalizers + verbs: + - update +- apiGroups: + - etcd-operator.cozystack.io + resources: + - etcdclusters/status + - etcdmembers/status + - etcdsnapshots/status + verbs: + - get + - patch + - update +- apiGroups: + - etcd-operator.cozystack.io + resources: + - etcdmembers + - etcdsnapshots + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - policy + resources: + - poddisruptionbudgets + verbs: + - create + - delete + - get + - list + - patch + - update + - watch diff --git a/charts/etcd-operator/templates/_helpers.tpl b/charts/etcd-operator/templates/_helpers.tpl new file mode 100644 index 00000000..75c467b5 --- /dev/null +++ b/charts/etcd-operator/templates/_helpers.tpl @@ -0,0 +1,59 @@ +{{/* Chart name (overridable). */}} +{{- define "etcd-operator.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* Fully qualified resource-name prefix. */}} +{{- define "etcd-operator.fullname" -}} +{{- if .Values.fullnameOverride -}} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- $name := default .Chart.Name .Values.nameOverride -}} +{{- if contains $name .Release.Name -}} +{{- .Release.Name | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} +{{- end -}} +{{- end -}} +{{- end -}} + +{{- define "etcd-operator.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* Common labels. 
*/}} +{{- define "etcd-operator.labels" -}} +helm.sh/chart: {{ include "etcd-operator.chart" . }} +{{ include "etcd-operator.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end -}} + +{{/* Selector labels — stable; also used by the metrics Service selector. */}} +{{- define "etcd-operator.selectorLabels" -}} +app.kubernetes.io/name: {{ include "etcd-operator.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end -}} + +{{/* ServiceAccount name. */}} +{{- define "etcd-operator.serviceAccountName" -}} +{{- if .Values.serviceAccount.create -}} +{{- include "etcd-operator.fullname" . -}} +{{- else -}} +{{- /* Don't silently fall back to the namespace "default" SA: rbac.yaml binds +the operator's broad ClusterRole to this name, and binding it to "default" +would hand those permissions to every workload using the default SA. */ -}} +{{- required "serviceAccount.name is required when serviceAccount.create is false" .Values.serviceAccount.name -}} +{{- end -}} +{{- end -}} + +{{/* +Full operator image reference. Used for BOTH the manager container image and +its OPERATOR_IMAGE env var — they MUST be identical, or the operator refuses to +start (the snapshot/restore agent runs this same image). +*/}} +{{- define "etcd-operator.image" -}} +{{- printf "%s:%s" .Values.image.repository (.Values.image.tag | default .Chart.AppVersion) -}} +{{- end -}} diff --git a/charts/etcd-operator/templates/crds.yaml b/charts/etcd-operator/templates/crds.yaml new file mode 100644 index 00000000..937a3938 --- /dev/null +++ b/charts/etcd-operator/templates/crds.yaml @@ -0,0 +1,18 @@ +{{- /* +CRDs, templated (not Helm's install-only crds/ dir) so `helm upgrade` keeps +them current with the release. 
The raw, controller-gen-generated CRDs live in +crd-bases/ (written by `make manifests`, guarded by ci.yml's codegen-drift +gate); we round-trip each through fromYaml/toYaml to inject the +helm.sh/resource-policy:keep annotation, which stops `helm uninstall` from +deleting the CRDs — that would cascade-delete every EtcdCluster and its data. +*/ -}} +{{- if .Values.crds.enabled }} +{{- range $path, $_ := .Files.Glob "crd-bases/*.yaml" }} +{{- $crd := $.Files.Get $path | fromYaml }} +{{- if $.Values.crds.keep }} +{{- $_ := set $crd.metadata "annotations" (merge (dict "helm.sh/resource-policy" "keep") (default (dict) $crd.metadata.annotations)) }} +{{- end }} +--- +{{ toYaml $crd }} +{{- end }} +{{- end }} diff --git a/charts/etcd-operator/templates/deployment.yaml b/charts/etcd-operator/templates/deployment.yaml new file mode 100644 index 00000000..69bbb487 --- /dev/null +++ b/charts/etcd-operator/templates/deployment.yaml @@ -0,0 +1,107 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "etcd-operator.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "etcd-operator.labels" . | nindent 4 }} + control-plane: controller-manager +spec: + replicas: {{ .Values.replicaCount }} + selector: + matchLabels: + {{- include "etcd-operator.selectorLabels" . | nindent 6 }} + template: + metadata: + annotations: + kubectl.kubernetes.io/default-container: manager + {{- with .Values.podAnnotations }} + {{- toYaml . | nindent 8 }} + {{- end }} + labels: + {{- include "etcd-operator.selectorLabels" . | nindent 8 }} + control-plane: controller-manager + {{- with .Values.podLabels }} + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + serviceAccountName: {{ include "etcd-operator.serviceAccountName" . }} + terminationGracePeriodSeconds: 10 + securityContext: + runAsNonRoot: true + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . 
| nindent 8 }} + {{- end }} + containers: + - name: manager + image: {{ include "etcd-operator.image" . }} + imagePullPolicy: {{ .Values.image.pullPolicy }} + command: + - /manager + args: + - --health-probe-bind-address=:8081 + {{- if .Values.kubeRbacProxy.enabled }} + - --metrics-bind-address=127.0.0.1:8080 + {{- else }} + - --metrics-bind-address=:8080 + {{- end }} + - --leader-elect + env: + # MUST equal the manager image: the operator launches this ref for + # snapshot Jobs and restore init containers, and refuses to start if it + # is left at a placeholder. + - name: OPERATOR_IMAGE + value: {{ include "etcd-operator.image" . }} + livenessProbe: + httpGet: + path: /healthz + port: 8081 + initialDelaySeconds: 15 + periodSeconds: 20 + readinessProbe: + httpGet: + path: /readyz + port: 8081 + initialDelaySeconds: 5 + periodSeconds: 10 + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - "ALL" + resources: + {{- toYaml .Values.manager.resources | nindent 10 }} + {{- if .Values.kubeRbacProxy.enabled }} + - name: kube-rbac-proxy + image: {{ .Values.kubeRbacProxy.image.repository }}:{{ .Values.kubeRbacProxy.image.tag }} + imagePullPolicy: {{ .Values.kubeRbacProxy.image.pullPolicy }} + args: + - --secure-listen-address=0.0.0.0:8443 + - --upstream=http://127.0.0.1:8080/ + - --logtostderr=true + - --v=0 + ports: + - containerPort: 8443 + protocol: TCP + name: https + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - "ALL" + resources: + {{- toYaml .Values.kubeRbacProxy.resources | nindent 10 }} + {{- end }} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . 
| nindent 8 }} + {{- end }} diff --git a/charts/etcd-operator/templates/metrics-service.yaml b/charts/etcd-operator/templates/metrics-service.yaml new file mode 100644 index 00000000..c0326a20 --- /dev/null +++ b/charts/etcd-operator/templates/metrics-service.yaml @@ -0,0 +1,18 @@ +{{- if .Values.kubeRbacProxy.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "etcd-operator.fullname" . }}-metrics + namespace: {{ .Release.Namespace }} + labels: + {{- include "etcd-operator.labels" . | nindent 4 }} + control-plane: controller-manager +spec: + ports: + - name: https + port: {{ .Values.metricsService.port }} + protocol: TCP + targetPort: https + selector: + {{- include "etcd-operator.selectorLabels" . | nindent 4 }} +{{- end }} diff --git a/charts/etcd-operator/templates/namespace.yaml b/charts/etcd-operator/templates/namespace.yaml new file mode 100644 index 00000000..2ce49424 --- /dev/null +++ b/charts/etcd-operator/templates/namespace.yaml @@ -0,0 +1,15 @@ +{{- /* +Only rendered for `helm template`-based manifest rendering (build-dist-manifests +sets namespace.create=true) so a bare `kubectl apply -f etcd-operator.yaml` is +self-contained. Real `helm install` uses --create-namespace instead, so this +stays off by default. +*/ -}} +{{- if .Values.namespace.create }} +apiVersion: v1 +kind: Namespace +metadata: + name: {{ .Release.Namespace }} + labels: + {{- include "etcd-operator.labels" . | nindent 4 }} + control-plane: controller-manager +{{- end }} diff --git a/charts/etcd-operator/templates/rbac.yaml b/charts/etcd-operator/templates/rbac.yaml new file mode 100644 index 00000000..4989dfb4 --- /dev/null +++ b/charts/etcd-operator/templates/rbac.yaml @@ -0,0 +1,106 @@ +# RBAC for the operator. The manager ClusterRole's rules are generated from the +# +kubebuilder:rbac markers into files/manager-role-rules.yaml (by `make +# manifests`) and pulled in below — a single source of truth, guarded by +# ci.yml's codegen-drift gate. 
The release-scoped name, labels, binding, and the +# leader-election / proxy roles stay templated here. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "etcd-operator.fullname" . }}-manager-role + labels: + {{- include "etcd-operator.labels" . | nindent 4 }} +rules: + {{- .Files.Get "files/manager-role-rules.yaml" | nindent 2 }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "etcd-operator.fullname" . }}-manager-rolebinding + labels: + {{- include "etcd-operator.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ include "etcd-operator.fullname" . }}-manager-role +subjects: +- kind: ServiceAccount + name: {{ include "etcd-operator.serviceAccountName" . }} + namespace: {{ .Release.Namespace }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: {{ include "etcd-operator.fullname" . }}-leader-election-role + namespace: {{ .Release.Namespace }} + labels: + {{- include "etcd-operator.labels" . | nindent 4 }} +rules: +- apiGroups: [""] + resources: [configmaps] + verbs: [get, list, watch, create, update, patch, delete] +- apiGroups: [coordination.k8s.io] + resources: [leases] + verbs: [get, list, watch, create, update, patch, delete] +- apiGroups: [""] + resources: [events] + verbs: [create, patch] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ include "etcd-operator.fullname" . }}-leader-election-rolebinding + namespace: {{ .Release.Namespace }} + labels: + {{- include "etcd-operator.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: {{ include "etcd-operator.fullname" . }}-leader-election-role +subjects: +- kind: ServiceAccount + name: {{ include "etcd-operator.serviceAccountName" . 
}} + namespace: {{ .Release.Namespace }} +{{- if .Values.kubeRbacProxy.enabled }} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "etcd-operator.fullname" . }}-proxy-role + labels: + {{- include "etcd-operator.labels" . | nindent 4 }} +rules: +- apiGroups: [authentication.k8s.io] + resources: [tokenreviews] + verbs: [create] +- apiGroups: [authorization.k8s.io] + resources: [subjectaccessreviews] + verbs: [create] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "etcd-operator.fullname" . }}-proxy-rolebinding + labels: + {{- include "etcd-operator.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ include "etcd-operator.fullname" . }}-proxy-role +subjects: +- kind: ServiceAccount + name: {{ include "etcd-operator.serviceAccountName" . }} + namespace: {{ .Release.Namespace }} +--- +# Convenience role granting GET on /metrics through the proxy (e.g. for a +# Prometheus scrape identity). Folded in from config/rbac's metrics-reader; +# the chart defines it but does not bind it. +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "etcd-operator.fullname" . }}-metrics-reader + labels: + {{- include "etcd-operator.labels" . | nindent 4 }} +rules: +- nonResourceURLs: ["/metrics"] + verbs: [get] +{{- end }} diff --git a/charts/etcd-operator/templates/serviceaccount.yaml b/charts/etcd-operator/templates/serviceaccount.yaml new file mode 100644 index 00000000..9efde7c4 --- /dev/null +++ b/charts/etcd-operator/templates/serviceaccount.yaml @@ -0,0 +1,13 @@ +{{- if .Values.serviceAccount.create }} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "etcd-operator.serviceAccountName" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "etcd-operator.labels" . | nindent 4 }} + {{- with .Values.serviceAccount.annotations }} + annotations: + {{- toYaml . 
| nindent 4 }} + {{- end }} +{{- end }} diff --git a/charts/etcd-operator/templates/servicemonitor.yaml b/charts/etcd-operator/templates/servicemonitor.yaml new file mode 100644 index 00000000..f8c0e6a1 --- /dev/null +++ b/charts/etcd-operator/templates/servicemonitor.yaml @@ -0,0 +1,31 @@ +{{- /* +prometheus-operator ServiceMonitor for the metrics endpoint (folded in from the +old config/prometheus). Off by default: it needs the monitoring.coreos.com CRDs +installed, and it scrapes the kube-rbac-proxy https port, so it only makes sense +when kubeRbacProxy is enabled. +*/ -}} +{{- if .Values.metrics.serviceMonitor.enabled }} +{{- if not .Values.kubeRbacProxy.enabled }} +{{- fail "metrics.serviceMonitor.enabled requires kubeRbacProxy.enabled (the ServiceMonitor scrapes the proxy's https metrics port)" }} +{{- end }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "etcd-operator.fullname" . }}-metrics + namespace: {{ .Release.Namespace }} + labels: + {{- include "etcd-operator.labels" . | nindent 4 }} + control-plane: controller-manager +spec: + endpoints: + - path: /metrics + port: https + scheme: https + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + tlsConfig: + insecureSkipVerify: true + selector: + matchLabels: + {{- include "etcd-operator.selectorLabels" . | nindent 6 }} + control-plane: controller-manager +{{- end }} diff --git a/charts/etcd-operator/values.yaml b/charts/etcd-operator/values.yaml new file mode 100644 index 00000000..fb68d85c --- /dev/null +++ b/charts/etcd-operator/values.yaml @@ -0,0 +1,97 @@ +# Default values for etcd-operator. + +crds: + # -- Render the CRDs as part of the release. Templated (not Helm's install-only + # crds/ dir) so `helm upgrade` keeps them current with the API types. + enabled: true + # -- Annotate CRDs with helm.sh/resource-policy: keep so `helm uninstall` leaves + # them in place. 
Strongly recommended: deleting the CRDs cascade-deletes every + # EtcdCluster and its data. + keep: true + +# -- Render a Namespace object. Off by default (real `helm install` uses +# --create-namespace); build-dist-manifests turns it on so the rendered +# kubectl-apply manifest is self-contained. +namespace: + create: false + +metrics: + serviceMonitor: + # -- Create a prometheus-operator ServiceMonitor for the metrics endpoint. + # Requires the monitoring.coreos.com CRDs and kubeRbacProxy.enabled. + enabled: false + +image: + # -- Operator image repository. The published image is ghcr.io/cozystack/etcd-operator. + repository: ghcr.io/cozystack/etcd-operator + # -- Image tag. Defaults to the chart's appVersion (set to the release tag by the pipeline). + tag: "" + # -- Image pull policy. + pullPolicy: IfNotPresent + +# -- Number of operator replicas (leader election picks the active one). +replicaCount: 1 + +# -- Image pull secrets for private registries. +imagePullSecrets: [] + +# -- Override the chart name portion of resource names. +nameOverride: "" +# -- Override the full resource-name prefix. +fullnameOverride: "" + +serviceAccount: + # -- Create the operator ServiceAccount. + create: true + # -- Name of an existing ServiceAccount to use when create is false. Required + # in that case — the operator's ClusterRole is bound to this name. + name: "" + # -- Extra annotations for the ServiceAccount. + annotations: {} + +# -- Extra annotations for the operator Pod. +podAnnotations: {} +# -- Extra labels for the operator Pod. +podLabels: {} +# -- Node selector for the operator Pod. +nodeSelector: {} +# -- Tolerations for the operator Pod. +tolerations: [] +# -- Affinity for the operator Pod. +affinity: {} + +manager: + # -- Resource requests/limits for the manager container. + resources: + limits: + cpu: 500m + memory: 128Mi + requests: + cpu: 10m + memory: 64Mi + +# kube-rbac-proxy fronts the manager's /metrics endpoint with SubjectAccessReview +# authz. 
Disable to expose metrics without the proxy (the manager then binds +# metrics on :8080 directly). +kubeRbacProxy: + # -- Run the kube-rbac-proxy metrics sidecar. + enabled: true + image: + # -- kube-rbac-proxy image repository (upstream's GitHub org; the old gcr.io/kubebuilder one is gone). + repository: ghcr.io/kube-rbac-proxy/kube-rbac-proxy + # -- kube-rbac-proxy image tag. + tag: v0.22.0 + # -- kube-rbac-proxy image pull policy. + pullPolicy: IfNotPresent + # -- Resource requests/limits for the proxy container. + resources: + limits: + cpu: 500m + memory: 128Mi + requests: + cpu: 5m + memory: 64Mi + +metricsService: + # -- Port the metrics Service exposes (proxy https port). + port: 8443 diff --git a/cmd/etcd-migrate/config.go b/cmd/etcd-migrate/config.go index 8b65d152..3c0a8b29 100644 --- a/cmd/etcd-migrate/config.go +++ b/cmd/etcd-migrate/config.go @@ -21,10 +21,15 @@ import ( "k8s.io/client-go/util/homedir" ) -// defaultControllerRef is where both this repo's kustomize config and the -// legacy repo's deploy the controller; the two generations share the name, -// so a single Deployment commonly answers both checks. -const defaultControllerRef = "etcd-operator-system/etcd-operator-controller-manager" +// defaultLegacyControllerRef is where the legacy v1alpha1 repo's kustomize +// install deploys its controller. +const defaultLegacyControllerRef = "etcd-operator-system/etcd-operator-controller-manager" + +// defaultNewControllerRef is where this repo's Helm chart deploys the operator — +// release name "etcd-operator" in the etcd-operator-system namespace. (The +// generations no longer share a name: kustomize named it +// etcd-operator-controller-manager; the chart names it after the release.) +const defaultNewControllerRef = "etcd-operator-system/etcd-operator" // Config holds every flag of the migrate CLI. type Config struct { @@ -72,8 +77,8 @@ func bindFlags(cmd *cobra.Command, cfg *Config) { f.BoolVar(&cfg.Apply, "apply", false, "Execute the adoption. 
Without it the tool only prints the plan (dry-run).") f.BoolVarP(&cfg.Yes, "yes", "y", false, "Skip the interactive confirmation before --apply mutates the cluster") f.BoolVar(&cfg.SkipControllerCheck, "skip-controller-check", false, "Skip verifying that both operator Deployments are scaled down") - f.StringVar(&cfg.LegacyController, "legacy-controller", defaultControllerRef, "Legacy operator Deployment as namespace/name") - f.StringVar(&cfg.NewController, "new-controller", defaultControllerRef, "New operator Deployment as namespace/name") + f.StringVar(&cfg.LegacyController, "legacy-controller", defaultLegacyControllerRef, "Legacy operator Deployment as namespace/name") + f.StringVar(&cfg.NewController, "new-controller", defaultNewControllerRef, "New operator Deployment as namespace/name") f.StringVar(&cfg.Version, "version", "", "etcd version (X.Y.Z) to set on every migrated cluster, overriding image-tag extraction") f.StringVar(&cfg.AuthSecret, "auth-secret", "", "Existing kubernetes.io/basic-auth Secret (in each cluster's namespace) to reference for clusters with enableAuth; default generates one per cluster") diff --git a/cmd/etcd-migrate/main.go b/cmd/etcd-migrate/main.go index d9a2eee4..171793bb 100644 --- a/cmd/etcd-migrate/main.go +++ b/cmd/etcd-migrate/main.go @@ -38,6 +38,10 @@ import ( "github.com/cozystack/etcd-operator/internal/migrate" ) +// version is stamped at build time via -ldflags "-X main.version=" +// (see the Makefile's CLI_LDFLAGS); "dev" for un-stamped local builds. +var version = "dev" + func main() { cfg := &Config{} rootCmd := &cobra.Command{ @@ -66,6 +70,16 @@ explicit --skip-backup.`, }, } bindFlags(rootCmd, cfg) + // A `version` subcommand rather than a --version flag: --version is already + // taken by the etcd-version override (see bindFlags). 
+ rootCmd.AddCommand(&cobra.Command{ + Use: "version", + Short: "Print the etcd-migrate binary version", + Args: cobra.NoArgs, + Run: func(cmd *cobra.Command, _ []string) { + _, _ = fmt.Fprintln(cmd.OutOrStdout(), version) + }, + }) if err := rootCmd.Execute(); err != nil { os.Exit(1) } diff --git a/cmd/kubectl-etcd/main.go b/cmd/kubectl-etcd/main.go index c1328371..ec2d62d6 100644 --- a/cmd/kubectl-etcd/main.go +++ b/cmd/kubectl-etcd/main.go @@ -27,11 +27,16 @@ import ( "github.com/cozystack/etcd-operator/internal/portforward" ) +// version is stamped at build time via -ldflags "-X main.version=" +// (see the Makefile's CLI_LDFLAGS); "dev" for un-stamped local builds. +var version = "dev" + func main() { var rootCmd = &cobra.Command{ - Use: "kubectl-etcd", - Short: "Kubectl etcd plugin", - Long: `Manage etcd pods spawned by etcd-operator`, + Use: "kubectl-etcd", + Version: version, + Short: "Kubectl etcd plugin", + Long: `Manage etcd pods spawned by etcd-operator`, // Subcommands report failures by returning an error (RunE); a runtime // failure is not a usage error, so don't dump the help text for it. SilenceUsage: true, diff --git a/config/crd/kustomization.yaml b/config/crd/kustomization.yaml deleted file mode 100644 index 82de31a7..00000000 --- a/config/crd/kustomization.yaml +++ /dev/null @@ -1,23 +0,0 @@ -# This kustomization.yaml is not intended to be run by itself, -# since it depends on service name and namespace that are out of this kustomize package. -# It should be run by config/default -resources: -- bases/etcd-operator.cozystack.io_etcdclusters.yaml -- bases/etcd-operator.cozystack.io_etcdmembers.yaml -- bases/etcd-operator.cozystack.io_etcdsnapshots.yaml -#+kubebuilder:scaffold:crdkustomizeresource - -patchesStrategicMerge: -# [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix. 
-# patches here are for enabling the conversion webhook for each CRD -#- patches/webhook_in_etcdclusters.yaml -#+kubebuilder:scaffold:crdkustomizewebhookpatch - -# [CERTMANAGER] To enable cert-manager, uncomment all the sections with [CERTMANAGER] prefix. -# patches here are for enabling the CA injection for each CRD -#- patches/cainjection_in_etcdclusters.yaml -#+kubebuilder:scaffold:crdkustomizecainjectionpatch - -# the following config is for teaching kustomize how to do kustomization for CRDs. -configurations: -- kustomizeconfig.yaml diff --git a/config/crd/kustomizeconfig.yaml b/config/crd/kustomizeconfig.yaml deleted file mode 100644 index ec5c150a..00000000 --- a/config/crd/kustomizeconfig.yaml +++ /dev/null @@ -1,19 +0,0 @@ -# This file is for teaching kustomize how to substitute name and namespace reference in CRD -nameReference: -- kind: Service - version: v1 - fieldSpecs: - - kind: CustomResourceDefinition - version: v1 - group: apiextensions.k8s.io - path: spec/conversion/webhook/clientConfig/service/name - -namespace: -- kind: CustomResourceDefinition - version: v1 - group: apiextensions.k8s.io - path: spec/conversion/webhook/clientConfig/service/namespace - create: false - -varReference: -- path: metadata/annotations diff --git a/config/crd/patches/cainjection_in_etcdclusters.yaml b/config/crd/patches/cainjection_in_etcdclusters.yaml deleted file mode 100644 index 73f9aa37..00000000 --- a/config/crd/patches/cainjection_in_etcdclusters.yaml +++ /dev/null @@ -1,7 +0,0 @@ -# The following patch adds a directive for certmanager to inject CA into the CRD -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - annotations: - cert-manager.io/inject-ca-from: $(CERTIFICATE_NAMESPACE)/$(CERTIFICATE_NAME) - name: etcdclusters.etcd-operator.cozystack.io diff --git a/config/crd/patches/webhook_in_etcdclusters.yaml b/config/crd/patches/webhook_in_etcdclusters.yaml deleted file mode 100644 index 24aa8134..00000000 --- 
a/config/crd/patches/webhook_in_etcdclusters.yaml +++ /dev/null @@ -1,16 +0,0 @@ -# The following patch enables a conversion webhook for the CRD -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - name: etcdclusters.etcd-operator.cozystack.io -spec: - conversion: - strategy: Webhook - webhook: - clientConfig: - service: - namespace: system - name: webhook-service - path: /convert - conversionReviewVersions: - - v1 diff --git a/config/default/kustomization.yaml b/config/default/kustomization.yaml deleted file mode 100644 index 5edb3c1e..00000000 --- a/config/default/kustomization.yaml +++ /dev/null @@ -1,88 +0,0 @@ -# Adds namespace to all resources. -namespace: etcd-operator-system - -# Value of this field is prepended to the -# names of all resources, e.g. a deployment named -# "wordpress" becomes "alices-wordpress". -# Note that it should also match with the prefix (text before '-') of the namespace -# field above. -namePrefix: etcd-operator- - -# Labels to add to all resources and selectors. -#commonLabels: -# someName: someValue - -bases: -- ../crd -- ../rbac -- ../manager -# [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in -# crd/kustomization.yaml -#- ../webhook -# [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER'. 'WEBHOOK' components are required. -#- ../certmanager -# [PROMETHEUS] To enable prometheus monitor, uncomment all sections with 'PROMETHEUS'. -#- ../prometheus - -patchesStrategicMerge: -# Protect the /metrics endpoint by putting it behind auth. -# If you want your controller-manager to expose the /metrics -# endpoint w/o any authn/z, please comment the following line. -- manager_auth_proxy_patch.yaml - -# Keep the manager's OPERATOR_IMAGE env in sync with its own container image, -# so the snapshot Job and restore init container always run the deployed -# operator image. 
`make deploy IMG=...` rewrites the image; this copies that -# value into the env var. -replacements: -- source: - kind: Deployment - name: controller-manager - fieldPath: spec.template.spec.containers.[name=manager].image - targets: - - select: - kind: Deployment - name: controller-manager - fieldPaths: - - spec.template.spec.containers.[name=manager].env.[name=OPERATOR_IMAGE].value - - - -# [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in -# crd/kustomization.yaml -#- manager_webhook_patch.yaml - -# [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER'. -# Uncomment 'CERTMANAGER' sections in crd/kustomization.yaml to enable the CA injection in the admission webhooks. -# 'CERTMANAGER' needs to be enabled to use ca injection -#- webhookcainjection_patch.yaml - -# the following config is for teaching kustomize how to do var substitution -vars: -# [CERTMANAGER] To enable cert-manager, uncomment all sections with 'CERTMANAGER' prefix. 
-#- name: CERTIFICATE_NAMESPACE # namespace of the certificate CR -# objref: -# kind: Certificate -# group: cert-manager.io -# version: v1 -# name: serving-cert # this name should match the one in certificate.yaml -# fieldref: -# fieldpath: metadata.namespace -#- name: CERTIFICATE_NAME -# objref: -# kind: Certificate -# group: cert-manager.io -# version: v1 -# name: serving-cert # this name should match the one in certificate.yaml -#- name: SERVICE_NAMESPACE # namespace of the service -# objref: -# kind: Service -# version: v1 -# name: webhook-service -# fieldref: -# fieldpath: metadata.namespace -#- name: SERVICE_NAME -# objref: -# kind: Service -# version: v1 -# name: webhook-service diff --git a/config/default/manager_auth_proxy_patch.yaml b/config/default/manager_auth_proxy_patch.yaml deleted file mode 100644 index 61c1328d..00000000 --- a/config/default/manager_auth_proxy_patch.yaml +++ /dev/null @@ -1,58 +0,0 @@ -# This patch inject a sidecar container which is a HTTP proxy for the -# controller manager, it performs RBAC authorization against the Kubernetes API using SubjectAccessReviews. -apiVersion: apps/v1 -kind: Deployment -metadata: - name: controller-manager - namespace: system -spec: - template: - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: kubernetes.io/arch - operator: In - values: - - amd64 - - arm64 - - ppc64le - - s390x - - key: kubernetes.io/os - operator: In - values: - - linux - containers: - - name: kube-rbac-proxy - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - "ALL" - # gcr.io/kubebuilder/kube-rbac-proxy is gone (the gcr.io/kubebuilder - # registry shut down in early 2025); upstream now releases under its - # own GitHub org at ghcr.io/kube-rbac-proxy/kube-rbac-proxy. 
- image: ghcr.io/kube-rbac-proxy/kube-rbac-proxy:v0.22.0 - args: - - "--secure-listen-address=0.0.0.0:8443" - - "--upstream=http://127.0.0.1:8080/" - - "--logtostderr=true" - - "--v=0" - ports: - - containerPort: 8443 - protocol: TCP - name: https - resources: - limits: - cpu: 500m - memory: 128Mi - requests: - cpu: 5m - memory: 64Mi - - name: manager - args: - - "--health-probe-bind-address=:8081" - - "--metrics-bind-address=127.0.0.1:8080" - - "--leader-elect" diff --git a/config/default/manager_config_patch.yaml b/config/default/manager_config_patch.yaml deleted file mode 100644 index f6f58916..00000000 --- a/config/default/manager_config_patch.yaml +++ /dev/null @@ -1,10 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: controller-manager - namespace: system -spec: - template: - spec: - containers: - - name: manager diff --git a/config/manager/kustomization.yaml b/config/manager/kustomization.yaml deleted file mode 100644 index 5c5f0b84..00000000 --- a/config/manager/kustomization.yaml +++ /dev/null @@ -1,2 +0,0 @@ -resources: -- manager.yaml diff --git a/config/manager/manager.yaml b/config/manager/manager.yaml deleted file mode 100644 index b3558dc6..00000000 --- a/config/manager/manager.yaml +++ /dev/null @@ -1,117 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - labels: - control-plane: controller-manager - app.kubernetes.io/name: namespace - app.kubernetes.io/instance: system - app.kubernetes.io/component: manager - app.kubernetes.io/created-by: etcd-operator - app.kubernetes.io/part-of: etcd-operator - app.kubernetes.io/managed-by: kustomize - name: system ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: controller-manager - namespace: system - labels: - control-plane: controller-manager - app.kubernetes.io/name: deployment - app.kubernetes.io/instance: controller-manager - app.kubernetes.io/component: manager - app.kubernetes.io/created-by: etcd-operator - app.kubernetes.io/part-of: etcd-operator - 
app.kubernetes.io/managed-by: kustomize -spec: - selector: - matchLabels: - control-plane: controller-manager - replicas: 1 - template: - metadata: - annotations: - kubectl.kubernetes.io/default-container: manager - labels: - control-plane: controller-manager - spec: - # TODO(user): Uncomment the following code to configure the nodeAffinity expression - # according to the platforms which are supported by your solution. - # It is considered best practice to support multiple architectures. You can - # build your manager image using the makefile target docker-buildx. - # affinity: - # nodeAffinity: - # requiredDuringSchedulingIgnoredDuringExecution: - # nodeSelectorTerms: - # - matchExpressions: - # - key: kubernetes.io/arch - # operator: In - # values: - # - amd64 - # - arm64 - # - ppc64le - # - s390x - # - key: kubernetes.io/os - # operator: In - # values: - # - linux - securityContext: - runAsNonRoot: true - # TODO(user): For common cases that do not require escalating privileges - # it is recommended to ensure that all your Pods/Containers are restrictive. - # More info: https://kubernetes.io/docs/concepts/security/pod-security-standards/#restricted - # Please uncomment the following code if your project does NOT have to work on old Kubernetes - # versions < 1.19 or on vendors versions which do NOT support this field by default (i.e. Openshift < 4.11 ). - # seccompProfile: - # type: RuntimeDefault - containers: - - command: - - /manager - args: - - --leader-elect - image: controller:latest - name: manager - # OPERATOR_IMAGE must equal this container's own image: the snapshot Job - # and the restore init container run the operator image as the agent. - # The value here is a placeholder kept in sync with the `image:` field - # by a kustomize replacement in config/default; `make deploy IMG=...` - # propagates the real ref automatically. The flag --operator-image - # defaults to this env var. 
- # - # WARNING: if you apply config/ directly (or via a kustomize base that - # does NOT run that image replacement), this stays "controller:latest". - # The operator refuses to start on that placeholder (it would otherwise - # make every snapshot/restore Pod ImagePullBackOff), exiting with a clear - # error — override OPERATOR_IMAGE (and image:) to the real operator ref. - env: - - name: OPERATOR_IMAGE - value: controller:latest - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - "ALL" - livenessProbe: - httpGet: - path: /healthz - port: 8081 - initialDelaySeconds: 15 - periodSeconds: 20 - readinessProbe: - httpGet: - path: /readyz - port: 8081 - initialDelaySeconds: 5 - periodSeconds: 10 - # TODO(user): Configure the resources accordingly based on the project requirements. - # More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ - resources: - limits: - cpu: 500m - memory: 128Mi - requests: - cpu: 10m - memory: 64Mi - serviceAccountName: controller-manager - terminationGracePeriodSeconds: 10 diff --git a/config/prometheus/kustomization.yaml b/config/prometheus/kustomization.yaml deleted file mode 100644 index ed137168..00000000 --- a/config/prometheus/kustomization.yaml +++ /dev/null @@ -1,2 +0,0 @@ -resources: -- monitor.yaml diff --git a/config/prometheus/monitor.yaml b/config/prometheus/monitor.yaml deleted file mode 100644 index 70b780ba..00000000 --- a/config/prometheus/monitor.yaml +++ /dev/null @@ -1,26 +0,0 @@ - -# Prometheus Monitor Service (Metrics) -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - labels: - control-plane: controller-manager - app.kubernetes.io/name: servicemonitor - app.kubernetes.io/instance: controller-manager-metrics-monitor - app.kubernetes.io/component: metrics - app.kubernetes.io/created-by: etcd-operator - app.kubernetes.io/part-of: etcd-operator - app.kubernetes.io/managed-by: kustomize - name: controller-manager-metrics-monitor - 
namespace: system -spec: - endpoints: - - path: /metrics - port: https - scheme: https - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - tlsConfig: - insecureSkipVerify: true - selector: - matchLabels: - control-plane: controller-manager diff --git a/config/rbac/auth_proxy_client_clusterrole.yaml b/config/rbac/auth_proxy_client_clusterrole.yaml deleted file mode 100644 index 71f03b57..00000000 --- a/config/rbac/auth_proxy_client_clusterrole.yaml +++ /dev/null @@ -1,16 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - app.kubernetes.io/name: clusterrole - app.kubernetes.io/instance: metrics-reader - app.kubernetes.io/component: kube-rbac-proxy - app.kubernetes.io/created-by: etcd-operator - app.kubernetes.io/part-of: etcd-operator - app.kubernetes.io/managed-by: kustomize - name: metrics-reader -rules: -- nonResourceURLs: - - "/metrics" - verbs: - - get diff --git a/config/rbac/auth_proxy_role.yaml b/config/rbac/auth_proxy_role.yaml deleted file mode 100644 index eab3a381..00000000 --- a/config/rbac/auth_proxy_role.yaml +++ /dev/null @@ -1,24 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - app.kubernetes.io/name: clusterrole - app.kubernetes.io/instance: proxy-role - app.kubernetes.io/component: kube-rbac-proxy - app.kubernetes.io/created-by: etcd-operator - app.kubernetes.io/part-of: etcd-operator - app.kubernetes.io/managed-by: kustomize - name: proxy-role -rules: -- apiGroups: - - authentication.k8s.io - resources: - - tokenreviews - verbs: - - create -- apiGroups: - - authorization.k8s.io - resources: - - subjectaccessreviews - verbs: - - create diff --git a/config/rbac/auth_proxy_role_binding.yaml b/config/rbac/auth_proxy_role_binding.yaml deleted file mode 100644 index a33b0e8d..00000000 --- a/config/rbac/auth_proxy_role_binding.yaml +++ /dev/null @@ -1,19 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - labels: - 
app.kubernetes.io/name: clusterrolebinding - app.kubernetes.io/instance: proxy-rolebinding - app.kubernetes.io/component: kube-rbac-proxy - app.kubernetes.io/created-by: etcd-operator - app.kubernetes.io/part-of: etcd-operator - app.kubernetes.io/managed-by: kustomize - name: proxy-rolebinding -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: proxy-role -subjects: -- kind: ServiceAccount - name: controller-manager - namespace: system diff --git a/config/rbac/auth_proxy_service.yaml b/config/rbac/auth_proxy_service.yaml deleted file mode 100644 index 79609e81..00000000 --- a/config/rbac/auth_proxy_service.yaml +++ /dev/null @@ -1,21 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - labels: - control-plane: controller-manager - app.kubernetes.io/name: service - app.kubernetes.io/instance: controller-manager-metrics-service - app.kubernetes.io/component: kube-rbac-proxy - app.kubernetes.io/created-by: etcd-operator - app.kubernetes.io/part-of: etcd-operator - app.kubernetes.io/managed-by: kustomize - name: controller-manager-metrics-service - namespace: system -spec: - ports: - - name: https - port: 8443 - protocol: TCP - targetPort: https - selector: - control-plane: controller-manager diff --git a/config/rbac/etcdcluster_editor_role.yaml b/config/rbac/etcdcluster_editor_role.yaml deleted file mode 100644 index 17e4e2b9..00000000 --- a/config/rbac/etcdcluster_editor_role.yaml +++ /dev/null @@ -1,31 +0,0 @@ -# permissions for end users to edit etcdclusters. 
-apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - app.kubernetes.io/name: clusterrole - app.kubernetes.io/instance: etcdcluster-editor-role - app.kubernetes.io/component: rbac - app.kubernetes.io/created-by: etcd-operator - app.kubernetes.io/part-of: etcd-operator - app.kubernetes.io/managed-by: kustomize - name: etcdcluster-editor-role -rules: -- apiGroups: - - etcd-operator.cozystack.io - resources: - - etcdclusters - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - etcd-operator.cozystack.io - resources: - - etcdclusters/status - verbs: - - get diff --git a/config/rbac/etcdcluster_viewer_role.yaml b/config/rbac/etcdcluster_viewer_role.yaml deleted file mode 100644 index 260c8de1..00000000 --- a/config/rbac/etcdcluster_viewer_role.yaml +++ /dev/null @@ -1,27 +0,0 @@ -# permissions for end users to view etcdclusters. -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - app.kubernetes.io/name: clusterrole - app.kubernetes.io/instance: etcdcluster-viewer-role - app.kubernetes.io/component: rbac - app.kubernetes.io/created-by: etcd-operator - app.kubernetes.io/part-of: etcd-operator - app.kubernetes.io/managed-by: kustomize - name: etcdcluster-viewer-role -rules: -- apiGroups: - - etcd-operator.cozystack.io - resources: - - etcdclusters - verbs: - - get - - list - - watch -- apiGroups: - - etcd-operator.cozystack.io - resources: - - etcdclusters/status - verbs: - - get diff --git a/config/rbac/kustomization.yaml b/config/rbac/kustomization.yaml deleted file mode 100644 index 731832a6..00000000 --- a/config/rbac/kustomization.yaml +++ /dev/null @@ -1,18 +0,0 @@ -resources: -# All RBAC will be applied under this service account in -# the deployment namespace. You may comment out this resource -# if your manager will use a service account that exists at -# runtime. 
Be sure to update RoleBinding and ClusterRoleBinding -# subjects if changing service account names. -- service_account.yaml -- role.yaml -- role_binding.yaml -- leader_election_role.yaml -- leader_election_role_binding.yaml -# Comment the following 4 lines if you want to disable -# the auth proxy (https://github.com/brancz/kube-rbac-proxy) -# which protects your /metrics endpoint. -- auth_proxy_service.yaml -- auth_proxy_role.yaml -- auth_proxy_role_binding.yaml -- auth_proxy_client_clusterrole.yaml diff --git a/config/rbac/leader_election_role.yaml b/config/rbac/leader_election_role.yaml deleted file mode 100644 index 879932e2..00000000 --- a/config/rbac/leader_election_role.yaml +++ /dev/null @@ -1,44 +0,0 @@ -# permissions to do leader election. -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - labels: - app.kubernetes.io/name: role - app.kubernetes.io/instance: leader-election-role - app.kubernetes.io/component: rbac - app.kubernetes.io/created-by: etcd-operator - app.kubernetes.io/part-of: etcd-operator - app.kubernetes.io/managed-by: kustomize - name: leader-election-role -rules: -- apiGroups: - - "" - resources: - - configmaps - verbs: - - get - - list - - watch - - create - - update - - patch - - delete -- apiGroups: - - coordination.k8s.io - resources: - - leases - verbs: - - get - - list - - watch - - create - - update - - patch - - delete -- apiGroups: - - "" - resources: - - events - verbs: - - create - - patch diff --git a/config/rbac/leader_election_role_binding.yaml b/config/rbac/leader_election_role_binding.yaml deleted file mode 100644 index 53c67521..00000000 --- a/config/rbac/leader_election_role_binding.yaml +++ /dev/null @@ -1,19 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - labels: - app.kubernetes.io/name: rolebinding - app.kubernetes.io/instance: leader-election-rolebinding - app.kubernetes.io/component: rbac - app.kubernetes.io/created-by: etcd-operator - app.kubernetes.io/part-of: 
etcd-operator - app.kubernetes.io/managed-by: kustomize - name: leader-election-rolebinding -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: leader-election-role -subjects: -- kind: ServiceAccount - name: controller-manager - namespace: system diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml deleted file mode 100644 index 910c0b04..00000000 --- a/config/rbac/role.yaml +++ /dev/null @@ -1,126 +0,0 @@ ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: manager-role -rules: -- apiGroups: - - "" - resources: - - persistentvolumeclaims - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - "" - resources: - - pods - verbs: - - create - - delete - - get - - list - - patch - - watch -- apiGroups: - - "" - resources: - - pods/log - verbs: - - get -- apiGroups: - - "" - resources: - - secrets - verbs: - - get - - list - - watch -- apiGroups: - - "" - resources: - - services - verbs: - - create - - get - - list - - patch - - update - - watch -- apiGroups: - - batch - resources: - - jobs - verbs: - - create - - delete - - get - - list - - watch -- apiGroups: - - cert-manager.io - resources: - - certificates - verbs: - - create - - get - - list - - patch - - update - - watch -- apiGroups: - - etcd-operator.cozystack.io - resources: - - etcdclusters - verbs: - - get - - list - - watch -- apiGroups: - - etcd-operator.cozystack.io - resources: - - etcdclusters/finalizers - - etcdmembers/finalizers - - etcdsnapshots/finalizers - verbs: - - update -- apiGroups: - - etcd-operator.cozystack.io - resources: - - etcdclusters/status - - etcdmembers/status - - etcdsnapshots/status - verbs: - - get - - patch - - update -- apiGroups: - - etcd-operator.cozystack.io - resources: - - etcdmembers - - etcdsnapshots - verbs: - - create - - delete - - get - - list - - patch - - update - - watch -- apiGroups: - - policy - resources: - - poddisruptionbudgets - verbs: - - create - - delete - - 
get - - list - - patch - - update - - watch diff --git a/config/rbac/role_binding.yaml b/config/rbac/role_binding.yaml deleted file mode 100644 index 966b6e66..00000000 --- a/config/rbac/role_binding.yaml +++ /dev/null @@ -1,19 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - labels: - app.kubernetes.io/name: clusterrolebinding - app.kubernetes.io/instance: manager-rolebinding - app.kubernetes.io/component: rbac - app.kubernetes.io/created-by: etcd-operator - app.kubernetes.io/part-of: etcd-operator - app.kubernetes.io/managed-by: kustomize - name: manager-rolebinding -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: manager-role -subjects: -- kind: ServiceAccount - name: controller-manager - namespace: system diff --git a/config/rbac/service_account.yaml b/config/rbac/service_account.yaml deleted file mode 100644 index 092beca0..00000000 --- a/config/rbac/service_account.yaml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: v1 -kind: ServiceAccount -metadata: - labels: - app.kubernetes.io/name: serviceaccount - app.kubernetes.io/instance: controller-manager - app.kubernetes.io/component: rbac - app.kubernetes.io/created-by: etcd-operator - app.kubernetes.io/part-of: etcd-operator - app.kubernetes.io/managed-by: kustomize - name: controller-manager - namespace: system diff --git a/config/samples/kustomization.yaml b/config/samples/kustomization.yaml deleted file mode 100644 index b32c636e..00000000 --- a/config/samples/kustomization.yaml +++ /dev/null @@ -1,4 +0,0 @@ -## Append samples you want in your CSV to this file as resources ## -resources: -- _v1alpha2_etcdcluster.yaml -#+kubebuilder:scaffold:manifestskustomizesamples diff --git a/docs/installation.md b/docs/installation.md index 1fc420ff..73d3378b 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -15,56 +15,145 @@ For the operator's runtime behaviour see [concepts](concepts.md); for day-2 oper Workload-side: every etcd Pod runs as 
UID 65532 with `runAsNonRoot=true`, `allowPrivilegeEscalation=false`, all capabilities dropped, and `seccompProfile=RuntimeDefault`. The Pods comply with the `restricted` PodSecurity profile. If your cluster enforces a stricter policy, see `controllers/etcdmember_controller.go`'s `buildPod` for the exact security context the operator emits and adjust accordingly. -## Quick deploy +## Install from a release -The repo's Makefile drives a complete install. From a checkout: +Tagged releases publish a signed multi-arch operator image to GHCR and attach +ready-to-apply install manifests to the GitHub release — no checkout, no build, +no registry of your own. This is the recommended path for consuming a release. ```sh -# 1. Install the CRDs cluster-wide. -make install +# Pick a released version (see https://github.com/cozystack/etcd-operator/releases). +VERSION=v0.5.0 -# 2. Build the operator image (or skip to a prebuilt registry tag). -make docker-build docker-push IMG=/etcd-operator: +# Everything (CRDs + namespace + RBAC + controller Deployment + Service): +kubectl apply -f https://github.com/cozystack/etcd-operator/releases/download/$VERSION/etcd-operator.yaml +``` -# 3. Deploy the operator (creates the etcd-operator-system namespace, -# ClusterRole/Binding, controller Deployment, metrics service). -make deploy IMG=/etcd-operator: +The manifest already points the manager (and its `OPERATOR_IMAGE`, used for +snapshot/restore Pods) at `ghcr.io/cozystack/etcd-operator:$VERSION` — the same +tag whose image the release published, so there is nothing to substitute. + +If you split CRDs from the rest (e.g. 
CRDs are applied by a separate +cluster-admin step, or server-side-applied to dodge the annotation size limit): + +```sh +kubectl apply --server-side -f https://github.com/cozystack/etcd-operator/releases/download/$VERSION/etcd-operator.crds.yaml +kubectl apply -f https://github.com/cozystack/etcd-operator/releases/download/$VERSION/etcd-operator.non-crds.yaml ``` -The cluster must be able to pull from ``. For local clusters (`kind` / `minikube` / `k3d`), either sideload the image (`kind load docker-image ...`) or push to an ephemeral registry the cluster can reach (e.g. `ttl.sh/:1h`); otherwise the operator Deployment goes `ImagePullBackOff` with no clear hint from the operator side. +The image is cosign-signed (keyless). To verify before deploying: + +```sh +cosign verify ghcr.io/cozystack/etcd-operator:$VERSION \ + --certificate-identity-regexp 'https://github.com/cozystack/etcd-operator/.github/workflows/.+' \ + --certificate-oidc-issuer https://token.actions.githubusercontent.com +``` -By default this lands in the `etcd-operator-system` namespace. The deployment name is `etcd-operator-controller-manager`. Verify: +To pull a prebuilt image without the release manifests (e.g. to feed your own +overlay), the image ref is `ghcr.io/cozystack/etcd-operator:<version>`. + +## Install with Helm + +Helm is the primary install path: the chart is the single source of truth for +the CRDs, RBAC, and the manager Deployment (the release manifests above are just +`helm template` of this same chart). Tagged releases publish it as an OCI Helm +chart to GHCR (`ghcr.io/cozystack/charts/etcd-operator`), versioned from the +same tag — the chart version is the tag without the leading `v`, and +`appVersion` keeps the `v`. The CRDs are generated straight into the chart and +templated into the release.
```sh -kubectl get pod -n etcd-operator-system -kubectl logs -n etcd-operator-system deploy/etcd-operator-controller-manager \ - -c manager --tail=20 +# Chart version == release tag without the leading 'v' +# (see https://github.com/cozystack/etcd-operator/releases). +VERSION=0.5.0 + +helm install etcd-operator oci://ghcr.io/cozystack/charts/etcd-operator \ + --version "$VERSION" \ + --namespace etcd-operator-system --create-namespace ``` -You should see the manager start lines and an empty work-queue (no `EtcdCluster` resources yet). +By default the chart pulls `ghcr.io/cozystack/etcd-operator:` — the +image that same release published — so a stock install has nothing to +substitute. The chart wires that ref into **both** the manager's `image:` and +its `OPERATOR_IMAGE` env var (the image launched for snapshot/restore Pods); the +two must be identical, and the chart keeps them equal for you. Override the +image via `image.repository` / `image.tag` and both follow. + +The CRDs are **templated** into the release (not in Helm's install-only `crds/` +directory), so `helm upgrade` keeps them current with the chart — no separate +CRD-apply step on upgrade. They carry `helm.sh/resource-policy: keep`, so `helm +uninstall` leaves the CRDs (and therefore your `EtcdCluster`s and their data) in +place; deleting the CRDs is a deliberate, manual step. Set `crds.enabled=false` +to manage CRDs out-of-band, or `crds.keep=false` to let uninstall remove them. + +Common values (`--set key=value`, or a `-f my-values.yaml`): + +| Value | Default | Purpose | +|---|---|---| +| `image.repository` | `ghcr.io/cozystack/etcd-operator` | Operator image repo. Override to mirror or fork. | +| `image.tag` | chart `appVersion` | Operator image tag; also becomes `OPERATOR_IMAGE` (see above). | +| `replicaCount` | `1` | Operator replicas (leader election picks the active one). | +| `kubeRbacProxy.enabled` | `true` | Front `/metrics` with the kube-rbac-proxy SubjectAccessReview sidecar. 
Set `false` to bind metrics on `:8080` directly with no proxy. | +| `metrics.serviceMonitor.enabled` | `false` | Create a prometheus-operator `ServiceMonitor` for the metrics endpoint (needs the `monitoring.coreos.com` CRDs and `kubeRbacProxy.enabled`). | +| `crds.enabled` / `crds.keep` | `true` / `true` | Render the CRDs with the release / annotate them so uninstall keeps them. | +| `manager.resources` | 10m/64Mi → 500m/128Mi | Manager container requests/limits. | +| `imagePullSecrets` | `[]` | Pull secrets for a private registry mirror. | + +See `charts/etcd-operator/values.yaml` for the complete, annotated list. Verify +the install: -## Manual install (no Make) +```sh +kubectl -n etcd-operator-system get deploy +``` + +With release name `etcd-operator` the Deployment is named `etcd-operator`. The +release manifests (the kubectl-apply path above) render from this same chart, so +they produce the same name. The Deployment carries the label +`control-plane=controller-manager`, a name-agnostic handle for scripts. -If you don't want to invoke the Makefile (e.g. GitOps environments where `kustomize` is run by a controller in-cluster): +## Build from source + +The repo's Makefile drives a complete install via Helm. From a checkout (needs +`helm` v3.16+ on PATH): ```sh -# CRDs -kubectl apply -f config/crd/bases/ +# 1. Build the operator image (or skip to a prebuilt registry tag). +make docker-build docker-push IMG=/etcd-operator: -# Operator + RBAC + Service, rendered by kustomize: -bin/kustomize-v5.6.0 build config/default | kubectl apply -f - +# 2. Install/upgrade the operator (CRDs + RBAC + manager) with Helm. +# `make deploy` runs `helm upgrade --install` and wires image == OPERATOR_IMAGE. +make deploy IMG=/etcd-operator: ``` -Override the image inline: +The cluster must be able to pull from ``. For local clusters (`kind` / `minikube` / `k3d`), either sideload the image (`kind load docker-image ...`) or push to an ephemeral registry the cluster can reach (e.g. 
`ttl.sh/:1h`); otherwise the operator Deployment goes `ImagePullBackOff` with no clear hint from the operator side. + +`make deploy` installs the release `etcd-operator` into the `etcd-operator-system` +namespace (override with `HELM_RELEASE=` / `NAMESPACE=`). The Deployment is named +after the release. Verify: + +```sh +kubectl get pod -n etcd-operator-system +kubectl -n etcd-operator-system logs deploy/etcd-operator -c manager --tail=20 +``` + +You should see the manager start lines and an empty work-queue (no `EtcdCluster` resources yet). Tear down with `make undeploy` (see [Teardown](#teardown)). + +## Rendering manifests (GitOps / no in-cluster Helm) + +For GitOps flows that apply plain YAML, render the chart with `helm template` +instead of installing it — this is exactly what `make build-dist-manifests` (and +the release pipeline) does to produce the release's `etcd-operator.yaml`: ```sh -cd config/manager && bin/kustomize-v5.6.0 edit set image controller= -cd ../.. && bin/kustomize-v5.6.0 build config/default | kubectl apply -f - +helm template etcd-operator charts/etcd-operator \ + --namespace etcd-operator-system \ + --set image.repository= --set image.tag= \ + --set namespace.create=true | kubectl apply --server-side -f - ``` -The `bin/kustomize-v*` binary is auto-downloaded by `make kustomize` (version pinned to `v5.6.0` in the Makefile); a system-installed `kustomize` works equally if you have one. +`--server-side` avoids the client-side last-applied-config annotation size limit (the consolidated manifest embeds the full CRD schemas). `namespace.create=true` emits the Namespace so the output is self-contained. -> **Set the operator image, or snapshots/restores won't run.** The `config/default` overlay rewrites both the manager's `image:` *and* its `OPERATOR_IMAGE` env var to the same ref via a kustomize replacement — `OPERATOR_IMAGE` is the image the operator launches for snapshot Jobs and restore init containers. If you bypass that overlay (e.g. 
`kubectl apply -f config/manager/` directly, or a base that drops the replacement), `OPERATOR_IMAGE` stays the placeholder `controller:latest`. The operator **refuses to start** on that placeholder and exits with a clear error (rather than letting snapshot/restore Pods `ImagePullBackOff` later). Always render through `config/default` (which the commands above do) or set `OPERATOR_IMAGE` to the real ref by hand. +> **The chart keeps `image:` and `OPERATOR_IMAGE` equal for you.** `OPERATOR_IMAGE` is the image the operator launches for snapshot Jobs and restore init containers; it must match the manager image. The chart renders both from `image.repository`/`image.tag` (the `etcd-operator.image` helper in `_helpers.tpl`), so setting the image once covers both. If you hand-craft manifests and leave `OPERATOR_IMAGE` at the placeholder `controller:latest`, the operator **refuses to start** and exits with a clear error (rather than letting snapshot/restore Pods `ImagePullBackOff` later). ## Create your first cluster @@ -166,7 +255,7 @@ Operator's own toolchain (relevant when building from source): | controller-runtime | v0.21 | | k8s.io/api, k8s.io/client-go | v0.33 | | controller-gen | v0.18.0 | -| kustomize | v5.6.0 | +| Helm (install/render the chart) | v3.16+ | | etcd client (`go.etcd.io/etcd/client/v3`) | v3.6.11 | | Kubebuilder layout | v4 | @@ -174,7 +263,7 @@ All pinned in `go.mod`, `Dockerfile`, and `Makefile`. ## RBAC -The operator runs as a ClusterRole — it needs to watch `EtcdCluster` and `EtcdMember` across all namespaces, plus create/delete the per-member Pods, PVCs, and Services in each user namespace. The full role lives in `config/rbac/role.yaml` (regenerated from `+kubebuilder:rbac` markers — don't hand-edit). +The operator runs as a ClusterRole — it needs to watch `EtcdCluster` and `EtcdMember` across all namespaces, plus create/delete the per-member Pods, PVCs, and Services in each user namespace. 
The rules are generated from the `+kubebuilder:rbac` markers (by `make manifests`) into `charts/etcd-operator/files/manager-role-rules.yaml` and pulled into the chart's templated ClusterRole — don't hand-edit; edit the markers and regenerate. Single-namespace scoping is not currently exposed: `main.go` does not wire a namespace flag into the manager's `Cache.DefaultNamespaces`, so the manager always watches all namespaces. Limiting RBAC alone (ClusterRole → Role) is not sufficient — the manager will still attempt list/watch across the cluster and the API server will deny it. Scoped deployment is a follow-up. @@ -201,12 +290,16 @@ This is outside the operator's scope but documented because operators ask. # Remove individual clusters first — their finalizers will clean up etcd state. kubectl delete etcdcluster.etcd-operator.cozystack.io --all -A -# Remove the operator. +# Remove the operator (helm uninstall). The CRDs carry +# helm.sh/resource-policy: keep, so they (and any surviving EtcdClusters) are +# intentionally left in place. make undeploy -# Remove the CRDs (only after all EtcdClusters are gone; the CRDs have -# protected finalizers via the operator). -make uninstall +# Remove the CRDs too (only after all EtcdClusters are gone) — deleting them +# cascade-deletes every remaining EtcdCluster: +kubectl delete crd etcdclusters.etcd-operator.cozystack.io \ + etcdmembers.etcd-operator.cozystack.io \ + etcdsnapshots.etcd-operator.cozystack.io ``` Deleting an `EtcdCluster` while it's running cascades through every owned resource: the operator's finalizer on each `EtcdMember` calls `MemberRemove` (when the cluster itself is also being deleted, the operator detects this and skips `MemberRemove` to avoid a deadlock — see `handleDeletion` in `controllers/etcdmember_controller.go`). Pods and PVCs are then GC'd via owner-refs. @@ -224,6 +317,23 @@ For now, in-place operator upgrades work via `kubectl set image` on the operator This is manual and slow. 
A native rolling upgrade is a tracked follow-up. +## kubectl-etcd plugin + +`kubectl-etcd` is an optional client-side [kubectl plugin](https://kubernetes.io/docs/tasks/extend-kubectl/kubectl-plugins/) for day-2 operations on operator-managed clusters (member list, status, defrag, compact, alarms, snapshot, member add/remove). It runs on your workstation against your kubeconfig — it is **not** part of the operator image. + +Each release attaches `kubectl-etcd-<os>-<arch>` binaries (with `cli-SHA256SUMS.txt`). Install it onto your `PATH` named `kubectl-etcd`, and kubectl picks it up as `kubectl etcd`: + +```sh +VERSION=v0.5.0; OS=$(uname -s | tr A-Z a-z); ARCH=$(uname -m | sed 's/x86_64/amd64/;s/aarch64/arm64/') +curl -sSLo kubectl-etcd "https://github.com/cozystack/etcd-operator/releases/download/$VERSION/kubectl-etcd-$OS-$ARCH" +chmod +x kubectl-etcd && sudo mv kubectl-etcd /usr/local/bin/ # any dir on $PATH works + +kubectl etcd --version +kubectl etcd members --help +``` + +Or build from a checkout with `make kubectl-etcd` (lands in `bin/kubectl-etcd`). There is no krew package yet. + ## Development Out-of-cluster development run (against the current `$KUBECONFIG`): diff --git a/docs/migration.md b/docs/migration.md index 881f2e42..c4cc2277 100644 --- a/docs/migration.md +++ b/docs/migration.md @@ -19,7 +19,16 @@ Clients that connect by DNS name keep working; one Service changes shape (ClusterIP → headless) and has consumer prerequisites — see [Endpoint compatibility](#endpoint-compatibility) before you `--apply`. -Build it with `make etcd-migrate` (lands in `bin/etcd-migrate`).
+Get it from the GitHub release — each release attaches +`etcd-migrate-<os>-<arch>` binaries (with a `cli-SHA256SUMS.txt`): + +```sh +VERSION=v0.5.0; OS=$(uname -s | tr A-Z a-z); ARCH=$(uname -m | sed 's/x86_64/amd64/;s/aarch64/arm64/') +curl -sSLo etcd-migrate "https://github.com/cozystack/etcd-operator/releases/download/$VERSION/etcd-migrate-$OS-$ARCH" +chmod +x etcd-migrate && ./etcd-migrate version +``` + +Or build from a checkout with `make etcd-migrate` (lands in `bin/etcd-migrate`). ### How adoption works @@ -92,17 +101,21 @@ adoption. ### Prerequisites 1. **Scale both operators to zero.** The legacy etcd Pods keep running — only - the controllers must be quiet: + the controllers must be quiet. The legacy (v1alpha1) controller is + `etcd-operator-controller-manager`; this operator's Helm release is named + `etcd-operator`: ```sh - kubectl -n etcd-operator-system scale deploy etcd-operator-controller-manager --replicas=0 + kubectl -n etcd-operator-system scale deploy etcd-operator-controller-manager --replicas=0 # legacy + kubectl -n etcd-operator-system scale deploy etcd-operator --replicas=0 # new ``` The tool verifies this for both Deployments before doing anything (`--legacy-controller` / `--new-controller` override the coordinates, `--skip-controller-check` bypasses the gate). -2. The new CRDs (`etcd-operator.cozystack.io/v1alpha2`) must be installed - (`make install`). +2. The new CRDs (`etcd-operator.cozystack.io/v1alpha2`) must be installed — + they ship with the operator chart (`make deploy IMG=...`, or `helm install`; + see [installation](installation.md)). 3. A kubeconfig that can list/delete the legacy CRs cluster-wide, create the new ones, and patch pods/PVCs/Services. 4.
**All etcd pods Ready.** Adoption refuses clusters with missing members, @@ -230,7 +243,7 @@ After `--apply` succeeds, **scale the new operator up** — it takes over the adopted clusters without touching the pods: ```sh -kubectl -n etcd-operator-system scale deploy etcd-operator-controller-manager --replicas=1 +kubectl -n etcd-operator-system scale deploy etcd-operator --replicas=1 ``` The tool deletes the migrated legacy **CRs** but never the **CRDs**. Once no diff --git a/docs/operations.md b/docs/operations.md index 0bc2efe6..c5ce76f3 100644 --- a/docs/operations.md +++ b/docs/operations.md @@ -355,7 +355,7 @@ kubectl exec -n "$POD" -- etcdctl --endpoints=http://localhost:2379 \ The operator runs in `etcd-operator-system` by default. Log lines you'll see most often: ```sh -kubectl logs -n etcd-operator-system deploy/etcd-operator-controller-manager \ +kubectl logs -n etcd-operator-system deploy/etcd-operator \ -c manager --tail=200 ``` diff --git a/hack/e2e.sh b/hack/e2e.sh index 3ecca865..dcb2ffb5 100755 --- a/hack/e2e.sh +++ b/hack/e2e.sh @@ -47,7 +47,7 @@ dump_diagnostics() { echo "--- e2e failed; dumping cluster state before teardown" kubectl get etcdclusters,etcdmembers,pods,certificates,secrets -A || true kubectl get datastores,tenantcontrolplanes -A || true - kubectl -n etcd-operator-system logs deploy/etcd-operator-controller-manager --tail=200 || true + kubectl -n etcd-operator-system logs -l control-plane=controller-manager --all-containers --tail=200 || true kubectl -n kamaji-system logs deploy/kamaji --tail=200 || true # The tenant namespace is where the longest wait (TenantControlPlane # Ready) fails — dump every pod's logs there, or the one failure mode @@ -109,8 +109,12 @@ helm upgrade --install kamaji clastix/kamaji \ echo "--- building and deploying the operator ($IMG)" docker build -t "$IMG" . 
kind load docker-image "$IMG" --name "$KIND_CLUSTER_NAME" -make install deploy IMG="$IMG" -kubectl -n etcd-operator-system wait deploy/etcd-operator-controller-manager \ +# Helm install: CRDs are templated into the release and image == OPERATOR_IMAGE +# is wired by the chart, so this one command lands CRDs + RBAC + manager. +make deploy IMG="$IMG" +# Select by the chart's control-plane label rather than a fixed Deployment name. +kubectl -n etcd-operator-system wait deploy \ + -l control-plane=controller-manager \ --for=condition=Available --timeout=5m echo "--- running e2e suite" diff --git a/hack/release-smoke.sh b/hack/release-smoke.sh new file mode 100755 index 00000000..f559b80a --- /dev/null +++ b/hack/release-smoke.sh @@ -0,0 +1,168 @@ +#!/usr/bin/env bash +# Release-install smoke test: exercises a release install path end to end on a +# throwaway kind cluster, then proves the installed operator actually works. +# +# 1. build the operator image at $IMG (what docker-publish.yml ships) +# 2. kind load it (stand-in for the GHCR push/pull) +# 3. install it one of two ways (INSTALL_MODE): +# manifest (default) — make build-dist-manifests + kubectl apply, the +# path release-assets.yml ships +# helm — helm install charts/etcd-operator, the path +# helm-publish.yml ships +# 4. assert the operator Deployment goes Available +# 5. create a 1-node EtcdCluster and assert it reaches READY +# +# Why this is the right test (vs. grepping the workflow/chart files): the +# contract under test is "the image the release publishes == the image the +# install deploys, and that artifact actually runs and reconciles". The single +# $IMG threaded through build, load, and install makes a tag mismatch +# impossible by construction; step 4 is where a broken mismatch WOULD surface +# (wrong tag => ImagePullBackOff => never Available). 
It also catches subtler +# failures static checks can't: a broken OPERATOR_IMAGE wiring (the operator +# refuses to start on the placeholder) fails step 4, and a missing RBAC rule in +# the chart fails step 5 (the cluster never goes READY). +set -euo pipefail + +# Always build/load for the host architecture; a stray +# DOCKER_DEFAULT_PLATFORM=linux/amd64 would pull an emulated kind node whose +# control plane never goes healthy. (Same rationale as hack/e2e.sh.) +unset DOCKER_DEFAULT_PLATFORM + +# ── Pinned component versions (kept in sync with hack/e2e.sh) ───────────── +KIND_VERSION=v0.32.0 +KIND_NODE_IMAGE=kindest/node:v1.34.0 + +INSTALL_MODE=${INSTALL_MODE:-manifest} # manifest | helm +KIND_CLUSTER_NAME=${KIND_CLUSTER_NAME:-etcd-operator-release-smoke-$INSTALL_MODE} +NAMESPACE=etcd-operator-system +# A registry-qualified, non-:latest tag: mirrors the real release ref +# (ghcr.io/<owner>/etcd-operator:<tag>) and, being non-latest, makes the +# default imagePullPolicy IfNotPresent — so the kind-loaded image is used +# instead of attempting a registry pull. +IMG=${IMG:-ghcr.io/cozystack/etcd-operator:v0.0.0-smoke} +KEEP_CLUSTER=${KEEP_CLUSTER:-} + +case "$INSTALL_MODE" in + manifest|helm) ;; + *) echo "ERROR: INSTALL_MODE must be 'manifest' or 'helm', got '$INSTALL_MODE'"; exit 2 ;; +esac + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +cd "$ROOT" +LOCALBIN="$ROOT/bin" +mkdir -p "$LOCALBIN" +export PATH="$LOCALBIN:$PATH" + +if ! command -v kind >/dev/null 2>&1; then + echo "--- installing kind $KIND_VERSION into $LOCALBIN" + GOBIN="$LOCALBIN" go install sigs.k8s.io/kind@"$KIND_VERSION" +fi +if [ "$INSTALL_MODE" = helm ] && !
command -v helm >/dev/null 2>&1; then + echo "ERROR: INSTALL_MODE=helm requires helm on PATH"; exit 2 +fi + +dump_diagnostics() { + echo "--- release smoke ($INSTALL_MODE) failed; dumping cluster state before teardown" + kubectl get etcdclusters,etcdmembers,pods -A || true + kubectl -n "$NAMESPACE" get deploy -o wide || true + kubectl -n "$NAMESPACE" describe deploy || true + kubectl -n "$NAMESPACE" logs -l app.kubernetes.io/name=etcd-operator \ + --all-containers --tail=200 || true + kubectl -n "$NAMESPACE" logs -l control-plane=controller-manager \ + --all-containers --tail=200 || true + # The most informative failure signals: an ImagePullBackOff (tag mismatch) + # or the operator's OPERATOR_IMAGE-placeholder refusal show up here. + kubectl -n "$NAMESPACE" get events --sort-by=.lastTimestamp | tail -30 || true + for p in $(kubectl get pods -l etcd-operator.cozystack.io/cluster=smoke -o name 2>/dev/null); do + echo "--- logs: $p" + kubectl logs "$p" --all-containers --tail=100 || true + done +} + +cleanup() { + status=$? + [ "$status" -ne 0 ] && dump_diagnostics + if [ -n "$KEEP_CLUSTER" ]; then + echo "--- KEEP_CLUSTER set; kind cluster '$KIND_CLUSTER_NAME' left running" + return + fi + echo "--- deleting kind cluster '$KIND_CLUSTER_NAME'" + kind delete cluster --name "$KIND_CLUSTER_NAME" >/dev/null 2>&1 || true +} +trap cleanup EXIT + +echo "--- creating kind cluster '$KIND_CLUSTER_NAME' ($KIND_NODE_IMAGE)" +kind create cluster --name "$KIND_CLUSTER_NAME" --image "$KIND_NODE_IMAGE" --wait 5m +kubectl config use-context "kind-$KIND_CLUSTER_NAME" + +echo "--- building operator image ($IMG) and loading it into kind" +docker build -t "$IMG" . 
+kind load docker-image "$IMG" --name "$KIND_CLUSTER_NAME" + +if [ "$INSTALL_MODE" = manifest ]; then + echo "--- [manifest] rendering release install manifests (IMG=$IMG)" + make build-dist-manifests IMG="$IMG" + # build-dist-manifests must render purely (it is `helm template` piped through + # yq): it writes only dist/ (gitignored) and must not mutate any tracked file. + # A dirty tree here means a regression reintroduced an in-place edit, which + # would also spuriously trip ci.yml's codegen-drift gate. Assert cleanliness + # (skipped if this isn't a git checkout, e.g. a release tarball). + if command -v git >/dev/null 2>&1 && git rev-parse --git-dir >/dev/null 2>&1 \ + && [ -n "$(git status --porcelain --untracked-files=no)" ]; then + echo "ERROR: 'make build-dist-manifests' modified tracked files (it must render purely):" + git --no-pager status --porcelain --untracked-files=no + git --no-pager diff + exit 1 + fi + echo "--- [manifest] installing from the rendered release manifest" + # Server-side apply: the consolidated manifest embeds the full CRD schemas, + # whose size can exceed the client-side last-applied-config annotation + # limit. This is also the documented release-install path. + kubectl apply --server-side -f dist/etcd-operator.yaml +else + echo "--- [helm] installing the chart (image=$IMG)" + # CRDs are templated into the chart and committed (drift-gated), so no sync + # step is needed. Split $IMG into repo:tag for image.repository / image.tag. + helm upgrade --install etcd-operator charts/etcd-operator \ + --namespace "$NAMESPACE" --create-namespace \ + --set image.repository="${IMG%:*}" \ + --set image.tag="${IMG##*:}" \ + --wait --timeout 5m +fi + +echo "--- waiting for the operator to become Available" +# Fails (times out) on either a tag mismatch (ImagePullBackOff) or a broken +# OPERATOR_IMAGE substitution (operator refuses to start on the placeholder). +# Select by the label both install paths set, so this is mode-agnostic. 
+kubectl -n "$NAMESPACE" wait deploy \ + -l control-plane=controller-manager \ + --for=condition=Available --timeout=5m + +echo "--- bootstrapping a 1-node EtcdCluster to prove the operator reconciles" +kubectl apply -f - <<'EOF' +apiVersion: etcd-operator.cozystack.io/v1alpha2 +kind: EtcdCluster +metadata: + name: smoke + namespace: default +spec: + replicas: 1 + version: 3.6.11 + storage: + size: 256Mi +EOF + +echo "--- waiting for EtcdCluster 'smoke' to reach READY=1" +# Poll readyMembers rather than `kubectl wait --for=condition`: the cluster's +# Available condition may not be registered on the object until the first +# status write, which makes an early `wait` error out ("no matching condition"). +deadline=$(( $(date +%s) + 300 )) +until [ "$(kubectl get etcdcluster smoke -o jsonpath='{.status.readyMembers}' 2>/dev/null || echo 0)" = "1" ]; do + if [ "$(date +%s)" -ge "$deadline" ]; then + echo "ERROR: EtcdCluster 'smoke' did not reach READY=1 within 5m" + exit 1 + fi + sleep 5 +done + +echo "--- release-install smoke PASSED (mode=$INSTALL_MODE, operator Available, cluster READY=1, IMG=$IMG)" diff --git a/main.go b/main.go index e79b46a0..1aac2b56 100644 --- a/main.go +++ b/main.go @@ -48,16 +48,18 @@ import ( const defaultClusterDomain = "cluster.local" -// placeholderOperatorImage is the image:/OPERATOR_IMAGE value baked into -// config/manager as the kustomize-replacement target. Running with it -// un-rewritten means snapshot/restore Pods would ImagePullBackOff forever. +// placeholderOperatorImage is the un-set sentinel image ref. The Helm chart +// always renders a real repository:tag (and keeps image == OPERATOR_IMAGE), so +// this only trips when the operator is run with OPERATOR_IMAGE explicitly left +// at the placeholder — running on it means snapshot/restore Pods would +// ImagePullBackOff forever. 
const placeholderOperatorImage = "controller:latest"

// operatorImageError rejects the un-substituted image placeholder so the
// operator fails fast at startup rather than silently at the first snapshot.
//
// It returns nil for every img other than placeholderOperatorImage (the empty
// string included). For the placeholder it returns an error that quotes the
// offending value and spells out how to supply a real image reference.
func operatorImageError(img string) error {
	if img == placeholderOperatorImage {
		// The helm hint carries explicit <repo>/<tag> placeholders: with bare
		// `image.repository=` / `image.tag=` a user copying the command would
		// set both values to the empty string.
		return fmt.Errorf("operator image is the un-substituted placeholder %q; set OPERATOR_IMAGE / --operator-image to the real operator image (deploy with `make deploy IMG=...`, or `helm install --set image.repository=<repo> --set image.tag=<tag>`), otherwise snapshot/restore Pods will ImagePullBackOff", img)
	}
	return nil
}