diff --git a/book/src/configuration/configurability.md b/book/src/configuration/configurability.md index d691bf394c..c9dc95df79 100644 --- a/book/src/configuration/configurability.md +++ b/book/src/configuration/configurability.md @@ -547,7 +547,7 @@ values files. | `REGISTRY_PULL_SECRET` | Raw registry API key | **Raw key string** (e.g. `nvapi-...`). Not a file path. Not a JSON dockerconfig. | | `REGISTRY_PULL_USERNAME` | Registry username | Defaults to `$oauthtoken` (correct for `nvcr.io`) | | `KUBECONFIG` | Cluster kubeconfig | Filesystem path | -| `NICO_SITE_UUID` | Stable UUID for this site | UUIDv4. Defaults to a fixed dev UUID — override per real site. | +| `NICO_SITE_UUID` | Stable UUID for this site | UUIDv4. If unset, `setup.sh` generates a random UUID each run. | | `PREFLIGHT_CHECK_IMAGE` | Image for per-node preflight checks | Defaults to `busybox:1.36`. Override for air-gapped clusters. | Inside the cluster, `nico-api` discovers Vault, Postgres, and SPIFFE settings diff --git a/docs/getting-started/quick-start.md b/docs/getting-started/quick-start.md index b00ee43f2c..99d4350e6e 100644 --- a/docs/getting-started/quick-start.md +++ b/docs/getting-started/quick-start.md @@ -89,7 +89,7 @@ Obtain an NGC API key at [ngc.nvidia.com](https://ngc.nvidia.com) → **API Keys | `NICO_CORE_IMAGE_TAG` | **Yes** | NICo Core image tag (e.g. `v2025.12.30`). | | `NICO_REST_IMAGE_TAG` | **Yes** | NICo REST image tag (e.g. `v1.0.4`). | | `KUBECONFIG` | **Yes** | Path to your cluster kubeconfig. | -| `NICO_SITE_UUID` | No | Stable UUID for this site. Defaults to `a1b2c3d4-e5f6-4000-8000-000000000001`. | +| `NICO_SITE_UUID` | No | Stable UUID for this site. If unset, `setup.sh` generates a random UUID each run. | ### 3b. Set your Site Name @@ -245,7 +245,7 @@ All IPs must be within the `IPAddressPool` ranges you defined in `values/metallb ### 3i. (Optional) Set a Stable Site UUID -If you want a specific site UUID instead of the default placeholder, set the `NICO_SITE_UUID` environment variable: +If you want a specific site UUID instead of a random UUID generated by `setup.sh`, set the `NICO_SITE_UUID` environment variable: ```bash export NICO_SITE_UUID= # must be a valid UUID v4 diff --git a/helm-prereqs/README.md b/helm-prereqs/README.md index fc1361d9b7..b1b5429dec 100644 --- a/helm-prereqs/README.md +++ b/helm-prereqs/README.md @@ -101,7 +101,7 @@ The tables below summarize the keys that must be set per site. | `NICO_IMAGE_REGISTRY` | Yes, unless `--skip-core --skip-rest` | Base image registry for all NICo images (e.g. `my-registry.example.com/nico`) | | `NICO_CORE_IMAGE_TAG` | Yes, unless `--skip-core` | NICo Core image tag (e.g. `v2025.12.30-rc1`) | | `NICO_REST_IMAGE_TAG` | Yes, unless `--skip-rest` | NICo REST image tag (e.g. `v1.0.4`) | -| `NICO_SITE_UUID` | No | Stable UUID for this site. Defaults to `a1b2c3d4-e5f6-4000-8000-000000000001`. | +| `NICO_SITE_UUID` | No | Stable UUID for this site. If unset, `setup.sh` generates a random UUID each run. | | `NICO_MANAGE_DEFAULT_STORAGE_CLASS` | No | Whether `setup.sh` marks `local-path` as the default StorageClass. Defaults to `true`. Set to `false` when the cluster already has an operator-managed default StorageClass. | | `NICO_STORAGE_CLASS` | No | StorageClass used by Vault data/audit PVCs. Defaults to `local-path-persistent`. | | `PREFLIGHT_CHECK_IMAGE` | No | Image used for preflight per-node checks. Defaults to `busybox:1.36`; set to a local mirror for air-gapped clusters. | diff --git a/helm-prereqs/setup.sh b/helm-prereqs/setup.sh index 09e67140b7..d907a665b7 100755 --- a/helm-prereqs/setup.sh +++ b/helm-prereqs/setup.sh @@ -37,8 +37,8 @@ # preloaded, or use existing imagePullSecrets. # REGISTRY_PULL_USERNAME Username for generated pull secrets. # Default: $oauthtoken -# NICO_SITE_UUID Stable REST site UUID. Used only when REST is -# deployed. Default is a dev placeholder. +# NICO_SITE_UUID REST site UUID. Used only when REST is deployed. +# If unset, setup generates a random UUID each run. # NICO_MANAGE_DEFAULT_STORAGE_CLASS # Whether setup annotates local-path as the default # StorageClass. Default: true. @@ -629,13 +629,82 @@ _TEMPORAL_TLS="--tls-cert-path /var/secrets/temporal/certs/server-interservice/t --tls-key-path /var/secrets/temporal/certs/server-interservice/tls.key \ --tls-ca-path /var/secrets/temporal/certs/server-interservice/ca.crt \ --tls-server-name interservice.server.temporal.local" -kubectl exec -n temporal deploy/temporal-admintools -- \ - sh -c "temporal operator namespace create -n cloud --retention 72h --address ${_TEMPORAL_ADDR} ${_TEMPORAL_TLS}" 2>/dev/null || true -kubectl exec -n temporal deploy/temporal-admintools -- \ - sh -c "temporal operator namespace create -n site --retention 72h --address ${_TEMPORAL_ADDR} ${_TEMPORAL_TLS}" 2>/dev/null || true +_wait_for_temporal() { + local _output="" + + echo "Waiting for Temporal frontend and admin tools..." + kubectl rollout status deploy/temporal-frontend -n temporal --timeout=120s + kubectl rollout status deploy/temporal-admintools -n temporal --timeout=120s + + for _i in $(seq 1 24); do + if _output="$(kubectl exec -n temporal deploy/temporal-admintools -- \ + sh -c "temporal operator namespace list --address ${_TEMPORAL_ADDR} ${_TEMPORAL_TLS}" 2>&1)"; then + echo "Temporal frontend ready" + return + fi + echo " Waiting for Temporal API (${_i}/24)..." + sleep 5 + done + + echo "ERROR: Temporal frontend is not ready for namespace operations" >&2 + echo "${_output}" >&2 + exit 1 +} + +_create_temporal_namespace() { + local _namespace="$1" + local _output + + if _output="$(kubectl exec -n temporal deploy/temporal-admintools -- \ + sh -c "temporal operator namespace create -n \"\$1\" --retention 72h --address ${_TEMPORAL_ADDR} ${_TEMPORAL_TLS}" \ + sh "${_namespace}" 2>&1)"; then + echo "Temporal namespace ${_namespace} ready" + return + fi + + if printf "%s" "${_output}" | grep -qi "already exists"; then + echo "Temporal namespace ${_namespace} already exists" + return + fi + + echo "ERROR: failed to create Temporal namespace ${_namespace}" >&2 + echo "${_output}" >&2 + exit 1 +} + +_verify_temporal_namespaces() { + local _output + local _missing=() + local _namespace + + if ! _output="$(kubectl exec -n temporal deploy/temporal-admintools -- \ + sh -c "temporal operator namespace list --address ${_TEMPORAL_ADDR} ${_TEMPORAL_TLS}" 2>&1)"; then + echo "ERROR: failed to list Temporal namespaces" >&2 + echo "${_output}" >&2 + exit 1 + fi + + for _namespace in "$@"; do + if ! printf "%s" "${_output}" | grep -Eq "(^|[^[:alnum:]_-])${_namespace}([^[:alnum:]_-]|$)"; then + _missing+=("${_namespace}") + fi + done + + if [[ ${#_missing[@]} -gt 0 ]]; then + echo "ERROR: missing Temporal namespace(s): ${_missing[*]}" >&2 + echo "${_output}" >&2 + exit 1 + fi + + echo "Verified Temporal namespaces: $*" +} + +_wait_for_temporal +_create_temporal_namespace cloud +_create_temporal_namespace site # flow Temporal namespace — required by NICo Flow workers; pod panics on startup if absent. -kubectl exec -n temporal deploy/temporal-admintools -- \ - sh -c "temporal operator namespace create -n flow --retention 72h --address ${_TEMPORAL_ADDR} ${_TEMPORAL_TLS}" 2>/dev/null || true +_create_temporal_namespace flow +_verify_temporal_namespaces cloud site flow echo "Temporal namespaces ready" _SETUP_PHASE="[7g/7] NICo REST helm chart" @@ -710,8 +779,13 @@ fi # All of this is wired via --set flags so nico-rest.yaml stays registry-agnostic. NICO_SITE_AGENT_CHART="${NICO_REST_HELM_DIR}/nico-rest-site-agent" -# Stable placeholder UUID for this site (must be a valid UUID). -NICO_SITE_UUID="${NICO_SITE_UUID:-a1b2c3d4-e5f6-4000-8000-000000000001}" +if [[ -z "${NICO_SITE_UUID:-}" ]]; then + if ! command -v python3 &>/dev/null; then + echo "ERROR: NICO_SITE_UUID is unset and python3 is not available" >&2 + exit 1 + fi + NICO_SITE_UUID="$(python3 -c 'import uuid; print(uuid.uuid4())')" +fi NICO_SITE_AGENT_ARGS=( --namespace nico-rest @@ -762,8 +836,8 @@ _TEMPORAL_TLS="--tls-cert-path /var/secrets/temporal/certs/server-interservice/t --tls-key-path /var/secrets/temporal/certs/server-interservice/tls.key \ --tls-ca-path /var/secrets/temporal/certs/server-interservice/ca.crt \ --tls-server-name interservice.server.temporal.local" -kubectl exec -n temporal deploy/temporal-admintools -- \ - sh -c "temporal operator namespace create -n '${NICO_SITE_UUID}' --retention 72h --address ${_TEMPORAL_ADDR} ${_TEMPORAL_TLS}" 2>/dev/null || true +_create_temporal_namespace "${NICO_SITE_UUID}" +_verify_temporal_namespaces "${NICO_SITE_UUID}" echo "Temporal namespace ready" # FLOW_GRPC_ENABLED toggles the site-agent's Flow gRPC client (see diff --git a/helm-prereqs/values/nico-rest.yaml b/helm-prereqs/values/nico-rest.yaml index 02da3426a3..fa7d404f79 100644 --- a/helm-prereqs/values/nico-rest.yaml +++ b/helm-prereqs/values/nico-rest.yaml @@ -38,45 +38,3 @@ nico-rest-workflow: replicaCount: 3 siteWorker: replicaCount: 3 - -# Site-agent config — v1.0.4 binary reads DB config from env vars. -# NICo postgres uses the 'nico' user and 'elektratest' database. -# CLUSTER_ID and TEMPORAL_SUBSCRIBE_* are set via --set in setup.sh -# using the NICO_SITE_UUID variable (default: a1b2c3d4-e5f6-4000-8000-000000000001). -nico-rest-site-agent: - replicaCount: 3 - bootstrap: - enabled: true - siteManager: - address: "nico-rest-site-manager.nico-rest:8100" - certificate: - # Service identifier must match "elektra-site-agent" for nico-api's SiteAgent RBAC role. - # The base path /nico-system/sa/ is one of nico-api's recognized spiffe_service_base_paths. - uris: - - "spiffe://nico.local/nico-system/sa/elektra-site-agent" - envConfig: - # DEV ONLY — these values match the dev postgres instance deployed by setup.sh. - # DB_USER and DB_PASSWORD are injected from the db-creds Secret (secrets.dbCreds). - DB_ADDR: "postgres.postgres.svc.cluster.local" - DB_DATABASE: "elektratest" - DB_PORT: "5432" - ESA_PORT: "8080" - METRICS_PORT: "2112" - DEV_MODE: "true" - ENABLE_DEBUG: "true" - ENABLE_TLS: "true" - # mTLS to nico-api (NICO_SEC_OPT=2). Cert issued from vault-nico-issuer - # so nico-api trusts it (same Vault PKI CA as nico-api's own cert). - NICO_ADDRESS: "nico-api.nico-system.svc.cluster.local:1079" - NICO_SEC_OPT: "2" - CLUSTER_ID: "a1b2c3d4-e5f6-4000-8000-000000000001" - TEMPORAL_HOST: "temporal-frontend.temporal" - TEMPORAL_PORT: "7233" - TEMPORAL_SERVER: "interservice.server.temporal.local" - TEMPORAL_PUBLISH_NAMESPACE: "site" - TEMPORAL_PUBLISH_QUEUE: "site" - TEMPORAL_SUBSCRIBE_NAMESPACE: "a1b2c3d4-e5f6-4000-8000-000000000001" - TEMPORAL_SUBSCRIBE_QUEUE: "site" - TEMPORAL_INVENTORY_SCHEDULE: "@every 3m" - TEMPORAL_CERT_PATH: "/etc/temporal-certs" - TEMPORAL_CERT: "temporal-client-site-agent-certs"