|
59 | 59 | # ./setup.sh -y # skip all prompts, deploy everything automatically |
60 | 60 | # ./setup.sh --skip-core # skip Phase 6 NICo Core (print command, deploy manually) |
61 | 61 | # ./setup.sh --skip-rest # skip Phase 7 NICo REST entirely (no repo needed) |
62 | | -# ./setup.sh --skip-flow # skip Phase 7i NICo Flow (REST still installs) |
| 62 | +# ./setup.sh --skip-flow # skip Phase 7h NICo Flow (REST still installs) |
63 | 63 | # # pair with helm-prereqs/values.yaml::flow.enabled=false |
64 | 64 | # # to skip Flow prereqs (DBs / ESO / vault tokens) too |
65 | 65 | # ./setup.sh --skip-core --skip-rest # fully non-interactive infra-only run |
@@ -690,9 +690,147 @@ else |
690 | 690 | exit 0 |
691 | 691 | fi |
692 | 692 |
|
693 | | -# --- 7h. NICo REST site-agent ------------------------------------------------- |
| 693 | +# --- 7h. NICo Flow ------------------------------------------------------------ |
| 694 | +# Flow is the rack lifecycle orchestrator (formerly RLA). Single pod with three |
| 695 | +# containers — flow (50051), psm (50052), nsm (50053). Runs in its own `flow` |
| 696 | +# namespace. |
| 697 | +# |
| 698 | +# Runs BEFORE the site-agent (7i) so that flow.flow.svc.cluster.local:50051 |
| 699 | +# exists when the site-agent starts and attempts its Flow gRPC connection. |
| 700 | +# |
| 701 | +# Prerequisites already in place by this point: |
| 702 | +# - flow/psm/nsm databases on nico-pg-cluster (helm-prereqs postgresql.yaml) |
| 703 | +# - flow.nico/psm.nico/nsm.nico DB credentials synced via ESO into the flow |
| 704 | +# namespace by the flow-db-eso / psm-db-eso / nsm-db-eso ClusterExternalSecrets |
| 705 | +# - psm-vault-token and nsm-vault-token Secrets in the flow namespace |
| 706 | +# (provisioned by the flow-vault-tokens post-install hook) |
| 707 | +# - Temporal `flow` namespace (created in phase 7f above) |
| 708 | +# - nico-rest-ca-issuer ClusterIssuer (installed by phase 7b — issues the |
| 709 | +# temporal-client-certs) |
| 710 | +# - vault-nico-issuer ClusterIssuer (issues the SPIFFE cert) |
| 711 | +# |
| 712 | +# Same pre-apply-cert dance as the site-agent: render the Certificate(s) ahead |
| 713 | +# of the helm install so cert-manager has time to issue them and the pod doesn't |
| 714 | +# hit a FailedMount race on the spiffe / temporal-client-certs secrets. |
| 715 | +if "${SKIP_FLOW}"; then |
| 716 | + echo "=== [7h/7] NICo Flow — skipped (--skip-flow) ===" |
| 717 | +else |
| 718 | + _SETUP_PHASE="[7h/7] NICo Flow" |
| 719 | + echo "=== [7h/7] NICo Flow ===" |
| 720 | + |
| 721 | + NICO_FLOW_CHART="${SCRIPT_DIR}/../helm/charts/nico-flow" |
| 722 | + NICO_FLOW_NAMESPACE="flow" |
| 723 | + |
| 724 | + NICO_FLOW_ARGS=( |
| 725 | + --namespace "${NICO_FLOW_NAMESPACE}" |
| 726 | + --create-namespace |
| 727 | + --set "global.image.repository=${NICO_IMAGE_REGISTRY}" |
| 728 | + ## Flow (nico-flow / nico-psm / nico-nsm) ships on the same image release |
| 729 | + ## line as NICo REST — they're built and tagged together — so reuse |
| 730 | + ## NICO_REST_IMAGE_TAG, not NICO_CORE_IMAGE_TAG (which is carbide-api). |
| 731 | + --set "global.image.tag=${NICO_REST_IMAGE_TAG}" |
| 732 | + ) |
| 733 | + |
| 734 | + # Render the dockerconfigjson for the chart-managed image-pull-secret. Same |
| 735 | + # pattern as the NICo REST common chart — keep the registry credential on |
| 736 | + # the helm command line so the chart template can install it as a |
| 737 | + # pre-install hook (pod can't pull from nvcr.io otherwise). |
| 738 | + if [[ -n "${REGISTRY_PULL_SECRET:-}" ]]; then |
| 739 | + _flow_registry_server="${NICO_IMAGE_REGISTRY%%/*}" |
| 740 | + _flow_docker_cfg="$(printf '{"auths":{"%s":{"username":"%s","password":"%s"}}}' \ |
| 741 | + "${_flow_registry_server}" \ |
| 742 | + "${REGISTRY_PULL_USERNAME:-\$oauthtoken}" \ |
| 743 | + "${REGISTRY_PULL_SECRET}" | base64 | tr -d '\n')" |
| 744 | + NICO_FLOW_ARGS+=( |
| 745 | + --set "global.imagePullSecrets[0].name=image-pull-secret" |
| 746 | + --set "imagePullSecret.dockerconfigjson=${_flow_docker_cfg}" |
| 747 | + ) |
| 748 | + fi |
| 749 | + |
| 750 | + # Pre-apply Certificates so cert-manager can issue secrets before the pod schedules. |
| 751 | + echo "Pre-applying flow Certificates (SPIFFE + Temporal client)..." |
| 752 | + helm template flow "${NICO_FLOW_CHART}" \ |
| 753 | + "${NICO_FLOW_ARGS[@]}" \ |
| 754 | + --show-only templates/namespace.yaml | kubectl apply -f - |
| 755 | + helm template flow "${NICO_FLOW_CHART}" \ |
| 756 | + "${NICO_FLOW_ARGS[@]}" \ |
| 757 | + --show-only templates/certificate.yaml | kubectl apply -f - |
| 758 | + kubectl annotate certificate/flow-certificate -n "${NICO_FLOW_NAMESPACE}" \ |
| 759 | + "meta.helm.sh/release-name=flow" \ |
| 760 | + "meta.helm.sh/release-namespace=${NICO_FLOW_NAMESPACE}" --overwrite |
| 761 | + kubectl annotate certificate/temporal-client-certs -n "${NICO_FLOW_NAMESPACE}" \ |
| 762 | + "meta.helm.sh/release-name=flow" \ |
| 763 | + "meta.helm.sh/release-namespace=${NICO_FLOW_NAMESPACE}" --overwrite |
| 764 | + kubectl label certificate/flow-certificate -n "${NICO_FLOW_NAMESPACE}" \ |
| 765 | + "app.kubernetes.io/managed-by=Helm" --overwrite |
| 766 | + kubectl label certificate/temporal-client-certs -n "${NICO_FLOW_NAMESPACE}" \ |
| 767 | + "app.kubernetes.io/managed-by=Helm" --overwrite |
| 768 | + |
| 769 | + # Annotate/label the namespace itself — the flow-vault-tokens-job (nico-prereqs |
| 770 | + # helm hook) creates this namespace ahead of the flow release. Without Helm |
| 771 | + # ownership metadata, helm install refuses to adopt it. |
| 772 | + kubectl annotate namespace "${NICO_FLOW_NAMESPACE}" \ |
| 773 | + "meta.helm.sh/release-name=flow" \ |
| 774 | + "meta.helm.sh/release-namespace=${NICO_FLOW_NAMESPACE}" --overwrite |
| 775 | + kubectl label namespace "${NICO_FLOW_NAMESPACE}" \ |
| 776 | + "app.kubernetes.io/managed-by=Helm" --overwrite |
| 777 | + |
| 778 | + echo "Waiting for cert-manager to issue flow-certificate..." |
| 779 | + kubectl wait --for=condition=Ready certificate/flow-certificate \ |
| 780 | + -n "${NICO_FLOW_NAMESPACE}" --timeout=120s |
| 781 | + echo "Waiting for cert-manager to issue temporal-client-certs..." |
| 782 | + kubectl wait --for=condition=Ready certificate/temporal-client-certs \ |
| 783 | + -n "${NICO_FLOW_NAMESPACE}" --timeout=120s |
| 784 | + |
| 785 | + # Wait for the psm/nsm vault tokens and DB credential ESO syncs to land |
| 786 | + # (provisioned by helm-prereqs hooks; may still be in flight if nico-prereqs |
| 787 | + # was re-installed just before this phase). Fail-fast if any secret never |
| 788 | + # shows up — the alternative (silently falling through to helm install) is |
| 789 | + # 5 minutes of FailedMount-loop before helm gives up with an opaque message. |
| 790 | + _wait_for_secret() { |
| 791 | + local _name="$1" |
| 792 | + local _ns="$2" |
| 793 | + local _hint="$3" |
| 794 | + for _i in $(seq 1 24); do |
| 795 | + if kubectl get secret "${_name}" -n "${_ns}" >/dev/null 2>&1; then |
| 796 | + echo " ${_name} ready" |
| 797 | + return 0 |
| 798 | + fi |
| 799 | + echo " Waiting for ${_name} (${_i}/24)..." |
| 800 | + sleep 5 |
| 801 | + done |
| 802 | + echo "ERROR: Secret ${_name} did not appear in namespace ${_ns} within 120s." |
| 803 | + echo " ${_hint}" |
| 804 | + return 1 |
| 805 | + } |
| 806 | + |
| 807 | + echo "Waiting for psm/nsm Vault tokens..." |
| 808 | + for _s in psm-vault-token nsm-vault-token; do |
| 809 | + _wait_for_secret "${_s}" "${NICO_FLOW_NAMESPACE}" \ |
| 810 | + "Provisioned by the flow-vault-tokens helm hook in nico-prereqs. Check 'kubectl logs -n nico-system job/flow-vault-tokens' and confirm helm-prereqs/values.yaml::flow.enabled=true." |
| 811 | + done |
| 812 | + |
| 813 | + echo "Waiting for flow/psm/nsm DB credentials..." |
| 814 | + for _s in flow.nico.nico-pg-cluster.credentials \ |
| 815 | + psm.nico.nico-pg-cluster.credentials \ |
| 816 | + nsm.nico.nico-pg-cluster.credentials; do |
| 817 | + _wait_for_secret "${_s}" "${NICO_FLOW_NAMESPACE}" \ |
| 818 | + "Synced by the flow-db-eso/psm-db-eso/nsm-db-eso ClusterExternalSecrets in nico-prereqs. Check 'kubectl describe clusterexternalsecret -A | grep flow' and confirm helm-prereqs/values.yaml::flow.enabled=true." |
| 819 | + done |
| 820 | + |
| 821 | + echo "Installing flow helm chart..." |
| 822 | + helm upgrade --install flow "${NICO_FLOW_CHART}" \ |
| 823 | + "${NICO_FLOW_ARGS[@]}" \ |
| 824 | + --timeout 300s --wait |
| 825 | + echo "NICo Flow deployed" |
| 826 | +fi |
| 827 | + |
| 828 | +# --- 7i. NICo REST site-agent ------------------------------------------------- |
694 | 829 | # The site-agent is a separate chart from the main NICo REST umbrella. |
695 | 830 | # |
| 831 | +# Runs AFTER NICo Flow (7h) so that flow.flow.svc.cluster.local:50051 is |
| 832 | +# reachable when the site-agent starts its Flow gRPC connection. |
| 833 | +# |
696 | 834 | # Bootstrap order: |
697 | 835 | # 1. Create the per-site Temporal namespace BEFORE helm install so the |
698 | 836 | # site-agent never starts without it (starting without it causes an |
@@ -725,8 +863,8 @@ if [[ -n "${REGISTRY_PULL_SECRET:-}" ]]; then |
725 | 863 | ) |
726 | 864 | fi |
727 | 865 |
|
728 | | -_SETUP_PHASE="[7h/7] NICo REST site-agent" |
729 | | -echo "=== [7h/7] NICo REST site-agent (site UUID: ${NICO_SITE_UUID}) ===" |
| 866 | +_SETUP_PHASE="[7i/7] NICo REST site-agent" |
| 867 | +echo "=== [7i/7] NICo REST site-agent (site UUID: ${NICO_SITE_UUID}) ===" |
730 | 868 |
|
731 | 869 | # Pre-apply the Certificate resource so cert-manager issues the NICo gRPC client |
732 | 870 | # cert BEFORE the StatefulSet pod starts. Without this, there is a race: helm creates |
@@ -769,7 +907,7 @@ echo "Temporal namespace ready" |
769 | 907 | # FLOW_GRPC_ENABLED toggles the site-agent's Flow gRPC client (see |
770 | 908 | # carbide-rest/site-agent/pkg/components/config/config_manager.go — |
771 | 909 | # strings.ToLower(env)=="true"). Without it, site-agent never opens a |
772 | | -# connection to the Flow pod deployed in phase 7i. We default it ON when |
| 910 | +# connection to the Flow pod deployed in phase 7h. We default it ON when |
773 | 911 | # Flow itself is being deployed; users can flip it back via --set when |
774 | 912 | # pairing --skip-flow. |
775 | 913 | _FLOW_GRPC_ENABLED="true" |
@@ -815,139 +953,6 @@ if [ "${_CONNECTED}" = "false" ]; then |
815 | 953 | echo "Site-agent pod restarted — gRPC connection will be retried" |
816 | 954 | fi |
817 | 955 |
|
818 | | -# --- 7i. NICo Flow ------------------------------------------------------------ |
819 | | -# Flow is the rack lifecycle orchestrator (formerly RLA). Single pod with three |
820 | | -# containers — flow (50051), psm (50052), nsm (50053). Runs in its own `flow` |
821 | | -# namespace. |
822 | | -# |
823 | | -# Prerequisites already in place by this point: |
824 | | -# - flow/psm/nsm databases on nico-pg-cluster (helm-prereqs postgresql.yaml) |
825 | | -# - flow.nico/psm.nico/nsm.nico DB credentials synced via ESO into the flow |
826 | | -# namespace by the flow-db-eso / psm-db-eso / nsm-db-eso ClusterExternalSecrets |
827 | | -# - psm-vault-token and nsm-vault-token Secrets in the flow namespace |
828 | | -# (provisioned by the flow-vault-tokens post-install hook) |
829 | | -# - Temporal `flow` namespace (created in phase 7f above) |
830 | | -# - nico-rest-ca-issuer ClusterIssuer (installed by phase 7b — issues the |
831 | | -# temporal-client-certs) |
832 | | -# - vault-nico-issuer ClusterIssuer (issues the SPIFFE cert) |
833 | | -# |
834 | | -# Same pre-apply-cert dance as the site-agent: render the Certificate(s) ahead |
835 | | -# of the helm install so cert-manager has time to issue them and the pod doesn't |
836 | | -# hit a FailedMount race on the spiffe / temporal-client-certs secrets. |
837 | | -if "${SKIP_FLOW}"; then |
838 | | - echo "=== [7i/7] NICo Flow — skipped (--skip-flow) ===" |
839 | | - _SETUP_PHASE="complete" |
840 | | - exit 0 |
841 | | -fi |
842 | | -_SETUP_PHASE="[7i/7] NICo Flow" |
843 | | -echo "=== [7i/7] NICo Flow ===" |
844 | | - |
845 | | -NICO_FLOW_CHART="${SCRIPT_DIR}/../helm/charts/nico-flow" |
846 | | -NICO_FLOW_NAMESPACE="flow" |
847 | | - |
848 | | -NICO_FLOW_ARGS=( |
849 | | - --namespace "${NICO_FLOW_NAMESPACE}" |
850 | | - --create-namespace |
851 | | - --set "global.image.repository=${NICO_IMAGE_REGISTRY}" |
852 | | - ## Flow (nico-flow / nico-psm / nico-nsm) ships on the same image release |
853 | | - ## line as NICo REST — they're built and tagged together — so reuse |
854 | | - ## NICO_REST_IMAGE_TAG, not NICO_CORE_IMAGE_TAG (which is carbide-api). |
855 | | - --set "global.image.tag=${NICO_REST_IMAGE_TAG}" |
856 | | -) |
857 | | - |
858 | | -# Render the dockerconfigjson for the chart-managed image-pull-secret. Same |
859 | | -# pattern as the NICo REST common chart — keep the registry credential on |
860 | | -# the helm command line so the chart template can install it as a |
861 | | -# pre-install hook (pod can't pull from nvcr.io otherwise). |
862 | | -if [[ -n "${REGISTRY_PULL_SECRET:-}" ]]; then |
863 | | - _flow_registry_server="${NICO_IMAGE_REGISTRY%%/*}" |
864 | | - _flow_docker_cfg="$(printf '{"auths":{"%s":{"username":"%s","password":"%s"}}}' \ |
865 | | - "${_flow_registry_server}" \ |
866 | | - "${REGISTRY_PULL_USERNAME:-\$oauthtoken}" \ |
867 | | - "${REGISTRY_PULL_SECRET}" | base64 | tr -d '\n')" |
868 | | - NICO_FLOW_ARGS+=( |
869 | | - --set "global.imagePullSecrets[0].name=image-pull-secret" |
870 | | - --set "imagePullSecret.dockerconfigjson=${_flow_docker_cfg}" |
871 | | - ) |
872 | | -fi |
873 | | - |
874 | | -# Pre-apply Certificates so cert-manager can issue secrets before the pod schedules. |
875 | | -echo "Pre-applying flow Certificates (SPIFFE + Temporal client)..." |
876 | | -helm template flow "${NICO_FLOW_CHART}" \ |
877 | | - "${NICO_FLOW_ARGS[@]}" \ |
878 | | - --show-only templates/namespace.yaml | kubectl apply -f - |
879 | | -helm template flow "${NICO_FLOW_CHART}" \ |
880 | | - "${NICO_FLOW_ARGS[@]}" \ |
881 | | - --show-only templates/certificate.yaml | kubectl apply -f - |
882 | | -kubectl annotate certificate/flow-certificate -n "${NICO_FLOW_NAMESPACE}" \ |
883 | | - "meta.helm.sh/release-name=flow" \ |
884 | | - "meta.helm.sh/release-namespace=${NICO_FLOW_NAMESPACE}" --overwrite |
885 | | -kubectl annotate certificate/temporal-client-certs -n "${NICO_FLOW_NAMESPACE}" \ |
886 | | - "meta.helm.sh/release-name=flow" \ |
887 | | - "meta.helm.sh/release-namespace=${NICO_FLOW_NAMESPACE}" --overwrite |
888 | | -kubectl label certificate/flow-certificate -n "${NICO_FLOW_NAMESPACE}" \ |
889 | | - "app.kubernetes.io/managed-by=Helm" --overwrite |
890 | | -kubectl label certificate/temporal-client-certs -n "${NICO_FLOW_NAMESPACE}" \ |
891 | | - "app.kubernetes.io/managed-by=Helm" --overwrite |
892 | | - |
893 | | -# Annotate/label the namespace itself — the flow-vault-tokens-job (nico-prereqs |
894 | | -# helm hook) creates this namespace ahead of the flow release. Without Helm |
895 | | -# ownership metadata, helm install refuses to adopt it. |
896 | | -kubectl annotate namespace "${NICO_FLOW_NAMESPACE}" \ |
897 | | - "meta.helm.sh/release-name=flow" \ |
898 | | - "meta.helm.sh/release-namespace=${NICO_FLOW_NAMESPACE}" --overwrite |
899 | | -kubectl label namespace "${NICO_FLOW_NAMESPACE}" \ |
900 | | - "app.kubernetes.io/managed-by=Helm" --overwrite |
901 | | - |
902 | | -echo "Waiting for cert-manager to issue flow-certificate..." |
903 | | -kubectl wait --for=condition=Ready certificate/flow-certificate \ |
904 | | - -n "${NICO_FLOW_NAMESPACE}" --timeout=120s |
905 | | -echo "Waiting for cert-manager to issue temporal-client-certs..." |
906 | | -kubectl wait --for=condition=Ready certificate/temporal-client-certs \ |
907 | | - -n "${NICO_FLOW_NAMESPACE}" --timeout=120s |
908 | | - |
909 | | -# Wait for the psm/nsm vault tokens and DB credential ESO syncs to land |
910 | | -# (provisioned by helm-prereqs hooks; may still be in flight if nico-prereqs |
911 | | -# was re-installed just before this phase). Fail-fast if any secret never |
912 | | -# shows up — the alternative (silently falling through to helm install) is |
913 | | -# 5 minutes of FailedMount-loop before helm gives up with an opaque message. |
914 | | -_wait_for_secret() { |
915 | | - local _name="$1" |
916 | | - local _ns="$2" |
917 | | - local _hint="$3" |
918 | | - for _i in $(seq 1 24); do |
919 | | - if kubectl get secret "${_name}" -n "${_ns}" >/dev/null 2>&1; then |
920 | | - echo " ${_name} ready" |
921 | | - return 0 |
922 | | - fi |
923 | | - echo " Waiting for ${_name} (${_i}/24)..." |
924 | | - sleep 5 |
925 | | - done |
926 | | - echo "ERROR: Secret ${_name} did not appear in namespace ${_ns} within 120s." |
927 | | - echo " ${_hint}" |
928 | | - return 1 |
929 | | -} |
930 | | - |
931 | | -echo "Waiting for psm/nsm Vault tokens..." |
932 | | -for _s in psm-vault-token nsm-vault-token; do |
933 | | - _wait_for_secret "${_s}" "${NICO_FLOW_NAMESPACE}" \ |
934 | | - "Provisioned by the flow-vault-tokens helm hook in nico-prereqs. Check 'kubectl logs -n nico-system job/flow-vault-tokens' and confirm helm-prereqs/values.yaml::flow.enabled=true." |
935 | | -done |
936 | | - |
937 | | -echo "Waiting for flow/psm/nsm DB credentials..." |
938 | | -for _s in flow.nico.nico-pg-cluster.credentials \ |
939 | | - psm.nico.nico-pg-cluster.credentials \ |
940 | | - nsm.nico.nico-pg-cluster.credentials; do |
941 | | - _wait_for_secret "${_s}" "${NICO_FLOW_NAMESPACE}" \ |
942 | | - "Synced by the flow-db-eso/psm-db-eso/nsm-db-eso ClusterExternalSecrets in nico-prereqs. Check 'kubectl describe clusterexternalsecret -A | grep flow' and confirm helm-prereqs/values.yaml::flow.enabled=true." |
943 | | -done |
944 | | - |
945 | | -echo "Installing flow helm chart..." |
946 | | -helm upgrade --install flow "${NICO_FLOW_CHART}" \ |
947 | | - "${NICO_FLOW_ARGS[@]}" \ |
948 | | - --timeout 300s --wait |
949 | | -echo "NICo Flow deployed" |
950 | | - |
951 | 956 | echo "" |
952 | 957 | echo "=========================================================================" |
953 | 958 | echo " Setup complete" |
|
0 commit comments