|
59 | 59 | # ./setup.sh -y # skip all prompts, deploy everything automatically |
60 | 60 | # ./setup.sh --skip-core # skip Phase 6 NICo Core (print command, deploy manually) |
61 | 61 | # ./setup.sh --skip-rest # skip Phase 7 NICo REST entirely (no repo needed) |
62 | | -# ./setup.sh --skip-flow # skip Phase 7i NICo Flow (REST still installs) |
| 62 | +# ./setup.sh --skip-flow # skip Phase 7h NICo Flow (REST still installs) |
63 | 63 | # # pair with helm-prereqs/values.yaml::flow.enabled=false |
64 | 64 | # # to skip Flow prereqs (DBs / ESO / vault tokens) too |
65 | 65 | # ./setup.sh --skip-core --skip-rest # fully non-interactive infra-only run |
@@ -759,9 +759,147 @@ else |
759 | 759 | exit 0 |
760 | 760 | fi |
761 | 761 |
|
762 | | -# --- 7h. NICo REST site-agent ------------------------------------------------- |
| 762 | +# --- 7h. NICo Flow ------------------------------------------------------------ |
| 763 | +# Flow is the rack lifecycle orchestrator (formerly RLA). Single pod with three |
| 764 | +# containers — flow (50051), psm (50052), nsm (50053). Runs in its own `flow` |
| 765 | +# namespace. |
| 766 | +# |
| 767 | +# Runs BEFORE the site-agent (7i) so that flow.flow.svc.cluster.local:50051 |
| 768 | +# exists when the site-agent starts and attempts its Flow gRPC connection. |
| 769 | +# |
| 770 | +# Prerequisites already in place by this point: |
| 771 | +# - flow/psm/nsm databases on nico-pg-cluster (helm-prereqs postgresql.yaml) |
| 772 | +# - flow.nico/psm.nico/nsm.nico DB credentials synced via ESO into the flow |
| 773 | +# namespace by the flow-db-eso / psm-db-eso / nsm-db-eso ClusterExternalSecrets |
| 774 | +# - psm-vault-token and nsm-vault-token Secrets in the flow namespace |
| 775 | +# (provisioned by the flow-vault-tokens post-install hook) |
| 776 | +# - Temporal `flow` namespace (created in phase 7f above) |
| 777 | +# - nico-rest-ca-issuer ClusterIssuer (installed by phase 7b — issues the |
| 778 | +# temporal-client-certs) |
| 779 | +# - vault-nico-issuer ClusterIssuer (issues the SPIFFE cert) |
| 780 | +# |
| 781 | +# Same pre-apply-cert dance as the site-agent: render the Certificate(s) ahead |
| 782 | +# of the helm install so cert-manager has time to issue them and the pod doesn't |
| 783 | +# hit a FailedMount race on the spiffe / temporal-client-certs secrets. |
| 784 | +if "${SKIP_FLOW}"; then |
| 785 | + echo "=== [7h/7] NICo Flow — skipped (--skip-flow) ===" |
| 786 | +else |
| 787 | + _SETUP_PHASE="[7h/7] NICo Flow" |
| 788 | + echo "=== [7h/7] NICo Flow ===" |
| 789 | + |
| 790 | + NICO_FLOW_CHART="${SCRIPT_DIR}/../helm/charts/nico-flow" |
| 791 | + NICO_FLOW_NAMESPACE="flow" |
| 792 | + |
| 793 | + NICO_FLOW_ARGS=( |
| 794 | + --namespace "${NICO_FLOW_NAMESPACE}" |
| 795 | + --create-namespace |
| 796 | + --set "global.image.repository=${NICO_IMAGE_REGISTRY}" |
| 797 | + ## Flow (nico-flow / nico-psm / nico-nsm) ships on the same image release |
| 798 | + ## line as NICo REST — they're built and tagged together — so reuse |
| 799 | + ## NICO_REST_IMAGE_TAG, not NICO_CORE_IMAGE_TAG (which is carbide-api). |
| 800 | + --set "global.image.tag=${NICO_REST_IMAGE_TAG}" |
| 801 | + ) |
| 802 | + |
| 803 | + # Render the dockerconfigjson for the chart-managed image-pull-secret. Same |
| 804 | + # pattern as the NICo REST common chart — keep the registry credential on |
| 805 | + # the helm command line so the chart template can install it as a |
| 806 | + # pre-install hook (pod can't pull from nvcr.io otherwise). |
| 807 | + if [[ -n "${REGISTRY_PULL_SECRET:-}" ]]; then |
| 808 | + _flow_registry_server="${NICO_IMAGE_REGISTRY%%/*}" |
| 809 | + _flow_docker_cfg="$(printf '{"auths":{"%s":{"username":"%s","password":"%s"}}}' \ |
| 810 | + "${_flow_registry_server}" \ |
| 811 | + "${REGISTRY_PULL_USERNAME:-\$oauthtoken}" \ |
| 812 | + "${REGISTRY_PULL_SECRET}" | base64 | tr -d '\n')" |
| 813 | + NICO_FLOW_ARGS+=( |
| 814 | + --set "global.imagePullSecrets[0].name=image-pull-secret" |
| 815 | + --set "imagePullSecret.dockerconfigjson=${_flow_docker_cfg}" |
| 816 | + ) |
| 817 | + fi |
| 818 | + |
| 819 | + # Pre-apply Certificates so cert-manager can issue secrets before the pod schedules. |
| 820 | + echo "Pre-applying flow Certificates (SPIFFE + Temporal client)..." |
| 821 | + helm template flow "${NICO_FLOW_CHART}" \ |
| 822 | + "${NICO_FLOW_ARGS[@]}" \ |
| 823 | + --show-only templates/namespace.yaml | kubectl apply -f - |
| 824 | + helm template flow "${NICO_FLOW_CHART}" \ |
| 825 | + "${NICO_FLOW_ARGS[@]}" \ |
| 826 | + --show-only templates/certificate.yaml | kubectl apply -f - |
| 827 | + kubectl annotate certificate/flow-certificate -n "${NICO_FLOW_NAMESPACE}" \ |
| 828 | + "meta.helm.sh/release-name=flow" \ |
| 829 | + "meta.helm.sh/release-namespace=${NICO_FLOW_NAMESPACE}" --overwrite |
| 830 | + kubectl annotate certificate/temporal-client-certs -n "${NICO_FLOW_NAMESPACE}" \ |
| 831 | + "meta.helm.sh/release-name=flow" \ |
| 832 | + "meta.helm.sh/release-namespace=${NICO_FLOW_NAMESPACE}" --overwrite |
| 833 | + kubectl label certificate/flow-certificate -n "${NICO_FLOW_NAMESPACE}" \ |
| 834 | + "app.kubernetes.io/managed-by=Helm" --overwrite |
| 835 | + kubectl label certificate/temporal-client-certs -n "${NICO_FLOW_NAMESPACE}" \ |
| 836 | + "app.kubernetes.io/managed-by=Helm" --overwrite |
| 837 | + |
| 838 | + # Annotate/label the namespace itself — the flow-vault-tokens-job (nico-prereqs |
| 839 | + # helm hook) creates this namespace ahead of the flow release. Without Helm |
| 840 | + # ownership metadata, helm install refuses to adopt it. |
| 841 | + kubectl annotate namespace "${NICO_FLOW_NAMESPACE}" \ |
| 842 | + "meta.helm.sh/release-name=flow" \ |
| 843 | + "meta.helm.sh/release-namespace=${NICO_FLOW_NAMESPACE}" --overwrite |
| 844 | + kubectl label namespace "${NICO_FLOW_NAMESPACE}" \ |
| 845 | + "app.kubernetes.io/managed-by=Helm" --overwrite |
| 846 | + |
| 847 | + echo "Waiting for cert-manager to issue flow-certificate..." |
| 848 | + kubectl wait --for=condition=Ready certificate/flow-certificate \ |
| 849 | + -n "${NICO_FLOW_NAMESPACE}" --timeout=120s |
| 850 | + echo "Waiting for cert-manager to issue temporal-client-certs..." |
| 851 | + kubectl wait --for=condition=Ready certificate/temporal-client-certs \ |
| 852 | + -n "${NICO_FLOW_NAMESPACE}" --timeout=120s |
| 853 | + |
| 854 | + # Wait for the psm/nsm vault tokens and DB credential ESO syncs to land |
| 855 | + # (provisioned by helm-prereqs hooks; may still be in flight if nico-prereqs |
| 856 | + # was re-installed just before this phase). Fail-fast if any secret never |
| 857 | + # shows up — the alternative (silently falling through to helm install) is |
| 858 | + # 5 minutes of FailedMount-loop before helm gives up with an opaque message. |
| 859 | + _wait_for_secret() { |
| 860 | + local _name="$1" |
| 861 | + local _ns="$2" |
| 862 | + local _hint="$3" |
| 863 | + for _i in $(seq 1 24); do |
| 864 | + if kubectl get secret "${_name}" -n "${_ns}" >/dev/null 2>&1; then |
| 865 | + echo " ${_name} ready" |
| 866 | + return 0 |
| 867 | + fi |
| 868 | + echo " Waiting for ${_name} (${_i}/24)..." |
| 869 | + sleep 5 |
| 870 | + done |
| 871 | + echo "ERROR: Secret ${_name} did not appear in namespace ${_ns} within 120s." |
| 872 | + echo " ${_hint}" |
| 873 | + return 1 |
| 874 | + } |
| 875 | + |
| 876 | + echo "Waiting for psm/nsm Vault tokens..." |
| 877 | + for _s in psm-vault-token nsm-vault-token; do |
| 878 | + _wait_for_secret "${_s}" "${NICO_FLOW_NAMESPACE}" \ |
| 879 | + "Provisioned by the flow-vault-tokens helm hook in nico-prereqs. Check 'kubectl logs -n nico-system job/flow-vault-tokens' and confirm helm-prereqs/values.yaml::flow.enabled=true." |
| 880 | + done |
| 881 | + |
| 882 | + echo "Waiting for flow/psm/nsm DB credentials..." |
| 883 | + for _s in flow.nico.nico-pg-cluster.credentials \ |
| 884 | + psm.nico.nico-pg-cluster.credentials \ |
| 885 | + nsm.nico.nico-pg-cluster.credentials; do |
| 886 | + _wait_for_secret "${_s}" "${NICO_FLOW_NAMESPACE}" \ |
| 887 | + "Synced by the flow-db-eso/psm-db-eso/nsm-db-eso ClusterExternalSecrets in nico-prereqs. Check 'kubectl describe clusterexternalsecret -A | grep flow' and confirm helm-prereqs/values.yaml::flow.enabled=true." |
| 888 | + done |
| 889 | + |
| 890 | + echo "Installing flow helm chart..." |
| 891 | + helm upgrade --install flow "${NICO_FLOW_CHART}" \ |
| 892 | + "${NICO_FLOW_ARGS[@]}" \ |
| 893 | + --timeout 300s --wait |
| 894 | + echo "NICo Flow deployed" |
| 895 | +fi |
| 896 | + |
| 897 | +# --- 7i. NICo REST site-agent ------------------------------------------------- |
763 | 898 | # The site-agent is a separate chart from the main NICo REST umbrella. |
764 | 899 | # |
| 900 | +# Runs AFTER NICo Flow (7h) so that flow.flow.svc.cluster.local:50051 is |
| 901 | +# reachable when the site-agent starts its Flow gRPC connection. |
| 902 | +# |
765 | 903 | # Bootstrap order: |
766 | 904 | # 1. Create the per-site Temporal namespace BEFORE helm install so the |
767 | 905 | # site-agent never starts without it (starting without it causes an |
@@ -799,8 +937,8 @@ if [[ -n "${REGISTRY_PULL_SECRET:-}" ]]; then |
799 | 937 | ) |
800 | 938 | fi |
801 | 939 |
|
802 | | -_SETUP_PHASE="[7h/7] NICo REST site-agent" |
803 | | -echo "=== [7h/7] NICo REST site-agent (site UUID: ${NICO_SITE_UUID}) ===" |
| 940 | +_SETUP_PHASE="[7i/7] NICo REST site-agent" |
| 941 | +echo "=== [7i/7] NICo REST site-agent (site UUID: ${NICO_SITE_UUID}) ===" |
804 | 942 |
|
805 | 943 | # Pre-apply the Certificate resource so cert-manager issues the NICo gRPC client |
806 | 944 | # cert BEFORE the StatefulSet pod starts. Without this, there is a race: helm creates |
@@ -843,7 +981,7 @@ echo "Temporal namespace ready" |
843 | 981 | # FLOW_GRPC_ENABLED toggles the site-agent's Flow gRPC client (see |
844 | 982 | # carbide-rest/site-agent/pkg/components/config/config_manager.go — |
845 | 983 | # strings.ToLower(env)=="true"). Without it, site-agent never opens a |
846 | | -# connection to the Flow pod deployed in phase 7i. We default it ON when |
| 984 | +# connection to the Flow pod deployed in phase 7h. We default it ON when |
847 | 985 | # Flow itself is being deployed; users can flip it back via --set when |
848 | 986 | # pairing --skip-flow. |
849 | 987 | _FLOW_GRPC_ENABLED="true" |
@@ -889,139 +1027,6 @@ if [ "${_CONNECTED}" = "false" ]; then |
889 | 1027 | echo "Site-agent pod restarted — gRPC connection will be retried" |
890 | 1028 | fi |
891 | 1029 |
|
892 | | -# --- 7i. NICo Flow ------------------------------------------------------------ |
893 | | -# Flow is the rack lifecycle orchestrator (formerly RLA). Single pod with three |
894 | | -# containers — flow (50051), psm (50052), nsm (50053). Runs in its own `flow` |
895 | | -# namespace. |
896 | | -# |
897 | | -# Prerequisites already in place by this point: |
898 | | -# - flow/psm/nsm databases on nico-pg-cluster (helm-prereqs postgresql.yaml) |
899 | | -# - flow.nico/psm.nico/nsm.nico DB credentials synced via ESO into the flow |
900 | | -# namespace by the flow-db-eso / psm-db-eso / nsm-db-eso ClusterExternalSecrets |
901 | | -# - psm-vault-token and nsm-vault-token Secrets in the flow namespace |
902 | | -# (provisioned by the flow-vault-tokens post-install hook) |
903 | | -# - Temporal `flow` namespace (created in phase 7f above) |
904 | | -# - nico-rest-ca-issuer ClusterIssuer (installed by phase 7b — issues the |
905 | | -# temporal-client-certs) |
906 | | -# - vault-nico-issuer ClusterIssuer (issues the SPIFFE cert) |
907 | | -# |
908 | | -# Same pre-apply-cert dance as the site-agent: render the Certificate(s) ahead |
909 | | -# of the helm install so cert-manager has time to issue them and the pod doesn't |
910 | | -# hit a FailedMount race on the spiffe / temporal-client-certs secrets. |
911 | | -if "${SKIP_FLOW}"; then |
912 | | - echo "=== [7i/7] NICo Flow — skipped (--skip-flow) ===" |
913 | | - _SETUP_PHASE="complete" |
914 | | - exit 0 |
915 | | -fi |
916 | | -_SETUP_PHASE="[7i/7] NICo Flow" |
917 | | -echo "=== [7i/7] NICo Flow ===" |
918 | | - |
919 | | -NICO_FLOW_CHART="${SCRIPT_DIR}/../helm/charts/nico-flow" |
920 | | -NICO_FLOW_NAMESPACE="flow" |
921 | | - |
922 | | -NICO_FLOW_ARGS=( |
923 | | - --namespace "${NICO_FLOW_NAMESPACE}" |
924 | | - --create-namespace |
925 | | - --set "global.image.repository=${NICO_IMAGE_REGISTRY}" |
926 | | - ## Flow (nico-flow / nico-psm / nico-nsm) ships on the same image release |
927 | | - ## line as NICo REST — they're built and tagged together — so reuse |
928 | | - ## NICO_REST_IMAGE_TAG, not NICO_CORE_IMAGE_TAG (which is carbide-api). |
929 | | - --set "global.image.tag=${NICO_REST_IMAGE_TAG}" |
930 | | -) |
931 | | - |
932 | | -# Render the dockerconfigjson for the chart-managed image-pull-secret. Same |
933 | | -# pattern as the NICo REST common chart — keep the registry credential on |
934 | | -# the helm command line so the chart template can install it as a |
935 | | -# pre-install hook (pod can't pull from nvcr.io otherwise). |
936 | | -if [[ -n "${REGISTRY_PULL_SECRET:-}" ]]; then |
937 | | - _flow_registry_server="${NICO_IMAGE_REGISTRY%%/*}" |
938 | | - _flow_docker_cfg="$(printf '{"auths":{"%s":{"username":"%s","password":"%s"}}}' \ |
939 | | - "${_flow_registry_server}" \ |
940 | | - "${REGISTRY_PULL_USERNAME:-\$oauthtoken}" \ |
941 | | - "${REGISTRY_PULL_SECRET}" | base64 | tr -d '\n')" |
942 | | - NICO_FLOW_ARGS+=( |
943 | | - --set "global.imagePullSecrets[0].name=image-pull-secret" |
944 | | - --set "imagePullSecret.dockerconfigjson=${_flow_docker_cfg}" |
945 | | - ) |
946 | | -fi |
947 | | - |
948 | | -# Pre-apply Certificates so cert-manager can issue secrets before the pod schedules. |
949 | | -echo "Pre-applying flow Certificates (SPIFFE + Temporal client)..." |
950 | | -helm template flow "${NICO_FLOW_CHART}" \ |
951 | | - "${NICO_FLOW_ARGS[@]}" \ |
952 | | - --show-only templates/namespace.yaml | kubectl apply -f - |
953 | | -helm template flow "${NICO_FLOW_CHART}" \ |
954 | | - "${NICO_FLOW_ARGS[@]}" \ |
955 | | - --show-only templates/certificate.yaml | kubectl apply -f - |
956 | | -kubectl annotate certificate/flow-certificate -n "${NICO_FLOW_NAMESPACE}" \ |
957 | | - "meta.helm.sh/release-name=flow" \ |
958 | | - "meta.helm.sh/release-namespace=${NICO_FLOW_NAMESPACE}" --overwrite |
959 | | -kubectl annotate certificate/temporal-client-certs -n "${NICO_FLOW_NAMESPACE}" \ |
960 | | - "meta.helm.sh/release-name=flow" \ |
961 | | - "meta.helm.sh/release-namespace=${NICO_FLOW_NAMESPACE}" --overwrite |
962 | | -kubectl label certificate/flow-certificate -n "${NICO_FLOW_NAMESPACE}" \ |
963 | | - "app.kubernetes.io/managed-by=Helm" --overwrite |
964 | | -kubectl label certificate/temporal-client-certs -n "${NICO_FLOW_NAMESPACE}" \ |
965 | | - "app.kubernetes.io/managed-by=Helm" --overwrite |
966 | | - |
967 | | -# Annotate/label the namespace itself — the flow-vault-tokens-job (nico-prereqs |
968 | | -# helm hook) creates this namespace ahead of the flow release. Without Helm |
969 | | -# ownership metadata, helm install refuses to adopt it. |
970 | | -kubectl annotate namespace "${NICO_FLOW_NAMESPACE}" \ |
971 | | - "meta.helm.sh/release-name=flow" \ |
972 | | - "meta.helm.sh/release-namespace=${NICO_FLOW_NAMESPACE}" --overwrite |
973 | | -kubectl label namespace "${NICO_FLOW_NAMESPACE}" \ |
974 | | - "app.kubernetes.io/managed-by=Helm" --overwrite |
975 | | - |
976 | | -echo "Waiting for cert-manager to issue flow-certificate..." |
977 | | -kubectl wait --for=condition=Ready certificate/flow-certificate \ |
978 | | - -n "${NICO_FLOW_NAMESPACE}" --timeout=120s |
979 | | -echo "Waiting for cert-manager to issue temporal-client-certs..." |
980 | | -kubectl wait --for=condition=Ready certificate/temporal-client-certs \ |
981 | | - -n "${NICO_FLOW_NAMESPACE}" --timeout=120s |
982 | | - |
983 | | -# Wait for the psm/nsm vault tokens and DB credential ESO syncs to land |
984 | | -# (provisioned by helm-prereqs hooks; may still be in flight if nico-prereqs |
985 | | -# was re-installed just before this phase). Fail-fast if any secret never |
986 | | -# shows up — the alternative (silently falling through to helm install) is |
987 | | -# 5 minutes of FailedMount-loop before helm gives up with an opaque message. |
988 | | -_wait_for_secret() { |
989 | | - local _name="$1" |
990 | | - local _ns="$2" |
991 | | - local _hint="$3" |
992 | | - for _i in $(seq 1 24); do |
993 | | - if kubectl get secret "${_name}" -n "${_ns}" >/dev/null 2>&1; then |
994 | | - echo " ${_name} ready" |
995 | | - return 0 |
996 | | - fi |
997 | | - echo " Waiting for ${_name} (${_i}/24)..." |
998 | | - sleep 5 |
999 | | - done |
1000 | | - echo "ERROR: Secret ${_name} did not appear in namespace ${_ns} within 120s." |
1001 | | - echo " ${_hint}" |
1002 | | - return 1 |
1003 | | -} |
1004 | | - |
1005 | | -echo "Waiting for psm/nsm Vault tokens..." |
1006 | | -for _s in psm-vault-token nsm-vault-token; do |
1007 | | - _wait_for_secret "${_s}" "${NICO_FLOW_NAMESPACE}" \ |
1008 | | - "Provisioned by the flow-vault-tokens helm hook in nico-prereqs. Check 'kubectl logs -n nico-system job/flow-vault-tokens' and confirm helm-prereqs/values.yaml::flow.enabled=true." |
1009 | | -done |
1010 | | - |
1011 | | -echo "Waiting for flow/psm/nsm DB credentials..." |
1012 | | -for _s in flow.nico.nico-pg-cluster.credentials \ |
1013 | | - psm.nico.nico-pg-cluster.credentials \ |
1014 | | - nsm.nico.nico-pg-cluster.credentials; do |
1015 | | - _wait_for_secret "${_s}" "${NICO_FLOW_NAMESPACE}" \ |
1016 | | - "Synced by the flow-db-eso/psm-db-eso/nsm-db-eso ClusterExternalSecrets in nico-prereqs. Check 'kubectl describe clusterexternalsecret -A | grep flow' and confirm helm-prereqs/values.yaml::flow.enabled=true." |
1017 | | -done |
1018 | | - |
1019 | | -echo "Installing flow helm chart..." |
1020 | | -helm upgrade --install flow "${NICO_FLOW_CHART}" \ |
1021 | | - "${NICO_FLOW_ARGS[@]}" \ |
1022 | | - --timeout 300s --wait |
1023 | | -echo "NICo Flow deployed" |
1024 | | - |
1025 | 1030 | echo "" |
1026 | 1031 | echo "=========================================================================" |
1027 | 1032 | echo " Setup complete" |
|
0 commit comments