diff --git a/src/connectedk8s/azext_connectedk8s/_constants.py b/src/connectedk8s/azext_connectedk8s/_constants.py index 9fc6432694a..39659d6547e 100644 --- a/src/connectedk8s/azext_connectedk8s/_constants.py +++ b/src/connectedk8s/azext_connectedk8s/_constants.py @@ -98,6 +98,15 @@ ) Custom_Token_Env_Var_Sub_Id_Missing_Fault_Type = "Required environment variable 'AZURE_SUBSCRIPTION_ID' is not set, when using Custom Acces Token." Release_Install_Namespace = "azure-arc-release" +Helm_Release_Name = "azure-arc" +Onboarding_PrivateKey_Secret_Name = "azure-arc-connect-privatekey" +Onboarding_PrivateKey_Secret_Data_Key = "privateKey" +Min_Agent_Version_For_Secret_Injection = "1.35.0" +Min_Agent_Version_For_Secret_Injection_Preview = "1.35.0-preview" +Stable_Release_Train = "stable" +Preview_Release_Train = "preview" +Inject_PrivateKey_Secret_Fault_Type = "inject-private-key-secret-error" +Strip_Chart_PrivateKey_Secret_Fault_Type = "strip-chart-private-key-secret-error" Workload_Identity_Release_Name = "wiextension" Workload_Identity_Release_Namespace = "arc-workload-identity" Helm_Environment_File_Fault_Type = "helm-environment-file-error" diff --git a/src/connectedk8s/azext_connectedk8s/_utils.py b/src/connectedk8s/azext_connectedk8s/_utils.py index 732b7bf3621..90dabe5e4d7 100644 --- a/src/connectedk8s/azext_connectedk8s/_utils.py +++ b/src/connectedk8s/azext_connectedk8s/_utils.py @@ -1266,6 +1266,196 @@ def cleanup_release_install_namespace_if_exists() -> None: ) +def should_use_secret_injection_flow( + release_train: str | None, agent_version: str | None +) -> bool: + """ + Determine whether to use the secure onboarding flow that pre-creates the + onboarding private key as a Kubernetes Secret (instead of passing it through + helm values). + + Older agents whose helm chart unconditionally renders the privatekey secret + from ``global.onboardingPrivateKey`` must keep using the legacy flow, + otherwise the helm release would re-render the secret with an empty + ``privateKey`` and leave the cluster stuck in a disconnected state. The + chart change that gates the privatekey secret on the helm value being set + ships in: + + * ``stable`` release train: agent version ``1.35.0`` and above. + * ``preview`` release train: agent version ``1.35.0-preview`` and above. + * any other release train (e.g. ``dev``): always use the secure flow. + * any agent version ending in ``-dev`` (e.g. ``0.2.5738-dev``) is treated + as a dev build and always uses the secure flow, regardless of the train + DP attributed it to. This handles the case where a developer overrides + ``HELMREGISTRY`` to a dev chart while DP still reports the original + ``stable``/``preview`` train. + + From those versions onward the chart only renders the privatekey secret + when ``global.onboardingPrivateKey`` is provided, so simply omitting the + helm value is sufficient to hand secret ownership to the kubectl-injected + resource. + """ + # Dev-suffixed agent versions always use the secure flow regardless of + if agent_version and agent_version.lower().endswith("-dev"): + return True + + effective_train = (release_train or consts.Stable_Release_Train).lower() + if effective_train == consts.Stable_Release_Train: + cutoff = consts.Min_Agent_Version_For_Secret_Injection + elif effective_train == consts.Preview_Release_Train: + cutoff = consts.Min_Agent_Version_For_Secret_Injection_Preview + else: + # If not dev, or stable/preview train use the legacy flow for safety + return False + + if not agent_version: + # Cannot determine version on a gated train. Be safe and use the legacy + # flow so that older agents aren't broken. + return False + try: + return version.parse(agent_version) >= version.parse(cutoff) + except Exception: # pylint: disable=broad-except + # If we can't parse the version, fall back to the legacy flow. + return False + + +def ensure_arc_namespace_with_helm_metadata() -> None: + """ + Ensure the ``azure-arc`` namespace exists and is annotated/labeled so that + the subsequent ``helm install`` can adopt it without erroring out with + "exists and cannot be imported into the current release". + """ + api_instance = kube_client.CoreV1Api() + helm_labels = {"app.kubernetes.io/managed-by": "Helm"} + helm_annotations = { + "meta.helm.sh/release-name": consts.Helm_Release_Name, + "meta.helm.sh/release-namespace": consts.Release_Install_Namespace, + } + + try: + existing_ns = api_instance.read_namespace(consts.Arc_Namespace) + except ApiException as ex: + if ex.status != 404: + kubernetes_exception_handler( + ex, + consts.Get_Kubernetes_Namespace_Fault_Type, + error_message=f"Unable to fetch namespace '{consts.Arc_Namespace}'", + summary=f"Unable to fetch namespace '{consts.Arc_Namespace}'", + ) + return + # Namespace does not exist, create it with the required metadata. + ns_body = kube_client.V1Namespace( + metadata=kube_client.V1ObjectMeta( + name=consts.Arc_Namespace, + labels=helm_labels, + annotations=helm_annotations, + ) + ) + try: + api_instance.create_namespace(ns_body) + except ApiException as create_ex: + kubernetes_exception_handler( + create_ex, + consts.Inject_PrivateKey_Secret_Fault_Type, + error_message=f"Unable to create namespace '{consts.Arc_Namespace}'", + summary=f"Unable to create namespace '{consts.Arc_Namespace}'", + ) + return + + # Namespace exists; merge in the Helm adoption metadata so helm can manage it. + metadata = existing_ns.metadata or kube_client.V1ObjectMeta() + labels = dict(metadata.labels or {}) + annotations = dict(metadata.annotations or {}) + labels.update(helm_labels) + annotations.update(helm_annotations) + patch_body = {"metadata": {"labels": labels, "annotations": annotations}} + try: + api_instance.patch_namespace(consts.Arc_Namespace, patch_body) + except ApiException as patch_ex: + kubernetes_exception_handler( + patch_ex, + consts.Inject_PrivateKey_Secret_Fault_Type, + error_message=( + f"Unable to patch namespace '{consts.Arc_Namespace}' with Helm " + "ownership metadata" + ), + summary=( + f"Unable to patch namespace '{consts.Arc_Namespace}' with Helm " + "ownership metadata" + ), + ) + + +def inject_onboarding_private_key_secret(private_key_pem: str) -> None: + """ + Pre-create the onboarding private key as a Kubernetes Secret so the agents + can consume it without ever exposing it through helm values. The namespace + and secret are annotated/labeled for Helm adoption so the chart can manage + them on subsequent upgrades. + + This MUST be called before ``helm install`` so that the cluster never sits + in a state where the private key is missing (which would leave it stuck in + a disconnected state since the Cluster Identity Operator depends on this + secret to fetch identity certificates from HIS). + """ + print( + f"Step: {get_utctimestring()}: Pre-creating onboarding private key " + f"secret '{consts.Onboarding_PrivateKey_Secret_Name}' in namespace " + f"'{consts.Arc_Namespace}'." + ) + ensure_arc_namespace_with_helm_metadata() + + api_instance = kube_client.CoreV1Api() + secret_body = kube_client.V1Secret( + metadata=kube_client.V1ObjectMeta( + name=consts.Onboarding_PrivateKey_Secret_Name, + namespace=consts.Arc_Namespace, + labels={"app.kubernetes.io/managed-by": "Helm"}, + annotations={ + "meta.helm.sh/release-name": consts.Helm_Release_Name, + "meta.helm.sh/release-namespace": consts.Release_Install_Namespace, + }, + ), + type="Opaque", + string_data={consts.Onboarding_PrivateKey_Secret_Data_Key: private_key_pem}, + ) + + try: + api_instance.create_namespaced_secret(consts.Arc_Namespace, secret_body) + except ApiException as ex: + if ex.status != 409: + kubernetes_exception_handler( + ex, + consts.Inject_PrivateKey_Secret_Fault_Type, + error_message=( + "Unable to create onboarding private key secret " + f"'{consts.Onboarding_PrivateKey_Secret_Name}' in namespace " + f"'{consts.Arc_Namespace}'" + ), + summary="Unable to create onboarding private key secret", + ) + return + # Secret already exists - replace its contents + # so the cluster always uses a private key matching the public key in ARM + try: + api_instance.replace_namespaced_secret( + consts.Onboarding_PrivateKey_Secret_Name, + consts.Arc_Namespace, + secret_body, + ) + except ApiException as replace_ex: + kubernetes_exception_handler( + replace_ex, + consts.Inject_PrivateKey_Secret_Fault_Type, + error_message=( + "Unable to update existing onboarding private key secret " + f"'{consts.Onboarding_PrivateKey_Secret_Name}' in namespace " + f"'{consts.Arc_Namespace}'" + ), + summary="Unable to update onboarding private key secret", + ) + + # DO NOT use this method for re-put scenarios. This method involves new NS creation for helm release. For re-put scenarios, brownfield scenario needs to be handled where helm release still stays in default NS def helm_install_release( resource_manager: str, @@ -1288,6 +1478,7 @@ def helm_install_release( registry_path: str, aad_identity_principal_id: str | None, onboarding_timeout: str = consts.DEFAULT_MAX_ONBOARDING_TIMEOUT_HELMVALUE_SECONDS, + inject_private_key_via_helm: bool = True, ) -> None: cmd_helm_install = [ helm_client_location, @@ -1299,20 +1490,29 @@ def helm_install_release( f"global.kubernetesDistro={kubernetes_distro}", "--set", f"global.kubernetesInfra={kubernetes_infra}", - "--set", - f"global.onboardingPrivateKey={private_key_pem}", - "--set", - "systemDefaultValues.spnOnboarding=false", - "--set", - f"global.azureEnvironment={cloud_name}", - "--set", - "systemDefaultValues.clusterconnect-agent.enabled=true", - "--namespace", - f"{consts.Release_Install_Namespace}", - "--create-namespace", - "--output", - "json", ] + # Pass the onboarding private key through helm values only for older + # (stable < 1.35.0) agents. Newer agents have already received the key via + # a pre-created Kubernetes Secret so it never appears in helm values. + if inject_private_key_via_helm: + cmd_helm_install.extend( + ["--set", f"global.onboardingPrivateKey={private_key_pem}"] + ) + cmd_helm_install.extend( + [ + "--set", + "systemDefaultValues.spnOnboarding=false", + "--set", + f"global.azureEnvironment={cloud_name}", + "--set", + "systemDefaultValues.clusterconnect-agent.enabled=true", + "--namespace", + f"{consts.Release_Install_Namespace}", + "--create-namespace", + "--output", + "json", + ] + ) # Special configurations from 2022-09-01 ARM metadata. # "dataplaneEndpoints" does not appear in arm_metadata for public and AGC diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index d1dc4b2ac87..c4cb705a07d 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -1018,6 +1018,44 @@ def create_connectedk8s( print( f"Step: {utils.get_utctimestring()}: Starting to install Azure arc agents on the Kubernetes cluster." ) + + # Decide which onboarding flow to use. Stable agents below 1.35.0 still need + # the legacy flow (private key in helm values), because their helm chart + # always renders the privatekey secret from helm values and would zero it + # out on install otherwise. Newer agents (and any non-stable build) get the + # secure flow: we pre-create the namespace + secret directly via the + # Kubernetes API so the private key never appears in helm values. + use_secret_injection_flow = utils.should_use_secret_injection_flow( + release_train, azure_arc_agent_version + ) + telemetry.add_extension_event( + "connectedk8s", + { + "Context.Default.AzureCLI.OnboardingFlow": ( + "secret-injection" + if use_secret_injection_flow + else "helm-values-legacy" + ) + }, + ) + + if use_secret_injection_flow: + # Inject the private key BEFORE running helm so that the cluster always + # has the onboarding secret available - even if the subsequent helm + # install/CLI is interrupted - preventing a stuck-disconnected state. + try: + utils.inject_onboarding_private_key_secret(private_key_pem) + except Exception as e: + telemetry.set_exception( + exception=e, + fault_type=consts.Inject_PrivateKey_Secret_Fault_Type, + summary="Failed to pre-create onboarding private key secret", + ) + raise CLIInternalError( + "Failed to pre-create onboarding private key secret on the " + f"Kubernetes cluster: {e}" + ) + # Install azure-arc agents utils.helm_install_release( cmd.cli_ctx.cloud.endpoints.resource_manager, @@ -1040,6 +1078,7 @@ def create_connectedk8s( registry_path, aad_identity_principal_id, onboarding_timeout, + inject_private_key_via_helm=not use_secret_injection_flow, ) # Long Running Operation for Agent State diff --git a/src/connectedk8s/azext_connectedk8s/tests/unittests/test_utils_.py b/src/connectedk8s/azext_connectedk8s/tests/unittests/test_utils_.py index 32d1da1e3b4..3d1594705b2 100644 --- a/src/connectedk8s/azext_connectedk8s/tests/unittests/test_utils_.py +++ b/src/connectedk8s/azext_connectedk8s/tests/unittests/test_utils_.py @@ -14,6 +14,7 @@ redact_sensitive_fields_from_string, remove_rsa_private_key, scrub_proxy_url, + should_use_secret_injection_flow, ) @@ -99,5 +100,43 @@ def test_get_mcr_path(): assert get_mcr_path(input_active_directory) == expected_output +@pytest.mark.parametrize( + "release_train,agent_version,expected", + [ + # Stable train, agents older than 1.35.0 must use the legacy flow + # (helm value injection) to avoid zeroing out the secret. + ("stable", "1.34.9", False), + ("stable", "1.20.0", False), + ("STABLE", "1.14.0", False), + # Stable train at or above the cutoff uses the secure flow. + ("stable", "1.35.0", True), + ("stable", "1.36.2", True), + ("stable", "2.0.0", True), + # Preview train uses 1.35.0-preview as the cutoff (same scheme). + ("preview", "1.34.0", False), + ("preview", "1.35.0-preview", True), + ("preview", "1.36.0-preview", True), + ("PREVIEW", "1.20.0", False), + # Dev-suffixed agent versions always use the secure flow, regardless of + ("preview", "0.2.5738-dev", True), + ("stable", "0.2.6689-dev", True), + ("STABLE", "1.34.0-DEV", True), + (None, "0.2.5738-dev", True), + # Missing version on a gated train -> safe default (legacy flow). + ("stable", None, False), + ("preview", "", False), + # Missing release train defaults to "stable". + (None, "1.34.0", False), + (None, "1.35.0", True), + # Unparseable version on a gated train -> safe default (legacy flow). + ("stable", "not-a-version", False), + ], +) +def test_should_use_secret_injection_flow(release_train, agent_version, expected): + assert ( + should_use_secret_injection_flow(release_train, agent_version) is expected + ) + + if __name__ == "__main__": pytest.main()