Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions src/connectedk8s/azext_connectedk8s/_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,15 @@
)
Custom_Token_Env_Var_Sub_Id_Missing_Fault_Type = "Required environment variable 'AZURE_SUBSCRIPTION_ID' is not set, when using Custom Acces Token."
# Namespace that holds the helm release record for the Arc agents chart
# (passed to ``helm install --namespace``).
Release_Install_Namespace = "azure-arc-release"
# Helm release name; written into the ``meta.helm.sh/release-name`` annotation
# of resources that are pre-created for the release to adopt.
Helm_Release_Name = "azure-arc"
# Name of the Kubernetes Secret carrying the onboarding private key, and the
# data key under which the PEM is stored inside that Secret.
Onboarding_PrivateKey_Secret_Name = "azure-arc-connect-privatekey"
Onboarding_PrivateKey_Secret_Data_Key = "privateKey"
# Minimum agent versions (inclusive) from which the private key is injected as
# a pre-created Secret instead of a helm value, for the stable and preview
# release trains respectively.
Min_Agent_Version_For_Secret_Injection = "1.35.0"
Min_Agent_Version_For_Secret_Injection_Preview = "1.35.0-preview"
# Release train identifiers, compared in lower case.
Stable_Release_Train = "stable"
Preview_Release_Train = "preview"
# Telemetry fault types for the private key secret handling.
Inject_PrivateKey_Secret_Fault_Type = "inject-private-key-secret-error"
Strip_Chart_PrivateKey_Secret_Fault_Type = "strip-chart-private-key-secret-error"
Workload_Identity_Release_Name = "wiextension"
Workload_Identity_Release_Namespace = "arc-workload-identity"
Helm_Environment_File_Fault_Type = "helm-environment-file-error"
Expand Down
226 changes: 213 additions & 13 deletions src/connectedk8s/azext_connectedk8s/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1266,6 +1266,196 @@ def cleanup_release_install_namespace_if_exists() -> None:
)


def should_use_secret_injection_flow(
release_train: str | None, agent_version: str | None
) -> bool:
"""
Determine whether to use the secure onboarding flow that pre-creates the
onboarding private key as a Kubernetes Secret (instead of passing it through
helm values).

Older agents whose helm chart unconditionally renders the privatekey secret
from ``global.onboardingPrivateKey`` must keep using the legacy flow,
otherwise the helm release would re-render the secret with an empty
``privateKey`` and leave the cluster stuck in a disconnected state. The
chart change that gates the privatekey secret on the helm value being set
ships in:

* ``stable`` release train: agent version ``1.35.0`` and above.
* ``preview`` release train: agent version ``1.35.0-preview`` and above.
* any other release train (e.g. ``dev``): always use the secure flow.
* any agent version ending in ``-dev`` (e.g. ``0.2.5738-dev``) is treated
as a dev build and always uses the secure flow, regardless of the train
DP attributed it to. This handles the case where a developer overrides
``HELMREGISTRY`` to a dev chart while DP still reports the original
``stable``/``preview`` train.

From those versions onward the chart only renders the privatekey secret
when ``global.onboardingPrivateKey`` is provided, so simply omitting the
helm value is sufficient to hand secret ownership to the kubectl-injected
resource.
"""
# Dev-suffixed agent versions always use the secure flow regardless of
if agent_version and agent_version.lower().endswith("-dev"):
return True

effective_train = (release_train or consts.Stable_Release_Train).lower()
if effective_train == consts.Stable_Release_Train:
cutoff = consts.Min_Agent_Version_For_Secret_Injection
elif effective_train == consts.Preview_Release_Train:
cutoff = consts.Min_Agent_Version_For_Secret_Injection_Preview
else:
# If not dev, or stable/preview train use the legacy flow for safety
return False

if not agent_version:
# Cannot determine version on a gated train. Be safe and use the legacy
# flow so that older agents aren't broken.
return False
try:
return version.parse(agent_version) >= version.parse(cutoff)
except Exception: # pylint: disable=broad-except
# If we can't parse the version, fall back to the legacy flow.
return False


def ensure_arc_namespace_with_helm_metadata() -> None:
    """
    Make sure the Arc agents namespace exists and carries Helm ownership
    metadata.

    The namespace named by ``consts.Arc_Namespace`` is created when it is
    missing, or patched when it already exists, so that it has the
    ``app.kubernetes.io/managed-by`` label and the ``meta.helm.sh/release-*``
    annotations. With that metadata in place the subsequent ``helm install``
    can adopt the namespace instead of failing with "exists and cannot be
    imported into the current release".

    Labels and annotations already present on an existing namespace are kept;
    only the Helm ownership keys are added or overwritten.

    Kubernetes API failures are passed to ``kubernetes_exception_handler``;
    nothing further is attempted after a failure has been reported.
    """
    core_api = kube_client.CoreV1Api()
    ownership_labels = {"app.kubernetes.io/managed-by": "Helm"}
    ownership_annotations = {
        "meta.helm.sh/release-name": consts.Helm_Release_Name,
        "meta.helm.sh/release-namespace": consts.Release_Install_Namespace,
    }

    try:
        current_ns = core_api.read_namespace(consts.Arc_Namespace)
    except ApiException as read_err:
        if read_err.status == 404:
            # The namespace is absent: create it with the ownership metadata
            # already attached.
            new_namespace = kube_client.V1Namespace(
                metadata=kube_client.V1ObjectMeta(
                    name=consts.Arc_Namespace,
                    labels=ownership_labels,
                    annotations=ownership_annotations,
                )
            )
            try:
                core_api.create_namespace(new_namespace)
            except ApiException as create_err:
                kubernetes_exception_handler(
                    create_err,
                    consts.Inject_PrivateKey_Secret_Fault_Type,
                    error_message=f"Unable to create namespace '{consts.Arc_Namespace}'",
                    summary=f"Unable to create namespace '{consts.Arc_Namespace}'",
                )
        else:
            # Any other read failure is reported as-is.
            kubernetes_exception_handler(
                read_err,
                consts.Get_Kubernetes_Namespace_Fault_Type,
                error_message=f"Unable to fetch namespace '{consts.Arc_Namespace}'",
                summary=f"Unable to fetch namespace '{consts.Arc_Namespace}'",
            )
        return

    # The namespace already exists: layer the ownership keys on top of
    # whatever metadata it currently has.
    current_meta = current_ns.metadata or kube_client.V1ObjectMeta()
    merged_labels = {**(current_meta.labels or {}), **ownership_labels}
    merged_annotations = {
        **(current_meta.annotations or {}),
        **ownership_annotations,
    }
    try:
        core_api.patch_namespace(
            consts.Arc_Namespace,
            {"metadata": {"labels": merged_labels, "annotations": merged_annotations}},
        )
    except ApiException as patch_err:
        patch_failure = (
            f"Unable to patch namespace '{consts.Arc_Namespace}' with Helm "
            "ownership metadata"
        )
        kubernetes_exception_handler(
            patch_err,
            consts.Inject_PrivateKey_Secret_Fault_Type,
            error_message=patch_failure,
            summary=patch_failure,
        )


def inject_onboarding_private_key_secret(private_key_pem: str) -> None:
    """
    Store the onboarding private key in the cluster as a Kubernetes Secret.

    Creating the secret directly through the Kubernetes API lets the agents
    consume the key without it ever being exposed through helm values. Both
    the namespace and the secret are labeled/annotated for Helm adoption so
    the chart can manage them on subsequent upgrades.

    This MUST run before ``helm install``: the Cluster Identity Operator
    depends on this secret to fetch identity certificates from HIS, so a
    cluster without it would be stuck in a disconnected state.

    If the secret already exists (HTTP 409 on create) it is replaced, so the
    cluster always holds a private key matching the public key in ARM.
    Kubernetes API failures are passed to ``kubernetes_exception_handler``.

    :param private_key_pem: PEM-encoded onboarding private key to store.
    """
    print(
        f"Step: {get_utctimestring()}: Pre-creating onboarding private key "
        f"secret '{consts.Onboarding_PrivateKey_Secret_Name}' in namespace "
        f"'{consts.Arc_Namespace}'."
    )
    ensure_arc_namespace_with_helm_metadata()

    core_api = kube_client.CoreV1Api()
    secret_metadata = kube_client.V1ObjectMeta(
        name=consts.Onboarding_PrivateKey_Secret_Name,
        namespace=consts.Arc_Namespace,
        labels={"app.kubernetes.io/managed-by": "Helm"},
        annotations={
            "meta.helm.sh/release-name": consts.Helm_Release_Name,
            "meta.helm.sh/release-namespace": consts.Release_Install_Namespace,
        },
    )
    secret = kube_client.V1Secret(
        metadata=secret_metadata,
        type="Opaque",
        string_data={consts.Onboarding_PrivateKey_Secret_Data_Key: private_key_pem},
    )

    try:
        core_api.create_namespaced_secret(consts.Arc_Namespace, secret)
    except ApiException as create_err:
        if create_err.status == 409:
            # The secret is already present: overwrite it with the new key.
            try:
                core_api.replace_namespaced_secret(
                    consts.Onboarding_PrivateKey_Secret_Name,
                    consts.Arc_Namespace,
                    secret,
                )
            except ApiException as replace_err:
                kubernetes_exception_handler(
                    replace_err,
                    consts.Inject_PrivateKey_Secret_Fault_Type,
                    error_message=(
                        "Unable to update existing onboarding private key secret "
                        f"'{consts.Onboarding_PrivateKey_Secret_Name}' in namespace "
                        f"'{consts.Arc_Namespace}'"
                    ),
                    summary="Unable to update onboarding private key secret",
                )
        else:
            kubernetes_exception_handler(
                create_err,
                consts.Inject_PrivateKey_Secret_Fault_Type,
                error_message=(
                    "Unable to create onboarding private key secret "
                    f"'{consts.Onboarding_PrivateKey_Secret_Name}' in namespace "
                    f"'{consts.Arc_Namespace}'"
                ),
                summary="Unable to create onboarding private key secret",
            )


# DO NOT use this method for re-put scenarios. This method involves new NS creation for helm release. For re-put scenarios, brownfield scenario needs to be handled where helm release still stays in default NS
def helm_install_release(
resource_manager: str,
Expand All @@ -1288,6 +1478,7 @@ def helm_install_release(
registry_path: str,
aad_identity_principal_id: str | None,
onboarding_timeout: str = consts.DEFAULT_MAX_ONBOARDING_TIMEOUT_HELMVALUE_SECONDS,
inject_private_key_via_helm: bool = True,
) -> None:
cmd_helm_install = [
helm_client_location,
Expand All @@ -1299,20 +1490,29 @@ def helm_install_release(
f"global.kubernetesDistro={kubernetes_distro}",
"--set",
f"global.kubernetesInfra={kubernetes_infra}",
"--set",
f"global.onboardingPrivateKey={private_key_pem}",
"--set",
"systemDefaultValues.spnOnboarding=false",
"--set",
f"global.azureEnvironment={cloud_name}",
"--set",
"systemDefaultValues.clusterconnect-agent.enabled=true",
"--namespace",
f"{consts.Release_Install_Namespace}",
"--create-namespace",
"--output",
"json",
]
# Pass the onboarding private key through helm values only when the caller
# selected the legacy flow (agents older than the secret-injection cutoff).
# Otherwise the key has already been delivered via a pre-created Kubernetes
# Secret, so it never appears in helm values.
if inject_private_key_via_helm:
cmd_helm_install.extend(
["--set", f"global.onboardingPrivateKey={private_key_pem}"]
)
cmd_helm_install.extend(
[
"--set",
"systemDefaultValues.spnOnboarding=false",
"--set",
f"global.azureEnvironment={cloud_name}",
"--set",
"systemDefaultValues.clusterconnect-agent.enabled=true",
"--namespace",
f"{consts.Release_Install_Namespace}",
"--create-namespace",
"--output",
"json",
]
)

# Special configurations from 2022-09-01 ARM metadata.
# "dataplaneEndpoints" does not appear in arm_metadata for public and AGC
Expand Down
39 changes: 39 additions & 0 deletions src/connectedk8s/azext_connectedk8s/custom.py
Original file line number Diff line number Diff line change
Expand Up @@ -1018,6 +1018,44 @@ def create_connectedk8s(
print(
f"Step: {utils.get_utctimestring()}: Starting to install Azure arc agents on the Kubernetes cluster."
)

# Decide which onboarding flow to use. Agents older than the cutoff (1.35.0
# on the stable train, 1.35.0-preview on the preview train) still need the
# legacy flow (private key in helm values), because their helm chart always
# renders the privatekey secret from helm values and would zero it out on
# install otherwise. Agents at or above the cutoff, and any "-dev" build, get
# the secure flow: we pre-create the namespace + secret directly via the
# Kubernetes API so the private key never appears in helm values. Unrecognized
# release trains and unknown versions fall back to the legacy flow.
use_secret_injection_flow = utils.should_use_secret_injection_flow(
release_train, azure_arc_agent_version
)
telemetry.add_extension_event(
"connectedk8s",
{
"Context.Default.AzureCLI.OnboardingFlow": (
"secret-injection"
if use_secret_injection_flow
else "helm-values-legacy"
)
},
)

if use_secret_injection_flow:
# Inject the private key BEFORE running helm so that the cluster always
# has the onboarding secret available - even if the subsequent helm
# install/CLI is interrupted - preventing a stuck-disconnected state.
try:
utils.inject_onboarding_private_key_secret(private_key_pem)
except Exception as e:
telemetry.set_exception(
exception=e,
fault_type=consts.Inject_PrivateKey_Secret_Fault_Type,
summary="Failed to pre-create onboarding private key secret",
)
raise CLIInternalError(
"Failed to pre-create onboarding private key secret on the "
f"Kubernetes cluster: {e}"
)

# Install azure-arc agents
utils.helm_install_release(
cmd.cli_ctx.cloud.endpoints.resource_manager,
Expand All @@ -1040,6 +1078,7 @@ def create_connectedk8s(
registry_path,
aad_identity_principal_id,
onboarding_timeout,
inject_private_key_via_helm=not use_secret_injection_flow,
)

# Long Running Operation for Agent State
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
redact_sensitive_fields_from_string,
remove_rsa_private_key,
scrub_proxy_url,
should_use_secret_injection_flow,
)


Expand Down Expand Up @@ -99,5 +100,43 @@ def test_get_mcr_path():
assert get_mcr_path(input_active_directory) == expected_output


@pytest.mark.parametrize(
    "release_train,agent_version,expected",
    [
        # Stable train below the 1.35.0 cutoff: legacy flow (helm value
        # injection), otherwise the chart would zero out the secret.
        ("stable", "1.34.9", False),
        ("stable", "1.20.0", False),
        ("STABLE", "1.14.0", False),
        # Stable train at or above the cutoff: secure flow.
        ("stable", "1.35.0", True),
        ("stable", "1.36.2", True),
        ("stable", "2.0.0", True),
        # Preview train: same scheme with 1.35.0-preview as the cutoff.
        ("preview", "1.34.0", False),
        ("preview", "1.35.0-preview", True),
        ("preview", "1.36.0-preview", True),
        ("PREVIEW", "1.20.0", False),
        # Dev-suffixed agent versions: secure flow regardless of the reported
        # release train (suffix match is case-insensitive).
        ("preview", "0.2.5738-dev", True),
        ("stable", "0.2.6689-dev", True),
        ("STABLE", "1.34.0-DEV", True),
        (None, "0.2.5738-dev", True),
        # No version on a gated train: safe default (legacy flow).
        ("stable", None, False),
        ("preview", "", False),
        # No release train: treated as "stable".
        (None, "1.34.0", False),
        (None, "1.35.0", True),
        # Unparseable version on a gated train: safe default (legacy flow).
        ("stable", "not-a-version", False),
    ],
)
def test_should_use_secret_injection_flow(release_train, agent_version, expected):
    """Check the flow selected for each release train / agent version pair."""
    selected = should_use_secret_injection_flow(release_train, agent_version)
    assert selected is expected


if __name__ == "__main__":
pytest.main()
Loading