diff --git a/src/connectedk8s/HISTORY.rst b/src/connectedk8s/HISTORY.rst index b80684c1b19..0b83feb5efa 100644 --- a/src/connectedk8s/HISTORY.rst +++ b/src/connectedk8s/HISTORY.rst @@ -2,6 +2,10 @@ Release History =============== +1.10.13 ++++++ +* Added telemetry for pre-onboarding diagnostic results, including diagnostics execution failures. + 1.10.12 +++++ * Removed deprecated '--app-id' and '--app-secret' RBAC parameters from the extension. diff --git a/src/connectedk8s/azext_connectedk8s/_constants.py b/src/connectedk8s/azext_connectedk8s/_constants.py index 65b3f985009..a3baa91d462 100644 --- a/src/connectedk8s/azext_connectedk8s/_constants.py +++ b/src/connectedk8s/azext_connectedk8s/_constants.py @@ -124,6 +124,11 @@ PublicKey_Export_Fault_Type = "publickey-export-error" PrivateKey_Export_Fault_Type = "privatekey-export-error" Install_HelmRelease_Fault_Type = "helm-release-install-error" +Install_Prediagnostics_Fault_Type = "prediagnostics-failure" +Install_Prediagnostics_Job_Execution_Error_Fault_Type = ( + "prediagnostics-job-execution-error" +) +Post_Diagnostic_Precheck_Fault_Type = "post-diagnostic-precheck-failure" Delete_HelmRelease_Fault_Type = "helm-release-delete-error" Check_PodStatus_Fault_Type = "check-pod-status-error" Kubernetes_Connectivity_FaultType = "kubernetes-cluster-connection-error" @@ -418,7 +423,7 @@ # Connect Precheck Diagnoser constants Cluster_Diagnostic_Checks_Job_Registry_Path = ( - "azurearck8s/helmchart/stable/clusterdiagnosticchecks:1.31.2" + "azurearck8s/helmchart/stable/clusterdiagnosticchecks:1.33.0" ) Cluster_Diagnostic_Checks_Helm_Install_Failed_Fault_Type = ( "Error while installing cluster diagnostic checks helm release" @@ -475,6 +480,10 @@ "Outbound network connectivity check failed for Cluster Connect" ) DNS_Check_Result_String = "DNS Result:" +Entra_Connectivity_Check_Result_String = ( + "Entra Authentication Endpoint Connectivity Check Result" +) +CRD_Ownership_Check_Failed_String = "Check Failed: CRD" 
AZ_CLI_ADAL_TO_MSAL_MIGRATE_VERSION = "2.30.0" CLIENT_PROXY_VERSION = "1.3.033581" CLIENT_PROXY_FOLDER = ".clientproxy" diff --git a/src/connectedk8s/azext_connectedk8s/_precheckutils.py b/src/connectedk8s/azext_connectedk8s/_precheckutils.py index 231f2b4c659..76b5b91b74b 100644 --- a/src/connectedk8s/azext_connectedk8s/_precheckutils.py +++ b/src/connectedk8s/azext_connectedk8s/_precheckutils.py @@ -4,6 +4,7 @@ # -------------------------------------------------------------------------------------------- from __future__ import annotations +import json import os import shutil from subprocess import PIPE, Popen @@ -28,6 +29,112 @@ # pylint: disable diagnoser_output: list[str] = [] +prediagnostic_job_execution_status = "NotStarted" +prediagnostic_dns_check = "Starting" +prediagnostic_outbound_check = "Starting" +prediagnostic_entra_check = "Starting" +prediagnostic_crd_check = "Starting" + + +def send_prediagnostic_job_execution_error_telemetry(reason: str = "") -> None: + """Send telemetry when prediagnostic job execution fails.""" + error_detail_msg = {"jobExecutionStatus": prediagnostic_job_execution_status} + if reason: + error_detail_msg["reason"] = reason + error_message = azext_utils.process_helm_error_detail(json.dumps(error_detail_msg)) + + prediagnostic_error_detail = { + "Context.Default.AzureCLI.onboardingErrorType": consts.Install_Prediagnostics_Job_Execution_Error_Fault_Type, + "Context.Default.AzureCLI.onboardingErrorMessage": error_message, + } + telemetry.add_extension_event("connectedk8s", prediagnostic_error_detail) + + +def send_prediagnostic_check_failure_telemetry( + dns_check: str, outbound_connectivity_check: str +) -> None: + """Send telemetry when prediagnostic checks fail (job completed but checks did not pass).""" + # Extract error messages from diagnoser_output + dns_error = "" + outbound_error = "" + entra_error = "" + crd_error = "" + for msg in diagnoser_output: + msg_lower = msg.lower() + # Capture first line of each error message to 
keep telemetry concise + if dns_check == "Failed" and "dns" in msg_lower and "error" in msg_lower: + dns_error = msg.strip().splitlines()[0] + if ( + outbound_connectivity_check == "Failed" + and "outbound" in msg_lower + and "error" in msg_lower + ): + outbound_error = msg.strip().splitlines()[0] + if ( + prediagnostic_entra_check == "Failed" + and "entra" in msg_lower + and "error" in msg_lower + ): + entra_error = msg.strip().splitlines()[0] + if ( + prediagnostic_crd_check == "Failed" + and "crd" in msg_lower + and "error" in msg_lower + ): + crd_error = msg.strip().splitlines()[0] + + check_results = { + "dnsCheck": dns_check, + "outboundConnectivityCheck": outbound_connectivity_check, + "entraCheck": prediagnostic_entra_check, + "crdCheck": prediagnostic_crd_check, + } + + # Only add error details if checks actually failed + if dns_error: + check_results["dnsError"] = dns_error + if outbound_error: + check_results["outboundError"] = outbound_error + if entra_error: + check_results["entraError"] = entra_error + if crd_error: + check_results["crdError"] = crd_error + + error_message = azext_utils.process_helm_error_detail(json.dumps(check_results)) + + prediagnostic_error_detail = { + "Context.Default.AzureCLI.onboardingErrorType": consts.Install_Prediagnostics_Fault_Type, + "Context.Default.AzureCLI.onboardingErrorMessage": error_message, + } + + logger.debug( + f"[Telemetry] onboardingErrorType={consts.Install_Prediagnostics_Fault_Type} onboardingErrorMessage={error_message}" + ) + telemetry.add_extension_event("connectedk8s", prediagnostic_error_detail) + + +def send_post_diagnostic_precheck_failure_telemetry( + check_name: str, reason: str +) -> None: + """Send telemetry for individual precheck failures that occur after the diagnostic job.""" + error_message = azext_utils.process_helm_error_detail( + json.dumps({"checkName": check_name, "reason": reason}) + ) + error_detail = { + "Context.Default.AzureCLI.onboardingErrorType": 
consts.Post_Diagnostic_Precheck_Fault_Type, + "Context.Default.AzureCLI.onboardingErrorMessage": error_message, + } + logger.debug( + f"[Telemetry] onboardingErrorType={consts.Post_Diagnostic_Precheck_Fault_Type} onboardingErrorMessage={error_message}" + ) + telemetry.add_extension_event("connectedk8s", error_detail) + + +def get_precheck_failure_summary() -> str: + for output in reversed(diagnoser_output): + if output.startswith("Precheck summary:"): + return output + return "" def fetch_diagnostic_checks_results( @@ -47,10 +154,19 @@ def fetch_diagnostic_checks_results( filepath_with_timestamp: str, storage_space_available: bool, ) -> tuple[str, bool]: + global \ + prediagnostic_job_execution_status, \ + prediagnostic_dns_check, \ + prediagnostic_outbound_check, \ + prediagnostic_entra_check, \ + prediagnostic_crd_check try: - # Setting DNS and Outbound Check as working - dns_check = "Starting" - outbound_connectivity_check = "Starting" + diagnoser_output.clear() + prediagnostic_job_execution_status = "NotStarted" + prediagnostic_dns_check = "Starting" + prediagnostic_outbound_check = "Starting" + prediagnostic_entra_check = "Starting" + prediagnostic_crd_check = "Starting" # Executing the cluster_diagnostic_checks job and fetching the logs obtained cluster_diagnostic_checks_container_log = ( executing_cluster_diagnostic_checks_job( @@ -72,16 +188,25 @@ def fetch_diagnostic_checks_results( ) ) # If cluster_diagnostic_checks_container_log is not empty there were errors. Try to read the logs. 
- if ( - cluster_diagnostic_checks_container_log is not None - and cluster_diagnostic_checks_container_log != "" - ): + if cluster_diagnostic_checks_container_log is None: + diagnoser_output.append( + "Precheck summary: " + f"jobExecutionStatus={prediagnostic_job_execution_status}; " + f"dnsCheck={prediagnostic_dns_check}; outboundConnectivityCheck={prediagnostic_outbound_check}; " + f"entraCheck={prediagnostic_entra_check}; crdCheck={prediagnostic_crd_check}" + ) + send_prediagnostic_job_execution_error_telemetry() + return consts.Diagnostic_Check_Incomplete, storage_space_available + + if cluster_diagnostic_checks_container_log != "": cluster_diagnostic_checks_container_log_list = ( cluster_diagnostic_checks_container_log.split("\n") ) cluster_diagnostic_checks_container_log_list.pop(-1) dns_check_log = "" outbound_connectivity_check_log = "" + entra_check_log = "" + crd_check_log = "" counter_container_logs = 1 # For retrieving only cluster_diagnostic_checks logs from the output for outputs in cluster_diagnostic_checks_container_log_list: @@ -91,6 +216,12 @@ def fetch_diagnostic_checks_results( outbound_connectivity_check_log += outputs else: outbound_connectivity_check_log += " " + outputs + elif consts.Entra_Connectivity_Check_Result_String in outputs: + entra_check_log = outputs + counter_container_logs = 1 + elif consts.CRD_Ownership_Check_Failed_String in outputs: + crd_check_log += outputs + "\n" + counter_container_logs = 1 elif consts.DNS_Check_Result_String in outputs: dns_check_log += outputs counter_container_logs = 0 @@ -102,6 +233,7 @@ def fetch_diagnostic_checks_results( storage_space_available, diagnoser_output, ) + prediagnostic_dns_check = dns_check outbound_connectivity_check, storage_space_available = ( azext_utils.check_cluster_outbound_connectivity( outbound_connectivity_check_log, @@ -110,17 +242,82 @@ def fetch_diagnostic_checks_results( diagnoser_output, ) ) + prediagnostic_outbound_check = outbound_connectivity_check + + # Parse Entra 
check result + # If no Entra result line found, the helm chart version may not support it — treat as NotApplicable (skip) + if entra_check_log: + # Format: "Entra Authentication Endpoint Connectivity Check Result : : " + parts = entra_check_log.strip().split(" : ") + if len(parts) >= 3: + entra_response_code = parts[-1].strip() + if entra_response_code in ("200", "404"): + prediagnostic_entra_check = consts.Diagnostic_Check_Passed + else: + prediagnostic_entra_check = consts.Diagnostic_Check_Failed + diagnoser_output.append( + f"Error: Entra authentication endpoint connectivity check failed. " + f"Response code: {entra_response_code}. " + "Please ensure outbound connectivity to the Entra (Azure AD) authentication endpoint.\n" + ) + else: + prediagnostic_entra_check = consts.Diagnostic_Check_Incomplete + else: + # Entra check not present in logs — older helm chart version, not applicable + prediagnostic_entra_check = "NotApplicable" + + # Parse CRD ownership check result + if crd_check_log: + prediagnostic_crd_check = consts.Diagnostic_Check_Failed + diagnoser_output.append( + f"Error: CRD ownership validation failed.\n{crd_check_log.strip()}" + ) + else: + prediagnostic_crd_check = consts.Diagnostic_Check_Passed else: + # Empty log — if job didn't complete (e.g., pod never scheduled), treat as Incomplete not Passed + if prediagnostic_job_execution_status == "NotCompleted": + # Mark all individual checks as NotApplicable since the pod never produced output + prediagnostic_dns_check = "NotApplicable" + prediagnostic_outbound_check = "NotApplicable" + prediagnostic_entra_check = "NotApplicable" + prediagnostic_crd_check = "NotApplicable" + diagnoser_output.append( + "Precheck summary: " + f"jobExecutionStatus={prediagnostic_job_execution_status}; " + f"dnsCheck={prediagnostic_dns_check}; outboundConnectivityCheck={prediagnostic_outbound_check}; " + f"entraCheck={prediagnostic_entra_check}; crdCheck={prediagnostic_crd_check}" + ) + 
send_prediagnostic_job_execution_error_telemetry() + return consts.Diagnostic_Check_Incomplete, storage_space_available return consts.Diagnostic_Check_Passed, storage_space_available - # If any of the check remain Incomplete than we will return Incomplete + diagnoser_output.append( + "Precheck summary: " + f"jobExecutionStatus={prediagnostic_job_execution_status}; " + f"dnsCheck={dns_check}; outboundConnectivityCheck={outbound_connectivity_check}; " + f"entraCheck={prediagnostic_entra_check}; crdCheck={prediagnostic_crd_check}" + ) + + # Return Incomplete if any mandatory check couldn't be determined if ( dns_check == consts.Diagnostic_Check_Incomplete or outbound_connectivity_check == consts.Diagnostic_Check_Incomplete + or prediagnostic_entra_check == consts.Diagnostic_Check_Incomplete ): return consts.Diagnostic_Check_Incomplete, storage_space_available - return consts.Diagnostic_Check_Failed, storage_space_available + # Return Failed only if at least one check actually failed + if ( + dns_check == consts.Diagnostic_Check_Failed + or outbound_connectivity_check == consts.Diagnostic_Check_Failed + or prediagnostic_entra_check == consts.Diagnostic_Check_Failed + or prediagnostic_crd_check == consts.Diagnostic_Check_Failed + ): + return consts.Diagnostic_Check_Failed, storage_space_available + + # All checks passed or not applicable + return consts.Diagnostic_Check_Passed, storage_space_available # To handle any exception that may occur during the execution except Exception as e: @@ -128,6 +325,7 @@ def fetch_diagnostic_checks_results( "An exception has occured while trying to execute cluster diagnostic checks " "container on the cluster." 
) + send_prediagnostic_job_execution_error_telemetry(reason=str(e)) telemetry.set_exception( exception=e, fault_type=consts.Cluster_Diagnostic_Checks_Execution_Failed_Fault_Type, @@ -154,6 +352,7 @@ def executing_cluster_diagnostic_checks_job( filepath_with_timestamp: str, storage_space_available: bool, ) -> str | None: + global prediagnostic_job_execution_status job_name = "cluster-diagnostic-checks-job" # Setting the log output as Empty cluster_diagnostic_checks_container_log = "" @@ -174,6 +373,7 @@ def executing_cluster_diagnostic_checks_job( # To handle the user keyboard Interrupt try: + prediagnostic_job_execution_status = "Running" # Executing the Cluster Diagnostic Checks Job yaml config.load_kube_config(kube_config, kube_context) # checking existence of the release and if present we delete the stale release @@ -200,6 +400,7 @@ def executing_cluster_diagnostic_checks_job( exception_occured_counter = 1 # If any exception occured we will print the exception and return if exception_occured_counter == 1: + prediagnostic_job_execution_status = "CleanupFailed" logger.warning( "Cleanup of previous diagnostic checks helm release failed and hence couldn't " 'install the new helm release. Please cleanup older release using "helm delete ' @@ -307,6 +508,7 @@ def executing_cluster_diagnostic_checks_job( # If job is not scheduled then we will delete the helm release if is_job_scheduled is False: + prediagnostic_job_execution_status = "NotScheduled" telemetry.set_exception( exception="Couldn't schedule Cluster Diagnostic Checks Job in the cluster", fault_type=consts.Cluster_Diagnostic_Checks_Job_Not_Scheduled, @@ -324,6 +526,7 @@ def executing_cluster_diagnostic_checks_job( return None if is_job_complete is False: + prediagnostic_job_execution_status = "NotCompleted" # Job was scheduled successfully, but didn't complete. We will fetch the logs and delete helm release. logger.debug( "Cluster Diagnostic Checks Job Failed. 
Fetch results and delete Helm release in the cluster" @@ -398,14 +601,60 @@ def executing_cluster_diagnostic_checks_job( "possible reasons can be resource constraints on the cluster.\n" ) + # Fetch and save container logs when job completed successfully (always save for diagnostics) + if is_job_complete: + all_pods = corev1_api_instance.list_namespaced_pod("azure-arc-release") + for each_pod in all_pods.items: + pod_name = each_pod.metadata.name + if not pod_name.startswith(job_name): + continue + try: + cluster_diagnostic_checks_container_log = ( + corev1_api_instance.read_namespaced_pod_log( + name=pod_name, + container="cluster-diagnostic-checks-container", + namespace="azure-arc-release", + ) + ) + if storage_space_available: + log_path = os.path.join( + filepath_with_timestamp, + "cluster_diagnostic_checks_job_log.txt", + ) + with open(log_path, "w+") as f: + f.write(cluster_diagnostic_checks_container_log) + except OSError as e: + if "[Errno 28]" in str(e): + storage_space_available = False + telemetry.set_exception( + exception=e, + fault_type=consts.No_Storage_Space_Available_Fault_Type, + summary="No space left on device", + ) + shutil.rmtree(filepath_with_timestamp, ignore_errors=False) + else: + logger.exception( + "An exception has occured while saving the Cluster " + "Diagnostic Checks Job logs in the local machine." + ) + except Exception: + logger.exception( + "An exception has occured while saving the Cluster " + "Diagnostic Checks Job logs in the local machine." 
+ ) + break + # Clearing all the resources after fetching the cluster diagnostic checks container logs Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) # To handle any exception that may occur during the execution except Exception as e: + prediagnostic_job_execution_status = "ExecutionFailed" Popen(cmd_helm_delete, stdout=PIPE, stderr=PIPE) raise CLIInternalError(f"Failed to execute Cluster Diagnostic Checks Job: {e}") - + if is_job_complete: + prediagnostic_job_execution_status = "Completed" + logger.debug(cluster_diagnostic_checks_container_log) return cluster_diagnostic_checks_container_log @@ -459,6 +708,7 @@ def helm_install_release_cluster_diagnostic_checks( _, error_helm_install = response_helm_install.communicate() if response_helm_install.returncode != 0: error = error_helm_install.decode("ascii") + error = azext_utils.process_helm_error_detail(error) if "forbidden" in error or "timed out waiting for the condition" in error: telemetry.set_user_fault() diff --git a/src/connectedk8s/azext_connectedk8s/_utils.py b/src/connectedk8s/azext_connectedk8s/_utils.py index 420fa60e5ee..c3785e2fcd2 100644 --- a/src/connectedk8s/azext_connectedk8s/_utils.py +++ b/src/connectedk8s/azext_connectedk8s/_utils.py @@ -383,6 +383,9 @@ def check_cluster_DNS( if ( "NXDOMAIN" in formatted_dns_log or "connection timed out" in formatted_dns_log + or "no servers could be reached" in formatted_dns_log + or "communications error" in formatted_dns_log + or "timed out" in formatted_dns_log ): logger.warning( "Error: We found an issue with the DNS resolution on your cluster. 
For details about debugging DNS " diff --git a/src/connectedk8s/azext_connectedk8s/custom.py b/src/connectedk8s/azext_connectedk8s/custom.py index d1dc4b2ac87..19c5797329e 100644 --- a/src/connectedk8s/azext_connectedk8s/custom.py +++ b/src/connectedk8s/azext_connectedk8s/custom.py @@ -367,6 +367,12 @@ def create_connectedk8s( filepath_with_timestamp, storage_space_available, 1 ) + if precheckutils.diagnoser_output: + print("\n--- Pre-onboarding Diagnostic Check Results ---") + for line in precheckutils.diagnoser_output: + print(line.rstrip()) + print("--- End of Diagnostic Check Results ---\n") + if storage_space_available is False: logger.warning( "There is no storage space available on your device and hence not saving cluster " @@ -374,6 +380,7 @@ def create_connectedk8s( ) except Exception as e: + precheckutils.send_prediagnostic_job_execution_error_telemetry(reason=str(e)) ex_msg = f"An exception occured while trying to execute pre-onboarding diagnostic checks : {e}" summ_msg = f"An exception occured while trying to execute pre-onboarding diagnostic checks : {e}" telemetry.set_exception( @@ -401,6 +408,23 @@ def create_connectedk8s( and not azure_local_disconnected and not lowbandwidth ): + precheck_failure_summary = precheckutils.get_precheck_failure_summary() + precheck_failure_summary_msg = ( + f" Details: {precheck_failure_summary}" if precheck_failure_summary else "" + ) + if precheckutils.prediagnostic_job_execution_status == "Completed" or ( + precheckutils.prediagnostic_job_execution_status == "NotCompleted" + and precheckutils.prediagnostic_dns_check != "NotApplicable" + ): + precheckutils.send_prediagnostic_check_failure_telemetry( + precheckutils.prediagnostic_dns_check, + precheckutils.prediagnostic_outbound_check, + ) + elif precheckutils.prediagnostic_job_execution_status not in ( + "Completed", + "NotCompleted", + ): + precheckutils.send_prediagnostic_job_execution_error_telemetry() if storage_space_available: logger.warning( "The pre-check 
result logs logs have been saved at this path: " @@ -418,6 +442,7 @@ def create_connectedk8s( "meet the prerequisites - " + consts.Doc_Onboarding_PreRequisites_Url + " and try onboarding again." + + precheck_failure_summary_msg ) raise ValidationError(err_msg) @@ -430,6 +455,7 @@ def create_connectedk8s( err_msg = ( "One or more pre-onboarding diagnostic checks failed and hence not proceeding with " "cluster onboarding. Please resolve them and try onboarding again." + + precheck_failure_summary_msg ) raise ValidationError(err_msg) @@ -450,6 +476,10 @@ def create_connectedk8s( fault_type=consts.Linux_Node_Not_Exists, summary="Couldn't find any node on the kubernetes cluster with the OS 'linux'", ) + precheckutils.send_post_diagnostic_precheck_failure_telemetry( + check_name="LinuxNodeExists", + reason="Couldn't find any node on the kubernetes cluster with the OS 'linux'", + ) logger.warning( "Please ensure that this Kubernetes cluster has any nodes with OS 'linux', for scheduling the " "Arc-Agents onto and connecting to Azure. Learn more at %s", @@ -468,6 +498,10 @@ def create_connectedk8s( fault_type=consts.Cannot_Create_ClusterRoleBindings_Fault_Type, summary=summ_msg, ) + precheckutils.send_post_diagnostic_precheck_failure_telemetry( + check_name="ClusterRoleBindings", + reason=ex_msg, + ) err_msg = ( "Your credentials doesn't have permission to create clusterrolebindings on this " "kubernetes cluster. Please check your permissions." diff --git a/src/connectedk8s/azext_connectedk8s/tests/unittests/test_precheckutils.py b/src/connectedk8s/azext_connectedk8s/tests/unittests/test_precheckutils.py new file mode 100644 index 00000000000..194559196f4 --- /dev/null +++ b/src/connectedk8s/azext_connectedk8s/tests/unittests/test_precheckutils.py @@ -0,0 +1,332 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
See License.txt in the project root for license information. +# -------------------------------------------------------------------------------------------- +"""Unit tests for prediagnostic telemetry functions in _precheckutils.py.""" + +from __future__ import annotations + +import json +import os +import sys +from unittest.mock import MagicMock, patch + +# Stub out heavy dependencies before importing the module under test. +# Use setdefault so real modules are preferred when available (e.g. in azdev CI), +# but stubs are used in lightweight environments without full CLI installed. +_STUBS = { + "kubernetes": MagicMock(), + "kubernetes.config": MagicMock(), + "kubernetes.watch": MagicMock(), + "kubernetes.client": MagicMock(), + "kubernetes.client.models": MagicMock(), + "azure": MagicMock(), + "azure.cli": MagicMock(), + "azure.cli.core": MagicMock(), + "azure.cli.core.telemetry": MagicMock(), + "azure.cli.core.azclierror": MagicMock(), + "azure.cli.core.commands": MagicMock(), + "azure.cli.core.commands.client_factory": MagicMock(), + "azure.cli.core.util": MagicMock(), + "azure.cli.core._config": MagicMock(), + "azure.core": MagicMock(), + "azure.core.exceptions": MagicMock(), + "azure.mgmt": MagicMock(), + "azure.mgmt.core": MagicMock(), + "azure.mgmt.core.tools": MagicMock(), + "msrest": MagicMock(), + "msrestazure": MagicMock(), + "knack": MagicMock(), + "knack.log": MagicMock(), + "knack.help_files": MagicMock(), + "knack.util": MagicMock(), + "knack.cli": MagicMock(), + "knack.config": MagicMock(), + "knack.prompting": MagicMock(), + "knack.commands": MagicMock(), + "knack.arguments": MagicMock(), + "knack.events": MagicMock(), + # Stub the sibling module to avoid its transitive imports + "azext_connectedk8s._utils": MagicMock(), +} +_ORIGINAL_MODULES = {mod: sys.modules.get(mod) for mod in _STUBS} +for mod, stub in _STUBS.items(): + sys.modules.setdefault(mod, stub) + +# Make process_helm_error_detail a transparent passthrough so telemetry message 
assertions work. +# Only patch if this is our MagicMock stub — if the real module is already loaded (e.g. in full +# azdev CI), patching it here would permanently mutate its attribute on the shared module object. +_utils_stub = sys.modules.get("azext_connectedk8s._utils") +if isinstance(_utils_stub, MagicMock): + _utils_stub.process_helm_error_detail = lambda x: x + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../../.."))) + +import azext_connectedk8s._constants as consts # noqa: E402 +import azext_connectedk8s._precheckutils as precheckutils # noqa: E402 + +for mod, original_module in _ORIGINAL_MODULES.items(): + if original_module is None: + sys.modules.pop(mod, None) + else: + sys.modules[mod] = original_module + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _reset_globals(): + """Reset module-level globals to a clean state before each test.""" + precheckutils.diagnoser_output = [] + precheckutils.prediagnostic_job_execution_status = "NotStarted" + precheckutils.prediagnostic_entra_check = "Starting" + precheckutils.prediagnostic_crd_check = "Starting" + + +# --------------------------------------------------------------------------- +# send_prediagnostic_job_execution_error_telemetry +# --------------------------------------------------------------------------- + + +class TestSendJobExecutionErrorTelemetry: + def setup_method(self): + _reset_globals() + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_sends_event_with_correct_error_type(self, mock_telemetry): + precheckutils.prediagnostic_job_execution_status = "ExecutionFailed" + precheckutils.send_prediagnostic_job_execution_error_telemetry() + + mock_telemetry.add_extension_event.assert_called_once() + args = mock_telemetry.add_extension_event.call_args + assert args[0][0] == "connectedk8s" + props = args[0][1] + assert ( + 
props["Context.Default.AzureCLI.onboardingErrorType"] + == consts.Install_Prediagnostics_Job_Execution_Error_Fault_Type + ) + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_message_includes_job_execution_status(self, mock_telemetry): + precheckutils.prediagnostic_job_execution_status = "ExecutionFailed" + precheckutils.send_prediagnostic_job_execution_error_telemetry() + + props = mock_telemetry.add_extension_event.call_args[0][1] + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert msg["jobExecutionStatus"] == "ExecutionFailed" + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_message_includes_reason_when_provided(self, mock_telemetry): + precheckutils.prediagnostic_job_execution_status = "NotCompleted" + precheckutils.send_prediagnostic_job_execution_error_telemetry( + reason="ImagePullBackOff" + ) + + props = mock_telemetry.add_extension_event.call_args[0][1] + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert msg["reason"] == "ImagePullBackOff" + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_message_omits_reason_when_empty(self, mock_telemetry): + precheckutils.send_prediagnostic_job_execution_error_telemetry() + + props = mock_telemetry.add_extension_event.call_args[0][1] + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert "reason" not in msg + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_message_is_valid_json(self, mock_telemetry): + precheckutils.send_prediagnostic_job_execution_error_telemetry( + reason="ContainerCreating" + ) + + props = mock_telemetry.add_extension_event.call_args[0][1] + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert isinstance(msg, dict) + + +# --------------------------------------------------------------------------- +# send_prediagnostic_check_failure_telemetry +# 
--------------------------------------------------------------------------- + + +class TestSendCheckFailureTelemetry: + def setup_method(self): + _reset_globals() + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_sends_event_with_correct_error_type(self, mock_telemetry): + precheckutils.send_prediagnostic_check_failure_telemetry("Passed", "Passed") + + mock_telemetry.add_extension_event.assert_called_once() + props = mock_telemetry.add_extension_event.call_args[0][1] + assert ( + props["Context.Default.AzureCLI.onboardingErrorType"] + == consts.Install_Prediagnostics_Fault_Type + ) + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_check_results_in_message(self, mock_telemetry): + precheckutils.prediagnostic_entra_check = "Failed" + precheckutils.prediagnostic_crd_check = "Passed" + precheckutils.send_prediagnostic_check_failure_telemetry("Passed", "Failed") + + props = mock_telemetry.add_extension_event.call_args[0][1] + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert msg["dnsCheck"] == "Passed" + assert msg["outboundConnectivityCheck"] == "Failed" + assert msg["entraCheck"] == "Failed" + assert msg["crdCheck"] == "Passed" + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_entra_error_extracted_from_diagnoser_output(self, mock_telemetry): + precheckutils.prediagnostic_entra_check = "Failed" + precheckutils.diagnoser_output = [ + "Some log line", + "Error: Entra endpoint not reachable. 
Response code: 000", + ] + precheckutils.send_prediagnostic_check_failure_telemetry("Passed", "Passed") + + props = mock_telemetry.add_extension_event.call_args[0][1] + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert "entraError" in msg + assert "000" in msg["entraError"] + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_dns_error_extracted_from_diagnoser_output(self, mock_telemetry): + precheckutils.diagnoser_output = [ + "DNS error: resolution failed for test.example.com", + ] + precheckutils.send_prediagnostic_check_failure_telemetry("Failed", "Passed") + + props = mock_telemetry.add_extension_event.call_args[0][1] + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert "dnsError" in msg + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_outbound_error_extracted_from_diagnoser_output(self, mock_telemetry): + precheckutils.diagnoser_output = [ + "Outbound connectivity error: MCR not reachable", + ] + precheckutils.send_prediagnostic_check_failure_telemetry("Passed", "Failed") + + props = mock_telemetry.add_extension_event.call_args[0][1] + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert "outboundError" in msg + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_multiline_error_trimmed_to_first_line(self, mock_telemetry): + precheckutils.prediagnostic_entra_check = "Failed" + precheckutils.diagnoser_output = [ + "Error: Entra endpoint error line1\nline2\nline3", + ] + precheckutils.send_prediagnostic_check_failure_telemetry("Passed", "Passed") + + props = mock_telemetry.add_extension_event.call_args[0][1] + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert "\n" not in msg.get("entraError", "") + assert "line1" in msg.get("entraError", "") + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_no_error_detail_when_checks_pass(self, mock_telemetry): + 
precheckutils.prediagnostic_entra_check = "Passed" + precheckutils.prediagnostic_crd_check = "Passed" + precheckutils.send_prediagnostic_check_failure_telemetry("Passed", "Passed") + + props = mock_telemetry.add_extension_event.call_args[0][1] + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert "dnsError" not in msg + assert "entraError" not in msg + assert "outboundError" not in msg + assert "crdError" not in msg + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_non_error_lines_not_captured(self, mock_telemetry): + """Lines mentioning entra but not 'error' should not be captured.""" + precheckutils.prediagnostic_entra_check = "Failed" + precheckutils.diagnoser_output = [ + "Entra check: starting", + "Entra Authentication Endpoint Connectivity Check Result : https://login.microsoftonline.com : 000", + ] + precheckutils.send_prediagnostic_check_failure_telemetry("Passed", "Passed") + + props = mock_telemetry.add_extension_event.call_args[0][1] + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert "entraError" not in msg + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_crd_error_extracted_from_diagnoser_output(self, mock_telemetry): + precheckutils.prediagnostic_crd_check = "Failed" + precheckutils.diagnoser_output = [ + "CRD ownership error: extensionconfigs.clusterconfig.azure.com owned by another release", + ] + precheckutils.send_prediagnostic_check_failure_telemetry("Passed", "Passed") + + props = mock_telemetry.add_extension_event.call_args[0][1] + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert "crdError" in msg + + +# --------------------------------------------------------------------------- +# send_post_diagnostic_precheck_failure_telemetry +# --------------------------------------------------------------------------- + + +class TestSendPostDiagnosticPrecheckFailureTelemetry: + def setup_method(self): + 
_reset_globals() + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_sends_event_with_correct_error_type(self, mock_telemetry): + precheckutils.send_post_diagnostic_precheck_failure_telemetry( + "LinuxNodeExists", "No Linux nodes found" + ) + + mock_telemetry.add_extension_event.assert_called_once() + props = mock_telemetry.add_extension_event.call_args[0][1] + assert ( + props["Context.Default.AzureCLI.onboardingErrorType"] + == consts.Post_Diagnostic_Precheck_Fault_Type + ) + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_message_includes_check_name_and_reason(self, mock_telemetry): + precheckutils.send_post_diagnostic_precheck_failure_telemetry( + "ClusterRoleBindings", "Insufficient permissions" + ) + + props = mock_telemetry.add_extension_event.call_args[0][1] + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert msg["checkName"] == "ClusterRoleBindings" + assert msg["reason"] == "Insufficient permissions" + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_message_is_valid_json(self, mock_telemetry): + precheckutils.send_post_diagnostic_precheck_failure_telemetry( + "SomeCheck", "Some reason" + ) + + props = mock_telemetry.add_extension_event.call_args[0][1] + msg = json.loads(props["Context.Default.AzureCLI.onboardingErrorMessage"]) + assert isinstance(msg, dict) + + @patch("azext_connectedk8s._precheckutils.telemetry") + def test_different_check_names_produce_separate_events(self, mock_telemetry): + precheckutils.send_post_diagnostic_precheck_failure_telemetry( + "LinuxNodeExists", "No nodes" + ) + precheckutils.send_post_diagnostic_precheck_failure_telemetry( + "ClusterRoleBindings", "No perms" + ) + + assert mock_telemetry.add_extension_event.call_count == 2 + calls = mock_telemetry.add_extension_event.call_args_list + msg1 = json.loads( + calls[0][0][1]["Context.Default.AzureCLI.onboardingErrorMessage"] + ) + msg2 = json.loads( + 
calls[1][0][1]["Context.Default.AzureCLI.onboardingErrorMessage"] + ) + assert msg1["checkName"] == "LinuxNodeExists" + assert msg2["checkName"] == "ClusterRoleBindings" diff --git a/src/connectedk8s/azext_connectedk8s/tests/unittests/test_utils_.py b/src/connectedk8s/azext_connectedk8s/tests/unittests/test_utils_.py index 32d1da1e3b4..7f3eed89fde 100644 --- a/src/connectedk8s/azext_connectedk8s/tests/unittests/test_utils_.py +++ b/src/connectedk8s/azext_connectedk8s/tests/unittests/test_utils_.py @@ -4,11 +4,39 @@ # -------------------------------------------------------------------------------------------- import os import sys +from unittest.mock import MagicMock import pytest sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../../.."))) -from azext_connectedk8s._utils import ( + +if isinstance(sys.modules.get("azext_connectedk8s._utils"), MagicMock): + sys.modules.pop("azext_connectedk8s._utils", None) + +_STUBS = { + "azure": MagicMock(), + "azure.cli": MagicMock(), + "azure.cli.core": MagicMock(), + "azure.cli.core.azclierror": MagicMock(), + "azure.cli.core.commands": MagicMock(), + "azure.cli.core.commands.client_factory": MagicMock(), + "azure.cli.core.util": MagicMock(), + "azure.core": MagicMock(), + "azure.core.exceptions": MagicMock(), + "knack": MagicMock(), + "knack.log": MagicMock(), + "knack.prompting": MagicMock(), + "kubernetes": MagicMock(), + "kubernetes.client": MagicMock(), + "kubernetes.client.rest": MagicMock(), + "msrest": MagicMock(), + "msrest.exceptions": MagicMock(), + "azext_connectedk8s._client_factory": MagicMock(), +} +for mod, stub in _STUBS.items(): + sys.modules.setdefault(mod, stub) + +from azext_connectedk8s._utils import ( # noqa: E402 get_mcr_path, process_helm_error_detail, redact_sensitive_fields_from_string, diff --git a/testing/test_prediagnostic_telemetry.ps1 b/testing/test_prediagnostic_telemetry.ps1 new file mode 100644 index 00000000000..0a30b11a49b --- /dev/null +++ 
b/testing/test_prediagnostic_telemetry.ps1 @@ -0,0 +1,265 @@ +# test_prediagnostic_telemetry.ps1 +# Exercises all prediagnostic failing scenarios and verifies az connectedk8s connect fails. +# Usage: .\test_prediagnostic_telemetry.ps1 +# Prerequisites: kubectl configured, az cli with connectedk8s extension from source (env1), kubeconfig set. + +param( + [string]$ResourceGroup = "audittest", + [string]$Location = "eastus2euap" +) + +# ── Helpers ────────────────────────────────────────────────────────────────── + +$PASS = "[PASS]" +$FAIL = "[FAIL]" +$INFO = "[INFO]" + +function Log-Info { param($msg) Write-Host "$INFO $msg" -ForegroundColor Cyan } +function Log-Pass { param($msg) Write-Host "$PASS $msg" -ForegroundColor Green } +function Log-Fail { param($msg) Write-Host "$FAIL $msg" -ForegroundColor Red } +function Log-Sep { Write-Host ("`n" + "─" * 70) -ForegroundColor DarkGray } + +$OriginalCorefile = $null + +function Save-CoreDNS { + $script:OriginalCorefile = kubectl get configmap coredns -n kube-system -o jsonpath='{.data.Corefile}' 2>&1 + Log-Info "CoreDNS original config saved." +} + +function Restore-CoreDNS { + if (-not $script:OriginalCorefile) { return } + Log-Info "Restoring CoreDNS..." + $patch = @{data = @{Corefile = $script:OriginalCorefile}} | ConvertTo-Json -Compress -Depth 5 + kubectl patch configmap coredns -n kube-system --type merge -p $patch | Out-Null + kubectl rollout restart deployment/coredns -n kube-system | Out-Null + kubectl rollout status deployment/coredns -n kube-system --timeout=60s | Out-Null + Log-Info "CoreDNS restored." +} + +function Apply-CoreDNS-Block { + param([string[]]$Hosts) + $hostsBlock = ($Hosts | ForEach-Object { " 192.0.2.1 $_" }) -join "`n" + $newCorefile = @" +.:53 { + errors + ready + health { + lameduck 5s + } + hosts { +$hostsBlock + fallthrough + } + kubernetes cluster.local in-addr.arpa ip6.arpa { + pods insecure + fallthrough in-addr.arpa ip6.arpa + ttl 30 + } + prometheus :9153 + forward . 
/etc/resolv.conf + cache 30 + loop + reload + loadbalance + import custom/*.override + template ANY ANY internal.cloudapp.net { + match "^(?:[^.]+\.){4,}internal\.cloudapp\.net\.$" + rcode NXDOMAIN + fallthrough + } + template ANY ANY reddog.microsoft.com { + rcode NXDOMAIN + } +} +import custom/*.server +"@ + $patch = @{data = @{Corefile = $newCorefile}} | ConvertTo-Json -Compress -Depth 5 + kubectl patch configmap coredns -n kube-system --type merge -p $patch | Out-Null + kubectl rollout restart deployment/coredns -n kube-system | Out-Null + kubectl rollout status deployment/coredns -n kube-system --timeout=60s | Out-Null + Log-Info "CoreDNS block applied for: $($Hosts -join ', ')" +} + +function Run-ConnectTest { + param([string]$ClusterName, [string]$TestDescription) + Log-Info "Running: az connectedk8s connect -g $ResourceGroup -n $ClusterName" + $output = az connectedk8s connect -g $ResourceGroup -n $ClusterName --location $Location 2>&1 + $exitCode = $LASTEXITCODE + + $telemetryLines = $output | Where-Object { $_ -match "\[Telemetry\]" } + $resultLines = $output | Where-Object { $_ -match "Pre-onboarding Diagnostic|Precheck summary|pre-checks|required pre-checks" } + + Write-Host "`n ── Output excerpt ──" + $resultLines | ForEach-Object { Write-Host " $_" -ForegroundColor Yellow } + $telemetryLines | ForEach-Object { Write-Host " $_" -ForegroundColor Magenta } + + if ($exitCode -ne 0) { + Log-Pass "$TestDescription → command failed as expected (exit $exitCode)" + if (-not $telemetryLines) { + Write-Host " WARNING: No [Telemetry] line found in output." 
-ForegroundColor DarkYellow + } + } else { + Log-Fail "$TestDescription → command SUCCEEDED but was expected to FAIL" + } +} + +function Cleanup-AzResource { + param([string]$ClusterName) + Log-Info "Cleaning up ARM resource: $ClusterName (if it exists)" + az connectedk8s delete -g $ResourceGroup -n $ClusterName --force -y 2>&1 | Out-Null +} + +function Apply-BadCRD { + param([string]$CRDName) + $manifest = @" +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + name: $CRDName + annotations: + meta.helm.sh/release-name: some-other-component + meta.helm.sh/release-namespace: default +spec: + group: clusterconfig.azure.com + names: + kind: FakeResource + listKind: FakeResourceList + plural: $(($CRDName -split '\.')[0]) + singular: fakeresource + scope: Cluster + versions: + - name: v1 + served: true + storage: true + schema: + openAPIV3Schema: + type: object +"@ + $manifest | kubectl apply -f - 2>&1 | Out-Null + Log-Info "Bad CRD applied: $CRDName" +} + +function Remove-CRD { + param([string]$CRDName) + kubectl delete crd $CRDName --ignore-not-found=true 2>&1 | Out-Null + Log-Info "CRD removed: $CRDName" +} + +function Apply-PodQuota { + $quota = @" +apiVersion: v1 +kind: ResourceQuota +metadata: + name: block-pods + namespace: azure-arc-release +spec: + hard: + pods: "0" +"@ + kubectl create namespace azure-arc-release --dry-run=client -o yaml | kubectl apply -f - 2>&1 | Out-Null + $quota | kubectl apply -f - 2>&1 | Out-Null + Log-Info "ResourceQuota applied: pods=0 in azure-arc-release" +} + +function Remove-PodQuota { + kubectl delete resourcequota block-pods -n azure-arc-release --ignore-not-found=true 2>&1 | Out-Null + Log-Info "ResourceQuota removed." 
+} + +# ── Main ───────────────────────────────────────────────────────────────────── + +$results = @() + +Save-CoreDNS + +# ───────────────────────────────────────────────────────────────────────────── +Log-Sep +Log-Info "TEST 1: Block MCR (outbound connectivity failure)" +Log-Info "Expected telemetry: onboardingErrorType=prediagnostics-failure, outboundConnectivityCheck=Failed" +# ───────────────────────────────────────────────────────────────────────────── +$clusterName = "adblocktest-mcr" +Cleanup-AzResource $clusterName +Apply-CoreDNS-Block -Hosts @("mcr.microsoft.com") +Run-ConnectTest -ClusterName $clusterName -TestDescription "MCR outbound block" +Restore-CoreDNS +Cleanup-AzResource $clusterName +$results += "TEST 1 - MCR Block" + +# ───────────────────────────────────────────────────────────────────────────── +Log-Sep +Log-Info "TEST 2: Block Entra auth endpoint (Entra check failure)" +Log-Info "Expected telemetry: onboardingErrorType=prediagnostics-failure, entraCheck=Failed" +# ───────────────────────────────────────────────────────────────────────────── +$clusterName = "adblocktest-entra" +Cleanup-AzResource $clusterName +Apply-CoreDNS-Block -Hosts @("login.microsoftonline.com") +Run-ConnectTest -ClusterName $clusterName -TestDescription "Entra endpoint block" +Restore-CoreDNS +Cleanup-AzResource $clusterName +$results += "TEST 2 - Entra Block" + +# ───────────────────────────────────────────────────────────────────────────── +Log-Sep +Log-Info "TEST 3: Block BOTH MCR + Entra (combined outbound failure)" +Log-Info "Expected telemetry: onboardingErrorType=prediagnostics-failure, outboundConnectivityCheck=Failed, entraCheck=Failed" +# ───────────────────────────────────────────────────────────────────────────── +$clusterName = "adblocktest-all-outbound" +Cleanup-AzResource $clusterName +Apply-CoreDNS-Block -Hosts @("mcr.microsoft.com", "login.microsoftonline.com") +Run-ConnectTest -ClusterName $clusterName -TestDescription "MCR + Entra combined block" 
+Restore-CoreDNS +Cleanup-AzResource $clusterName +$results += "TEST 3 - MCR + Entra Block" + +# ───────────────────────────────────────────────────────────────────────────── +Log-Sep +Log-Info "TEST 4: CRD ownership conflict (crdCheck failure)" +Log-Info "Expected telemetry: onboardingErrorType=prediagnostics-failure, crdCheck=Failed" +# ───────────────────────────────────────────────────────────────────────────── +$clusterName = "adblocktest-crd" +Cleanup-AzResource $clusterName +Apply-BadCRD "extensionconfigs.clusterconfig.azure.com" +Run-ConnectTest -ClusterName $clusterName -TestDescription "CRD ownership conflict" +Remove-CRD "extensionconfigs.clusterconfig.azure.com" +Cleanup-AzResource $clusterName +$results += "TEST 4 - CRD Conflict" + +# ───────────────────────────────────────────────────────────────────────────── +Log-Sep +Log-Info "TEST 5: Job cannot be scheduled (ResourceQuota pods=0)" +Log-Info "Expected telemetry: onboardingErrorType=prediagnostics-job-execution-error, jobExecutionStatus=NotScheduled" +# ───────────────────────────────────────────────────────────────────────────── +$clusterName = "adblocktest-nojob" +Cleanup-AzResource $clusterName +Apply-PodQuota +Run-ConnectTest -ClusterName $clusterName -TestDescription "Job not schedulable" +Remove-PodQuota +Cleanup-AzResource $clusterName +$results += "TEST 5 - Job Not Schedulable" + +# ───────────────────────────────────────────────────────────────────────────── +Log-Sep +Log-Info "TEST 6: Happy path (all checks pass — command should SUCCEED)" +Log-Info "Expected: no [Telemetry] failure lines, 'pre-checks have succeeded'" +# ───────────────────────────────────────────────────────────────────────────── +$clusterName = "adblocktest-happy" +Log-Info "Running: az connectedk8s connect -g $ResourceGroup -n $clusterName" +$output = az connectedk8s connect -g $ResourceGroup -n $clusterName --location $Location 2>&1 +$exitCode = $LASTEXITCODE +$telemetryFailLines = $output | Where-Object { $_ -match 
"\[Telemetry\].*prediagnostics" } +if ($exitCode -eq 0 -and -not $telemetryFailLines) { + Log-Pass "Happy path → command succeeded, no failure telemetry" +} elseif ($exitCode -eq 0 -and $telemetryFailLines) { + Log-Fail "Happy path → command succeeded BUT unexpected [Telemetry] failure lines found:" + $telemetryFailLines | ForEach-Object { Write-Host " $_" -ForegroundColor Red } +} else { + Log-Fail "Happy path → command FAILED unexpectedly (exit $exitCode)" +} +Cleanup-AzResource $clusterName +$results += "TEST 6 - Happy Path" + +# ───────────────────────────────────────────────────────────────────────────── +Log-Sep +Write-Host "`nTest run complete. Scenarios executed:" -ForegroundColor White +$results | ForEach-Object { Write-Host " • $_" -ForegroundColor Gray } +Log-Sep diff --git a/testing/test_prediagnostic_telemetry.sh b/testing/test_prediagnostic_telemetry.sh new file mode 100644 index 00000000000..decd421b18c --- /dev/null +++ b/testing/test_prediagnostic_telemetry.sh @@ -0,0 +1,277 @@ +#!/bin/bash +# test_prediagnostic_telemetry.sh +# Exercises all prediagnostic failing scenarios and verifies az connectedk8s connect fails. +# Usage: bash test_prediagnostic_telemetry.sh [resource_group] [location] +# Prerequisites: kubectl configured, az cli with connectedk8s extension installed, kubeconfig set. 
+ +RESOURCE_GROUP="${1:-audittest}" +LOCATION="${2:-eastus2euap}" +ORIGINAL_COREFILE="" +PASS_COUNT=0 +FAIL_COUNT=0 + +# ── Colors ─────────────────────────────────────────────────────────────────── +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +CYAN='\033[0;36m' +MAGENTA='\033[0;35m' +GRAY='\033[0;37m' +NC='\033[0m' # No Color + +log_info() { echo -e "${CYAN}[INFO]${NC} $1"; } +log_pass() { echo -e "${GREEN}[PASS]${NC} $1"; ((PASS_COUNT++)); } +log_fail() { echo -e "${RED}[FAIL]${NC} $1"; ((FAIL_COUNT++)); } +log_sep() { echo -e "\n${GRAY}$(printf '─%.0s' {1..70})${NC}"; } + +# ── CoreDNS helpers ────────────────────────────────────────────────────────── + +save_coredns() { + ORIGINAL_COREFILE=$(kubectl get configmap coredns -n kube-system -o jsonpath='{.data.Corefile}' 2>&1) + log_info "CoreDNS original config saved." +} + +restore_coredns() { + if [[ -z "$ORIGINAL_COREFILE" ]]; then return; fi + log_info "Restoring CoreDNS..." + kubectl patch configmap coredns -n kube-system --type merge \ + -p "{\"data\":{\"Corefile\":$(echo "$ORIGINAL_COREFILE" | python3 -c 'import json,sys; print(json.dumps(sys.stdin.read()))')}}" \ + > /dev/null 2>&1 + kubectl rollout restart deployment/coredns -n kube-system > /dev/null 2>&1 + kubectl rollout status deployment/coredns -n kube-system --timeout=60s > /dev/null 2>&1 + log_info "CoreDNS restored." +} + +# Apply a CoreDNS hosts block redirecting specified hostnames to 192.0.2.1 (black-hole) +# Usage: apply_coredns_block "host1 host2 ..." 
+apply_coredns_block() {
+    local hosts_entries=""
+    for host in $1; do
+        hosts_entries+="       192.0.2.1 ${host}\n"
+    done
+
+    local new_corefile
+    new_corefile=$(cat <<EOF
+.:53 {
+    errors
+    ready
+    health {
+       lameduck 5s
+    }
+    hosts {
+$(printf '%b' "$hosts_entries")       fallthrough
+    }
+    kubernetes cluster.local in-addr.arpa ip6.arpa {
+       pods insecure
+       fallthrough in-addr.arpa ip6.arpa
+       ttl 30
+    }
+    prometheus :9153
+    forward . /etc/resolv.conf
+    cache 30
+    loop
+    reload
+    loadbalance
+    import custom/*.override
+    template ANY ANY internal.cloudapp.net {
+        match "^(?:[^.]+\.){4,}internal\.cloudapp\.net\.$"
+        rcode NXDOMAIN
+        fallthrough
+    }
+    template ANY ANY reddog.microsoft.com {
+        rcode NXDOMAIN
+    }
+}
+import custom/*.server
+EOF
+)
+    kubectl patch configmap coredns -n kube-system --type merge \
+        -p "{\"data\":{\"Corefile\":$(echo "$new_corefile" | python3 -c 'import json,sys; print(json.dumps(sys.stdin.read()))')}}" \
+        > /dev/null 2>&1
+    kubectl rollout restart deployment/coredns -n kube-system > /dev/null 2>&1
+    kubectl rollout status deployment/coredns -n kube-system --timeout=60s > /dev/null 2>&1
+    log_info "CoreDNS block applied for: $1"
+}
+
+# ── Test runner ──────────────────────────────────────────────────────────────
+
+run_connect_test() {
+    local cluster_name="$1"
+    local test_desc="$2"
+
+    log_info "Running: az connectedk8s connect -g $RESOURCE_GROUP -n $cluster_name"
+    output=$(az connectedk8s connect -g "$RESOURCE_GROUP" -n "$cluster_name" --location "$LOCATION" 2>&1)
+    exit_code=$?
+
+    echo ""
+    echo "  ── Output excerpt ──"
+    echo "$output" | grep -E "Pre-onboarding Diagnostic|Precheck summary|pre-checks|required pre-checks" \
+        | while IFS= read -r line; do echo -e "    ${YELLOW}${line}${NC}"; done
+    echo "$output" | grep "\[Telemetry\]" \
+        | while IFS= read -r line; do echo -e "    ${MAGENTA}${line}${NC}"; done
+
+    if [[ $exit_code -ne 0 ]]; then
+        log_pass "$test_desc → command failed as expected (exit $exit_code)"
+        if ! echo "$output" | grep -q "\[Telemetry\]"; then
+            echo -e "    ${YELLOW}WARNING: No [Telemetry] line found in output.${NC}"
+        fi
+    else
+        log_fail "$test_desc → command SUCCEEDED but was expected to FAIL"
+    fi
+}
+
+cleanup_az_resource() {
+    local cluster_name="$1"
+    log_info "Cleaning up ARM resource: $cluster_name (if it exists)"
+    az connectedk8s delete -g "$RESOURCE_GROUP" -n "$cluster_name" --force -y > /dev/null 2>&1
+}
+
+apply_bad_crd() {
+    local crd_name="$1"
+    kubectl apply -f - > /dev/null 2>&1 <<EOF
+apiVersion: apiextensions.k8s.io/v1
+kind: CustomResourceDefinition
+metadata:
+  name: ${crd_name}
+  annotations:
+    meta.helm.sh/release-name: some-other-component
+    meta.helm.sh/release-namespace: default
+spec:
+  group: clusterconfig.azure.com
+  names:
+    kind: FakeResource
+    listKind: FakeResourceList
+    plural: ${crd_name%%.*}
+    singular: fakeresource
+  scope: Cluster
+  versions:
+    - name: v1
+      served: true
+      storage: true
+      schema:
+        openAPIV3Schema:
+          type: object
+EOF
+    log_info "Bad CRD applied: $crd_name"
+}
+
+remove_crd() {
+    kubectl delete crd "$1" --ignore-not-found=true > /dev/null 2>&1
+    log_info "CRD removed: $1"
+}
+
+apply_pod_quota() {
+    kubectl create namespace azure-arc-release --dry-run=client -o yaml | kubectl apply -f - > /dev/null 2>&1
+    kubectl apply -f - > /dev/null 2>&1 <<EOF
+apiVersion: v1
+kind: ResourceQuota
+metadata:
+  name: block-pods
+  namespace: azure-arc-release
+spec:
+  hard:
+    pods: "0"
+EOF
+    log_info "ResourceQuota applied: pods=0 in azure-arc-release"
+}
+
+remove_pod_quota() {
+    kubectl delete resourcequota block-pods -n azure-arc-release --ignore-not-found=true > /dev/null 2>&1
+    log_info "ResourceQuota removed."
+} + +# ── Main ───────────────────────────────────────────────────────────────────── + +echo -e "\n${CYAN}Pre-onboarding Diagnostic Telemetry Test Suite${NC}" +echo -e "${CYAN}Resource Group: $RESOURCE_GROUP | Location: $LOCATION${NC}" + +save_coredns + +# ───────────────────────────────────────────────────────────────────────────── +log_sep +log_info "TEST 1: Block MCR (outbound connectivity failure)" +log_info "Expected telemetry: onboardingErrorType=prediagnostics-failure, outboundConnectivityCheck=Failed" +# ───────────────────────────────────────────────────────────────────────────── +CLUSTER="adblocktest-mcr" +cleanup_az_resource "$CLUSTER" +apply_coredns_block "mcr.microsoft.com" +run_connect_test "$CLUSTER" "MCR outbound block" +restore_coredns +cleanup_az_resource "$CLUSTER" + +# ───────────────────────────────────────────────────────────────────────────── +log_sep +log_info "TEST 2: Block Entra auth endpoint (Entra check failure)" +log_info "Expected telemetry: onboardingErrorType=prediagnostics-failure, entraCheck=Failed" +# ───────────────────────────────────────────────────────────────────────────── +CLUSTER="adblocktest-entra" +cleanup_az_resource "$CLUSTER" +apply_coredns_block "login.microsoftonline.com" +run_connect_test "$CLUSTER" "Entra endpoint block" +restore_coredns +cleanup_az_resource "$CLUSTER" + +# ───────────────────────────────────────────────────────────────────────────── +log_sep +log_info "TEST 3: Block BOTH MCR + Entra (combined outbound failure)" +log_info "Expected telemetry: outboundConnectivityCheck=Failed, entraCheck=Failed" +# ───────────────────────────────────────────────────────────────────────────── +CLUSTER="adblocktest-all-outbound" +cleanup_az_resource "$CLUSTER" +apply_coredns_block "mcr.microsoft.com login.microsoftonline.com" +run_connect_test "$CLUSTER" "MCR + Entra combined block" +restore_coredns +cleanup_az_resource "$CLUSTER" + +# ───────────────────────────────────────────────────────────────────────────── 
+log_sep +log_info "TEST 4: CRD ownership conflict (crdCheck failure)" +log_info "Expected telemetry: onboardingErrorType=prediagnostics-failure, crdCheck=Failed" +# ───────────────────────────────────────────────────────────────────────────── +CLUSTER="adblocktest-crd" +cleanup_az_resource "$CLUSTER" +apply_bad_crd "extensionconfigs.clusterconfig.azure.com" +run_connect_test "$CLUSTER" "CRD ownership conflict" +remove_crd "extensionconfigs.clusterconfig.azure.com" +cleanup_az_resource "$CLUSTER" + +# ───────────────────────────────────────────────────────────────────────────── +log_sep +log_info "TEST 5: Job cannot be scheduled (ResourceQuota pods=0)" +log_info "Expected telemetry: onboardingErrorType=prediagnostics-job-execution-error, jobExecutionStatus=NotScheduled" +# ───────────────────────────────────────────────────────────────────────────── +CLUSTER="adblocktest-nojob" +cleanup_az_resource "$CLUSTER" +apply_pod_quota +run_connect_test "$CLUSTER" "Job not schedulable" +remove_pod_quota +cleanup_az_resource "$CLUSTER" + +# ───────────────────────────────────────────────────────────────────────────── +log_sep +log_info "TEST 6: Happy path (all checks pass — command should SUCCEED)" +log_info "Expected: no [Telemetry] failure lines, command exits 0" +# ───────────────────────────────────────────────────────────────────────────── +CLUSTER="adblocktest-happy" +log_info "Running: az connectedk8s connect -g $RESOURCE_GROUP -n $CLUSTER" +output=$(az connectedk8s connect -g "$RESOURCE_GROUP" -n "$CLUSTER" --location "$LOCATION" 2>&1) +exit_code=$? 
+telemetry_fail=$(echo "$output" | grep "\[Telemetry\].*prediagnostics") + +if [[ $exit_code -eq 0 && -z "$telemetry_fail" ]]; then + log_pass "Happy path → command succeeded, no failure telemetry" +elif [[ $exit_code -eq 0 && -n "$telemetry_fail" ]]; then + log_fail "Happy path → command succeeded BUT unexpected [Telemetry] failure lines found:" + echo "$telemetry_fail" | while IFS= read -r line; do echo -e " ${RED}${line}${NC}"; done +else + log_fail "Happy path → command FAILED unexpectedly (exit $exit_code)" +fi +cleanup_az_resource "$CLUSTER" + +# ───────────────────────────────────────────────────────────────────────────── +log_sep +echo "" +echo -e "${CYAN}Test run complete.${NC}" +echo -e " ${GREEN}Passed: $PASS_COUNT${NC}" +echo -e " ${RED}Failed: $FAIL_COUNT${NC}" +log_sep